In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 
from data_loader import dataloader
user,ratings,movies = dataloader()
ratings = ratings.drop("timestamp",axis=1)
movie_index_by_id = {id: i for i, id in enumerate(movies["movie_id"])}


In [2]:
class MF(object):
    """docstring for CF"""
    def __init__(self, Y_data, K, lam = 0.1, Xinit = None, Winit = None, 
            learning_rate = 0.5, max_iter = 1000, print_every = 100, user_based = 1):
        self.Y_raw_data = Y_data
        self.K = K
        # regularization parameter
        self.lam = lam
        # learning rate for gradient descent
        self.learning_rate = learning_rate
        # maximum number of iterations
        self.max_iter = max_iter
        # print results after print_every iterations
        self.print_every = print_every
        # user-based or item-based
        self.user_based = user_based
        # number of users, items, and ratings. 
        self.n_users = int(np.max(Y_data[:, 0])) + 1 
        self.n_items = 3901
        self.n_ratings = Y_data.shape[0]
        
        if Xinit is None: # new
            self.X = np.random.randn(self.n_items, K)
        else: # or from saved data
            self.X = Xinit 
        
        if Winit is None: 
            self.W = np.random.randn(K, self.n_users)
        else: # from daved data
            self.W = Winit
            
        # normalized data, update later in normalized_Y function
        self.Y_data_n = self.Y_raw_data.copy()


    def normalize_Y(self):
        if self.user_based:
            user_col = 0
            item_col = 1
            n_objects = self.n_users

        # if we want to normalize based on item, just switch first two columns of data
        else: # item bas
            user_col = 1
            item_col = 0 
            n_objects = self.n_items

        users = self.Y_raw_data[:, user_col] 
        self.mu = np.zeros((n_objects,))
        for n in range(n_objects):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = self.Y_data_n[ids, item_col] 
            # and the corresponding ratings 
            ratings = self.Y_data_n[ids, 2]
            # take mean
            m = np.mean(ratings) 
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            self.mu[n] = m
            # normalize
            self.Y_data_n[ids, 2] = ratings - self.mu[n]
    def loss(self):
        L = 0 
        for i in range(self.n_ratings):
            # user, item, rating
            n, m, rate = int(self.Y_data_n[i, 0]), int(self.Y_data_n[i, 1]), self.Y_data_n[i, 2]
            L += 0.5*(rate - self.X[m, :].dot(self.W[:, n]))**2
        
        # take average
        L /= self.n_ratings
        # regularization, don't ever forget this 
        L += 0.5*self.lam*(np.linalg.norm(self.X, 'fro') + np.linalg.norm(self.W, 'fro'))
        return L 
    def get_items_rated_by_user(self, user_id):
        ids = np.where(self.Y_data_n[:,0] == user_id)[0] 
        item_ids = self.Y_data_n[ids, 1].astype(np.int32) # indices need to be integers
        ratings = self.Y_data_n[ids, 2]
        return (item_ids, ratings)
            
        
    def get_users_who_rate_item(self, item_id):
        """
        get all users who rated item item_id and get the corresponding ratings
        """
        ids = np.where(self.Y_data_n[:,1] == item_id)[0] 
        user_ids = self.Y_data_n[ids, 0].astype(np.int32)
        ratings = self.Y_data_n[ids, 2]
        return (user_ids, ratings)
    def updateX(self):
        for m in range(self.n_items):
            user_ids, ratings = self.get_users_who_rate_item(m)
            Wm = self.W[:, user_ids]
            # gradient
            grad_xm = -(ratings - self.X[m, :].dot(Wm)).dot(Wm.T)/self.n_ratings + \
                                               self.lam*self.X[m, :]
            self.X[m, :] -= self.learning_rate*grad_xm.reshape((self.K,))
    
    def updateW(self):
        for n in range(self.n_users):
            item_ids, ratings = self.get_items_rated_by_user(n)
            Xn = self.X[item_ids, :]
            # gradient
            grad_wn = -Xn.T.dot(ratings - Xn.dot(self.W[:, n]))/self.n_ratings + \
                        self.lam*self.W[:, n]
            self.W[:, n] -= self.learning_rate*grad_wn.reshape((self.K,))
    def fit(self):
        self.normalize_Y()
        for it in range(self.max_iter):
            self.updateX()
            self.updateW()
            if (it + 1) % self.print_every == 0:
                rmse_train = self.evaluate_RMSE(self.Y_raw_data)
                print ('iter =', it + 1, ', loss =', self.loss(), ', RMSE train =', rmse_train)
    def pred(self, u, i):
        """ 
        predict the rating of user u for item i 
        if you need the un
        """
        u = int(u)
        i = int(i)
        if self.user_based:
            bias = self.mu[u]
        else: 
            bias = self.mu[i]
        pred = self.X[i, :].dot(self.W[:, u]) + bias 
        # truncate if results are out of range [0, 5]
        if pred < 0:
            return 0 
        if pred > 5: 
            return 5 
        return pred 
        
    
    def pred_for_user(self, user_id):
        """
        predict ratings one user give all unrated items
        """
        ids = np.where(self.Y_data_n[:, 0] == user_id)[0]
        items_rated_by_u = self.Y_data_n[ids, 1].tolist()              
        
        y_pred = self.X.dot(self.W[:, user_id]) + self.mu[user_id]
        predicted_ratings= []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                predicted_ratings.append((y_pred[i],i))
        
        return predicted_ratings
    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0]
        SE = 0 # squared error
        for n in range(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2 

        RMSE = np.sqrt(SE/n_tests)
        return RMSE

In [3]:
ratings = ratings.values
print(ratings)

[[   1 1193    5]
 [   1  661    3]
 [   1  914    3]
 ...
 [6040  562    5]
 [6040 1096    4]
 [6040 1097    4]]


In [4]:
ratings1 = ratings.copy()

In [5]:
ratings[:,1]= [movie_index_by_id[movie_id] for movie_id in ratings1[:,1]]


In [6]:
print(ratings)
a= np.max(ratings[:,1])

print(a)

[[   1 1176    5]
 [   1  655    3]
 [   1  902    3]
 ...
 [6040  558    5]
 [6040 1080    4]
 [6040 1081    4]]
3882


In [7]:
user_prof1 = pd.read_csv('u_dict.csv')
print(user_prof1.dtypes)
user_prof1 = user_prof1.values
print(user_prof1.shape)

user_id    int64
genres     int64
weight     int64
dtype: object
(93887, 3)


In [8]:
print(ratings[:, 0])
ratings[:, 0] -= 1
user_prof1[:, 0] -= 1
ratings = np.concatenate((ratings,user_prof1), axis = 0)
print(ratings[:, :2])
from sklearn.model_selection import train_test_split

rate_train, rate_test = train_test_split(ratings, test_size=0.33, random_state=42)
print(rate_train.shape)

print(len(rate_train))

rs = MF(rate_train, K = 2, lam = 0.1, print_every = 1, learning_rate = 1, max_iter = 12, user_based = 1)
rs.fit()
# evaluate on test data
RMSE = rs.evaluate_RMSE(rate_test)
print ('\nItem-based MF, RMSE =', RMSE)

[   1    1    1 ... 6040 6040 6040]
[[   0 1176]
 [   0  655]
 [   0  902]
 ...
 [6039 3898]
 [6039 3899]
 [6039 3900]]
(733044, 3)
733044


iter = 1 , loss = 9.74111964529772 , RMSE train = 1.4107672586176834
iter = 2 , loss = 8.630394269476977 , RMSE train = 1.3138935554652746
iter = 3 , loss = 7.686570031753345 , RMSE train = 1.2349875807633561
iter = 4 , loss = 6.873587505640125 , RMSE train = 1.1735527990327954
iter = 5 , loss = 6.1657394817893 , RMSE train = 1.1276568505608509
iter = 6 , loss = 5.544273989494859 , RMSE train = 1.0943873285795775
iter = 7 , loss = 4.995169411752363 , RMSE train = 1.0709530621647831
iter = 8 , loss = 4.507668505358983 , RMSE train = 1.0548369287773014
iter = 9 , loss = 4.073305916009741 , RMSE train = 1.0439633277932059
iter = 10 , loss = 3.685257918060917 , RMSE train = 1.0367147313368634
iter = 11 , loss = 3.3379034242034447 , RMSE train = 1.0319250872727899
iter = 12 , loss = 3.026524138009614 , RMSE train = 1.0287695099777294

Item-based MF, RMSE = 1.0388693368122008


In [9]:

z = movie_index_by_id.copy()
z=list(z.items())
print(z[0][1])

0


In [10]:
a = rs.pred_for_user(1999)
a.sort(reverse= True)
for i in range(10):
    print(a[i][0],"-",movies.iloc[z[ a[i][1]][1],1],"-",movies.iloc[z[a[i][1]][1],2])

4.108572103615547 - Nightmare Before Christmas, The (1993) - Children's|Comedy|Musical
4.094487821540663 - Benji the Hunted (1987) - Adventure|Children's
4.088572418164023 - Starman (1984) - Adventure|Drama|Romance|Sci-Fi
4.06926895078934 - Rising Sun (1993) - Action|Drama|Mystery
4.056261448636962 - Superstar (1999) - Comedy
4.049865902547036 - See the Sea (Regarde la mer) (1997) - Drama
4.037102898620157 - Spanking the Monkey (1994) - Comedy|Drama
4.035415832999261 - Killing Fields, The (1984) - Drama|War
4.03498213412894 - Jeremiah Johnson (1972) - Western
4.027808071123727 - Forces of Nature (1999) - Comedy|Romance


In [11]:
import pickle
filename = 'rs_mf.sav'
pickle.dump(rs,open(filename,'wb'))

In [12]:
import pickle
rs1 = pickle.load((open('rs_mf.sav','rb')))
z = movie_index_by_id.copy()
z=list(z.items())
print(z[0][1])

0


In [13]:
b = rs1.pred_for_user(16)
b.sort(reverse= True)
for i in range(10):
    print(b[i][0],"-",movies.iloc[z[ b[i][1]][1],1],"-",movies.iloc[z[b[i][1]][1],2])


4.542791414482149 - Haunted Honeymoon (1986) - Comedy
4.51222311564403 - Two Thousand Maniacs! (1964) - Horror
4.505240465097792 - Alley Cats, The (1968) - Drama
4.501279845692339 - Hunted, The (1995) - Action
4.47670812849396 - Best Man, The (1999) - Drama
4.465052778347367 - Poltergeist II: The Other Side (1986) - Horror|Thriller
4.46073760272556 - Open Season (1996) - Comedy
4.460515946541754 - I Love Trouble (1994) - Action|Comedy
4.458353986310125 - Microcosmos (Microcosmos: Le peuple de l'herbe) (1996) - Documentary
4.447418352469531 - Widows' Peak (1994) - Drama


In [14]:
b = rs1.pred_for_user(18)
b.sort(reverse= True)
for i in range(10):
    print(b[i][0],"-",movies.iloc[z[ b[i][1]][1],1],"-",movies.iloc[z[b[i][1]][1],2])

3.992200963609919 - Haunted Honeymoon (1986) - Comedy
3.951563822921604 - Alley Cats, The (1968) - Drama
3.951120949893606 - Hunted, The (1995) - Action
3.946181304153809 - Two Thousand Maniacs! (1964) - Horror
3.9271539025806197 - Widows' Peak (1994) - Drama
3.922241364135445 - Best Man, The (1999) - Drama
3.921506273977291 - Open Season (1996) - Comedy
3.921358124819325 - Microcosmos (Microcosmos: Le peuple de l'herbe) (1996) - Documentary
3.9191196243661293 - Return to Oz (1985) - Adventure|Children's|Fantasy|Sci-Fi
3.91719946069416 - I Love Trouble (1994) - Action|Comedy


In [15]:
import time
start = time.time()
for i in range(1000):
    rs1.pred_for_user(i)
elapsed = (time.time() - start)/1000
print(elapsed)
    

0.009589749336242676
