In [8]:
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 
from data_loader import dataloader
user,ratings,movies = dataloader()
ratings = ratings.drop("timestamp",axis=1)
movie_index_by_id = {id: i for i, id in enumerate(movies["movie_id"])}

In [9]:
class CF(object):
    def __init__(self, Y_data, k, dist_func = cosine_similarity, uuCF = 1):
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1 
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1
    
    def add(self, new_data):
        self.Y_data = np.concatenate((self.Y_data, new_data), axis = 0)
    
    def normalize_Y(self):
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        self.Ybar_data = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            ids = np.where(users == n)[0].astype(np.int32)
            item_ids = self.Y_data[ids, 1]  
            ratings = self.Y_data[ids, 2]
            m = np.mean(ratings) 
            if np.isnan(m):
                m = 0
            self.mu[n] = m
            self.Ybar_data[ids, 2] = ratings - self.mu[n]

        self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
            (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()

    def similarity(self):
        eps = 1e-6
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)
    
        
    def refresh(self):

        self.normalize_Y()
        self.similarity() 
        
    def fit(self):
        self.refresh()
        
    
    def __pred(self, u, i, normalized = 1):


        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32)

        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)

        sim = self.S[u, users_rated_i]
        a = np.argsort(sim)[-self.k:] 
        nearest_s = sim[a]
        r = self.Ybar[i, users_rated_i[a]]
        if normalized:
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)

        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mu[u]
    
    def pred(self, u, i, normalized = 1):
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)
            
    
    def recommend(self, u):
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()              
        recommended_items = []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0: 
                    recommended_items.append(i)
        
        return recommended_items 
    
    def recommend2(self, u):
        ids = np.where(self.Y_data[:, 0] == u)[0]
        items_rated_by_u = self.Y_data[ids, 1].tolist()              
        recommended_items = []
    
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating = self.__pred(u, i)
                if rating > 0: 
                    recommended_items.append(i)
        
        return recommended_items 

    def print_recommendation(self):
        print ('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print ('Recommend item(s):', recommended_items, 'for user', u)
            else: 
                print ('Recommend item', u, 'for user(s) : ', recommended_items)

In [10]:
ratings = ratings.values
ratings

array([[   1, 1193,    5],
       [   1,  661,    3],
       [   1,  914,    3],
       ...,
       [6040,  562,    5],
       [6040, 1096,    4],
       [6040, 1097,    4]], dtype=int64)

In [11]:
ratings1 = ratings.copy()

In [12]:
ratings[:,1]= [movie_index_by_id[movie_id] for movie_id in ratings1[:,1]]

In [13]:
user_prof1 = pd.read_csv('u_dict.csv')
print(user_prof1.dtypes)
user_prof1 = user_prof1.values
print(user_prof1.shape)

user_id    int64
genres     int64
weight     int64
dtype: object
(93887, 3)


In [14]:
import time
print(ratings[:, 0])
ratings[:, 0] -= 1
user_prof1[:, 0] -= 1
# ratings = np.concatenate((ratings,user_prof1), axis = 0)
# print(ratings[:, :2])
from sklearn.model_selection import train_test_split
rate_train, rate_test = train_test_split(ratings, test_size=0.33, random_state=42)
rate_train = np.concatenate((rate_train,user_prof1), axis = 0)
rs = CF(ratings, k = 30, uuCF = 1)
rs.fit()
n_tests = rate_test.shape[0]
print(n_tests)
SE = 0 # squared error
start = time.time()
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized = 0)
    print(pred)
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print ('User-user CF, RMSE =', RMSE)
elapsed = (time.time() - start)/60
print(elapsed)

[   1    1    1 ... 6040 6040 6040]
330069
3.5784771825982586
4.421774507989248
3.68673751840369
3.5811389071004265
2.755635583328438
4.130038255701944
3.5618371423717976
3.1915987850096768
2.4978627278319303
2.2070049992981766
3.075923287248955
4.126910123639422
3.9762679942229533
4.237819032068825
3.990135021267335
3.820209305173103
3.3692530673530645
3.753138354901697
3.6477594356595717
3.2326171450209475
4.071110110597068
3.0700975240731854
4.02480805721133
3.36020149364875
3.179013748506573
3.6096807958493295
4.281653889421875
4.288158336621146
4.029994837938068
3.987963853505117
3.8920471452780014
3.4104689946906235
3.288811903377333
4.667126522364154
4.417812644776066
4.478903776190009
3.5648172116152894
4.138133825863913
4.2330403846710665
2.893429130843751
3.478209614857584
4.02421104603563
4.507978475512503
3.589062824693203
2.5623174491869043
4.20892722274792
1.750733644891869
2.988140814919658
4.243675458454433
3.8967713289154458
3.6196481648764895
3.824794971607099
3.35919