<a href="https://colab.research.google.com/github/ndq3004/machine_learning/blob/master/MachineLearningCoBan/RecommendationSystem/NeighborHood_BasedCF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
"""
user-user CF
"""

from __future__ import print_function 
import pandas as pd 
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse 
class uuCF(object):
    def __init__(self, Y_data, k, sim_func = cosine_similarity):
        self.Y_data = Y_data # a 2d array of shape (n_users, 3)
                     # each row of Y_data has form [user_id, item_id, rating]
        self.k         = k # number of neighborhood
        self.sim_func  = sim_func # similarity function, default: cosine_similarity
        self.Ybar      = None   # normalize data 
        self.n_users   = int(np.max(self.Y_data[:, 0])) + 1 # number of users
        self.n_items   = int(np.max(self.Y_data[:, 1])) + 1 # number of items
    
    def fit(self):
        users = self.Y_data[:, 0] # all users - first col of the Y_data
        self.Ybar = self.Y_data.copy()
        self.mu = np.zeros((self.n_users,))
        for n in range(self.n_users):
            # row indices of ratings made by user n
            ids = np.where(users == n)[0].astype(np.int32)
            item_ids = self.Y_data[ids, 1] # indices of all items rated by user n 
            ratings = self.Y_data[ids, 2]  # ratings made by user n 
            self.mu[n] = np.mean(ratings) if ids.size > 0 else 0 # avoid zero division 
            self.Ybar[ids, 2] = ratings - self.mu[n]
            
        ## form the rating matrix as a sparse matrix. 
        self.Ybar = sparse.coo_matrix((self.Ybar[:, 2],
            (self.Ybar[:, 1], self.Ybar[:, 0])), (self.n_items, self.n_users)).tocsr()
        self.S = self.sim_func(self.Ybar.T, self.Ybar.T)
    
    def pred(self, u, i):
        """ predict the rating of user u for item i"""
        ids = np.where(self.Y_data[:, 1] == i)[0].astype(np.int32) # find item i 
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32) # all users who rated i
        sim       = self.S[u, users_rated_i] # similarity of u and users who rated i
        nns       = np.argsort(sim)[-self.k:] # most k similar users 
        nearest_s = sim[nns] # and the corresponding similarities
        r         = self.Ybar[i, users_rated_i[nns]] # the corresponding ratings 
        eps       = 1e-8 # a small number to avoid zero division 
        return (r*nearest_s).sum()/(np.abs(nearest_s).sum() + eps) + self.mu[u]

In [41]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
link = '/content/drive/My Drive/Dataset/ml-100k/'
ratings_base = pd.read_csv(link + 'ml-100k/ua.base', sep='\t', names=r_cols)
ratings_test = pd.read_csv(link + 'ml-100k/ua.test', sep='\t', names=r_cols)

rate_train = ratings_base.values
rate_test = ratings_test.values

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

rs = uuCF(rate_train, k = 40)
rs.fit()
n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
  pred = rs.pred(rate_test[n, 0], rate_test[n, 1])
  SE += (pred - rate_test[n, 2])**2
RMSE = np.sqrt(SE/n_tests)
print('User-user CF, RMSE =', RMSE)


User-user CF, RMSE = 0.9766140289287265


In [8]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [14]:
!ls '/content/drive/My Drive/Dataset/ml-100k/ml-100k'

allbut.pl  u1.base  u2.test  u4.base  u5.test  ub.base	u.genre  u.occupation
mku.sh	   u1.test  u3.base  u4.test  ua.base  ub.test	u.info	 u.user
README	   u2.base  u3.test  u5.base  ua.test  u.data	u.item
