# Neighborhood-based Collaborative Filtering

In [None]:
import numpy as np
import pandas as pd
import datetime
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

# Reading ratings file
* `rating_matrix = (user_id, item_id, rating)`

In [2]:
# Both files are parsed as tab-separated text; column names are supplied
# explicitly through `names=` and shared by the two reads.
rating_columns = ['user_id', 'item id', 'rating', 'timestamp']
rating_base = pd.read_csv(
    '../Content-Based/ml-100k/ua.base',
    sep='\t',
    names=rating_columns,
)
rating_test = pd.read_csv(
    '../Content-Based/ml-100k/ua.test',
    sep='\t',
    names=rating_columns,
)
rating_base.head()

Unnamed: 0,user_id,item id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [3]:
def _format_timestamp(stamp):
    """Render a Unix timestamp as a 'YYYY-MM-DD HH:MM:SS' string.

    `datetime.fromtimestamp` is called without a tz argument, so the result
    is expressed in the local time zone of the machine running the cell.
    """
    moment = datetime.datetime.fromtimestamp(int(stamp))
    return moment.strftime('%Y-%m-%d %H:%M:%S')


# Replace the raw epoch seconds with readable strings in both frames.
rating_base['timestamp'] = rating_base['timestamp'].apply(_format_timestamp)
rating_test['timestamp'] = rating_test['timestamp'].apply(_format_timestamp)
rating_base.head()

Unnamed: 0,user_id,item id,rating,timestamp
0,1,1,5,1997-09-23 05:02:38
1,1,2,3,1997-10-15 12:26:11
2,1,3,4,1997-11-03 14:42:40
3,1,4,3,1997-10-15 12:25:19
4,1,5,3,1998-03-13 08:15:12


* The `user_id` and `item_id` values start from 1; they should start from 0 to be compatible with Python's 0-based indexing.

In [4]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same NumPy array and works on old and new pandas alike.
# NOTE(review): the 'timestamp' column holds strings at this point, so the
# resulting array is presumably object-dtype rather than integer - confirm.
rating_train = rating_base.values
type(rating_train)

numpy.ndarray

In [5]:
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same NumPy array and works on old and new pandas alike.
rating_test = rating_test.values
# Each row is one rating, so the row count is the number of ratings.
print('Number of training rate:', rating_train.shape[0])
print('Number of test rate:', rating_test.shape[0])

Number of training rate: 90570
Number of test rate: 9430


In [6]:
# Shift the first two columns (user id, item id) from 1-based to 0-based.
# The subtraction is done in place, so re-running this cell shifts again.
for ratings in (rating_train, rating_test):
    ratings[:, :2] -= 1

# User-User Collaborative Filtering

In [8]:
class UUCF():
    """Neighborhood-based collaborative filtering on mean-centered ratings.

    `rating_matrix` is a 2-D array whose first three columns are
    `(user_id, item_id, rating)` with 0-based integer ids; any further
    columns are ignored.  Passing the matrix with its first two columns
    swapped makes the same class work on item-item neighborhoods.
    """

    def __init__(self, rating_matrix, k, sim_func = cosine_similarity):
        """Store the ratings and hyper-parameters and derive the matrix sizes.

        Args:
            rating_matrix: array of `(user_id, item_id, rating, ...)` rows.
            k: number of most-similar neighbors used for one prediction.
            sim_func: callable mapping a (n_users, n_items) matrix to a dense
                (n_users, n_users) similarity matrix; default: cosine_similarity.
        """
        self.rating_matrix = rating_matrix
        self.k = k # Number of Neighborhood
        self.sim_func = sim_func # similarity function, default: cosine_similarity
        # +1 because indices start from 0; int() keeps the sizes usable as
        # array shapes whatever the dtype of rating_matrix is.
        self.n_users = int(np.max(rating_matrix[:, 0])) + 1 # Number of users
        self.n_items = int(np.max(rating_matrix[:, 1])) + 1 # Number of items

    def fit(self):
        """Compute per-user mean ratings, the centered utility matrix and user similarities.

        Sets:
            self.mu: mean rating of every user (0 for users without ratings).
            self.rating_matrix_bar: sparse CSR matrix of shape
                (n_items, n_users) holding `rating - mu[user]`.
            self.user_sim: result of `sim_func` on the transposed matrix.
        """
        # Work on explicit int/float copies of the three columns.  Writing the
        # centered values back into a copy of rating_matrix would silently
        # truncate them whenever rating_matrix has an integer dtype.
        users = self.rating_matrix[:, 0].astype(int)
        items = self.rating_matrix[:, 1].astype(int)
        ratings = self.rating_matrix[:, 2].astype(float)

        # One pass over the ratings instead of one full scan per user.
        counts = np.bincount(users, minlength=self.n_users)
        sums = np.bincount(users, weights=ratings, minlength=self.n_users)
        self.mu = np.divide(sums, counts, out=np.zeros(self.n_users), where=counts > 0)

        # From rating matrix to sparse matrix (rows: items, columns: users)
        centered = ratings - self.mu[users]
        self.rating_matrix_bar = sparse.coo_matrix(
            (centered, (items, users)),
            shape=(self.n_items, self.n_users),
        ).tocsr()
        # Honor the similarity function given to the constructor.
        self.user_sim = self.sim_func(self.rating_matrix_bar.T)

    def pred(self, user_id, item_id):
        """Predict the rating of `user_id` for `item_id`; fit() must have been called.

        The prediction is the user's mean rating plus the similarity-weighted
        average of the centered ratings given to the item by the (at most) k
        most similar users who rated it.  If nobody rated the item in the
        training data, the user's mean rating is returned.
        """
        ids = np.where(self.rating_matrix[:, 1] == item_id)[0] # Row indices that have "item_id"
        if ids.size == 0:
            # No neighbor information: the weighted sum below would be 0, and
            # an item id beyond the training range would raise an IndexError.
            return self.mu[user_id]
        users_rated_item_id = self.rating_matrix[ids, 0].astype(int) # All users who rated item_id
        sim = self.user_sim[user_id, users_rated_item_id] # Similarity of user_id to those users
        most_sim_users = np.argsort(sim)[-self.k:] # positions of the k most similar users
        sim_most_sim_users = sim[most_sim_users] # and the corresponding similarities
        neighbor_ids = users_rated_item_id[most_sim_users]
        # the corresponding centered ratings, flattened to match the similarities
        rating_most_sim_users = self.rating_matrix_bar[item_id, neighbor_ids.tolist()].toarray().ravel()
        eps = 1e-8 # avoid dividing by 0
        weighted_sum = np.sum(sim_most_sim_users * rating_most_sim_users)
        return weighted_sum / (np.sum(np.abs(sim_most_sim_users)) + eps) + self.mu[user_id]

In [136]:
# Build a recommender that uses 40 neighbors per prediction, then run fit()
# so the per-user means and the similarity matrix exist before predicting.
recommender = UUCF(rating_matrix=rating_train, k=40)
recommender.fit()

In [138]:
# Sum of squared prediction errors over every test row; the first three
# columns of each row are (user, item, rating).
tot = sum(
    (rating - recommender.pred(user, item)) ** 2
    for user, item, rating in rating_test[:, :3]
)
print('RMSE:', np.sqrt(tot/len(rating_test)))

RMSE: 0.9580454278556474


# Item-Item Collaborative Filtering
* We only need to swap the roles of users and items; the `UUCF` class does not need to be modified.

In [14]:
# Reorder the columns to (item, user, rating) so that items take the role
# the recommender class gives to its first column; later columns are dropped.
item_first = [1, 0, 2]
rating_train = rating_train[:, item_first]
rating_test = rating_test[:, item_first]

In [15]:
# Same class as before, now fed the column-swapped matrix, so the
# neighborhoods are formed between items; 40 neighbors per prediction.
recommender = UUCF(rating_matrix=rating_train, k=40)
recommender.fit()

In [16]:
# Sum of squared prediction errors over every test row; after the column
# swap each row is (item, user, rating) and pred() takes them in that order.
tot = sum(
    (rating - recommender.pred(item, user)) ** 2
    for item, user, rating in rating_test[:, :3]
)
print('RMSE:', np.sqrt(tot/len(rating_test)))

RMSE: 0.935935038919037
