In [49]:
import os
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances,cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
# Column labels applied to the ratings file via read_csv(names=...).
header = ['user_id', 'item_id', 'rating', 'timestamp']
# Location of the tab-separated ratings file. Set the ML100K_DATA_PATH
# environment variable to point at another copy; the default is the path this
# notebook was originally run with, so existing setups keep working.
ratings_path = os.environ.get(
    'ML100K_DATA_PATH',
    '/Users/i303138/Documents/Learning/MachineLearning/data/ml-100k/u.data',
)
df = pd.read_csv(ratings_path, sep = '\t', names = header)

In [3]:
# Preview the first rows of the loaded ratings table.
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Count the distinct user ids and item ids that occur in the ratings table;
# these counts are used as the dimensions of the rating matrices.
n_users = len(df['user_id'].unique())
n_items = len(df['item_id'].unique())
print(f'Number of users = {n_users} | Number of movies = {n_items}')


Number of users = 943 | Number of movies = 1682


In [5]:
# Hold out 25% of the ratings for evaluation. A fixed random_state makes the
# split, and every number derived from it, reproducible across kernel restarts.
train_data, test_data = train_test_split(df, test_size = 0.25, random_state = 42)

In [6]:
# Dense (n_users x n_items) matrix of training ratings; cells that are never
# assigned stay at 0.
train_data_matrix = np.zeros((n_users, n_items))
for row in train_data.itertuples(index=False):
    # Shift ids down by one to get 0-based matrix indices
    # (assumes ids are contiguous and start at 1 -- TODO confirm).
    train_data_matrix[row.user_id - 1, row.item_id - 1] = row.rating


In [7]:
# Show the training rating matrix (NumPy abbreviates the printed array).
train_data_matrix

array([[5., 3., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [8]:
# Dense (n_users x n_items) matrix of held-out ratings; cells that are never
# assigned stay at 0.
test_data_matrix = np.zeros((n_users, n_items))
for row in test_data.itertuples(index=False):
    # Shift ids down by one to get 0-based matrix indices
    # (assumes ids are contiguous and start at 1 -- TODO confirm).
    test_data_matrix[row.user_id - 1, row.item_id - 1] = row.rating

In [10]:
# Show the held-out rating matrix (NumPy abbreviates the printed array).
test_data_matrix

array([[0., 0., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [52]:
# cosine_similarity is used here rather than
# pairwise_distances(..., metric="cosine"): the latter returns cosine
# *distances* (1 - similarity), whereas predict() needs similarities as weights.
# Rows of train_data_matrix are users, so the transpose gives item-item similarity.
user_similarity = cosine_similarity(train_data_matrix)
item_similarity = cosine_similarity(train_data_matrix.T)

In [63]:
# Show the item-item similarity matrix (NumPy abbreviates the printed array).
item_similarity

array([[1.        , 0.28604258, 0.23663628, ..., 0.        , 0.05460019,
        0.05460019],
       [0.28604258, 1.        , 0.18429461, ..., 0.        , 0.09482093,
        0.09482093],
       [0.23663628, 0.18429461, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.05460019, 0.09482093, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.05460019, 0.09482093, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [92]:
def predict(rating, similarity, type = 'user'):
    """Predict a rating matrix as a similarity-weighted combination of ratings.

    Parameters
    ----------
    rating : numpy.ndarray
        2-D rating matrix. The calls in this notebook pass a users x items
        matrix in which 0 stands for "no rating".
    similarity : numpy.ndarray
        Square similarity matrix: one row per row of ``rating`` when
        ``type == 'user'``, one row per column of ``rating`` when
        ``type == 'item'``.
    type : {'user', 'item'}, default 'user'
        ``'user'``: add the similarity-weighted average of the other rows'
        mean-centred ratings to each row's own mean.
        ``'item'``: similarity-weighted average of each row's ratings.

    Returns
    -------
    numpy.ndarray
        Predicted ratings, same shape as ``rating``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.

    Notes
    -----
    Row means are taken over every entry of ``rating``, so zero entries count
    towards the mean. Rows/columns whose absolute similarities sum to zero are
    divided by 1 instead of 0, so they produce finite values rather than NaN.
    """
    if type == 'user':
        # Column vector of per-row means so it broadcasts across the columns.
        mean_user_rating = rating.mean(axis = 1)[:, np.newaxis]
        rating_diff = rating - mean_user_rating
        norm = np.abs(similarity).sum(axis=1)[:, np.newaxis]
        # Guard against all-zero similarity rows (same guard as the item branch).
        norm = np.where(norm == 0, 1, norm)
        return mean_user_rating + similarity.dot(rating_diff) / norm
    if type == 'item':
        # Row sums are applied per output column; this equals the column sums
        # only for a symmetric similarity matrix (cosine similarity is symmetric).
        norm = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        norm = np.where(norm == 0, 1, norm)
        return rating.dot(similarity) / norm
    raise ValueError("type must be 'user' or 'item', got " + repr(type))

In [93]:
# Predict ratings from the training matrix with both approaches; each call
# pairs the matrix with the similarity matrix of the matching kind.
item_prediction = predict(train_data_matrix, item_similarity, type = 'item')
user_prediction = predict(train_data_matrix, user_similarity, type = 'user')

In [96]:
# Show the item-based predictions (NumPy abbreviates the printed array).
item_prediction

array([[0.88525714, 0.81002696, 0.76870678, ..., 0.06503111, 0.77594078,
        0.95077524],
       [0.17171017, 0.10266931, 0.14573742, ..., 0.1999044 , 0.0797311 ,
        0.08658766],
       [0.07667584, 0.05765557, 0.08728972, ..., 0.46624807, 0.06966919,
        0.02576145],
       ...,
       [0.13195429, 0.06610599, 0.09742051, ..., 0.02101741, 0.0849996 ,
        0.08180563],
       [0.29673754, 0.23887779, 0.22039964, ..., 0.18396344, 0.26278655,
        0.11363775],
       [0.5911629 , 0.60851559, 0.56490615, ..., 0.        , 0.56643475,
        0.58155   ]])

In [95]:
# Show the user-based predictions (NumPy abbreviates the printed array).
user_prediction

array([[ 2.01155661,  0.742672  ,  0.4644316 , ...,  0.1776152 ,
         0.18418002,  0.18424466],
       [ 1.36217545,  0.18458235,  0.15261047, ..., -0.06631487,
        -0.06678317, -0.06659652],
       [ 0.89256329,  0.13880943,  0.09857283, ..., -0.0651794 ,
        -0.07231047, -0.07359577],
       ...,
       [ 1.96102931,  0.17817235,  0.13956439, ..., -0.12620485,
        -0.12263092, -0.12335277],
       [ 1.43150518,  0.35634029,  0.15266018, ..., -0.05345691,
        -0.0499793 , -0.05292474],
       [ 1.86459955,  0.64182959,  0.32313332, ...,  0.00887424,
         0.01606567,  0.01516827]])

In [57]:
# Sanity check: the user-user similarity matrix should be square (n_users x n_users).
user_similarity.shape

(943, 943)

In [97]:
def rmse(prediction, ground_truth):
    """Root-mean-squared error over the non-zero entries of ``ground_truth``.

    Parameters
    ----------
    prediction : array-like
        Predicted values, same shape as ``ground_truth``.
    ground_truth : array-like
        Reference values. Only positions where this array is non-zero are
        scored; zero entries are skipped.

    Returns
    -------
    float
        Square root of the mean squared difference at the scored positions.

    Raises
    ------
    ValueError
        If ``prediction`` and ``ground_truth`` have different shapes.
    """
    prediction = np.asarray(prediction)
    ground_truth = np.asarray(ground_truth)
    # Without this check, mismatched shapes could index the wrong cells
    # silently or fail with an unrelated IndexError.
    if prediction.shape != ground_truth.shape:
        raise ValueError(
            'prediction and ground_truth must have the same shape, got '
            + str(prediction.shape) + ' and ' + str(ground_truth.shape)
        )
    # Compute the scored positions once and reuse them for both arrays.
    rated = ground_truth.nonzero()
    # mean_squared_error expects (y_true, y_pred) in that order.
    return sqrt(mean_squared_error(ground_truth[rated], prediction[rated]))


In [98]:
# Score both prediction matrices against the held-out ratings.
print(f'User based CF RMSE: {rmse(user_prediction, test_data_matrix)!s}')
print(f'Item based CF RMSe: {rmse(item_prediction, test_data_matrix)!s}')


User based CF RMSE: 2.9653591142490128
Item based CF RMSe: 3.1715060293936177
