In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('u.data', sep = '\t', names = column_names)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [4]:
movie_titles = pd.read_csv('Movie_Id_Titles')
movie_titles.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [5]:
df = pd.merge(df, movie_titles, on = 'item_id')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [17]:
n_users = df['user_id'].nunique()
n_items = df['item_id'].nunique()
print('Num of users: ' + str(n_users))
print('Num of items: '+ str(n_items))

Num of users: 944
Num of items: 1682


In [18]:
from sklearn.model_selection import train_test_split

In [19]:
train_data, test_data = train_test_split(df, test_size = 0.25)

In [21]:
# user-item similiarity
# creating 2 similiarity matrices, first for training second for testing

In [27]:
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]
    
    
test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [28]:
from sklearn.metrics.pairwise import pairwise_distances

In [40]:
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [45]:
def predict( ratings, similarity, type = 'user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:,np.newaxis])
        pred = mean_user_rating[:,np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [46]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict( train_data_matrix, user_similarity, type='user')

In [48]:
from sklearn.metrics import mean_squared_error
from math import sqrt


In [61]:
def rsme(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [62]:
print('User based CF RMSE: ' + str(rsme(user_prediction, test_data_matrix)))
print('Item based CF RMSE: ' + str(rsme(item_prediction, test_data_matrix)))

User based CF RMSE: 3.1187290482066143
Item based CF RMSE: 3.4471770419752463


In [63]:
# model based CF 

In [64]:
sparsity=round(1.0-len(df)/float(n_users*n_items),3)
print('The sparsity level of MovieLens100K is ' +  str(sparsity*100) + '%')

The sparsity level of MovieLens100K is 93.7%


In [65]:
# SVD

In [67]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds


u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF MSE: ' + str(rsme(X_pred, test_data_matrix)))

User-based CF MSE: 2.7122700518766027
