In [1]:
import numpy as np
import pandas as pd

In [2]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('data/u.data', sep='\t', names=header)

In [3]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print ('Number of users = ' , str(n_users) , ' | Number of movies = ' , str(n_items))

Number of users =  943  | Number of movies =  1682


In [4]:
from sklearn import cross_validation as cv
train_data, test_data = cv.train_test_split(df, test_size=0.25)



In [5]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

In [6]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [14]:
user_similarity

array([[0.        , 0.82941021, 0.95950094, ..., 0.84713953, 0.89029438,
        0.65351556],
       [0.82941021, 0.        , 0.89788253, ..., 0.90101988, 0.88277492,
        0.86474209],
       [0.95950094, 0.89788253, 0.        , ..., 0.91635267, 0.85991338,
        0.96744945],
       ...,
       [0.84713953, 0.90101988, 0.91635267, ..., 0.        , 0.85807155,
        0.90343555],
       [0.89029438, 0.88277492, 0.85991338, ..., 0.85807155, 0.        ,
        0.84593715],
       [0.65351556, 0.86474209, 0.96744945, ..., 0.90343555, 0.84593715,
        0.        ]])

In [15]:
item_similarity

array([[0.        , 0.65510669, 0.72531986, ..., 1.        , 0.94533866,
        0.94533866],
       [0.65510669, 0.        , 0.82368606, ..., 1.        , 0.9100764 ,
        0.9100764 ],
       [0.72531986, 0.82368606, 0.        , ..., 1.        , 1.        ,
        0.88313659],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [0.94533866, 0.9100764 , 1.        , ..., 1.        , 0.        ,
        1.        ],
       [0.94533866, 0.9100764 , 0.88313659, ..., 1.        , 1.        ,
        0.        ]])

In [7]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [8]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [16]:
user_prediction

array([[ 1.55748798,  0.56567974,  0.46871764, ...,  0.28719064,
         0.28726498,  0.28693998],
       [ 1.29858569,  0.28336656,  0.13947489, ..., -0.06398376,
        -0.06277266, -0.06259686],
       [ 1.32777891,  0.25177526,  0.11813132, ..., -0.0951794 ,
        -0.09326023, -0.09321056],
       ...,
       [ 1.22166072,  0.20772303,  0.07334028, ..., -0.12660756,
        -0.12580766, -0.12570408],
       [ 1.34854841,  0.30963363,  0.19257357, ..., -0.01491464,
        -0.01406531, -0.01379738],
       [ 1.40577905,  0.3968981 ,  0.309361  , ...,  0.12670701,
         0.1265048 ,  0.1265656 ]])

In [9]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [10]:
print ('User-based CF RMSE: ' , str(rmse(user_prediction, test_data_matrix)))
print ('Item-based CF RMSE: ' , str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE:  3.1215895050060984
Item-based CF RMSE:  3.4485095926374356


In [11]:
sparsity=round(1.0-len(df)/float(n_users*n_items),3)
print ('The sparsity level of MovieLens100K is ' ,  str(sparsity*100) , '%')

The sparsity level of MovieLens100K is  93.7 %


In [12]:
import scipy.sparse as sp
from scipy.sparse.linalg import svds

#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 20)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print ('User-based CF MSE: ' , str(rmse(X_pred, test_data_matrix)))

User-based CF MSE:  2.7164763009353674
