In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import csv
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
import requests
import json
from IPython.display import Image
from IPython.display import display



In [94]:
header = ['user_id', 'item_id', 'rating', 'timestamp']
#df = pd.read_csv('ml-1m/ratings.dat', sep='\:\:', names=header)
df = pd.read_csv('~/Downloads/ml-100k/u.data', sep='\t', names=header)

In [95]:
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [96]:
n_users = df.user_id.unique().shape[0]
n_items = df.item_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items))

Number of users = 943 | Number of movies = 1682


In [97]:
train_data, test_data = cv.train_test_split(df, test_size=0.25)


In [98]:
test_data.shape

(25000, 4)

In [99]:
n_users_train = train_data.user_id.unique().shape[0]
n_items_train = train_data.item_id.unique().shape[0]
print('Number of users = ' + str(n_users_train) + ' | Number of movies = ' + str(n_items_train))

Number of users = 943 | Number of movies = 1639


In [100]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]
    
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')


In [101]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')
item_prediction

array([[ 0.36501589,  0.37668984,  0.38749899, ...,  0.43649247,
         0.42731584,  0.43367043],
       [ 0.09162839,  0.10627646,  0.10384746, ...,  0.10787418,
         0.10933179,  0.11005354],
       [ 0.06465374,  0.0670453 ,  0.06621839, ...,  0.06391549,
         0.06710212,  0.06722189],
       ..., 
       [ 0.02857525,  0.03625525,  0.03473561, ...,  0.03988253,
         0.03944428,  0.03985723],
       [ 0.13191393,  0.1400843 ,  0.14768928, ...,  0.15199389,
         0.15098606,  0.15288519],
       [ 0.21150747,  0.20140605,  0.22831859, ...,  0.26525269,
         0.25581942,  0.26293873]])

In [102]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 3.1281738792319196
Item-based CF RMSE: 3.453316211592415
