In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import csv
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
import requests
import json
from IPython.display import Image
from IPython.display import display



In [2]:
# Which MovieLens release to load: '100K' or '1M'.
ds = '100K'

# Neither ratings file carries a header row (see the MovieLens READMEs), so the
# column names are supplied explicitly for both variants.
header = ['user_id', 'item_id', 'rating', 'timestamp']

if ds == '1M':
    # 1M: UserID::MovieID::Rating::Timestamp; the multi-character separator
    # requires the python parsing engine.
    df = pd.read_csv('ml-1m/ratings.dat', sep='::', engine='python', names=header)
elif ds == '100K':
    # 100K: tab-separated user id, item id, rating, timestamp.
    df = pd.read_csv('ml-100k/u.data', sep='\t', names=header)
else:
    # Fail here rather than with a NameError on `df` in a later cell.
    raise ValueError("Unknown dataset %r; expected '1M' or '100K'" % ds)

In [3]:
# Preview the first five ratings to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
# Dimensions are taken from the largest id rather than the number of distinct
# ids, because ids (minus one) are used as row/column indices of the rating
# matrices built further down.
n_users = df.user_id.max()
n_items = df.item_id.max()

size_summary = 'Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)
print(size_summary)

Number of users = 943 | Number of movies = 1682


In [6]:
# Hold out 25% of the ratings for evaluation.
# Uses train_test_split from sklearn.model_selection (already imported at the
# top of the notebook); the old sklearn.cross_validation module was removed in
# scikit-learn 0.20. A fixed random_state makes the split, and therefore every
# number reported below, reproducible across kernel restarts.
train_data, test_data = train_test_split(df, test_size=0.25, random_state=42)

In [7]:
# Largest user and item id that ended up in the training split.
n_users_train = train_data.user_id.max()
n_items_train = train_data.item_id.max()

train_summary = 'Number of users = ' + str(n_users_train) + ' | Number of movies = ' + str(n_items_train)
print(train_summary)

Number of users = 943 | Number of movies = 1682


In [8]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_items))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]

test_data_matrix = np.zeros((n_users, n_items))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]
    
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [9]:
def predict(ratings, similarity, type='user'):
    """Predict ratings as a weighted combination of existing ratings.

    Parameters
    ----------
    ratings : 2-D numpy array
        Rating matrix; rows are treated as users and columns as items.
    similarity : 2-D numpy array
        Square weight matrix. Must be (rows x rows) of ``ratings`` for
        ``type='user'`` and (columns x columns) for ``type='item'``.
    type : {'user', 'item'}
        Which neighbourhood model to apply.

    Returns
    -------
    2-D numpy array with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.
    """
    if type == 'user':
        # Column vectors so they broadcast across the columns of `ratings`.
        # NOTE(review): the mean is taken over every column, so zero entries
        # count toward it -- confirm this is intended if 0 means "unrated".
        mean_user_rating = ratings.mean(axis=1)[:, np.newaxis]
        weight_totals = np.abs(similarity).sum(axis=1)[:, np.newaxis]
        ratings_diff = ratings - mean_user_rating
        # Row mean plus the weighted deviations from each row's mean,
        # normalised by the total absolute weight of that row.
        return mean_user_rating + similarity.dot(ratings_diff) / weight_totals
    if type == 'item':
        # Column j is divided by the absolute row-sum of similarity row j.
        weight_totals = np.abs(similarity).sum(axis=1)[np.newaxis, :]
        return ratings.dot(similarity) / weight_totals
    # Previously an unknown value fell through to `return pred` and surfaced as
    # an UnboundLocalError.
    raise ValueError("type must be 'user' or 'item', got %r" % (type,))

# Score every (user, item) cell of the training matrix with both models;
# the two calls are independent of each other.
user_prediction = predict(train_data_matrix, user_similarity)
item_prediction = predict(train_data_matrix, item_similarity, 'item')
item_prediction

array([[ 0.3714561 ,  0.3805387 ,  0.39872752, ...,  0.44473651,
         0.43433249,  0.43363508],
       [ 0.08203693,  0.09475081,  0.09193374, ...,  0.09535686,
         0.09720583,  0.0970468 ],
       [ 0.06146359,  0.06394765,  0.06181115, ...,  0.0599151 ,
         0.06273316,  0.06356404],
       ..., 
       [ 0.0307374 ,  0.03960028,  0.0387683 , ...,  0.0438821 ,
         0.04380487,  0.04347873],
       [ 0.12432843,  0.13215857,  0.14254854, ...,  0.14750234,
         0.14572377,  0.14793966],
       [ 0.19264516,  0.19084273,  0.21551276, ...,  0.24401746,
         0.23557226,  0.23675785]])

In [10]:
def rmse(prediction, ground_truth):
    """Root-mean-squared error over the non-zero cells of ``ground_truth``.

    Parameters
    ----------
    prediction : 2-D numpy array
        Predicted values, indexed the same way as ``ground_truth``.
    ground_truth : 2-D numpy array
        Reference values; only its non-zero cells are compared.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If ``ground_truth`` has no non-zero cell, so there is nothing to score.
    """
    # Locate the cells to score once and reuse the index for both arrays.
    rated = ground_truth.nonzero()
    if rated[0].size == 0:
        raise ValueError('ground_truth has no non-zero entries to evaluate against')
    actual = ground_truth[rated]
    predicted = prediction[rated]
    # scikit-learn's signature is (y_true, y_pred).
    return sqrt(mean_squared_error(actual, predicted))

# Report the held-out error of both neighbourhood models.
for label, scores in (('User-based CF RMSE: ', user_prediction),
                      ('Item-based CF RMSE: ', item_prediction)):
    print(label + str(rmse(scores, test_data_matrix)))

User-based CF RMSE: 3.130823172348005
Item-based CF RMSE: 3.457526499465235


### Visualization

In [12]:
from ipywidgets import interact
from ipywidgets import fixed

In [13]:
def compare_prediction(prediction, ground_truth, uid=0, filter_prediction=True):
    """Plot one row of ``prediction`` next to the same row of ``ground_truth``.

    Parameters
    ----------
    prediction : 2-D numpy array
        Predicted values.
    ground_truth : 2-D numpy array
        Reference values, same shape as ``prediction``.
    uid : int
        Row index to plot.
    filter_prediction : bool
        When True, predictions are shown as 0 wherever ``ground_truth`` is 0,
        so both panels cover the same cells.
    """
    if filter_prediction:
        # Build a masked copy instead of assigning into `prediction`: the
        # caller's array (e.g. the global prediction matrix passed through
        # `fixed(...)`) must not be zeroed as a side effect of plotting.
        prediction = np.where(ground_truth == 0, 0, prediction)
    plt.figure(figsize=(16,7))
    plt.subplot(1,2,1)
    plt.title("Prediction")
    plt.plot(prediction[uid], 'r--')
    plt.subplot(1,2,2)
    plt.title("Ground truth")
    plt.plot(ground_truth[uid], 'b--')
    plt.show()

In [15]:
# The slider is bounded by the number of rows in the matrix being plotted;
# a hard-coded upper bound larger than that raises IndexError for the
# out-of-range positions.
interact(compare_prediction,
         prediction=fixed(item_prediction),
         ground_truth=fixed(test_data_matrix),
         uid=(0, test_data_matrix.shape[0] - 1));

### Memory-Based Collaborative Filtering