In [4]:
from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import NearestNeighbors


# 1. load data
user_ratings_train = pd.read_csv('./ml-100k/u1.base',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

user_ratings_test = pd.read_csv('./ml-100k/u1.test',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

movie_info =  pd.read_csv('./ml-100k/u.item', 
                          sep='|', names=['movie_id','title'], usecols=[0,1],
                          encoding="ISO-8859-1")

user_ratings_train = pd.merge(movie_info, user_ratings_train)
user_ratings_test = pd.merge(movie_info, user_ratings_test)

# 2. get the rating matrix. Each row is a user, and each column is a movie.
user_ratings_train = user_ratings_train.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')

user_ratings_test = user_ratings_test.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')

user_ratings_train = user_ratings_train.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

user_ratings_test = user_ratings_test.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index), 
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

print(user_ratings_train.shape)
print(user_ratings_test.shape)

(943, 1664)
(943, 1664)


In [5]:
#USER BASED CF

from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_absolute_error
import math

#NaNs must be replaced. using avg
user_ratings_train['avg'] = user_ratings_train.mean(axis=1)
user_ratings_train_new = user_ratings_train.T.fillna(user_ratings_train['avg'], axis=0).T

pearson_sim_train = 1-pairwise_distances(user_ratings_train_new, metric="correlation")

train_model = NearestNeighbors(n_neighbors=5)
train_model.fit(pearson_sim_train)

n_distance, n_ind = train_model.kneighbors()
n_ind += 1 # fixes indices

pred = []
actual = []

for user_id, row in user_ratings_test.iterrows():
    
    for movie, rating in row.iteritems():
        
        if not pd.isnull(rating):
            
            pred_rating = 0
            sim = 0
            
            for x in range(0,5):
                
                nid = n_ind[user_id-1][x]
                n_rating = user_ratings_train.loc[nid,movie]
                
                if not pd.isnull(n_rating):
                    
                    distance = n_distance[user_id-1][x]
                    sim += distance
                    pred_rating += distance*(n_rating-user_ratings_train.loc[nid, 'avg'])
            
            if (sim != 0):
                
                pred_rating = pred_rating/sim
                pred_rating += user_ratings_train.loc[user_id, 'avg']
                pred.append(pred_rating)
                actual.append(rating)

mae = mean_absolute_error(pred, actual)
print('MAE: ' + str(mae))

MAE: 0.8630244696309352


In [6]:
#ITEM BASED CF

item_ratings_train = user_ratings_train.T
item_ratings_test = user_ratings_test.T

#NaNs must be replaced. using avg
item_ratings_train['avg'] = item_ratings_train.mean(axis=1)
item_ratings_train_new = item_ratings_train.T.fillna(item_ratings_train['avg'], axis=0).T

#drop rows w no data
dropped = item_ratings_train_new[item_ratings_train_new.isna().any(axis=1)]
item_ratings_train_new = item_ratings_train_new.drop(dropped.index)
item_ratings_test = item_ratings_test.drop(dropped.index)

pearson_sim_train = 1-pairwise_distances(item_ratings_train_new, metric="cosine")

train_model = NearestNeighbors(n_neighbors=5) # using 5 nearest neighbors
train_model.fit(pearson_sim_train)

n_distance, n_ind = train_model.kneighbors()
n_ind += 1 # fixing indices

pred = [] #predicted ratings
actual = [] #actual ratings

for movie_id, row in item_ratings_test.iterrows():
    
    item_id = item_ratings_test.index.get_loc(movie_id)
    
    for user, rating in row.iteritems():
        
        if not pd.isnull(rating):
            
            pred_rating = 0
            sim = 0
            
            for x in range(0,5):
                
                nid = n_ind[item_id][x]
                n_rating = item_ratings_train.iloc[nid].loc[user]
                
                if not pd.isnull(n_rating):
                    
                    distance = n_distance[item_id][x]
                    sim += distance
                    pred_rating += distance*(n_rating)
            
            if (sim != 0):
                
                pred_rating = pred_rating/sim
                pred.append(pred_rating)
                actual.append(rating)

mae = mean_absolute_error(pred, actual)
print('MAE: ' + str(mae))

MAE: 1.052891972039883
