In [149]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from scipy.sparse import csr_matrix
from collections import Counter
from sklearn.metrics import pairwise_distances, precision_recall_fscore_support, mean_absolute_error, mean_squared_error
from operator import itemgetter
from tqdm.notebook import tqdm
import copy
import heapq
import sys, os
import pickle
import itertools
import operator
import math

In [150]:
ratings = pd.read_csv("./datasets/ratings.csv", sep=",")

In [183]:
print(type(ratings))
ratings.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [158]:
movies = pd.read_csv("./datasets/movies.csv", sep=",")
print(movies.shape)
movies.head()

(62423, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [153]:
rating_map = {}
for i in range(len(ratings)):
    key = str(ratings.iloc[i,0]) + '/' +str(ratings.iloc[i,1])
    rating_map[key]=ratings.iloc[i,2]

In [161]:
user_ratings_map = {}

items = ratings.movieId.unique()
for i in items:
    userids = ratings.query("movieId == @i").userId.array
    user_ratings = ratings.query("movieId == @i").rating.array
    user_ratings_map[i] = (userids,user_ratings)

In [162]:
def NNCF_based_rating_prediction(u, i, metric):
    r = 0
    sum_sim = 0
    # find the movies rated by u
    movies = ratings[ratings["userId"]==u].movieId
    for j in movies:
        sim = calc_sim(i, j, metric)
        key = str(u)+"/"+str(j)
        r += sim*rating_map[key]
        sum_sim += sim
    if sum_sim == 0:
        return 0
    else:
        return r / sum_sim

In [163]:
# finds the similary of items i and j 
def calc_sim(i,j, metric):
    # users who rated item i
    users_rated_i = user_ratings_map[i][0]
    ratings_i = user_ratings_map[i][1]
    # users who rated item j
    users_rated_j = user_ratings_map[j][0]
    ratings_j = user_ratings_map[j][1]

    # Jaccard ignores rating values.
    if metric == "Jaccard":
        intersection_size = len(set(users_rated_i).intersection(users_rated_j))
        union_size = len(set(users_rated_i).union(users_rated_j))
        return intersection_size / union_size
    elif metric == "Euclidian_Distance":
        return math.sqrt(sum([(item_one - item_two ) ** 2 for item_one, item_two in zip(ratings_i, ratings_j)]))

    

## Evaluation of Rating Prediction

How can we measure the performance of a recommender algorithm? This is similar to the evaluation used in machine learning.

- Make a train/test split
- Build the model on the training set
- Make predictions for the ratings in the test set
- Find the mean absolute error (MAE)

For more metrics other then MAE loo at the "Metrics for Regression" section of [this notebook](http://localhost:8888/notebooks/PycharmProjects/data_science/evaluation.ipynb)

In [187]:
X_train, X_test = train_test_split(ratings, test_size=100)
train_size = X_train.shape[0]
test_size = X_test.shape[0]
print("Test size:", test_size)
error1 = 0
error2 = 0
error3 = 0
preds = []

c = 1
avg_rating = X_train.iloc[:,2].mean()
for k in tqdm(range(test_size)): 
    c+=1
    u = X_test.iloc[k,0]
    i = X_test.iloc[k,1]
    r = X_test.iloc[k,2]
    
    err = NNCF_based_rating_prediction(u,i,"Euclidian_Distance")
    if (np.isnan(err)):
        error1 += (r - avg_rating)**2
        print("x")
        preds.append(avg_rating)
    else:
        preds.append(err)
        error1 += (r - NNCF_based_rating_prediction(u,i,"Euclidian_Distance"))**2
        error2 += (r - NNCF_based_rating_prediction(u,i,"Jaccard"))**2


print("RMSE for Euclidian Distance Similarity Metric: ", np.sqrt(error1/test_size))
print("RMSE for Jaccard Similarity Metric: ", np.sqrt(error2/test_size))

Test size: 100


  0%|          | 0/100 [00:00<?, ?it/s]

RMSE for Euclidian Distance Similarity Metric:  1.0058466606513001
RMSE for Jaccard Similarity Metric:  0.8218831636026663


In [188]:
#calculating mae, rmse and mse values using sklearn library
X_test["preds"] = preds
MAE = mean_absolute_error(X_test["rating"], X_test["preds"])
MSE = mean_squared_error(X_test["rating"], X_test["preds"])
RMSE = np.sqrt(mean_squared_error(X_test["rating"], X_test["preds"]))
print('MAE: {}, \nMSE: {}, \nRMSE: {} \nby using sklearn.metrics\'s functions'.format(MAE, MSE, RMSE))

MAE: 0.7836739091745878, 
MSE: 1.011727504743372, 
RMSE: 1.0058466606513004 
by using sklearn.metrics's functions


In [184]:
type(X_train)

pandas.core.frame.DataFrame

In [178]:
X_test

Unnamed: 0,userId,movieId,rating,timestamp,preds,relevant_test,relevant_pred
20539252,133571,3264,2.0,948200436,3.639611,0,1
6608956,42801,1676,4.0,1208375503,3.776854,1,1
15592650,100977,344,2.5,1129383549,3.194659,0,0
2644211,17588,157296,4.5,1547110139,4.144660,1,1
8568186,55819,6365,3.5,1257019291,3.364667,0,0
...,...,...,...,...,...,...,...
11369285,73802,377,3.0,944984131,3.790503,0,1
15190904,98446,1266,5.0,1115335993,3.880368,1,1
4700206,30734,2929,3.0,940786701,4.191106,0,1
17553906,113791,678,5.0,1176625128,3.743435,1,1


In [186]:
#calculating precise and recall values 
X_test['relevant_test'] = (X_test['rating'] > 3.5) * 1
X_test['relevant_pred'] = (X_test['preds'] > 3.5) * 1
print(X_test['relevant_test'].head())
print(X_test['relevant_pred'].head())
results = precision_recall_fscore_support(X_test['relevant_test'] , X_test['relevant_pred'] , average='macro')
print('Precision: {} and Recall: {}'.format(results[0], results[1]))

20539252    0
6608956     1
15592650    0
2644211     1
8568186     0
Name: relevant_test, dtype: int32
20539252    1
6608956     1
15592650    0
2644211     1
8568186     0
Name: relevant_pred, dtype: int32
Precision: 0.7218453188602443 and Recall: 0.7001223990208079


In [170]:
def top_N_pred_sort(N, u):
    preds = pd.Series([], dtype='float')
    # find the movies not rated by u
    movies_not_rated = ratings.query("userId != @u").movieId.unique()
    sample_movies = np.random.choice(movies_not_rated, 1000)
    for m in tqdm(sample_movies):
        preds[m] = NNCF_based_rating_prediction(u, m, "Euclidian_Distance")
    return preds.sort_values(ascending=False)[:N]    

In [171]:
top_N_pred_sort(10, 1)

  0%|          | 0/1000 [00:00<?, ?it/s]

203268    5.000000
144890    5.000000
169628    5.000000
204066    5.000000
181503    4.954360
204820    4.814240
184745    4.754053
204224    4.733326
163012    4.693986
179805    4.688951
dtype: float64

In [172]:
top_N_pred_sort(10, 2)

  0%|          | 0/1000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [182]:
X_test.to_csv("./datasets/X_test_8.csv")