In [1]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from collections import defaultdict
from surprise.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
#reading files
df_ratings = pd.read_csv('input/ratings.csv')
df_movies = pd.read_csv('input/movies.csv')
df_ratings = df_ratings.drop(columns= 'timestamp')

print(df_movies.head(5))
print(df_ratings.head(5))

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   userId  movieId  rating
0       1        1     4.0
1       1        3     4.0
2       1        6     4.0
3       1       47     5.0
4       1       50     5.0


In [3]:
#splitting data into train and test sets
train_split, test_split = train_test_split(df_ratings, test_size = 0.25, random_state = 20)
print("Training data size:", train_split.shape)   
print("Test data size:", test_split.shape)
#reader for parsing the ratings file
reader = Reader(rating_scale=(1, 5))
#building the train and test set, loading the data from dataframe
train_build = Dataset.load_from_df(train_split, reader)
test_build = Dataset.load_from_df(test_split, reader)
trainset = train_build.build_full_trainset()
testset = test_build.build_full_trainset().build_testset()
print("Test set size:", len(testset))

Training data size: (75627, 3)
Test data size: (25209, 3)
Test set size: 25209


In [4]:
#model building
#takes in factors, epochs, learning rate and regularization parameter
model = SVDpp(n_factors=20,n_epochs=5,lr_all=0.09,reg_all=0.5) 
model.fit(trainset) 
#making predictions
predictions = model.test(testset) 
#calculating rmse
accuracy.rmse(predictions, verbose = True)

RMSE: 0.9052


0.9051798911917065

In [5]:
#Save all the predicted ratings and convert it to a dataframe
all_recommendations_list = defaultdict(list)
all_recommendations_df = pd.DataFrame([])
for uid, iid, true_r, est, _ in predictions:
    all_recommendations_list[uid].append((iid, est))
    all_recommendations_df = all_recommendations_df.append(pd.DataFrame({'user': uid, 'movieId': iid, 'predicted_rating' : est}, index=[0]), ignore_index=True);
print(all_recommendations_df.head(5))
print(all_recommendations_df.shape)
#Merging with movies file to get genre, title information for predictions
all_recommendations_df_details = pd.merge(all_recommendations_df,df_movies, on='movieId', how='inner')
print(all_recommendations_df_details.head(5))

   user  movieId  predicted_rating
0    57     1562          2.595082
1    57      588          3.294351
2    57     2088          2.875424
3    57     3479          3.027482
4    57     2941          3.617714
(25209, 3)
   user  movieId  predicted_rating                  title  \
0    57     1562          2.595082  Batman & Robin (1997)   
1   605     1562          2.685039  Batman & Robin (1997)   
2   608     1562          2.625385  Batman & Robin (1997)   
3   122     1562          3.313525  Batman & Robin (1997)   
4   534     1562          2.968605  Batman & Robin (1997)   

                              genres  
0  Action|Adventure|Fantasy|Thriller  
1  Action|Adventure|Fantasy|Thriller  
2  Action|Adventure|Fantasy|Thriller  
3  Action|Adventure|Fantasy|Thriller  
4  Action|Adventure|Fantasy|Thriller  


In [6]:
#List of top n recommendations list as per SVD++ for all users
def get_top_n_recommendation_list_df(all_recommendations_df_details, n=10):
    top_n_recommendations_df = all_recommendations_df_details.sort_values(['user','predicted_rating'],ascending=[True, False])
    return top_n_recommendations_df
top_n_recommendations_df = get_top_n_recommendation_list_df(all_recommendations_df_details, 10)
print(top_n_recommendations_df.head())

      user  movieId  predicted_rating                       title  \
7018     1      608          4.724769                Fargo (1996)   
7721     1     2959          4.629565           Fight Club (1999)   
5005     1     1089          4.608889       Reservoir Dogs (1992)   
8453     1     1206          4.602794  Clockwork Orange, A (1971)   
1527     1     2858          4.590449      American Beauty (1999)   

                           genres  
7018  Comedy|Crime|Drama|Thriller  
7721  Action|Crime|Drama|Thriller  
5005       Crime|Mystery|Thriller  
8453  Crime|Drama|Sci-Fi|Thriller  
1527                Drama|Romance  


In [7]:
metrics=[]
true_positives_array = []
est_array = []
for rating_threshold in np.arange(0,5.5,0.5):
    truePositives = 0
    trueNegatives = 0
    falseNegatives = 0
    falsePositives = 0
    accuracy =0
    precision =0
    recall =0
    f1_score = 0
    for uid,_, true_r, est, _ in predictions:
        if(true_r >= rating_threshold and est >= rating_threshold):
            truePositives = truePositives + 1
            true_positives_array.append(true_r)
            est_array.append(est)
            #here
        elif(true_r >= rating_threshold and est <= rating_threshold):
            falseNegatives = falseNegatives + 1
        elif(true_r <= rating_threshold and est >= rating_threshold):
            falsePositives = falsePositives + 1
        elif(true_r <= rating_threshold and est <= rating_threshold):
            trueNegatives = trueNegatives + 1
        if(truePositives > 0):
            accuracy = (truePositives + trueNegatives ) / (truePositives + trueNegatives + falsePositives + falseNegatives) 
            precision = truePositives / (truePositives + falsePositives)
            recall = truePositives / (truePositives + falseNegatives)
            f1_score = 2 * (precision * recall) / (precision + recall) 
            
    metrics.append([rating_threshold,truePositives,trueNegatives,falsePositives,falseNegatives,accuracy,precision,recall,f1_score])
    metrics_df = pd.DataFrame(metrics)
    metrics_df.rename(columns={0:'rating_threshold', 1:'truePositives', 2: 'trueNegatives', 3: 'falsePositives', 4:'falseNegatives', 5: 'Accuracy', 6: 'Precision', 7:'Recall', 8:'F1 Score'},inplace=True)
true_bin_array =[]
for x in true_positives_array:
    if x >= rating_threshold:
        x = 1
    else:
        x = 0
    true_bin_array.append(x)
auc_score = roc_auc_score(true_bin_array,est_array,multi_class='raise',average='macro')    
print('AUC Score: ',auc_score)

AUC Score:  0.7384121640712146


In [8]:
#Calculate precision and recall at n
def get_precision_recall_at_n(predictions,topn,rating_threshold):
    all_actual_predicted_list = defaultdict(list)
    precision = dict()
    recall= dict()
    no_of_relevant_items = 0
    no_of_recommended_items_at_top_n = 0
    no_of_relevant_recommended_items_at_top_n = 0
    for uid, iid, true_r, est, _ in predictions:
        all_actual_predicted_list[uid].append((est, true_r))
    for uid, user_ratings in all_actual_predicted_list.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        no_of_relevant_items = sum((true_r >= rating_threshold) for (_, true_r) in user_ratings)
        no_of_recommended_items_at_top_n = sum((est >= rating_threshold) for (est, _) in user_ratings[:topn])
        no_of_relevant_recommended_items_at_top_n = sum(((true_r >= rating_threshold) and (est >= rating_threshold)) for (est, true_r) in user_ratings[:topn])
        precision[uid] = no_of_relevant_recommended_items_at_top_n / no_of_recommended_items_at_top_n if no_of_recommended_items_at_top_n != 0 else 1
        recall[uid] = no_of_relevant_recommended_items_at_top_n / no_of_relevant_items if no_of_relevant_items != 0 else 1       
    return precision, recall

In [9]:
rating_threshold=3
precision_recall_at_n = []
for topn in range(2,20):
    precision, recall = get_precision_recall_at_n(predictions,topn,rating_threshold)
    precision_at_n = sum(prec for prec in precision.values()) / len(precision)
    recall_at_n = sum(rec for rec in recall.values()) / len(recall)   
    precision_recall_at_n.append({'topN' : topn, 'Precision' : precision_at_n, 'Recall': recall_at_n})
for n in range(3,9):
    print(precision_recall_at_n[n])    

{'topN': 5, 'Precision': 0.9099180327868863, 'Recall': 0.37562546361121896}
{'topN': 6, 'Precision': 0.911885245901639, 'Recall': 0.4284256230601821}
{'topN': 7, 'Precision': 0.9098555815768921, 'Recall': 0.4729948654244685}
{'topN': 8, 'Precision': 0.9089773614363774, 'Recall': 0.510080854703955}
{'topN': 9, 'Precision': 0.9077933905802761, 'Recall': 0.5424371332202211}
{'topN': 10, 'Precision': 0.9052068696330985, 'Recall': 0.5688625587081293}


In [15]:
#get user high rated and liked movies
all_movie_df_details = pd.merge(df_ratings,df_movies, on='movieId', how='inner')
all_movie_df_details = all_movie_df_details.sort_values(['userId','rating'],ascending=[True, False])
print("Top 10 high rated movies")
all_movie_df_details.loc[all_movie_df_details['userId'] == 10].head(10) #user 10 top 10 rated movies

Top 10 high rated movies


Unnamed: 0,userId,movieId,rating,title,genres
17660,10,91529,5.0,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX
39281,10,33794,5.0,Batman Begins (2005),Action|Crime|IMAX
39681,10,49272,5.0,Casino Royale (2006),Action|Adventure|Thriller
39782,10,49286,5.0,"Holiday, The (2006)",Comedy|Romance
41443,10,7458,5.0,Troy (2004),Action|Adventure|Drama|War
41538,10,8533,5.0,"Notebook, The (2004)",Drama|Romance
41576,10,8869,5.0,First Daughter (2004),Comedy|Romance
42525,10,71579,5.0,"Education, An (2009)",Drama|Romance
42757,10,79091,5.0,Despicable Me (2010),Animation|Children|Comedy|Crime
42828,10,81845,5.0,"King's Speech, The (2010)",Drama


In [14]:
#user 10 top 10 movie recommendations list
print("Top 10 recommendations")
top_n_recommendations_df.loc[top_n_recommendations_df['user'] == 10].head(10)

Top 10 recommendations


Unnamed: 0,user,movieId,predicted_rating,title,genres
4228,10,2571,4.079966,"Matrix, The (1999)",Action|Sci-Fi|Thriller
15727,10,4995,3.978044,"Beautiful Mind, A (2001)",Drama|Romance
1656,10,356,3.950057,Forrest Gump (1994),Comedy|Drama|Romance|War
2741,10,104374,3.947834,About Time (2013),Drama|Fantasy|Romance
19986,10,81847,3.921018,Tangled (2010),Animation|Children|Comedy|Fantasy|Musical|Roma...
2084,10,68954,3.910803,Up (2009),Adventure|Animation|Children|Drama
8053,10,6377,3.899414,Finding Nemo (2003),Adventure|Animation|Children|Comedy
19967,10,70183,3.763792,"Ugly Truth, The (2009)",Comedy|Drama|Romance
9710,10,7293,3.743305,50 First Dates (2004),Comedy|Romance
6359,10,5952,3.733885,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
