In [1]:
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pandas as pd
from collections import defaultdict
import numpy as np
from surprise import KNNBasic

df_ratings = pd.read_csv('input/ratings.csv')
df_movies = pd.read_csv('input/movies.csv')
df_ratings = df_ratings.drop(columns='timestamp')

# loading the data
train_split, test_split = train_test_split(df_ratings, test_size = 0.3, random_state=42)
reader = Reader(rating_scale=(1,5))
train_build = Dataset.load_from_df(train_split, reader)
test_build = Dataset.load_from_df(test_split, reader)
trainset = train_build.build_full_trainset()
testset = test_build.build_full_trainset().build_testset()

model = KNNBasic(k=50,min_k=20)
model.fit(trainset)
predictions = model.test(testset)
accuracy.rmse(predictions, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9851


0.9851391182692557

In [2]:
metrics=[]
true_pos_array = []
est_array = []
for rating_threshold in np.arange(0,5.5,0.5):
    truePositives = 0
    trueNegatives = 0
    falseNegatives = 0
    falsePositives = 0
    accuracy =0
    precision =0
    recall =0
    f1_score = 0
    for uid,_, true_r, est, _ in predictions:
        if(true_r >= rating_threshold and est >= rating_threshold):
            truePositives = truePositives + 1
            true_pos_array.append(true_r)
            est_array.append(est)
        elif(true_r >= rating_threshold and est <= rating_threshold):
            falseNegatives = falseNegatives + 1
        elif(true_r <= rating_threshold and est >= rating_threshold):
            falsePositives = falsePositives + 1
        elif(true_r <= rating_threshold and est <= rating_threshold):
            trueNegatives = trueNegatives + 1
        if(truePositives > 0):
            accuracy = (truePositives + trueNegatives ) / (truePositives + trueNegatives + falsePositives + falseNegatives)     
            precision = truePositives / (truePositives + falsePositives)
            recall = truePositives / (truePositives + falseNegatives)
            f1_score = 2 * (precision * recall) / (precision + recall)      
    metrics.append([rating_threshold,truePositives,trueNegatives,falsePositives,falseNegatives,accuracy,precision,recall,f1_score])
    metrics_df = pd.DataFrame(metrics)
    metrics_df.rename(columns={0:'rating_threshold', 1:'truePositives', 2: 'trueNegatives', 3: 'falsePositives', 4:'falseNegatives', 5: 'Accuracy', 6: 'Precision', 7:'Recall', 8:'F1 Score'},inplace=True)
true_bin_array =[]
for x in true_pos_array:
    if x >= rating_threshold:
        x = 1
    else:
        x = 0
    true_bin_array.append(x)
auc_score = roc_auc_score(true_bin_array,est_array,multi_class='raise',average='macro')    
print('AUC Score: ',auc_score)

AUC Score:  0.6881931075720249


In [3]:
metrics_df

Unnamed: 0,rating_threshold,truePositives,trueNegatives,falsePositives,falseNegatives,Accuracy,Precision,Recall,F1 Score
0,0.0,30251,0,0,0,1.0,1.0,1.0,1.0
1,0.5,30251,0,0,0,1.0,1.0,1.0,1.0
2,1.0,29842,0,409,0,0.98648,0.98648,1.0,0.993194
3,1.5,28998,0,1253,0,0.95858,0.95858,1.0,0.978852
4,2.0,28441,3,1807,0,0.940266,0.940261,1.0,0.969211
5,2.5,26072,71,4049,59,0.864203,0.865576,0.997742,0.926971
6,3.0,23765,522,5286,678,0.802849,0.818044,0.972262,0.888511
7,3.5,16271,2864,8929,2187,0.632541,0.645675,0.881515,0.745385
8,4.0,3648,14703,1067,10833,0.606625,0.773701,0.251916,0.380079
9,4.5,94,23693,25,6439,0.786321,0.789916,0.014388,0.028262


In [4]:
def get_precision_recall_at_n(predictions,topn,rating_threshold):
    all_actual_predicted_list = defaultdict(list)
    precision = dict()
    recall= dict()
    no_of_relevant_items = 0
    no_of_recommended_items_at_top_n = 0
    no_of_relevant_recommended_items_at_top_n = 0
    for uid, iid, true_r, est, _ in predictions:
        all_actual_predicted_list[uid].append((est, true_r))
    for uid, user_ratings in all_actual_predicted_list.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        no_of_relevant_items = sum((true_r >= rating_threshold) for (_, true_r) in user_ratings)
        no_of_recommended_items_at_top_n = sum((est >= rating_threshold) for (est, _) in user_ratings[:topn])
        no_of_relevant_recommended_items_at_top_n = sum(((true_r >= rating_threshold) and (est >= rating_threshold)) for (est, true_r) in user_ratings[:topn])
        
        precision[uid] = no_of_relevant_recommended_items_at_top_n / no_of_recommended_items_at_top_n if no_of_recommended_items_at_top_n != 0 else 1
        recall[uid] = no_of_relevant_recommended_items_at_top_n / no_of_relevant_items if no_of_relevant_items != 0 else 1
        
    return precision, recall

In [5]:
rating_threshold=3
precision_recall_at_n = []
all_precision = 0
all_recall = 0
for topn in range(2,20):
    precision, recall = get_precision_recall_at_n(predictions,topn,rating_threshold)
    precision_at_n = sum(prec for prec in precision.values()) / len(precision)
    recall_at_n = sum(rec for rec in recall.values()) / len(recall)   
    precision_recall_at_n.append({'topN' : topn, 'Precision' : precision_at_n, 'Recall': recall_at_n})
precision_recall_at_n

[{'topN': 2, 'Precision': 0.9377049180327869, 'Recall': 0.14798911497140263},
 {'topN': 3, 'Precision': 0.9229508196721311, 'Recall': 0.212382472826808},
 {'topN': 4, 'Precision': 0.9174863387978142, 'Recall': 0.2756918521553819},
 {'topN': 5, 'Precision': 0.917732240437159, 'Recall': 0.3350062804028265},
 {'topN': 6, 'Precision': 0.912486338797814, 'Recall': 0.3851047082765852},
 {'topN': 7, 'Precision': 0.9069437939110057, 'Recall': 0.42978142646920114},
 {'topN': 8, 'Precision': 0.9051288056206088, 'Recall': 0.4693828466156005},
 {'topN': 9, 'Precision': 0.9023510278428328, 'Recall': 0.5039661088855079},
 {'topN': 10, 'Precision': 0.9006206088992961, 'Recall': 0.5335213639205003},
 {'topN': 11, 'Precision': 0.9000989993612958, 'Recall': 0.55893545466625},
 {'topN': 12, 'Precision': 0.899937548790008, 'Recall': 0.5826123028545311},
 {'topN': 13, 'Precision': 0.8981300666546567, 'Recall': 0.6025723699086144},
 {'topN': 14, 'Precision': 0.896013330931363, 'Recall': 0.6207243668580433},

In [6]:
userId = 10

# Display top rated movies
def get_top_n_rated_movies_for(userId, n=10):
    r = df_ratings[df_ratings['userId'] == userId]
    r_sorted = r.sort_values('rating', ascending=False)
    top_rated_movies = r_sorted.head(n)
    movie_info = df_movies[df_movies['movieId'].isin(top_rated_movies['movieId'])]
    return movie_info

top_rated_movies = get_top_n_rated_movies_for(userId)
top_rated_movies.head(20)

Unnamed: 0,movieId,title,genres
4948,7458,Troy (2004),Action|Adventure|Drama|War
5227,8533,"Notebook, The (2004)",Drama|Romance
5332,8869,First Daughter (2004),Comedy|Romance
5917,33794,Batman Begins (2005),Action|Crime|IMAX
7156,71579,"Education, An (2009)",Drama|Romance
7371,79091,Despicable Me (2010),Animation|Children|Comedy|Crime
7466,81845,"King's Speech, The (2010)",Drama
7768,91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX
7955,96079,Skyfall (2012),Action|Adventure|Thriller|IMAX
9006,140110,The Intern (2015),Comedy


In [7]:
# Display predictions

def get_top_n_recommended_movies_for(userId, n=10):
    movies = list(filter(lambda p: p[0] == userId, predictions))
    movies_sorted = sorted(movies, key=lambda p: p.est, reverse=True)
    movieIds = [p.iid for p in movies_sorted]
    return df_movies[df_movies['movieId'].isin(movieIds)][:n]

recommended_movies = get_top_n_recommended_movies_for(userId)
recommended_movies.head(20)

Unnamed: 0,movieId,title,genres
506,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
514,597,Pretty Woman (1990),Comedy|Romance
694,912,Casablanca (1942),Drama|Romance
946,1247,"Graduate, The (1967)",Comedy|Drama|Romance
3157,4246,Bridget Jones's Diary (2001),Comedy|Drama|Romance
3287,4447,Legally Blonde (2001),Comedy|Romance
3640,4995,"Beautiful Mind, A (2001)",Drama|Romance
4137,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
4644,6942,Love Actually (2003),Comedy|Drama|Romance
4799,7151,Girl with a Pearl Earring (2003),Drama|Romance
