In [25]:
from surprise import Dataset, Reader, SVD, NMF, KNNBasic, SlopeOne, CoClustering
from surprise import accuracy
from surprise.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from sklearn.metrics.pairwise import pairwise_distances
import time
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
import os

# Data Loading

## Ratings

In [26]:
df_ratings = pd.read_csv('ml-100k/ratings.csv')

In [27]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


### Data cleanup

In [28]:
del df_ratings['timestamp']

In [29]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


## Movies Data

In [30]:
df_movies = pd.read_csv('ml-100k/movies.csv')

In [31]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Expanding genres for each movie and encoding it

In [32]:
df_movies_encoded = df_movies.copy()

In [33]:
for i in range(len(df_movies_encoded['genres'])):
    for item in df_movies_encoded['genres'][i].split('|'):
        if item not in df_movies_encoded:
            df_movies_encoded[item]=0

for i in range(len(df_movies_encoded.genres)):
    for item in df_movies_encoded['genres'][i].split('|'):
        df_movies_encoded[item][i]=1

In [34]:
df_movies_encoded.head()

Unnamed: 0,movieId,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Data cleanup

In [35]:
del df_movies_encoded['genres']

## Creating a joint table of users and movie data

In [36]:
df_movies_ratings = pd.merge(df_movies_encoded,df_ratings, how="outer", on=["movieId"])

In [37]:
df_movies_ratings.head()

Unnamed: 0,movieId,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),userId,rating
0,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1.0,4.0
1,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,5.0,4.0
2,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,7.0,4.5
3,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,15.0,2.5
4,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,17.0,4.5


# Initializing training and test data sets

In [38]:
reader = Reader(rating_scale=(0, 5))

In [39]:
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
type(data)

surprise.dataset.DatasetAutoFolds

## Training set

In [40]:
train = data.build_full_trainset()

## Test set

In [41]:
test = train.build_testset()

# Function returning top N recommendations

In [42]:
# top_n = defaultdict(list)
# for uid, iid, true_r, est, _ in knn_def_predictions_list:
#     top_n[uid].append((iid, est))
# for uid, user_ratings in top_n.items():
#     user_ratings.sort(key=lambda x: x[1], reverse=True)
#     top_n[uid] = user_ratings[10]
# top_n
# print("==========")
# print(top_n[1])
# print("==========")
# rec_df = pd.DataFrame(top_n[num_user], columns=['movieId', 'estimated_rating'])
# rec_df["title"] = rec_df["movieId"].map(df_movies["title"])
# rec_df = rec_df[['movieId','title','estimated_rating']]
# return rec_df

In [43]:
def get_top_n(predictions, user_id, n=10):
    num_user = pd.to_numeric(user_id)
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    rec_df = pd.DataFrame(top_n[num_user], columns=['movieId', 'estimated_rating'])
    rec_df["title"] = rec_df["movieId"].map(df_movies["title"])
    rec_df = rec_df[['movieId','title','estimated_rating']]
    return rec_df

# KNNBasic Recommendation Model

## Default run

In [44]:
knn_def_model = KNNBasic()

In [45]:
start = time.time()
knn_def_model.fit(train)
stop = time.time()
knn_def_model_fit_time = stop - start
knn_def_model_fit_time

Computing the msd similarity matrix...
Done computing similarity matrix.


0.11544394493103027

In [46]:
knn_def_predictions_list = knn_def_model.test(test)
knn_def_predictions_list[:1]

[Prediction(uid=1, iid=1, r_ui=4.0, est=4.194241094435477, details={'actual_k': 40, 'was_impossible': False})]

In [47]:
knn_def_accuracy = accuracy.rmse(knn_def_predictions_list, verbose=True)
knn_def_accuracy

RMSE: 0.7262


0.7262389271085168

In [49]:
print("===================================")
print("Recommendation for user : ")
user_id = input()
print("===================================")
get_top_n(knn_def_predictions_list,user_id,10)

Recommendation for user : 


 1




Unnamed: 0,movieId,title,estimated_rating
0,2571,Teenage Mutant Ninja Turtles II: The Secret of...,4.757041
1,260,Quiz Show (1994),4.730315
2,47,Mighty Aphrodite (1995),4.654646
3,1198,Career Girls (1997),4.639253
4,50,Georgia (1995),4.629333
5,2959,Billy Elliot (2000),4.622299
6,527,"Aristocats, The (1970)",4.619707
7,1196,Picture Perfect (1997),4.611186
8,608,Heavy (1995),4.587934
9,1210,187 (One Eight Seven) (1997),4.58436


## Best Params using Cross CV

In [50]:
start = time.time()
knn_cv_results = cross_validate(knn_def_model, data, measures=['RMSE'],verbose=True)
stop = time.time()
knn_cv_fit_time = stop - start
knn_cv_fit_time

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9488  0.9353  0.9469  0.9510  0.9519  0.9468  0.0060  
Fit time          0.07    0.08    0.08    0.08    0.08    0.08    0.00    
Test time         0.94    0.95    1.02    0.95    0.92    0.96    0.03    


5.683795213699341

In [51]:
knn_cv_results

{'test_rmse': array([0.94881034, 0.93530228, 0.94694667, 0.95095449, 0.95190487]),
 'fit_time': (0.0704658031463623,
  0.07779526710510254,
  0.07808303833007812,
  0.07546710968017578,
  0.07727479934692383),
 'test_time': (0.9367721080780029,
  0.9501519203186035,
  1.0206098556518555,
  0.9518430233001709,
  0.9220991134643555)}

In [52]:
knn_cv_mean_rmse = knn_cv_results['test_rmse'].mean()
knn_cv_mean_fit_time = np.asarray(knn_cv_results['fit_time']).mean()
knn_cv_mean_test_time = np.asarray(knn_cv_results['test_time']).mean()
print('knn_cv_mean_rmse','knn_cv_mean_fit_time','knn_cv_mean_test_time')
print(knn_cv_mean_rmse,knn_cv_mean_fit_time,knn_cv_mean_test_time)

knn_cv_mean_rmse knn_cv_mean_fit_time knn_cv_mean_test_time
0.9467837284774555 0.07581720352172852 0.9562952041625976


## Best Params using Grid Search CV

In [53]:
names = ('msd', 'cosine', 'pearson')
options = list()
# fill options with dictionaries
for name in names:
    d = dict()
    d['name'] = name
    options.append(d)
options    

[{'name': 'msd'}, {'name': 'cosine'}, {'name': 'pearson'}]

In [54]:
knn_param_grid = {'k':[10,20,30,40,100,200],'sim_options': {'name': ['msd', 'cosine','pearson']}}
# knn_param_grid = {'k':[40,100,200],'sim_options': {'name': ['cosine']}}
knn_gs = GridSearchCV(KNNBasic, knn_param_grid, cv=3, refit=True)
knn_gs

<surprise.model_selection.search.GridSearchCV at 0x7fda709a8670>

In [55]:
start = time.time()
knn_gs.fit(data)
stop = time.time()
knn_gs_fit_time = stop - start
knn_gs_fit_time

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine simila

77.42693400382996

In [56]:
print(knn_gs.best_score['rmse'])
print(knn_gs.best_params['rmse'])

0.9537472635426192
{'k': 20, 'sim_options': {'name': 'msd', 'user_based': True}}


In [57]:
knn_gs_predictions_list = knn_gs.test(test)
knn_gs_predictions_list[:1]

[Prediction(uid=1, iid=1, r_ui=4.0, est=4.166807606099762, details={'actual_k': 20, 'was_impossible': False})]

In [58]:
print("===================================")
print("Recommendation for user : ")
user_id = input()
print("===================================")
get_top_n(knn_gs_predictions_list,user_id,10)

Recommendation for user : 


 1




Unnamed: 0,movieId,title,estimated_rating
0,260,Quiz Show (1994),4.817907
1,2571,Teenage Mutant Ninja Turtles II: The Secret of...,4.782302
2,608,Heavy (1995),4.772138
3,1198,Career Girls (1997),4.742136
4,527,"Aristocats, The (1970)",4.737801
5,50,Georgia (1995),4.734306
6,47,Mighty Aphrodite (1995),4.726042
7,1208,Kull the Conqueror (1997),4.699737
8,1196,Picture Perfect (1997),4.678509
9,1210,187 (One Eight Seven) (1997),4.65271


## Storing observations

In [65]:
comp_dict = {'model':['KNNBasic','SVD','CoClustering'],
'Default Fit Time':[knn_def_model_fit_time,'',''],
'Default RMSE':[knn_def_accuracy,'',''],
'CrossCV Fit Time':[knn_cv_mean_fit_time,'',''],
'CrossCV RMSE':[knn_cv_mean_rmse,'',''],
'GridSearchCV Fit Time':[knn_gs_fit_time,'',''],
'GridSearchCV RMSE':[knn_gs.best_score['rmse'],'',''],
'GridSearchCV Best Params':[knn_gs.best_params['rmse'],'','']}

comp_dict

{'model': ['KNNBasic', 'SVD', 'CoClustering'],
 'Default Fit Time': [0.11544394493103027, '', ''],
 'Default RMSE': [0.7262389271085168, '', ''],
 'CrossCV Fit Time': [0.07581720352172852, '', ''],
 'CrossCV RMSE': [0.9467837284774555, '', ''],
 'GridSearchCV Fit Time': [77.42693400382996, '', ''],
 'GridSearchCV RMSE': [0.9537472635426192, '', ''],
 'GridSearchCV Best Params': [{'k': 20,
   'sim_options': {'name': 'msd', 'user_based': True}},
  '',
  '']}

In [66]:
pd.set_option('display.max_colwidth', None)
comp_dict_df = pd.DataFrame(comp_dict)
# comp_dict_df.sort_values(by='CrossCV RMSE')
comp_dict_df

Unnamed: 0,model,Default Fit Time,Default RMSE,CrossCV Fit Time,CrossCV RMSE,GridSearchCV Fit Time,GridSearchCV RMSE,GridSearchCV Best Params
0,KNNBasic,0.115444,0.726239,0.075817,0.946784,77.426934,0.953747,"{'k': 20, 'sim_options': {'name': 'msd', 'user_based': True}}"
1,SVD,,,,,,,
2,CoClustering,,,,,,,


In [67]:
comp_dict_df.to_csv('ml-100k/perf_nums.csv',index=False)  