In [1]:
from surprise import Dataset, Reader, SVD, NMF, KNNBasic, SlopeOne, CoClustering
from surprise import accuracy
from surprise.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from sklearn.metrics.pairwise import pairwise_distances
import time
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
import os

# Data Loading

## Ratings

In [2]:
df_ratings = pd.read_csv('ml-100k/ratings.csv')

In [3]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


### Data cleanup

In [4]:
del df_ratings['timestamp']

In [5]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


## Movies Data

In [6]:
df_movies = pd.read_csv('ml-100k/movies.csv')

In [7]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Expanding genres for each movie and encoding it

In [8]:
df_movies_encoded = df_movies.copy()

In [9]:
for i in range(len(df_movies_encoded['genres'])):
    for item in df_movies_encoded['genres'][i].split('|'):
        if item not in df_movies_encoded:
            df_movies_encoded[item]=0

for i in range(len(df_movies_encoded.genres)):
    for item in df_movies_encoded['genres'][i].split('|'):
        df_movies_encoded[item][i]=1

In [10]:
df_movies_encoded.head()

Unnamed: 0,movieId,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Data cleanup

In [11]:
del df_movies_encoded['genres']

## Creating a joint table of users and movie data

In [12]:
df_movies_ratings = pd.merge(df_movies_encoded,df_ratings, how="outer", on=["movieId"])

In [13]:
df_movies_ratings.head()

Unnamed: 0,movieId,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),userId,rating
0,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1.0,4.0
1,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,5.0,4.0
2,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,7.0,4.5
3,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,15.0,2.5
4,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,17.0,4.5


# Initializing training and test data sets

In [14]:
reader = Reader(rating_scale=(0, 5))

In [15]:
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
type(data)

surprise.dataset.DatasetAutoFolds

## Training set

In [16]:
train = data.build_full_trainset()

## Test set

In [17]:
test = train.build_testset()

# Function returning top N recommendations

In [18]:
# top_n = defaultdict(list)
# for uid, iid, true_r, est, _ in knn_def_predictions_list:
#     top_n[uid].append((iid, est))
# for uid, user_ratings in top_n.items():
#     user_ratings.sort(key=lambda x: x[1], reverse=True)
#     top_n[uid] = user_ratings[10]
# top_n
# print("==========")
# print(top_n[1])
# print("==========")
# rec_df = pd.DataFrame(top_n[num_user], columns=['movieId', 'estimated_rating'])
# rec_df["title"] = rec_df["movieId"].map(df_movies["title"])
# rec_df = rec_df[['movieId','title','estimated_rating']]
# return rec_df

In [19]:
def get_top_n(predictions, user_id, n=10):
    num_user = pd.to_numeric(user_id)
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    rec_df = pd.DataFrame(top_n[num_user], columns=['movieId', 'estimated_rating'])
    rec_df["title"] = rec_df["movieId"].map(df_movies["title"])
    rec_df = rec_df[['movieId','title','estimated_rating']]
    return rec_df

# SVD Recommendation Model

## Default run

In [20]:
svd_def_model = SVD()

In [21]:
start = time.time()
svd_def_model.fit(train)
stop = time.time()
svd_def_model_fit_time = stop - start
svd_def_model_fit_time

5.296315908432007

In [22]:
svd_def_predictions_list = svd_def_model.test(test)
svd_def_predictions_list[:1]

[Prediction(uid=1, iid=1, r_ui=4.0, est=4.690175459279249, details={'was_impossible': False})]

In [23]:
svd_def_accuracy = accuracy.rmse(svd_def_predictions_list, verbose=True)
svd_def_accuracy

RMSE: 0.6412


0.64118761303731

In [24]:
print("===================================")
print("Recommendation for user : ")
user_id = input()
print("===================================")
get_top_n(svd_def_predictions_list,user_id,10)

Recommendation for user : 


 1




Unnamed: 0,movieId,title,estimated_rating
0,50,Georgia (1995),5
1,260,Quiz Show (1994),5
2,527,"Aristocats, The (1970)",5
3,608,Heavy (1995),5
4,923,Full Metal Jacket (1987),5
5,1136,Selena (1997),5
6,1196,Picture Perfect (1997),5
7,1197,In the Company of Men (1997),5
8,1208,Kull the Conqueror (1997),5
9,1213,"Kiss Me, Guido (1997)",5


## Best Params using Cross CV

In [25]:
start = time.time()
svd_cv_results = cross_validate(svd_def_model, data, measures=['RMSE'],verbose=True)
stop = time.time()
svd_cv_fit_time = stop - start
svd_cv_fit_time

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8813  0.8640  0.8653  0.8827  0.8820  0.8751  0.0085  
Fit time          4.60    4.55    4.87    4.69    4.62    4.67    0.11    
Test time         0.09    0.09    0.09    0.16    0.09    0.11    0.03    


24.409573078155518

In [26]:
svd_cv_results

{'test_rmse': array([0.88133674, 0.86395204, 0.86531036, 0.88266785, 0.88199737]),
 'fit_time': (4.597661972045898,
  4.55354905128479,
  4.871888160705566,
  4.6937549114227295,
  4.6210081577301025),
 'test_time': (0.09341907501220703,
  0.09327483177185059,
  0.09339165687561035,
  0.16299700736999512,
  0.09180521965026855)}

In [27]:
svd_cv_mean_rmse = svd_cv_results['test_rmse'].mean()
svd_cv_mean_fit_time = np.asarray(svd_cv_results['fit_time']).mean()
svd_cv_mean_test_time = np.asarray(svd_cv_results['test_time']).mean()
print('svd_cv_mean_rmse','svd_cv_mean_fit_time','svd_cv_mean_test_time')
print(svd_cv_mean_rmse,svd_cv_mean_fit_time,svd_cv_mean_test_time)

svd_cv_mean_rmse svd_cv_mean_fit_time svd_cv_mean_test_time
0.8750528736782393 4.667572450637818 0.10697755813598633


## Best Params using Grid Search CV

In [28]:
svd_param_grid = {'n_factors':[2,5,10],'n_epochs': [10,15,20,25], 'lr_all': [0.005, 0.010, 0.020],
              'reg_all': [0.02,0.04]}
svd_gs = GridSearchCV(SVD, svd_param_grid, measures=['rmse', 'mae'], cv=3, refit=True)
svd_gs

<surprise.model_selection.search.GridSearchCV at 0x7f92fab73ee0>

In [29]:
start = time.time()
svd_gs.fit(data)
stop = time.time()
svd_gs_fit_time = stop - start
svd_gs_fit_time

205.63737511634827

In [30]:
print(svd_gs.best_score['rmse'])
print(svd_gs.best_params['rmse'])

0.8692325852497578
{'n_factors': 2, 'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.04}


In [31]:
svd_gs_predictions_list = svd_gs.test(test)
svd_gs_predictions_list[:1]

[Prediction(uid=1, iid=1, r_ui=4.0, est=4.7292459884067215, details={'was_impossible': False})]

In [32]:
print("===================================")
print("Recommendation for user : ")
user_id = input()
print("===================================")
get_top_n(svd_gs_predictions_list,user_id,10)

Recommendation for user : 


 1




Unnamed: 0,movieId,title,estimated_rating
0,260,Quiz Show (1994),5.0
1,1197,In the Company of Men (1997),5.0
2,1213,"Kiss Me, Guido (1997)",5.0
3,2329,Babes in Toyland (1934),5.0
4,2959,Billy Elliot (2000),5.0
5,1089,Mother (1996),4.984823
6,50,Georgia (1995),4.980347
7,356,"Age of Innocence, The (1993)",4.958964
8,1198,Career Girls (1997),4.955601
9,1136,Selena (1997),4.950265


## Storing observations

In [57]:
comp_dict_df  = pd.read_csv('ml-100k/perf_nums.csv')

In [58]:
comp_dict_df

Unnamed: 0,model,Default Fit Time,Default RMSE,CrossCV Fit Time,CrossCV RMSE,GridSearchCV Fit Time,GridSearchCV RMSE,GridSearchCV Best Params
0,KNNBasic,0.115444,0.726239,0.075817,0.946784,77.426934,0.953747,"{'k': 20, 'sim_options': {'name': 'msd', 'user..."
1,SVD,,,,,,,
2,CoClustering,,,,,,,


In [59]:
comp_dict_df = comp_dict_df.set_index('model')

In [60]:
comp_dict_df

Unnamed: 0_level_0,Default Fit Time,Default RMSE,CrossCV Fit Time,CrossCV RMSE,GridSearchCV Fit Time,GridSearchCV RMSE,GridSearchCV Best Params
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNNBasic,0.115444,0.726239,0.075817,0.946784,77.426934,0.953747,"{'k': 20, 'sim_options': {'name': 'msd', 'user..."
SVD,,,,,,,
CoClustering,,,,,,,


In [61]:
comp_dict_df.loc['SVD'] = [svd_def_model_fit_time,svd_def_accuracy,
                       svd_cv_mean_fit_time,svd_cv_mean_rmse,
                       svd_gs_fit_time,
                       svd_gs.best_score['rmse'],svd_gs.best_params['rmse']]
# comp_dict_df.loc['SVD']

In [63]:
comp_dict_df = comp_dict_df.reset_index()

In [64]:
comp_dict_df.to_csv('ml-100k/perf_nums.csv',index=False)  

In [None]:
start = time.time()
knn_cv_results = cross_validate(knn_def_model, data, measures=['RMSE'],verbose=True)
stop = time.time()
knn_cv_fit_time = stop - start
knn_cv_fit_time

In [None]:
knn_cv_results

In [None]:
knn_cv_mean_rmse = knn_cv_results['test_rmse'].mean()
knn_cv_mean_fit_time = np.asarray(knn_cv_results['fit_time']).mean()
knn_cv_mean_test_time = np.asarray(knn_cv_results['test_time']).mean()

#### KNNBasic Gridsearch

In [None]:
names = ('msd', 'cosine', 'pearson')
options = list()
# fill options with dictionaries
for name in names:
    d = dict()
    d['name'] = name
    options.append(d)
options    

In [None]:
knn_param_grid = {'k':[40,100,200],'sim_options': {'name': ['msd', 'cosine','pearson']}}
knn_gs = GridSearchCV(KNNBasic, knn_param_grid, cv=3)
knn_gs

In [None]:
start = time.time()
knn_gs.fit(data)
stop = time.time()
knn_gs_fit_time = stop - start
knn_gs_fit_time

In [None]:
print(knn_gs.best_score['rmse'])
print(knn_gs.best_params['rmse'])

In [None]:
svd_def_model = SVD()

In [None]:
start = time.time()
svd_def_model.fit(train)
stop = time.time()
svd_def_model_fit_time = stop - start
svd_def_model_fit_time

In [None]:
svd_def_predictions_list = svd_def_model.test(test)

#### SVD Cross CV

In [None]:
start = time.time()
svd_cv_results = cross_validate(svd_def_model, data, measures=['RMSE'],verbose=True)
stop = time.time()
svd_cv_fit_time = stop - start
svd_cv_fit_time

In [None]:
svd_cv_results

In [None]:
svd_cv_mean_rmse = svd_cv_results['test_rmse'].mean()
svd_cv_mean_fit_time = np.asarray(svd_cv_results['fit_time']).mean()
svd_cv_mean_test_time = np.asarray(svd_cv_results['test_time']).mean()

#### SVD Gridsearch

In [None]:
svd_param_grid = {'n_factors':[2,100,500],'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.02,0.4, 0.6]}
svd_gs = GridSearchCV(SVD, svd_param_grid, measures=['rmse', 'mae'], cv=3)
svd_gs

In [None]:
start = time.time()
svd_gs.fit(data)
stop = time.time()
svd_gs_fit_time = stop - start
svd_gs_fit_time

In [None]:
print(svd_gs.best_score['rmse'])
print(svd_gs.best_params['rmse'])

In [None]:
start = time.time()
so_gs.fit(data)
stop = time.time()
so_gs_fit_time = stop - start
so_gs_fit_time

In [None]:
print(so_gs.best_score['rmse'])
print(so_gs.best_params['rmse'])