In [1]:
from surprise import Dataset, Reader, SVD, NMF, KNNBasic, SlopeOne, CoClustering
from surprise import accuracy
from surprise.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from sklearn.metrics.pairwise import pairwise_distances
import time
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')
import os

# Data Loading

## Ratings

In [2]:
df_ratings = pd.read_csv('ml-100k/ratings.csv')

In [3]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


### Data cleanup

In [4]:
del df_ratings['timestamp']

In [5]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


## Movies Data

In [6]:
df_movies = pd.read_csv('ml-100k/movies.csv')

In [7]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Expanding genres for each movie and encoding it

In [8]:
df_movies_encoded = df_movies.copy()

In [9]:
for i in range(len(df_movies_encoded['genres'])):
    for item in df_movies_encoded['genres'][i].split('|'):
        if item not in df_movies_encoded:
            df_movies_encoded[item]=0

for i in range(len(df_movies_encoded.genres)):
    for item in df_movies_encoded['genres'][i].split('|'):
        df_movies_encoded[item][i]=1

In [10]:
df_movies_encoded.head()

Unnamed: 0,movieId,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Data cleanup

In [11]:
del df_movies_encoded['genres']

## Creating a joint table of users and movie data

In [12]:
df_movies_ratings = pd.merge(df_movies_encoded,df_ratings, how="outer", on=["movieId"])

In [13]:
df_movies_ratings.head()

Unnamed: 0,movieId,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed),userId,rating
0,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,1.0,4.0
1,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,5.0,4.0
2,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,7.0,4.5
3,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,15.0,2.5
4,1,Toy Story (1995),1,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,17.0,4.5


# Initializing training and test data sets

In [14]:
reader = Reader(rating_scale=(0, 5))

In [15]:
data = Dataset.load_from_df(df_ratings[['userId', 'movieId', 'rating']], reader)
type(data)

surprise.dataset.DatasetAutoFolds

## Training set

In [16]:
train = data.build_full_trainset()

## Test set

In [17]:
test = train.build_testset()

# Function returning top N recommendations

In [18]:
# top_n = defaultdict(list)
# for uid, iid, true_r, est, _ in knn_def_predictions_list:
#     top_n[uid].append((iid, est))
# for uid, user_ratings in top_n.items():
#     user_ratings.sort(key=lambda x: x[1], reverse=True)
#     top_n[uid] = user_ratings[10]
# top_n
# print("==========")
# print(top_n[1])
# print("==========")
# rec_df = pd.DataFrame(top_n[num_user], columns=['movieId', 'estimated_rating'])
# rec_df["title"] = rec_df["movieId"].map(df_movies["title"])
# rec_df = rec_df[['movieId','title','estimated_rating']]
# return rec_df

In [19]:
def get_top_n(predictions, user_id, n=10):
    num_user = pd.to_numeric(user_id)
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    rec_df = pd.DataFrame(top_n[num_user], columns=['movieId', 'estimated_rating'])
    rec_df["title"] = rec_df["movieId"].map(df_movies["title"])
    rec_df = rec_df[['movieId','title','estimated_rating']]
    return rec_df

# CoClustering Recommendation Model

## Default run

In [20]:
co_def_model = CoClustering()

In [21]:
start = time.time()
co_def_model.fit(train)
stop = time.time()
co_def_model_fit_time = stop - start
co_def_model_fit_time

1.4205589294433594

In [22]:
co_def_predictions_list = co_def_model.test(test)
co_def_predictions_list[:1]

[Prediction(uid=1, iid=1, r_ui=4.0, est=4.680627484665839, details={'was_impossible': False})]

In [23]:
co_def_accuracy = accuracy.rmse(co_def_predictions_list, verbose=True)
co_def_accuracy

RMSE: 0.8263


0.8263016437666547

In [24]:
print("===================================")
print("Recommendation for user : ")
user_id = input()
print("===================================")
get_top_n(co_def_predictions_list,user_id,10)

Recommendation for user : 


 1




Unnamed: 0,movieId,title,estimated_rating
0,954,Bad Taste (1987),5
1,1197,In the Company of Men (1997),5
2,1198,Career Girls (1997),5
3,1208,Kull the Conqueror (1997),5
4,1213,"Kiss Me, Guido (1997)",5
5,1222,Wishmaster (1997),5
6,1258,Boogie Nights (1997),5
7,1278,Artemisia (1997),5
8,2028,"South Park: Bigger, Longer and Uncut (1999)",5
9,2502,Blood Feast (1963),5


## Best Params using Cross CV

In [25]:
start = time.time()
co_cv_results = cross_validate(co_def_model, data, measures=['RMSE'],verbose=True)
stop = time.time()
co_cv_fit_time = stop - start
co_cv_fit_time

Evaluating RMSE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9455  0.9469  0.9508  0.9379  0.9587  0.9480  0.0068  
Fit time          1.46    1.38    1.43    1.39    1.55    1.44    0.06    
Test time         0.07    0.06    0.07    0.13    0.07    0.08    0.03    


8.156671047210693

In [26]:
co_cv_results

{'test_rmse': array([0.94553129, 0.94686831, 0.95078226, 0.93792969, 0.95874608]),
 'fit_time': (1.4559109210968018,
  1.3787977695465088,
  1.4306659698486328,
  1.3948402404785156,
  1.550793170928955),
 'test_time': (0.07216882705688477,
  0.06454801559448242,
  0.06728291511535645,
  0.13221192359924316,
  0.06837606430053711)}

In [27]:
co_cv_mean_rmse = co_cv_results['test_rmse'].mean()
co_cv_mean_fit_time = np.asarray(co_cv_results['fit_time']).mean()
co_cv_mean_test_time = np.asarray(co_cv_results['test_time']).mean()
print('co_cv_mean_rmse','co_cv_mean_fit_time','co_cv_mean_test_time')
print(co_cv_mean_rmse,co_cv_mean_fit_time,co_cv_mean_test_time)

co_cv_mean_rmse co_cv_mean_fit_time co_cv_mean_test_time
0.947971526819406 1.4422016143798828 0.08091754913330078


## Best Params using Grid Search CV

In [33]:
co_param_grid = {'n_epochs': [80,100,150,200], 'n_cltr_u': [3,5,7]}
co_gs = GridSearchCV(CoClustering, co_param_grid, cv=3, refit=True)
co_gs

<surprise.model_selection.search.GridSearchCV at 0x7fb63aeec190>

In [34]:
start = time.time()
co_gs.fit(data)
stop = time.time()
co_gs_fit_time = stop - start
co_gs_fit_time

296.95802879333496

In [35]:
print(co_gs.best_score['rmse'])
print(co_gs.best_params['rmse'])

0.9543537754184542
{'n_epochs': 150, 'n_cltr_u': 5}


In [36]:
co_gs_predictions_list = co_gs.test(test)
co_gs_predictions_list[:1]

[Prediction(uid=1, iid=1, r_ui=4.0, est=4.935938091551188, details={'was_impossible': False})]

In [37]:
print("===================================")
print("Recommendation for user : ")
user_id = input()
print("===================================")
get_top_n(co_gs_predictions_list,user_id,10)

Recommendation for user : 


 1




Unnamed: 0,movieId,title,estimated_rating
0,6,Sabrina (1995),5.0
1,1197,In the Company of Men (1997),5.0
2,1224,"Game, The (1997)",5.0
3,1270,"I Love You, I Love You Not (1996)",5.0
4,1278,Artemisia (1997),5.0
5,2761,Starman (1984),5.0
6,3703,John Q (2002),5.0
7,3729,Royal Wedding (1951),5.0
8,940,Dead Alive (Braindead) (1992),4.996985
9,1090,"Thieves (Voleurs, Les) (1996)",4.995


## Storing observations

In [49]:
comp_dict_df  = pd.read_csv('ml-100k/perf_nums.csv')

In [50]:
comp_dict_df

Unnamed: 0,model,Default Fit Time,Default RMSE,CrossCV Fit Time,CrossCV RMSE,GridSearchCV Fit Time,GridSearchCV RMSE,GridSearchCV Best Params
0,KNNBasic,0.115444,0.726239,0.075817,0.946784,77.426934,0.953747,"{'k': 20, 'sim_options': {'name': 'msd', 'user..."
1,SVD,5.296316,0.641188,4.667572,0.875053,205.637375,0.869233,"{'n_factors': 2, 'n_epochs': 20, 'lr_all': 0.0..."
2,CoClustering,,,,,,,


In [51]:
comp_dict_df = comp_dict_df.set_index('model')

In [52]:
comp_dict_df

Unnamed: 0_level_0,Default Fit Time,Default RMSE,CrossCV Fit Time,CrossCV RMSE,GridSearchCV Fit Time,GridSearchCV RMSE,GridSearchCV Best Params
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
KNNBasic,0.115444,0.726239,0.075817,0.946784,77.426934,0.953747,"{'k': 20, 'sim_options': {'name': 'msd', 'user..."
SVD,5.296316,0.641188,4.667572,0.875053,205.637375,0.869233,"{'n_factors': 2, 'n_epochs': 20, 'lr_all': 0.0..."
CoClustering,,,,,,,


In [53]:
comp_dict_df.loc['CoClustering'] = [co_def_model_fit_time,co_def_accuracy,
                       co_cv_mean_fit_time,co_cv_mean_rmse,
                       co_gs_fit_time,
                       co_gs.best_score['rmse'],co_gs.best_params['rmse']]
# comp_dict_df.loc['SVD']

In [54]:
comp_dict_df = comp_dict_df.reset_index()

In [55]:
comp_dict_df.to_csv('ml-100k/perf_nums.csv',index=False)  

In [56]:
comp_dict_df

Unnamed: 0,model,Default Fit Time,Default RMSE,CrossCV Fit Time,CrossCV RMSE,GridSearchCV Fit Time,GridSearchCV RMSE,GridSearchCV Best Params
0,KNNBasic,0.115444,0.726239,0.075817,0.946784,77.426934,0.953747,"{'k': 20, 'sim_options': {'name': 'msd', 'user..."
1,SVD,5.296316,0.641188,4.667572,0.875053,205.637375,0.869233,"{'n_factors': 2, 'n_epochs': 20, 'lr_all': 0.0..."
2,CoClustering,1.420559,0.826302,1.442202,0.947972,296.958029,0.954354,"{'n_epochs': 150, 'n_cltr_u': 5}"
