# 3. Model selection for recommender systems

In [1]:
import pandas as pd
import numpy as np

# Import the reader, cross-validation tools, models and metrics from surprise
from surprise import Reader, Dataset
from surprise.model_selection import LeaveOneOut, cross_validate
from surprise import KNNBaseline, KNNBasic, SVD, NMF, accuracy

In [3]:
# Read the filtered reviews file and preview its first rows
df = pd.read_csv(filepath_or_buffer='../data/reviews_filtered.csv')
df.head(n=5)

Unnamed: 0,name,review_stars,cleaned_text,sent_rating,uid
0,Deagan's Kitchen & Bar,5,we walked into melt did you want to put your n...,3.505159,0
1,Deagan's Kitchen & Bar,4,brunch on saturday was excellent the bloody ma...,3.8,1
2,Deagan's Kitchen & Bar,4,great food great atmosphere great service some...,3.5875,2
3,Deagan's Kitchen & Bar,3,had a saturday evening dinner with friends goi...,3.436378,3
4,Deagan's Kitchen & Bar,2,i haven't been here for years i'm not from thi...,2.981408,4


In [4]:
# Star-rating frame: keep uid, name and review_stars, relabelled as
# uid / restaurant / rating.
df_ratings = df[['uid', 'name', 'review_stars']].rename(
    columns={'name': 'restaurant', 'review_stars': 'rating'}
)
df_ratings.head(n=5)

Unnamed: 0,uid,restaurant,rating
0,0,Deagan's Kitchen & Bar,5
1,1,Deagan's Kitchen & Bar,4
2,2,Deagan's Kitchen & Bar,4
3,3,Deagan's Kitchen & Bar,3
4,4,Deagan's Kitchen & Bar,2


In [5]:
# Sentiment-rating frame: same layout as the star-rating frame, but the
# rating column is taken from sent_rating.
df_sentiment = df[['uid', 'name', 'sent_rating']].rename(
    columns={'name': 'restaurant', 'sent_rating': 'rating'}
)
df_sentiment.head(n=5)

Unnamed: 0,uid,restaurant,rating
0,0,Deagan's Kitchen & Bar,3.505159
1,1,Deagan's Kitchen & Bar,3.8
2,2,Deagan's Kitchen & Bar,3.5875
3,3,Deagan's Kitchen & Bar,3.436378
4,4,Deagan's Kitchen & Bar,2.981408


In [7]:
# Candidate models, keyed by the label used in the benchmark tables.
# The three kNN variants differ only in the similarity measure and are all
# configured with user_based=False.
algorithms_cv = {
    'svd': SVD(),
    'NMF': NMF(),
    **{
        f'kNN_{similarity}': KNNBasic(
            sim_options={'name': similarity, 'user_based': False}
        )
        for similarity in ('pearson', 'cosine', 'msd')
    },
}

In [8]:
def model_validation(df, algorithms, cv=10):
    """Cross-validate every algorithm on ``df`` and rank the results by RMSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings frame handed to ``Dataset.load_from_df`` together with a
        ``Reader`` using a 1-5 rating scale.
    algorithms : dict
        Maps a display label to the algorithm object passed to
        ``cross_validate``.
    cv : int, default 10
        Number of folds forwarded to ``cross_validate``; it is also used in
        the metric column labels (``RMSE(K=<cv>)`` etc.).

    Returns
    -------
    pandas.DataFrame
        One row per algorithm, indexed by ``Algorithm`` and sorted by
        ascending mean test RMSE. Columns hold the fold-averaged RMSE, MSE,
        MAE, fit time and test time.

    Raises
    ------
    ValueError
        If ``algorithms`` is empty.
    """
    if not algorithms:
        raise ValueError('At least one algorithm is required')

    data = Dataset.load_from_df(df, Reader(rating_scale=(1, 5)))

    benchmark = []
    # Iterate over all algorithms
    for model, algorithm in algorithms.items():
        # Perform cross validation
        results = cross_validate(algorithm, data, measures=['RMSE', 'MSE', 'MAE'],
                                 cv=cv, verbose=False)

        # Average every metric/timing over the folds. The row is kept as a
        # plain dict because Series.append was removed in pandas 2.0; this
        # also keeps the metric columns numeric instead of object dtype.
        row = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()
        row['Algorithm'] = model
        benchmark.append(row)

    output = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    # Relabel by name rather than by position so a change in the column order
    # returned by cross_validate cannot silently mislabel a metric.
    return output.rename(columns={
        'test_rmse': f'RMSE(K={cv})',
        'test_mse': f'MSE(K={cv})',
        'test_mae': f'MAE(K={cv})',
    })

In [64]:
# Benchmark all candidate models on the star ratings
results_ratings = model_validation(df=df_ratings, algorithms=algorithms_cv)
results_ratings

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Co

Unnamed: 0_level_0,RMSE(K=10),MSE(K=10),MAE(K=10),fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
svd,0.973449,0.947614,0.758114,18.564372,0.417548
kNN_msd,1.006659,1.013394,0.778598,11.720646,5.826927
NMF,1.015683,1.031625,0.791281,23.013564,0.303335
kNN_cosine,1.019681,1.039753,0.795297,21.941919,5.711867
kNN_pearson,1.048384,1.099115,0.813277,28.062884,5.360159


In [65]:
# Persist the star-rating benchmark table
results_ratings.to_csv(path_or_buf='../data/model_ratings.csv')

In [66]:
# Benchmark the same candidate models on the sentiment-derived ratings
results_sentiment = model_validation(df=df_sentiment, algorithms=algorithms_cv)
results_sentiment

Computing the pearson similarity matrix...


  sim = construction_func[name](*args)


Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Don

Unnamed: 0_level_0,RMSE(K=10),MSE(K=10),MAE(K=10),fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
svd,0.314934,0.099187,0.234481,18.636536,0.380283
kNN_msd,0.315911,0.099801,0.23373,21.014045,10.276739
kNN_cosine,0.317016,0.100501,0.234848,38.271068,10.105334
kNN_pearson,0.326278,0.106462,0.24166,37.860483,7.578257
NMF,0.328185,0.107709,0.245183,24.637818,0.383059


In [67]:
# Persist the sentiment-rating benchmark table
results_sentiment.to_csv(path_or_buf='../data/model_sentiment.csv')

In [68]:
# Rounded sentiment
# Replace the continuous sentiment rating with its nearest integer in one
# vectorised step. Series.round() resolves ties to the even integer, the same
# rule as the built-in round(), and the int64 cast keeps the integer dtype.
# assign() returns a new frame, so df_sentiment is left untouched and the
# column order (uid, restaurant, rating) is preserved.
df_sentiment_rounded = df_sentiment.assign(
    rating=df_sentiment['rating'].round().astype('int64')
)

In [None]:
# Benchmark the candidate models on the rounded sentiment ratings
results_sentiment_rounded = model_validation(
    df=df_sentiment_rounded,
    algorithms=algorithms_cv,
)
results_sentiment_rounded

In [None]:
# Write alongside the other benchmark tables, which are all saved under
# ../data/ (this one previously landed in the notebook's working directory).
results_sentiment_rounded.to_csv('../data/model_sentiment_rounded.csv')

In [15]:
# Algorithm classes (not instances) to grid-search, keyed by the model name
# that parameter_selection uses to pick a parameter grid.
# Only SVD is enabled; add 'kNN': KNNBasic to tune kNN as well.
algorithms_tunning = {'SVD': SVD}

In [23]:
# Hyperparameter tuning

from surprise.model_selection import GridSearchCV
def parameter_selection(df, algorithms, cv=2):
    """Grid-search hyperparameters for every algorithm in ``algorithms``.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings frame handed to ``Dataset.load_from_df`` together with a
        ``Reader`` using a 1-5 rating scale.
    algorithms : dict
        Maps a model name (``'SVD'`` or ``'kNN'``) to the algorithm class
        passed to ``GridSearchCV``. The name selects the parameter grid.
    cv : int, default 2
        Cross-validation setting forwarded to ``GridSearchCV``.

    Returns
    -------
    pandas.DataFrame
        For a single algorithm, its ``cv_results`` table. For several
        algorithms, the individual tables stacked under an outer index level
        named ``model``.

    Raises
    ------
    ValueError
        If ``algorithms`` is empty or contains an unsupported model name.
    """
    param_grids = {
        'SVD': {'n_factors': [50, 100, 200, 500],
                'n_epochs': [20, 30, 50],
                'lr_all': [0.002, 0.005, 0.01],
                'reg_all': [0.02, 0.06, 0.08]},
        'kNN': {'k': [20, 30, 40, 50],  # number of neighbors for computation
                'sim_options': {'name': ['msd', 'cosine'],
                                'min_support': [5, 10],  # minimum number of common users
                                'user_based': [False]}},
    }

    if not algorithms:
        raise ValueError('At least one algorithm is required')
    # Validate every name before starting any (expensive) grid search.
    for model in algorithms:
        if model not in param_grids:
            raise ValueError('The model provided is not valid')

    data = Dataset.load_from_df(df, Reader(rating_scale=(1, 5)))

    several = len(algorithms) > 1
    frames = {}
    # The search runs inside the loop so that every algorithm is tuned, not
    # just the last one in the dict.
    for model, algorithm in algorithms.items():
        gs = GridSearchCV(algorithm, param_grids[model], measures=['rmse'], cv=cv)
        gs.fit(data)
        params = gs.best_params['rmse']
        # Only label the message when there is more than one model to tell apart.
        prefix = f'{model}: ' if several else ''
        print(f'{prefix}The best combination of parameters is {params}')
        frames[model] = pd.DataFrame.from_dict(gs.cv_results)

    if not several:
        return next(iter(frames.values()))
    return pd.concat(frames, names=['model'])

In [24]:
# Grid-search the SVD hyperparameters on the star ratings
parameter_tunning_SVD = parameter_selection(
    df=df_ratings,
    algorithms=algorithms_tunning,
)

The best combination of parameters is {'n_factors': 50, 'n_epochs': 50, 'lr_all': 0.002, 'reg_all': 0.08}


In [11]:
# Tune kNN explicitly. `algorithms_tunning` only lists SVD, so passing it here
# would grid-search SVD on a fresh kernel and store the result under a kNN name.
parameter_tunning_kNN = parameter_selection(df_sentiment, {'kNN': KNNBasic})

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...


In [13]:
# Display the grid-search results table stored in parameter_tunning_kNN
parameter_tunning_kNN

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_k,param_sim_options
0,0.331176,0.326122,0.3272,0.325552,0.32683,0.327376,0.001983,7,9.316437,0.483462,6.963662,0.114057,"{'k': 20, 'sim_options': {'name': 'msd', 'min_...",20,"{'name': 'msd', 'min_support': 5, 'user_based'..."
1,0.337828,0.331836,0.334213,0.331012,0.33461,0.3339,0.002393,13,9.093746,0.103283,6.56829,0.01813,"{'k': 20, 'sim_options': {'name': 'msd', 'min_...",20,"{'name': 'msd', 'min_support': 10, 'user_based..."
2,0.332308,0.327461,0.328498,0.32662,0.328066,0.328591,0.001963,8,13.982414,0.407793,6.888127,0.044124,"{'k': 20, 'sim_options': {'name': 'cosine', 'm...",20,"{'name': 'cosine', 'min_support': 5, 'user_bas..."
3,0.338881,0.332774,0.335301,0.331862,0.335539,0.334871,0.002456,16,13.369857,0.140478,6.521981,0.040281,"{'k': 20, 'sim_options': {'name': 'cosine', 'm...",20,"{'name': 'cosine', 'min_support': 10, 'user_ba..."
4,0.330047,0.324929,0.326025,0.324253,0.325598,0.32617,0.00203,3,9.056341,0.174814,7.453658,0.008212,"{'k': 30, 'sim_options': {'name': 'msd', 'min_...",30,"{'name': 'msd', 'min_support': 5, 'user_based'..."
5,0.337288,0.331343,0.333745,0.330326,0.334209,0.333382,0.002432,11,9.019775,0.122308,7.091444,0.011718,"{'k': 30, 'sim_options': {'name': 'msd', 'min_...",30,"{'name': 'msd', 'min_support': 10, 'user_based..."
6,0.331096,0.325949,0.327115,0.325077,0.326623,0.327172,0.002078,6,13.752759,0.15687,7.496678,0.025835,"{'k': 30, 'sim_options': {'name': 'cosine', 'm...",30,"{'name': 'cosine', 'min_support': 5, 'user_bas..."
7,0.338131,0.332051,0.334379,0.331112,0.334822,0.334099,0.002448,15,13.215264,0.153282,7.065522,0.058042,"{'k': 30, 'sim_options': {'name': 'cosine', 'm...",30,"{'name': 'cosine', 'min_support': 10, 'user_ba..."
8,0.329721,0.324714,0.325875,0.323905,0.325425,0.325928,0.00201,2,9.047763,0.128921,7.899655,0.011416,"{'k': 40, 'sim_options': {'name': 'msd', 'min_...",40,"{'name': 'msd', 'min_support': 5, 'user_based'..."
9,0.337264,0.331393,0.333733,0.330261,0.33409,0.333348,0.002424,10,9.042331,0.152299,7.470124,0.025896,"{'k': 40, 'sim_options': {'name': 'msd', 'min_...",40,"{'name': 'msd', 'min_support': 10, 'user_based..."


In [27]:
# Persist the SVD grid-search results table
parameter_tunning_SVD.to_csv(path_or_buf='../data/tunning_SVD.csv')