# Import Packages & Data

In [131]:
import pandas as pd 
import numpy as np
from scipy import sparse
import pyspark as spark
from pyspark.sql import SparkSession

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import Reader, Dataset
from surprise import accuracy

# importing relevant libraries
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline
from surprise.prediction_algorithms import KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import GridSearchCV

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [132]:
#For reproducibility

import random
import numpy as np

# One seed shared by NumPy's and Python's global random generators.
my_seed = 24
np.random.seed(my_seed)
random.seed(my_seed)

## Import Data & Train Test Split

In [133]:
# Read the ratings table from the extension-less CSV file named 'ratings'.
ratings = pd.read_csv(filepath_or_buffer='ratings')

In [134]:
# Wrap the ratings frame in a Surprise Dataset.
# NOTE(review): Reader() is built with its defaults; Surprise's default
# rating_scale is (1, 5) - confirm that matches the value range in `ratings`.
reader = Reader()
data = Dataset.load_from_df(df=ratings, reader=reader)

In [135]:
# Hold out 20% of the ratings for evaluation.
# `train_test_split` here is the Surprise version (the later import shadows
# the scikit-learn function of the same name).
# Passing the seed explicitly makes the partition independent of how much of
# the global NumPy random state has already been consumed, so re-running this
# cell always yields the same train/test sets.
train, test = train_test_split(data, test_size=.2, random_state=my_seed)

## Recreating the model execution function from Part 1

In [136]:
# Empty results table; one (Model, RMSE) row is added per tuned model below.
refined_models = pd.DataFrame(
    columns=['Model', 'RMSE'],
)

In [137]:
#Helper function to quickly fit and score a model

def execute(model_name,model,train_set,test_set):
    """Fit a model, score it on the test set, and return a summary record.

    Parameters
    ----------
    model_name : label stored under the 'Model' key of the result.
    model : object exposing ``fit(train_set)`` and ``test(test_set)``.
    train_set : data passed to ``model.fit``.
    test_set : data passed to ``model.test``.

    Returns
    -------
    dict
        ``{'Model': model_name, 'RMSE': <value returned by accuracy.rmse>}``.
    """
    model.fit(train_set)

    # Score the held-out predictions with the module-level `accuracy.rmse`.
    rmse_score = accuracy.rmse(model.test(test_set))

    return {'Model': model_name, 'RMSE': rmse_score}

# Vanilla Model Summary

In [138]:
# Load the untuned ("vanilla") model scores and order them by ascending RMSE,
# so the best-scoring models come first.
vanilla_model_summary = pd.read_csv('all_models').sort_values(by='RMSE')

In [139]:
#We'll now focus on refining the top 3 models through gridsearch

In [140]:
# First three rows of the RMSE-sorted summary.
vanilla_model_summary.head(3)

Unnamed: 0,Model,RMSE
7,SVDpp,0.861879
0,Baseline,0.876181
6,SVD,0.878338


# Model Refinement

## SVDpp

In [141]:
# Candidate hyper-parameters for the SVD++ grid search
# (6 * 6 * 4 * 6 = 864 combinations).
param_grid = dict(
    n_factors=[20, 25, 50, 100, 125, 150],
    n_epochs=[5, 10, 15, 20, 25, 30],
    lr_all=[0.002, 0.005, 0.007, 0.01],
    reg_all=[0.01, 0.02, 0.04, 0.06, 0.08, 0.10],
)

In [142]:
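# NOTE(review): this search originally passed SVD rather than SVDpp; confirm which algorithm produced the hard-coded SVDpp values below.
# SVDpp_model = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3)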
# SVDpp_model.fit(data)

In [143]:
# SVDpp_model.best_score['rmse']
# SVDpp_model.best_params['rmse']

In [144]:
# Hyper-parameters hard-coded for SVD++ so the long-running grid search
# above does not have to be executed again.
n_factors, n_epochs = 100, 30
lr_all, reg_all = 0.01, 0.08


In [145]:
# Train SVD++ with the hard-coded hyper-parameters, score it on the hold-out
# set, and add the result to the running table of refined models.
model_name = 'SVDpp_Gridsearch'
model = SVDpp(n_factors = n_factors, n_epochs = n_epochs, lr_all = lr_all, reg_all = reg_all)
SVDpp_results = execute(model_name,model,train,test)
SVDpp_df = pd.DataFrame([SVDpp_results], columns=SVDpp_results.keys())
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and keeps each row's original index, as append did.
refined_models = pd.concat([refined_models, SVDpp_df])

RMSE: 0.8523


## SVD

In [146]:
# Candidate hyper-parameters for the SVD grid search
# (5 * 4 * 3 * 5 = 300 combinations).
param_grid = dict(
    n_factors=[25, 50, 100, 125, 150],
    n_epochs=[5, 10, 15, 20],
    lr_all=[0.002, 0.005, 0.01],
    reg_all=[0.02, 0.04, 0.06, 0.08, 0.10],
)

In [147]:
# SVD_model = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
# SVD_model.fit(data)

In [148]:
# SVD_model.best_score['rmse']
# SVD_model.best_params['rmse']

In [149]:
# Hyper-parameters hard-coded for SVD so the long-running grid search
# above does not have to be executed again.
n_factors, n_epochs = 100, 20
lr_all, reg_all = 0.01, 0.06

In [150]:
# Train SVD with the hard-coded hyper-parameters, score it on the hold-out
# set, and add the result to the running table of refined models.
model_name = 'SVD_Gridsearch'
model = SVD(n_factors = n_factors, n_epochs = n_epochs, lr_all = lr_all, reg_all = reg_all)

SVD_results = execute(model_name,model,train,test)
SVD_df = pd.DataFrame([SVD_results], columns=SVD_results.keys())
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and keeps each row's original index, as append did.
refined_models = pd.concat([refined_models, SVD_df])

RMSE: 0.8603


## Baseline

In [151]:
# Candidate baseline options for the BaselineOnly grid search. All options
# are nested under the single 'bsl_options' key
# (2 * 4 * 4 * 4 * 3 * 5 = 1920 combinations).
param_grid = {
    'bsl_options': dict(
        method=['als', 'sgd'],
        n_epochs=[5, 10, 15, 20],
        reg_u=[5, 10, 12, 15],
        reg_i=[5, 10, 12, 15],
        learning_rate=[0.002, 0.005, 0.01],
        reg=[0.02, 0.04, 0.06, 0.08, 0.10],
    ),
}

In [152]:
# baseline_model = GridSearchCV(BaselineOnly, param_grid, measures=['rmse'], cv=3)
# baseline_model.fit(data)

In [153]:
# baseline_model.best_score['rmse']
# baseline_model.best_params['rmse']

In [154]:
# Baseline options hard-coded so the long-running grid search above does not
# have to be executed again.
# NOTE(review): 'learning_rate' and 'reg' look like SGD settings while
# method is 'als' - confirm whether they have any effect here.
Baseline_best_params = dict(
    method='als',
    n_epochs=20,
    reg_u=5,
    reg_i=5,
    learning_rate=0.002,
    reg=0.02,
)

In [155]:
# Train BaselineOnly with the hard-coded options, score it on the hold-out
# set, and add the result to the running table of refined models.
model_name = 'Baseline_Gridsearch'
model = BaselineOnly(bsl_options=Baseline_best_params)

# execute() fits the model itself, so no separate model.fit(train) call is
# needed; the earlier extra call trained the model twice.
baseline_results = execute(model_name,model,train,test)
baseline_df = pd.DataFrame([baseline_results], columns=baseline_results.keys())
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and keeps each row's original index, as append did.
refined_models = pd.concat([refined_models, baseline_df])

Estimating biases using als...
Estimating biases using als...
RMSE: 0.8701


# Surprise Model Summary

In [156]:
# Grid search lowered the RMSE of all three models; every RMSE is still
# below 1, i.e. predictions remain within roughly +/-1 star.
# SVDpp, now tuned, is still the best-scoring model.
# The vanilla scores are shown in the next cell for reference.

refined_models.sort_values(by='RMSE')

Unnamed: 0,Model,RMSE
0,SVDpp_Gridsearch,0.852285
0,SVD_Gridsearch,0.86033
0,Baseline_Gridsearch,0.870103


In [157]:
# First three rows of the RMSE-sorted vanilla summary, for comparison.
vanilla_model_summary.head(3)

Unnamed: 0,Model,RMSE
7,SVDpp,0.861879
0,Baseline,0.876181
6,SVD,0.878338


Although the SVDpp model returns the lowest RMSE, it is very slow to train. We'll try to build a comparable or stronger model with Spark; see the notebook Project-Part-3.

# Export 

In [158]:
# Write the tuned-model scores to the file 'gridsearch_models', omitting the index column.
refined_models.to_csv(path_or_buf='gridsearch_models', index=False)