In [1]:
# Imports
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import surprise
from surprise import AlgoBase
from surprise import BaselineOnly
from surprise import Dataset
from surprise import KNNBaseline
from surprise import KNNBasic
from surprise import KNNWithZScore
from surprise import Reader
from surprise import SVD
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate


%load_ext autoreload
%autoreload 2


# Location of the training ratings file; read it and wrap it for Surprise

data_path = "C:/Users/js360/Desktop/train_surprise.csv"


# The CSV has no header row; the raw frame gets its own name so that `train`
# only ever refers to the Surprise Dataset.
ratings_df = pd.read_csv(data_path, header=None)
train = Dataset.load_from_df(df=ratings_df, reader=Reader())
trainset = train.build_full_trainset()

In [None]:
# Grid search over BaselineOnly with baselines estimated by ALS
param_dict = {
    'bsl_options': {
        'method': ['als'],
        'n_epochs': [20, 100],
        'reg_u': [10, 15, 20],
        'reg_i': [5, 10, 15],
    },
}

gs = GridSearchCV(BaselineOnly, param_dict, cv=5)
# GridSearchCV.fit builds its own cross-validation folds, so it must be given
# the Dataset (`train`), not an already-built Trainset.
gs.fit(train)
pd.DataFrame(gs.cv_results)

In [None]:
# Grid search over BaselineOnly with baselines estimated by SGD
param_dict = {
    'bsl_options': {
        'method': ['sgd'],
        'n_epochs': [20, 100],
        'reg': [0.01, 0.02, 0.1],
        'learning_rate': [0.005, 0.01, 0.1],
    },
}
gs = GridSearchCV(BaselineOnly, param_dict, cv=5)
# GridSearchCV.fit builds its own cross-validation folds, so it must be given
# the Dataset (`train`), not an already-built Trainset.
gs.fit(train)
pd.DataFrame(gs.cv_results)

In [None]:
# Grid search over KNNBaseline
param_dict = {
    # `k` (maximum number of neighbours) is an argument of the algorithm
    # itself; placed under `sim_options` it is not a similarity setting, so
    # the search over it would have no effect.
    'k': [20, 40, 100],
    'bsl_options': {'method': ['als'], 'n_epochs': [20]},
    'sim_options': {
        'name': ['msd', 'cosine', 'pearson', 'pearson_baseline'],
        'min_support': [1, 5],
        'shrinkage': [100],
    },
}
gs = GridSearchCV(KNNBaseline, param_dict, cv=5)
# GridSearchCV.fit builds its own cross-validation folds, so it must be given
# the Dataset (`train`), not an already-built Trainset.
gs.fit(train)
pd.DataFrame(gs.cv_results)

In [None]:
# Best KNNBaseline configuration, as reported by the grid search above
algo_knn = KNNBaseline(
    # Options must be passed as keyword arguments; `k` belongs to the
    # algorithm itself, not to the similarity options.
    k=20,
    bsl_options={'method': 'als', 'n_epochs': 20},
    sim_options={
        'name': 'pearson_baseline',
        'min_support': 1,
        'shrinkage': 100,
        'user_based': True,
    },
)
cross_validate(algo_knn, train)

In [None]:
# Cross-validate SlopeOne with its default settings
from surprise import SlopeOne

algo = SlopeOne()
cross_validate(algo, data=train)


In [None]:
# Cross-validate CoClustering with its default settings
from surprise import CoClustering

algo = CoClustering()
cross_validate(algo, data=train)

In [None]:
# Submission code

def create_submission(model,
                      samples_path='C:/Users/js360/Desktop/submission_rows.csv',
                      out_path="submission.csv"):
    """Write a rounded rating prediction for every requested id to a CSV file.

    The samples file is read line by line. Its first line is treated as a
    header and skipped; every other non-blank line must start with an id of
    the form ``r<row>_c<col>`` (anything after a comma is ignored).

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict(uid, iid, verbose=False)`` whose
        result has an ``est`` attribute (a Surprise ``Prediction``).
    samples_path : str
        File listing the ids to predict.
    out_path : str
        Destination CSV; it is overwritten.

    Each output line is ``r<row>_c<col>,<rating>`` where the rating is the
    estimate rounded to the nearest integer value and clamped to [1, 5],
    written as a float (e.g. ``3.0``).
    """
    # Open the input first so a missing samples file does not leave behind a
    # truncated output file; `with` closes both handles even on errors.
    with open(samples_path) as samples, open(out_path, "w") as out:
        out.write('Id,Prediction\n')
        next(samples, None)  # skip the header line
        for sample in samples:
            sample_id = sample.split(',')[0].strip()
            if not sample_id:
                continue  # tolerate blank lines (e.g. a trailing newline)
            row_part, col_part = sample_id.split('_')[:2]
            row = int(row_part[1:].strip())
            col = int(col_part[1:].strip())
            # The model is queried as predict(col, row), as in the original.
            estimate = model.predict(col, row, verbose=False).est
            rating = float(np.clip(np.rint(estimate), 1, 5))
            out.write("r{}_c{},{}\n".format(row, col, rating))

In [None]:
# Some basic algorithms created using Surprise AlgoBase Class

class global_mean(AlgoBase):
    """Baseline that predicts the training set's global mean for every pair."""

    def __init__(self, sim_options=None, bsl_options=None, verbose=False):
        """Store the options on the base class and remember `verbose`.

        `None` stands for "no options". A literal `{}` default would be one
        dict object shared by every instance and handed to the base class.
        """
        AlgoBase.__init__(
            self,
            sim_options={} if sim_options is None else sim_options,
            bsl_options={} if bsl_options is None else bsl_options,
        )
        self.verbose = verbose

    def fit(self, trainset):
        """Register `trainset` through the base class; returns `self`."""
        AlgoBase.fit(self, trainset)
        return self

    def estimate(self, u, i):
        """Return the global mean rating, ignoring both `u` and `i`."""
        return self.trainset.global_mean

class user_mean(AlgoBase):
    """Baseline that predicts each user's mean training rating.

    Users that do not appear in the training set get the global mean.
    """

    def __init__(self, sim_options=None, bsl_options=None, verbose=False):
        """Store the options on the base class and remember `verbose`.

        `None` stands for "no options". A literal `{}` default would be one
        dict object shared by every instance and handed to the base class.
        """
        AlgoBase.__init__(
            self,
            sim_options={} if sim_options is None else sim_options,
            bsl_options={} if bsl_options is None else bsl_options,
        )
        self.verbose = verbose

    def fit(self, trainset):
        """Register `trainset` and pre-compute one mean per known user.

        Computing the means once here avoids re-averaging a user's ratings
        on every `estimate` call. Returns `self`.
        """
        AlgoBase.fit(self, trainset)
        self.user_means = {
            uid: np.mean([rating for _, rating in ratings])
            for uid, ratings in trainset.ur.items()
        }
        return self

    def estimate(self, u, i):
        """Return the mean rating of user `u`, or the global mean if unknown."""
        return self.user_means.get(u, self.trainset.global_mean)

class item_mean(AlgoBase):
    """Baseline that predicts each item's mean training rating.

    Items that do not appear in the training set get the global mean.
    """

    def __init__(self, sim_options=None, bsl_options=None, verbose=False):
        """Store the options on the base class and remember `verbose`.

        `None` stands for "no options". A literal `{}` default would be one
        dict object shared by every instance and handed to the base class.
        """
        AlgoBase.__init__(
            self,
            sim_options={} if sim_options is None else sim_options,
            bsl_options={} if bsl_options is None else bsl_options,
        )
        self.verbose = verbose

    def fit(self, trainset):
        """Register `trainset` and pre-compute one mean per known item.

        Computing the means once here avoids re-averaging an item's ratings
        on every `estimate` call. Returns `self`.
        """
        AlgoBase.fit(self, trainset)
        self.item_means = {
            iid: np.mean([rating for _, rating in ratings])
            for iid, ratings in trainset.ir.items()
        }
        return self

    def estimate(self, u, i):
        """Return the mean rating of item `i`, or the global mean if unknown."""
        return self.item_means.get(i, self.trainset.global_mean)

# One default-configured instance of each simple predictor
global_mean_algo, user_mean_algo, item_mean_algo = (
    global_mean(),
    user_mean(),
    item_mean(),
)
# Surprise's built-in baseline estimator
baseline_algo = BaselineOnly()

In [None]:
# Manual testing of different SVDs: sweep n_factors on one train/test split
from surprise import accuracy
# train_test_split was used here without ever being imported.
from surprise.model_selection import train_test_split

data_path = "C:/Users/js360/Desktop/train_surprise.csv"
train = pd.read_csv(data_path, header=None)
train = Dataset.load_from_df(train, reader=Reader())
# NOTE: this rebinds `trainset` to the training part of the split.
trainset, testset = train_test_split(train)

# Hyper-parameters held fixed during the sweep
n_epochs = 100
lr_all = 0.01
reg_all = 0.1

# Collected as ([n_factors, n_epochs, lr_all, reg_all], test RMSE) pairs
err = []
for i, n_factors in enumerate([5, 10, 20, 30, 40, 50, 75, 100], start=1):
    print(i)
    param = [n_factors, n_epochs, lr_all, reg_all]
    a = SVD(n_epochs=n_epochs, n_factors=n_factors, lr_all=lr_all, reg_all=reg_all)
    a.fit(trainset)
    preds = a.test(testset)
    err.append((param, accuracy.rmse(preds)))
            


In [None]:
# SVD plot: test RMSE against the number of latent factors

# `err` holds ([n_factors, n_epochs, lr_all, reg_all], rmse) pairs; pull out
# the two series that are plotted (they were previously never defined).
nr_factors = [param[0] for param, _ in err]
results = [rmse for _, rmse in err]

f = plt.figure(1)
x = f.add_subplot(111)
x.plot(nr_factors, results, linewidth=4, color='red')
x.set_xlabel("n_factors")
x.set_ylabel("Test rmse")
x.set_xticks(nr_factors)
x.grid()
x.set_title("Test rmse by number of factors in SVD")
f.savefig("Svd_plot")

In [None]:
# 5-fold cross-validation of the SVD configuration chosen from the tests above

best_svd_params = {'n_factors': 60, 'n_epochs': 100, 'lr_all': 0.01, 'reg_all': 0.1}
a = SVD(**best_svd_params)
cross_validate(a, train, cv=5)