In [15]:
from submission_to_surprise import *
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import *
from surprise.prediction_algorithms import *
import pandas as pd

In [16]:
PATH_CLEAN = 'csv/data_clean.csv'
PATH_SAMPLE = 'csv/sampleSubmission.csv'
PATH_SUBMISSION = 'csv/submission.csv'
data_folder = 'csv/'

In [17]:
df = pd.read_csv(PATH_CLEAN)
df.head()

Unnamed: 0,User,Item,Rating
0,44,1,4
1,61,1,3
2,67,1,4
3,72,1,3
4,86,1,5


In [18]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['User', 'Item', 'Rating']], reader)

# Surprise algorithms comparison

In [20]:
# !!!!!!!!!!! This cell takes a very long time to run !!!!!!!!!!!
trainset, testset = train_test_split(data, test_size=.20, random_state=2018)

algos = {'Normal Predictor': NormalPredictor(),
         'Baseline Only': BaselineOnly(bsl_options={'reg_i': 10, 'reg_u': 15, 'n_epochs': 10}, verbose=False),
         'k-NN Basic': KNNBasic(k=60, min_k=1, sim_options={'user_based': True}, verbose=False),
         'k-NN with means': KNNWithMeans(k=50, min_k=1, sim_options={'user_based': True}, verbose=False),
         'k-NN with ZScore': KNNWithZScore(k=70, min_k=1, sim_options={'user_based': True}, verbose=False),
         'k-NN Baseline': KNNBaseline(k=59, min_k=1, sim_options={'user_based': True}, verbose=False),
         'SVD': SVD(n_factors=100, reg_bu=0.01, reg_bi=0.1, reg_pu=0.1, reg_qi=0.01, random_state=2018),
         'NMF': NMF(n_factors=22, n_epochs=50, biased=False, reg_pu=0.06, reg_qi=0.06, reg_bu=0.02, reg_bi=0.02, lr_bu=0.005, lr_bi=0.005, init_low=0, init_high=1, random_state=2018),
         'Slope one': SlopeOne(),
         'Co-clustering': CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20, random_state=2018)}

for algo in algos:

    predictions = algos[algo].fit(trainset).test(testset)    
    
    rmse = accuracy.rmse(predictions, verbose=False)
    print(f'RMSE for {algo} is {rmse}')

RMSE for Normal Predictor is 1.483746524138451
RMSE for Baseline Only is 1.0007649954780635
RMSE for k-NN Basic is 1.0240415317295288
RMSE for k-NN with means is 1.0216516207526856
RMSE for k-NN with ZScore is 1.0184726068301606
RMSE for k-NN Baseline is 1.0068216570125905
RMSE for SVD is 1.0238049285552246
RMSE for NMF is 1.0247684552622467
RMSE for Slope one is 1.0016787898058894
RMSE for Co-clustering is 1.0130267733860856


# Convertion to discrete algoithms

In [21]:
class SVD_discrete(SVD):
    def estimate(self, u, i):
        return round(SVD.estimate(self, u, i))
    
class BaselineOnly_discrete(BaselineOnly):
    def estimate(self, u, i):
        return round(BaselineOnly.estimate(self, u, i))

class KNNBasic_discrete(KNNBasic):
    def estimate(self, u, i):
        est, details = KNNBasic.estimate(self, u, i)
        return round(est), details
    
class KNNWithMeans_discrete(KNNWithMeans):
    def estimate(self, u, i):
        est, details = KNNWithMeans.estimate(self, u, i)
        return round(est), details
    
class KNNWithZScore_discrete(KNNWithZScore):
    def estimate(self, u, i):
        est, details = KNNWithZScore.estimate(self, u, i)
        return round(est), details
    
class KNNBaseline_discrete(KNNBaseline):
    def estimate(self, u, i):
        est, details = KNNBaseline.estimate(self, u, i)
        return round(est), details
    
class NMF_discrete(NMF):
    def estimate(self, u, i):
        return round(NMF.estimate(self, u, i))
    
class SlopeOne_discrete(SlopeOne):
    def estimate(self, u, i):
        return round(SlopeOne.estimate(self, u, i))
    
class CoClustering_discrete(CoClustering):
    def estimate(self, u, i):
        return round(CoClustering.estimate(self, u, i))

# Grid-search

In [24]:
algo = BaselineOnly_discrete

param_grid = {'bsl_options': {'reg_i': [5, 10, 15], 'reg_u': [5, 10, 15], 'n_epochs': [20]}}
gs = GridSearchCV(algo, param_grid, measures=['rmse'], cv=3, joblib_verbose=2)

gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Estimating biases using als...


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 14.3min remaining:    0.0s


Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed: 23.4min finished


1.041615105025082
{'bsl_options': {'reg_i': 5, 'reg_u': 10, 'n_epochs': 20}}


In [32]:
algo = KNNBaseline_discrete

param_grid = {'k': [110, 130, 150], 'verbose': [False]}

gs = GridSearchCV(algo, param_grid, measures=['rmse'], cv=3, joblib_verbose=2)

gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 20.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 186.0min finished


1.0427611697713854
{'k': 150, 'verbose': False}


In [31]:
algo = SVD_discrete

param_grid = {'n_factors': [40], 'random_state': [2018], 'n_epochs': [60], 'reg_all': [0.01, 0.02, 0.05], 'lr_all': [0.002]}
#param_grid = {'n_factors': [35, 36, 37, 38, 39, 40, 42], 'random_state': [2018], 'n_epochs': [60],
#              'reg_bu': [0.01], 'reg_bi':[0.1], 'reg_pu': [0.1], 'reg_qi': [0.01]}


gs = GridSearchCV(algo, param_grid, measures=['rmse'], cv=4, joblib_verbose=2)

gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min remaining:    0.0s


1.0391490697728307
{'n_factors': 40, 'random_state': 2018, 'n_epochs': 60, 'reg_all': 0.05, 'lr_all': 0.002}


[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 29.3min finished


# Cross-validation

In [14]:
algo = SVD(n_factors=100, reg_bu=0.01, reg_bi=0.1, reg_pu=0.1, reg_qi=0.01)

cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9989  0.9968  0.9998  0.9984  0.9985  0.9985  0.0010  
Fit time          109.10  112.82  116.07  105.20  94.76   107.59  7.37    
Test time         4.76    3.49    5.72    3.58    3.83    4.28    0.85    


{'test_rmse': array([0.99889983, 0.99682049, 0.99983323, 0.99843902, 0.99850137]),
 'fit_time': (109.09758067131042,
  112.8232307434082,
  116.06696963310242,
  105.20380973815918,
  94.76237297058105),
 'test_time': (4.75638222694397,
  3.489480495452881,
  5.719065189361572,
  3.5795481204986572,
  3.8317439556121826)}

# Submission

In [6]:
algo = BaselineOnly(bsl_options={'reg_i': 15, 'reg_u': 0, 'n_epochs': 200})
trainset = data.build_full_trainset()
algo.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x4806018048>

In [7]:
ids, _ = get_ids_values(data_folder + 'sampleSubmission.csv')
item, user = replace_ids_submission(ids)
prediction = np.zeros(len(item))
for i in range(len(prediction)):
    prediction[i] = algo.predict(user[i], item[i], None, True, False).est
create_csv_submission(ids, prediction, 'submission.csv')