In [9]:
from submission_to_surprise import *
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import *
from surprise.prediction_algorithms import *
import pandas as pd

In [10]:
data_folder = 'csv/'

In [11]:
df = pd.read_csv(data_folder + 'surprise_train.csv')
df = df[['User', 'Item', 'Prediction']]
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['User', 'Item', 'Prediction']], reader)

# Surprise algorithms comparison

In [21]:
# !!!!!!!!!!! This cell takes a very long time to run !!!!!!!!!!!
trainset, testset = train_test_split(data, test_size=.20, random_state=2018)

algos = {'Normal Predictor': NormalPredictor(),
         'Baseline Only': BaselineOnly(bsl_options={'reg_i': 10, 'reg_u': 15, 'n_epochs': 10}, verbose=False),
         'k-NN Basic': KNNBasic(k=40, min_k=1, sim_options={'user_based': True}, verbose=False),
         'k-NN with means': KNNWithMeans(k=40, min_k=1, sim_options={'user_based': True}, verbose=False),
         'k-NN with ZScore': KNNWithZScore(k=40, min_k=1, sim_options={'user_based': True}, verbose=False),
         'k-NN Baseline': KNNBaseline(k=40, min_k=1, sim_options={'user_based': True}, verbose=False),
         'SVD': SVD(n_epochs=20,n_factors=20, biased=True, lr_all=0.002, reg_all=0.02, random_state=2018),
         'NMF': NMF(n_factors=15, n_epochs=50, biased=False, reg_pu=0.06, reg_qi=0.06, reg_bu=0.02, reg_bi=0.02, lr_bu=0.005, lr_bi=0.005, init_low=0, init_high=1, random_state=2018),
         'Slope one': SlopeOne(),
         'Co-clustering': CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20, random_state=2018)}

for algo in algos:
    predictions = algos[algo].fit(trainset).test(testset)

    rmse = accuracy.rmse(predictions, verbose=False)
    print(f'RMSE for {algo} is {rmse}')

RMSE for Normal Predictor is 1.4838478118860339
RMSE for Baseline Only is 0.9990113588899588
RMSE for k-NN Basic is 1.0256702819676673
RMSE for k-NN with means is 0.994523610559874
RMSE for k-NN with ZScore is 0.9960953658912202
RMSE for k-NN Baseline is 0.9943079485319859
RMSE for SVD is 1.000070634576365
RMSE for NMF is 1.0104056029390045
RMSE for Slope one is 1.0011204420188182
RMSE for Co-clustering is 1.0104667941922378


# Grid-search

In [4]:
algo = NMF

#param_grid = {'k': [60], 'min_k': [2], 'bsl_options': {'user_based': [True, False]}}
param_grid = {'n_factors': [5, 10, 19, 20, 25, 30], 'biased':[True, False], 'n_epochs': [100]}

gs = GridSearchCV(algo, param_grid, measures=['rmse'], cv=3, joblib_verbose=2)

gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.4min remaining:    0.0s


1.0005328642957334
{'n_factors': 20, 'biased': True, 'n_epochs': 100}


[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 98.4min finished


In [6]:
algo = CoClustering

#param_grid = {'k': [60], 'min_k': [2], 'bsl_options': {'user_based': [True, False]}}
param_grid = {'n_cltr_u': [5, 10, 20, 30], 'n_cltr_i': [5, 10, 20, 30], 'n_epochs': [50, 100]}

gs = GridSearchCV(algo, param_grid, measures=['rmse'], cv=3, joblib_verbose=2)

gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min remaining:    0.0s


1.0131258449518403
{'n_cltr_u': 20, 'n_cltr_i': 5, 'n_epochs': 100}


[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed: 473.3min finished


In [8]:
algo = SVD

#param_grid = {'k': [60], 'min_k': [2], 'bsl_options': {'user_based': [True, False]}}
param_grid = {'n_factors': [17, 50, 100], 'lr_all': [0.002], 'reg_all': [0, 0.05]}

gs = GridSearchCV(algo, param_grid, measures=['rmse'], cv=3, joblib_verbose=2)

gs.fit(data)

print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.3s remaining:    0.0s


1.000991743332894
{'n_factors': 17, 'lr_all': 0.002, 'reg_all': 0.05}


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed: 16.8min finished


# Cross-validation

In [14]:
algo = SVD(n_factors=100, reg_bu=0.01, reg_bi=0.1, reg_pu=0.1, reg_qi=0.01)

cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9989  0.9968  0.9998  0.9984  0.9985  0.9985  0.0010  
Fit time          109.10  112.82  116.07  105.20  94.76   107.59  7.37    
Test time         4.76    3.49    5.72    3.58    3.83    4.28    0.85    


{'test_rmse': array([0.99889983, 0.99682049, 0.99983323, 0.99843902, 0.99850137]),
 'fit_time': (109.09758067131042,
  112.8232307434082,
  116.06696963310242,
  105.20380973815918,
  94.76237297058105),
 'test_time': (4.75638222694397,
  3.489480495452881,
  5.719065189361572,
  3.5795481204986572,
  3.8317439556121826)}

# Submission

In [6]:
algo = BaselineOnly(bsl_options={'reg_i': 15, 'reg_u': 0, 'n_epochs': 200})
trainset = data.build_full_trainset()
algo.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x4806018048>

In [7]:
ids, _ = get_ids_values(data_folder + 'sampleSubmission.csv')
item, user = replace_ids_submission(ids)
prediction = np.zeros(len(item))
for i in range(len(prediction)):
    prediction[i] = algo.predict(user[i], item[i], None, True, False).est
create_csv_submission(ids, prediction, 'submission.csv')