In [1]:
from submission_to_surprise import *
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import *
from surprise.prediction_algorithms import *
import numpy as np
import pandas as pd

In [2]:
# Directory (relative to the notebook) that holds the input CSV files.
data_folder = "csv/"

In [3]:
# Read the training ratings and keep the three columns in the
# (user, item, rating) order that Dataset.load_from_df expects.
rating_columns = ['User', 'Item', 'Prediction']
df = pd.read_csv(data_folder + 'surprise_train.csv')[rating_columns]

# Ratings are declared to lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df, reader)

# Surprise algorithms comparison

In [21]:
# !!!!!!!!!!! This cell takes a very long time to run !!!!!!!!!!!
# Hold out 20% of the ratings for evaluation; the fixed seed makes the split reproducible.
trainset, testset = train_test_split(data, test_size=.20, random_state=2018)

# Candidate models keyed by the display name used in the printed report.
algos = {'Normal Predictor': NormalPredictor(),
         'Baseline Only': BaselineOnly(bsl_options={'reg_i': 10, 'reg_u': 15, 'n_epochs': 10}, verbose=False),
         'k-NN Basic': KNNBasic(k=40, min_k=1, sim_options={'user_based': True}, verbose=False),
         'k-NN with means': KNNWithMeans(k=40, min_k=1, sim_options={'user_based': True}, verbose=False),
         'k-NN with ZScore': KNNWithZScore(k=40, min_k=1, sim_options={'user_based': True}, verbose=False),
         'k-NN Baseline': KNNBaseline(k=40, min_k=1, sim_options={'user_based': True}, verbose=False),
         'SVD': SVD(n_epochs=20,n_factors=20, biased=True, lr_all=0.002, reg_all=0.02, random_state=2018),
         'NMF': NMF(n_factors=15, n_epochs=50, biased=False, reg_pu=0.06, reg_qi=0.06, reg_bu=0.02, reg_bi=0.02, lr_bu=0.005, lr_bi=0.005, init_low=0, init_high=1, random_state=2018),
         'Slope one': SlopeOne(),
         'Co-clustering': CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20, random_state=2018)}

# Train every model on the training split and report its RMSE on the held-out split.
for name, model in algos.items():
    # fit() returns the fitted algorithm itself, so test() can be chained directly.
    predictions = model.fit(trainset).test(testset)

    rmse = accuracy.rmse(predictions, verbose=False)
    print(f'RMSE for {name} is {rmse}')

RMSE for Normal Predictor is 1.4838478118860339
RMSE for Baseline Only is 0.9990113588899588
RMSE for k-NN Basic is 1.0256702819676673
RMSE for k-NN with means is 0.994523610559874
RMSE for k-NN with ZScore is 0.9960953658912202
RMSE for k-NN Baseline is 0.9943079485319859
RMSE for SVD is 1.000070634576365
RMSE for NMF is 1.0104056029390045
RMSE for Slope one is 1.0011204420188182
RMSE for Co-clustering is 1.0104667941922378


# Grid-search

In [4]:
# Exhaustive search over the neighbourhood size (k) and the minimum number of
# neighbours (min_k) for KNNWithZScore, scored by RMSE with 3-fold cross-validation.
algo = KNNWithZScore
param_grid = {
    'k': [50, 60, 70],
    'min_k': [1, 2],
}

gs = GridSearchCV(
    algo,
    param_grid,
    measures=['rmse'],
    cv=3,
    joblib_verbose=10,
)
gs.fit(data)

# First the best average RMSE, then the parameter combination that achieved it.
for best in (gs.best_score, gs.best_params):
    print(best['rmse'])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.0min remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  3.7min remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.

[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  5.5min remaining:    0.0s



Computing the msd similarity matrix...
Done computing similarity matrix.

[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  7.2min remaining:    0.0s



Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  8.9min remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 10.6min remaining:    0.0s



Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 12.4min remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.

[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 14.2min remaining:    0.0s



Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 16.3min remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
0.997839329158117
{'k': 70, 'min_k': 2}


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed: 36.1min finished


# Cross-validation

In [6]:
# 5-fold cross-validation of a KNNWithMeans model (k=60, min_k=2), reporting RMSE.
# The call is left as the cell's last expression so the result dict is displayed.
algo = KNNWithMeans(k=60, min_k=2)

cross_validate(
    algo,
    data,
    measures=['RMSE'],
    cv=5,
    verbose=True,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9959  0.9958  0.9965  0.9909  0.9945  0.9947  0.0020  
Fit time          8.17    8.71    10.32   13.55   14.20   10.99   2.47    
Test time         62.29   76.49   86.83   105.95  108.00  87.91   17.42   


{'test_rmse': array([0.99585228, 0.99581443, 0.99650031, 0.99093284, 0.99446655]),
 'fit_time': (8.168806076049805,
  8.709189414978027,
  10.316333293914795,
  13.552631378173828,
  14.202093601226807),
 'test_time': (62.29028367996216,
  76.48936104774475,
  86.82870554924011,
  105.94529366493225,
  107.99975252151489)}

# Submission

In [None]:
# Final model: KNNWithZScore with k=70 and min_k=2.
algo = KNNWithZScore(k=70, min_k=2)
# An algorithm has to be trained before predict() can be called; train on every
# available rating rather than on a train/test split.
algo.fit(data.build_full_trainset())

ids, _ = get_ids_values(data_folder + 'sampleSubmission.csv')
item, user = replace_ids_submission(ids)

# NOTE(review): predict() looks users/items up by raw id, so the values returned by
# replace_ids_submission presumably need the same type as the 'User'/'Item' columns
# of the training frame -- confirm, otherwise every estimate falls back to the default.
prediction = np.array(
    [algo.predict(uid, iid, r_ui=None, clip=True, verbose=False).est
     for uid, iid in zip(user, item)],
    dtype=float,
)
create_csv_submission(ids, prediction, 'submission.csv')