In [1]:
import os
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime
from surprise import Dataset
from surprise import Reader
from surprise import KNNBasic
from surprise import SVD
from surprise import SVDpp

# Root folder of the experiment. It can be overridden through the
# THESIS_EXPERIMENT_DIR environment variable so the notebook also runs on
# other machines; when the variable is unset the original location is used.
experiment_dir = os.environ.get(
    'THESIS_EXPERIMENT_DIR',
    '/Users/miftahulridwan/Documents/Block 3/Master Thesis/Experiment')
os.chdir(experiment_dir)
base_dir = os.getcwd()

# Name of the algorithm under test; also the name of its output folder.
tag = 'SVDpp'

# Every path keeps a trailing separator (the empty last component of
# os.path.join) because file names are appended by plain string concatenation.
training_path = os.path.join(base_dir, 'training', '')
test_path = os.path.join(base_dir, 'test', '')
result_path = os.path.join(base_dir, tag, 'RMSE', '')
dump_path = os.path.join(base_dir, tag, 'Prediction Dump', '')

# Numbers of latent factors to evaluate (used as n_factors of the model).
factors = [10, 20, 30, 40, 50]

# Index of the system update handled by this run; it selects the
# 'train {i}.csv' / 'test {i}.csv' pair and labels the prediction dumps.
i = 49

In [2]:
# RMSE obtained for each number of latent factors (factor -> RMSE).
RMSE = {}

# Reference point for the total duration of this update.
start_update = datetime.now()

# Read the training and test splits that belong to update `i`.
training = pd.read_csv('{}train {}.csv'.format(training_path, i))
test = pd.read_csv('{}test {}.csv'.format(test_path, i))
print('Processing system update {}'.format(i))
print('=' * 80)

# Hand both splits to Surprise: the same three columns and the same reader
# (ratings on a 1-5 scale) are used for training and test data.
rating_columns = ['user_id', 'movie_id', 'rating']
reader = Reader(rating_scale=(1, 5))

train_data = Dataset.load_from_df(training[rating_columns], reader)
train_set = train_data.build_full_trainset()

# The test frame is first wrapped as a full trainset and then converted into
# the testset form that is later passed to the algorithm's test() method.
test_data = Dataset.load_from_df(test[rating_columns], reader)
test_set = test_data.build_full_trainset().build_testset()

# Create the dump folder up front: a missing directory would otherwise only
# surface in to_csv(), i.e. after the first (multi-hour) training has finished.
os.makedirs(dump_path, exist_ok=True)

# Training Phase
# One SVDpp model is fitted and evaluated per entry in `factors`; the RMSE of
# each run is stored in the RMSE dict under the factor count.
for factor in factors:
    print('Running SVDpp for factor = {}'.format(factor))
    start_training = datetime.now()
    # Fixed random_state so repeated runs of the same factor are comparable.
    algo_svdpp = SVDpp(n_factors=factor, n_epochs=30, random_state=24, verbose=False)
    algo_svdpp.fit(train_set)
    cp1 = datetime.now()
    print("Training done in {}".format(cp1 - start_training))

    # Prediction Phase
    svdpp_pred = algo_svdpp.test(test_set)
    cp2 = datetime.now()
    print("Testing done in {}".format(cp2 - cp1))

    # Each prediction is unpacked into five columns and dumped to disk.
    # NOTE(review): the dump file name carries no '.csv' extension; it is left
    # unchanged so that existing dumps and their readers keep matching.
    result = pd.DataFrame(svdpp_pred, columns=['user_id', 'item_id', 'rating', 'pred', 'details'])
    result.to_csv(dump_path + 'update {} for factor {}'.format(i, factor))

    # Computing RMSE: square root of the mean squared prediction error.
    MSE_result = np.mean((result.pred - result.rating) ** 2)
    RMSE[factor] = np.sqrt(MSE_result)
    print("RMSE = {:.2f}".format(RMSE[factor]))
    # The reported duration covers training + testing (not the CSV dump).
    print("factor = {} done in {}".format(factor, (cp2 - start_training)))
    print('*' * 80)

# Report the wall-clock time spent on the whole update (all factors).
end_update = datetime.now()
total_elapsed = end_update - start_update
print('System Update {} done in {}'.format(i, total_elapsed))
print('=' * 80)
print('\n')

Processing system update 49
Running SVDpp for factor = 10
Training done in 6:52:05.135185
Testing done in 0:00:37.205469
RMSE = 0.91
factor = 10 done in 6:52:42.340654
********************************************************************************
Running SVDpp for factor = 20
Training done in 9:50:07.508509
Testing done in 0:00:39.100479
RMSE = 0.91
factor = 20 done in 9:50:46.608988
********************************************************************************
Running SVDpp for factor = 30
Training done in 12:53:07.740180
Testing done in 0:00:41.152261
RMSE = 0.91
factor = 30 done in 12:53:48.892441
********************************************************************************
Running SVDpp for factor = 40
Training done in 15:51:25.837461
Testing done in 0:00:40.517377
RMSE = 0.92
factor = 40 done in 15:52:06.354838
********************************************************************************
Running SVDpp for factor = 50
Training done in 15:50:07.194788
Testing done in 0:00: