In [1]:
import pandas as pd
import time
import numpy as np
import math
from sklearn import datasets
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
import warnings, sys, os, gc

import surprise as sp
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV, RandomizedSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, SVD, NMF
from surprise import accuracy


## Surprise

In [53]:
# Read the rating tables shipped with the project: one for fitting the model
# and one whose ratings have to be predicted.
df_train, df_test = (
    pd.read_csv(filepath_or_buffer='./data/opiniones_train.csv'),
    pd.read_csv(filepath_or_buffer='./data/opiniones_test.csv'),
)

In [54]:
# Work with the provided train/test files as they are. The disabled
# alternative was a 70/30 hold-out of the training file:
#   train_test_split(df_train, test_size=0.30)
# Both names below are aliases of the loaded frames, not copies.
trainset, testset = df_train, df_test
print(trainset.shape)

(42320, 3)


In [55]:
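# NOTE: judging by the frames displayed in this notebook, the two CSVs do not
# share a schema: the train file keeps the book slug in `id`, while the test
# file keeps it in `libro` and uses `id` as a row number. The renames below are
# disabled; applied to testset they would produce a second `id` column.
#testset = testset.rename(columns = {'libro': 'id'}, inplace = False)
#trainset = trainset.rename(columns = {'libro': 'id'}, inplace = False)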

In [56]:
# Peek at the first five training rows.
trainset.head(n=5)

Unnamed: 0,id,usuario,puntuacion
0,el-maestro-de-esgrima,216,8.0
1,el-angel-mas-tonto-del-mundo,288,3.0
2,un-millon-de-gotas,300,7.0
3,el-resplandor,120,10.0
4,el-fuego,1122,5.0


In [57]:
# Ratings are declared to lie on a 1-10 scale.
reader = sp.Reader(rating_scale=(1.0, 10.0))
# Surprise reads the three columns positionally as (user, item, rating), so
# the book slug in `id` takes the "user" slot and `usuario` the "item" slot.
# Every later predict() call has to pass its ids in that same order.
data_train = sp.Dataset.load_from_df(
    trainset.loc[:, ['id', 'usuario', 'puntuacion']],
    reader,
)

## SVD

In [58]:
def full_prediction(algo_name,algoritmo,test):
    """Predict a rating for every row of ``test`` with a fitted model.

    Parameters
    ----------
    algo_name : str
        Label of the algorithm. Not used by the function; kept so existing
        calls keep working.
    algoritmo : object
        Fitted estimator exposing ``predict(uid, iid, verbose=False)`` whose
        return value has an ``est`` attribute.
    test : pandas.DataFrame
        Rows to score. Must contain ``id`` and ``usuario``. When a ``libro``
        column is present it is used as the book identifier, otherwise ``id``
        is.

    Returns
    -------
    list of dict
        One ``{"id": ..., "puntuacion": ...}`` record per input row, in row
        order. ``id`` is always the row's ``id`` value and ``puntuacion`` is
        the estimate rounded to two decimals.
    """
    # The training frame stores the book slug in `id`, whereas the test frame
    # stores it in `libro` and uses `id` as a row number. Passing that row
    # number to predict() would make every test book look unseen to the model,
    # so pick the column that actually holds the slug.
    book_col = "libro" if "libro" in test.columns else "id"

    list_pred = []
    # total= lets the progress bar show real progress; itertuples() has no len().
    for row in tqdm_notebook(test.itertuples(), total=len(test)):
        # Same argument order as the columns handed to the Dataset: book, user.
        pred = algoritmo.predict(getattr(row, book_col), row.usuario, verbose=False)
        list_pred.append({"id": row.id, "puntuacion": np.around(pred.est, 2)})
    return list_pred

In [42]:
# Hyper-parameter candidates sampled by the randomised search.
param_grid_svd = dict(
    n_factors=list(range(40, 130, 5)),
    n_epochs=list(range(1, 100)),
    lr_all=[0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008],
    reg_all=[0.01, 0.02, 0.03],
    biased=[True, False],
)

# Default-configured instance; it is replaced by a tuned one further down.
SVD_algo = SVD()

In [43]:
# Randomised search over param_grid_svd: 25 sampled candidates, scored with
# RMSE and MAE, run on two worker processes.
t_0 = time.time()
# random_state pins which candidates get sampled, so a re-run explores the
# same 25 combinations. NOTE(review): fold shuffling and SVD initialisation are
# not seeded here, so the scores themselves may still vary between runs.
svd_rs = RandomizedSearchCV(SVD, param_grid_svd, measures=['rmse', 'mae'],
                            n_iter=25, n_jobs=2, random_state=42)
svd_rs.fit(data_train)
print("SVD")
# Prints the parameter combination with the best RMSE, not the RMSE value.
print('RMSE =', svd_rs.best_params['rmse'])
print ("tiempo: ", time.time() - t_0)

SVD
RMSE = {'n_factors': 70, 'n_epochs': 28, 'lr_all': 0.003, 'reg_all': 0.03, 'biased': True}
tiempo:  219.4697482585907


In [59]:
# Final model configuration.
# NOTE(review): the search output recorded above reports n_epochs=28 as best,
# while 90 is used here; confirm the higher value is intentional.
SVD_algo = SVD(
    n_factors=70,
    n_epochs=90,
    lr_all=0.003,
    reg_all=0.03,
    biased=True,
)

In [60]:
# Fit the configured SVD on every available training rating and report the
# elapsed wall-clock time in seconds.
t_0 = time.time()
# No hold-out: the whole dataset is turned into a single Surprise trainset.
trainset_model = data_train.build_full_trainset()
SVD_algo.fit(trainset_model)
# The timer covers both building the trainset and fitting the model.
print ("tiempo: ", time.time() - t_0)

tiempo:  8.815774202346802


In [61]:
# Row counts: test frame first, then training frame, one per line.
print(len(testset), len(trainset), sep="\n")

10584
42320


In [62]:
# Score the test rows and collect the records in a frame. To score the
# training rows instead, pass `trainset` as `test`.
df_list_pred_SVD = pd.DataFrame(
    data=full_prediction(algo_name="SVD", algoritmo=SVD_algo, test=testset)
)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [23]:
# First five training rows, for side-by-side comparison with the predictions.
trainset.head(n=5)

Unnamed: 0,id,usuario,puntuacion
0,el-maestro-de-esgrima,216,8.0
1,el-angel-mas-tonto-del-mundo,288,3.0
2,un-millon-de-gotas,300,7.0
3,el-resplandor,120,10.0
4,el-fuego,1122,5.0


In [22]:
# First five prediction records (id and rounded estimate).
df_list_pred_SVD.head(n=5)

Unnamed: 0,id,puntuacion
0,el-maestro-de-esgrima,7.73
1,el-angel-mas-tonto-del-mundo,3.31
2,un-millon-de-gotas,7.13
3,el-resplandor,10.0
4,el-fuego,5.12


In [30]:
# Score the training rows here, so this cell does not depend on which frame
# the earlier prediction cell was last run with. Run top to bottom, that cell
# scores the test set, whose rows do not line up with the training frame.
df_list_pred_train = pd.DataFrame(full_prediction("SVD", SVD_algo, trainset))
# assign() builds a new frame. `df = trainset` followed by a column assignment
# would also add `svd` to trainset and df_train, which are the same object.
# to_numpy() attaches the predictions by position rather than by index label.
df = trainset.assign(svd=df_list_pred_train['puntuacion'].to_numpy())

In [33]:
# The square root of the mean squared error is the RMSE, so label it as such.
# `df` holds the training rows, so this is an in-sample error and says little
# about performance on unseen ratings.
print(" RMSE:     %.4f" % np.sqrt(mean_squared_error(df.puntuacion, df.svd)))

 MSE:      0.3232


In [38]:
# Missing values per column; every count should be zero before exporting.
df.isnull().sum()

id            0
usuario       0
puntuacion    0
svd           0
dtype: int64

In [39]:
# Persist the training rows together with their SVD estimates, without the index.
df.to_csv(path_or_buf="./data/train_svd.csv", index=False)



In [49]:
# Display the training frame with the added `svd` column.
df

Unnamed: 0,id,usuario,puntuacion,svd
0,el-maestro-de-esgrima,216,8.0,7.73
1,el-angel-mas-tonto-del-mundo,288,3.0,3.31
2,un-millon-de-gotas,300,7.0,7.13
3,el-resplandor,120,10.0,10.00
4,el-fuego,1122,5.0,5.12
...,...,...,...,...
42315,el-callejon-de-los-milagros,1079,8.0,7.99
42316,ponte-en-mi-piel,151,8.0,7.61
42317,veinte-anos-despues,169,7.0,7.01
42318,la-herencia-de-wilt,216,6.0,6.10


In [48]:
# Alias (not a copy) of the loaded test frame, shown for inspection. Note the
# schema: `id` is a row number here and the book slug lives in `libro`.
test = df_test
test

Unnamed: 0,id,libro,usuario,puntuacion
0,1,los-hijos,201,
1,2,el-temor-de-un-hombre-sabio,299,
2,3,leon-bocanegra,126,
3,4,el-caballero-errante-2,107,
4,5,el-mar-el-mar,85,
...,...,...,...,...
10579,10580,la-princesa-prometida,854,
10580,10581,la-prima-bette,216,
10581,10582,el-guardian-invisible-trilogia-del-baztan-1,132,
10582,10583,un-otono-romano,133,


In [16]:
# Write the predictions as the submission file, without the index.
df_list_pred_SVD.to_csv(path_or_buf="./submission/ignacio_submit_SVD2.csv", index=False)
# 1.58380  (presumably the score obtained by this submission; confirm)