In [1]:
import pandas as pd
import time
import numpy as np
import math
from sklearn import datasets
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
import warnings, sys, os, gc

import surprise as sp
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV, RandomizedSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, SVD, NMF
from surprise import accuracy


## Preprocesamiento

In [2]:
# Load the raw CSV inputs from ./data.
df_train = pd.read_csv('./data/opiniones_train.csv')
df_test = pd.read_csv('./data/opiniones_test.csv')
df_libros = pd.read_csv('./data/libros.csv')
#####################################################
# Working frames. `assign` returns a new DataFrame, so casting `libro` to a
# categorical dtype does not mutate the raw df_train / df_test frames
# (previously `train` and `test` were plain aliases of the raw frames).
train = df_train.assign(libro=df_train['libro'].astype('category'))
test = df_test.assign(libro=df_test['libro'].astype('category'))

In [3]:
# No hold-out split is made here (the train_test_split(train, test_size=0.30)
# alternative is disabled): the loaded train and test frames are used unchanged.
trainset, testset = train, test
print(trainset.shape, testset.shape)

(42320, 3) (10584, 4)


## Surprise

In [4]:
def full_prediction(algo_name, algoritmo, test):
    """Predict a rating for every row of ``test`` with a fitted algorithm.

    Parameters
    ----------
    algo_name : str
        Label shown on the progress bar.
    algoritmo : object
        Fitted model exposing ``predict(uid, iid, verbose=False)`` whose
        result has an ``est`` attribute.
    test : pandas.DataFrame
        Frame with ``libro`` and ``usuario`` columns.

    Returns
    -------
    list of dict
        One ``{"libro": ..., "puntuacion": ...}`` record per row of ``test``,
        in row order, with the estimate rounded to two decimals.
    """
    list_pred = []
    # itertuples() is a generator without a length, so the row count is passed
    # explicitly; otherwise the progress bar cannot show completion.
    for row in tqdm_notebook(test.itertuples(), total=len(test), desc=algo_name):
        # `libro` is passed as the raw user id and `usuario` as the raw item id.
        # NOTE(review): this matches a dataset loaded with columns ordered
        # (libro, usuario, puntuacion) -- confirm the role swap is intended.
        uid = row.libro
        iid = row.usuario
        pred = algoritmo.predict(uid, iid, verbose=False)
        list_pred.append({"libro": uid, "puntuacion": np.around(pred.est, 2)})
    return list_pred

In [5]:
# Ratings are declared on a 1.0-10.0 scale.
reader = sp.reader.Reader(rating_scale=(1.0, 10.0))
# NOTE(review): Surprise reads the three columns positionally as
# (user, item, rating), so `libro` takes the user role and `usuario` the
# item role here -- confirm this ordering is intended.
data_train = sp.dataset.Dataset.load_from_df(
    df=trainset.loc[:, ['libro', 'usuario', 'puntuacion']],
    reader=reader,
)
# Use every available rating for fitting (no folds held out).
trainset_model = data_train.build_full_trainset()

In [7]:
# Fit a biased SVD model on the full training set and report the elapsed time.
# random_state pins the random factor initialisation so that re-running the
# notebook reproduces the same predictions.
SVD_algo = SVD(
    n_factors=70,
    n_epochs=90,
    lr_all=0.003,
    reg_all=0.03,
    biased=True,
    random_state=42,
)
# perf_counter is monotonic, so the measured duration cannot be skewed by
# system clock adjustments.
t_0 = time.perf_counter()
SVD_algo.fit(trainset_model)
print ("tiempo: ", time.perf_counter() - t_0)

tiempo:  9.032744646072388


In [6]:
# Fit a mean-centred KNN model on the full training set and time it
# (the timer also covers building the similarity options).
t_0 = time.time()
# NOTE(review): user_based=False requests item-item cosine similarity; with
# the dataset loaded as (libro, usuario, puntuacion) the "items" would be the
# `usuario` values -- confirm this is the intended neighbourhood.
so_Means = dict(name='cosine', user_based=False)
KNNWithMeans_algo = KNNWithMeans(k=30, min_k=6, sim_options=so_Means)
KNNWithMeans_algo.fit(trainset_model)
print ("tiempo: ", time.time() - t_0)

Computing the cosine similarity matrix...
Done computing similarity matrix.
tiempo:  0.7444663047790527


In [7]:
# Row counts of the test and train frames, one per line.
print(len(testset), len(trainset), sep="\n")

10584
42320


In [16]:
# Score every test row with the fitted SVD model and collect the
# (libro, puntuacion) records into a DataFrame.
df_list_pred_SVD = pd.DataFrame(
    full_prediction(algo_name="SVD", algoritmo=SVD_algo, test=testset)
)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


SVD MSE:      1.5905


In [8]:
# Score every test row with the fitted KNNWithMeans model and collect the
# (libro, puntuacion) records into a DataFrame.
df_list_pred_KNNWithMeans = pd.DataFrame(
    full_prediction(algo_name="KNNWithMeans", algoritmo=KNNWithMeans_algo, test=testset)
)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [9]:
# Display the KNNWithMeans predictions (columns: libro, puntuacion).
df_list_pred_KNNWithMeans

Unnamed: 0,libro,puntuacion
0,los-hijos,7.17
1,el-temor-de-un-hombre-sabio,8.34
2,leon-bocanegra,7.48
3,el-caballero-errante-2,7.75
4,el-mar-el-mar,7.17
...,...,...
10579,la-princesa-prometida,8.67
10580,la-prima-bette,7.01
10581,el-guardian-invisible-trilogia-del-baztan-1,7.46
10582,un-otono-romano,9.11


In [10]:
# Load the previously saved test frame that already carries the `svd` column.
# NOTE(review): this file is not written by any active cell in this notebook;
# presumably it comes from an earlier run -- confirm before Restart & Run All.
df_test_svd = pd.read_csv(filepath_or_buffer='./data/test_svd.csv')
df_test_svd

Unnamed: 0,id,libro,usuario,puntuacion,svd
0,1,los-hijos,201,,7.17
1,2,el-temor-de-un-hombre-sabio,299,,9.44
2,3,leon-bocanegra,126,,7.41
3,4,el-caballero-errante-2,107,,7.61
4,5,el-mar-el-mar,85,,6.59
...,...,...,...,...,...
10579,10580,la-princesa-prometida,854,,8.57
10580,10581,la-prima-bette,216,,8.05
10581,10582,el-guardian-invisible-trilogia-del-baztan-1,132,,6.90
10582,10583,un-otono-romano,133,,9.08


In [12]:
## ONLY for the Test split
## Existing columns: libro | usuario | puntuacion | svd (the disabled lines below add `knn` and save the CSV)
#df_test_svd['knn'] = df_list_pred_KNNWithMeans['puntuacion']
#df_test_svd.to_csv("./data/test_svd.csv", index=False)
#df_test_svd

Unnamed: 0,id,libro,usuario,puntuacion,svd,knn
0,1,los-hijos,201,,7.17,7.17
1,2,el-temor-de-un-hombre-sabio,299,,9.44,8.34
2,3,leon-bocanegra,126,,7.41,7.48
3,4,el-caballero-errante-2,107,,7.61,7.75
4,5,el-mar-el-mar,85,,6.59,7.17
...,...,...,...,...,...,...
10579,10580,la-princesa-prometida,854,,8.57,8.67
10580,10581,la-prima-bette,216,,8.05,7.01
10581,10582,el-guardian-invisible-trilogia-del-baztan-1,132,,6.90,7.46
10582,10583,un-otono-romano,133,,9.08,9.11


In [15]:
## ONLY for the Train split
# Existing columns: libro | usuario | puntuacion | svd (the disabled lines below add `knn` and save the CSV)
#df_train_svd['knn'] = df_list_pred_KNNWithMeans['puntuacion']
#df_train_svd.to_csv("./data/train_svd.csv", index=False)
#df_train_svd

Unnamed: 0,libro,usuario,puntuacion,svd,knn
0,el-maestro-de-esgrima,216,8.0,7.81,6.69
1,el-angel-mas-tonto-del-mundo,288,3.0,3.40,6.92
2,un-millon-de-gotas,300,7.0,7.02,7.25
3,el-resplandor,120,10.0,9.88,7.07
4,el-fuego,1122,5.0,5.19,6.30
...,...,...,...,...,...
42315,el-callejon-de-los-milagros,1079,8.0,8.02,8.19
42316,ponte-en-mi-piel,151,8.0,7.71,6.81
42317,veinte-anos-despues,169,7.0,7.05,6.79
42318,la-herencia-de-wilt,216,6.0,6.25,7.01
