In [1]:
import pandas as pd
import time
import numpy as np
import math
from sklearn import datasets
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
import warnings, sys, os, gc
import surprise as sp

## Surprise

In [2]:
df_train = pd.read_csv('./data/opiniones_train.csv')
df_test = pd.read_csv('./data/opiniones_test.csv')

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df_train.drop(['puntuacion'], axis=1),         df_train.puntuacion, test_size=0.3, random_state=0)

## SVD

In [4]:
#### Parametros ####
scale = (1.0, 10.0)
reader = sp.reader.Reader(rating_scale=scale)
SVD_algo = sp.prediction_algorithms.SVD
param_grid = {
    'n_factors': range(40, 80, 5),
    "n_epochs": [100, 300, 500, 700, 900],
    'lr_all': [0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008],
    'reg_all': [0.01,0.02,0.03],
    'biased':[True, False]
}
train = pd.concat([X_train, pd.DataFrame(y_train)], axis=1)
data_train = sp.dataset.Dataset.load_from_df(train, reader)

In [5]:
#### Busqueda de hiper parametros ####
t_0 = time.time()
svd_rs = sp.model_selection.search.RandomizedSearchCV(SVD_algo, param_grid, measures=['rmse'],cv=5, n_jobs=-1)
svd_rs.fit(data_train)
print("SVD")
print('RMSE =', svd_rs.best_params['rmse']) 
print ("tiempo: ", time.time() - t_0)

SVD
RMSE = {'n_factors': 65, 'n_epochs': 700, 'lr_all': 0.007, 'reg_all': 0.03, 'biased': True}
tiempo:  510.2110085487366


In [6]:
#### Entrenamiento ####
t_0 = time.time()
train = df_train[['usuario', 'libro', 'puntuacion']]
best_params = {'n_epochs': 300, 'lr_all': 0.001, 'reg_all': 0.2, 'n_factors': 55}
data = sp.dataset.Dataset.load_from_df(train, reader)
trainset, testset = sp.model_selection.train_test_split(data, test_size=0.30, random_state=0)
model = SVD_algo(n_factors= 65, n_epochs= 700, lr_all= 0.007, reg_all= 0.03, biased=True)
print ("tiempo: ", time.time() - t_0)

tiempo:  0.1336228847503662


In [7]:
#### Predicciones ####
t_0 = time.time()
# Fit sobre trainset (70% de data) y predicciones sobre testset(30% data)
model.fit(trainset)
y_predictions = model.test(testset)
# Fit sobre trainset full
model.fit(data.build_full_trainset())
sp.accuracy.rmse(y_predictions)
print ("tiempo: ", time.time() - t_0)

RMSE: 1.6119
tiempo:  111.22566151618958


In [45]:
# Predicciones sobre df_test
predict = list(map(lambda x: model.predict(x[0], x[1]).est, zip(df_test.usuario, df_test.libro)))
submit = pd.DataFrame({'id': df_test.id, 'puntuacion': np.around(predict,4)})
submit.tail()

Unnamed: 0,id,puntuacion
10579,10580,8.3388
10580,10581,7.6435
10581,10582,7.3014
10582,10583,8.8245
10583,10584,5.5192


In [46]:
#n_factors= 55, n_epochs= 300, lr_all= 0.001, reg_all= 0.2
## RMSE: 1.5834 | RMSE_kaggle: 1.48523 ===> surpSVD_tunned.csv 
#n_factors= 65, n_epochs= 700, lr_all= 0.007, reg_all= 0.03, biased=True
##RMSE: 1.6119  | RMSE_kaggle:  ===> surpSVD_tunned_2.csv

submit.to_csv("./submission/surpSVD_tunned_2.csv", index=False)

In [None]:
predict_surprise_svd