In [1]:
import pandas as pd
import time
import random
import numpy as np
import math
from sklearn import datasets
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
import warnings, sys, os, gc
import surprise as sp

#my_seed = 0
#random.seed(my_seed)
#np.random.seed(my_seed)

## Surprise

In [2]:
df_train = pd.read_csv('./data/opiniones_train.csv')
df_test = pd.read_csv('./data/opiniones_test.csv')

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df_train.drop(['puntuacion'], axis=1),         df_train.puntuacion, test_size=0.3, random_state=0)

In [4]:
#### Genericos #### 
scale = (1.0, 10.0)
reader = sp.reader.Reader(rating_scale=scale)
train = pd.concat([X_train, pd.DataFrame(y_train)], axis=1)
data_train = sp.dataset.Dataset.load_from_df(train, reader)

In [5]:
# Funcion para hacer la prediccion de cada par usuario/libro
def full_prediction(algoritmo,test):
    list_pred = []
    for row in tqdm_notebook(test.itertuples()):
        row_pred = dict()
        pred = algoritmo.predict(row.usuario, row.libro, verbose=False)
        row_pred["id"] = row.id
        row_pred["puntuacion"] = np.around(pred.est, 4)
        list_pred.append(row_pred) 
    return list_pred

## SVD

In [6]:
#### Parametros SVD ####
SVD_algo = sp.prediction_algorithms.SVD
param_grid = {
    'n_factors': range(40,80,2),
    "n_epochs": range(100,1000,100),
    'lr_all': [0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008],
    'reg_all': [0.01,0.02,0.03,0.2,0.4,0.6],
    'biased':[True, False]
}

In [10]:
#### Busqueda de hiper parametros ####
t_0 = time.time()
svd_rs = sp.model_selection.search.RandomizedSearchCV(SVD_algo, param_grid, measures=['rmse'],cv=5, n_jobs=-1)
svd_rs.fit(data_train)
print("SVD")
print('RMSE =', svd_rs.best_params['rmse']) 
print ("tiempo: ", time.time() - t_0)

SVD
RMSE = {'n_factors': 42, 'n_epochs': 100, 'lr_all': 0.003, 'reg_all': 0.2, 'biased': True}
tiempo:  400.4446563720703


In [7]:
#### Entrenamiento ####
t_0 = time.time()
train = df_train[['usuario', 'libro', 'puntuacion']]
best_params = {'n_epochs': 300, 'lr_all': 0.001, 'reg_all': 0.2, 'n_factors': 55}
data = sp.dataset.Dataset.load_from_df(train, reader)
trainset, testset = sp.model_selection.train_test_split(data, test_size=0.30, random_state=0)
model = SVD_algo(n_factors= 55, n_epochs= 300, lr_all= 0.001, reg_all= 0.2, biased=True)
print ("tiempo: ", time.time() - t_0)

tiempo:  0.11973857879638672


In [8]:
#### Predicciones ####
t_0 = time.time()
# Fit sobre trainset (70% de data) y predicciones sobre testset(30% data)
model.fit(trainset)
y_predictions = model.test(testset)
# Fit sobre trainset full
model.fit(data.build_full_trainset())
print(sp.accuracy.rmse(y_predictions))
print ("tiempo: ", time.time() - t_0)

RMSE: 1.5826
1.5825966236285707
tiempo:  47.333245038986206


In [9]:
submit = pd.DataFrame(full_prediction(model,df_test))
print(submit)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


          id  puntuacion
0          1      7.1872
1          2      8.0608
2          3      6.9903
3          4      7.7213
4          5      6.6881
...      ...         ...
10579  10580      8.4629
10580  10581      7.8519
10581  10582      7.2862
10582  10583      8.8856
10583  10584      5.6770

[10584 rows x 2 columns]


In [96]:
# Column Compare
#submit_manual.where(submit_manual.puntuacion==submit.puntuacion).notna()

In [102]:
#n_factors= 55, n_epochs= 300, lr_all= 0.001, reg_all= 0.2
## RMSE: 1.5834 | RMSE_kaggle: 1.48523 ===> surpSVD_tunned.csv 
#n_factors= 65, n_epochs= 700, lr_all= 0.007, reg_all= 0.03, biased=True
##RMSE: 1.6119  
#RMSE = {'n_factors': 72, 'n_epochs': 800, 'lr_all': 0.008, 'reg_all': 0.01, 'biased': True}
##RMSE: 1.6088
#RMSE = {'n_factors': 42, 'n_epochs': 100, 'lr_all': 0.003, 'reg_all': 0.2, 'biased': True}
##RMSE: 1.5852
#n_factors= 55, n_epochs= 125, lr_all= 0.0025, reg_all= 0.25, biased=True
## RMSE: 1.5827 | RMSE_kaggle: 1.49455 ===> surpSVD_tunned_2.csv

submit.to_csv("./submission/surpSVD_tunned_3.csv", index=False)

# NMF

In [None]:
NMF_algo = NMF()