In [1]:
import pandas as pd
import time
import random
import numpy as np
import math
from sklearn import datasets
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
import warnings, sys, os, gc
import surprise as sp

#my_seed = 0
#random.seed(my_seed)
#np.random.seed(my_seed)

## Surprise

In [2]:
df_train = pd.read_csv('./data/opiniones_train.csv')
df_test = pd.read_csv('./data/opiniones_test.csv')
print(df_train.shape, df_test.shape)

(42320, 3) (10584, 4)


In [3]:
X_train, X_test, y_train, y_test = train_test_split(df_train.drop(['puntuacion'], axis=1),         df_train.puntuacion, test_size=0.3, random_state=0)

In [4]:
#### Genericos #### 
scale = (1.0, 10.0)
reader = sp.reader.Reader(rating_scale=scale)
train = pd.concat([X_train, pd.DataFrame(y_train)], axis=1)
data_train = sp.dataset.Dataset.load_from_df(train, reader)

In [5]:
# Funcion para hacer la prediccion de cada par usuario/libro
def full_prediction(algoritmo,test):
    list_pred = []
    for row in tqdm_notebook(test.itertuples()):
        row_pred = dict()
        pred = algoritmo.predict(row.usuario, row.libro, verbose=False)
        row_pred["id"] = row.id
        row_pred["puntuacion"] = np.around(pred.est, 4)
        list_pred.append(row_pred) 
    return list_pred

## SVD

In [6]:
#### Parametros SVD ####
SVD_algo = sp.prediction_algorithms.SVD
param_grid = {
    'n_factors': range(40,60,1),
    "n_epochs": range(200,500,25),
    'lr_all': [0.001, 0.002, 0.01, 0.02],
    'reg_all': [0.1, 0.2, 0.3]
}

In [7]:
#### Busqueda de hiper parametros ####
t_0 = time.time()
svd_gs = sp.model_selection.search.GridSearchCV(SVD_algo, param_grid, measures=['rmse'], cv=5, n_jobs=-1)
svd_gs.fit(data_train)
print("SVD_GS")
print('best_Param =', svd_gs.best_params['rmse']) 
print ("tiempo: ", time.time() - t_0)
#SVD_GS
#best_Param = {'n_factors': 55, 'n_epochs': 300, 'lr_all': 0.001, 'reg_all': 0.2}
#tiempo:  6867.6164293289185

#SVD_GS: 10000
#best_Param = {'n_factors': 42, 'n_epochs': 250, 'lr_all': 0.001, 'reg_all': 0.2}
#tiempo:  334.83989334106445

#SVD_GS: full
#best_Param = {'n_factors': 46, 'n_epochs': 200, 'lr_all': 0.001, 'reg_all': 0.2}
#tiempo:  1336.0529103279114
#SVD_GS full2: 20 hs
#best_Param = {'n_factors': 46, 'n_epochs': 200, 'lr_all': 0.001, 'reg_all': 0.2}
#tiempo:  70687.84449291229

SVD_GS
best_Param = {'n_factors': 46, 'n_epochs': 200, 'lr_all': 0.001, 'reg_all': 0.2}
tiempo:  70687.84449291229


In [10]:
#### Busqueda de hiper parametros ####
t_0 = time.time()
svd_rs = sp.model_selection.search.RandomizedSearchCV(SVD_algo, param_grid, measures=['rmse'],cv=5, n_jobs=-1)
svd_rs.fit(data_train)
print("SVD_RS")
print('best_Param =', svd_rs.best_params['rmse']) 
print ("tiempo: ", time.time() - t_0)

SVD
RMSE = {'n_factors': 42, 'n_epochs': 100, 'lr_all': 0.003, 'reg_all': 0.2, 'biased': True}
tiempo:  400.4446563720703


In [8]:
#### Entrenamiento ####
t_0 = time.time()
train = df_train[['usuario', 'libro', 'puntuacion']]
data = sp.dataset.Dataset.load_from_df(train, reader)
trainset, testset = sp.model_selection.train_test_split(data, test_size=0.30, random_state=0)
model = SVD_algo(n_factors= 46, n_epochs= 200, lr_all= 0.001, reg_all= 0.2, biased=True)
print ("tiempo: ", time.time() - t_0)

tiempo:  0.21262049674987793


In [9]:
#### Predicciones ####
t_0 = time.time()
# Fit sobre trainset (70% de data) y predicciones sobre testset(30% data)
model.fit(trainset)
y_predictions = model.test(testset)
# Fit sobre trainset full
model.fit(data.build_full_trainset())
print(sp.accuracy.rmse(y_predictions))
print ("tiempo: ", time.time() - t_0)

RMSE: 1.5809
1.5809351401092744
tiempo:  30.04964590072632


In [10]:
submit = pd.DataFrame(full_prediction(model,df_test))
print(submit)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


          id  puntuacion
0          1      7.1850
1          2      8.0546
2          3      7.2928
3          4      7.5517
4          5      6.6829
...      ...         ...
10579  10580      8.5150
10580  10581      7.6038
10581  10582      7.4369
10582  10583      8.6752
10583  10584      5.8584

[10584 rows x 2 columns]


In [96]:
# Column Compare
#submit_manual.where(submit_manual.puntuacion==submit.puntuacion).notna()

In [11]:
#n_factors= 55, n_epochs= 300, lr_all= 0.001, reg_all= 0.2
## RMSE: 1.5834 | RMSE_kaggle: 1.48523 ===> surpSVD_tunned.csv 
## RMSE: 1.5846 | RMSE_kaggle: 1.48610 ===> surpSVD_tunned_3.csv 
#n_factors= 65, n_epochs= 700, lr_all= 0.007, reg_all= 0.03, biased=True
##RMSE: 1.6119  
#RMSE = {'n_factors': 72, 'n_epochs': 800, 'lr_all': 0.008, 'reg_all': 0.01, 'biased': True}
##RMSE: 1.6088
#RMSE = {'n_factors': 42, 'n_epochs': 100, 'lr_all': 0.003, 'reg_all': 0.2, 'biased': True}
##RMSE: 1.5852
#n_factors= 55, n_epochs= 125, lr_all= 0.0025, reg_all= 0.25, biased=True
## RMSE: 1.5827 | RMSE_kaggle: 1.49455 ===> surpSVD_tunned_2.csv

#best_Param = {'n_factors': 46, 'n_epochs': 200, 'lr_all': 0.001, 'reg_all': 0.2}
###### RMSE: 1.5809  | RMSE_Kaggle:  1.48142 ===============> surpSVD_tunned_4.csv <===============

#submit.to_csv("./submission/surpSVD_tunned_5.csv", index=False)

# NMF

In [16]:
#### Parametros NMF ####
NMF_algo = sp.prediction_algorithms.NMF
param_grid_nmf = {    
    'n_factors': range(5,40,5),
    "n_epochs": range(20,150,10),
    'reg_pu': [0.001, 0.05, 0.06, 0.07, 0.1],
    'reg_qi': [0.005, 0.01, 0.02, 0.03, 0.1],
    'reg_bu': [0.005, 0.01, 0.02, 0.03, 0.1],
    'reg_bi': [0.005, 0.01, 0.02, 0.03, 0.1],
    'lr_bu': [0.0005, 0.004, 0.005, 0.006, 0.01],
    'lr_bi': [0.0005, 0.004, 0.005, 0.006, 0.01]
    }

In [17]:
#### Busqueda de hiper parametros ####
t_0 = time.time()
nmf_rs = sp.model_selection.search.RandomizedSearchCV(NMF_algo, param_grid_nmf, measures=['rmse'],cv=5, n_jobs=-1)
nmf_rs.fit(data_train)
print("NMF")
print('best_Param =', nmf_rs.best_params['rmse'])

print ("tiempo: ", time.time() - t_0)

NMF
best_Param = {'n_factors': 35, 'n_epochs': 50, 'reg_pu': 0.07, 'reg_qi': 0.005, 'reg_bu': 0.1, 'reg_bi': 0.02, 'lr_bu': 0.004, 'lr_bi': 0.0005}
tiempo:  62.88409471511841


In [32]:
#### Entrenamiento ####
t_0 = time.time()
train = df_train[['usuario', 'libro', 'puntuacion']]
data = sp.dataset.Dataset.load_from_df(train, reader)
trainset, testset = sp.model_selection.train_test_split(data, test_size=0.30, random_state=0)
model = NMF_algo(n_factors= 35, n_epochs= 50)
print ("tiempo: ", time.time() - t_0)

tiempo:  0.14764928817749023


In [33]:
#### Predicciones ####
t_0 = time.time()
# Fit sobre trainset (70% de data) y predicciones sobre testset(30% data)
model.fit(trainset)
y_predictions = model.test(testset)
# Fit sobre trainset full
model.fit(data.build_full_trainset())
print(sp.accuracy.rmse(y_predictions))
print ("tiempo: ", time.time() - t_0)

RMSE: 1.8140
1.8139835153445447
tiempo:  10.64688491821289


In [14]:
submit = pd.DataFrame(full_prediction(model,df_test))
print(submit)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


          id  puntuacion
0          1      7.1653
1          2      6.7115
2          3      8.2686
3          4      7.0912
4          5      7.1653
...      ...         ...
10579  10580      8.3454
10580  10581      9.6970
10581  10582      6.1085
10582  10583      8.2641
10583  10584      4.3595

[10584 rows x 2 columns]


## KNN

In [6]:
#### Parametros KNN ####
KNNMean_algo = sp.prediction_algorithms.KNNWithMeans



In [None]:
param_grid_knn =  {'min_k': range(1,10,1), # default 1
                   'k': range(1,15,1),     # default 40
                   'sim_options': {'name': ["msd"], #["msd", "cosine", "pearson_baseline", "pearson"
                                   'min_support': range(1,9,1),
                                   'user_based': [True, False]}}

In [7]:
#### Busqueda de hiper parametros ####
t_0 = time.time()
knnM_cv = sp.model_selection.search.GridSearchCV(KNNMean_algo, param_grid_knn, measures=['rmse'],cv=5, n_jobs=2)

knnM_cv.fit(data_train)
print("KNN_msd")
print('best_Param =', knnM_cv.best_params['rmse'])
print ("tiempo: ", time.time() - t_0)

NMF
best_Param = {'min_k': 7, 'k': 14, 'sim_options': {'name': 'msd', 'min_support': 1, 'user_based': False}}
tiempo:  13650.820331335068


In [8]:
param_grid_knn =  {'min_k': range(1,10,1), # default 1
                   'k': range(1,15,1),     # default 40
                   'sim_options': {'name': ["cosine"], #["msd", "cosine", "pearson_baseline", "pearson"
                                   'min_support': range(1,9,1),
                                   'user_based': [True, False]}}
#### Busqueda de hiper parametros ####
t_0 = time.time()
knnM_cv = sp.model_selection.search.GridSearchCV(KNNMean_algo, param_grid_knn, measures=['rmse'],cv=5, n_jobs=2)

knnM_cv.fit(data_train)
print("KNN_cosine")
print('best_Param =', knnM_cv.best_params['rmse'])
print ("tiempo: ", time.time() - t_0)

NMF
best_Param = {'min_k': 6, 'k': 14, 'sim_options': {'name': 'cosine', 'min_support': 1, 'user_based': False}}
tiempo:  17756.52866578102


In [8]:
param_grid_knn =  {'min_k': range(1,10,1), # default 1
                   'k': range(1,15,1),     # default 40
                   'sim_options': {'name': ["pearson_baseline"], #["msd", "cosine", "pearson_baseline", "pearson"
                                   'min_support': range(1,9,1),
                                   'user_based': [True, False]}}
#### Busqueda de hiper parametros ####
t_0 = time.time()
knnM_cv = sp.model_selection.search.GridSearchCV(KNNMean_algo, param_grid_knn, measures=['rmse'],cv=5, n_jobs=2)

knnM_cv.fit(data_train)
print("KNN_pearson_baseline")
print('best_Param =', knnM_cv.best_params['rmse'])
print ("tiempo: ", time.time() - t_0)

KeyboardInterrupt: 

In [7]:
param_grid_knn =  {'min_k': range(1,10,1), # default 1
                   'k': range(1,15,1),     # default 40
                   'sim_options': {'name': ["pearson"], #["msd", "cosine", "pearson_baseline", "pearson"
                                   'min_support': range(1,9,1),
                                   'user_based': [True, False]}}
#### Busqueda de hiper parametros ####
t_0 = time.time()
knnM_cv = sp.model_selection.search.GridSearchCV(KNNMean_algo, param_grid_knn, measures=['rmse'],cv=5, n_jobs=2)

knnM_cv.fit(data_train)
print("KNN_pearson")
print('best_Param =', knnM_cv.best_params['rmse'])
print ("tiempo: ", time.time() - t_0)

KNN_pearson
best_Param = {'min_k': 6, 'k': 14, 'sim_options': {'name': 'pearson', 'min_support': 1, 'user_based': False}}
tiempo:  18970.095390319824


In [7]:
#best_Param = {'min_k': 1, 'k': 5, 'sim_options': {'name': 'msd', 'min_support': 1, 'user_based': False}}
#tiempo:  1790.2675387859344
sim_options =  {'name': 'pearson', 'min_support': 1, 'user_based': False}

In [8]:
#### Entrenamiento ####
t_0 = time.time()
train = df_train[['usuario', 'libro', 'puntuacion']]
data = sp.dataset.Dataset.load_from_df(train, reader)
trainset, testset = sp.model_selection.train_test_split(data, test_size=0.30, random_state=0)
model = sp.prediction_algorithms.knns.KNNWithMeans(min_k=6, k=14, sim_options=sim_options)
print ("tiempo: ", time.time() - t_0)

tiempo:  0.13117766380310059


In [9]:
## MSD RMSE: 1.7500
## COSINE RMSE: 1.7581
## PEARSON RMSE: 1.7722

#### Predicciones ####
t_0 = time.time()
# Fit sobre trainset (70% de data) y predicciones sobre testset(30% data)
model.fit(trainset)
y_predictions = model.test(testset)
# Fit sobre trainset full
model.fit(data.build_full_trainset())
print(sp.accuracy.rmse(y_predictions))
print ("tiempo: ", time.time() - t_0)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.7722
1.7721606633930036
tiempo:  37.10352563858032


## Cross Validation


In [62]:
kf = sp.model_selection.KFold(n_splits=2)

In [67]:
#pred_parciales_cv = dict(lambda: dict())#defaultdict(lambda: defaultdict(int))
list_pred = []
for trainset, testset in kf.split(data):

    # train and test algorithm.
    model.fit(trainset)
    predictions = model.test(testset)
    
    for element in pre:
        row_pred = dict()
        row_pred["id"] = element.uid
        row_pred["puntuacion"] = np.around(element.est, 4)
        list_pred.append(row_pred) 
    # Compute and print Root Mean Squared Error
    sp.accuracy.rmse(predictions, verbose=True)

RMSE: 1.8389
RMSE: 1.8144


In [68]:
list_pred

ion': 7.1598},
 {'id': 203, 'puntuacion': 6.2837},
 {'id': 91, 'puntuacion': 7.5905},
 {'id': 271, 'puntuacion': 7.4847},
 {'id': 98, 'puntuacion': 5.9931},
 {'id': 1915, 'puntuacion': 6.9876},
 {'id': 218, 'puntuacion': 7.3661},
 {'id': 235, 'puntuacion': 7.1598},
 {'id': 128, 'puntuacion': 8.2441},
 {'id': 226, 'puntuacion': 6.5246},
 {'id': 22, 'puntuacion': 6.5808},
 {'id': 274, 'puntuacion': 8.6095},
 {'id': 0, 'puntuacion': 7.9207},
 {'id': 151, 'puntuacion': 8.1336},
 {'id': 161, 'puntuacion': 7.1598},
 {'id': 216, 'puntuacion': 9.1245},
 {'id': 300, 'puntuacion': 7.3442},
 {'id': 126, 'puntuacion': 3.2484},
 {'id': 128, 'puntuacion': 8.438},
 {'id': 211, 'puntuacion': 7.3343},
 {'id': 88, 'puntuacion': 7.1598},
 {'id': 683, 'puntuacion': 8.584},
 {'id': 35, 'puntuacion': 6.1603},
 {'id': 102, 'puntuacion': 7.1598},
 {'id': 265, 'puntuacion': 9.3166},
 {'id': 299, 'puntuacion': 8.8961},
 {'id': 271, 'puntuacion': 7.0165},
 {'id': 199, 'puntuacion': 7.4781},
 {'id': 169, 'puntuac