In [1]:
import pandas as pd
import time
import pandas_profiling
import numpy as np
import math
import seaborn as sns
import numpy as np
from sklearn import datasets
from datetime import date, datetime
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
import warnings, sys, os, gc

import surprise as sp
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV, RandomizedSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, SVD, NMF
from surprise import accuracy


## Surprise

In [2]:
# Read the two opinion CSVs from ./data into DataFrames (training file first, then test file).
df_train = pd.read_csv(filepath_or_buffer='./data/opiniones_train.csv')
df_test = pd.read_csv(filepath_or_buffer='./data/opiniones_test.csv')

In [3]:
# Hold out 30% of the labelled opinions for local evaluation.
# The split is seeded so that Restart & Run All reproduces the same partition and the
# scores printed further down stay comparable between runs.
trainset, testset = train_test_split(df_train, test_size=0.30, random_state=42)
# Disabled alternative: train on all of df_train and use df_test as the test set.
#trainset = df_train
#testset = df_test
print(trainset.shape, testset.shape)

(29624, 3) (12696, 3)


In [81]:
#testset = testset.rename(columns = {'libro': 'id'}, inplace = False)
#trainset = trainset.rename(columns = {'libro': 'id'}, inplace = False)

In [5]:
# rating_scale tells Surprise that ratings lie between 1.0 and 10.0.
reader = sp.reader.Reader(rating_scale=(1.0, 10.0))
# NOTE(review): per the Surprise docs, load_from_df reads the columns as (user id, item id,
# rating) in that order, so with this selection 'libro' fills the user slot and 'usuario'
# the item slot. Presumably libro = book and usuario = user; confirm the order is intended,
# since it flips what 'user_based' means for the KNN models.
data_train= sp.dataset.Dataset.load_from_df(trainset[['libro', 'usuario', 'puntuacion']], reader)

### Búsqueda de hiperparámetros

In [6]:
# Candidate values for the KNN random searches: neighbourhood sizes (k, min_k) plus the
# similarity measure and whether similarities are computed user-based.
param_grid = {
    'min_k': [*range(2, 11)],
    'k': [*range(10, 45, 5)],
    'sim_options': {
        'name': ["msd", "cosine", "pearson_baseline", "pearson"],
        'user_based': [False, True],
    },
}

In [7]:
# Randomized hyper-parameter search for KNNBasic over param_grid: 20 sampled configurations,
# measured on RMSE and MAE, with 2 parallel jobs. t_0 brackets the whole cell for timing.
t_0=time.time()
knnbasic_rs = RandomizedSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'],n_iter=20, n_jobs=2)
knnbasic_rs.fit(data_train)
# Exhaustive grid-search alternative, left disabled:
#knnbasic_gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=3)
#knnbasic_gs.fit(data_train)
print("KNNBasic")
# NOTE: despite the 'RMSE =' label, this prints the best parameter combination for RMSE,
# not the RMSE score itself.
print('RMSE =', knnbasic_rs.best_params['rmse']) 
print ("tiempo: ", time.time() - t_0)

KNNBasic
RMSE = {'min_k': 7, 'k': 30, 'sim_options': {'name': 'msd', 'user_based': True}}
tiempo:  164.5410885810852


In [8]:
# Same randomized search as for KNNBasic, here for KNNWithMeans: 20 sampled configurations
# from param_grid, measured on RMSE and MAE, with 2 parallel jobs.
t_0=time.time()
knnmeans_rs = RandomizedSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], n_iter=20, n_jobs=2)
knnmeans_rs.fit(data_train)
print("KNNWithMeans")
# NOTE: despite the 'RMSE =' label, this prints the best parameter combination for RMSE,
# not the RMSE score itself.
print('RMSE =', knnmeans_rs.best_params['rmse']) 
print ("tiempo: ", time.time() - t_0)

KNNWithMeans
RMSE = {'min_k': 8, 'k': 35, 'sim_options': {'name': 'msd', 'user_based': False}}
tiempo:  224.38877844810486


In [9]:
# Similarity options for the final KNN models; the values match the best parameters
# printed by the recorded random searches.
so_Basic = dict(name='msd', user_based=True)
so_Means = dict(name='msd', user_based=False)

In [10]:
# Instantiate the two KNN models with fixed neighbourhood sizes and the similarity
# options defined for each of them.
KNNBasic_algo = sp.KNNBasic(k=30, min_k=7, sim_options=so_Basic)
KNNWithMeans_algo = sp.KNNWithMeans(k=35, min_k=8, sim_options=so_Means)

In [11]:
# Turn the whole of data_train into a Surprise trainset for the final fits.
# NOTE(review): this rebinds `trainset`, which was the pandas training split until now.
# Re-running the cell that builds data_train from trainset[[...]] after this one would
# hand it a Surprise object instead of a DataFrame; a distinct name would avoid that.
trainset = data_train.build_full_trainset()

In [12]:
# Fit KNNBasic on the full trainset and report the elapsed wall-clock time.
t_0 = time.time()
# NOTE(review): the name suggests predictions, but this holds whatever fit() returns
# (in Surprise, presumably the fitted algorithm itself); confirm before using it.
pred_KNNBasic = KNNBasic_algo.fit(trainset)
print ("tiempo: ", time.time() - t_0)

Computing the msd similarity matrix...
Done computing similarity matrix.
tiempo:  4.452387809753418


In [69]:
# Fit KNNWithMeans on the full trainset and report the elapsed wall-clock time.
# NOTE(review): the recorded output of this cell mentions a cosine similarity matrix while
# so_Means specifies 'msd', and its execution count is out of sequence; the output looks
# stale and should be regenerated with Restart & Run All.
t_0 = time.time()
# As with KNNBasic, this stores the return value of fit(), not predictions.
pred_KNNWithMeans = KNNWithMeans_algo.fit(trainset)
print ("tiempo: ", time.time() - t_0)

Computing the cosine similarity matrix...
Done computing similarity matrix.
tiempo:  0.4728994369506836


In [20]:
def full_prediction(algo_name,algoritmo,test):
    """Predict a rating for every row of ``test`` and print the resulting RMSE.

    Parameters
    ----------
    algo_name : str
        Label prefixed to the printed score line.
    algoritmo : object
        Fitted model exposing ``predict(uid, iid, verbose=False)`` whose result has an
        ``est`` attribute (the estimated rating).
    test : pandas.DataFrame
        Must provide ``libro``, ``usuario`` and ``puntuacion`` columns.

    Returns
    -------
    list of dict
        One ``{"libro": ..., "puntuacion": ...}`` record per row, in row order, with the
        estimate rounded to 2 decimals. An empty list when ``test`` has no rows.
    """
    list_pred = []
    # itertuples() is a generator without a length, so total= is passed explicitly;
    # otherwise the progress bar cannot show how far along the loop is.
    for row in tqdm_notebook(test.itertuples(), total=len(test)):
        # Ids go to predict() in the same order the notebook uses when building its
        # training data (libro first, usuario second).
        uid = row.libro
        iid = row.usuario
        pred = algoritmo.predict(uid, iid, verbose=False)
        list_pred.append({"libro": uid, "puntuacion": np.around(pred.est, 2)})
    if not list_pred:
        # Nothing to score: the error metric is undefined on zero samples.
        return list_pred
    estimates = [record["puntuacion"] for record in list_pred]
    # The square root of the mean squared error is the RMSE, so it is labelled as such.
    rmse = np.sqrt(mean_squared_error(test.puntuacion, estimates))
    print(algo_name + " RMSE:      %.4f" % rmse)
    return list_pred

In [18]:
# Both KNN models are evaluated on the same hold-out frame; these names are aliases of
# testset, not copies.
test_knn = test_knnMeans = testset
print(len(testset))

12696


In [21]:
# Run both fitted KNN models over the hold-out rows. Each full_prediction call prints its
# own score line; the returned records are wrapped in a DataFrame. The timer covers both
# prediction passes together.
t_0 = time.time()
df_list_pred_KNNBasic = pd.DataFrame(full_prediction("KNNBasic",KNNBasic_algo,test_knn))
df_list_pred_KNNWithMeans = pd.DataFrame(full_prediction("KNNWithMeans",KNNWithMeans_algo,test_knnMeans))
print ("tiempo: ", time.time() - t_0)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


KNNBasic MSE:      1.6802


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


KNNWithMeans MSE:      1.6788
tiempo:  3.1889472007751465


In [75]:
# 1.5 y algo...
#df_list_pred_KNN.to_csv("./submission/ignacio_submit_KNN.csv", index=False)

## SVD y NMF

In [22]:
# Hyper-parameter search spaces for the matrix-factorisation models.
param_grid_svd = {
    'n_factors': [*range(40, 130, 5)],
    'n_epochs': [*range(1, 100)],
    'lr_all': [0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008],
    'reg_all': [0.01, 0.02, 0.03],
    'biased': [True, False],
}

param_grid_nmf = {
    'n_factors': [*range(40, 130, 10)],
    'n_epochs': [*range(1, 100)],
    'biased': [True, False],
}

# Instances built with no arguments (library defaults).
# NOTE(review): both names are assigned again with explicit hyper-parameters in a later
# cell of this notebook, so these instances appear to be unused; confirm and remove.
SVD_algo = SVD()
NMF_algo = NMF()

In [23]:
# Randomized hyper-parameter search for SVD over param_grid_svd: 25 sampled configurations,
# measured on RMSE and MAE, with 2 parallel jobs. t_0 brackets the whole cell for timing.
t_0 = time.time()
svd_rs = RandomizedSearchCV(SVD, param_grid_svd, measures=['rmse', 'mae'],n_iter=25, n_jobs=2)
svd_rs.fit(data_train)
print("SVD")
# NOTE: despite the 'RMSE =' label, this prints the best parameter combination for RMSE,
# not the RMSE score itself.
print('RMSE =', svd_rs.best_params['rmse']) 
print ("tiempo: ", time.time() - t_0)

SVD
RMSE = {'n_factors': 90, 'n_epochs': 9, 'lr_all': 0.006, 'reg_all': 0.03, 'biased': True}
tiempo:  210.96164655685425


In [26]:
# Randomized hyper-parameter search for NMF over param_grid_nmf: 20 sampled configurations,
# measured on RMSE and MAE, with 2 parallel jobs.
# NOTE(review): the recorded run of this cell ended with "ZeroDivisionError: float division",
# so no best parameters were printed. TODO: find which sampled configuration triggers it
# before relying on this search.
t_0 = time.time()
nmf_rs = RandomizedSearchCV(NMF, param_grid_nmf, measures=['rmse', 'mae'],n_iter=20, n_jobs=2)
nmf_rs.fit(data_train)
print("NMF")
# NOTE: despite the 'RMSE =' label, this prints the best parameter combination for RMSE,
# not the RMSE score itself.
print('RMSE =', nmf_rs.best_params['rmse']) 
print ("tiempo: ", time.time() - t_0)

ZeroDivisionError: float division

In [83]:
# Final models with explicit hyper-parameters. The SVD values match the best parameters
# printed by the recorded SVD random search.
SVD_algo = SVD(n_factors= 90, n_epochs= 9, lr_all= 0.006, reg_all= 0.03, biased=True)
# NOTE(review): the NMF values do not come from any search result visible in this notebook
# (the recorded NMF search raised an error); confirm where they originate.
NMF_algo = NMF(n_factors= 40, n_epochs= 95, biased=False)

In [84]:
# Build a Surprise trainset from all of data_train for the matrix-factorisation fits.
trainset_model = data_train.build_full_trainset()
t_0 = time.time()
# The SVD fit is disabled, so the first "tiempo" printed below times an empty span.
#pred_SVD = SVD_algo.fit(trainset_model)
print ("tiempo: ", time.time() - t_0)
t_0 = time.time()
# Stores the return value of fit(), not predictions, despite the variable name.
pred_NMF = NMF_algo.fit(trainset_model)
print ("tiempo: ", time.time() - t_0)

tiempo:  0.00011324882507324219
tiempo:  9.384754180908203


In [85]:
# Evaluation frame for NMF: an alias of testset, not a copy. The SVD alias is disabled
# together with the SVD fit.
#test_svd = testset
test_nmf = testset
print(len(testset))

12696


In [86]:
# Run the fitted NMF model over the hold-out rows; full_prediction prints the score line and
# the returned records are wrapped in a DataFrame. The SVD counterpart is disabled.
#df_list_pred_SVD = pd.DataFrame(full_prediction("SVD",SVD_algo,test_svd))
df_list_pred_NMF = pd.DataFrame(full_prediction("NMF",NMF_algo,test_nmf))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


NMF MSE:      1.8935
