In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm, tqdm_notebook
from xgboost import XGBRegressor
import warnings, sys, os, gc
from sklearn import preprocessing

## Carga Parcial

In [2]:
# Read the four input CSVs (books, users, train ratings, test ratings) from
# ./data, binding each one to its own frame in a single unpacking step.
df_libros, df_usr, df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/libros.csv',
        './data/usuarios.csv',
        './data/train_svd.csv',
        './data/test_svd.csv',
    )
)
# Quick sanity check on the dimensions of the two rating frames.
print(df_train.shape, df_test.shape)

(42320, 5) (10584, 6)


## Preprocesamiento

In [3]:
def write_submit(df_test, prediccion, fileName):
    """Write a submission CSV pairing each test ``id`` with its rounded score.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Frame that provides the ``id`` column. Its rows must be in the same
        order as ``prediccion``; its index is ignored.
    prediccion : array-like
        One predicted score per row of ``df_test``. Values are rounded to the
        nearest whole number with ``np.around``.
    fileName : str
        Base name (without extension) of the file written to
        ``./submission/<fileName>.csv``.

    Raises
    ------
    ValueError
        If ``prediccion`` does not contain exactly one value per row of
        ``df_test``.
    """
    # Work on a plain 1-D array so ids and scores are paired by position.
    # Going through an intermediate DataFrame aligns on index labels instead,
    # which yields NaN scores whenever df_test does not carry a 0..n-1 index.
    scores = np.asarray(np.around(prediccion, 0)).ravel()
    if len(scores) != len(df_test):
        raise ValueError(
            "prediccion has %d values but df_test has %d rows"
            % (len(scores), len(df_test))
        )

    submit = pd.DataFrame({'id': df_test['id'].values, 'puntuacion': scores})

    # Make sure the target folder exists so a fresh checkout can write to it.
    os.makedirs("./submission", exist_ok=True)
    submit.to_csv("./submission/" + fileName + ".csv", index=False)

In [5]:
# Count the missing values in each column of the book catalogue.
df_libros.isnull().sum()

anio           4
autor          4
editorial      9
genero         9
idioma       106
isbn           6
libro          0
resumen      364
titulo         4
dtype: int64

In [6]:
# Clean the book catalogue and attach its metadata to the rating frames.
# NOTE: df_train / df_test are rebound to their merged versions here, so this
# cell must not be run twice without re-running the loading cell first.
print(len(df_libros.anio), len(df_libros.anio.unique()))

# Most frequent value of each metadata column, used to fill the gaps.
moda_anio = df_libros.anio.mode().values[0]
moda_genero = df_libros.genero.mode().values[0]
moda_editorial = df_libros.editorial.mode().values[0]
moda_autor = df_libros.autor.mode().values[0]

# A missing language defaults to Spanish; the other columns fall back to
# their mode. One dict-based fillna handles every column at once.
df_libros = df_libros.fillna({
    'idioma': 'Español',
    'anio': moda_anio,
    'genero': moda_genero,
    'editorial': moda_editorial,
    'autor': moda_autor,
})

# Keep only years written as exactly four digits and replace everything else
# with the mode. This covers the known bad values ('(200', 'Español') as well
# as any other malformed entry that would break the later integer cast.
anio_valido = df_libros['anio'].astype(str).str.match(r'^[0-9]{4}$')
df_libros.loc[~anio_valido, 'anio'] = moda_anio

# Use one metadata row per book: a repeated 'libro' key would otherwise
# duplicate rating rows in the merges and break the id/prediction pairing
# of the test frame.
columnas_libro = ['libro', 'anio', 'genero', 'editorial', 'autor']
libro_features = df_libros[columnas_libro].drop_duplicates(subset='libro')

# Inner join for train (ratings of unknown books are dropped); left join for
# test so every row to be predicted is kept.
df_train = pd.merge(left=df_train, right=libro_features, on='libro')
df_test = pd.merge(left=df_test, right=libro_features, how='left', on='libro')

# Test rows whose book is not in the catalogue get the modal year so the
# column can be cast to int later; their genero / editorial / autor stay NaN.
df_test[['anio']] = df_test[['anio']].fillna(moda_anio)

# Show what is still missing in the catalogue after cleaning.
df_libros.isna().sum()

14950 94


anio           0
autor          0
editorial      0
genero         0
idioma         0
isbn           6
libro          0
resumen      364
titulo         4
dtype: int64

In [6]:
## (Disabled) Replace the book-name string with the book's ISBN.
## Drop rows whose ISBN is NA or contains "Español", "/", "B", "-", "q" or a space.
#train = df_train
#print(len(train))
#train['libro'] = train['libro'].map(df_libros.set_index('libro')['isbn'])
#train = train[train['libro'].notna()]
#train = train[~train.libro.str.contains("Español")]
#train = train[~train.libro.str.contains("/")]
#train = train[~train.libro.str.contains("B")]
#train = train[~train.libro.str.contains("-")]
#train = train[~train.libro.str.contains("q")]
#train = train[~train.libro.str.contains(" ")]
#train['libro'] = train['libro'].astype(int)
#print(len(train))
## Check whether every value is numeric
##train.applymap(np.isreal).sum()

In [7]:
# `train` / `test` are aliases of the merged frames (no copy is made), so the
# casts below also modify df_train / df_test.
train = df_train
test = df_test

# Target dtype of each feature column, applied in this order to both frames.
dtype_by_column = {
    'libro': 'category',
    'genero': 'category',
    'anio': 'int',
    'editorial': 'category',
    'autor': 'category',
}
for frame in (train, test):
    for column, dtype in dtype_by_column.items():
        frame[column] = frame[column].astype(dtype)

In [8]:
# Feature matrix and target (kept as a one-column frame) from the training data.
feature_columns = ['libro', 'usuario', 'svd', 'knn', 'anio', 'genero', 'editorial', 'autor']
full_x = train[feature_columns]
full_y = train[['puntuacion']]

# Reproducible 70/30 split into a fitting part and a hold-out part.
X_train, X_test, y_train, y_test = train_test_split(
    full_x,
    full_y,
    test_size=0.3,
    random_state=0,
)

In [9]:
# Restrict the test frame to the model's feature columns, in training order.
test = test.loc[:, ['libro', 'usuario', 'svd', 'knn', 'anio', 'genero', 'editorial', 'autor']]
test

Unnamed: 0,libro,usuario,svd,knn,anio,genero,editorial,autor
0,los-hijos,201,7.17,7.17,2014,"Biografías, Memorias",ALFAGUARA,"Talese, Gay"
1,el-temor-de-un-hombre-sabio,299,9.44,8.34,2011,"Fantástica, ciencia ficción",PLAZA & JANÉS,"Rothfuss, Patrick"
2,leon-bocanegra,126,7.41,7.48,2005,Ficción literaria,DEBOLSILLO,"Vázquez-Figueroa, Alberto"
3,el-caballero-errante-2,107,7.61,7.75,2004,"Cómics, Novela Gráfica",DEVIR,"Martin, George R.R."
4,el-mar-el-mar,85,6.59,7.17,2005,Narrativa,DEBOLSILLO,"Murdoch, Iris"
...,...,...,...,...,...,...,...,...
10579,la-princesa-prometida,854,8.57,8.67,2018,"Fantástica, ciencia ficción",ÁTICO DE LOS LIBROS,"Goldman, William"
10580,la-prima-bette,216,8.05,7.01,2010,Clásicos de la literatura,ALBA,"Balzac, Honore De"
10581,el-guardian-invisible-trilogia-del-baztan-1,132,6.90,7.46,2013,"Novela negra, intriga, terror",DESTINO,"Redondo, Dolores"
10582,un-otono-romano,133,9.08,9.11,2014,Narrativa,PLAZA & JANÉS,"Reverte, Javier"


In [10]:
# Display the hold-out feature frame for a visual check.
X_test

Unnamed: 0,libro,usuario,svd,knn,anio,genero,editorial,autor
32123,anatomia-de-un-instante,547,7.40,6.41,2009,Narrativa,MONDADORI,"Cercas, Javier"
30718,muerto-hasta-el-anochecer,230,7.94,7.65,2009,"Fantástica, ciencia ficción",LA FACTORÍA DE IDEAS,"Harris, Charlaine"
37446,un-paraiso-inalcanzable,216,7.06,7.01,2013,Narrativa,LIBROS DEL ASTEROIDE,"Mortimer, John"
31138,todo-paracuellos,271,9.02,7.33,2007,"Cómics, Novela Gráfica",DEBOLSILLO,"Giménez Giménez, Carlos"
38096,levantado-del-suelo,312,6.01,6.07,2007,Literatura contemporánea,PUNTO DE LECTURA,"Saramago, José"
...,...,...,...,...,...,...,...,...
27792,si-esto-es-una-mujer,88,7.19,7.17,2019,"Novela negra, intriga, terror",DESTINO,"Silva, Lorenzo Y Trujillo, Noemí"
12778,el-diario-de-bridget-jones,128,7.63,5.64,2003,"Romántica, erótica",DEBOLSILLO,"Fielding, Helen"
17336,la-hipotesis-del-mal,309,7.73,7.50,2015,"Novela negra, intriga, terror",PLANETA,"Carrisi, Donato"
29736,el-banquete-1,288,8.81,6.92,2014,Clásicos de la literatura,GREDOS,Platón


## LGB

In [11]:
# Build the LightGBM training set from the complete feature/target frames.
# Alternative kept for reference (fit on the split, evaluate on the hold-out):
#   lgb_train = lgb.Dataset(X_train, y_train)
#   lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
lgb_train = lgb.Dataset(data=full_x, label=full_y)

In [12]:
# LightGBM configuration: gradient-boosted trees on a regression objective,
# tracking both L2 and L1 as metrics.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric={'l2', 'l1'},
    num_leaves=50,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [14]:
print('Starting training...')
# Fit the booster for a fixed 20 rounds. No validation set is passed, so no
# early stopping takes place.
# Earlier variant, kept for reference:
#   gbm = lgb.train(params, lgb_train, num_boost_round=20, valid_sets=lgb_eval, early_stopping_rounds=5)
gbm = lgb.train(params=params, train_set=lgb_train, num_boost_round=20)
# Saving the model to disk is disabled:
#   gbm.save_model('model.txt')

Starting training...
You can set `force_col_wise=true` to remove the overhead.


In [14]:
print('Starting predicting...')
# Predict on the X_train part of the split.
y_pred_lgb = gbm.predict(data=X_train, num_iteration=gbm.best_iteration)
# Evaluation is currently disabled:
#   print('RMSE: %.4f' % np.sqrt(mean_squared_error(y_test, y_pred_lgb)))
# NOTE(review): that line compares against y_test while the predictions above
# come from X_train, so the two would not line up if it were re-enabled as-is.
#
# Experiment log (the figure after the arrow is presumably the score of the
# corresponding submission -- TODO confirm):
#   RMSE: 1.6434
#   Ensemble: RMSE: 0.6602
#   Ensemble: RMSE: 0.6557 ---> 1.52247
#   With anio and genero: RMSE: 0.7480 ---> 1.52072 ****
#   With anio, genero, editorial and autor: RMSE: 0.6544 ---> 1.52869

Starting predicting...
RMSE: 0.6544


In [15]:
# Score the rows of the test frame with the trained booster.
y_pred_lgb_test = gbm.predict(
    data=test,
    num_iteration=gbm.best_iteration,
)

In [16]:
# The two counts must match: one prediction per test row.
print(len(test), len(y_pred_lgb_test), sep='\n')

10584
10584


In [17]:
# Write the LightGBM predictions as a submission file; df_test (not `test`)
# is passed because write_submit reads the `id` column from it.
write_submit(
    df_test=df_test,
    prediccion=y_pred_lgb_test,
    fileName="ignacio_submit_ensamble5",
)

## XGBOOST

In [10]:
## Disabled: does not work with categorical data
#model_xgb = XGBRegressor()
#model_xgb.fit(X_train, y_train)
#print('Starting predicting...')
#y_pred_xgb = model_xgb.predict(X_test)
#print('RMSE: %.4f' % np.sqrt(mean_squared_error(y_test, y_pred_xgb)))

In [None]:
# Display the four pieces of the train/hold-out split as a tuple.
# NOTE(review): this cell has no execution count; it looks like leftover
# inspection code.
X_train, X_test, y_train, y_test

In [25]:
# Display the training part of the split for a visual check.
X_train

Unnamed: 0,libro,usuario,svd,knn,anio,genero,editorial,autor
40992,spqr-el-senador-de-roma,169,6.98,6.79,2007,Histórica y aventuras,EDHASA,"Waltari, Mika"
37239,narraciones-inverosimiles,300,6.10,6.68,2000,Narrativa,CLAN,"Alarcón, Pedro Antonio De"
25607,un-hombre-en-la-oscuridad,88,7.08,6.96,2008,Ficción literaria,ANAGRAMA,"Auster, Paul"
41609,la-danza-de-los-maestros-de-wu-li,255,5.13,6.21,1991,Lecturas complementarias,PLAZA & JANÉS,"Zukav, Gary"
29195,reinventarse-tu-segunda-oportunidad,2202,6.29,7.00,2010,Lecturas complementarias,PLATAFORMA,"Alonso Puig, Mario"
...,...,...,...,...,...,...,...,...
41993,fogatas,88,7.90,7.25,1988,Narrativa,EL ALEPH,"Nemirovsky, Irene"
32103,los-watson,305,7.81,7.04,2012,Literatura contemporánea,NÓRDICA,"Austen, Jane"
30403,el-hombre-de-arena-1,158,7.97,7.75,2014,"Novela negra, intriga, terror",PLANETA,"Kepler, Lars"
21243,el-gran-arcano,205,6.98,6.93,2007,Histórica y aventuras,DEBOLSILLO,"Sánchez-Garnica, Paloma"


In [22]:
# Fit a label encoder on the distinct book identifiers of the catalogue.
# `fit` returns the encoder itself, so it can be bound directly and then shown.
libros = df_libros['libro'].unique()
le = preprocessing.LabelEncoder().fit(libros)
le

LabelEncoder()

In [26]:
# Names of the categorical feature columns; presumably the ones to be
# target-encoded further down -- TODO confirm against the cells that use it.
target_encode_columns = ['libro', 'genero', 'editorial', 'autor']