# TP2 - Organización de Datos
#### Notebook principal

<hr>

### Notebooks utilizados:

- ***pre_processing:*** notebook para el manejo inicial de los dataframes.
- ***feature_generation:*** primer etapa del pipeline. En este notebook se generarán nuevos features para luego, realizar un proceso de selección de los mejores features para cada modelo.
- ***feature_selection*** segunda etapa, donde se buscara encontrar los features con mayor importancia, es decir aquellos que aporten mayor informacion.
- ***parameter_tuning:*** tercer etapa, notebook donde se tunean los parámetros para cada modelo.

<hr>


In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

seed = 42

In [2]:
import nbimporter

import pre_processing
import feature_generation
import feature_selection
import parameter_tuning

Importing Jupyter notebook from pre_processing.ipynb
Importing Jupyter notebook from feature_generation.ipynb
Importing Jupyter notebook from feature_selection.ipynb
Importing Jupyter notebook from parameter_tuning.ipynb


In [3]:
def escribir_respuesta(ids,predicciones):
    with open("respuesta.csv",'w') as archivo:
        archivo.write("id,target\n")
        for i in range(len(ids)):
            linea = f"{int(ids[i])},{predicciones[i]}"
            archivo.write(f"{linea}\n")

<hr>

# Resultados obtenidos

# Mejor submit hasta la fecha -- 19/11
Este submit nos dio 472 en Kaggle, pero tiene mucho overfitting.

In [4]:
import lightgbm as lgb
from datetime import datetime

In [5]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mae',
    'max_depth': 14,
    'num_leaves': 120,
    'learning_rate': 0.02,
    'verbose': 0, 
    'early_stopping_round': 1000}
n_estimators=99999999

In [6]:
train,test = pre_processing.load_featured_datasets()

In [7]:
train['precio'] = train['precio'].map(lambda x: math.log(x))

In [8]:
train_selected = feature_selection.get_selected_dataframe(train)

In [9]:
X = train_selected.drop('precio', axis=1)
Y = train_selected['precio']

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.1, random_state=seed)

In [10]:
print(datetime.now())

2019-11-19 23:15:12.446117


In [11]:
d_train = lgb.Dataset(X_train.values, label=Y_train.values)
d_valid = lgb.Dataset(X_val.values, label=Y_val.values)
watchlist = [d_valid]
reg = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=500)



Training until validation scores don't improve for 1000 rounds
[500]	valid_0's l1: 0.197949
[1000]	valid_0's l1: 0.192564
[1500]	valid_0's l1: 0.189929
[2000]	valid_0's l1: 0.188237
[2500]	valid_0's l1: 0.186912
[3000]	valid_0's l1: 0.185966
[3500]	valid_0's l1: 0.185192
[4000]	valid_0's l1: 0.184344
[4500]	valid_0's l1: 0.183718
[5000]	valid_0's l1: 0.183126
[5500]	valid_0's l1: 0.182645
[6000]	valid_0's l1: 0.182254
[6500]	valid_0's l1: 0.181833
[7000]	valid_0's l1: 0.181479
[7500]	valid_0's l1: 0.181153
[8000]	valid_0's l1: 0.180848
[8500]	valid_0's l1: 0.1806
[9000]	valid_0's l1: 0.180359
[9500]	valid_0's l1: 0.180107
[10000]	valid_0's l1: 0.179954
[10500]	valid_0's l1: 0.179728
[11000]	valid_0's l1: 0.179515
[11500]	valid_0's l1: 0.179334
[12000]	valid_0's l1: 0.179182
[12500]	valid_0's l1: 0.17907
[13000]	valid_0's l1: 0.178926
[13500]	valid_0's l1: 0.178839
[14000]	valid_0's l1: 0.178763
[14500]	valid_0's l1: 0.178636
[15000]	valid_0's l1: 0.178544
[15500]	valid_0's l1: 0.178446

In [12]:
print(datetime.now())

2019-11-19 23:32:58.995210


In [13]:
Y_pred = reg.predict(X_val.values)

f = np.vectorize(math.exp)
Y_pred = f(Y_pred)
Y_val = f(Y_val.values)
mean_absolute_error(Y_val,Y_pred)

449620.60086473916

In [14]:
# Preparamos respuesta para Kaggle

In [17]:
test_selected = feature_selection.get_selected_dataframe(test, precio=False)

In [18]:
ids = test_selected.index.values
X_test = test_selected.values
test_predict = reg.predict(X_test)
f = np.vectorize(math.exp)
test_predict = f(test_predict)
escribir_respuesta(ids, test_predict)

# LightGBM (sin overfitting)

In [1]:
import lightgbm as lgb

In [5]:
train,test = pre_processing.load_featured_datasets()

In [6]:
params = parameter_tuning.get_best_params()['lightgbm']

In [7]:
train['precio'] = train['precio'].map(lambda x: math.log(x))

In [8]:
train_selected = feature_selection.get_selected_dataframe(train)
test_selected = feature_selection.get_selected_dataframe(test, precio=False)

In [44]:
X = train_selected.drop('precio', axis=1)
Y = train_selected['precio']

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)

In [45]:
params

{'num_leaves': 55,
 'min_gain_to_split': 0.1,
 'min_data_in_leaf': 3000,
 'max_depth': 12,
 'max_bin': 150,
 'feature_fraction': 0.7,
 'bagging_freq': 5,
 'bagging_fraction': 0.75,
 'boosting_type': 'gbdt',
 'objective': 'regression',
 'metric': 'mae',
 'num_boost_round': 5000,
 'verbose': 0,
 'learning_rate': 0.05,
 'early_stopping_round': 200,
 'min_gain_to_spit': 0}

In [48]:
# Modificamos algunos parametros...
params['min_data_in_leaf'] = 5000
params['min_gain_to_split'] = 0.05
params['learning_rate'] = 0.05
params['num_boost_round'] = 10000

In [49]:
%%time
d_train = lgb.Dataset(X_train.values, label=Y_train.values)
d_valid = lgb.Dataset(X_val.values, label=Y_val.values)
watchlist = [d_valid, d_train]
reg = lgb.train(params, train_set = d_train, valid_sets = watchlist, verbose_eval=500)

Training until validation scores don't improve for 200 rounds
[500]	training's l1: 0.210462	valid_0's l1: 0.213757
[1000]	training's l1: 0.203465	valid_0's l1: 0.20783
[1500]	training's l1: 0.199426	valid_0's l1: 0.20478
[2000]	training's l1: 0.196572	valid_0's l1: 0.202868
[2500]	training's l1: 0.194345	valid_0's l1: 0.201554
[3000]	training's l1: 0.192489	valid_0's l1: 0.200457
[3500]	training's l1: 0.190896	valid_0's l1: 0.19961
[4000]	training's l1: 0.189499	valid_0's l1: 0.198897
[4500]	training's l1: 0.188276	valid_0's l1: 0.198369
[5000]	training's l1: 0.187208	valid_0's l1: 0.197873
[5500]	training's l1: 0.18625	valid_0's l1: 0.197477
[6000]	training's l1: 0.185358	valid_0's l1: 0.197125
[6500]	training's l1: 0.184541	valid_0's l1: 0.196769
[7000]	training's l1: 0.18375	valid_0's l1: 0.196454
[7500]	training's l1: 0.183003	valid_0's l1: 0.196189
[8000]	training's l1: 0.182347	valid_0's l1: 0.195963
[8500]	training's l1: 0.18174	valid_0's l1: 0.195729
[9000]	training's l1: 0.181

KeyboardInterrupt: 

In [42]:
Y_pred = reg.predict(X_val.values)

f = np.vectorize(math.exp)
Y_pred = f(Y_pred)
Y_val = f(Y_val.values)
mean_absolute_error(Y_val,Y_pred)

496936.9091695474

In [43]:
# respuesta para kaggle
ids = test_selected.index.values
X_test = test_selected.values
test_predict = reg.predict(X_test)
f = np.vectorize(math.exp)
test_predict = f(test_predict)
escribir_respuesta(ids, test_predict)

# AREA DE TESTING

In [7]:
import lightgbm as lgb
from datetime import datetime

In [8]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mae',
    # Prevenir overfitting:
    'max_bin': 100,
    'num_leaves': 120,
    'min_data_in_leaf':5000,
    'bagging_fraction':0.605,
    'bagging_freq': 3,
    'feature_fraction': 0.7,
    'min_gain_to_split':0.1,
    # Parametros grales:
    'max_depth': 12,
    'learning_rate': 0.05,
    'verbose': 0, 
    'early_stopping_round': 1000}
n_estimators=99999999

In [3]:
train,test = pre_processing.load_featured_datasets()

In [10]:
train['precio'] = train['precio'].map(lambda x: math.log(x))

In [4]:
train_selected = feature_selection.get_selected_dataframe(train)

In [12]:
X = train_selected.drop('precio', axis=1)
Y = train_selected['precio']

X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.1, random_state=seed)

In [13]:
print(datetime.now())

2019-11-27 19:35:24.278748


In [14]:
d_train = lgb.Dataset(X_train.values, label=Y_train.values)
d_valid = lgb.Dataset(X_val.values, label=Y_val.values)
watchlist = [d_valid, d_train]
reg = lgb.train(params, d_train, n_estimators, watchlist, verbose_eval=500)



Training until validation scores don't improve for 1000 rounds
[500]	training's l1: 0.213136	valid_0's l1: 0.214525
[1000]	training's l1: 0.20647	valid_0's l1: 0.208747
[1500]	training's l1: 0.2028	valid_0's l1: 0.205831
[2000]	training's l1: 0.200274	valid_0's l1: 0.203974
[2500]	training's l1: 0.19836	valid_0's l1: 0.202622
[3000]	training's l1: 0.196984	valid_0's l1: 0.201671
[3500]	training's l1: 0.195789	valid_0's l1: 0.200955
[4000]	training's l1: 0.194816	valid_0's l1: 0.200341
[4500]	training's l1: 0.193961	valid_0's l1: 0.199806
[5000]	training's l1: 0.193248	valid_0's l1: 0.1994
[5500]	training's l1: 0.192529	valid_0's l1: 0.199025
[6000]	training's l1: 0.191979	valid_0's l1: 0.19869
[6500]	training's l1: 0.191367	valid_0's l1: 0.19846
[7000]	training's l1: 0.190866	valid_0's l1: 0.198202
[7500]	training's l1: 0.190411	valid_0's l1: 0.197928
[8000]	training's l1: 0.189964	valid_0's l1: 0.197746
[8500]	training's l1: 0.18955	valid_0's l1: 0.197565
[9000]	training's l1: 0.18916

In [15]:
print(datetime.now())

2019-11-27 19:44:47.102832


In [None]:
Y_pred = reg.predict(X_val.values)

f = np.vectorize(math.exp)
Y_pred = f(Y_pred)
Y_val = f(Y_val.values)
mean_absolute_error(Y_val,Y_pred)

In [16]:
Y_pred = reg.predict(X_val.values)

f = np.vectorize(math.exp)
Y_pred = f(Y_pred)
Y_val = f(Y_val.values)
mean_absolute_error(Y_val,Y_pred)

488561.10936908884

In [None]:
Y_pred = reg.predict(X_train.values)

f = np.vectorize(math.exp)
Y_pred = f(Y_pred)
Y_train = f(Y_train.values)
mean_absolute_error(Y_train,Y_pred)

In [17]:
# Mejores resultados hasta la fecha:

In [13]:
Y_pred = reg.predict(X_val.values)

f = np.vectorize(math.exp)
Y_pred = f(Y_pred)
Y_val = f(Y_val.values)
mean_absolute_error(Y_val,Y_pred)

449620.60086473916

In [26]:
# Preparamos respuesta para Kaggle

In [17]:
test_selected = feature_selection.get_selected_dataframe(test, precio=False)

In [18]:
ids = test_selected.index.values
X_test = test_selected.values
test_predict = reg.predict(X_test)
f = np.vectorize(math.exp)
test_predict = f(test_predict)
escribir_respuesta(ids, test_predict)

### Modelo: Regresion lineal

In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

In [6]:
train,test = pre_processing.load_featured_datasets()

In [7]:
train_selected = feature_selection.get_selected_dataframe(train)
test_selected = feature_selection.get_selected_dataframe(test, precio=False)

In [8]:
X = train_selected.drop('precio', axis=1).values
Y = train_selected['precio'].values

In [12]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

In [14]:
X = imp.fit_transform(X)

In [15]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2)

In [16]:
reg = LinearRegression().fit(X_train,Y_train)

In [17]:
Y_predic = reg.predict(X_val)

In [20]:
mean_absolute_error(Y_predic,Y_val)

736737.7129180954

### Modelo: RandomForest

In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

In [7]:
train,test = pre_processing.load_featured_datasets()

In [8]:
train_selected = feature_selection.get_selected_dataframe(train)
test_selected = feature_selection.get_selected_dataframe(test, precio=False)

In [9]:
X = train_selected.drop('precio', axis=1).values
Y = train_selected['precio'].values

In [13]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

In [14]:
X = imp.fit_transform(X)

In [15]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2)

In [17]:
regressor = RandomForestRegressor(n_estimators = 100, random_state = seed, verbose=2, max_depth=10, n_jobs=4) 
regressor.fit(X_train, Y_train)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 100building tree 2 of 100building tree 3 of 100building tree 4 of 100



building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  1.1min


building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78

[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  3.1min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
                      oob_score=False, random_state=42, verbose=2,
                      warm_start=False)

In [18]:
y_pred = regressor.predict(X_val)
mean_absolute_error(Y_val, y_pred)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished


588415.487957101

In [19]:
y_pred2 = regressor.predict(X_train)
mean_absolute_error(Y_train, y_pred2)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.2s finished


557178.1966972738

### Modelo: XGBoost

_Generacion del dataset de train con sus features_

In [20]:
import xgboost

In [21]:
train,test = pre_processing.load_featured_datasets()

In [22]:
train['precio'] = train['precio'].map(lambda x: math.log(x))

In [23]:
train_selected = feature_selection.get_selected_dataframe(train)
test_selected = feature_selection.get_selected_dataframe(test, precio=False)

In [24]:
X = train_selected.drop('precio', axis=1).values
Y = train_selected['precio'].values

In [25]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2)

In [26]:
reg = xgboost.XGBRegressor()

In [27]:
reg = xgboost.XGBRegressor(max_depth=14,n_estimators=140 ,learning_rate=0.1, verbosity=2,subsample=0.9, min_child_weight=15,n_jobs=4)
reg.fit(X_train,Y_train)

[17:46:13] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 54 extra nodes, 0 pruned nodes, max_depth=8
[17:46:13] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 64 extra nodes, 0 pruned nodes, max_depth=8
[17:46:14] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 70 extra nodes, 0 pruned nodes, max_depth=9
[17:46:14] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 74 extra nodes, 0 pruned nodes, max_depth=8
[17:46:15] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 86 extra nodes, 0 pruned nodes, max_depth=8
[17:46:15] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 82 extra nodes, 0 pruned nodes, max_depth=8
[17:46:16] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 102 extra nodes, 0 pruned nodes, max_depth=9
[17:46:16] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 104 extra n

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=14, min_child_weight=15, missing=None, n_estimators=140,
             n_jobs=4, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.9, verbosity=2)

_Comprobacion contra el conjunto de validacion_

In [28]:
Y_pred = reg.predict(X_val)

f = np.vectorize(math.exp)
Y_pred = f(Y_pred)
Y_val = f(Y_val)
mean_absolute_error(Y_val,Y_pred)

475474.8101123654

In [43]:
# preparamos el csv de respuesta para kaggle

In [44]:
ids = test_selected.index.values
X_test = test_selected.values

In [45]:
test_predict = reg.predict(X_test)

f = np.vectorize(math.exp)
test_predict = f(test_predict)

In [46]:
escribir_respuesta(ids, test_predict)

### Modelo: KNN

In [11]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

In [5]:
train,test = pre_processing.load_featured_datasets()

In [6]:
train_selected = feature_selection.get_selected_dataframe(train)
test_selected = feature_selection.get_selected_dataframe(test, precio=False)

In [7]:
X = train_selected.drop('precio', axis=1).values
Y = train_selected['precio'].values

In [12]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

In [13]:
X = imp.fit_transform(X)

In [14]:
sc = MinMaxScaler()

In [15]:
X = sc.fit_transform(X)

In [16]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.1, random_state=seed)

In [17]:
reg = KNeighborsRegressor(n_neighbors=10, algorithm='kd_tree', metric='minkowski', p=2)

In [18]:
reg.fit(X_train,Y_train)

KNeighborsRegressor(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                    weights='uniform')

In [19]:
Y_pred = reg.predict(X_val)

In [20]:
mean_absolute_error(Y_val,Y_pred)

750980.8806958334

### Modelo: Neural Networks

In [63]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

In [57]:
train,test = pre_processing.load_featured_datasets()

In [58]:
train_selected = feature_selection.get_selected_dataframe(train)
test_selected = feature_selection.get_selected_dataframe(test, precio=False)

In [59]:
X = train_selected.drop('precio', axis=1).values
Y = train_selected['precio'].values

In [68]:
## Tratado de nulos y escalado de los datos para la red neuronal

In [60]:
scaler = MinMaxScaler()

In [61]:
X = scaler.fit_transform(X)

In [64]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

In [65]:
X = imp.fit_transform(X)

In [67]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.1, random_state=seed)

In [74]:
reg = MLPRegressor(hidden_layer_sizes=(6,),activation='relu',solver='adam',learning_rate='adaptive',max_iter=1000,
            learning_rate_init=0.01,alpha=0.01, verbose = True)

In [75]:
reg.fit(X_train,Y_train)

Iteration 1, loss = 5479174818011.77636719
Iteration 2, loss = 5314735544892.87109375
Iteration 3, loss = 5038288825323.17285156
Iteration 4, loss = 4690582949306.25097656
Iteration 5, loss = 4298947223866.83154297
Iteration 6, loss = 3885482613340.61230469
Iteration 7, loss = 3469673774375.79101562
Iteration 8, loss = 3070898556496.04150391
Iteration 9, loss = 2706648010594.15332031
Iteration 10, loss = 2393534030732.16455078
Iteration 11, loss = 2143240739350.12890625
Iteration 12, loss = 1961527643323.32617188
Iteration 13, loss = 1843553682394.23071289
Iteration 14, loss = 1771439218489.66186523
Iteration 15, loss = 1721304165548.47656250
Iteration 16, loss = 1677377910150.12377930
Iteration 17, loss = 1634672471379.27929688
Iteration 18, loss = 1592086428120.01684570
Iteration 19, loss = 1549346120525.87890625
Iteration 20, loss = 1506515828888.14965820
Iteration 21, loss = 1463462610477.21337891
Iteration 22, loss = 1420088647230.22167969
Iteration 23, loss = 1376657569487.944580



MLPRegressor(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(6,), learning_rate='adaptive',
             learning_rate_init=0.01, max_iter=1000, momentum=0.9,
             n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
             random_state=None, shuffle=True, solver='adam', tol=0.0001,
             validation_fraction=0.1, verbose=True, warm_start=False)

In [76]:
Y_pred = reg.predict(X_val)

In [77]:
mean_absolute_error(Y_val,Y_pred)

621982.1997982272

# Ensamble entre LightGBM, XGBoost y RandomForest (Blending)

In [17]:
import lightgbm as lgb
import xgboost

In [18]:
train,test = pre_processing.load_featured_datasets()

In [19]:
train['precio'] = train['precio'].map(lambda x: math.log(x))

In [20]:
train_selected = feature_selection.get_selected_dataframe(train)
test_selected = feature_selection.get_selected_dataframe(test, precio=False)

In [21]:
xgb_params = parameter_tuning.get_best_params()['xgboost']
lgb_params = parameter_tuning.get_best_params()['lightgbm']

In [22]:
X = train_selected.drop('precio', axis=1).values
Y = train_selected['precio'].values

In [23]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2)

In [24]:
xgb_params

{'subsample': 0.9,
 'n_estimators': 140,
 'min_child_weight': 15,
 'max_depth': 15,
 'learning_rate': 0.1}

In [25]:
reg_xgb = xgboost.XGBRegressor(max_depth=15,n_estimators=140 ,learning_rate=0.1, verbosity=2,subsample=0.9, min_child_weight=15,n_jobs=4)

In [26]:
reg_xgb.fit(X_train,Y_train)

[12:51:50] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 56 extra nodes, 0 pruned nodes, max_depth=8
[12:51:50] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 66 extra nodes, 0 pruned nodes, max_depth=8
[12:51:51] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 72 extra nodes, 0 pruned nodes, max_depth=7
[12:51:52] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 72 extra nodes, 0 pruned nodes, max_depth=8
[12:51:53] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 88 extra nodes, 0 pruned nodes, max_depth=9
[12:51:54] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 84 extra nodes, 0 pruned nodes, max_depth=9
[12:51:55] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 100 extra nodes, 0 pruned nodes, max_depth=8
[12:51:55] INFO: /workspace/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 104 extra n

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=15, min_child_weight=15, missing=None, n_estimators=140,
             n_jobs=4, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.9, verbosity=2)

In [28]:
Y_pred_xgb_val = reg_xgb.predict(X_val)

In [29]:
Y_pred_xgb_val = np.exp(Y_pred_xgb_val)

In [31]:
X_val_xgbadded = np.hstack((X_val, np.atleast_2d(Y_pred_xgb_val).T)) 

In [33]:
Y_pred_xgb_train = reg_xgb.predict(X_train)

In [34]:
Y_pred_xgb_train = np.exp(Y_pred_xgb_train)

In [35]:
X_train_xgbadded = np.hstack((X_train, np.atleast_2d(Y_pred_xgb_train).T)) 

In [36]:
# Entrenamiento de LightGBM

In [37]:
lgb_params

{'num_leaves': 55,
 'min_gain_to_split': 0.2,
 'min_data_in_leaf': 3000,
 'max_depth': 12,
 'max_bin': 150,
 'feature_fraction': 0.7,
 'bagging_freq': 5,
 'bagging_fraction': 0.75,
 'boosting_type': 'gbdt',
 'objective': 'regression',
 'metric': 'mae',
 'num_boost_round': 5000,
 'verbose': 0,
 'learning_rate': 0.1}

In [38]:
lgb_params['min_gain_to_split'] = 0
lgb_params['learning_rate'] = 0.05

In [41]:
d_train = lgb.Dataset(X_train_xgbadded, label=Y_train)
d_valid = lgb.Dataset(X_val_xgbadded, label=Y_val)
watchlist = [d_valid]
reg_lgb = lgb.train(lgb_params, train_set=d_train, valid_sets=watchlist, verbose_eval=500)



[500]	valid_0's l1: 0.1915
[1000]	valid_0's l1: 0.193023
[1500]	valid_0's l1: 0.193817
[2000]	valid_0's l1: 0.194267
[2500]	valid_0's l1: 0.194575
[3000]	valid_0's l1: 0.194831
[3500]	valid_0's l1: 0.195017
[4000]	valid_0's l1: 0.195187
[4500]	valid_0's l1: 0.195343
[5000]	valid_0's l1: 0.195491


In [42]:
Y_pred = reg_lgb.predict(X_val_xgbadded)

In [43]:
Y_pred = np.exp(Y_pred)

In [44]:
Y_val = np.exp(Y_val)

In [45]:
mean_absolute_error(Y_pred,Y_val)

493694.206134118

In [46]:
Y_pred_train = reg_lgb.predict(X_train_xgbadded)

In [47]:
Y_pred_train = np.exp(Y_pred_train)

In [48]:
mean_absolute_error(Y_pred_train,np.exp(Y_train))

252590.86866978844

# Ensamble entre LightGBM, XGBoost y RandomForest (Blending) v2

In [4]:
import lightgbm as lgb
import xgboost as xgb

In [5]:
train,test = pre_processing.load_featured_datasets()

In [6]:
train['precio'] = train['precio'].map(lambda x: math.log(x))

In [7]:
train_selected = feature_selection.get_selected_dataframe(train)
test_selected = feature_selection.get_selected_dataframe(test, precio=False)

In [8]:
xgb_params = parameter_tuning.get_best_params()['xgboost']
lgb_params = parameter_tuning.get_best_params()['lightgbm']

In [9]:
X = train_selected.drop('precio', axis=1)
Y = train_selected['precio']

In [10]:
# Separamos el set de train en 10-90

In [None]:
X_90, X_10, Y_90, Y_10 = train_test_split(X, Y, test_size=0.1)

In [14]:
# Entrenamos XGBoost

In [13]:
reg_xgb = xgb.XGBRegressor(max_depth=15,n_estimators=140 ,learning_rate=0.1, verbosity=0,subsample=0.9, min_child_weight=15,n_jobs=4)

In [15]:
reg_xgb.fit(X_90.values,Y_90.values)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=15, min_child_weight=15, missing=None, n_estimators=140,
             n_jobs=4, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.9, verbosity=0)

In [16]:
# Entrenamos LightGBM

In [17]:
lgb_params['min_gain_to_split'] = 0
lgb_params['learning_rate'] = 0.05
lgb_params['early_stopping_round'] = 200

In [18]:
X_train, X_val, Y_train, Y_val = train_test_split(X_90, Y_90, test_size=0.2)

In [19]:
d_train = lgb.Dataset(X_train.values, label=Y_train.values)
d_valid = lgb.Dataset(X_val.values, label=Y_val.values)
watchlist = [d_valid, d_train]
reg_lgb = lgb.train(lgb_params, train_set=d_train, valid_sets=watchlist, verbose_eval=500)



Training until validation scores don't improve for 200 rounds
[500]	training's l1: 0.204383	valid_0's l1: 0.20819
[1000]	training's l1: 0.196908	valid_0's l1: 0.202772
[1500]	training's l1: 0.192314	valid_0's l1: 0.199872
[2000]	training's l1: 0.18891	valid_0's l1: 0.1982
[2500]	training's l1: 0.18615	valid_0's l1: 0.197009
[3000]	training's l1: 0.183734	valid_0's l1: 0.19603
[3500]	training's l1: 0.181574	valid_0's l1: 0.195269
[4000]	training's l1: 0.179594	valid_0's l1: 0.194624
[4500]	training's l1: 0.177765	valid_0's l1: 0.194073
[5000]	training's l1: 0.176096	valid_0's l1: 0.193717
Did not meet early stopping. Best iteration is:
[5000]	training's l1: 0.176096	valid_0's l1: 0.193717


In [21]:
# Realizamos las predicciones para el 10 restante...
xgb_pred = reg_xgb.predict(X_10.values) 
lgb_pred = reg_lgb.predict(X_10.values)

In [22]:
# Entrenamos un modelo blender. Se utilizara xgb.

In [31]:
df = Y_10.to_frame()
df['xgb'] = xgb_pred
df['lgb'] = lgb_pred

In [34]:
X = df.drop('precio', axis=1)
Y = df['precio']

In [37]:
blender = xgb.XGBRegressor(max_depth=15,n_estimators=140 ,learning_rate=0.1, verbosity=0,subsample=0.9, min_child_weight=15,n_jobs=4)

In [38]:
blender.fit(X.values,Y.values)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=15, min_child_weight=15, missing=None, n_estimators=140,
             n_jobs=4, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.9, verbosity=0)

In [39]:
# Listo el blender, ahora entrenamos LGB y XGB con train completo.

In [40]:
train_selected = feature_selection.get_selected_dataframe(train)
test_selected = feature_selection.get_selected_dataframe(test, precio=False)

In [41]:
X = train_selected.drop('precio', axis=1)
Y = train_selected['precio']

In [42]:
# xgb
reg_xgb = xgb.XGBRegressor(max_depth=15,n_estimators=140 ,learning_rate=0.1, verbosity=0,subsample=0.9, min_child_weight=15,n_jobs=4)
reg_xgb.fit(X.values,Y.values)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=15, min_child_weight=15, missing=None, n_estimators=140,
             n_jobs=4, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=0.9, verbosity=0)

In [45]:
del lgb_params['early_stopping_round']

In [47]:
# lgb
d_train = lgb.Dataset(X.values, label=Y.values)
reg_lgb = lgb.train(lgb_params, train_set=d_train, verbose_eval=False)

In [48]:
# Realizamos las predicciones para el set de test
xgb_pred = reg_xgb.predict(test_selected.values) 
lgb_pred = reg_lgb.predict(test_selected.values)

In [55]:
# Finalmente, armamos el dataframe con las predicciones y lo mandamos al blender:
df_test = test_selected.copy()
df_test['xgb'] = xgb_pred
df_test['lgb'] = lgb_pred

In [56]:
df_test = df_test[['xgb', 'lgb']]

In [58]:
ids = df_test.index.values
X_test = df_test.values
test_predict = blender.predict(X_test)
f = np.vectorize(math.exp)
test_predict = f(test_predict)
escribir_respuesta(ids, test_predict)