In [4]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

seed = 42

In [5]:
import nbimporter

import pre_processing
import feature_generation
import feature_selection
import predict

# Parameter Tuning para LightGBM
Utilizaremos learning rate de 0.1 y 5000 estimadores, luego esto sera escalado a mas estimadores con mejores learning rates, pero no deberia variar el resultado.

In [6]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [7]:
train,test = pre_processing.load_featured_datasets()

In [8]:
#train['precio'] = train['precio'].map(lambda x: math.log(x))

In [9]:
train_selected = feature_selection.get_selected_dataframe(train)
#test_selected = feature_selection.get_selected_dataframe(test, precio=False)

In [10]:
X = train_selected.drop('precio', axis=1).values
Y = train_selected['precio'].values

In [11]:
param_grid = {
    'num_leaves': [55, 60, 65],
    'max_depth': [8,10,12],
    'min_gain_to_split':[0.1, 0.2], 
    'max_bin':[50, 100, 150],
    'min_data_in_leaf':[3000, 5000, 7000],
    'bagging_freq':[4,5,6],
    'bagging_fraction':[0.65, 0.7, 0.75],
    'feature_fraction':[0.7]
}

In [12]:
reg = lgb.LGBMRegressor(boosting_type='gbdt',  objective='regression', metric='mae', num_boost_round=5000,
                       verbose=0, learning_rate=0.1)

In [13]:
#gridsearch = GridSearchCV(reg, param_grid, cv=4, scoring = 'neg_mean_absolute_error')
gridsearch = RandomizedSearchCV(reg, param_grid, n_iter=120, n_jobs=3, cv=4, scoring = 'neg_mean_absolute_error')

In [14]:
%%time
gridsearch.fit(X,Y)



CPU times: user 11min 24s, sys: 1.72 s, total: 11min 25s
Wall time: 16h 57min 11s


RandomizedSearchCV(cv=4, error_score='raise-deprecating',
                   estimator=LGBMRegressor(boosting_type='gbdt',
                                           class_weight=None,
                                           colsample_bytree=1.0,
                                           importance_type='split',
                                           learning_rage=0.1, learning_rate=0.1,
                                           max_depth=-1, metric='mae',
                                           min_child_samples=20,
                                           min_child_weight=0.001,
                                           min_split_gain=0.0, n_estimators=100,
                                           n_jobs=-1, num_boost_round=5000,
                                           num_leaves=31...
                   param_distributions={'bagging_fraction': [0.65, 0.7, 0.75],
                                        'bagging_freq': [4, 5, 6],
                                   

In [15]:
gridsearch.best_score_

-494138.8445571253

In [16]:
gridsearch.best_params_

{'num_leaves': 55,
 'min_gain_to_split': 0.2,
 'min_data_in_leaf': 3000,
 'max_depth': 12,
 'max_bin': 150,
 'feature_fraction': 0.7,
 'bagging_freq': 5,
 'bagging_fraction': 0.75}

# Resultados obtenidos...

**Detalles de la prueba:**
- Metodo utilizado: RandomizedGridSearch.
- n_iterations: 40.
- Parametros a probar:

```
param_grid = {
    # Prevenir overfitting:
    'num_leaves': [60, 80, 100, 120],
    'max_depth': [5,10,15],
    'min_gain_to_split':[0.1], 
    'max_bin':[100],
    'min_data_in_leaf':[5000],
    'bagging_freq':[3, 5],
    'bagging_fraction':[0.5, 0.605, 0.7],
    'feature_fraction':[0.7]
}
```

**Resultados (4h 35min)**:
```
{'num_leaves': 60,
 'min_gain_to_split': 0.1,
 'min_data_in_leaf': 5000,
 'max_depth': 10,
 'max_bin': 100,
 'feature_fraction': 0.7,
 'bagging_freq': 5,
 'bagging_fraction': 0.7}
 
MAE = 502020k
 ```
 
<hr>

**Detalles de la prueba:**
- Metodo utilizado: RandomizedGridSearch.
- n_iterations: 120.
- Parametros a probar:

```
param_grid = {
    'num_leaves': [55, 60, 65],
    'max_depth': [8,10,12],
    'min_gain_to_split':[0.1, 0.2], 
    'max_bin':[50, 100, 150],
    'min_data_in_leaf':[3000, 5000, 7000],
    'bagging_freq':[4,5,6],
    'bagging_fraction':[0.65, 0.7, 0.75],
    'feature_fraction':[0.7]
}
```

**Resultados (16h 57min)**:
```
{'num_leaves': 55,
 'min_gain_to_split': 0.2,
 'min_data_in_leaf': 3000,
 'max_depth': 12,
 'max_bin': 150,
 'feature_fraction': 0.7,
 'bagging_freq': 5,
 'bagging_fraction': 0.75}
 
MAE = 494138k
 ```

In [1]:
def get_best_params_lgb():
    return {'num_leaves': 55,
            'min_gain_to_split': 0.2,
            'min_data_in_leaf': 3000,
            'max_depth': 12,
            'max_bin': 150,
            'feature_fraction': 0.7,
            'bagging_freq': 5,
            'bagging_fraction': 0.75,
            # Parametros que no fueron optimizados:
            'boosting_type':'gbdt',
            'objective':'regression',
            'metric':'mae',
            'num_boost_round':5000,
            'verbose':0,
            'learning_rate':0.1
           }