In [None]:
import os
import numpy as np
import pandas as pd
import sys 

import skopt
from skopt import gp_minimize, forest_minimize, BayesSearchCV
from skopt.space import Real, Integer, Categorical
from skopt.plots import plot_convergence
from skopt.utils import use_named_args
from skopt import dump, load

import lightgbm as lgb
from lightgbm import LGBMRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_squared_log_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler


Reference: [Bayesian optimization of XGBoost (Kaggle kernel by nanomathias)](https://www.kaggle.com/nanomathias/bayesian-optimization-of-xgboost-lb-0-9769)

In [None]:
# Column groups; both lists still hold a single 'TODO' placeholder entry.
cat_cols = ["TODO"]
num_cols = ["TODO"]

# Iteration budget handed to the hyperparameter search as `n_iter`.
# The trailing 1000 is presumably the value for a full-length run -- confirm.
ITERATIONS = 100  # 1000

In [None]:
# Read the four pre-processed CSV files, in this order, into DataFrames.
# Note: the *_valid frames are read from the "*_test_pre_simple.csv" files.
X_train, X_valid, y_train, y_valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/X_train_pre_simple.csv',
        'data/X_test_pre_simple.csv',
        'data/y_train_pre_simple.csv',
        'data/y_test_pre_simple.csv',
    )
)

In [None]:
def status_print(optim_result):
    """Status callback during Bayesian hyperparameter search.

    Prints the number of models tested so far together with the best score
    and the best parameters currently reported by the tuner.

    Parameters
    ----------
    optim_result : object
        Value handed to the callback by the search. It is not used here.

    Returns
    -------
    None

    Notes
    -----
    All progress information is read from the module-level name
    ``bayes_cv_tuner`` (its ``cv_results_``, ``best_score_`` and
    ``best_params_`` attributes), so that name must be bound before the
    search starts invoking this callback.
    NOTE(review): confirm that the installed scikit-optimize version exposes
    these attributes while the fit is still in progress.
    """
    # All models tested so far, in DataFrame format.
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)

    # Report progress: model count, best score (rounded to 4 decimals) and
    # the best parameter set.
    print('Model #{}\nBest neg. MAE: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))

    # Saving all model results is currently disabled. To enable it:
    #   clf_name = bayes_cv_tuner.estimator.__class__.__name__
    #   all_models.to_csv(clf_name + "_cv_results.csv")

In [None]:
# Bayesian hyperparameter search over an LGBMRegressor.
# The search object must keep the name `bayes_cv_tuner`: the `status_print`
# callback looks it up as a module-level name.
bayes_cv_tuner = BayesSearchCV(
    estimator=LGBMRegressor(objective='regression_l1', n_jobs=-1, verbose=0),
    # Tuples are (low, high[, prior]) ranges; the list is a set of choices.
    search_spaces=dict(
        boosting_type=['gbdt', 'dart'],
        learning_rate=(0.001, 1.0, 'log-uniform'),
        num_leaves=(5, 200),
        max_depth=(-1, 50),
        n_estimators=(10, 1000),
        subsample_for_bin=(50000, 200000),
        reg_alpha=(1e-9, 1.0, 'log-uniform'),
        reg_lambda=(1e-9, 1.0, 'log-uniform'),
    ),
    scoring='neg_mean_absolute_error',
    # Shuffled 3-fold split with a fixed seed.
    cv=KFold(n_splits=3, shuffle=True, random_state=42),
    n_iter=ITERATIONS,
    n_jobs=3,
    refit=True,
    verbose=0,
    random_state=42,
)

# Run the search, passing `status_print` as the per-step callback.
result = bayes_cv_tuner.fit(X_train, y_train, callback=status_print)