In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
import random
import gc
import os
import datetime
import pickle
import optuna
import sqlite3

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

myfavouritenumber = 0
seed = myfavouritenumber
random.seed(seed)

pd.set_option('max_rows', 9999)
pd.set_option('max_columns', 9999)

Half_Half_LightGBM Training

In [2]:
class Trainer:
    def __init__(self, importance_df, model_type='lgb', use_feature_num=None):
        self.model_type = model_type
        
        if use_feature_num is not None:
            self.features = importance_df['feature'][:use_feature_num].tolist()
        else:
            self.features = None
            
    def train_half_optuna(self, X_train, y_train, params, num_boost_round, early_stopping_rounds, verbose=200, trial=None):
        
        if self.features is None:
            self.features = X_train.columns
            
        self.features = [c for c in self.features if c not in ['M']]
            
        self.X_train = X_train[self.features]
        self.y_train = y_train
        pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'rmse')
        
        if self.model_type == 'lgb':
            d_half_1 = lgb.Dataset(self.X_train[:int(self.X_train.shape[0] / 2)], 
                                   label=self.y_train[:int(X_train.shape[0] / 2)])
            d_half_2 = lgb.Dataset(self.X_train[int(self.X_train.shape[0] / 2):], 
                                   label=self.y_train[int(X_train.shape[0] / 2):])

            self.model_1 = lgb.train(params, train_set=d_half_1, num_boost_round=num_boost_round, 
                                     valid_sets=[d_half_2], verbose_eval=verbose, 
                                     early_stopping_rounds=early_stopping_rounds, 
                                     callbacks=[pruning_callback])
            
            oof = self.model_1.predict(self.X_train[int(self.X_train.shape[0] / 2):],
                                       num_iteration=self.model_1.best_iteration)
            oof = np.clip(oof, 0, a_max=None)
            
            rmse = np.sqrt(mean_squared_error(self.y_train[int(X_train.shape[0] / 2):], oof))
            
        return rmse

In [3]:
# config
train_pkl_path = '../input/prep_train_20191118.pkl'
test_pkl_path = '../input/prep_test_20191118.pkl'
importance_path = '../Importance/importance_20191118.csv'
model_type = 'lgb'

# Create SQLite Table
con = sqlite3.connect('ashrae_lgb.db')

def objective(trial):
    # Search Parameter Range
    set_lgb_params = {
        'objective': 'regression',
        'boosting_type': 'gbrt',
        'metric': 'rmse',
        'n_jobs': -1,
        'learning_rate': 0.01,
        'max_bin': 255,
        'max_depth': trial.suggest_int('max_depth', 6, 10),
        'num_leaves': trial.suggest_int('num_leaves', 100, 300),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 0.9),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 50),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3, 0.9),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 0.9),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 20),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-2, 1e+3),
        'verbose': -1,
        'seed': 42
    }
    
    num_boost_round = 1000
    early_stopping_rounds = 50
    num_feature = 30
    verbose = False
    
    # Train Model
    trainer = Trainer(importance_df=importance_df, model_type=model_type, use_feature_num=num_feature)
    rmse = trainer.train_half_optuna(train[0], train[1], set_lgb_params, num_boost_round, 
                                       early_stopping_rounds, verbose, trial)

    return rmse

In [4]:
# Training
# Data Loading
with open(train_pkl_path, 'rb') as f:
    train = pickle.load(f)

importance_df = pd.read_csv(importance_path)

pruner = optuna.pruners.SuccessiveHalvingPruner(min_resource=50)

# New Study
# study = optuna.create_study(
#     study_name='ashrae_lgb',
#     storage='sqlite:///ashrae_lgb.db',
#     load_if_exists=True,
#     direction='minimize',
#     pruner=pruner
# )

# Reload Intermediate state
study = optuna.load_study(
    study_name='ashrae_lgb',
    storage='sqlite:///ashrae_lgb.db',
    pruner=pruner
)

study.optimize(objective, timeout=60*60*30)

[I 2019-11-23 08:38:26,751] Setting status of trial#132 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-11-23 08:57:05,598] Finished trial#133 resulted in value: 1.1386632947842301. Current best value is 1.1378703133301726 with parameters: {'bagging_fraction': 0.42797307400289963, 'bagging_freq': 6, 'colsample_bytree': 0.8659940184256816, 'feature_fraction': 0.852216419174993, 'max_depth': 10, 'min_data_in_leaf': 47, 'num_leaves': 242, 'reg_lambda': 137.8767751523373, 'subsample': 0.8922113062563624}.
[I 2019-11-23 08:59:38,894] Setting status of trial#134 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-11-23 09:00:33,125] Setting status of trial#135 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-23 09:18:40,141] Finished trial#136 resulted in value: 1.140466381979324. Current best value is 1.1378703133301726 with parameters: {'bagging_fraction': 0.42797307400289963, 'bagging_freq': 6, 'colsample_bytree': 0.8659940184256816, 'featur

[I 2019-11-23 13:35:25,548] Setting status of trial#167 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-11-23 13:52:35,058] Finished trial#168 resulted in value: 1.1396821146632943. Current best value is 1.1378703133301726 with parameters: {'bagging_fraction': 0.42797307400289963, 'bagging_freq': 6, 'colsample_bytree': 0.8659940184256816, 'feature_fraction': 0.852216419174993, 'max_depth': 10, 'min_data_in_leaf': 47, 'num_leaves': 242, 'reg_lambda': 137.8767751523373, 'subsample': 0.8922113062563624}.
[I 2019-11-23 13:53:18,790] Setting status of trial#169 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-23 14:10:28,217] Finished trial#170 resulted in value: 1.1411210417604087. Current best value is 1.1378703133301726 with parameters: {'bagging_fraction': 0.42797307400289963, 'bagging_freq': 6, 'colsample_bytree': 0.8659940184256816, 'feature_fraction': 0.852216419174993, 'max_depth': 10, 'min_data_in_leaf': 47, 'num_leaves': 242, 'reg_lambda': 137.87

[I 2019-11-23 18:48:17,566] Finished trial#210 resulted in value: 1.1400850698452425. Current best value is 1.1378703133301726 with parameters: {'bagging_fraction': 0.42797307400289963, 'bagging_freq': 6, 'colsample_bytree': 0.8659940184256816, 'feature_fraction': 0.852216419174993, 'max_depth': 10, 'min_data_in_leaf': 47, 'num_leaves': 242, 'reg_lambda': 137.8767751523373, 'subsample': 0.8922113062563624}.
[I 2019-11-23 18:49:05,893] Setting status of trial#211 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-23 18:58:55,590] Setting status of trial#212 as TrialState.PRUNED. Trial was pruned at iteration 800.
[I 2019-11-23 18:59:48,789] Setting status of trial#213 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-23 19:00:36,702] Setting status of trial#214 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-23 19:20:02,389] Finished trial#215 resulted in value: 1.139130884263791. Current best value is 1.1378703133301726 with parameter

[I 2019-11-23 23:04:30,169] Setting status of trial#264 as TrialState.PRUNED. Trial was pruned at iteration 800.
[I 2019-11-23 23:07:03,030] Setting status of trial#265 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-11-23 23:07:50,609] Setting status of trial#266 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-23 23:16:22,163] Setting status of trial#267 as TrialState.PRUNED. Trial was pruned at iteration 800.
[I 2019-11-23 23:17:13,339] Setting status of trial#268 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-23 23:20:05,455] Setting status of trial#269 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-11-23 23:38:53,473] Finished trial#270 resulted in value: 1.1400374884367568. Current best value is 1.1378703133301726 with parameters: {'bagging_fraction': 0.42797307400289963, 'bagging_freq': 6, 'colsample_bytree': 0.8659940184256816, 'feature_fraction': 0.852216419174993, 'max_depth': 10, 'min_data_in_leaf': 47,

[I 2019-11-24 02:46:02,688] Setting status of trial#323 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 02:48:30,767] Setting status of trial#324 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-11-24 02:57:48,076] Setting status of trial#325 as TrialState.PRUNED. Trial was pruned at iteration 800.
[I 2019-11-24 03:00:47,984] Setting status of trial#326 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-11-24 03:19:53,754] Finished trial#327 resulted in value: 1.139339272544123. Current best value is 1.1378522421159347 with parameters: {'bagging_fraction': 0.42523401307460185, 'bagging_freq': 6, 'colsample_bytree': 0.7187743617504782, 'feature_fraction': 0.8822412268247286, 'max_depth': 10, 'min_data_in_leaf': 42, 'num_leaves': 279, 'reg_lambda': 114.8060332041216, 'subsample': 0.8631504025541011}.
[I 2019-11-24 03:38:46,157] Finished trial#328 resulted in value: 1.138590443108138. Current best value is 1.1378522421159347 with paramet

[I 2019-11-24 06:02:03,839] Setting status of trial#390 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 06:10:41,236] Setting status of trial#391 as TrialState.PRUNED. Trial was pruned at iteration 800.
[I 2019-11-24 06:11:37,353] Setting status of trial#392 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 06:12:33,292] Setting status of trial#393 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 06:13:28,362] Setting status of trial#394 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 06:14:23,780] Setting status of trial#395 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 06:15:15,912] Setting status of trial#396 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 06:16:08,276] Setting status of trial#397 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 06:19:01,973] Setting status of trial#398 as TrialState.PRUNED. Trial was pruned at itera

[I 2019-11-24 09:03:37,559] Setting status of trial#457 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-11-24 09:06:24,858] Setting status of trial#458 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-11-24 09:26:01,812] Finished trial#459 resulted in value: 1.1399541470220431. Current best value is 1.1378522421159347 with parameters: {'bagging_fraction': 0.42523401307460185, 'bagging_freq': 6, 'colsample_bytree': 0.7187743617504782, 'feature_fraction': 0.8822412268247286, 'max_depth': 10, 'min_data_in_leaf': 42, 'num_leaves': 279, 'reg_lambda': 114.8060332041216, 'subsample': 0.8631504025541011}.
[I 2019-11-24 09:26:56,064] Setting status of trial#460 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 09:29:55,853] Setting status of trial#461 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-11-24 09:30:51,227] Setting status of trial#462 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 09:40:22,71

[I 2019-11-24 11:29:40,271] Setting status of trial#527 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 11:30:33,980] Setting status of trial#528 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 11:31:25,133] Setting status of trial#529 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 11:32:20,290] Setting status of trial#530 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 11:42:12,117] Setting status of trial#531 as TrialState.PRUNED. Trial was pruned at iteration 800.
[I 2019-11-24 11:44:59,000] Setting status of trial#532 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-11-24 11:45:54,458] Setting status of trial#533 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 11:46:47,590] Setting status of trial#534 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-11-24 11:47:42,740] Setting status of trial#535 as TrialState.PRUNED. Trial was pruned at iter