In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
import random
import gc
import os
import datetime
import pickle
import optuna
import sqlite3

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

myfavouritenumber = 0
seed = myfavouritenumber
random.seed(seed)

pd.set_option('max_rows', 9999)
pd.set_option('max_columns', 9999)

Half_Half_LightGBM Training

In [2]:
class Trainer:
    def __init__(self, importance_df, model_type='lgb', use_feature_num=None):
        self.model_type = model_type
        
        if use_feature_num is not None:
            self.features = importance_df['feature'][:use_feature_num].tolist()
        else:
            self.features = None
            
    def train_half_optuna(self, X_train, y_train, params, num_boost_round, early_stopping_rounds, verbose=200, trial=None):
        
        if self.features is None:
            self.features = X_train.columns
            
        self.features = [c for c in self.features if c not in ['M']]
            
        self.X_train = X_train[self.features]
        self.y_train = y_train
        pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'rmse')
        
        if self.model_type == 'lgb':
            d_half_1 = lgb.Dataset(self.X_train[:int(self.X_train.shape[0] / 2)], 
                                   label=self.y_train[:int(X_train.shape[0] / 2)])
            d_half_2 = lgb.Dataset(self.X_train[int(self.X_train.shape[0] / 2):], 
                                   label=self.y_train[int(X_train.shape[0] / 2):])

            self.model_1 = lgb.train(params, train_set=d_half_1, num_boost_round=num_boost_round, 
                                     valid_sets=[d_half_2], verbose_eval=verbose, 
                                     early_stopping_rounds=early_stopping_rounds, 
                                     callbacks=[pruning_callback])
            
            oof = self.model_1.predict(self.X_train[int(self.X_train.shape[0] / 2):],
                                       num_iteration=self.model_1.best_iteration)
            oof = np.clip(oof, 0, a_max=None)
            
            rmse = np.sqrt(mean_squared_error(self.y_train[int(X_train.shape[0] / 2):], oof))
            
        return rmse

In [7]:
# config
train_pkl_path = '../input/prep_train_20191202_list.pkl'
importance_path = '../Importance/importance_20191202.csv'
model_type = 'lgb'

# Create SQLite Table
con = sqlite3.connect('ashrae_lgb_3.db')

def objective(trial):
    # Search Parameter Range
    set_lgb_params = {
        'objective': 'regression',
        'boosting_type': 'goss',
        'metric': 'rmse',
        'n_jobs': -1,
        'learning_rate': 0.01,
        'max_bin': 255,
        'max_depth': trial.suggest_int('max_depth', 10, 20),
        'num_leaves': trial.suggest_int('num_leaves', 100, 300),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 0.9),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 50),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3, 0.9),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-2, 1e+3),
        'verbose': -1,
        'seed': 42
    }
    
    num_boost_round = 5000
    early_stopping_rounds = 200
    num_feature = 40
    verbose = False
    
    # Train Model
    trainer = Trainer(importance_df=importance_df, model_type=model_type, use_feature_num=num_feature)
    rmse = trainer.train_half_optuna(train[0], train[1], set_lgb_params, num_boost_round, 
                                       early_stopping_rounds, verbose, trial)

    return rmse

In [8]:
# Training
# Data Loading
with open(train_pkl_path, 'rb') as f:
    train = pickle.load(f)

importance_df = pd.read_csv(importance_path)

pruner = optuna.pruners.SuccessiveHalvingPruner(min_resource=50)

# New Study
study = optuna.create_study(
    study_name='ashrae_lgb_3',
    storage='sqlite:///ashrae_lgb_3.db',
    load_if_exists=True,
    direction='minimize',
    pruner=pruner
)

# Reload Intermediate state
# study = optuna.load_study(
#     study_name='ashrae_lgb',
#     storage='sqlite:///ashrae_lgb.db',
#     pruner=pruner
# )

study.optimize(objective, timeout=60*60*48)

[I 2019-12-05 07:28:24,418] Using an existing study with name 'ashrae_lgb_3' instead of creating a new one.
[I 2019-12-05 07:32:52,243] Finished trial#1 resulted in value: 1.5094749968513392. Current best value is 1.5094749968513392 with parameters: {'colsample_bytree': 0.8538078457016902, 'feature_fraction': 0.6828417210061907, 'max_depth': 10, 'min_data_in_leaf': 40, 'num_leaves': 275, 'reg_lambda': 108.33570081783267, 'subsample': 0.8186750756290126}.
[I 2019-12-05 07:37:26,371] Finished trial#2 resulted in value: 1.468674766541497. Current best value is 1.468674766541497 with parameters: {'colsample_bytree': 0.7480508566998729, 'feature_fraction': 0.8152382909987603, 'max_depth': 14, 'min_data_in_leaf': 45, 'num_leaves': 132, 'reg_lambda': 110.29781842683936, 'subsample': 0.8773961653276434}.
[I 2019-12-05 07:38:33,075] Setting status of trial#3 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-12-05 07:42:27,094] Setting status of trial#4 as TrialState.PRUNED. Trial 

[I 2019-12-05 09:38:48,862] Finished trial#56 resulted in value: 1.3914035038283732. Current best value is 1.3846552760882818 with parameters: {'colsample_bytree': 0.6446820363068849, 'feature_fraction': 0.8995423930082788, 'max_depth': 20, 'min_data_in_leaf': 11, 'num_leaves': 257, 'reg_lambda': 866.825333775333, 'subsample': 0.663255757899185}.
[I 2019-12-05 09:42:38,436] Setting status of trial#57 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-12-05 09:44:11,811] Setting status of trial#58 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-12-05 09:48:10,697] Setting status of trial#59 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-12-05 09:51:54,039] Setting status of trial#60 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-12-05 09:57:39,046] Finished trial#61 resulted in value: 1.3880935867756143. Current best value is 1.3846552760882818 with parameters: {'colsample_bytree': 0.6446820363068849, 'feature_fraction': 

[I 2019-12-05 12:29:18,881] Setting status of trial#114 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-12-05 12:33:16,735] Setting status of trial#115 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-12-05 12:37:19,811] Setting status of trial#116 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-12-05 12:41:29,993] Setting status of trial#117 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-12-05 12:43:05,912] Setting status of trial#118 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-12-05 12:44:41,095] Setting status of trial#119 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-12-05 12:46:18,923] Setting status of trial#120 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-12-05 12:50:30,257] Setting status of trial#121 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-12-05 12:54:41,997] Setting status of trial#122 as TrialState.PRUNED. Trial was pruned at i

[I 2019-12-05 15:37:24,745] Setting status of trial#173 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-12-05 15:39:01,456] Setting status of trial#174 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-12-05 15:43:14,782] Setting status of trial#175 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-12-05 15:47:34,543] Setting status of trial#176 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-12-05 15:49:09,799] Setting status of trial#177 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-12-05 15:53:10,652] Setting status of trial#178 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-12-05 15:54:49,092] Setting status of trial#179 as TrialState.PRUNED. Trial was pruned at iteration 50.
[I 2019-12-05 16:01:05,993] Finished trial#180 resulted in value: 1.3745741661542878. Current best value is 1.3702855248471535 with parameters: {'colsample_bytree': 0.6942838936080526, 'feature_fraction': 0.89899

[I 2019-12-05 18:47:40,804] Finished trial#217 resulted in value: 1.3710459923977658. Current best value is 1.3702855248471535 with parameters: {'colsample_bytree': 0.6942838936080526, 'feature_fraction': 0.8989966937060903, 'max_depth': 20, 'min_data_in_leaf': 36, 'num_leaves': 275, 'reg_lambda': 0.31129758193713547, 'subsample': 0.6958677000718564}.
[I 2019-12-05 18:51:50,152] Setting status of trial#218 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-12-05 18:55:55,572] Setting status of trial#219 as TrialState.PRUNED. Trial was pruned at iteration 200.
[I 2019-12-05 19:02:02,455] Finished trial#220 resulted in value: 1.3771169207457548. Current best value is 1.3702855248471535 with parameters: {'colsample_bytree': 0.6942838936080526, 'feature_fraction': 0.8989966937060903, 'max_depth': 20, 'min_data_in_leaf': 36, 'num_leaves': 275, 'reg_lambda': 0.31129758193713547, 'subsample': 0.6958677000718564}.
[I 2019-12-05 19:06:07,122] Setting status of trial#221 as TrialSt

KeyboardInterrupt: 