In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool
import random
import gc
import os
import datetime
import pickle
import optuna
import sqlite3

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error

myfavouritenumber = 0
seed = myfavouritenumber
random.seed(seed)

pd.set_option('max_rows', 9999)
pd.set_option('max_columns', 9999)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


Half_Half_LightGBM Training

In [11]:
class Trainer:
    def __init__(self, importance_df, model_type='lgb', use_feature_num=None):
        self.model_type = model_type
        
        if use_feature_num is not None:
            self.features = importance_df['feature'][:feature_num].tolist()
        else:
            self.features = None
            
    def train_half_optuna(self, X_train, y_train, params, num_boost_round, early_stopping_rounds, verbose=200, trial=None):
        
        if self.features is None:
            self.features = X_train.columns
            
        self.features = [c for c in self.features if c not in ['M']]
            
        self.X_train = X_train[self.features]
        self.y_train = y_train
        pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'rmse')
        
        if self.model_type == 'lgb':
            d_half_1 = lgb.Dataset(self.X_train[:int(self.X_train.shape[0] / 2)], 
                                   label=self.y_train[:int(X_train.shape[0] / 2)])
            d_half_2 = lgb.Dataset(self.X_train[int(self.X_train.shape[0] / 2):], 
                                   label=self.y_train[int(X_train.shape[0] / 2):])

            self.model_1 = lgb.train(params, train_set=d_half_1, num_boost_round=num_boost_round, 
                                     valid_sets=[d_half_2], verbose_eval=verbose, 
                                     early_stopping_rounds=early_stopping_rounds, 
                                     callbacks=[pruning_callback])
            
            oof = self.model_1.predict(self.X_train[int(self.X_train.shape[0] / 2):],
                                       num_iteration=self.model_1.best_iteration)
            
            rmse = np.sqrt(mean_squared_error(self.y_train[int(X_train.shape[0] / 2):], oof))
            
        return rmse

In [12]:
# config
train_pkl_path = '../input/prep_train_20191118.pkl'
test_pkl_path = '../input/prep_test_20191118.pkl'
importance_path = '../Importance/importance_20191118.csv'
model_type = 'lgb'

# Create SQLite Table
con = sqlite3.connect('ashrae_lgb.db')

def objective(trial):
    # Search Parameter Range
    set_lgb_params = {
        'objective': 'regression',
        'boosting_type': 'gbrt',
        'metric': 'rmse',
        'n_jobs': -1,
        'learning_rate': 0.01,
        'max_bin': 255,
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'num_leaves': trial.suggest_int('num_leaves', 10, 200),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 0.9),
        'subsample': trial.suggest_uniform('subsample', 0.6, 0.9),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 50),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3, 0.9),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 0.9),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 20),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-2, 1e+3),
        'verbose': -1,
        'seed': 42
    }
    
    num_boost_round = 500
    early_stopping_rounds = 50
    num_feature = None
    verbose = False
    
    # Train Model
    trainer = Trainer(importance_df=importance_df, model_type=model_type, use_feature_num=num_feature)
    rmse = trainer.train_half_optuna(train[0], train[1], set_lgb_params, num_boost_round, 
                                       early_stopping_rounds, verbose, trial)

    return rmse

In [None]:
# Training
# Data Loading
with open(train_pkl_path, 'rb') as f:
    train = pickle.load(f)

importance_df = pd.read_csv(importance_path)

pruner = optuna.pruners.SuccessiveHalvingPruner(min_resource=500)
study = optuna.create_study(
    study_name='ashrae_lgb',
    storage='sqlite:///ashrae_lgb.db',
    load_if_exists=True,
    direction='minimize',
    pruner=pruner
)

# Reload Intermediate state
# study = optuna.load_study(
#     study_name='ashrae_lgb',
#     storage='sqlite:///ashrae_lgb.db',
#     pruner=pruner
# )

study.optimize(objective, timeout=60*60*8)

[I 2019-11-18 15:12:36,641] Using an existing study with name 'ashrae_lgb' instead of creating a new one.
[I 2019-11-18 15:25:14,611] Finished trial#7 resulted in value: 1.358166369237579. Current best value is 1.358166369237579 with parameters: {'bagging_fraction': 0.679651512314768, 'bagging_freq': 5, 'colsample_bytree': 0.6470667583544704, 'feature_fraction': 0.8076645109502967, 'max_depth': 5, 'min_data_in_leaf': 33, 'num_leaves': 20, 'reg_lambda': 6.627668083305644, 'subsample': 0.8367722515913292}.
[I 2019-11-18 15:38:15,949] Finished trial#8 resulted in value: 1.3719347956545194. Current best value is 1.358166369237579 with parameters: {'bagging_fraction': 0.679651512314768, 'bagging_freq': 5, 'colsample_bytree': 0.6470667583544704, 'feature_fraction': 0.8076645109502967, 'max_depth': 5, 'min_data_in_leaf': 33, 'num_leaves': 20, 'reg_lambda': 6.627668083305644, 'subsample': 0.8367722515913292}.
[I 2019-11-18 15:49:59,609] Finished trial#9 resulted in value: 1.3268459956873593. C