# LightGBMモデルの作成

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import lightgbm as lgb

EDAで作成したデータファイル（``data_7.csv``）から特徴量を読み込む

In [2]:
df_7 = pd.read_csv('data_7.csv')
df_7.head()

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2,IC_01,IC_07,IC_05,IC_16,...,CBN(13)_vs_DMN(94),CBN(18)_vs_DMN(94),CBN(4)_vs_DMN(94),CBN(7)_vs_DMN(94),CBN(18)_vs_CBN(13),CBN(4)_vs_CBN(13),CBN(7)_vs_CBN(13),CBN(4)_vs_CBN(18),CBN(7)_vs_CBN(18),CBN(7)_vs_CBN(4)
0,10001,57.436077,30.571975,62.553736,53.32513,51.427998,0.00607,0.014466,0.004136,0.000658,...,-0.203459,0.630524,0.227728,0.407087,0.450352,-0.074469,0.151244,-0.105504,0.626158,0.352528
1,10002,59.580851,50.969456,67.470628,60.651856,58.311361,0.009087,0.009291,0.007049,-0.002076,...,-0.306443,-0.047546,0.26053,-0.207901,0.595487,0.809261,0.648386,0.05219,0.772152,0.271778
2,10004,71.413018,53.152498,58.012103,52.418389,62.536641,0.004675,0.000957,0.006154,-0.000429,...,-0.276111,0.39304,0.370413,0.04279,0.437297,0.560578,0.440411,0.032559,0.923573,0.112833
3,10005,66.53263,51.847306,60.052535,52.108977,69.993075,-0.000398,0.006878,0.009051,0.000369,...,-0.248033,0.522701,0.10787,0.605457,0.660222,0.7115,0.321404,0.077306,1.06345,0.124246
4,10007,38.617381,49.197021,65.674285,40.151376,34.096421,0.005192,0.010585,0.01216,-0.00092,...,-0.178258,0.387695,0.166441,0.176302,0.504423,0.821638,0.517638,0.372172,0.787271,0.328486


後々、スクラッチで作成したStackingクラスでも使用できるようにする必要がある。
sklearnのメソッドを想定して上記を作成しているため、lightGBMのskleanラッパーを用いてモデル作成する。  
  
ハイパーパラメーターは、別にBayesian Optimizationで求めた値を使う。  
  
K-foldライブラリを使用し、3通りのtrain/validationの分割で学習する。

In [7]:
# sklearnのラッパーを使用
model_1 = lgb.LGBMRegressor(boosting_type='gbdt',
                            objective='regression',
                            random_seed=0,
                            lambda_l1=0.05458084891512328,
                            lambda_l2=0.004051458495736978,
                            num_leaves=6,
                            feature_fraction=0.652,
                            bagging_fraction=0.6265650130525431,
                            min_child_samples=20)
model_2 = lgb.LGBMRegressor(boosting_type='gbdt', 
                            objective='regression',
                            random_seed=0, 
                            lambda_l1=0.005081093287901702,
                            lambda_l2=6.575224879606878,
                            num_leaves=4,
                            feature_fraction=0.5,
                            bagging_fraction=0.9933635801530803,
                            min_child_samples=25)
model_3 = lgb.LGBMRegressor(boosting_type='gbdt', 
                            objective='regression',
                            random_seed=0,
                            lambda_l1=1.477911772044783e-07,
                            lambda_l2=0.8187371385937187,
                            num_leaves=31,
                            feature_fraction=0.6,
                            bagging_fraction=1.0,
                            min_child_samples=100)
model_4 = lgb.LGBMRegressor(boosting_type='gbdt', 
                            objective='regression',
                            random_seed=0,
                            lambda_l1=0.001963368031457678,
                            lambda_l2=0.015560943368075215,
                            num_leaves=31,
                            feature_fraction=0.6,
                            bagging_fraction=1.0,
                            min_child_samples=20)
model_5 = lgb.LGBMRegressor(boosting_type='gbdt',
                            objective='regression',
                            random_seed=0,
                            lambda_l1=9.71697193524586,
                            lambda_l2=8.45148735600162,
                            num_leaves=3,
                            feature_fraction=0.92,
                            bagging_fraction=0.6221063497072915,
                            min_child_samples=100)
models = [model_1, model_2, model_3, model_4, model_5]

X = df_7.iloc[:, 6:].values
y = df_7.iloc[:, 1:6].values
preds = np.empty_like(y)
targets = df_7.columns[1:6]

# 3-fold
# 各変数毎に学習・推論
for i, target in enumerate(targets):
    print('==< {} >=='.format(target))
    y_cv = y[:, i]
    kfold = KFold(n_splits=3).split(X, y_cv)
    for train_index, test_index in kfold:
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y_cv[train_index], y_cv[test_index]
        
        models[i].fit(X_train, y_train,
                      eval_set=[(X_val, y_val)],
                      eval_metric='mae',
                      early_stopping_rounds=10,
                      verbose=True)
        preds[:, i] = models[i].predict(X)
        
scores = np.sum(np.abs(y - preds), axis=0) / np.sum(preds, axis=0)
scores

==< age >==
[1]	valid_0's rmse: 13.2195	valid_0's l2: 174.754
Training until validation scores don't improve for 10 rounds
[2]	valid_0's rmse: 12.911	valid_0's l2: 166.695
[3]	valid_0's rmse: 12.7256	valid_0's l2: 161.941
[4]	valid_0's rmse: 12.4805	valid_0's l2: 155.763
[5]	valid_0's rmse: 12.3139	valid_0's l2: 151.633
[6]	valid_0's rmse: 12.1233	valid_0's l2: 146.975
[7]	valid_0's rmse: 11.9633	valid_0's l2: 143.12
[8]	valid_0's rmse: 11.8001	valid_0's l2: 139.242
[9]	valid_0's rmse: 11.654	valid_0's l2: 135.816
[10]	valid_0's rmse: 11.5451	valid_0's l2: 133.29
[11]	valid_0's rmse: 11.4469	valid_0's l2: 131.032
[12]	valid_0's rmse: 11.3425	valid_0's l2: 128.652
[13]	valid_0's rmse: 11.2747	valid_0's l2: 127.118
[14]	valid_0's rmse: 11.1893	valid_0's l2: 125.201
[15]	valid_0's rmse: 11.1236	valid_0's l2: 123.735
[16]	valid_0's rmse: 11.0502	valid_0's l2: 122.107
[17]	valid_0's rmse: 10.9839	valid_0's l2: 120.645
[18]	valid_0's rmse: 10.9609	valid_0's l2: 120.141
[19]	valid_0's rmse: 1

[64]	valid_0's rmse: 9.95017	valid_0's l2: 99.006
[65]	valid_0's rmse: 9.94343	valid_0's l2: 98.8719
[66]	valid_0's rmse: 9.93631	valid_0's l2: 98.7302
[67]	valid_0's rmse: 9.93658	valid_0's l2: 98.7357
[68]	valid_0's rmse: 9.92004	valid_0's l2: 98.4073
[69]	valid_0's rmse: 9.91707	valid_0's l2: 98.3482
[70]	valid_0's rmse: 9.91526	valid_0's l2: 98.3125
[71]	valid_0's rmse: 9.90905	valid_0's l2: 98.1892
[72]	valid_0's rmse: 9.89784	valid_0's l2: 97.9673
[73]	valid_0's rmse: 9.89129	valid_0's l2: 97.8376
[74]	valid_0's rmse: 9.8898	valid_0's l2: 97.808
[75]	valid_0's rmse: 9.88602	valid_0's l2: 97.7334
[76]	valid_0's rmse: 9.88411	valid_0's l2: 97.6956
[77]	valid_0's rmse: 9.88496	valid_0's l2: 97.7124
[78]	valid_0's rmse: 9.87894	valid_0's l2: 97.5934
[79]	valid_0's rmse: 9.87288	valid_0's l2: 97.4738
[80]	valid_0's rmse: 9.86921	valid_0's l2: 97.4013
[81]	valid_0's rmse: 9.86631	valid_0's l2: 97.3441
[82]	valid_0's rmse: 9.86259	valid_0's l2: 97.2707
[83]	valid_0's rmse: 9.85651	valid

[19]	valid_0's rmse: 9.61195	valid_0's l2: 92.3897
[20]	valid_0's rmse: 9.61415	valid_0's l2: 92.4319
[21]	valid_0's rmse: 9.616	valid_0's l2: 92.4675
[22]	valid_0's rmse: 9.60941	valid_0's l2: 92.3408
[23]	valid_0's rmse: 9.60128	valid_0's l2: 92.1847
[24]	valid_0's rmse: 9.59861	valid_0's l2: 92.1334
[25]	valid_0's rmse: 9.59799	valid_0's l2: 92.1214
[26]	valid_0's rmse: 9.59489	valid_0's l2: 92.0619
[27]	valid_0's rmse: 9.59818	valid_0's l2: 92.125
[28]	valid_0's rmse: 9.59174	valid_0's l2: 92.0015
[29]	valid_0's rmse: 9.59001	valid_0's l2: 91.9682
[30]	valid_0's rmse: 9.58755	valid_0's l2: 91.921
[31]	valid_0's rmse: 9.58422	valid_0's l2: 91.8572
[32]	valid_0's rmse: 9.58553	valid_0's l2: 91.8823
[33]	valid_0's rmse: 9.5899	valid_0's l2: 91.9662
[34]	valid_0's rmse: 9.58845	valid_0's l2: 91.9383
[35]	valid_0's rmse: 9.59061	valid_0's l2: 91.9797
[36]	valid_0's rmse: 9.59424	valid_0's l2: 92.0495
[37]	valid_0's rmse: 9.59159	valid_0's l2: 91.9986
[38]	valid_0's rmse: 9.5901	valid_0'

[42]	valid_0's rmse: 9.54635	valid_0's l2: 91.1329
[43]	valid_0's rmse: 9.55085	valid_0's l2: 91.2187
[44]	valid_0's rmse: 9.55126	valid_0's l2: 91.2265
[45]	valid_0's rmse: 9.55637	valid_0's l2: 91.3242
[46]	valid_0's rmse: 9.55473	valid_0's l2: 91.2929
[47]	valid_0's rmse: 9.55455	valid_0's l2: 91.2895
[48]	valid_0's rmse: 9.55647	valid_0's l2: 91.3261
[49]	valid_0's rmse: 9.55538	valid_0's l2: 91.3053
[50]	valid_0's rmse: 9.5559	valid_0's l2: 91.3152
[51]	valid_0's rmse: 9.55691	valid_0's l2: 91.3346
[52]	valid_0's rmse: 9.54994	valid_0's l2: 91.2014
Early stopping, best iteration is:
[42]	valid_0's rmse: 9.54635	valid_0's l2: 91.1329
==< domain1_var2 >==
[1]	valid_0's rmse: 11.1687	valid_0's l2: 124.741
Training until validation scores don't improve for 10 rounds
[2]	valid_0's rmse: 11.1754	valid_0's l2: 124.888
[3]	valid_0's rmse: 11.18	valid_0's l2: 124.992
[4]	valid_0's rmse: 11.1832	valid_0's l2: 125.065
[5]	valid_0's rmse: 11.1876	valid_0's l2: 125.163
[6]	valid_0's rmse: 11.1

[33]	valid_0's rmse: 11.8721	valid_0's l2: 140.947
Early stopping, best iteration is:
[23]	valid_0's rmse: 11.8655	valid_0's l2: 140.789
[1]	valid_0's rmse: 11.4999	valid_0's l2: 132.248
Training until validation scores don't improve for 10 rounds
[2]	valid_0's rmse: 11.4954	valid_0's l2: 132.144
[3]	valid_0's rmse: 11.4847	valid_0's l2: 131.899
[4]	valid_0's rmse: 11.4787	valid_0's l2: 131.76
[5]	valid_0's rmse: 11.4712	valid_0's l2: 131.587
[6]	valid_0's rmse: 11.4655	valid_0's l2: 131.457
[7]	valid_0's rmse: 11.4627	valid_0's l2: 131.393
[8]	valid_0's rmse: 11.4611	valid_0's l2: 131.357
[9]	valid_0's rmse: 11.4553	valid_0's l2: 131.225
[10]	valid_0's rmse: 11.4573	valid_0's l2: 131.269
[11]	valid_0's rmse: 11.4535	valid_0's l2: 131.182
[12]	valid_0's rmse: 11.4522	valid_0's l2: 131.154
[13]	valid_0's rmse: 11.4515	valid_0's l2: 131.136
[14]	valid_0's rmse: 11.4462	valid_0's l2: 131.016
[15]	valid_0's rmse: 11.4441	valid_0's l2: 130.967
[16]	valid_0's rmse: 11.4452	valid_0's l2: 130.

array([0.13919489, 0.1404716 , 0.13673358, 0.1703873 , 0.17427029])

作成したモデルを別に使用できるよう、保存する

In [8]:
import pickle
filename = 'lgbs_7cv.sav'
pickle.dump(models, open(filename, 'wb'))