# XGBoostモデルの作成

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import xgboost as xgb

EDAで作成したデータファイル（``data_7.csv``）から特徴量を読み込む

In [2]:
# Read the feature table written by the EDA step and preview its first rows.
df_7 = pd.read_csv(filepath_or_buffer='data_7.csv')
df_7.head(n=5)

Unnamed: 0,Id,age,domain1_var1,domain1_var2,domain2_var1,domain2_var2,IC_01,IC_07,IC_05,IC_16,...,CBN(13)_vs_DMN(94),CBN(18)_vs_DMN(94),CBN(4)_vs_DMN(94),CBN(7)_vs_DMN(94),CBN(18)_vs_CBN(13),CBN(4)_vs_CBN(13),CBN(7)_vs_CBN(13),CBN(4)_vs_CBN(18),CBN(7)_vs_CBN(18),CBN(7)_vs_CBN(4)
0,10001,57.436077,30.571975,62.553736,53.32513,51.427998,0.00607,0.014466,0.004136,0.000658,...,-0.203459,0.630524,0.227728,0.407087,0.450352,-0.074469,0.151244,-0.105504,0.626158,0.352528
1,10002,59.580851,50.969456,67.470628,60.651856,58.311361,0.009087,0.009291,0.007049,-0.002076,...,-0.306443,-0.047546,0.26053,-0.207901,0.595487,0.809261,0.648386,0.05219,0.772152,0.271778
2,10004,71.413018,53.152498,58.012103,52.418389,62.536641,0.004675,0.000957,0.006154,-0.000429,...,-0.276111,0.39304,0.370413,0.04279,0.437297,0.560578,0.440411,0.032559,0.923573,0.112833
3,10005,66.53263,51.847306,60.052535,52.108977,69.993075,-0.000398,0.006878,0.009051,0.000369,...,-0.248033,0.522701,0.10787,0.605457,0.660222,0.7115,0.321404,0.077306,1.06345,0.124246
4,10007,38.617381,49.197021,65.674285,40.151376,34.096421,0.005192,0.010585,0.01216,-0.00092,...,-0.178258,0.387695,0.166441,0.176302,0.504423,0.821638,0.517638,0.372172,0.787271,0.328486


In [None]:
# Grid-search XGBoost hyper-parameters independently for each target column.
targets = df_7.columns[1:6]

# The feature matrix is the same for every target, so build it once
# instead of re-slicing the frame on every loop iteration.
X = df_7.iloc[:, 6:]

# 3 * 3 * 3 * 3 = 81 candidate settings, each refitted once per CV split.
param_grid = {
    'max_depth': [2, 4, 6],
    'n_estimators': [50, 100, 200],
    'reg_alpha': [0, 0.3, 1],
    'reg_lambda': [0.3, 1, 3],
}

for target in targets:
    print('==< {} >=='.format(target))
    y = df_7[str(target)]

    xgb_model = xgb.XGBRegressor()
    # Rank candidates by mean absolute error. Without an explicit `scoring`
    # GridSearchCV uses the estimator's own score() method, so passing
    # eval_metric='mae' to fit() did not influence which candidate was
    # selected; that keyword is therefore dropped.
    clf = GridSearchCV(xgb_model,
                       param_grid,
                       scoring='neg_mean_absolute_error',
                       verbose=1)
    clf.fit(X, y)
    # best_score_ is the negated MAE (sklearn maximises scores): closer to 0 is better.
    print(clf.best_score_)
    print(clf.best_params_)

sklearnのGridSearchライブラリでパラメーターチューニングを試みたが、時間がかかりすぎるため断念…  
  
lightGBMと同様、stackingで使用できるようにsklearnラッパーを用いてモデル作成。  
    
K-foldライブラリを使用し、3通りのtrain/validationの分割で学習する。

In [None]:
# Use the scikit-learn wrapper of XGBoost, one regressor per target column.

models = [xgb.XGBRegressor() for _ in range(5)]
# Keep the individual names bound in case other cells refer to them.
model_1, model_2, model_3, model_4, model_5 = models

X = df_7.iloc[:, 6:].values
y = df_7.iloc[:, 1:6].values
targets = df_7.columns[1:6]

# Out-of-fold predictions: every row is filled exactly once, by the model
# that did NOT see that row during training.
preds = np.empty_like(y)

# 3-fold cross-validation
# Train and predict each target variable separately
for i, target in enumerate(targets):
    print('==< {} >=='.format(target))
    y_cv = y[:, i]
    kfold = KFold(n_splits=3).split(X, y_cv)
    for train_index, test_index in kfold:
        X_train, X_val = X[train_index], X[test_index]
        # Slice the current target column only; indexing the full 5-column
        # `y` here would hand every target to a single-target model.
        y_train, y_val = y_cv[train_index], y_cv[test_index]

        # NOTE(review): eval_metric / early_stopping_rounds are passed as
        # fit() keywords, matching the xgboost version that produced the
        # recorded logs; newer xgboost releases expect them on the
        # constructor instead -- confirm against the installed version.
        # Each fit() call retrains models[i], so after the loop the list
        # holds the models from the last fold.
        models[i].fit(X_train, y_train,
                      eval_set=[(X_val, y_val)],
                      eval_metric='mae',
                      early_stopping_rounds=10,
                      verbose=True)
        # Store predictions for the held-out rows only, so `scores` below
        # measures validation error rather than (mostly) training error.
        preds[test_index, i] = models[i].predict(X_val)

# Per-target sum of absolute errors divided by the sum of predictions.
# NOTE(review): the denominator uses predictions, not true values -- confirm
# this matches the intended metric definition.
scores = np.sum(np.abs(y - preds), axis=0) / np.sum(preds, axis=0)
scores

==< age >==
[0]	validation_0-mae:45.9661
Will train until validation_0-mae hasn't improved in 10 rounds.
[1]	validation_0-mae:41.3252
[2]	validation_0-mae:37.155
[3]	validation_0-mae:33.4253
[4]	validation_0-mae:30.0993
[5]	validation_0-mae:27.176
[6]	validation_0-mae:24.6027
[7]	validation_0-mae:22.3541
[8]	validation_0-mae:20.3954
[9]	validation_0-mae:18.6813
[10]	validation_0-mae:17.2179
[11]	validation_0-mae:15.9781
[12]	validation_0-mae:14.9358
[13]	validation_0-mae:14.0552
[14]	validation_0-mae:13.3193
[15]	validation_0-mae:12.7089
[16]	validation_0-mae:12.2034
[17]	validation_0-mae:11.7786
[18]	validation_0-mae:11.4354
[19]	validation_0-mae:11.1501
[20]	validation_0-mae:10.91
[21]	validation_0-mae:10.712
[22]	validation_0-mae:10.5524
[23]	validation_0-mae:10.4286
[24]	validation_0-mae:10.3187
[25]	validation_0-mae:10.2351
[26]	validation_0-mae:10.1675
[27]	validation_0-mae:10.1154
[28]	validation_0-mae:10.0672
[29]	validation_0-mae:10.0276
[30]	validation_0-mae:9.9966
[31]	valid

[50]	validation_0-mae:9.88904
[51]	validation_0-mae:9.89333
[52]	validation_0-mae:9.88669
[53]	validation_0-mae:9.8839
[54]	validation_0-mae:9.88561
[55]	validation_0-mae:9.8822
[56]	validation_0-mae:9.8873
[57]	validation_0-mae:9.88506
[58]	validation_0-mae:9.88222
[59]	validation_0-mae:9.88103
[60]	validation_0-mae:9.88386
[61]	validation_0-mae:9.88956
[62]	validation_0-mae:9.89114
[63]	validation_0-mae:9.89117
[64]	validation_0-mae:9.89628
[65]	validation_0-mae:9.89686
[66]	validation_0-mae:9.89722
[67]	validation_0-mae:9.90421
[68]	validation_0-mae:9.89999
[69]	validation_0-mae:9.90648
Stopping. Best iteration:
[59]	validation_0-mae:9.88103

[0]	validation_0-mae:46.7111
Will train until validation_0-mae hasn't improved in 10 rounds.
[1]	validation_0-mae:42.0835
[2]	validation_0-mae:37.9277
[3]	validation_0-mae:34.2046
[4]	validation_0-mae:30.8896
[5]	validation_0-mae:27.9237
[6]	validation_0-mae:25.2905
[7]	validation_0-mae:22.9756
[8]	validation_0-mae:20.9497
[9]	validation_0-mae:

[29]	validation_0-mae:9.77118
[30]	validation_0-mae:9.7347
[31]	validation_0-mae:9.69013
[32]	validation_0-mae:9.65904
[33]	validation_0-mae:9.63792
[34]	validation_0-mae:9.61788
[35]	validation_0-mae:9.59147
[36]	validation_0-mae:9.58157
[37]	validation_0-mae:9.56647
[38]	validation_0-mae:9.5536
[39]	validation_0-mae:9.54928
[40]	validation_0-mae:9.54644
[41]	validation_0-mae:9.53571
[42]	validation_0-mae:9.53584
[43]	validation_0-mae:9.52904
[44]	validation_0-mae:9.52527
[45]	validation_0-mae:9.52104
[46]	validation_0-mae:9.52154
[47]	validation_0-mae:9.52164
[48]	validation_0-mae:9.51703
[49]	validation_0-mae:9.51893
[50]	validation_0-mae:9.51952
[51]	validation_0-mae:9.51642
[52]	validation_0-mae:9.51495
[53]	validation_0-mae:9.51574
[54]	validation_0-mae:9.5192
[55]	validation_0-mae:9.52297
[56]	validation_0-mae:9.52468
[57]	validation_0-mae:9.52685
[58]	validation_0-mae:9.5221
[59]	validation_0-mae:9.51752
[60]	validation_0-mae:9.51293
[61]	validation_0-mae:9.51549
[62]	validatio

[6]	validation_0-mae:25.3405
[7]	validation_0-mae:23.0444
[8]	validation_0-mae:21.0514
[9]	validation_0-mae:19.309
[10]	validation_0-mae:17.7997
[11]	validation_0-mae:16.4899
[12]	validation_0-mae:15.3718
[13]	validation_0-mae:14.4336
[14]	validation_0-mae:13.6471
[15]	validation_0-mae:12.9892
[16]	validation_0-mae:12.4393
[17]	validation_0-mae:11.9717
[18]	validation_0-mae:11.5828
[19]	validation_0-mae:11.2545
[20]	validation_0-mae:10.9859
[21]	validation_0-mae:10.7514
[22]	validation_0-mae:10.5686
[23]	validation_0-mae:10.4073
[24]	validation_0-mae:10.276
[25]	validation_0-mae:10.1754
[26]	validation_0-mae:10.0889
[27]	validation_0-mae:10.0171
[28]	validation_0-mae:9.95189
[29]	validation_0-mae:9.89852
[30]	validation_0-mae:9.85813
[31]	validation_0-mae:9.82995
[32]	validation_0-mae:9.79266
[33]	validation_0-mae:9.76922
[34]	validation_0-mae:9.75134
[35]	validation_0-mae:9.73186
[36]	validation_0-mae:9.71555
[37]	validation_0-mae:9.70146
[38]	validation_0-mae:9.69182
[39]	validation_

作成したモデルを別に使用できるよう、保存する

In [20]:
import pickle

# Persist the list of fitted models so they can be loaded elsewhere.
filename = 'xgbs_7cv.sav'
# The context manager closes (and flushes) the file even if dump() raises;
# the bare open(...) used before never closed the handle.
with open(filename, 'wb') as model_file:
    pickle.dump(models, model_file)