# 第6章
モデルのハイパーパラメータのチューニングや特徴量の選択によってモデルの精度を高めるテクニックを学ぶ。

---
ソースコードは以下から引用しています: https://github.com/ghmagazine/kagglebook/tree/master/ch06

ライセンス: https://github.com/ghmagazine/kagglebook/blob/master/LICENSE

## hyperopt（ベイズ最適化）でGBDTのパラメータチューニングを行う

### データの準備

In [1]:
import numpy as np
import pandas as pd

# Load the preprocessed sample data.
# train_x: training features, train_y: target variable, test_x: test features.
# They are kept as pandas DataFrame / Series objects (numpy arrays are sometimes used instead).
train = pd.read_csv('data/sample-data/train_preprocessed.csv')
train_y = train['target']
train_x = train.drop(['target'], axis=1)
test_x = pd.read_csv('data/sample-data/test_preprocessed.csv')

# Split the training data into a training part and a validation part:
# only the first split of a shuffled 4-fold KFold is used.
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True, random_state=71)
tr_idx, va_idx = next(iter(kf.split(train_x)))
tr_x, tr_y = train_x.iloc[tr_idx], train_y.iloc[tr_idx]
va_x, va_y = train_x.iloc[va_idx], train_y.iloc[va_idx]

### モデルの準備

In [2]:
# xgboostによる学習・予測を行うクラス
import xgboost as xgb


class Model:
    """Small wrapper that trains an xgboost booster and predicts with it."""

    def __init__(self, params=None, *, num_round=10):
        """Store the training configuration.

        Args:
            params: Optional dict of xgboost parameters. Its entries override
                the defaults assembled in ``fit``. ``None`` means no overrides.
            num_round: Number of boosting rounds passed to ``xgb.train``.
                Defaults to 10, the value that used to be hard-coded in ``fit``.
        """
        # Booster returned by xgb.train; stays None until fit() has run.
        self.model = None
        self.params = {} if params is None else params
        self.num_round = num_round

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train the booster on the training data.

        The validation data is not used for fitting; it is only placed on the
        watch list so xgboost reports its metric next to the training metric.

        Args:
            tr_x: Training features.
            tr_y: Training labels.
            va_x: Validation features.
            va_y: Validation labels.
        """
        # Defaults first, then the caller's overrides on top.
        # NOTE(review): 'silent' appears to be deprecated in newer xgboost
        # releases in favour of 'verbosity' — confirm against the installed version.
        params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71}
        params.update(self.params)
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        self.model = xgb.train(params, dtrain, self.num_round, evals=watchlist)

    def predict(self, x):
        """Return the booster's predictions for ``x``.

        Args:
            x: Features to predict on; wrapped in an ``xgb.DMatrix``.

        Returns:
            The output of the trained booster's ``predict`` (presumably
            probabilities under the default 'binary:logistic' objective).

        Raises:
            RuntimeError: If called before ``fit``.
        """
        # Fail with an explicit message instead of an AttributeError on None.
        if self.model is None:
            raise RuntimeError('Model.predict() was called before Model.fit()')
        return self.model.predict(xgb.DMatrix(x))

### チューニングしたいパラメータを引数にとり、最小化したい評価指標のスコアを返す関数を作成
モデルを引数のパラメータで学習させ、バリデーションデータへの予測を行い、評価指標のスコアを計算する処理を行う。

In [3]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import log_loss


def score(params):
    """Objective function for hyperopt: train with ``params`` and return the loss.

    Trains a ``Model`` with the given parameters on the module-level training
    split (``tr_x``, ``tr_y``), predicts on the validation split (``va_x``) and
    computes the log loss against ``va_y``. The evaluated parameters and the
    loss are printed and appended to the module-level ``history`` list.

    Args:
        params: Dict of parameters to evaluate; must contain 'max_depth'.
            The dict itself is left unmodified.

    Returns:
        A dict with the log loss under 'loss' (the value to minimise) and
        ``STATUS_OK`` under 'status'.
    """
    # Work on a copy so the dict handed in by the caller is not mutated.
    params = dict(params)

    # max_depth may arrive as a float (e.g. 9.0); convert it to an integer.
    params['max_depth'] = int(params['max_depth'])

    # Model.fit trains the model; Model.predict returns the predictions
    # that are scored with log loss below.
    model = Model(params)
    model.fit(tr_x, tr_y, va_x, va_y)
    va_pred = model.predict(va_x)
    # Named `loss` rather than `score` to avoid shadowing this function's name.
    loss = log_loss(va_y, va_pred)
    print(f'params: {params}, logloss: {loss:.4f}')

    # Keep a record of every evaluation for later inspection.
    history.append((params, loss))

    return {'loss': loss, 'status': STATUS_OK}

### 探索するパラメータ空間を定義

In [4]:
# Parameter space to search.
# Each entry is (label, low, high, step) and becomes hp.quniform(label, low, high, step),
# stored under the same label.
space = {
    label: hp.quniform(label, low, high, step)
    for label, low, high, step in (
        ('min_child_weight', 1, 5, 1),
        ('max_depth', 3, 9, 1),
        ('gamma', 0, 0.4, 0.1),
    )
}

### パラメータ探索の実行

In [5]:
# Run the hyperopt parameter search.
# max_evals is the number of evaluations (around 100 is sufficient).
max_evals = 100
# history collects the (params, score) tuples appended by score().
history = []
# trials is hyperopt's own record of the search.
trials = Trials()
fmin(fn=score, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)

[0]	train-error:0.124	eval-error:0.1456
[1]	train-error:0.115333	eval-error:0.1452
[2]	train-error:0.110933	eval-error:0.1416
[3]	train-error:0.102	eval-error:0.136
[4]	train-error:0.094	eval-error:0.1312
[5]	train-error:0.087867	eval-error:0.1328
[6]	train-error:0.083867	eval-error:0.1296
[7]	train-error:0.0804	eval-error:0.1248
[8]	train-error:0.074667	eval-error:0.1216
[9]	train-error:0.070133	eval-error:0.1196
params: {'gamma': 0.0, 'max_depth': 7, 'min_child_weight': 4.0}, logloss: 0.2942
[0]	train-error:0.106133	eval-error:0.144
[1]	train-error:0.090933	eval-error:0.1424
[2]	train-error:0.080133	eval-error:0.1388
[3]	train-error:0.075333	eval-error:0.1348
[4]	train-error:0.071333	eval-error:0.1308
[5]	train-error:0.063733	eval-error:0.1268
[6]	train-error:0.055067	eval-error:0.12
[7]	train-error:0.051467	eval-error:0.1188
[8]	train-error:0.045867	eval-error:0.1144
[9]	train-error:0.0408	eval-error:0.1108
params: {'gamma': 0.2, 'max_depth': 9, 'min_child_weight': 2.0}, logloss: 0.

[3]	train-error:0.082	eval-error:0.1316
[4]	train-error:0.0768	eval-error:0.126
[5]	train-error:0.0688	eval-error:0.1204
[6]	train-error:0.062133	eval-error:0.1204
[7]	train-error:0.0552	eval-error:0.1176
[8]	train-error:0.050533	eval-error:0.118
[9]	train-error:0.0456	eval-error:0.1132
params: {'gamma': 0.0, 'max_depth': 9, 'min_child_weight': 3.0}, logloss: 0.2793
[0]	train-error:0.121067	eval-error:0.1408
[1]	train-error:0.111067	eval-error:0.1404
[2]	train-error:0.105733	eval-error:0.1376
[3]	train-error:0.096	eval-error:0.1312
[4]	train-error:0.0892	eval-error:0.1292
[5]	train-error:0.084933	eval-error:0.1248
[6]	train-error:0.080267	eval-error:0.124
[7]	train-error:0.076533	eval-error:0.12
[8]	train-error:0.072133	eval-error:0.1192
[9]	train-error:0.068667	eval-error:0.1208
params: {'gamma': 0.2, 'max_depth': 7, 'min_child_weight': 3.0}, logloss: 0.2927
[0]	train-error:0.1432	eval-error:0.1568
[1]	train-error:0.1348	eval-error:0.1532
[2]	train-error:0.125733	eval-error:0.1436
[3]

[7]	train-error:0.070933	eval-error:0.1224
[8]	train-error:0.065333	eval-error:0.1224
[9]	train-error:0.061333	eval-error:0.1208
params: {'gamma': 0.30000000000000004, 'max_depth': 7, 'min_child_weight': 2.0}, logloss: 0.2911
[0]	train-error:0.111467	eval-error:0.1428
[1]	train-error:0.0992	eval-error:0.1396
[2]	train-error:0.0884	eval-error:0.1356
[3]	train-error:0.084667	eval-error:0.1312
[4]	train-error:0.076267	eval-error:0.124
[5]	train-error:0.069067	eval-error:0.12
[6]	train-error:0.064	eval-error:0.1216
[7]	train-error:0.059733	eval-error:0.1148
[8]	train-error:0.052133	eval-error:0.118
[9]	train-error:0.049067	eval-error:0.112
params: {'gamma': 0.1, 'max_depth': 8, 'min_child_weight': 2.0}, logloss: 0.2851
[0]	train-error:0.1116	eval-error:0.1428
[1]	train-error:0.097733	eval-error:0.1452
[2]	train-error:0.091467	eval-error:0.1392
[3]	train-error:0.082	eval-error:0.1316
[4]	train-error:0.076533	eval-error:0.1256
[5]	train-error:0.0692	eval-error:0.1228
[6]	train-error:0.063333

params: {'gamma': 0.1, 'max_depth': 9, 'min_child_weight': 1.0}, logloss: 0.2825
[0]	train-error:0.121067	eval-error:0.1408
[1]	train-error:0.111067	eval-error:0.1404
[2]	train-error:0.105333	eval-error:0.1372
[3]	train-error:0.095867	eval-error:0.1312
[4]	train-error:0.089333	eval-error:0.1292
[5]	train-error:0.085467	eval-error:0.1268
[6]	train-error:0.080933	eval-error:0.1256
[7]	train-error:0.076267	eval-error:0.1224
[8]	train-error:0.0708	eval-error:0.118
[9]	train-error:0.067867	eval-error:0.1176
params: {'gamma': 0.0, 'max_depth': 7, 'min_child_weight': 3.0}, logloss: 0.2913
[0]	train-error:0.111467	eval-error:0.1428
[1]	train-error:0.099467	eval-error:0.14
[2]	train-error:0.087333	eval-error:0.1388
[3]	train-error:0.082	eval-error:0.1324
[4]	train-error:0.075333	eval-error:0.128
[5]	train-error:0.068533	eval-error:0.1248
[6]	train-error:0.0632	eval-error:0.124
[7]	train-error:0.060133	eval-error:0.122
[8]	train-error:0.052533	eval-error:0.12
[9]	train-error:0.0468	eval-error:0.

[0]	train-error:0.106133	eval-error:0.144
[1]	train-error:0.091333	eval-error:0.1428
[2]	train-error:0.0808	eval-error:0.1384
[3]	train-error:0.076533	eval-error:0.1356
[4]	train-error:0.071733	eval-error:0.1304
[5]	train-error:0.062533	eval-error:0.1216
[6]	train-error:0.054267	eval-error:0.1236
[7]	train-error:0.049867	eval-error:0.1208
[8]	train-error:0.045867	eval-error:0.118
[9]	train-error:0.040267	eval-error:0.1188
params: {'gamma': 0.4, 'max_depth': 9, 'min_child_weight': 2.0}, logloss: 0.2809
[0]	train-error:0.098133	eval-error:0.1408
[1]	train-error:0.0844	eval-error:0.1436
[2]	train-error:0.071467	eval-error:0.1332
[3]	train-error:0.065333	eval-error:0.1348
[4]	train-error:0.059467	eval-error:0.1328
[5]	train-error:0.050267	eval-error:0.1264
[6]	train-error:0.045333	eval-error:0.1228
[7]	train-error:0.0396	eval-error:0.1216
[8]	train-error:0.034133	eval-error:0.1168
[9]	train-error:0.0312	eval-error:0.1148
params: {'gamma': 0.30000000000000004, 'max_depth': 9, 'min_child_wei

[0]	train-error:0.1116	eval-error:0.1428
[1]	train-error:0.097867	eval-error:0.1452
[2]	train-error:0.0916	eval-error:0.1392
[3]	train-error:0.082267	eval-error:0.1316
[4]	train-error:0.077333	eval-error:0.1264
[5]	train-error:0.0692	eval-error:0.126
[6]	train-error:0.064	eval-error:0.1208
[7]	train-error:0.058133	eval-error:0.1212
[8]	train-error:0.051067	eval-error:0.1208
[9]	train-error:0.0468	eval-error:0.1196
params: {'gamma': 0.30000000000000004, 'max_depth': 9, 'min_child_weight': 3.0}, logloss: 0.2805
[0]	train-error:0.107867	eval-error:0.1424
[1]	train-error:0.096133	eval-error:0.1404
[2]	train-error:0.081733	eval-error:0.1312
[3]	train-error:0.073467	eval-error:0.1292
[4]	train-error:0.0692	eval-error:0.1232
[5]	train-error:0.063733	eval-error:0.124
[6]	train-error:0.055733	eval-error:0.1188
[7]	train-error:0.0504	eval-error:0.1156
[8]	train-error:0.0452	eval-error:0.1136
[9]	train-error:0.041333	eval-error:0.1132
params: {'gamma': 0.4, 'max_depth': 8, 'min_child_weight': 1.0

[1]	train-error:0.134533	eval-error:0.1516
[2]	train-error:0.1268	eval-error:0.1452
[3]	train-error:0.123467	eval-error:0.1424
[4]	train-error:0.115867	eval-error:0.138
[5]	train-error:0.1132	eval-error:0.1372
[6]	train-error:0.1096	eval-error:0.1396
[7]	train-error:0.104533	eval-error:0.1352
[8]	train-error:0.100667	eval-error:0.134
[9]	train-error:0.097733	eval-error:0.1272
params: {'gamma': 0.4, 'max_depth': 5, 'min_child_weight': 3.0}, logloss: 0.3108
[0]	train-error:0.106133	eval-error:0.144
[1]	train-error:0.090933	eval-error:0.1424
[2]	train-error:0.080133	eval-error:0.1388
[3]	train-error:0.075333	eval-error:0.1348
[4]	train-error:0.071333	eval-error:0.1308
[5]	train-error:0.063733	eval-error:0.1268
[6]	train-error:0.055067	eval-error:0.12
[7]	train-error:0.051467	eval-error:0.1188
[8]	train-error:0.045867	eval-error:0.1144
[9]	train-error:0.0408	eval-error:0.1108
params: {'gamma': 0.2, 'max_depth': 9, 'min_child_weight': 2.0}, logloss: 0.2792
[0]	train-error:0.107867	eval-erro

{'gamma': 0.30000000000000004, 'max_depth': 9.0, 'min_child_weight': 2.0}

### 結果の出力

In [6]:
# Report the best parameters and score from the recorded history.
# (The same information is available from trials, but the parameters are
# somewhat awkward to extract from there.)
# Sort ascending by score so the lowest-loss entry comes first.
history = sorted(history, key=lambda record: record[1])
best = history[0]
print(f'best params:{best[0]}, score:{best[1]:.4f}')

best params:{'gamma': 0.30000000000000004, 'max_depth': 9, 'min_child_weight': 2.0}, score:0.2782


## GBDTで特徴量の重要度を求める

In [7]:
# Reload the training data: train_x holds the features, train_y the target variable.
train = pd.read_csv('data/sample-data/train_preprocessed.csv')
train_y = train['target']
train_x = train.drop(['target'], axis=1)

import xgboost as xgb

# Train an xgboost model on the full training data.
num_round = 50
params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71}
dtrain = xgb.DMatrix(train_x, label=train_y)
model = xgb.train(params, dtrain, num_round)

# Print the five most important features.
# Total gain is used instead of the default frequency-based importance.
fscore = model.get_score(importance_type='total_gain')
fscore = sorted(fscore.items(), key=lambda item: item[1], reverse=True)
print('xgboost importance')
print(fscore[:5])

xgboost importance
[('weight', 2614.0292872053), ('medical_info_a1', 2240.9029885495024), ('height', 1973.3420535613589), ('age', 1442.8326779044812), ('medical_info_a2', 1150.6861457969187)]
