# 第5章
モデルの評価について学ぶ。

---
ソースコードは以下から引用しています: https://github.com/ghmagazine/kagglebook/tree/master/ch05

ライセンス: https://github.com/ghmagazine/kagglebook/blob/master/LICENSE

## データとモデルの準備

In [1]:
import numpy as np
import pandas as pd

# train_x is the training data, train_y the target variable and test_x the test data
# (loaded here under the origin_* names).
# They are held as a pandas DataFrame / Series (numpy arrays are sometimes used instead).

train = pd.read_csv('data/sample-data/train_preprocessed.csv')
origin_test_x = pd.read_csv('data/sample-data/test_preprocessed.csv')

# Separate the target column from the remaining columns of the training file.
origin_train_y = train['target']
origin_train_x = train.drop(columns=['target'])

# xgboostによる学習・予測を行うクラス
import xgboost as xgb


class Model:
    """Small wrapper around ``xgb.train`` / ``Booster.predict``.

    ``fit`` trains a booster on the training split while reporting metrics on
    the validation split; ``predict`` returns the booster's predictions.
    """

    def __init__(self, params=None, num_round=10):
        """Store the training configuration.

        Args:
            params: Optional dict of xgboost parameters. It is merged over the
                defaults defined in ``fit``, so its entries take precedence.
            num_round: Number of boosting rounds passed to ``xgb.train``.
                Defaults to 10, the value that used to be hard-coded.
        """
        self.model = None
        self.params = {} if params is None else params
        self.num_round = num_round

    def fit(self, tr_x, tr_y, va_x, va_y):
        """Train the booster and keep it in ``self.model``.

        Args:
            tr_x: Training features.
            tr_y: Training labels.
            va_x: Validation features (used only in the evaluation watchlist).
            va_y: Validation labels (used only in the evaluation watchlist).
        """
        # NOTE(review): newer xgboost releases replaced 'silent' with
        # 'verbosity' -- confirm against the installed version before changing.
        params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71}
        # User-supplied settings override the defaults above.
        params.update(self.params)
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)
        # Both splits are evaluated after every boosting round.
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        self.model = xgb.train(params, dtrain, self.num_round, evals=watchlist)

    def predict(self, x):
        """Return the trained booster's predictions for ``x``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        # Fail with a clear message instead of an AttributeError on None.
        if self.model is None:
            raise RuntimeError('Model.predict() was called before Model.fit()')
        data = xgb.DMatrix(x)
        return self.model.predict(data)

## クロスバリデーション

In [2]:
# Work on copies of the loaded data.
train_x = origin_train_x.copy()
train_y = origin_train_y.copy()

from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

# The Model class is assumed to be defined already:
# fit() trains it and predict() outputs the predicted probabilities.

# Split for cross-validation with KFold.
# Shuffle just in case, even when the rows already look randomly ordered.
kf = KFold(n_splits=4, shuffle=True, random_state=71)

scores = []
for tr_idx, va_idx in kf.split(train_x):
    # Positional indices from the splitter -> training / validation rows.
    tr_x = train_x.iloc[tr_idx]
    tr_y = train_y.iloc[tr_idx]
    va_x = train_x.iloc[va_idx]
    va_y = train_y.iloc[va_idx]

    # Train, predict on the validation rows, then score this fold.
    model = Model()
    model.fit(tr_x, tr_y, va_x, va_y)
    va_pred = model.predict(va_x)
    score = log_loss(va_y, va_pred)
    scores.append(score)

# Average the per-fold scores.
print(np.mean(scores))

[0]	train-error:0.128533	eval-error:0.1516
[1]	train-error:0.115333	eval-error:0.146
[2]	train-error:0.109333	eval-error:0.1376
[3]	train-error:0.105333	eval-error:0.1364
[4]	train-error:0.096933	eval-error:0.1384
[5]	train-error:0.094667	eval-error:0.1364
[6]	train-error:0.087333	eval-error:0.1296
[7]	train-error:0.084933	eval-error:0.1244
[8]	train-error:0.078133	eval-error:0.1208
[9]	train-error:0.073733	eval-error:0.1172
[0]	train-error:0.124	eval-error:0.1512
[1]	train-error:0.1156	eval-error:0.1452
[2]	train-error:0.110933	eval-error:0.1404
[3]	train-error:0.1072	eval-error:0.1396
[4]	train-error:0.097067	eval-error:0.1364
[5]	train-error:0.092133	eval-error:0.1312
[6]	train-error:0.087333	eval-error:0.124
[7]	train-error:0.084133	eval-error:0.1236
[8]	train-error:0.0804	eval-error:0.12
[9]	train-error:0.0768	eval-error:0.1208
[0]	train-error:0.130267	eval-error:0.1576
[1]	train-error:0.1188	eval-error:0.1384
[2]	train-error:0.113067	eval-error:0.1416
[3]	train-error:0.1056	eval-

## GroupKFold

GroupKFoldクラスではシャッフルと乱数シードの指定ができず使いづらいため、

ここではKFoldクラスを用いて、顧客ID単位で分割するアプローチをとる。

In [3]:
train_x = origin_train_x.copy()
train_y = origin_train_y.copy()

# Pretend the data contains each user 4 times:
# every run of 4 consecutive rows gets the same user_id.
train_x['user_id'] = np.arange(len(train_x)) // 4
train_x.head()

Unnamed: 0,age,sex,height,weight,product,amount,medical_info_a1,medical_info_a2,medical_info_a3,medical_info_b1,...,medical_keyword_6,medical_keyword_7,medical_keyword_8,medical_keyword_9,medical_keyword_10,year,month,day,yearmonth,user_id
0,50,1,166.445608,65.016732,9,7000000,134,202,1,11,...,1,0,1,0,0,2015,2,3,24182,0
1,68,0,164.334615,56.544217,0,7000000,438,263,3,14,...,0,1,1,0,0,2015,5,9,24185,0
2,77,1,167.462917,54.242267,2,6000000,313,325,1,18,...,1,0,1,0,0,2016,2,13,24194,0
3,17,1,177.097725,71.147762,3,8000000,342,213,2,11,...,0,0,1,0,0,2015,7,6,24187,0
4,62,0,158.165788,65.240697,1,9000000,327,102,0,14,...,0,1,1,1,0,2016,9,17,24201,1


In [4]:
# Split using the customer ID in the user_id column as the unit.
user_id = train_x['user_id']
unique_user_ids = user_id.unique()
unique_user_ids

array([   0,    1,    2, ..., 2497, 2498, 2499])

In [5]:
from sklearn.model_selection import KFold, GroupKFold

# Use the KFold class to split at the customer-ID level.
kf = KFold(n_splits=4, shuffle=True, random_state=71)

In [6]:
# For checking how the split behaves.
for tr_group_idx, va_group_idx in kf.split(unique_user_ids):
    print('tr_group_idx: ', tr_group_idx)
    print('va_group_idx: ', va_group_idx)
    print('-'*10)

# The loop variables still hold the last fold here, so these sizes are for that fold.
print('tr_group size: ', len(tr_group_idx))
print('va_group size: ', len(va_group_idx))

tr_group_idx:  [   0    1    2 ... 2496 2498 2499]
va_group_idx:  [   8    9   11   12   14   17   20   22   27   32   39   44   48   52
   53   64   70   78   79   83   91   94  103  104  106  110  111  117
  118  123  130  140  141  142  143  145  146  150  154  158  160  161
  163  170  173  179  180  182  185  190  209  210  218  227  231  233
  237  242  247  251  252  262  263  266  267  272  273  287  288  291
  292  299  304  316  317  327  329  339  344  345  348  350  352  358
  367  368  372  374  376  380  384  386  398  406  410  414  417  431
  434  435  446  449  453  457  462  468  474  476  477  484  493  497
  503  507  512  515  516  522  523  528  535  536  540  542  546  548
  550  551  553  558  563  569  570  571  576  577  578  580  584  585
  586  589  590  591  594  596  603  606  607  608  612  614  618  620
  622  623  624  634  636  639  642  644  647  651  654  657  659  661
  668  669  671  680  681  688  690  691  692  705  709  712  714  721
  725  727 

tr_group_idx:  [   1    2    3 ... 2496 2497 2499]
va_group_idx:  [   0    4    7   10   19   23   25   28   29   30   41   56   58   65
   67   72   75   77   84   88   89   97   99  101  102  107  112  119
  121  125  126  128  133  136  138  148  151  153  155  164  168  169
  171  174  176  187  192  196  197  199  201  206  216  219  225  226
  229  230  241  245  248  249  254  259  264  265  279  282  284  301
  302  303  309  313  319  320  322  328  333  337  338  342  359  363
  369  375  383  387  389  392  395  396  401  404  405  407  413  415
  422  423  425  428  430  438  439  440  444  447  455  459  465  466
  472  473  475  478  485  487  489  506  508  514  520  521  525  529
  533  545  552  555  557  560  562  572  575  582  587  588  595  599
  600  604  605  615  619  625  629  637  641  646  652  653  660  662
  676  679  683  684  687  699  701  707  716  729  733  741  743  749
  751  754  756  760  761  766  769  775  777  781  782  783  786  787
  788  796 

In [7]:
# For checking: compare the positions from the last fold with the IDs they select.
print(tr_group_idx)
print(unique_user_ids[tr_group_idx])

[   1    2    3 ... 2496 2497 2499]
[   1    2    3 ... 2496 2497 2499]


In [8]:
# For checking: per-row user IDs and the membership mask built from them.
print(user_id)
# NOTE(review): isin() is given positions from kf.split, not customer IDs;
# this presumably only works while unique_user_ids[i] == i -- confirm.
print(user_id.isin(tr_group_idx))

0          0
1          0
2          0
3          0
4          1
5          1
6          1
7          1
8          2
9          2
10         2
11         2
12         3
13         3
14         3
15         3
16         4
17         4
18         4
19         4
20         5
21         5
22         5
23         5
24         6
25         6
26         6
27         6
28         7
29         7
        ... 
9970    2492
9971    2492
9972    2493
9973    2493
9974    2493
9975    2493
9976    2494
9977    2494
9978    2494
9979    2494
9980    2495
9981    2495
9982    2495
9983    2495
9984    2496
9985    2496
9986    2496
9987    2496
9988    2497
9989    2497
9990    2497
9991    2497
9992    2498
9993    2498
9994    2498
9995    2498
9996    2499
9997    2499
9998    2499
9999    2499
Name: user_id, Length: 10000, dtype: int64
0       False
1       False
2       False
3       False
4        True
5        True
6        True
7        True
8        True
9        True
10       True
11       T

In [9]:
scores = []
for tr_group_idx, va_group_idx in kf.split(unique_user_ids):
    # Split the customer IDs into train / valid groups.
    # kf.split yields positions into unique_user_ids, not the IDs themselves,
    # so map the positions back to real IDs. When the IDs are exactly 0..N-1
    # both are equal (the score does not change), but with any other ID scheme
    # using the positions directly would silently select the wrong rows.
    tr_groups = unique_user_ids[tr_group_idx]
    va_groups = unique_user_ids[va_group_idx]

    # Assign each record to train / valid according to its customer ID.
    is_tr = user_id.isin(tr_groups)
    is_va = user_id.isin(va_groups)
    tr_x, va_x = train_x[is_tr], train_x[is_va]
    tr_y, va_y = train_y[is_tr], train_y[is_va]

    # Train, predict on the validation data, then compute the score.
    model = Model()
    model.fit(tr_x, tr_y, va_x, va_y)
    va_pred = model.predict(va_x)
    score = log_loss(va_y, va_pred)
    scores.append(score)

# Average the per-fold scores.
print(np.mean(scores))

[0]	train-error:0.123467	eval-error:0.1536
[1]	train-error:0.116933	eval-error:0.1536
[2]	train-error:0.1092	eval-error:0.1504
[3]	train-error:0.102	eval-error:0.1452
[4]	train-error:0.0988	eval-error:0.1456
[5]	train-error:0.093067	eval-error:0.138
[6]	train-error:0.0864	eval-error:0.1332
[7]	train-error:0.080533	eval-error:0.1304
[8]	train-error:0.075467	eval-error:0.1308
[9]	train-error:0.0704	eval-error:0.1288
[0]	train-error:0.1264	eval-error:0.1528
[1]	train-error:0.1148	eval-error:0.1488
[2]	train-error:0.109867	eval-error:0.1444
[3]	train-error:0.102267	eval-error:0.144
[4]	train-error:0.095067	eval-error:0.1312
[5]	train-error:0.0924	eval-error:0.1344
[6]	train-error:0.085333	eval-error:0.1288
[7]	train-error:0.081333	eval-error:0.1256
[8]	train-error:0.0768	eval-error:0.1216
[9]	train-error:0.0736	eval-error:0.1204
[0]	train-error:0.129067	eval-error:0.1508
[1]	train-error:0.1196	eval-error:0.1408
[2]	train-error:0.1104	eval-error:0.1436
[3]	train-error:0.103867	eval-error:0.

## 時系列データのクロスバリデーション（時系列に沿って行う方法）

In [10]:
train_x = origin_train_x.copy()
train_y = origin_train_y.copy()
test_x = origin_test_x.copy()

# Treat the data as a time series and define a period variable along time:
# rows are cut, in order, into chunks of len(train_x) // 4 rows.
# Leftover rows at the end would get a label above 3, so clip them into period 3.
train_x['period'] = np.clip(np.arange(len(train_x)) // (len(train_x) // 4), 0, 3)

# The test data is given period 4.
test_x['period'] = 4

In [11]:
# Display train_x (now including the period column).
train_x

Unnamed: 0,age,sex,height,weight,product,amount,medical_info_a1,medical_info_a2,medical_info_a3,medical_info_b1,...,medical_keyword_6,medical_keyword_7,medical_keyword_8,medical_keyword_9,medical_keyword_10,year,month,day,yearmonth,period
0,50,1,166.445608,65.016732,9,7000000,134,202,1,11,...,1,0,1,0,0,2015,2,3,24182,0
1,68,0,164.334615,56.544217,0,7000000,438,263,3,14,...,0,1,1,0,0,2015,5,9,24185,0
2,77,1,167.462917,54.242267,2,6000000,313,325,1,18,...,1,0,1,0,0,2016,2,13,24194,0
3,17,1,177.097725,71.147762,3,8000000,342,213,2,11,...,0,0,1,0,0,2015,7,6,24187,0
4,62,0,158.165788,65.240697,1,9000000,327,102,0,14,...,0,1,1,1,0,2016,9,17,24201,0
5,14,0,162.875632,76.423119,7,10000,389,229,1,11,...,1,0,0,0,0,2015,10,27,24190,0
6,63,1,181.146801,63.982878,2,6000000,57,344,2,11,...,0,0,0,1,0,2015,8,19,24188,0
7,42,1,176.865716,68.588308,9,9000000,307,315,1,15,...,0,0,0,0,0,2016,11,5,24203,0
8,9,0,158.744094,63.187882,9,3000000,201,393,3,11,...,0,0,1,0,0,2016,1,5,24193,0
9,40,1,164.893735,52.348560,0,7000000,209,226,3,13,...,0,0,1,1,0,2016,5,2,24197,0


In [12]:
# Display the first rows of test_x (now including the period column).
test_x.head()

Unnamed: 0,age,sex,height,weight,product,amount,medical_info_a1,medical_info_a2,medical_info_a3,medical_info_b1,...,medical_keyword_6,medical_keyword_7,medical_keyword_8,medical_keyword_9,medical_keyword_10,year,month,day,yearmonth,period
0,49,1,187.431987,81.008363,1,1000000,302,212,1,10,...,1,0,1,0,0,2016,12,6,24204,4
1,79,1,171.63263,71.067812,6,2000,197,469,0,14,...,0,0,0,1,1,2016,9,3,24201,4
2,78,0,163.543983,64.032098,0,4000000,247,225,2,17,...,1,0,1,0,0,2015,4,10,24184,4
3,26,1,150.391858,52.32291,2,1000000,108,228,0,15,...,0,1,0,0,0,2016,4,17,24196,4
4,14,1,165.835167,67.008154,2,4000000,181,90,2,11,...,0,0,1,0,0,2015,1,26,24181,4


In [13]:
# Split on the period variable (0 through 3 are training data, 4 is the test data).
# Periods 1, 2 and 3 are each used as validation data in turn,
# and the data from earlier periods is used for training.
# TimeSeriesSplit can only use the row order, which is inconvenient,
# so va_period_list is defined by hand and looped over.

va_period_list = [1, 2, 3]

scores = []
for va_period in va_period_list:
    # Train on everything strictly before the validation period.
    is_tr = train_x['period'] < va_period
    is_va = train_x['period'] == va_period
    tr_x, tr_y = train_x[is_tr], train_y[is_tr]
    va_x, va_y = train_x[is_va], train_y[is_va]

    # Train, predict on the validation data, then compute the score.
    model = Model()
    model.fit(tr_x, tr_y, va_x, va_y)
    va_pred = model.predict(va_x)
    score = log_loss(va_y, va_pred)
    scores.append(score)

# Average the per-fold scores.
print(np.mean(scores))

[0]	train-error:0.1248	eval-error:0.1468
[1]	train-error:0.0984	eval-error:0.1444
[2]	train-error:0.096	eval-error:0.1388
[3]	train-error:0.0896	eval-error:0.1364
[4]	train-error:0.0816	eval-error:0.1324
[5]	train-error:0.0756	eval-error:0.1304
[6]	train-error:0.0672	eval-error:0.1312
[7]	train-error:0.06	eval-error:0.1304
[8]	train-error:0.0568	eval-error:0.1284
[9]	train-error:0.05	eval-error:0.128
[0]	train-error:0.121	eval-error:0.1528
[1]	train-error:0.1104	eval-error:0.1496
[2]	train-error:0.1028	eval-error:0.1468
[3]	train-error:0.0924	eval-error:0.1416
[4]	train-error:0.089	eval-error:0.1404
[5]	train-error:0.0846	eval-error:0.1428
[6]	train-error:0.08	eval-error:0.1444
[7]	train-error:0.0716	eval-error:0.1368
[8]	train-error:0.068	eval-error:0.1352
[9]	train-error:0.0602	eval-error:0.132
[0]	train-error:0.1256	eval-error:0.1576
[1]	train-error:0.116133	eval-error:0.1524
[2]	train-error:0.112933	eval-error:0.1484
[3]	train-error:0.102533	eval-error:0.1496
[4]	train-error:0.0949