githubのLightGBM/example/python-guide/に従ってチュートリアルを行ってみる。

### Simple_example.py

In [1]:
# coding: utf-8
# pylint: disable = invalid-name, C0111
import json
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error

In [2]:
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('./examples/regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('./examples/regression/regression.test', header=None, sep='\t')
print(df_train.shape,df_test.shape)

Load data...
(7000, 29) (500, 29)


In [3]:
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,1,0.644,0.247,-0.447,0.862,0.374,0.854,-1.126,-0.79,2.173,...,-0.19,-0.744,3.102,0.958,1.061,0.98,0.875,0.581,0.905,0.796
1,0,0.385,1.8,1.037,1.044,0.349,1.502,-0.966,1.734,0.0,...,-0.44,0.638,3.102,0.695,0.909,0.981,0.803,0.813,1.149,1.116
2,0,1.214,-0.166,0.004,0.505,1.434,0.628,-1.174,-1.23,1.087,...,-1.383,1.355,0.0,0.848,0.911,1.043,0.931,1.058,0.744,0.696
3,1,0.42,1.111,0.137,1.516,-1.657,0.854,0.623,1.605,1.087,...,0.731,1.424,3.102,1.597,1.282,1.105,0.73,0.148,1.231,1.234
4,0,0.897,-1.703,-1.306,1.022,-0.729,0.836,0.859,-0.333,2.173,...,-2.019,-0.289,0.0,0.805,0.93,0.984,1.43,2.198,1.934,1.684


In [4]:
y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values
print('y_train', y_train.shape)
print('y_test', y_test.shape)
print('x_train', X_train.shape)
print('x_test', X_test.shape)

y_train (7000,)
y_test (500,)
x_train (7000, 28)
x_test (500, 28)


In [5]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [6]:
lgb_train
# lgb特有のクラスがどうやら存在するらしい

<lightgbm.basic.Dataset at 0x114e5e7f0>

In [7]:
# specify your configurations as a dict
#　ここでパラメーターなどを設定する。あとで個々のパラメーターがなんの働きをしているのか調べよう
params = {
    'task': 'train', #訓練かテストかの指定も必要
    'boosting_type': 'gbdt', # gdbt以外もあるの？
    'objective': 'regression', #  回帰
    'metric': {'l2', 'auc'}, # 二乗誤差はわかるけどAUCって？
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [8]:
print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval, # ついでに検証も個々でできるみたい？
                early_stopping_rounds=5) #　早期打ち止めはここで設定できる

Start training...
[1]	valid_0's l2: 0.243898	valid_0's auc: 0.721096
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.240605	valid_0's auc: 0.732932
[3]	valid_0's l2: 0.236472	valid_0's auc: 0.773583
[4]	valid_0's l2: 0.232586	valid_0's auc: 0.781089
[5]	valid_0's l2: 0.22865	valid_0's auc: 0.787902
[6]	valid_0's l2: 0.226187	valid_0's auc: 0.780565
[7]	valid_0's l2: 0.223738	valid_0's auc: 0.786571
[8]	valid_0's l2: 0.221012	valid_0's auc: 0.784918
[9]	valid_0's l2: 0.218429	valid_0's auc: 0.784878
[10]	valid_0's l2: 0.215505	valid_0's auc: 0.788917
[11]	valid_0's l2: 0.213027	valid_0's auc: 0.791586
[12]	valid_0's l2: 0.210809	valid_0's auc: 0.793884
[13]	valid_0's l2: 0.208612	valid_0's auc: 0.796174
[14]	valid_0's l2: 0.207468	valid_0's auc: 0.793731
[15]	valid_0's l2: 0.206009	valid_0's auc: 0.794247
[16]	valid_0's l2: 0.20465	valid_0's auc: 0.793715
[17]	valid_0's l2: 0.202489	valid_0's auc: 0.797085
[18]	valid_0's l2: 0.200668	valid_0's auc: 0.802

In [9]:
print('Save model...')
# save model to file
gbm.save_model('model.txt')

Save model...


In [10]:
print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) #イテレーションごとの予測が出せるっぽい
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

Start predicting...
The rmse of prediction is: 0.44512434910807497


In [11]:
gbm.best_iteration

20

In [12]:
#ちなみにdataframeを直接予測することもできる。
df_test.iloc[:,1:].head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,19,20,21,22,23,24,25,26,27,28
0,0.644,0.247,-0.447,0.862,0.374,0.854,-1.126,-0.79,2.173,1.015,...,-0.19,-0.744,3.102,0.958,1.061,0.98,0.875,0.581,0.905,0.796
1,0.385,1.8,1.037,1.044,0.349,1.502,-0.966,1.734,0.0,0.966,...,-0.44,0.638,3.102,0.695,0.909,0.981,0.803,0.813,1.149,1.116
2,1.214,-0.166,0.004,0.505,1.434,0.628,-1.174,-1.23,1.087,0.579,...,-1.383,1.355,0.0,0.848,0.911,1.043,0.931,1.058,0.744,0.696
3,0.42,1.111,0.137,1.516,-1.657,0.854,0.623,1.605,1.087,1.511,...,0.731,1.424,3.102,1.597,1.282,1.105,0.73,0.148,1.231,1.234
4,0.897,-1.703,-1.306,1.022,-0.729,0.836,0.859,-0.333,2.173,1.336,...,-2.019,-0.289,0.0,0.805,0.93,0.984,1.43,2.198,1.934,1.684


In [13]:
print('Start predicting...')

for i in range(1,21):
    # predict
    y_pred = gbm.predict(df_test.iloc[:,1:], num_iteration=i) #イテレーションごとの予測が出せるっぽい
    # eval
    print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
    #上のl2と数字違うけど

Start predicting...
The rmse of prediction is: 0.4938607241619976
The rmse of prediction is: 0.4905154359341549
The rmse of prediction is: 0.4862842453841557
The rmse of prediction is: 0.48227163492963754
The rmse of prediction is: 0.4781731557256002
The rmse of prediction is: 0.4755914035480772
The rmse of prediction is: 0.47301003808307857
The rmse of prediction is: 0.4701187746576402
The rmse of prediction is: 0.4673641209298133
The rmse of prediction is: 0.4642249453292065
The rmse of prediction is: 0.46154898941415096
The rmse of prediction is: 0.45913944261856193
The rmse of prediction is: 0.45674100176886023
The rmse of prediction is: 0.45548693523578715
The rmse of prediction is: 0.45388261448639494
The rmse of prediction is: 0.45238204489027045
The rmse of prediction is: 0.44998823214138367
The rmse of prediction is: 0.4479599792791922
The rmse of prediction is: 0.44637392507504314
The rmse of prediction is: 0.44512434910807497


このチュートリアルをブログにまとめてる人がすでにいた
http://marugari2.hatenablog.jp/entry/2016/12/14/235747

### sklearn_example
もちろんsklearn likeなインターフェイスも兼ね備えていて、sklearnと同じように使うことができる。gridsearchとかもできる。

In [14]:
# coding: utf-8
# pylint: disable = invalid-name, C0111
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [15]:
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('./examples/regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('./examples/regression/regression.test', header=None, sep='\t')
print(df_test.head())

y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values

# この方法ではgbm専用のデータフレームを制作しなくてもよい

Load data...
   0      1      2      3      4      5      6      7      8      9   ...    \
0   1  0.644  0.247 -0.447  0.862  0.374  0.854 -1.126 -0.790  2.173  ...     
1   0  0.385  1.800  1.037  1.044  0.349  1.502 -0.966  1.734  0.000  ...     
2   0  1.214 -0.166  0.004  0.505  1.434  0.628 -1.174 -1.230  1.087  ...     
3   1  0.420  1.111  0.137  1.516 -1.657  0.854  0.623  1.605  1.087  ...     
4   0  0.897 -1.703 -1.306  1.022 -0.729  0.836  0.859 -0.333  2.173  ...     

      19     20     21     22     23     24     25     26     27     28  
0 -0.190 -0.744  3.102  0.958  1.061  0.980  0.875  0.581  0.905  0.796  
1 -0.440  0.638  3.102  0.695  0.909  0.981  0.803  0.813  1.149  1.116  
2 -1.383  1.355  0.000  0.848  0.911  1.043  0.931  1.058  0.744  0.696  
3  0.731  1.424  3.102  1.597  1.282  1.105  0.730  0.148  1.231  1.234  
4 -2.019 -0.289  0.000  0.805  0.930  0.984  1.430  2.198  1.934  1.684  

[5 rows x 29 columns]


In [16]:
print('Start training...')
# train
gbm = lgb.LGBMRegressor(objective='regression',
                        num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)

gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)], #でもsklearnとちょっと異なり、evalsetは必要みたい
        eval_metric='l1', #さっきと変わってl1、一体どういう評価指標なんだろう
        early_stopping_rounds=5)


Start training...
[1]	valid_0's l1: 0.491735
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l1: 0.486563
[3]	valid_0's l1: 0.481489
[4]	valid_0's l1: 0.476848
[5]	valid_0's l1: 0.47305
[6]	valid_0's l1: 0.469049
[7]	valid_0's l1: 0.465556
[8]	valid_0's l1: 0.462208
[9]	valid_0's l1: 0.458676
[10]	valid_0's l1: 0.454998
[11]	valid_0's l1: 0.452047
[12]	valid_0's l1: 0.449158
[13]	valid_0's l1: 0.44608
[14]	valid_0's l1: 0.443554
[15]	valid_0's l1: 0.440643
[16]	valid_0's l1: 0.437687
[17]	valid_0's l1: 0.435454
[18]	valid_0's l1: 0.433288
[19]	valid_0's l1: 0.431297
[20]	valid_0's l1: 0.428946
Did not meet early stopping. Best iteration is:
[20]	valid_0's l1: 0.428946


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.05, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=20,
       n_jobs=-1, num_leaves=31, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)

In [17]:
print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

Start predicting...
The rmse of prediction is: 0.4441153344254208


In [18]:
# 変数重要度の確認
# feature importances
print('Feature importances:', list(gbm.feature_importances_))

Feature importances: [23, 7, 0, 33, 5, 56, 9, 1, 1, 21, 2, 5, 1, 19, 9, 6, 1, 10, 4, 10, 0, 31, 61, 4, 48, 102, 52, 79]


自分で`eval_metric=`の関数を作成することもできる。以下に例を示す。

In [19]:
# 自分で定義した評価関数
def rmsle(y_true, y_pred):
    return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

# ちょっと自分でも作ってみる
def mse(y_true, y_pred):
    return 'MSE', mean_squared_error(y_true, y_pred) ** 0.5, True
#最後のFalseはなんだこれ←Trueにすると定義した関数を大きくするように学習が進む

In [20]:
print('Start training with custom eval function...')
# train
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=mse, #metricにl2入れてないのにl2が表示されてるけどどういうこっちゃ #自作関数のときだけはなるっぽい？
        early_stopping_rounds=5)

Start training with custom eval function...
[1]	valid_0's l2: 0.242763	valid_0's MSE: 0.49271
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.237895	valid_0's MSE: 0.487745
[3]	valid_0's l2: 0.233277	valid_0's MSE: 0.482987
[4]	valid_0's l2: 0.22925	valid_0's MSE: 0.4788
[5]	valid_0's l2: 0.226155	valid_0's MSE: 0.475558
[6]	valid_0's l2: 0.222963	valid_0's MSE: 0.472189
Early stopping, best iteration is:
[1]	valid_0's l2: 0.242763	valid_0's MSE: 0.49271


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.05, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=20,
       n_jobs=-1, num_leaves=31, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)

In [21]:
print('Start training with custom eval function...')
# train
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=rmsle, #metricにl2入れてないのにl2が表示されてる #自作関数のときだけはなるっぽい？
        early_stopping_rounds=5)

Start training with custom eval function...
[1]	valid_0's l2: 0.242763	valid_0's RMSLE: 0.344957
Training until validation scores don't improve for 5 rounds.
[2]	valid_0's l2: 0.237895	valid_0's RMSLE: 0.341693
[3]	valid_0's l2: 0.233277	valid_0's RMSLE: 0.338462
[4]	valid_0's l2: 0.22925	valid_0's RMSLE: 0.335656
[5]	valid_0's l2: 0.226155	valid_0's RMSLE: 0.333431
[6]	valid_0's l2: 0.222963	valid_0's RMSLE: 0.331104
[7]	valid_0's l2: 0.220364	valid_0's RMSLE: 0.329193
[8]	valid_0's l2: 0.217872	valid_0's RMSLE: 0.327337
[9]	valid_0's l2: 0.215328	valid_0's RMSLE: 0.325433
[10]	valid_0's l2: 0.212743	valid_0's RMSLE: 0.323523
[11]	valid_0's l2: 0.210805	valid_0's RMSLE: 0.321986
[12]	valid_0's l2: 0.208945	valid_0's RMSLE: 0.320523
[13]	valid_0's l2: 0.206986	valid_0's RMSLE: 0.319027
[14]	valid_0's l2: 0.205513	valid_0's RMSLE: 0.317796
[15]	valid_0's l2: 0.203728	valid_0's RMSLE: 0.316383
[16]	valid_0's l2: 0.201865	valid_0's RMSLE: 0.314827
[17]	valid_0's l2: 0.200639	valid_0's RMS

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.05, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=20,
       n_jobs=-1, num_leaves=31, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)

In [22]:
print('Start predicting...')
# predict
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# eval
print('The rmsle of prediction is:', rmsle(y_test, y_pred)[1])

Start predicting...
The rmsle of prediction is: 0.3110323289863277


sklearnのgrid searchを用いた例

In [23]:
# other scikit-learn modules
estimator = lgb.LGBMRegressor(num_leaves=31)

param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}

gbm = GridSearchCV(estimator, param_grid)

gbm.fit(X_train, y_train)

print('Best parameters found by grid search are:', gbm.best_params_)

Best parameters found by grid search are: {'learning_rate': 0.1, 'n_estimators': 40}


### advanced_example.py
更に詳細にできることを見ていこう。
ここでは以下の例を示している
- 特徴量の名前をセットする
- one-hotにすることなくカテゴリカルな特徴を扱う
- jsonにモデルを出力する。
- 変数重要度を得る
- Get feature names
- 予測のためにモデルを読み込む
- Dump and load model with pickle
- Load model file to continue training
- 学習中に学習率を変更する
- 学習中にパラメーターを変更する
- Self-defined objective function
- Self-defined eval metric
- Callback function

In [24]:
import json
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

try:
    #python3ではcPickleが存在しなく、代わりに_pickleが同様の役割をする
    # ちなみにこれはc言語で実装されたpickleのことらしい
    import _pickle as pickle
except BaseException:
    import pickle

In [25]:
# load or create your dataset
print('Load data...')
df_train = pd.read_csv('./examples/binary_classification/binary.train', header=None, sep='\t')
df_test = pd.read_csv('./examples/binary_classification/binary.test', header=None, sep='\t')
W_train = pd.read_csv('./examples/binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('./examples/binary_classification/binary.test.weight', header=None)[0]

Load data...


In [26]:
for i in range(len(df_train.columns)):
    print("feature",i,": ",len(pd.unique(df_train.iloc[:,i])))
df_test.describe()
# 9, 13,17,21,26?らへんはカテゴリカルっぽい

feature 0 :  2
feature 1 :  1920
feature 2 :  3295
feature 3 :  2923
feature 4 :  2116
feature 5 :  3028
feature 6 :  1721
feature 7 :  3165
feature 8 :  2928
feature 9 :  3
feature 10 :  1761
feature 11 :  3239
feature 12 :  2973
feature 13 :  3
feature 14 :  1786
feature 15 :  3239
feature 16 :  2963
feature 17 :  3
feature 18 :  1708
feature 19 :  3222
feature 20 :  2967
feature 21 :  3
feature 22 :  1633
feature 23 :  1281
feature 24 :  660
feature 25 :  1436
feature 26 :  1866
feature 27 :  1412
feature 28 :  1254


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,...,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,0.544,1.040106,0.020148,0.021794,1.0024,0.00464,0.992614,-0.04589,-0.006802,1.062656,...,0.040108,-0.081236,1.060884,1.09247,1.046386,1.051016,1.026018,1.008524,1.06204,0.978822
std,0.498559,0.603834,0.982632,1.029083,0.607242,1.003319,0.487095,0.972654,1.00956,1.021128,...,1.004528,1.00887,1.423061,0.811289,0.461281,0.155523,0.40242,0.574999,0.430023,0.346463
min,0.0,0.279,-2.258,-1.715,0.043,-1.735,0.177,-2.823,-1.738,0.0,...,-2.467,-1.738,0.0,0.202,0.472,0.85,0.482,0.115,0.417,0.445
25%,0.0,0.6115,-0.6895,-0.887,0.552,-0.85075,0.68375,-0.73125,-0.847,0.0,...,-0.651,-0.96275,0.0,0.8045,0.838,0.985,0.77425,0.684,0.826,0.78075
50%,1.0,0.8905,0.0385,0.0225,0.888,0.065,0.894,-0.0345,-0.035,1.087,...,-0.0225,-0.094,0.0,0.8885,0.946,0.99,0.9305,0.8895,0.9635,0.8835
75%,1.0,1.31725,0.706,0.91625,1.3335,0.89225,1.14225,0.6385,0.84075,2.173,...,0.7905,0.72675,3.102,1.03575,1.087,1.02225,1.1645,1.1545,1.1735,1.079
max,1.0,4.357,2.292,1.743,4.34,1.743,3.446,2.89,1.739,2.173,...,2.489,1.728,3.102,7.531,5.52,2.493,3.931,5.59,4.492,3.676


In [27]:
y_train = df_train[0].values
y_test = df_test[0].values
X_train = df_train.drop(0, axis=1).values
X_test = df_test.drop(0, axis=1).values

num_train, num_feature = X_train.shape

In [28]:
for i in range(X_train.shape[1]):
    print("feature",i,":",len(pd.unique(X_train[:,i])))

feature 0 : 1920
feature 1 : 3295
feature 2 : 2923
feature 3 : 2116
feature 4 : 3028
feature 5 : 1721
feature 6 : 3165
feature 7 : 2928
feature 8 : 3
feature 9 : 1761
feature 10 : 3239
feature 11 : 2973
feature 12 : 3
feature 13 : 1786
feature 14 : 3239
feature 15 : 2963
feature 16 : 3
feature 17 : 1708
feature 18 : 3222
feature 19 : 2967
feature 20 : 3
feature 21 : 1633
feature 22 : 1281
feature 23 : 660
feature 24 : 1436
feature 25 : 1866
feature 26 : 1412
feature 27 : 1254


In [29]:
# create dataset for lightgbm
# if you want to re-use data, remember to set free_raw_data=False
# とのことだが再利用するとはどういうことだろう...
lgb_train = lgb.Dataset(X_train, y_train,
                        weight=W_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,
                       weight=W_test, free_raw_data=False)

In [30]:
# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary', #二値判別
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [31]:
# generate a feature name
feature_name = ['feature_' + str(col) for col in range(num_feature)]
feature_name[4]

'feature_4'

In [32]:
print('Start training...')
# feature_name and categorical_feature
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,  # eval training data #トレーニングデータで評価を行いながら学習する
                feature_name=feature_name, #ここで特徴名を入れる
                categorical_feature=[20]) #feature21はカテゴリカルじゃないと思うんだが...(feature20はカテゴリカル)
#勝手に20に修正しておいた
# check feature name
print('Finish first 10 rounds...')
print('7th feature name is:', repr(lgb_train.feature_name[6]))

Start training...
[1]	training's binary_logloss: 0.6808
[2]	training's binary_logloss: 0.672433
[3]	training's binary_logloss: 0.664801
[4]	training's binary_logloss: 0.655987
[5]	training's binary_logloss: 0.647645
[6]	training's binary_logloss: 0.640706
[7]	training's binary_logloss: 0.634695
[8]	training's binary_logloss: 0.627758
[9]	training's binary_logloss: 0.621304
[10]	training's binary_logloss: 0.615334
Finish first 10 rounds...
7th feature name is: 'feature_6'




モデルの保存

In [33]:
# save model to file
gbm.save_model('model.txt')

# dump model to json (and save to file)
print('Dump model to JSON...')
model_json = gbm.dump_model()

with open('model.json', 'w+') as f:
    json.dump(model_json, f, indent=4)


Dump model to JSON...


In [34]:
# feature names
print('Feature names:', gbm.feature_name())

Feature names: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27']


変数重要度

In [35]:
# feature importances
print('Feature importances:', list(gbm.feature_importance()))

Feature importances: [16, 3, 4, 19, 1, 26, 3, 0, 2, 15, 1, 2, 0, 10, 2, 3, 0, 2, 1, 3, 0, 9, 33, 3, 36, 41, 30, 35]


学習済みのモデルを読み込んで、推論させる。

In [36]:
bst = lgb.Booster(model_file='model.txt')
# can only predict with the best iteration (or the saving iteration)
y_pred = bst.predict(X_test)
# eval with loaded model
print('The rmse of loaded model\'s prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

The rmse of loaded model's prediction is: 0.46433153299922175


もちろんpickleでもモデルを保存できるが、当たり前過ぎるので省略(他人がつくったpickleを開くのはセキュリティ的にも心配だし)

途中まで訓練したモデルを追加で訓練する。

In [37]:
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model='model.txt',#init_modelを指定してあげればok
                valid_sets=lgb_eval)
print('Finish 10 - 20 rounds with model file...')

[11]	valid_0's binary_logloss: 0.618069
[12]	valid_0's binary_logloss: 0.614416
[13]	valid_0's binary_logloss: 0.610315
[14]	valid_0's binary_logloss: 0.607474
[15]	valid_0's binary_logloss: 0.603242
[16]	valid_0's binary_logloss: 0.599735
[17]	valid_0's binary_logloss: 0.596481
[18]	valid_0's binary_logloss: 0.593245
[19]	valid_0's binary_logloss: 0.59029
[20]	valid_0's binary_logloss: 0.587607
Finish 10 - 20 rounds with model file...




学習率を小さくする

In [38]:
# decay learning rates
# learning_rates accepts:
# 1. list/tuple with length = num_boost_round
# 2. function(curr_iter)
# 学習率をラムダ式（じゃなくてもいいはずだよね）で書くことで学習率をどんどん減衰させて行くことができる。
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                learning_rates=lambda iter: 0.05 * (0.99 ** iter),
                valid_sets=lgb_eval)

print('Finish 20 - 30 rounds with decay learning rates...')



[21]	valid_0's binary_logloss: 0.618069
[22]	valid_0's binary_logloss: 0.614451
[23]	valid_0's binary_logloss: 0.610428
[24]	valid_0's binary_logloss: 0.607663
[25]	valid_0's binary_logloss: 0.603582
[26]	valid_0's binary_logloss: 0.600224
[27]	valid_0's binary_logloss: 0.597179
[28]	valid_0's binary_logloss: 0.594054
[29]	valid_0's binary_logloss: 0.591174
[30]	valid_0's binary_logloss: 0.58866
Finish 20 - 30 rounds with decay learning rates...


In [39]:
# change other parameters during training
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm,
                valid_sets=lgb_eval,
                callbacks=[lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5)])

print('Finish 30 - 40 rounds with changing bagging_fraction...')
#bagging_fractionを30ラウンドから変更しているっぽいが...ありがたみがよくわからんね



[31]	valid_0's binary_logloss: 0.618616
[32]	valid_0's binary_logloss: 0.615173
[33]	valid_0's binary_logloss: 0.611609
[34]	valid_0's binary_logloss: 0.607951
[35]	valid_0's binary_logloss: 0.60503
[36]	valid_0's binary_logloss: 0.602717
[37]	valid_0's binary_logloss: 0.599952
[38]	valid_0's binary_logloss: 0.596947
[39]	valid_0's binary_logloss: 0.594915
[40]	valid_0's binary_logloss: 0.592395
Finish 30 - 40 rounds with changing bagging_fraction...


自分で定義した目的関数を用いて学習させる

In [40]:
# 目的関数
# self-defined objective function
# f(preds: array, train_data: Dataset) -> grad: array, hess: array
# log likelihood loss
def loglikelood(preds, train_data):
    labels = train_data.get_label()
    preds = 1. / (1. + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1. - preds)
    return grad, hess
#勾配も返さないといけないっぽい？

# 評価関数
# self-defined eval metric
# f(preds: array, train_data: Dataset) -> name: string, eval_result: float, is_higher_better: bool
# binary error
def binary_error(preds, train_data):
    labels = train_data.get_label()
    return 'error', np.mean(labels != (preds > 0.5)), False

In [41]:
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                init_model=gbm, #前のモデルから10ラウンド分回す
                fobj=loglikelood, #目的関数をセット
                feval=binary_error, #評価関数をセット
                valid_sets=lgb_eval)

print('Finish 40 - 50 rounds with self-defined objective function and eval metric...')



[41]	valid_0's binary_logloss: 4.789	valid_0's error: 0.408
[42]	valid_0's binary_logloss: 4.83297	valid_0's error: 0.398
[43]	valid_0's binary_logloss: 4.76209	valid_0's error: 0.39
[44]	valid_0's binary_logloss: 4.7494	valid_0's error: 0.376
[45]	valid_0's binary_logloss: 4.84787	valid_0's error: 0.376
[46]	valid_0's binary_logloss: 5.03903	valid_0's error: 0.376
[47]	valid_0's binary_logloss: 4.77534	valid_0's error: 0.37
[48]	valid_0's binary_logloss: 5.12551	valid_0's error: 0.362
[49]	valid_0's binary_logloss: 5.06038	valid_0's error: 0.368
[50]	valid_0's binary_logloss: 5.04804	valid_0's error: 0.364
Finish 40 - 50 rounds with self-defined objective function and eval metric...




つぎで何やってるかまじでわからんぞ

途中で、valid dataも考慮するようにしているっぽい

In [42]:
print('Start a new training job...')


# callback
def reset_metrics():
    def callback(env): #クラスっぽい書き方だけど…envから継承してるみたいな...
        lgb_eval_new = lgb.Dataset(X_test, y_test, reference=lgb_train)
        if env.iteration - env.begin_iteration == 5:
            print('Add a new valid dataset at iteration 5...')
            env.model.add_valid(lgb_eval_new, 'new valid')
    callback.before_iteration = True # 宣言した関数.attributeに代入するとそれを扱うことができるらしい
    callback.order = 0
    return callback


gbm = lgb.train(params,
                lgb_train,
                num_boost_round=10,
                valid_sets=lgb_train,
                callbacks=[reset_metrics()])

print('Finish first 10 rounds with callback function...')



Start a new training job...
[1]	training's binary_logloss: 0.610155
[2]	training's binary_logloss: 0.605968
[3]	training's binary_logloss: 0.601972
[4]	training's binary_logloss: 0.59773
[5]	training's binary_logloss: 0.593409
Add a new valid dataset at iteration 5...
[6]	training's binary_logloss: 0.589581	new valid's binary_logloss: 0.663248
[7]	training's binary_logloss: 0.586011	new valid's binary_logloss: 0.659375
[8]	training's binary_logloss: 0.582538	new valid's binary_logloss: 0.655759
[9]	training's binary_logloss: 0.578805	new valid's binary_logloss: 0.652358
[10]	training's binary_logloss: 0.575115	new valid's binary_logloss: 0.64847
Finish first 10 rounds with callback function...


In [43]:
reset_metrics.hoge=12

In [44]:
reset_metrics.hoge
#こんなことできたのかpython

12