In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import lightgbm as lgb
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Load the dataset from CSV into a pandas DataFrame.
df = pd.read_csv('data1719_dfr.csv')

# Target labels: the 'Trait' column.
y = df['Trait']
# Feature matrix: every column except the target.
df_X = df.drop(columns=['Trait'])

# Hold out 20% of the rows as the test set, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    df_X, y, test_size=0.2, random_state=0, stratify=y
)

# Split the remaining rows once more (80/20, stratified) into the data used
# for fitting and the validation data monitored during training.
# Note: X_train / y_train are rebound here to the smaller fitting subset.
X_train, X_eval, y_train, y_eval = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1, stratify=y_train
)


# Wrap the splits in LightGBM Dataset objects.
# free_raw_data=False is passed to both (per the LightGBM docs this keeps the
# raw data available after the internal Dataset has been constructed).

# Training set
lgb_train = lgb.Dataset(data=X_train, label=y_train, free_raw_data=False)

# Validation set, tied to the training Dataset through `reference`
lgb_eval = lgb.Dataset(
    data=X_eval,
    label=y_eval,
    reference=lgb_train,
    free_raw_data=False,
)

# LightGBM training parameters
params = {
    'task': 'train',               # training mode (as opposed to predict)
    'boosting_type': 'gbdt',       # gradient boosting decision trees
    'objective': 'multiclass',     # objective: multi-class classification
    'metric': 'multi_logloss',     # metric used to evaluate the classifier
    'num_class': 3,                # number of classes in the target
    'learning_rate': 0.02,         # learning rate (default 0.1)
    'num_leaves': 23,              # controls tree complexity (default 31)
    'min_data_in_leaf': 1,         # minimum number of samples per leaf (default 20)
}

# Train the model.
# The `evals_result=`, `early_stopping_rounds=` and `verbose_eval=` keyword
# arguments were deprecated in LightGBM 3.3 and removed from lgb.train() in
# 4.0, so the equivalent callbacks are used instead (requires LightGBM >= 3.3
# for lgb.log_evaluation).
evaluation_results = {}  # populated by record_evaluation with the per-round metrics
model = lgb.train(
    params,                                   # parameters defined above
    lgb_train,                                # training Dataset
    num_boost_round=1000,                     # upper bound on boosting rounds
    valid_sets=[lgb_train, lgb_eval],         # datasets evaluated every round
    valid_names=['train', 'valid'],           # labels used for them in the log
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),     # stop after 20 rounds without improvement
        lgb.log_evaluation(period=10),              # print the metrics every 10 rounds
        lgb.record_evaluation(evaluation_results),  # keep the training history
    ],
)

# Remember the round that achieved the best validation score
optimum_boost_rounds = model.best_iteration

# Predict on the held-out test set, using the best boosting round.
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
# The predicted class is the column index with the largest score in each row.
y_pred_max = np.argmax(y_pred, axis=1)

# Accuracy = fraction of test rows whose predicted class equals the label.
# Computed as a vectorised NumPy comparison instead of Python's builtin sum(),
# which iterates over the pandas Series element by element.
# NOTE(review): this assumes the 'Trait' labels are the integers
# 0..num_class-1, so they are directly comparable to argmax indices - confirm.
accuracy = (y_test.to_numpy() == y_pred_max).mean()
print('accuracy:', accuracy)

# Show the feature importance (LightGBM's default importance type),
# one row per feature column, in the original column order.
importance = pd.DataFrame(
    {'importance': model.feature_importance()},
    index=df_X.columns,
)
display(importance)

Training until validation scores don't improve for 20 rounds
[10]	train's multi_logloss: 0.631532	valid's multi_logloss: 0.648258
[20]	train's multi_logloss: 0.594258	valid's multi_logloss: 0.62394
[30]	train's multi_logloss: 0.567324	valid's multi_logloss: 0.607003
[40]	train's multi_logloss: 0.547697	valid's multi_logloss: 0.596615
[50]	train's multi_logloss: 0.533072	valid's multi_logloss: 0.590565
[60]	train's multi_logloss: 0.522022	valid's multi_logloss: 0.587101
[70]	train's multi_logloss: 0.513585	valid's multi_logloss: 0.585476
[80]	train's multi_logloss: 0.507024	valid's multi_logloss: 0.585039
[90]	train's multi_logloss: 0.502036	valid's multi_logloss: 0.584566
[100]	train's multi_logloss: 0.498219	valid's multi_logloss: 0.58625
[110]	train's multi_logloss: 0.495268	valid's multi_logloss: 0.58882
Early stopping, best iteration is:
[90]	train's multi_logloss: 0.502036	valid's multi_logloss: 0.584566
accuracy: 0.6591928251121076


Unnamed: 0,importance
item1_1_1_a,0
item1_1_2_a,0
item1_1_3_a,0
item1_1_4_a,0
item1_3_1_a,0
...,...
item35_5_2_a,0
item35_5_3_a,38
item35_7_1_a,28
item35_7_2_a,92


In [5]:
# Show the 'split' feature importance, largest values first.
importance = pd.DataFrame(
    {'importance': model.feature_importance(importance_type='split')},
    index=df_X.columns,
).sort_values(by='importance', ascending=False)
display(importance)

Unnamed: 0,importance
item2_3_3_a,202
item18_3_3_a,180
item6_7_3_a,170
item10_3_2_a,162
item1_7_2_a,126
...,...
item15_3_2_a,0
item15_1_4_a,0
item15_1_3_a,0
item15_1_2_a,0


In [6]:
# Show the 'gain' feature importance, largest values first.
importance = pd.DataFrame(
    {'importance': model.feature_importance(importance_type='gain')},
    index=df_X.columns,
).sort_values(by='importance', ascending=False)
display(importance)

Unnamed: 0,importance
item18_3_3_a,1293.632110
item6_7_3_a,784.494720
item14_3_3_a,576.928788
item1_7_2_a,364.332579
item10_3_2_a,339.957663
...,...
item15_7_3_a,0.000000
item15_5_4_a,0.000000
item15_5_1_a,0.000000
item15_3_4_a,0.000000
