In [33]:
import lightgbm as lgb
from sklearn import datasets
from sklearn.model_selection import train_test_split
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np

In [2]:
# Prepare the data: the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# Feature matrix with all four measurement columns. The data is used as-is
# (no column selection needed), and the columns keep their default integer labels.
X = pd.DataFrame(iris.data)
# Class labels, one integer per sample.
Y = pd.Series(iris.target)

In [3]:
# Hold-out split: 30 % of the rows become the test set. Rows are shuffled
# first, and the seed is fixed so the split is the same on every run.
train_X, test_X, train_Y, test_Y = train_test_split(
    X,
    Y,
    test_size=0.3,
    shuffle=True,
    random_state=0,
)

In [4]:
# Wrap both splits as LightGBM datasets. The test set is given the training
# set as its reference.
lgb_train = lgb.Dataset(data=train_X, label=train_Y)
lgb_test = lgb.Dataset(data=test_X, label=test_Y, reference=lgb_train)

In [5]:
# LightGBM hyperparameters.
# The number of boosting rounds is deliberately NOT set here: lgb.train()
# receives it as num_boost_round, and an alias such as 'num_iteration' in this
# dict would silently take precedence over that argument.
params = {
    'boosting_type': 'gbdt',      # gradient-boosted decision trees
    'objective': 'multiclass',    # multi-class classification
    'metric': 'multi_logloss',    # evaluation metric: multi-class log loss
    'num_class': 3,               # iris has three classes
    'learning_rate': 0.1,         # shrinkage applied to each tree
    'num_leaves': 21,             # maximum number of leaves per tree
    'min_data_in_leaf': 3,        # minimum number of samples in a leaf
}

In [6]:
# Train the booster while tracking the loss on both splits.
# Early stopping, history recording and per-round logging are passed as
# callbacks: the early_stopping_rounds= / evals_result= keyword arguments of
# lgb.train() were removed in LightGBM 4.0. (lgb.log_evaluation needs >= 3.3.)
lgb_results = {}  # filled in place: {dataset name: {metric name: [value per round]}}
model = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=[lgb_train, lgb_test],   # evaluate on both the training and the test split
    valid_names=['Train', 'Test'],      # names used as keys in lgb_results and in the log
    num_boost_round=100,                # upper bound on the number of trees
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),  # stop after 10 rounds without improvement
        lgb.record_evaluation(lgb_results),      # store the per-round metric history
        lgb.log_evaluation(period=1),            # print the metrics every round
    ],
)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 85
[LightGBM] [Info] Number of data points in the train set: 105, number of used features: 4
[LightGBM] [Info] Start training from score -1.127600
[LightGBM] [Info] Start training from score -1.188224
[LightGBM] [Info] Start training from score -0.990399
[1]	Train's multi_logloss: 0.913561	Test's multi_logloss: 0.934514
Training until validation scores don't improve for 10 rounds
[2]	Train's multi_logloss: 0.772834	Test's multi_logloss: 0.793098
[3]	Train's multi_logloss: 0.660348	Test's multi_logloss: 0.680805
[4]	Train's multi_logloss: 0.568621	Test's multi_logloss: 0.589843
[5]	Train's multi_logloss: 0.492768	Test's multi_logloss: 0.515155
[6]	Train's multi_logloss: 0.429386	Test's multi_logloss: 0.453238
[7]	Train's multi_logloss: 0.376003	Test's multi_logloss: 0.401553
[8]	Train's multi_logloss: 0.330755	Test's multi_logloss: 0.358195
[9]	Train's multi_logloss: 0.292209	Test's multi_logloss: 0.3



In [7]:
# Per-round loss history recorded during training: first the training loss,
# then the loss on the held-out test split.
loss_train, loss_test = (
    lgb_results[split]['multi_logloss'] for split in ('Train', 'Test')
)

# Boosting round at which the best model was obtained.
best_iteration = model.best_iteration
print(best_iteration)

22


In [39]:
# Evaluate the model on the held-out test split.
display(test_X.shape)

# Predict with the trees up to the best iteration found by early stopping.
# The result has one row per sample and one column per class; argmax over the
# columns picks the predicted class label.
y_proba = model.predict(test_X, num_iteration=model.best_iteration)
y_pred = np.argmax(y_proba, axis=1)
display(y_pred.shape)
display(test_Y.shape)

# scikit-learn metrics take (y_true, y_pred), in that order.
accuracy = accuracy_score(test_Y, y_pred)
print(f'accuracy: {accuracy}')
cm = confusion_matrix(test_Y, y_pred)  # rows: true class, columns: predicted class
print(cm)

(45, 4)

(45,)

(45,)

accuracy: 0.9555555555555556
[[16  0  0]
 [ 0 17  1]
 [ 0  1 10]]
