In [1]:
import os
os.chdir('/pc_win_loss')

In [12]:
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoost, CatBoostClassifier, Pool
from sklearn.ensemble import StackingClassifier

from functions.visualizer import *
from src.config import *

# データ取り込み

In [3]:
df_prep = pd.read_pickle('output/df_prep.pkl')
df_prep.head()

Unnamed: 0,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,blueTotalGold,blueTotalExperience,blueWins
0,0,5,8,6,0,0,14536,17256,0
1,1,10,1,5,0,0,14536,17863,0
2,0,3,10,2,0,0,17409,17256,0
3,1,7,10,8,0,0,19558,18201,0
4,0,4,9,4,0,0,17409,17256,0


In [4]:
df_X = df_prep.drop(COL_BLUEWINS, axis=1)
y = df_prep[COL_BLUEWINS]

# モデル構築

## 関数定義

### LightGBM

In [5]:
'''def lgbm_train_cv(X_train_cv, y_train_cv, X_eval_cv, y_eval_cv):
    # データを格納する
    # 学習用
    lgb_train = lgb.Dataset(X_train_cv, y_train_cv,
                            free_raw_data=False)
    # 検証用
    lgb_eval = lgb.Dataset(X_eval_cv, y_eval_cv, reference=lgb_train,
                           free_raw_data=False)

    # 学習
    evaluation_results = {}
    model = lgb.LGBMClassifier()
    model.fit(X_train_cv, y_train_cv)
    
    # 検証用データで予測
    y_pred = model.predict(X_eval_cv)
    # y_pred_max = np.argmax(y_pred, axis=1)

    # Accuracy の計算
    # accuracy = accuracy_score(y_eval_cv, y_pred_max)
    accuracy = accuracy_score(y_eval_cv, y_pred)
    print('LightGBM Accuracy:', accuracy)
    
    return(model, accuracy, y_pred)'''

"def lgbm_train_cv(X_train_cv, y_train_cv, X_eval_cv, y_eval_cv):\n    # データを格納する\n    # 学習用\n    lgb_train = lgb.Dataset(X_train_cv, y_train_cv,\n                            free_raw_data=False)\n    # 検証用\n    lgb_eval = lgb.Dataset(X_eval_cv, y_eval_cv, reference=lgb_train,\n                           free_raw_data=False)\n\n    # 学習\n    evaluation_results = {}\n    model = lgb.LGBMClassifier()\n    model.fit(X_train_cv, y_train_cv)\n    \n    # 検証用データで予測\n    y_pred = model.predict(X_eval_cv)\n    # y_pred_max = np.argmax(y_pred, axis=1)\n\n    # Accuracy の計算\n    # accuracy = accuracy_score(y_eval_cv, y_pred_max)\n    accuracy = accuracy_score(y_eval_cv, y_pred)\n    print('LightGBM Accuracy:', accuracy)\n    \n    return(model, accuracy, y_pred)"

In [6]:
def lgbm_train_cv(X_train_cv, y_train_cv, X_eval_cv, y_eval_cv):
    # データを格納する
    # 学習用
    lgb_train = lgb.Dataset(X_train_cv, y_train_cv,
                            free_raw_data=False)
    # 検証用
    lgb_eval = lgb.Dataset(X_eval_cv, y_eval_cv, reference=lgb_train,
                           free_raw_data=False)
    
    # パラメータを設定
    params = {'task': 'train',                # レーニング ⇔　予測predict
              'boosting_type': 'gbdt',        # 勾配ブースティング
              'objective': 'multiclass',      # 目的関数：多値分類、マルチクラス分類
              'metric': 'multi_logloss',      # 検証用データセットで、分類モデルの性能を測る指標
              'num_class': 2,                 # 目的変数のクラス数
              'learning_rate': 0.1,           # 学習率（初期値0.1）
              'num_leaves': 23,               # 決定木の複雑度を調整（初期値31）
              'min_data_in_leaf': 1,          # データの最小数（初期値20）
             }

    # 学習
    evaluation_results = {}                                # 学習の経過を保存する箱
    model = lgb.train(params,                              # 上記で設定したパラメータ
                      lgb_train,                           # 使用するデータセット
                      num_boost_round=200,                 # 学習の回数
                      valid_names=['train', 'valid'],      # 学習経過で表示する名称
                      valid_sets=[lgb_train, lgb_eval],    # モデルの検証に使用するデータセット
                      evals_result=evaluation_results,     # 学習の経過を保存
                      early_stopping_rounds=10,            # アーリーストッピングの回数
                      verbose_eval=0)                      # 学習の経過を表示する刻み（非表示）

    # 検証用データで予測
    y_pred = model.predict(X_eval_cv, num_iteration=model.best_iteration)
    y_pred_max = np.argmax(y_pred, axis=1)

    # Accuracy の計算
    accuracy = accuracy_score(y_eval_cv, y_pred_max)
    print('LightGBM Accuracy:', accuracy)
    
    return(model, accuracy, y_pred)

### XGBoost

In [7]:
'''def xgb_train_cv(X_train_cv, y_train_cv, X_eval_cv, y_eval_cv, loop_counts):
    # データを格納する
    # 学習用
    xgb_train = xgb.DMatrix(X_train_cv, label=y_train_cv)
    # 検証用
    xgb_eval = xgb.DMatrix(X_eval_cv, label=y_eval_cv)
    # テスト用
    #xgb_test = xgb.DMatrix(X_test, label=y_test)

    xgb_params = {
        'learning_rate': 0.05,           # 学習率
        'max_depth': 6, 
        'min_child_weight': 9, 
        'n_estimators': 200
    }

    # 学習
    evals = [(xgb_train, 'train'), (xgb_eval, 'eval')] # 学習に用いる検証用データ
    evaluation_results = {}                            # 学習の経過を保存する箱
    bst = xgb.XGBClassifier(
                        learning_rate=0.05, 
                        max_depth=6, 
                        min_child_weight=9, 
                        n_estimators=200
                         )
    bst.fit(X_train_cv, y_train_cv)
    
    # 検証用データで予測
    y_pred = bst.predict(X_eval_cv)

    print('Trial: ' + str(loop_counts))
    
    # Accuracy の計算
    accuracy = accuracy_score(y_eval_cv, y_pred)
    print('XGBoost Accuracy:', accuracy)
    
    return(bst, accuracy, y_pred)'''

"def xgb_train_cv(X_train_cv, y_train_cv, X_eval_cv, y_eval_cv, loop_counts):\n    # データを格納する\n    # 学習用\n    xgb_train = xgb.DMatrix(X_train_cv, label=y_train_cv)\n    # 検証用\n    xgb_eval = xgb.DMatrix(X_eval_cv, label=y_eval_cv)\n    # テスト用\n    #xgb_test = xgb.DMatrix(X_test, label=y_test)\n\n    xgb_params = {\n        'learning_rate': 0.05,           # 学習率\n        'max_depth': 6, \n        'min_child_weight': 9, \n        'n_estimators': 200\n    }\n\n    # 学習\n    evals = [(xgb_train, 'train'), (xgb_eval, 'eval')] # 学習に用いる検証用データ\n    evaluation_results = {}                            # 学習の経過を保存する箱\n    bst = xgb.XGBClassifier(\n                        learning_rate=0.05, \n                        max_depth=6, \n                        min_child_weight=9, \n                        n_estimators=200\n                         )\n    bst.fit(X_train_cv, y_train_cv)\n    \n    # 検証用データで予測\n    y_pred = bst.predict(X_eval_cv)\n\n    print('Trial: ' + str(loop_counts))\n    \n    # Accu

In [8]:
def xgb_train_cv(X_train_cv, y_train_cv, X_eval_cv, y_eval_cv, loop_counts):
    # データを格納する
    # 学習用
    xgb_train = xgb.DMatrix(X_train_cv, label=y_train_cv)
    # 検証用
    xgb_eval = xgb.DMatrix(X_eval_cv, label=y_eval_cv)
    # テスト用
    #xgb_test = xgb.DMatrix(X_test, label=y_test)

    xgb_params = {
        'objective': 'multi:softprob',  # 多値分類問題
        'num_class': 2,                 # 目的変数のクラス数
        'learning_rate': 0.1,           # 学習率
        'eval_metric': 'mlogloss'       # 学習用の指標 (Multiclass logloss)
    }

    # 学習
    evals = [(xgb_train, 'train'), (xgb_eval, 'eval')] # 学習に用いる検証用データ
    evaluation_results = {}                            # 学習の経過を保存する箱
    bst = xgb.train(xgb_params,                        # 上記で設定したパラメータ
                    xgb_train,                         # 使用するデータセット
                    num_boost_round=200,               # 学習の回数
                    early_stopping_rounds=10,          # アーリーストッピング
                    evals=evals,                       # 学習経過で表示する名称
                    evals_result=evaluation_results,   # 上記で設定した検証用データ
                    verbose_eval=0                     # 学習の経過の表示(非表示)
                    )
    
    # 検証用データで予測
    y_pred = bst.predict(xgb_eval, ntree_limit=bst.best_ntree_limit)
    y_pred_max = np.argmax(y_pred, axis=1)

    print('Trial: ' + str(loop_counts))
    
    # Accuracy の計算
    accuracy = accuracy_score(y_eval_cv, y_pred_max)
    print('XGBoost Accuracy:', accuracy)
    
    return(bst, accuracy, y_pred)

### CαtBoost

In [9]:
'''def catboost_train_cv(X_train_cv, y_train_cv, X_eval_cv, y_eval_cv):
    # データを格納する
    # 学習用
    CatBoost_train = Pool(X_train_cv, label=y_train_cv)
    # 検証用
    CatBoost_eval = Pool(X_eval_cv, label=y_eval_cv)

    # 学習
    catb = CatBoostClassifier(custom_loss=['Accuracy'])
    catb.fit(X_train_cv, y_train_cv, verbose=False)

    # 検証用データで予測
    y_pred = catb.predict(X_eval_cv)
    # y_pred_max = np.argmax(y_pred, axis=1)

    # Accuracy の計算
    # accuracy = sum(y_eval_cv == y_pred_max) / len(y_eval_cv)
    accuracy = accuracy_score(y_eval_cv, y_pred)
    print('CatBoost Accuracy:', accuracy)
    
    return(catb, accuracy, y_pred)'''

"def catboost_train_cv(X_train_cv, y_train_cv, X_eval_cv, y_eval_cv):\n    # データを格納する\n    # 学習用\n    CatBoost_train = Pool(X_train_cv, label=y_train_cv)\n    # 検証用\n    CatBoost_eval = Pool(X_eval_cv, label=y_eval_cv)\n\n    # 学習\n    catb = CatBoostClassifier(custom_loss=['Accuracy'])\n    catb.fit(X_train_cv, y_train_cv, verbose=False)\n\n    # 検証用データで予測\n    y_pred = catb.predict(X_eval_cv)\n    # y_pred_max = np.argmax(y_pred, axis=1)\n\n    # Accuracy の計算\n    # accuracy = sum(y_eval_cv == y_pred_max) / len(y_eval_cv)\n    accuracy = accuracy_score(y_eval_cv, y_pred)\n    print('CatBoost Accuracy:', accuracy)\n    \n    return(catb, accuracy, y_pred)"

In [10]:
def catboost_train_cv(X_train_cv, y_train_cv, X_eval_cv, y_eval_cv):
    # データを格納する
    # 学習用
    CatBoost_train = Pool(X_train_cv, label=y_train_cv)
    # 検証用
    CatBoost_eval = Pool(X_eval_cv, label=y_eval_cv)

    # パラメータを設定
    params = {        
        'loss_function': 'MultiClass',    # 多値分類問題
        'num_boost_round': 1000,          # 学習の回数
        'early_stopping_rounds': 10       # アーリーストッピングの回数
    }

    # 学習
    catb = CatBoost(params)
    catb.fit(CatBoost_train, eval_set=[CatBoost_eval], verbose=False)

    # 検証用データで予測
    y_pred = catb.predict(X_eval_cv, prediction_type='Probability')
    y_pred_max = np.argmax(y_pred, axis=1)

    # Accuracy の計算
    accuracy = sum(y_eval_cv == y_pred_max) / len(y_eval_cv)
    print('CatBoost Accuracy:', accuracy)
    
    return(catb, accuracy, y_pred)

## アンサンブル学習

In [16]:
# 各5つのモデルを保存するリストの初期化
xgb_models = []
lgbm_models = []
catb_models = []
# 各5つのモデルの正答率を保存するリストの初期化
xgb_accuracies = []
lgbm_accuracies = []
catb_accuracies = []
# 学習のカウンター
loop_counts = 1

# 各クラスの確率（3モデル*5seed*３クラス）
first_probs = pd.DataFrame(np.zeros((len(df_X), 3*5*2)))

# count = 0
for seed_no in range(5): 
#     print('count: {}'.format(str(count)))
#     if count >= 2:
#         print('----------------end----------------')
#         break
        
#     count += 1
    
    # 学習データの数だけの数列（0行から最終行まで連番）
    row_no_list = list(range(len(df_X)))

    # KFoldクラスをインスタンス化（これを使って5分割する）
    K_fold = StratifiedKFold(n_splits=5, shuffle=True,  random_state=42)

    
    # KFoldクラスで分割した回数だけ実行（ここでは5回）
    for train_cv_no, eval_cv_no in K_fold.split(row_no_list, y):        
        # ilocで取り出す行を指定
        X_train_cv = df_X.iloc[train_cv_no, :]
        y_train_cv = pd.Series(y).iloc[train_cv_no]
        X_eval_cv = df_X.iloc[eval_cv_no, :]
        y_eval_cv = pd.Series(y).iloc[eval_cv_no]
        
        # XGBoostの訓練を実行
        bst, bst_accuracy, xgb_prob = xgb_train_cv(X_train_cv, y_train_cv,
                                                   X_eval_cv, y_eval_cv, 
                                                   loop_counts)
        # LIghtGBMの訓練を実行
        model, model_accuracy, lgbm_prob = lgbm_train_cv(X_train_cv, y_train_cv, 
                                                         X_eval_cv, y_eval_cv)
        # CatBoostの訓練を実行
        catb, catb_accuracy, catb_prob = catboost_train_cv(X_train_cv, y_train_cv,
                                                           X_eval_cv, y_eval_cv)
        # 実行回数のカウント
        loop_counts += 1
        
        # 学習が終わったモデルをリストに入れておく
        xgb_models.append(bst) 
        lgbm_models.append(model) 
        catb_models.append(catb) 
        
        # 学習が終わったモデルの正答率をリストに入れておく
        xgb_accuracies.append(bst_accuracy) 
        lgbm_accuracies.append(model_accuracy) 
        catb_accuracies.append(catb_accuracy) 
        
        # 検証データの各クラスの確率
        for i in range(2):
            first_probs.iloc[eval_cv_no, (seed_no * 2) + i] = xgb_prob[:, i]
            first_probs.iloc[eval_cv_no, (seed_no * 2) + 10 + i] = lgbm_prob[:, i]
            first_probs.iloc[eval_cv_no, (seed_no * 2) + 20 + i] = catb_prob[:, i]



Trial: 1
XGBoost Accuracy: 0.77875
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.677028
[LightGBM] [Info] Start training from score -0.709531
LightGBM Accuracy: 0.78
CatBoost Accuracy: 0.780625




Trial: 2
XGBoost Accuracy: 0.798125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.80625
CatBoost Accuracy: 0.80375




Trial: 3
XGBoost Accuracy: 0.778125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.7775
CatBoost Accuracy: 0.775




Trial: 4
XGBoost Accuracy: 0.791875
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.783125
CatBoost Accuracy: 0.789375




Trial: 5
XGBoost Accuracy: 0.778125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.78125
CatBoost Accuracy: 0.7775




Trial: 6
XGBoost Accuracy: 0.77875
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.677028
[LightGBM] [Info] Start training from score -0.709531
LightGBM Accuracy: 0.78
CatBoost Accuracy: 0.780625




Trial: 7
XGBoost Accuracy: 0.798125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.80625
CatBoost Accuracy: 0.80375




Trial: 8
XGBoost Accuracy: 0.778125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.7775
CatBoost Accuracy: 0.775




Trial: 9
XGBoost Accuracy: 0.791875
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.783125
CatBoost Accuracy: 0.789375




Trial: 10
XGBoost Accuracy: 0.778125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.78125
CatBoost Accuracy: 0.7775




Trial: 11
XGBoost Accuracy: 0.77875
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.677028
[LightGBM] [Info] Start training from score -0.709531
LightGBM Accuracy: 0.78
CatBoost Accuracy: 0.780625




Trial: 12
XGBoost Accuracy: 0.798125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.80625
CatBoost Accuracy: 0.80375




Trial: 13
XGBoost Accuracy: 0.778125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.7775
CatBoost Accuracy: 0.775




Trial: 14
XGBoost Accuracy: 0.791875
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.783125
CatBoost Accuracy: 0.789375




Trial: 15
XGBoost Accuracy: 0.778125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.78125
CatBoost Accuracy: 0.7775




Trial: 16
XGBoost Accuracy: 0.77875
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.677028
[LightGBM] [Info] Start training from score -0.709531
LightGBM Accuracy: 0.78
CatBoost Accuracy: 0.780625




Trial: 17
XGBoost Accuracy: 0.798125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.80625
CatBoost Accuracy: 0.80375




Trial: 18
XGBoost Accuracy: 0.778125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.7775
CatBoost Accuracy: 0.775




Trial: 19
XGBoost Accuracy: 0.791875
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.783125
CatBoost Accuracy: 0.789375




Trial: 20
XGBoost Accuracy: 0.778125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.78125
CatBoost Accuracy: 0.7775




Trial: 21
XGBoost Accuracy: 0.77875
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.677028
[LightGBM] [Info] Start training from score -0.709531
LightGBM Accuracy: 0.78
CatBoost Accuracy: 0.780625




Trial: 22
XGBoost Accuracy: 0.798125
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.80625
CatBoost Accuracy: 0.80375




Trial: 23
XGBoost Accuracy: 0.778125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 99
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.7775
CatBoost Accuracy: 0.775




Trial: 24
XGBoost Accuracy: 0.791875
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.783125
CatBoost Accuracy: 0.789375




Trial: 25
XGBoost Accuracy: 0.778125
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 6400, number of used features: 8
[LightGBM] [Info] Start training from score -0.676720
[LightGBM] [Info] Start training from score -0.709848
LightGBM Accuracy: 0.78125
CatBoost Accuracy: 0.7775


In [17]:
first_probs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.580655,0.419345,0.580655,0.419345,0.580655,0.419345,0.580655,0.419345,0.580655,0.419345,...,0.565190,0.434810,0.565190,0.434810,0.565190,0.434810,0.565190,0.434810,0.565190,0.434810
1,0.595935,0.404065,0.595935,0.404065,0.595935,0.404065,0.595935,0.404065,0.595935,0.404065,...,0.436360,0.563640,0.436360,0.563640,0.436360,0.563640,0.436360,0.563640,0.436360,0.563640
2,0.934326,0.065674,0.934326,0.065674,0.934326,0.065674,0.934326,0.065674,0.934326,0.065674,...,0.937096,0.062904,0.937096,0.062904,0.937096,0.062904,0.937096,0.062904,0.937096,0.062904
3,0.370788,0.629212,0.370788,0.629212,0.370788,0.629212,0.370788,0.629212,0.370788,0.629212,...,0.412068,0.587932,0.412068,0.587932,0.412068,0.587932,0.412068,0.587932,0.412068,0.587932
4,0.949020,0.050980,0.949020,0.050980,0.949020,0.050980,0.949020,0.050980,0.949020,0.050980,...,0.937537,0.062463,0.937537,0.062463,0.937537,0.062463,0.937537,0.062463,0.937537,0.062463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,0.063863,0.936137,0.063863,0.936137,0.063863,0.936137,0.063863,0.936137,0.063863,0.936137,...,0.067467,0.932533,0.067467,0.932533,0.067467,0.932533,0.067467,0.932533,0.067467,0.932533
7996,0.691422,0.308578,0.691422,0.308578,0.691422,0.308578,0.691422,0.308578,0.691422,0.308578,...,0.611099,0.388901,0.611099,0.388901,0.611099,0.388901,0.611099,0.388901,0.611099,0.388901
7997,0.076721,0.923279,0.076721,0.923279,0.076721,0.923279,0.076721,0.923279,0.076721,0.923279,...,0.063214,0.936786,0.063214,0.936786,0.063214,0.936786,0.063214,0.936786,0.063214,0.936786
7998,0.120339,0.879661,0.120339,0.879661,0.120339,0.879661,0.120339,0.879661,0.120339,0.879661,...,0.149198,0.850802,0.149198,0.850802,0.149198,0.850802,0.149198,0.850802,0.149198,0.850802


In [18]:
loop_counts = 1

# 学習データとテストデータに分ける
X_train, X_test, y_train, y_test = train_test_split(first_probs, y,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=y)

# 予測結果の格納用のnumpy行列を作成
test_preds = np.zeros((len(y_test), 5))

# 学習データの数だけの数列（0行から最終行まで連番）
row_no_list = list(range(len(y_train)))

# KFoldクラスをインスタンス化（これを使って5分割する）
K_fold = StratifiedKFold(n_splits=5, shuffle=True,  random_state=0)

# KFoldクラスで分割した回数だけ実行（ここでは5回）
for train_cv_no, eval_cv_no in K_fold.split(row_no_list, y_train):
    # ilocで取り出す行を指定
    X_train_cv = X_train.iloc[train_cv_no, :]
    y_train_cv = pd.Series(y_train).iloc[train_cv_no]
    X_eval_cv = X_train.iloc[eval_cv_no, :]
    y_eval_cv = pd.Series(y_train).iloc[eval_cv_no]

    # データを格納する
    # 学習用
    xgb_train = xgb.DMatrix(X_train_cv, label=y_train_cv)
    # 検証用
    xgb_eval = xgb.DMatrix(X_eval_cv, label=y_eval_cv)
    # テスト用
    xgb_test = xgb.DMatrix(X_test, label=y_test)

    xgb_params = {
        'objective': 'multi:softprob',  # 多値分類問題
        'num_class': 2,                 # 目的変数のクラス数
        'learning_rate': 0.1,           # 学習率
        'eval_metric': 'mlogloss'       # 学習用の指標 (Multiclass logloss)
    }

    # 学習
    evals = [(xgb_train, 'train'), (xgb_eval, 'eval')] # 学習に用いる検証用データ
    evaluation_results = {}                            # 学習の経過を保存する箱
    bst = xgb.train(xgb_params,                        # 上記で設定したパラメータ
                    xgb_train,                         # 使用するデータセット
                    num_boost_round=200,               # 学習の回数
                    early_stopping_rounds=10,          # アーリーストッピング
                    evals=evals,                       # 学習経過で表示する名称
                    evals_result=evaluation_results,   # 上記で設定した検証用データ
                    verbose_eval=0                     # 学習の経過の表示(非表示)
                    )


    y_pred = bst.predict(xgb_test, ntree_limit=bst.best_ntree_limit)
    y_pred_max = np.argmax(y_pred, axis=1)
    
    # testの予測を保存
    test_preds[:, loop_counts] = y_pred_max
 
    print('Trial: ' + str(loop_counts))
    loop_counts += 1
    acc = accuracy_score(y_test, y_pred_max)
    print('Accuracy:', acc)



Trial: 0
Accuracy: 0.78875




Trial: 1
Accuracy: 0.79125




Trial: 2
Accuracy: 0.790625




Trial: 3
Accuracy: 0.789375
Trial: 4
Accuracy: 0.7925




In [19]:
# 予測したクラスのデータをpandas.DataFrameに入れる
df_test_preds = pd.DataFrame(test_preds)

# ５つの予測の格納用のnumpy行列を作成
test_preds_max = np.zeros((len(y_test), 3))

# 各列（0,1,2）に、そのクラスを予測したモデルの数を入れる
test_preds_max[:, 0] = (df_test_preds == 0).sum(axis=1)
test_preds_max[:, 1] = (df_test_preds == 1).sum(axis=1)
test_preds_max[:, 2] = (df_test_preds == 2).sum(axis=1)

# 各行で、そのクラスを予測したモデルの数が最も多いクラスを得る
pred_max = np.argmax(test_preds_max, axis=1)

# Accuracy を計算する
accuracy = sum(y_test == pred_max) / len(y_test)
print('accuracy:', accuracy)

df_accuracy = pd.DataFrame({'va_y': y_test,
                            'y_pred_max': pred_max})
print(pd.crosstab(df_accuracy['va_y'], df_accuracy['y_pred_max']))

accuracy: 0.790625
y_pred_max    0    1
va_y                
0           641  172
1           163  624
