In [1]:
import os
os.chdir('/pc_win_loss')

In [16]:
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
import catboost as catb
from sklearn.ensemble import StackingClassifier

from functions.visualizer import *
from src.config import *

# データ取り込み

In [3]:
df_prep = pd.read_pickle('output/df_prep.pkl')
df_prep.head()

Unnamed: 0,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,blueTotalGold,blueTotalExperience,blueWins
0,0,5,8,6,0,0,14536,17256,0
1,1,10,1,5,0,0,14536,17863,0
2,0,3,10,2,0,0,17409,17256,0
3,1,7,10,8,0,0,19558,18201,0
4,0,4,9,4,0,0,17409,17256,0


# モデル構築

## 設定と訓練用データフレーム作成

In [4]:
random_state = 0

In [5]:
# 20%の被験者を選ぶ
df_train, df_val = train_test_split(df_prep, test_size=0.2, random_state=random_state)

print(len(df_train))
print(len(df_val))

6400
1600


In [6]:
feature_cols = list(df_train.drop(COL_BLUEWINS, axis=1).columns)

train_x = df_train.drop(COL_BLUEWINS, axis=1)
train_y = df_train[COL_BLUEWINS]

val_x = df_val.drop(COL_BLUEWINS, axis=1)
val_y = df_val[COL_BLUEWINS]

In [7]:
train_x.head()

Unnamed: 0,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,blueTotalGold,blueTotalExperience
1001,1,11,3,8,1,1,18274,18472
7360,0,6,11,7,0,0,20619,17256
5234,1,6,3,5,1,0,16961,18201
7390,0,9,5,8,0,0,18117,18472
6841,1,9,4,9,0,0,14536,17256


## モデルの作成

In [8]:
# LightGBM
lgb_clf = lgb.LGBMClassifier(random_state=random_state)
lgb_clf.fit(train_x, train_y)

# XGBoost
xgb_clf = xgb.XGBClassifier(
    learning_rate=0.05, 
    max_depth=6, 
    min_child_weight=9, 
    n_estimators=200, 
    random_state=random_state
                         )
xgb_clf.fit(train_x, train_y)

# CatBoost
catb_clf = catb.CatBoostClassifier(
    custom_loss=['Accuracy'], 
    random_seed=random_state
                               )
catb_clf.fit(train_x, train_y, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.02276
0:	learn: 0.6825752	total: 130ms	remaining: 2m 9s
1:	learn: 0.6730174	total: 165ms	remaining: 1m 22s
2:	learn: 0.6638879	total: 196ms	remaining: 1m 5s
3:	learn: 0.6543759	total: 226ms	remaining: 56.2s
4:	learn: 0.6463655	total: 255ms	remaining: 50.7s
5:	learn: 0.6390944	total: 283ms	remaining: 46.9s
6:	learn: 0.6322786	total: 313ms	remaining: 44.4s
7:	learn: 0.6250925	total: 344ms	remaining: 42.6s
8:	learn: 0.6187264	total: 375ms	remaining: 41.3s
9:	learn: 0.6126346	total: 404ms	remaining: 40s
10:	learn: 0.6061304	total: 433ms	remaining: 38.9s
11:	learn: 0.6017958	total: 462ms	remaining: 38s
12:	learn: 0.5959793	total: 491ms	remaining: 37.3s
13:	learn: 0.5905142	total: 521ms	remaining: 36.7s
14:	learn: 0.5860614	total: 552ms	remaining: 36.3s
15:	learn: 0.5813937	total: 582ms	remaining: 35.8s
16:	learn: 0.5764726	total: 611ms	remaining: 35.4s
17:	learn: 0.5713987	total: 640ms	remaining: 34.9s
18:	learn: 0.5672096	total: 670ms	remaining: 34.6s
19:	learn: 0.56

<catboost.core.CatBoostClassifier at 0x407c57f430>

## アンサンブル学習

In [13]:
estimators = (('lgb', lgb_clf), ('xgb', xgb_clf))
stk_clf = StackingClassifier(estimators=estimators)
stk_clf.fit(train_x, train_y)
results = cross_val_score(stk_clf, val_x, val_y) 

scores = {}
scores[('Stacking', 'train_score')] = results.mean()
scores[('Stacking', 'test_score')] = stk_clf.score(val_x, val_y)

# モデル評価
pd.Series(scores).unstack()

Unnamed: 0,test_score,train_score
Stacking,0.810625,0.779375


In [17]:
# 各5つのモデルを保存するリストの初期化
xgb_models = []
lgbm_models = []
catb_models = []
# 各5つのモデルの正答率を保存するリストの初期化
xgb_accuracies = []
lgbm_accuracies = []
catb_accuracies = []
# 学習のカウンター
loop_counts = 1

# 各クラスの確率（3モデル*5seed*３クラス）
first_probs = pd.DataFrame(np.zeros((len(df_prep), 3*5*3)))


for seed_no in range(5): 
        
    # 学習データの数だけの数列（0行から最終行まで連番）
    row_no_list = list(range(len(df_prep)))

    # KFoldクラスをインスタンス化（これを使って5分割する）
    K_fold = StratifiedKFold(n_splits=5, shuffle=True,  random_state=42)

    # KFoldクラスで分割した回数だけ実行（ここでは5回）
    for train_cv_no, eval_cv_no in K_fold.split(row_no_list, y):
        # ilocで取り出す行を指定
        X_train_cv = df_prep.iloc[train_cv_no, :]
        y_train_cv = pd.Series(y).iloc[train_cv_no]
        X_eval_cv = df_prep.iloc[eval_cv_no, :]
        y_eval_cv = pd.Series(y).iloc[eval_cv_no]
        
        # XGBoostの訓練を実行
        bst, bst_accuracy, xgb_prob = xgb_train_cv(X_train_cv, y_train_cv,
                                                   X_eval_cv, y_eval_cv, 
                                                   loop_counts)
        # LIghtGBMの訓練を実行
        model, model_accuracy, lgbm_prob = lgbm_train_cv(X_train_cv, y_train_cv, 
                                                         X_eval_cv, y_eval_cv)
        # CatBoostの訓練を実行
        catb, catb_accuracy, catb_prob = catboost_train_cv(X_train_cv, y_train_cv,
                                                           X_eval_cv, y_eval_cv)
        # 実行回数のカウント
        loop_counts += 1
        
        # 学習が終わったモデルをリストに入れておく
        xgb_models.append(bst) 
        lgbm_models.append(model) 
        catb_models.append(catb) 
        
        # 学習が終わったモデルの正答率をリストに入れておく
        xgb_accuracies.append(bst_accuracy) 
        lgbm_accuracies.append(model_accuracy) 
        catb_accuracies.append(catb_accuracy) 
        
        # 検証データの各クラスの確率
        for i in range(3):
            first_probs.iloc[eval_cv_no, (seed_no * 3) + i] = xgb_prob[:, i]
            first_probs.iloc[eval_cv_no, (seed_no * 3) + 15 + i] = lgbm_prob[:, i]
            first_probs.iloc[eval_cv_no, (seed_no * 3) + 30 + i] = catb_prob[:, i]

NameError: name 'y' is not defined

In [None]:
loop_counts = 0

# 学習データとテストデータに分ける
X_train, X_test, y_train, y_test = train_test_split(first_probs, y,
                                                    test_size=0.2,
                                                    random_state=0,
                                                    stratify=y)

# 予測結果の格納用のnumpy行列を作成
test_preds = np.zeros((len(y_test), 5))

# 学習データの数だけの数列（0行から最終行まで連番）
row_no_list = list(range(len(y_train)))

# KFoldクラスをインスタンス化（これを使って5分割する）
K_fold = StratifiedKFold(n_splits=5, shuffle=True,  random_state=0)

# KFoldクラスで分割した回数だけ実行（ここでは5回）
for train_cv_no, eval_cv_no in K_fold.split(row_no_list, y_train):
    # ilocで取り出す行を指定
    X_train_cv = X_train.iloc[train_cv_no, :]
    y_train_cv = pd.Series(y_train).iloc[train_cv_no]
    X_eval_cv = X_train.iloc[eval_cv_no, :]
    y_eval_cv = pd.Series(y_train).iloc[eval_cv_no]

    # データを格納する
    # 学習用
    xgb_train = xgb.DMatrix(X_train_cv, label=y_train_cv)
    # 検証用
    xgb_eval = xgb.DMatrix(X_eval_cv, label=y_eval_cv)
    # テスト用
    xgb_test = xgb.DMatrix(X_test, label=y_test)

    xgb_params = {
        'objective': 'multi:softprob',  # 多値分類問題
        'num_class': 3,                 # 目的変数のクラス数
        'learning_rate': 0.1,           # 学習率
        'eval_metric': 'mlogloss'       # 学習用の指標 (Multiclass logloss)
    }

    # 学習
    evals = [(xgb_train, 'train'), (xgb_eval, 'eval')] # 学習に用いる検証用データ
    evaluation_results = {}                            # 学習の経過を保存する箱
    bst = xgb.train(xgb_params,                        # 上記で設定したパラメータ
                    xgb_train,                         # 使用するデータセット
                    num_boost_round=200,               # 学習の回数
                    early_stopping_rounds=10,          # アーリーストッピング
                    evals=evals,                       # 学習経過で表示する名称
                    evals_result=evaluation_results,   # 上記で設定した検証用データ
                    verbose_eval=0                     # 学習の経過の表示(非表示)
                    )


    y_pred = bst.predict(xgb_test, ntree_limit=bst.best_ntree_limit)
    y_pred_max = np.argmax(y_pred, axis=1)
    
    # testの予測を保存
    test_preds[:, loop_counts] = y_pred_max
 
    print('Trial: ' + str(loop_counts))
    loop_counts += 1
    acc = accuracy_score(y_test, y_pred_max)
    print('Accuracy:', acc)

In [None]:
# 予測したクラスのデータをpandas.DataFrameに入れる
df_test_preds = pd.DataFrame(test_preds)

# ５つの予測の格納用のnumpy行列を作成
test_preds_max = np.zeros((len(y_test), 3))

# 各列（0,1,2）に、そのクラスを予測したモデルの数を入れる
test_preds_max[:, 0] = (df_test_preds == 0).sum(axis=1)
test_preds_max[:, 1] = (df_test_preds == 1).sum(axis=1)
test_preds_max[:, 2] = (df_test_preds == 2).sum(axis=1)

# 各行で、そのクラスを予測したモデルの数が最も多いクラスを得る
pred_max = np.argmax(test_preds_max, axis=1)

# Accuracy を計算する
accuracy = sum(y_test == pred_max) / len(y_test)
print('accuracy:', accuracy)

df_accuracy = pd.DataFrame({'va_y': y_test,
                            'y_pred_max': pred_max})
print(pd.crosstab(df_accuracy['va_y'], df_accuracy['y_pred_max']))