In [24]:
# Library
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm.auto import tqdm
import statsmodels
import statsmodels.api as sm

import lightgbm as lgb
from sklearn.model_selection import train_test_split
#from sklearn.metrics import mean_pinball_loss
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error # モデル評価用(平均二乗誤差)
from sklearn.metrics import r2_score # モデル評価用(決定係数)
import warnings
warnings.simplefilter('ignore')

# 分布確認
#import pandas_profiling as pdp

# 前処理
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

# バリデーション
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

# 評価指標
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# plot style
pd.set_option('display.max_columns', 200)
plt.rcParams['axes.facecolor'] = 'EEFFFE'

import pandas_profiling as pdp
import pickle


In [25]:
def train_lgb(input_x,
              input_y,
              input_id,
              params,
              list_nfold=[0,1,2,3,4],
              n_splits=5,
             ):
    """Train one LightGBM binary classifier per CV fold and collect diagnostics.

    The data is split with a shuffled ``StratifiedKFold`` (``random_state=123``).
    For every fold id in ``list_nfold`` a fresh ``LGBMClassifier(**params)`` is
    fitted with early stopping (patience 100, log every 100 rounds), pickled to
    ``model_lgb_fold{nfold}.pickle`` in the current working directory, and
    scored with ROC-AUC on both the training and the validation part.

    Parameters
    ----------
    input_x : pandas.DataFrame
        Feature matrix.
    input_y : pandas.DataFrame or pandas.Series
        Binary target, row-aligned with ``input_x``.
    input_id : pandas.DataFrame or pandas.Series
        Row identifiers, row-aligned with ``input_x``.
    params : dict
        Keyword arguments forwarded to ``lgb.LGBMClassifier``.
    list_nfold : list of int
        Fold ids (0 .. n_splits-1) to actually train. The default list is
        never mutated.
    n_splits : int
        Number of stratified folds.

    Returns
    -------
    train_oof : pandas.DataFrame
        ``input_id`` with an added ``pred`` column holding the out-of-fold
        probability of class 1. Rows belonging to folds that were not trained
        keep the placeholder value 0.
    imp : pandas.DataFrame
        Columns ``col``, ``imp`` (mean) and ``imp_std`` (std) of
        ``feature_importances_`` across the trained folds.
    metrics : numpy.ndarray
        One row per trained fold: ``[nfold, train_auc, valid_auc]``.
    """
    n_rows = len(input_x)
    train_oof = np.zeros(n_rows)
    # Remember which rows really received an out-of-fold prediction so the
    # overall OOF score is not polluted by zero placeholders when only a
    # subset of folds is trained.
    has_oof = np.zeros(n_rows, dtype=bool)
    metrics = []
    fold_importances = []

    # cross-validation
    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(input_x, input_y))
    for nfold in list_nfold:
        print("-"*20, nfold, "-"*20)

        # make dataset
        # split() yields *positional* indices, so select with .iloc; .loc would
        # look them up as labels and break on any non-default index.
        idx_tr, idx_va = cv[nfold]
        x_tr, y_tr = input_x.iloc[idx_tr], input_y.iloc[idx_tr]
        x_va, y_va = input_x.iloc[idx_va], input_y.iloc[idx_va]
        print(x_tr.shape, x_va.shape)

        # train
        # LightGBM >= 4 removed the early_stopping_rounds/verbose fit keywords
        # in favour of callbacks; older releases without log_evaluation still
        # need the legacy keywords. Callbacks are rebuilt for every fold.
        if hasattr(lgb, "log_evaluation"):
            fit_kwargs = {
                "callbacks": [
                    lgb.early_stopping(stopping_rounds=100),
                    lgb.log_evaluation(period=100),
                ]
            }
        else:
            fit_kwargs = {"early_stopping_rounds": 100, "verbose": 100}

        model = lgb.LGBMClassifier(**params)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr, y_tr), (x_va, y_va)],
                  **fit_kwargs
                 )
        fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
        with open(fname_lgb, "wb") as f:
            pickle.dump(model, f, protocol=4)

        # evaluate
        y_tr_pred = model.predict_proba(x_tr)[:,1]
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_tr = roc_auc_score(y_tr, y_tr_pred)
        metric_va = roc_auc_score(y_va, y_va_pred)
        metrics.append([nfold, metric_tr, metric_va])
        print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))

        # oof
        train_oof[idx_va] = y_va_pred
        has_oof[idx_va] = True

        # imp
        fold_importances.append(
            pd.DataFrame({"col":input_x.columns, "imp":model.feature_importances_, "nfold":nfold})
        )

    print("-"*20, "result", "-"*20)
    # metric
    metrics = np.array(metrics)
    print(metrics)
    print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(
        metrics[:,1].mean(), metrics[:,1].std(),
        metrics[:,2].mean(), metrics[:,2].std(),
    ))
    # Score only the rows that were actually predicted out-of-fold.
    print("[oof] {:.4f}".format(
        roc_auc_score(input_y.iloc[has_oof], train_oof[has_oof])
    ))

    # oof
    # Reuse input_id's index so the axis=1 concat pairs rows one-to-one
    # instead of aligning a fresh 0..n-1 index against arbitrary labels.
    train_oof = pd.concat([
        input_id,
        pd.DataFrame({"pred":train_oof}, index=input_id.index)
    ], axis=1)

    # importance
    imp = pd.concat(fold_importances)
    imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop=False)
    imp.columns = ["col", "imp", "imp_std"]

    return train_oof, imp, metrics

In [26]:
def predict_lgb(input_x,
                input_id,
                list_nfold=[0,1,2,3,4],
               ):
    """Average the class-1 probabilities of the pickled per-fold models.

    For every fold id in ``list_nfold`` the model stored in
    ``model_lgb_fold{nfold}.pickle`` (current working directory) is loaded and
    ``predict_proba(input_x)[:, 1]`` is collected; the per-fold columns are
    then averaged row-wise.

    Parameters
    ----------
    input_x : pandas.DataFrame
        Feature matrix to score.
    input_id : pandas.DataFrame or pandas.Series
        Row identifiers, row-aligned with ``input_x``.
    list_nfold : list of int
        Fold ids whose models are loaded. The default list is never mutated.

    Returns
    -------
    pandas.DataFrame
        ``input_id`` with an added ``pred`` column (mean probability).
    """
    fold_preds = np.zeros((len(input_x), len(list_nfold)))
    # Write into the column by *position* in list_nfold, not by fold id, so a
    # subset such as [1, 3] neither overflows the array nor leaves gaps.
    for col, nfold in enumerate(list_nfold):
        print("-"*20, nfold, "-"*20)
        fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
        # NOTE: unpickling executes arbitrary code; only load model files
        # produced by a trusted training run.
        with open(fname_lgb, "rb") as f:
            model = pickle.load(f)
        fold_preds[:, col] = model.predict_proba(input_x)[:,1]

    # Reuse input_id's index so the axis=1 concat pairs rows one-to-one even
    # when input_id does not carry a default 0..n-1 index.
    pred = pd.concat([
        input_id,
        pd.DataFrame({"pred": fold_preds.mean(axis=1)}, index=input_id.index),
    ], axis=1)

    print("Done.")

    return pred

In [268]:
# Load the labelled training set and the unlabelled test set from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [269]:
def _add_blue_features(frame):
    """Add the three engineered blue-team columns to ``frame`` in place.

    Adds a 0/1 flag for total gold strictly between 17000 and 18000, the
    experience-per-kill ratio, and the first-blood x kills interaction.
    """
    gold = frame["blueTotalGold"]
    kills = frame["blueKills"]
    frame["17000<blueTotalGold<18000"] = ((gold > 17000) & (gold < 18000)).astype(int)
    # NOTE(review): rows with blueKills == 0 presumably yield inf/NaN here - confirm this is intended.
    frame["blueTotalExperience / blueKills"] = frame["blueTotalExperience"] / kills
    frame["blueFirstBlood * blueKills"] = frame["blueFirstBlood"] * kills


# Apply the identical transformation to both splits so their columns stay in sync.
_add_blue_features(train)
_add_blue_features(test)

In [271]:
# Excluded because its feature importance is very low.
train = train.drop(columns=['blueDragons'])
test = test.drop(columns=['blueDragons'])

In [272]:
# Separate the training frame into row identifiers, target and features.
# Target and identifiers are kept as single-column DataFrames.
id_train = train[["gameId"]]
y_train = train[["blueWins"]]
x_train = train.drop(columns=['gameId', 'blueWins'])
print(x_train.shape, y_train.shape, id_train.shape)

(8000, 10) (8000, 1) (8000, 1)


In [273]:
# Hyperparameter settings
# n_estimators is set very high; train_lgb's early stopping determines
# the number of boosting rounds actually used.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    learning_rate=0.05,
    num_leaves=32,
    n_estimators=100000,
    random_state=123,
    importance_type="gain",
)

# Run the cross-validated training
train_oof, imp, metrics = train_lgb(
    input_x=x_train,
    input_y=y_train,
    input_id=id_train,
    params=params,
    list_nfold=[0, 1, 2, 3, 4],
    n_splits=5,
)

-------------------- 0 --------------------
(6400, 10) (1600, 10)
[100]	training's auc: 0.910336	valid_1's auc: 0.874201
[auc] tr:0.8960, va:0.8754
-------------------- 1 --------------------
(6400, 10) (1600, 10)
[100]	training's auc: 0.909451	valid_1's auc: 0.873149
[200]	training's auc: 0.928451	valid_1's auc: 0.872611
[auc] tr:0.9142, va:0.8734
-------------------- 2 --------------------
(6400, 10) (1600, 10)
[100]	training's auc: 0.911065	valid_1's auc: 0.866792
[200]	training's auc: 0.929599	valid_1's auc: 0.865708
[auc] tr:0.9167, va:0.8669
-------------------- 3 --------------------
(6400, 10) (1600, 10)
[100]	training's auc: 0.910515	valid_1's auc: 0.870638
[200]	training's auc: 0.92913	valid_1's auc: 0.872191
[auc] tr:0.9290, va:0.8722
-------------------- 4 --------------------
(6400, 10) (1600, 10)
[100]	training's auc: 0.909281	valid_1's auc: 0.871479
[auc] tr:0.9070, va:0.8718
-------------------- result --------------------
[[0.         0.89602938 0.87541732]
 [1.       

In [274]:
# Feature importance, highest mean importance first
imp.sort_values(by="imp", ascending=False).reset_index(drop=True)

Unnamed: 0,col,imp,imp_std
0,blueTotalExperience,18577.185727,667.779821
1,17000<blueTotalGold<18000,6229.108309,1202.274868
2,blueTotalGold,5947.271966,1195.937517
3,blueDeaths,2304.087436,561.765834
4,blueTotalExperience / blueKills,2210.319234,690.445367
5,blueAssists,1856.564077,633.895963
6,blueEliteMonsters,1443.638281,219.161497
7,blueFirstBlood * blueKills,1001.721596,205.550485
8,blueKills,603.726624,134.39151
9,blueFirstBlood,183.533174,44.06819


In [69]:
# Separate the test frame into row identifiers and features.
id_test = test[["gameId"]]
x_test = test.drop(columns=['gameId'])

In [70]:
# Score the test set with the pickled fold models and average their probabilities.
test_pred = predict_lgb(
    input_x=x_test,
    input_id=id_test,
    list_nfold=[0, 1, 2, 3, 4],
)

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


In [72]:
# Display the frame returned by predict_lgb: id_test plus the fold-averaged "pred" column.
test_pred

Unnamed: 0,gameId,pred
0,9,0.189067
1,15,0.204628
2,18,0.229906
3,23,0.268827
4,31,0.437375
...,...,...
1995,9971,0.294851
1996,9980,0.370147
1997,9983,0.371316
1998,9996,0.108631


In [None]:
# Round every column to the nearest integer (turning the averaged probability
# into a 0/1 label) and write the result without a header row or index column.
(
    test_pred
    .round()
    .astype('int')
    .to_csv('submission.csv', header=False, index=False)
)