## スタッキング

In [0]:
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb 
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

# これらの5つの基本モデルをスタッキングに使用
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
                             GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer

In [0]:
# Load the breast-cancer dataset bundled with scikit-learn; `data.data` holds the
# feature matrix and `data.target` the labels used in the cells below.
data = load_breast_cancer()

In [0]:
# Hold out a test set; stratifying on the target keeps the class ratio the same
# in both partitions, and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    random_state=42,
    stratify=data.target,
)

In [0]:
# Parameters
SPLITS = 5  # number of cross-validation folds used to build out-of-fold predictions
SEED = 0    # random seed handed to every base learner
ntrain = X_train.shape[0]
ntest = X_test.shape[0]
# With shuffle=False the folds are deterministic contiguous slices, so no
# random_state is needed. scikit-learn 0.24+ raises ValueError when random_state
# is combined with shuffle=False; older versions simply ignored it, so the folds
# produced here are the same as before.
kf = KFold(n_splits=SPLITS, shuffle=False)

In [0]:
# Thin wrapper that gives scikit-learn style classifiers a uniform interface
class SklearnHelper(object):
    """Instantiate a classifier class with a fixed seed and expose a small API.

    Parameters
    ----------
    clf : callable
        Classifier class (not an instance); it is called as ``clf(**params)``.
    seed : int, default 0
        Value injected into the constructor arguments as ``random_state``.
    params : dict or None, default None
        Extra keyword arguments for ``clf``. ``None`` means no extra arguments.
        The dict passed in is never modified.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: the original code wrote 'random_state' into the
        # caller's dict and crashed with TypeError when params was None.
        params = dict(params) if params is not None else {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped classifier; returns None."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped classifier's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped classifier and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def score(self, x, y):
        """Return the wrapped classifier's ``score`` on ``(x, y)``."""
        return self.clf.score(x, y)

    def feature_importances(self, x, y):
        """Refit on ``(x, y)`` and return the resulting ``feature_importances_``.

        Note that this call replaces any previously fitted state of the
        wrapped classifier.
        """
        return self.clf.fit(x, y).feature_importances_

In [0]:
def get_oof(clf, x_train, y_train, x_test, cv=None):
    """Build out-of-fold (OOF) predictions for one base learner.

    For every fold the learner is trained on the in-fold rows, then used to
    predict the held-out rows of ``x_train`` and all rows of ``x_test``.

    Parameters
    ----------
    clf : object
        Learner exposing ``train(x, y)`` and ``predict(x)``.
    x_train, y_train : numpy.ndarray
        Training features and labels; indexed with the fold index arrays.
    x_test : numpy.ndarray
        Test features, predicted once per fold.
    cv : splitter or None, default None
        Object exposing ``split(x)`` and ``get_n_splits()``. When ``None`` the
        notebook-level ``kf`` splitter is used, as before.

    Returns
    -------
    (oof_train, oof_test) : tuple of numpy.ndarray
        ``oof_train`` has shape ``(len(x_train), 1)`` and holds the held-out
        prediction of every training row; ``oof_test`` has shape
        ``(len(x_test), 1)`` and holds the per-fold test predictions averaged
        over the folds.
    """
    splitter = kf if cv is None else cv

    # Sizes are taken from the arguments rather than from the notebook globals
    # (ntrain / ntest / SPLITS), so the function stays correct for any input.
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    n_folds = splitter.get_n_splits()

    oof_train = np.zeros((n_train,))
    oof_test_folds = np.empty((n_folds, n_test))

    for fold, (fit_idx, holdout_idx) in enumerate(splitter.split(x_train)):
        clf.train(x_train[fit_idx], y_train[fit_idx])

        # Each training row is predicted by a model that never saw it.
        oof_train[holdout_idx] = clf.predict(x_train[holdout_idx])
        oof_test_folds[fold, :] = clf.predict(x_test)

    oof_test = oof_test_folds.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [0]:
# Hyper-parameters for each base model.
# 'random_state' is intentionally absent: SklearnHelper injects it from the seed.

# Random Forest
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'warm_start': False,
    # 'auto' was deprecated in scikit-learn 1.1 and later removed; for
    # classifiers it meant 'sqrt', so this keeps the same behaviour.
    'max_features': 'sqrt',
    'max_depth': 10,
    'min_samples_leaf': 1,
    'verbose': 0
}

# Extra Trees
et_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'max_features': 'sqrt',  # replaces the removed 'auto' alias (same meaning)
    'max_depth': 10,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost
ada_params = {
    'n_estimators': 500,
    'learning_rate': 0.75
}

# Gradient Boosting
gb_params = {
    'n_estimators': 500,
    'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier
svc_params = {
    'kernel': 'linear',
    'C': 0.025
}

In [0]:
# Create the five base-learner objects, pairing each classifier class with its
# parameter dict and the shared seed.
rf, et, ada, gb, svc = (
    SklearnHelper(clf=estimator_cls, seed=SEED, params=estimator_params)
    for estimator_cls, estimator_params in (
        (RandomForestClassifier, rf_params),
        (ExtraTreesClassifier, et_params),
        (AdaBoostClassifier, ada_params),
        (GradientBoostingClassifier, gb_params),
        (SVC, svc_params),
    )
)

In [0]:
# Run the first stage (level 0): out-of-fold train predictions and fold-averaged
# test predictions for every base learner, in the order
# Random Forest, Extra Trees, AdaBoost, Gradient Boosting, Support Vector Classifier.
(
    (rf_oof_train, rf_oof_test),
    (et_oof_train, et_oof_test),
    (ada_oof_train, ada_oof_test),
    (gb_oof_train, gb_oof_test),
    (svc_oof_train, svc_oof_test),
) = [
    get_oof(base_learner, X_train, y_train, X_test)
    for base_learner in (rf, et, ada, gb, svc)
]

print("Training is complete")

In [0]:
# Refit every base learner on the full training set so each one can be scored
# on its own. The last call is a bare expression, so its return value is what
# the notebook cell displays.
rf.fit(X_train,y_train)
et.fit(X_train,y_train)
ada.fit(X_train,y_train)
gb.fit(X_train,y_train)
svc.fit(X_train,y_train)

In [0]:
# Print each base learner's score on the held-out test set, one value per line,
# in the order rf, et, ada, gb, svc.
print(
    *(base_learner.score(X_test, y_test) for base_learner in (rf, et, ada, gb, svc)),
    sep="\n",
)

## 特徴量の考察

In [0]:
# Collect feature importances from the tree-based learners. Each call refits
# the learner on the training data before reading its importances.
rf_features, et_features, ada_features, gb_features = (
    base_learner.feature_importances(X_train, y_train)
    for base_learner in (rf, et, ada, gb)
)

In [0]:
cols = data.feature_names
# Build the feature-importance DataFrame: one row per feature, one column per model.
# The index length follows the number of feature names instead of a hard-coded 30,
# so the cell keeps working if the dataset changes.
feature_dataframe = pd.DataFrame( {'features': cols,
     'Random Forest feature importances': rf_features,
     'Extra Trees  feature importances': et_features,
     'AdaBoost feature importances': ada_features,
     'Gradient Boost feature importances': gb_features
    }, index=np.arange(len(cols)))

feature_dataframe

In [0]:
# Compute the mean importance of each feature across the models.
# The string 'features' column is excluded because pandas 2.x raises TypeError when
# averaging object columns, and any existing 'mean' column is excluded so that
# re-running this cell does not fold the previous mean into the new one.
feature_dataframe['mean'] = (
    feature_dataframe
    .drop(columns=['features', 'mean'], errors='ignore')
    .mean(axis=1)
)
feature_dataframe.head(5)

In [0]:
# Gather every base learner's out-of-fold training predictions side by side,
# one column per model, for inspection.
base_predictions_train = pd.DataFrame({
    model_name: oof_column.ravel()
    for model_name, oof_column in (
        ('RandomForest', rf_oof_train),
        ('ExtraTrees', et_oof_train),
        ('AdaBoost', ada_oof_train),
        ('GradientBoost', gb_oof_train),
        ('SVC', svc_oof_train),
    )
})
print('base_predictions_train.shape : ', base_predictions_train.shape)
base_predictions_train.head(5)

## レベル0の学習結果を反映した訓練データを定義し、学習

In [0]:
# Level-1 inputs: one column per base learner (et, rf, ada, gb, svc).
# NOTE: this rebinds X_train / X_test from the raw features to the stacked
# predictions, so the earlier cells that use the raw features must not be
# re-run after this one without re-creating the split first.
X_train = np.concatenate(
    [et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train],
    axis=1,
)
X_test = np.concatenate(
    [et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test],
    axis=1,
)
print('x_train.shape : ', X_train.shape)
print('x_test.shape : ', X_test.shape)

In [0]:
# Level-1 (meta) learner trained on the base learners' predictions.
# Changes from the previous parameter list:
#   * `Lambda` (wrong case) was not a recognised XGBoost parameter, so the L2
#     penalty silently stayed at its default; it is now passed as `reg_lambda`.
#   * `min_samples_leaf` is a scikit-learn name that XGBoost ignores; the
#     XGBoost counterpart `min_child_weight` is already set, so it is dropped.
#   * `eta` / `alpha` are spelled with their scikit-learn wrapper names
#     `learning_rate` / `reg_alpha`.
#   * `eval_metric` uses 'logloss', a classification metric, instead of 'rmse'.
model = xgb.XGBClassifier(
    learning_rate=0.02,
    n_estimators=100,
    min_child_weight=2,
    max_depth=3,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=0.2,
    reg_alpha=0.2,
    scale_pos_weight=1,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=0
)

In [0]:
# Train the level-1 model; at this point X_train holds the stacked base-learner
# predictions assigned in the previous section, not the raw features.
model.fit(X_train, y_train)

In [0]:
# Score of the level-1 model on the training data
model.score(X_train, y_train)

In [0]:
# Score of the level-1 model on the test data
model.score(X_test, y_test)