In [1]:
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

In [2]:
# Data preparation: load the seaborn Titanic dataset and keep only complete rows.
df = sns.load_dataset('titanic').dropna()

# Build X and y. 'alive' is excluded from the features together with the
# 'survived' target column.
y = df['survived']
X = df.drop(columns=['survived', 'alive'])

# Label encoding: the encoder is fitted on (and applied to) every column of X,
# and is configured to return a pandas DataFrame.
oe = OrdinalEncoder().set_output(transform='pandas')
X = oe.fit_transform(X)

# Hold-out split: 30% of the rows are reserved for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

### Stacking

In [11]:
class StackingClassifierCV:
    """Two-layer stacking classifier trained on out-of-fold predictions.

    Every first-layer estimator is fitted on the training part of each CV fold,
    and its positive-class probability on the held-out part becomes one feature
    column for the second-layer ``final_estimator``. After the folds, each
    first-layer estimator is refitted on all of the data so that
    ``predict_proba`` can use it.

    Parameters
    ----------
    estimators : list of (str, estimator) tuples
        First-layer models, e.g.
        ``[('rf', RandomForestClassifier()), ('knn', KNeighborsClassifier())]``.
        Each estimator must provide ``fit`` and ``predict_proba``. The objects
        are fitted in place (they are not copied).
    final_estimator : estimator
        Second-layer model; must provide ``fit`` and ``predict_proba``.
    cv : splitter
        Object whose ``split(X, y)`` yields ``(train_idx, val_idx)`` pairs of
        positional indices.
    """

    def __init__(self, estimators, final_estimator, cv):
        self.estimators = estimators
        self.final_estimator = final_estimator
        self.cv = cv

    def fit(self, X, y):
        """Fit the first-layer estimators and then the final estimator.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features (rows are selected with ``.iloc``).
        y : pandas.Series
            Training labels (rows are selected with ``.iloc``).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``estimators`` is empty.
        """
        if not self.estimators:
            raise ValueError("estimators must contain at least one (name, estimator) pair")

        # Materialise the folds once and reuse them for every estimator.
        # Calling cv.split() again per estimator can produce different folds
        # (e.g. a shuffling splitter without a fixed seed), which would misalign
        # the feature columns with each other and with the labels.
        # y is passed along so that label-aware splitters can be used as well.
        folds = [(train_idx, val_idx) for train_idx, val_idx in self.cv.split(X, y)]

        # The folds reorder the rows of X, so rebuild y in that same order
        # (validation parts concatenated fold by fold).
        new_y = []
        for _, val_idx in folds:
            new_y += y.iloc[val_idx].tolist()

        pred_features = {}
        # Train every first-layer estimator.
        for model_name, model in self.estimators:
            preds = []
            for train_idx, val_idx in folds:
                # Fit on the training part of the fold ...
                model.fit(X.iloc[train_idx], y.iloc[train_idx])
                # ... and keep the positive-class probability on the held-out
                # part as a feature for the second-layer model.
                preds += model.predict_proba(X.iloc[val_idx])[:, 1].tolist()

            # The concatenated out-of-fold predictions form one new feature column.
            pred_features[model_name] = preds
            # Keep a model trained on all of the data for predict_proba.
            model.fit(X, y)

        # Train the second-layer model on the out-of-fold features.
        new_X = pd.DataFrame(pred_features)
        self.final_estimator.fit(new_X, new_y)
        return self

    def predict_proba(self, X):
        """Return the final estimator's ``predict_proba`` output for ``X``.

        The first-layer estimators (refitted on all training data by ``fit``)
        each contribute their positive-class probability as one feature column,
        named after the estimator, for the final estimator.
        """
        pred_features = {
            model_name: model.predict_proba(X)[:, 1]
            for model_name, model in self.estimators
        }
        new_X = pd.DataFrame(pred_features)
        return self.final_estimator.predict_proba(new_X)

In [15]:
# First-layer models.
# NOTE(review): this three-model list is not what is handed to the stacker
# below, which is built from only 'rf' and 'knn' — confirm which set is intended.
estimators = [
    ('rf', RandomForestClassifier()),
    ('knn', KNeighborsClassifier()),
    ('logistic', LogisticRegression()),
]

# Second-layer model and the fold splitter used for the out-of-fold predictions.
final_estimator = LogisticRegression()
cv = KFold(n_splits=5, shuffle=True, random_state=0)

# Build the stacker, fit it on the training split and predict on the hold-out split.
stacking_cv = StackingClassifierCV(
    estimators=[('rf', RandomForestClassifier()), ('knn', KNeighborsClassifier())],
    final_estimator=final_estimator,
    cv=cv,
)
stacking_cv.fit(X_train, y_train)
y_pred_stacking_cv = stacking_cv.predict_proba(X_test)

In [16]:
# Evaluation: ROC AUC on the hold-out split, scored with column 1 of the
# predicted probabilities.
auc_stacking_cv = roc_auc_score(y_test, y_pred_stacking_cv[:, 1])
print(f"stackingCV AUC: {auc_stacking_cv}")

stackingCV AUC: 0.8133012820512822
