In [19]:
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
df = sns.load_dataset("titanic")
# 欠損値を落とす
df.dropna(inplace=True)
# X, y を作成
X = df.loc[:, (df.columns!="survived") & (df.columns!="alive")]
y = df["survived"]
# カテゴリ変数をエンコーディングする(決定木をsklearnで扱うためにカテゴリ変数のエンコーディングが必要)
oe = OrdinalEncoder()
oe.set_output(transform="pandas")
X = oe.fit_transform(X)
# 訓練データとテストデータに分ける
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Stacking

In [36]:
class StackingClassifierCV():
    def __init__(self, estimators, final_estimator, cv):
        self.estimators = estimators # (model_name, model_instance)のタプルが配列として格納されている
        self.final_estimator = final_estimator
        self.cv = cv
        
    def fit(self, X, y):
        pred_features = {}
        for model_name, model in self.estimators:
            preds = []
            new_y = []
            for train_idx, val_idx in self.cv.split(X):
                X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
                model.fit(X_train, y_train)
                pred = model.predict_proba(X_val)[:, 1].tolist()
                preds += pred
                new_y += y_val.tolist()
            # predict proba のため、それぞれの model は全てのデータで fit した状態にしておく
            model.fit(X, y)
            pred_features[model_name] = preds
        
        # 2層目のモデルの学習
        new_X = pd.DataFrame(pred_features)
        self.final_estimator.fit(new_X, new_y)
        
    def predict_proba(self, X):
        # 1 周目のモデルで特徴量(予測値)生成
        pred_features = {}
        for model_name, model in self.estimators:
            pred = model.predict_proba(X)[:, 1]
            pred_features[model_name] = pred
        
        new_X = pd.DataFrame(pred_features)
        final_pred = self.final_estimator.predict_proba(new_X)
        return final_pred
        
            
            

In [41]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

cv = KFold(n_splits=5, shuffle=True, random_state=0)
final_estimator = LogisticRegression()
estimators=[("rf", RandomForestClassifier()), ("knn", KNeighborsClassifier()), ("logistic", LogisticRegression())]

stacking_cv = StackingClassifierCV(estimators=estimators, final_estimator=final_estimator, cv=cv)
stacking_cv.fit(X_train, y_train)
y_pred_stacking_cv = stacking_cv.predict_proba(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [42]:
from sklearn.metrics import roc_auc_score
print(f"stacking CV AUC: {roc_auc_score(y_test, y_pred_stacking_cv[:, 1])}")

stacking CV AUC: 0.858974358974359
