In [1]:
import numpy as np
import polars as pl
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
from itertools import product

In [2]:
# original training table
train_origin = pl.read_csv("data/train.csv")

# tags identifying the saved prediction files of each base model
# (presumably "<classifier>_<feature set>" -- confirm against the files in pred/)
clf_feat_lst = [
    "catboost_feat00",
    "lgbm_feat01",
    "randomforest_feat02",
    "logisticregression_feat03",
    "tabnet_feat03",
]

# out-of-fold and test predictions of every base model, in the order of clf_feat_lst
oof_pred_df_lst = [
    pl.read_csv(f"pred/oof_pred_{tag}.csv")
    for tag in clf_feat_lst
]
test_pred_df_lst = [
    pl.read_csv(f"pred/test_pred_{tag}.csv")
    for tag in clf_feat_lst
]

In [3]:
# filename tag: "stacking" followed by the part of each base-model tag
# that precedes its first underscore, all joined with "_"
clf_feat_stacking = "_".join(
    ["stacking", *(tag.split("_")[0] for tag in clf_feat_lst)]
)

### train layer2 model (logistic regression)

In [4]:
# name of the column to predict
col_target = "health"

# layer-2 features: the predicted probabilities of every base model,
# with each model's columns prefixed "model<i>_" so they stay distinct
train = pl.concat(
    [
        base_oof.select(pl.all().name.prefix(f"model{k}_"))
        for k, base_oof in enumerate(oof_pred_df_lst)
    ],
    how="horizontal",
)
cols_exp = train.columns

# append the prediction target taken from the original training table
train = train.with_columns(train_origin[col_target])

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# layer-2 design matrix and target labels as numpy arrays
x = train[cols_exp].to_numpy()
y = train[col_target].to_numpy()

# standard scaler fitted on the whole training data
# (intended to be used when stacking the test set)
# NOTE(review): the fold classifiers below are fitted on features standardized
# with their own per-fold scaler, not with scaler_all -- confirm that applying
# scaler_all at test time is the intended behaviour.
scaler_all = StandardScaler()
scaler_all.fit(x)

# cross validation: 5 shuffled folds with a fixed seed
kf = KFold(n_splits=5, shuffle=True, random_state=0)
y_valid_pred_lst = []
idx_valid_lst = []
clf_lst = []

for idx_train, idx_valid in kf.split(x):
    # split features / labels into the fold's training and validation parts
    x_train, y_train = x[idx_train], y[idx_train]
    x_valid, y_valid = x[idx_valid], y[idx_valid]

    # standardize using statistics of the fold's training part only
    scaler = StandardScaler().fit(x_train)
    x_train, x_valid = scaler.transform(x_train), scaler.transform(x_valid)

    # fit the layer-2 classifier for this fold and keep it for test-time use
    clf = LogisticRegression(random_state=0)
    clf.fit(x_train, y_train)
    clf_lst.append(clf)

    # out-of-fold probabilities together with the rows they belong to
    y_valid_pred = clf.predict_proba(x_valid)
    idx_valid_lst.append(idx_valid)
    y_valid_pred_lst.append(y_valid_pred)

# stitch the folds together and restore the original row order
idx_valid = np.concatenate(idx_valid_lst)
y_valid_pred = np.concatenate(y_valid_pred_lst, axis=0)
oof_pred = y_valid_pred[np.argsort(idx_valid)]

# store the out-of-fold probabilities, one "health_is_<h>" column per h in 0..2
oof_pred_df = pl.DataFrame(
    oof_pred,
    schema=[f"health_is_{h}" for h in range(3)],
)
oof_pred_df.write_csv(f"pred/oof_pred_{clf_feat_stacking}.csv")

### calculate prob of test

In [6]:
def predict_test(x_test, clf_lst):
    """Average the class probabilities predicted by several classifiers.

    Parameters
    ----------
    x_test : array-like
        Feature matrix passed unchanged to every classifier's ``predict_proba``.
    clf_lst : iterable
        Fitted classifiers exposing ``predict_proba``. Their outputs must have
        identical shapes so they can be averaged element-wise.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the per-classifier probability arrays.

    Raises
    ------
    ValueError
        If ``clf_lst`` yields no classifier. Averaging nothing would otherwise
        silently produce NaN.
    """
    y_test_pred_lst = [clf.predict_proba(x_test) for clf in clf_lst]

    # Guard after materializing so that any iterable (not only lists) is supported.
    if not y_test_pred_lst:
        raise ValueError("clf_lst must contain at least one fitted classifier")

    return np.mean(y_test_pred_lst, axis=0)

In [7]:
# layer-2 test features: every base model's test predictions,
# with each model's columns prefixed "model<i>_"
test = pl.concat(
    [
        base_pred.select(pl.all().name.prefix(f"model{k}_"))
        for k, base_pred in enumerate(test_pred_df_lst)
    ],
    how="horizontal",
)

# normalization with the scaler fitted on the whole training data
x_test = scaler_all.transform(test[cols_exp].to_numpy())

# calculate ensemble prob: mean of the probabilities from the classifiers in clf_lst
y_test_pred = predict_test(x_test, clf_lst)

# record
test_pred_df = pl.DataFrame(
    y_test_pred,
    schema=[f"health_is_{h}" for h in range(3)],
)
test_pred_df.write_csv(f"pred/test_pred_{clf_feat_stacking}.csv")