In [1]:
import warnings

import config  # edit config.py as needed
import numpy as np
import pandas as pd
import polars as pl
from metric import score  # edit metric.py as needed
from scipy.optimize import minimize
from scipy.stats import rankdata
from seed import seed_everything  # edit seed.py as needed
from tqdm.notebook import tqdm

warnings.filterwarnings("ignore")


In [2]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    """Notebook-wide settings; most values are forwarded from the project's config.py."""

    # Not read in the visible cells — presumably a switch for a quick smoke run; TODO confirm.
    DRY_RUN = False
    # Experiment/version tag; it is embedded in the OOF file names ("..._ver{EXP_NAME}.csv").
    EXP_NAME = config.EXP_NAME
    # Not read in the visible cells.
    AUTHOR = "marumarukun"
    # Not read in the visible cells.
    COMPETITION = config.KAGGLE_COMPETITION_NAME
    # Directory that holds train.csv and test.csv (joined with "/" when reading).
    DATA_PATH = config.COMP_DATASET_DIR
    # Directory that holds the per-model out-of-fold prediction CSVs.
    OUTPUT_DIR = config.OUTPUT_DIR
    # Base seed: passed to seed_everything and embedded in the OOF file names.
    SEED = 42


In [3]:
# ====================================================
# Seed everything
# ====================================================
# Fix the random state once, up front, using the notebook-wide base seed.
# NOTE(review): seed_everything comes from seed.py; which RNGs it seeds is not
# visible here. The weight search further down draws from np.random, so it
# presumably covers NumPy — confirm.
seed_everything(CFG.SEED)


In [4]:
# ====================================================
# Read data
# ====================================================
# Both competition tables are parsed by polars (with date detection enabled)
# and then converted to pandas, in the order train -> test.
train, test = (
    pl.read_csv(CFG.DATA_PATH / csv_name, try_parse_dates=True).to_pandas()
    for csv_name in ("train.csv", "test.csv")
)


In [5]:
# ====================================================
# Overall CV
# ====================================================
def _load_oof(model_tag):
    """Load one model's out-of-fold predictions as a NumPy array.

    Args:
        model_tag: Middle part of the file name, e.g. "xgboost_cox". The file
            read is ``CFG.OUTPUT_DIR / oof_{model_tag}_seed{SEED}_ver{EXP_NAME}.csv``.

    Returns:
        The "prediction" column of that CSV, converted with ``to_numpy()``.
    """
    oof_path = CFG.OUTPUT_DIR / f"oof_{model_tag}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv"
    return pl.read_csv(oof_path).get_column("prediction").to_numpy()


# target kaplan
oof_kaplan_lgb = _load_oof("lightgbm")
oof_kaplan_xgb = _load_oof("xgboost")
oof_kaplan_cat = _load_oof("catboost")
# Cox models
oof_cox_xgb = _load_oof("xgboost_cox")
oof_cox_cat = _load_oof("catboost_cox")
# target nelson
oof_nelson_lgb = _load_oof("lightgbm_y_nelson")
oof_nelson_xgb = _load_oof("xgboost_y_nelson")
oof_nelson_cat = _load_oof("catboost_y_nelson")
# nn
oof_nn = _load_oof("nn_y")

# Equal-weight rank ensemble: each model's predictions are converted to ranks
# before summing, so models with different output scales contribute equally.
# NOTE(review): this assumes every OOF file is row-aligned with train.csv —
# nothing here joins on ID; confirm against the training notebooks.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = sum(
    rankdata(oof)
    for oof in (
        oof_kaplan_xgb,
        oof_kaplan_cat,
        oof_kaplan_lgb,
        oof_cox_xgb,
        oof_cox_cat,
        oof_nelson_lgb,
        oof_nelson_xgb,
        oof_nelson_cat,
        oof_nn,
    )
)
# Copies are passed because score() is project code that may modify its inputs.
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for Ensemble =", m)



Overall CV for Ensemble = 0.6822494374076322


In [7]:
# Rank-transform each OOF prediction vector once. The ranks do not depend on
# the weights, and the optimizer evaluates ensemble_score many times, so
# recomputing them on every call is wasted work.
# The tuple order defines what weights[0] .. weights[8] refer to.
# Re-run this cell if any oof_* array is reloaded, otherwise these go stale.
_OOF_RANKS = tuple(
    rankdata(oof)
    for oof in (
        oof_kaplan_xgb,
        oof_kaplan_cat,
        oof_kaplan_lgb,
        oof_cox_xgb,
        oof_cox_cat,
        oof_nelson_lgb,
        oof_nelson_xgb,
        oof_nelson_cat,
        oof_nn,
    )
)


def ensemble_score(weights):
    """Objective for the weight search: negated score of a weighted rank blend.

    Args:
        weights: Indexable sequence with at least nine numbers. ``weights[k]``
            multiplies the ranks of the k-th model in ``_OOF_RANKS``; entries
            beyond the ninth are ignored.

    Returns:
        ``-score(...)`` for the blended prediction, so that a minimizer
        maximizes the score.
    """
    # Accumulate left to right in weight-index order so the floating-point
    # result matches a plain "w0*r0 + w1*r1 + ..." expression exactly.
    weighted_pred = weights[0] * _OOF_RANKS[0]
    for idx in range(1, len(_OOF_RANKS)):
        weighted_pred = weighted_pred + weights[idx] * _OOF_RANKS[idx]

    y_pred = pd.DataFrame({"ID": train["ID"], "prediction": weighted_pred})
    y_true = train[["ID", "efs", "efs_time", "race_group"]]

    # Copies are passed because score() is project code that may modify its inputs.
    return -score(y_true.copy(), y_pred.copy(), "ID")


# Random-restart weight search: run the optimizer from several random starting
# points and keep the best result.
N_RESTARTS = 5  # number of independent optimizer runs
N_MODELS = 9  # one weight per OOF prediction vector used by ensemble_score

# Start from -inf so the first run is always recorded; starting from 0.0 would
# leave best_weights as None if no run scored above zero.
best_score = -np.inf
best_weights = None

for i in tqdm(range(N_RESTARTS)):
    # Use a different seed for each restart so the starting points differ
    # but stay reproducible.
    seed_everything(CFG.SEED + i)

    # Draw random initial weights and normalize them to sum to 1.
    random_weights = np.random.random(N_MODELS)
    initial_weights = random_weights / random_weights.sum()

    # Run the optimizer. Nelder-Mead is called without bounds or constraints,
    # so the returned weights are neither normalized nor forced to be
    # non-negative.
    result = minimize(ensemble_score, initial_weights, method="Nelder-Mead")

    # ensemble_score returns the negated score, so flip the sign back.
    trial_score = -result.fun
    if trial_score > best_score:
        best_score = trial_score
        best_weights = result.x

    print(f"試行 {i+1}:")
    print(f"スコア: {trial_score:.6f}")
    print("重み:", result.x)
    print("-" * 50)

print("\n最終結果:")
print(f"最良スコア: {best_score:.6f}")
print("最適な重み:", best_weights)


  0%|          | 0/5 [00:00<?, ?it/s]

試行 1:
スコア: 0.682881
重み: [ 0.08553245 -0.00577941  0.05567771  0.17345794  0.05165869  0.05671022
  0.02171692  0.19129843  0.08987593]
--------------------------------------------------
試行 2:
スコア: 0.682529
重み: [0.03195483 0.02591072 0.04206069 0.12226512 0.15745873 0.05358597
 0.09336777 0.11453003 0.01149189]
--------------------------------------------------
試行 3:
スコア: 0.682754
重み: [0.00477925 0.01793846 0.07551991 0.14159597 0.13472972 0.11716305
 0.09939699 0.14115596 0.10058944]
--------------------------------------------------
試行 4:
スコア: 0.682450
重み: [0.04961802 0.08850728 0.12239793 0.0366907  0.24066119 0.08504804
 0.01668789 0.03159949 0.05258479]
--------------------------------------------------
試行 5:
スコア: 0.682836
重み: [0.08912316 0.11289416 0.03056608 0.26624721 0.0888246  0.12040795
 0.00909633 0.13042839 0.12253709]
--------------------------------------------------

最終結果:
最良スコア: 0.682881
最適な重み: [ 0.08553245 -0.00577941  0.05567771  0.17345794  0.05165869  0.05671022
  0