In [1]:
import gc
import itertools
import os
import pickle
import random
import sys
import warnings
from glob import glob
from pathlib import Path

import config  # edit config.py as needed
import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import scipy as sp
import seaborn as sns
import torch
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from lifelines import CoxPHFitter, KaplanMeierFitter, NelsonAalenFitter
from lightgbm import LGBMClassifier, LGBMRegressor
from metric import score  # edit metric.py as needed
from scipy.optimize import minimize
from scipy.stats import rankdata
from seed import seed_everything  # edit seed.py as needed
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from tqdm.notebook import tqdm
from xgboost import XGBClassifier, XGBRegressor

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings emitted by the ML libraries used below.
warnings.filterwarnings("ignore")


In [4]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    """Notebook-wide settings: paths, CV setup and per-library hyperparameters.

    Path-like values and the experiment name come from the project-local
    ``config`` module. ``DRY_RUN`` switches to a reduced setting (fewer folds,
    fewer boosting rounds, shorter early-stopping patience) for quick runs.
    """

    DRY_RUN = False
    EXP_NAME = config.EXP_NAME
    AUTHOR = "marumarukun"
    COMPETITION = config.KAGGLE_COMPETITION_NAME
    DATA_PATH = config.COMP_DATASET_DIR
    OUTPUT_DIR = config.OUTPUT_DIR
    MODEL_PATH = config.OUTPUT_DIR / "models"  # use this path while building models / running experiments
    # MODEL_PATH = config.ARTIFACT_EXP_DIR(config.EXP_NAME) / "models"  # use this path at submission time
    # Not referenced by any active code in the visible notebook (only by a commented-out loop).
    METHOD_LIST = ["xgboost_cox", "catboost_cox", "lightgbm", "xgboost", "catboost"]
    SEED = 42
    n_folds = 2 if DRY_RUN else 10
    # Regression targets (built from Kaplan-Meier / Nelson-Aalen estimates) for the non-Cox models
    target_col_list = ["y_kaplan", "y_nelson"]
    # Target for the Cox models (signed event time, created in a later cell)
    cox_target_col_list = ["efs_time2"]
    # group_col = "race_group"  # Required for GroupKFold (edit as needed)
    stratified_col = "race_group_efs"  # Required for StratifiedKFold (edit as needed)
    # Effectively "unlimited" rounds outside DRY_RUN; training length is governed by early stopping
    num_boost_round = 100 if DRY_RUN else 1000000
    early_stopping_round = 10 if DRY_RUN else 500  # rule of thumb: 10 / learning_rate
    verbose = 500  # logging period (in boosting rounds) passed to each library

    # https://lightgbm.readthedocs.io/en/latest/Parameters.html
    # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
    # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html
    regression_lgb_params = {
        "objective": "regression",
        # "metric": "mae",
        "learning_rate": 0.02,
        "max_depth": 5,
        "min_child_weight": 1,
        "colsample_bytree": 0.8,
        "subsample": 0.8,
        "subsample_freq": 1,
        "seed": SEED,
        "device": "cuda",  # cpu/gpu/cuda
    }
    # https://xgboost.readthedocs.io/en/stable/parameter.html
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBRegressor
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier
    regression_xgb_params = {
        "objective": "reg:squarederror",
        # "eval_metric": "mae",
        "learning_rate": 0.02,
        "max_depth": 5,
        "colsample_bytree": 0.8,
        "subsample": 0.8,
        "min_child_weight": 1,
        "enable_categorical": True,
        "random_state": SEED,
        "device": "cuda",  # cpu/gpu/cuda
    }
    # XGBoost Cox model: shallower trees, stronger column subsampling and a much larger
    # min_child_weight than the squared-error XGBoost model above.
    regression_xgb_cox_params = {
        "objective": "survival:cox",
        "eval_metric": "cox-nloglik",
        "learning_rate": 0.02,
        "max_depth": 3,
        "colsample_bytree": 0.5,
        "subsample": 0.8,
        "min_child_weight": 80,
        "enable_categorical": True,
        "random_state": SEED,
        "device": "cuda",  # cpu/gpu/cuda
    }
    # https://catboost.ai/docs/en/references/training-parameters/
    # https://catboost.ai/docs/en/concepts/python-reference_catboostregressor
    # https://catboost.ai/docs/en/concepts/python-reference_catboostclassifier
    regression_cat_params = {
        "loss_function": "RMSE",
        "learning_rate": 0.02,
        "iterations": num_boost_round,
        # "depth": 5,
        "grow_policy": "Lossguide",
        "random_seed": SEED,
        "task_type": "GPU",  # CPU/GPU
    }
    # NOTE(review): the Cox variant runs on CPU while the RMSE variant uses GPU —
    # presumably because the Cox loss is not GPU-enabled; confirm against the CatBoost docs.
    regression_cat_cox_params = {
        "loss_function": "Cox",
        "learning_rate": 0.02,
        "iterations": num_boost_round,
        # "depth": 5,
        "grow_policy": "Lossguide",
        "random_seed": SEED,
        "task_type": "CPU",  # CPU/GPU
    }

    # Per-library blend weights. Not referenced by any active code in the visible notebook.
    model_weight_dict = {"lightgbm": 0.40, "xgboost": 0.30, "catboost": 0.30}


In [5]:
# ====================================================
# Seed everything
# ====================================================
# Seed via the project-local helper from seed.py; which RNGs it covers is defined there
# and is not visible in this notebook.
seed_everything(CFG.SEED)


In [6]:
# ====================================================
# Read data
# ====================================================
# Load the competition CSVs as polars DataFrames; try_parse_dates=True asks polars to
# try parsing date-like string columns.
train = pl.read_csv(CFG.DATA_PATH / "train.csv", try_parse_dates=True)
test = pl.read_csv(CFG.DATA_PATH / "test.csv", try_parse_dates=True)
# make index column
# train = train.with_row_index()
# test = test.with_row_index()


In [7]:
# ====================================================
# Preprocess(ここに前処理や特徴量エンジニアリングを記述)
# ====================================================
def transform_survival_probability(df, time_col="efs_time", event_col="efs"):
    """Return the Kaplan-Meier survival estimate evaluated at each row's own time.

    Args:
        df: Frame holding the duration and event-indicator columns.
        time_col: Name of the duration column.
        event_col: Name of the event-observed column.

    Returns:
        The ``.values`` array of the fitted estimator's survival function
        evaluated at ``df[time_col]`` (one value per row).
    """
    durations = df[time_col]
    km_fitter = KaplanMeierFitter()
    km_fitter.fit(durations=durations, event_observed=df[event_col])
    return km_fitter.survival_function_at_times(durations).values


def transform_cumulative_hazard(df, time_col="efs_time", event_col="efs"):
    """Return the negated Nelson-Aalen cumulative hazard at each row's own time.

    Args:
        df: Frame holding the duration and event-indicator columns.
        time_col: Name of the duration column.
        event_col: Name of the event-observed column.

    Returns:
        NumPy array with ``-H(t)`` per row, where ``H`` is the fitted
        cumulative hazard. The sign flip makes larger values correspond to a
        lower accumulated hazard.
    """
    times = df[time_col]
    na_fitter = NelsonAalenFitter()
    na_fitter.fit(times, df[event_col])
    cumulative_hazard = na_fitter.cumulative_hazard_at_times(times).to_numpy()
    return -cumulative_hazard


def preprocess(df: pl.DataFrame) -> pl.DataFrame:
    """Add the engineered columns ``null_count`` and ``is_sex_match``.

    Args:
        df: Raw polars frame; must contain a string column ``sex_match``.

    Returns:
        A new frame with two extra columns: ``null_count`` (number of null
        cells in the row) and ``is_sex_match`` (1 if ``sex_match`` contains
        "M-M" or "F-F", null if ``sex_match`` is null, otherwise 0).
    """
    # Count missing values per row. This is applied first so that the derived
    # column added afterwards is not part of the count.
    null_count_expr = pl.sum_horizontal(pl.all().is_null()).alias("null_count")

    # Donor / recipient sex agreement flag
    sex_match = pl.col("sex_match")
    is_sex_match_expr = (
        pl.when(sex_match.str.contains_any(["M-M", "F-F"]))
        .then(1)
        .when(sex_match.is_null())
        .then(None)
        .otherwise(0)
        .alias("is_sex_match")
    )

    return df.clone().with_columns(null_count_expr).with_columns(is_sex_match_expr)


In [8]:
# Feature engineering is applied to train and test independently
train = preprocess(train)
test = preprocess(test)

# NOTE: both survival-based targets below are fitted on the full training frame,
# i.e. before the CV folds are assigned in the next cell.

# apply Kaplan-Meier
y_kaplan = transform_survival_probability(train, time_col="efs_time", event_col="efs")
train = train.with_columns(pl.Series(y_kaplan).alias("y_kaplan"))

# apply Nelson-Aalen
y_nelson = transform_cumulative_hazard(train, time_col="efs_time", event_col="efs")
train = train.with_columns(pl.Series(y_nelson).alias("y_nelson"))


In [9]:
# ====================================================
# Make fold column
# ====================================================
# Create the race_group_efs column (race group joined with the event flag); it is the
# stratification key named by CFG.stratified_col
train = train.with_columns((pl.col("race_group").cast(str) + "_" + pl.col("efs").cast(str)).alias("race_group_efs"))

# Give every row the 1-based index of the fold in which it serves as a validation row
fold_array = np.zeros(train.height)
skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.SEED)
for fold, (_, val_idx) in enumerate(skf.split(train, train[CFG.stratified_col]), start=1):
    fold_array[val_idx] = fold
train = train.with_columns(pl.Series(fold_array, dtype=pl.Int8).alias("fold"))

# Alternative kept for reference: plain (non-stratified) KFold
# fold_array = np.zeros(train.height)
# kf = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.SEED)
# for fold, (_, val_idx) in enumerate(kf.split(train), start=1):
#     fold_array[val_idx] = fold
# train = train.with_columns(pl.Series(fold_array, dtype=pl.Int8).alias("fold"))


In [10]:
# To pandas
# From here on `train` / `test` are pandas DataFrames; the cells below use pandas APIs.

train = train.to_pandas()
test = test.to_pandas()


In [11]:
# ====================================================
# Column selection
# ====================================================
# Feature columns
# RMV lists identifiers, raw labels, derived targets and CV bookkeeping columns that must
# not be used as model inputs. "efs_time2" is only created in a later cell; listing it here
# keeps it excluded if this cell is re-run afterwards.
RMV = ["ID", "efs", "efs_time", "y_kaplan", "y_nelson", "fold", "race_group_efs", "efs_time2"]
FEATURES = [c for c in train.columns if c not in RMV]
print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")


There are 59 FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10', 'null_count', 'is_sex_match']


In [12]:
# Every object-dtype feature is treated as categorical; its missing values get the
# explicit label "NAN" in both train and test.
CATS = [col for col in FEATURES if train[col].dtype == "object"]
for col in CATS:
    train[col] = train[col].fillna("NAN")
    test[col] = test[col].fillna("NAN")
print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")


In these features, there are 35 CATEGORICAL FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate']


In [13]:
# Stack train and test so categorical codes are assigned consistently across both frames.
# Columns that exist only in train become NaN on the test rows.
combined = pd.concat([train, test], axis=0, ignore_index=True)
# print("Combined data shape:", combined.shape )

# LABEL ENCODE CATEGORICAL FEATURES
print("We LABEL ENCODE the CATEGORICAL FEATURES: ", end="")
for c in FEATURES:
    # LABEL ENCODE CATEGORICAL AND CONVERT TO INT32 CATEGORY
    if c in CATS:
        print(f"{c}, ", end="")
        # factorize() maps each distinct value to an integer code
        combined[c], _ = combined[c].factorize()
        # shift so the smallest code is 0
        combined[c] -= combined[c].min()
        combined[c] = combined[c].astype("int32")
        combined[c] = combined[c].astype("category")

    # REDUCE PRECISION OF NUMERICAL TO 32BIT TO SAVE MEMORY
    else:
        if combined[c].dtype == "float64":
            combined[c] = combined[c].astype("float32")
        if combined[c].dtype == "int64":
            combined[c] = combined[c].astype("int32")

# Split back by position: the first len(train) rows are train, the remainder is test.
# The second line relies on the new `train` having the same length as the old one.
train = combined.iloc[: len(train)].copy()
test = combined.iloc[len(train) :].reset_index(drop=True).copy()


We LABEL ENCODE the CATEGORICAL FEATURES: dri_score, psych_disturb, cyto_score, diabetes, tbi_status, arrhythmia, graft_type, vent_hist, renal_issue, pulm_severe, prim_disease_hct, cmv_status, tce_imm_match, rituximab, prod_type, cyto_score_detail, conditioning_intensity, ethnicity, obesity, mrd_hct, in_vivo_tcd, tce_match, hepatic_severe, prior_tumor, peptic_ulcer, gvhd_proph, rheum_issue, sex_match, race_group, hepatic_mild, tce_div_match, donor_related, melphalan_dose, cardiac, pulm_moderate, 

In [12]:
# # Categorical features
# CATS = []
# cat_count = 0
# for c in FEATURES:
#     if train[c].dtype == pl.String:
#         cat_count += 1
#         CATS.append(c)
# print(f"There are {cat_count} CATEGORICAL FEATURES: {CATS}")


In [13]:
# # Label encode categorical features

# # train_test = pl.concat([train, test], how="diagonal")

# # 250109追記)カテゴリ型に変換するだけで充分かも
# train = train.with_columns(pl.col(CATS).fill_null("NaN").cast(pl.Categorical))
# test = test.with_columns(pl.col(CATS).fill_null("NaN").cast(pl.Categorical))

# for c in CATS:
#     pass
#     # train, testで分けているのはkaggle対策（本来のtestにアクセスできないため）
#     # OrdinalEncoderを使用しているのはtestに未知の値あっても指定の値(-1)に変換できるため
#     # 250109追記）これだと未知の値を全て同じ値として扱ってしまうので、改善が必要かも
#     # oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
#     # train = train.with_columns(
#     #     pl.Series(oe.fit_transform(train[c].fill_null("NaN").to_numpy().reshape(-1, 1)).reshape(-1))
#     #     .cast(pl.String)
#     #     .cast(pl.Categorical)
#     #     .alias(c)
#     # )
#     # test = test.with_columns(
#     #     pl.Series(oe.transform(test[c].fill_null("NaN").to_numpy().reshape(-1, 1)).reshape(-1))
#     #     .cast(pl.String)
#     #     .cast(pl.Categorical)
#     #     .alias(c)
#     # )
#     # # 本来のtestにアクセスできるコンペではtrain, testを結合してLabelEncodeすればよい
#     # le = LabelEncoder()
#     # train_test = train_test.with_columns(
#     #     pl.Series(le.fit_transform(train_test[c].fill_null("NaN")))
#     #     .cast(pl.String)
#     #     .cast(pl.Categorical)
#     #     .alias(c)
#     # )
# # train = train_test.filter(pl.col("fold").is_not_null())
# # test = train_test.filter(pl.col("fold").is_null())


In [14]:
# ====================================================
# Build the target for the survival Cox models
# ====================================================
# create cox model's target
# efs_time2 = efs_time for rows with efs != 0, and -efs_time for rows with efs == 0.
# NOTE(review): a negative label marking a censored row is the convention documented for
# XGBoost's "survival:cox" objective; confirm the same holds for CatBoost's "Cox" loss.

# polars
# train = train.with_columns(
#     pl.when(pl.col("efs") == 0).then(pl.col("efs_time") * -1).otherwise(pl.col("efs_time")).alias("efs_time2")
# )

# pandas
train["efs_time2"] = train.efs_time.copy()
train.loc[train.efs == 0, "efs_time2"] *= -1


In [15]:
# ====================================================
# Training functions
# ====================================================
def lightgbm_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    categorical_features: list,
):
    """Fit one LightGBM regressor with early stopping on the validation split.

    Args:
        x_train, y_train: Training features and target.
        x_valid, y_valid: Validation features and target; used as the only
            eval set for early stopping.
        categorical_features: Column names passed as ``categorical_feature``.

    Returns:
        ``(model, valid_pred)``: the fitted ``LGBMRegressor`` and its
        predictions on ``x_valid``.
    """
    fit_callbacks = [
        lgb.early_stopping(stopping_rounds=CFG.early_stopping_round),
        lgb.log_evaluation(CFG.verbose),
    ]
    regressor = LGBMRegressor(n_estimators=CFG.num_boost_round, **CFG.regression_lgb_params)
    regressor.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        categorical_feature=categorical_features,
        callbacks=fit_callbacks,
    )
    # Predict validation
    return regressor, regressor.predict(x_valid)


def xgboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
):
    """Fit one XGBoost regressor with early stopping on the validation split.

    ``early_stopping_rounds`` is given to the estimator constructor rather than
    to ``fit()``: the ``fit()`` keyword is deprecated and has been removed in
    newer XGBoost releases, while the constructor parameter is supported by
    every release that accepts the ``"device"`` parameter used in
    ``CFG.regression_xgb_params``.

    Args:
        x_train, y_train: Training features and target.
        x_valid, y_valid: Validation features and target; used as the only
            eval set for early stopping.

    Returns:
        ``(model, valid_pred)``: the fitted ``XGBRegressor`` and its
        predictions on ``x_valid``.
    """
    model = XGBRegressor(
        **CFG.regression_xgb_params,
        n_estimators=CFG.num_boost_round,
        early_stopping_rounds=CFG.early_stopping_round,
    )
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=CFG.verbose,
    )
    # Predict validation
    valid_pred = model.predict(x_valid)
    return model, valid_pred


def catboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    categorical_features: list,
):
    """Fit one CatBoost regressor (``CFG.regression_cat_params``) with early stopping.

    Args:
        x_train, y_train: Training features and target.
        x_valid, y_valid: Validation features and target; used as the eval set.
        categorical_features: Column names declared as ``cat_features`` in both pools.

    Returns:
        ``(model, valid_pred)``: the fitted ``CatBoostRegressor`` and its
        predictions on ``x_valid``.
    """
    regressor = CatBoostRegressor(**CFG.regression_cat_params)
    train_pool = Pool(data=x_train, label=y_train, cat_features=categorical_features)
    valid_pool = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
    regressor.fit(
        train_pool,
        eval_set=[valid_pool],
        use_best_model=True,
        early_stopping_rounds=CFG.early_stopping_round,
        verbose=CFG.verbose,
    )
    # Predict validation
    return regressor, regressor.predict(x_valid)


# Cox models
def xgboost_cox_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
):
    """Fit one XGBoost Cox model (``CFG.regression_xgb_cox_params``) with early stopping.

    ``early_stopping_rounds`` is given to the estimator constructor rather than
    to ``fit()``: the ``fit()`` keyword is deprecated and has been removed in
    newer XGBoost releases, while the constructor parameter is supported by
    every release that accepts the ``"device"`` parameter used in the config.

    Args:
        x_train, y_train: Training features and the Cox target.
        x_valid, y_valid: Validation features and target; used as the only
            eval set for early stopping.

    Returns:
        ``(model, valid_pred)``: the fitted ``XGBRegressor`` and its
        predictions on ``x_valid``.
    """
    model = XGBRegressor(
        **CFG.regression_xgb_cox_params,
        n_estimators=CFG.num_boost_round,
        early_stopping_rounds=CFG.early_stopping_round,
    )
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=CFG.verbose,
    )
    # Predict validation
    valid_pred = model.predict(x_valid)
    return model, valid_pred


def catboost_cox_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    categorical_features: list,
):
    """Fit one CatBoost Cox model (``CFG.regression_cat_cox_params``) with early stopping.

    Args:
        x_train, y_train: Training features and the Cox target.
        x_valid, y_valid: Validation features and target; used as the eval set.
        categorical_features: Column names declared as ``cat_features`` in both pools.

    Returns:
        ``(model, valid_pred)``: the fitted ``CatBoostRegressor`` and its
        predictions on ``x_valid``.
    """
    cox_model = CatBoostRegressor(**CFG.regression_cat_cox_params)
    pool_train = Pool(data=x_train, label=y_train, cat_features=categorical_features)
    pool_valid = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
    cox_model.fit(
        pool_train,
        eval_set=[pool_valid],
        use_best_model=True,
        early_stopping_rounds=CFG.early_stopping_round,
        verbose=CFG.verbose,
    )
    # Predict validation
    return cox_model, cox_model.predict(x_valid)


def plot_feature_importance(model, features, method, target_col, fold):
    """Save a bar chart of the 20 most important features of a fitted model.

    Args:
        model: Fitted estimator of the library named by ``method``.
        features: Feature names, in the order the model was trained on.
        method: One of "lightgbm", "xgboost", "xgboost_cox", "catboost", "catboost_cox".
        target_col: Target name (used in the title and file name).
        fold: Fold number (used in the title and file name).

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    # Each library exposes its importances through a different attribute / method
    if method in ("lightgbm", "xgboost", "xgboost_cox"):
        scores = model.feature_importances_
    elif method in ("catboost", "catboost_cox"):
        scores = model.get_feature_importance()
    else:
        raise ValueError(f"Unknown method: {method}")

    importance = pd.DataFrame({"feature": features, "importance": scores})
    top_features = importance.sort_values("importance", ascending=False).head(20)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=top_features, x="importance", y="feature")
    plt.title(f"{method} Feature Importance\nTarget: {target_col}, Fold: {fold}")
    plt.tight_layout()

    # Create the output directory if needed, then write the figure and release it
    save_dir = CFG.OUTPUT_DIR / "feature_importance"
    save_dir.mkdir(parents=True, exist_ok=True)
    figure_name = f"feature_importance_{method}_{target_col}_fold{fold}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.png"
    plt.savefig(save_dir / figure_name)
    plt.close()


def gradient_boosting_model_cv_training(
    method: str, train_df: pd.DataFrame, target_col_list: list, features: list, categorical_features: list
):
    """Run K-fold training of one model family for each target and report the OOF score.

    For every target in ``target_col_list`` and every fold ``1..CFG.n_folds``
    this trains a model on the rows whose ``fold`` differs from the current
    fold, predicts the held-out rows, pickles the model under
    ``CFG.MODEL_PATH`` and finally writes the out-of-fold predictions to
    ``CFG.OUTPUT_DIR`` and prints the value returned by ``score``.

    Args:
        method: "lightgbm", "xgboost", "catboost", "xgboost_cox" or "catboost_cox".
        train_df: Training frame containing ``features``, every target column,
            and the columns "fold", "ID", "efs", "efs_time" and "race_group".
        target_col_list: Target column names to train on, one CV run each.
        features: Feature column names.
        categorical_features: Categorical feature names (used by the LightGBM
            and CatBoost trainers).

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    for target_col in target_col_list:
        # Out-of-fold predictions for this target, aligned with the rows of train_df
        oof_predictions = np.zeros(len(train_df))
        for fold in range(CFG.n_folds):
            print("-" * 50)
            print(f"{method} training fold {fold+1} {target_col}")
            # Compute the validation mask once per fold instead of once per slice
            is_valid = train_df["fold"] == fold + 1
            x_train = train_df[~is_valid][features]
            y_train = train_df[~is_valid][target_col]
            x_valid = train_df[is_valid][features]
            y_valid = train_df[is_valid][target_col]

            if method == "lightgbm":
                model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, categorical_features)
            elif method == "xgboost":
                model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid)
            elif method == "catboost":
                model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, categorical_features)
            # Cox models
            elif method == "xgboost_cox":
                model, valid_pred = xgboost_cox_training(x_train, y_train, x_valid, y_valid)
            elif method == "catboost_cox":
                model, valid_pred = catboost_cox_training(x_train, y_train, x_valid, y_valid, categorical_features)
            else:
                raise ValueError(f"Unknown method: {method}")

            # Plot feature importance for the last fold only
            if fold == CFG.n_folds - 1:
                plot_feature_importance(model, features, method, target_col, fold + 1)

            # Save best model
            save_model_path = (
                CFG.MODEL_PATH / f"{method}_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
            )
            save_model_path.parent.mkdir(parents=True, exist_ok=True)
            # The context manager guarantees the file is flushed and closed, even on error
            with open(save_model_path, "wb") as model_file:
                pickle.dump(model, model_file)

            # Add to out of folds array
            oof_predictions[is_valid.to_numpy()] = valid_pred
            del x_train, x_valid, y_train, y_valid, model, valid_pred
            gc.collect()

        # Create a dataframe to store out of folds predictions
        oof_predictions_df = pd.DataFrame()
        oof_predictions_df["ID"] = train_df["ID"].values
        oof_predictions_df["prediction"] = oof_predictions
        oof_predictions_df.to_csv(
            CFG.OUTPUT_DIR / f"oof_{method}_{target_col}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv", index=False
        )

        # Compute out of folds metric
        y_true = train_df[["ID", "efs", "efs_time", "race_group"]].copy()
        m = score(y_true.copy(), oof_predictions_df.copy(), "ID")
        print("=" * 50)
        print(f"{method} our out of folds CV score is {m}")
        print("=" * 50)


In [16]:
# ====================================================
# Training
# ====================================================
# for method in CFG.METHOD_LIST:
#     gradient_boosting_model_cv_training(method, train, CFG.target_col_list, FEATURES, CATS)

# The two model groups are trained separately because they use different targets:
# regression models fit CFG.target_col_list, Cox models fit CFG.cox_target_col_list.

# kaplan-meier & nelson-aalen models
for method in ["lightgbm", "xgboost", "catboost"]:
    gradient_boosting_model_cv_training(method, train, CFG.target_col_list, FEATURES, CATS)
# Cox models
for method in ["xgboost_cox", "catboost_cox"]:
    gradient_boosting_model_cv_training(method, train, CFG.cox_target_col_list, FEATURES, CATS)


--------------------------------------------------
lightgbm training fold 1 y_kaplan


[LightGBM] [Info] Total Bins 882
[LightGBM] [Info] Number of data points in the train set: 25920, number of used features: 59
[LightGBM] [Info] Start training from score 0.606188
Training until validation scores don't improve for 500 rounds
[500]	valid_0's l2: 0.0245494
[1000]	valid_0's l2: 0.0243226
[1500]	valid_0's l2: 0.0242774
Early stopping, best iteration is:
[1377]	valid_0's l2: 0.0242553
--------------------------------------------------
lightgbm training fold 2 y_kaplan
[LightGBM] [Info] Total Bins 884
[LightGBM] [Info] Number of data points in the train set: 25920, number of used features: 59
[LightGBM] [Info] Start training from score 0.606660
Training until validation scores don't improve for 500 rounds
[500]	valid_0's l2: 0.0238314
[1000]	valid_0's l2: 0.0236817
Early stopping, best iteration is:
[978]	valid_0's l2: 0.0236652
--------------------------------------------------
lightgbm training fold 3 y_kaplan
[LightGBM] [Info] Total Bins 884
[LightGBM] [Info] Number of dat

In [18]:
# ====================================================
# Overall CV
# ====================================================
# Reload the out-of-fold predictions written by gradient_boosting_model_cv_training.
# Each file has an "ID" and a "prediction" column; only "prediction" is kept, as a NumPy array.
# The oof_* names defined here are also read by ensemble_score further down.

# kaplan-meier models
oof_lgb_kaplan = (
    pl.read_csv(CFG.OUTPUT_DIR / f"oof_lightgbm_y_kaplan_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv")
    .get_column("prediction")
    .to_numpy()
)
oof_xgb_kaplan = (
    pl.read_csv(CFG.OUTPUT_DIR / f"oof_xgboost_y_kaplan_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv")
    .get_column("prediction")
    .to_numpy()
)
oof_cat_kaplan = (
    pl.read_csv(CFG.OUTPUT_DIR / f"oof_catboost_y_kaplan_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv")
    .get_column("prediction")
    .to_numpy()
)

# nelson-aalen models
oof_lgb_nelson = (
    pl.read_csv(CFG.OUTPUT_DIR / f"oof_lightgbm_y_nelson_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv")
    .get_column("prediction")
    .to_numpy()
)
oof_xgb_nelson = (
    pl.read_csv(CFG.OUTPUT_DIR / f"oof_xgboost_y_nelson_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv")
    .get_column("prediction")
    .to_numpy()
)
oof_cat_nelson = (
    pl.read_csv(CFG.OUTPUT_DIR / f"oof_catboost_y_nelson_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv")
    .get_column("prediction")
    .to_numpy()
)
# Cox models
oof_cox_xgb = (
    pl.read_csv(CFG.OUTPUT_DIR / f"oof_xgboost_cox_efs_time2_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv")
    .get_column("prediction")
    .to_numpy()
)
oof_cox_cat = (
    pl.read_csv(CFG.OUTPUT_DIR / f"oof_catboost_cox_efs_time2_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv")
    .get_column("prediction")
    .to_numpy()
)

# # polars
# y_true = train[["ID", "efs", "efs_time", "race_group"]].clone()
# y_pred = train[["ID"]].clone()
# ensamble_prediction = (
#     rankdata(oof_xgb_kaplan)
#     + rankdata(oof_cat_kaplan)
#     + rankdata(oof_lgb_kaplan)
#     + rankdata(oof_lgb_nelson)
#     + rankdata(oof_xgb_nelson)
#     + rankdata(oof_cat_nelson)
#     + rankdata(oof_cox_xgb)
#     + rankdata(oof_cox_cat)
# )
# y_pred = y_pred.with_columns(pl.Series(ensamble_prediction).alias("prediction"))
# m = score(y_true.to_pandas().copy(), y_pred.to_pandas().copy(), "ID")
# print("\nOverall CV for Ensemble =", m)

# pandas
# Equal-weight rank ensemble: each model's OOF predictions are converted to ranks
# (so models with different output scales become comparable) and the ranks are summed.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].copy()
ensamble_prediction = (
    rankdata(oof_xgb_kaplan)
    + rankdata(oof_cat_kaplan)
    + rankdata(oof_lgb_kaplan)
    + rankdata(oof_lgb_nelson)
    + rankdata(oof_xgb_nelson)
    + rankdata(oof_cat_nelson)
    + rankdata(oof_cox_xgb)
    + rankdata(oof_cox_cat)
)
y_pred["prediction"] = ensamble_prediction
# score() comes from the project-local metric.py; copies are passed so the originals stay untouched
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for Ensemble =", m)



Overall CV for Ensemble = 0.6813499611954243


In [20]:
def ensemble_score(weights):
    """Objective for the weight search: negated CV score of a weighted rank blend.

    Args:
        weights: Sequence of 8 weights, in the order LightGBM/XGBoost/CatBoost
            for the Kaplan-Meier target, the same three for the Nelson-Aalen
            target, then the XGBoost and CatBoost Cox models.

    Returns:
        ``-score(...)`` for the blended out-of-fold predictions, so that a
        minimiser maximises the score.
    """
    # Rank-transform every model's out-of-fold predictions (reads the module-level oof_* arrays)
    ranked_oofs = [
        rankdata(oof)
        for oof in (
            oof_lgb_kaplan,
            oof_xgb_kaplan,
            oof_cat_kaplan,
            oof_lgb_nelson,
            oof_xgb_nelson,
            oof_cat_nelson,
            oof_cox_xgb,
            oof_cox_cat,
        )
    ]

    # Weighted sum of the ranks, accumulated left to right
    weighted_pred = weights[0] * ranked_oofs[0]
    for idx in range(1, len(ranked_oofs)):
        weighted_pred = weighted_pred + weights[idx] * ranked_oofs[idx]

    y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
    y_pred = pd.DataFrame({"ID": train["ID"], "prediction": weighted_pred})

    return -score(y_true.copy(), y_pred.copy(), "ID")


# Start from equal weights for the 8 models
initial_weights = [1 / 8] * 8

# Run the optimisation. Nelder-Mead is unconstrained here, so the weights are neither
# forced to be non-negative nor to sum to 1. The weights are fitted on the same
# out-of-fold predictions that are used to score them, and the resulting weights are
# not used by the submission cell in this notebook.
result = minimize(ensemble_score, initial_weights, method="Nelder-Mead")

# result.fun is the minimised objective, i.e. the negated score returned by ensemble_score
print("最適な重み:", result.x)
print("最適化後のスコア:", result.fun)


最適な重み: [ 0.12081769  0.10278806  0.04831965  0.12423459 -0.06554445  0.2514433
  0.36799918  0.1182188 ]
最適化後のスコア: -0.6825346392733045


In [19]:
# ====================================================
# Inference functions
# ====================================================
def lightgbm_inference(x_test: pd.DataFrame, target_col: str):
    """Average the predictions of the saved per-fold LightGBM models.

    Args:
        x_test: Feature frame to predict on.
        target_col: Target name the models were trained on (part of the file name).

    Returns:
        Array of length ``len(x_test)``: the mean prediction over ``CFG.n_folds`` models.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_PATH / f"lightgbm_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
        # Close the file handle deterministically instead of leaking one per fold.
        # NOTE: pickle.load can execute arbitrary code; only load files written by this pipeline.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        # Predict
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds


def xgboost_inference(x_test: pd.DataFrame, target_col: str):
    """Average the predictions of the saved per-fold XGBoost models.

    Args:
        x_test: Feature frame to predict on.
        target_col: Target name the models were trained on (part of the file name).

    Returns:
        Array of length ``len(x_test)``: the mean prediction over ``CFG.n_folds`` models.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_PATH / f"xgboost_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
        # Close the file handle deterministically instead of leaking one per fold.
        # NOTE: pickle.load can execute arbitrary code; only load files written by this pipeline.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        # Predict (the sklearn-style estimator accepts the DataFrame directly)
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds


def catboost_inference(x_test: pd.DataFrame, target_col: str):
    """Average the predictions of the saved per-fold CatBoost models.

    Args:
        x_test: Feature frame to predict on.
        target_col: Target name the models were trained on (part of the file name).

    Returns:
        Array of length ``len(x_test)``: the mean prediction over ``CFG.n_folds`` models.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_PATH / f"catboost_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
        # Close the file handle deterministically instead of leaking one per fold.
        # NOTE: pickle.load can execute arbitrary code; only load files written by this pipeline.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        # Predict
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds


# Cox models
def xgboost_cox_inference(x_test: pd.DataFrame, target_col: str):
    """Average the predictions of the saved per-fold XGBoost Cox models.

    Args:
        x_test: Feature frame to predict on.
        target_col: Accepted for a signature parallel to the other inference
            helpers but not used: the model file name hard-codes "efs_time2".

    Returns:
        Array of length ``len(x_test)``: the mean prediction over ``CFG.n_folds`` models.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_PATH / f"xgboost_cox_efs_time2_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
        # Close the file handle deterministically instead of leaking one per fold.
        # NOTE: pickle.load can execute arbitrary code; only load files written by this pipeline.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        # Predict
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds


def catboost_cox_inference(x_test: pd.DataFrame, target_col: str):
    """Average the predictions of the saved per-fold CatBoost Cox models.

    Args:
        x_test: Feature frame to predict on.
        target_col: Accepted for a signature parallel to the other inference
            helpers but not used: the model file name hard-codes "efs_time2".

    Returns:
        Array of length ``len(x_test)``: the mean prediction over ``CFG.n_folds`` models.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_PATH / f"catboost_cox_efs_time2_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
        # Close the file handle deterministically instead of leaking one per fold.
        # NOTE: pickle.load can execute arbitrary code; only load files written by this pipeline.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        # Predict
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds


def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list, target_col: str):
    """Dispatch to the fold-averaged inference helper of one model family.

    Args:
        method: "lightgbm", "xgboost", "catboost", "xgboost_cox" or "catboost_cox".
        test_df: Frame to predict on; must contain every column in ``features``.
        features: Feature column names, in training order.
        target_col: Target name the models were trained on.

    Returns:
        The array returned by the selected ``*_inference`` helper.

    Raises:
        ValueError: If ``method`` is not one of the supported names. Previously
            an unknown name fell through every branch and failed with an
            ``UnboundLocalError`` at the ``return`` statement.
    """
    x_test = test_df[features]
    # One mutually exclusive chain, mirroring the dispatcher used for training
    if method == "lightgbm":
        test_pred = lightgbm_inference(x_test, target_col)
    elif method == "xgboost":
        test_pred = xgboost_inference(x_test, target_col)
    elif method == "catboost":
        test_pred = catboost_inference(x_test, target_col)
    # Cox models
    elif method == "xgboost_cox":
        test_pred = xgboost_cox_inference(x_test, target_col)
    elif method == "catboost_cox":
        test_pred = catboost_cox_inference(x_test, target_col)
    else:
        raise ValueError(f"Unknown method: {method}")
    return test_pred


def predicting(method_list: list, input_df: pd.DataFrame, target_col_list: list, features: list):
    """Add one prediction column per (method, target) pair to a copy of ``input_df``.

    Args:
        method_list: Model family names understood by ``gradient_boosting_model_inference``.
        input_df: Frame to predict on; it is not modified.
        target_col_list: Target names whose saved models should be applied.
        features: Feature column names.

    Returns:
        Copy of ``input_df`` with extra columns named ``"{method}_pred_{target_col}"``.
    """
    result_df = input_df.copy()
    for target_name in target_col_list:
        for method_name in method_list:
            # Predictions are always computed from the untouched input frame
            fold_avg_pred = gradient_boosting_model_inference(method_name, input_df, features, target_name)
            result_df[f"{method_name}_pred_{target_name}"] = fold_avg_pred
        # A weighted per-target blend via CFG.model_weight_dict is intentionally not computed here.
    return result_df


In [20]:
# ====================================================
# Inference
# ====================================================
# Predict the test set with every saved fold model, then combine the 8 fold-averaged
# predictions by summing their ranks, with the same equal weighting as the Overall CV cell.

# kaplan-meier & nelson-aalen models
output_df = predicting(["lightgbm", "xgboost", "catboost"], test, CFG.target_col_list, FEATURES)
pred_lgb_kaplan = output_df["lightgbm_pred_y_kaplan"]
pred_xgb_kaplan = output_df["xgboost_pred_y_kaplan"]
pred_cat_kaplan = output_df["catboost_pred_y_kaplan"]
pred_lgb_nelson = output_df["lightgbm_pred_y_nelson"]
pred_xgb_nelson = output_df["xgboost_pred_y_nelson"]
pred_cat_nelson = output_df["catboost_pred_y_nelson"]
# Cox models
cox_output_df = predicting(["xgboost_cox", "catboost_cox"], test, CFG.cox_target_col_list, FEATURES)
pred_cox_xgb = cox_output_df["xgboost_cox_pred_efs_time2"]
pred_cox_cat = cox_output_df["catboost_cox_pred_efs_time2"]

# NOTE(review): predictions are written positionally into the sample submission, which
# assumes test.csv and sample_submission.csv list the rows in the same order — confirm.
submission = pd.read_csv(CFG.DATA_PATH / "sample_submission.csv")
submission["prediction"] = (
    rankdata(pred_lgb_kaplan)
    + rankdata(pred_xgb_kaplan)
    + rankdata(pred_cat_kaplan)
    + rankdata(pred_lgb_nelson)
    + rankdata(pred_xgb_nelson)
    + rankdata(pred_cat_nelson)
    + rankdata(pred_cox_xgb)
    + rankdata(pred_cox_cat)
)
submission.to_csv(CFG.OUTPUT_DIR / "submission.csv", index=False)
print("Sub shape:", submission.shape)
print(submission.head())


Sub shape: (3, 2)
      ID  prediction
0  28800        16.0
1  28801        24.0
2  28802         8.0
