In [1]:
import gc
import itertools
import os
import pickle
import random
import sys
import warnings
from glob import glob
from pathlib import Path

import config  # edit config.py as needed
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
import scipy as sp
import torch
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from lifelines import CoxPHFitter, KaplanMeierFitter, NelsonAalenFitter
from lightgbm import LGBMClassifier, LGBMRegressor
from metric import score  # edit metric.py as needed
from scipy.optimize import minimize
from scipy.stats import rankdata
from seed import seed_everything  # edit seed.py as needed
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
from xgboost import XGBClassifier, XGBRegressor

warnings.filterwarnings("ignore")


In [3]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    """Notebook-wide settings: paths, CV setup, and per-library booster parameters.

    Paths and experiment names are read from the local ``config`` module.

    NOTE(review): the Cox training helpers in this notebook read
    ``CFG.regression_xgb_cox_params`` and ``CFG.regression_cat_cox_params``, and the
    commented-out Cox training loop reads ``CFG.cox_target_col_list``. None of these
    is defined here, so they must be added before the Cox path can run.
    """

    # When True, the fold count, boosting rounds and early-stopping patience below shrink for a quick smoke run.
    DRY_RUN = False
    EXP_NAME = config.EXP_NAME
    AUTHOR = "marumarukun"
    COMPETITION = config.KAGGLE_COMPETITION_NAME
    DATA_PATH = config.COMP_DATASET_DIR
    OUTPUT_DIR = config.OUTPUT_DIR
    MODEL_PATH = config.OUTPUT_DIR / "models"  # use this one while building models / experimenting
    # MODEL_PATH = config.ARTIFACT_EXP_DIR(config.EXP_NAME) / "models"  # use this one at submission time
    # Methods iterated by predicting() when loading fold models.
    METHOD_LIST = ["lightgbm", "xgboost", "catboost"]
    SEED = 42
    n_folds = 2 if DRY_RUN else 10
    # Target column(s) the boosters regress on; "y_cox" is created by the CoxPHFitter cell.
    target_col_list = ["y_cox"]
    # cox_target_col_list = ["efs_time2"]
    # group_col = "race_group"  # Required for GroupKFold (edit as needed)
    stratified_col = "race_group_efs"  # Required for StratifiedKFold (edit as needed)
    # Upper bound on boosting rounds; early stopping is what ends training in practice.
    num_boost_round = 100 if DRY_RUN else 1000000
    early_stopping_round = 10 if DRY_RUN else 500  # set to 10 / learning_rate
    # Logging period (in boosting rounds) passed to each library.
    verbose = 500

    # https://lightgbm.readthedocs.io/en/latest/Parameters.html
    # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
    # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html
    regression_lgb_params = {
        "objective": "regression",
        # "metric": "mae",
        "learning_rate": 0.02,
        "max_depth": 5,
        "min_child_weight": 1,
        "colsample_bytree": 0.8,
        "subsample": 0.8,
        "subsample_freq": 1,
        "seed": SEED,
        "device": "cuda",  # cpu/gpu/cuda
    }
    # https://xgboost.readthedocs.io/en/stable/parameter.html
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBRegressor
    # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier
    regression_xgb_params = {
        "objective": "reg:squarederror",
        # "eval_metric": "mae",
        "learning_rate": 0.02,
        "max_depth": 5,
        "colsample_bytree": 0.8,
        "subsample": 0.8,
        "min_child_weight": 1,
        "enable_categorical": True,
        "random_state": SEED,
        "device": "cuda",  # cpu/gpu/cuda
    }
    # https://catboost.ai/docs/en/references/training-parameters/
    # https://catboost.ai/docs/en/concepts/python-reference_catboostregressor
    # https://catboost.ai/docs/en/concepts/python-reference_catboostclassifier
    regression_cat_params = {
        "loss_function": "RMSE",
        "learning_rate": 0.02,
        "iterations": num_boost_round,
        # "depth": 5,
        "grow_policy": "Lossguide",
        "random_seed": SEED,
        "task_type": "GPU",  # CPU/GPU
    }


In [4]:
# ====================================================
# Seed everything
# ====================================================
# seed_everything is imported from the local seed.py (not visible here); presumably it
# seeds the RNGs of the libraries in use so runs are repeatable — TODO confirm.
seed_everything(CFG.SEED)


In [5]:
# ====================================================
# Read data
# ====================================================
# Load both competition CSVs with polars, letting it parse date-like columns.
train, test = (
    pl.read_csv(CFG.DATA_PATH / csv_name, try_parse_dates=True) for csv_name in ("train.csv", "test.csv")
)
# Optional: add an explicit row-index column.
# train = train.with_row_index()
# test = test.with_row_index()


In [6]:
# ====================================================
# Make fold column
# ====================================================
# Build the stratification key "<race_group>_<efs>" as its own column.
race_efs_key = pl.col("race_group").cast(str) + "_" + pl.col("efs").cast(str)
train = train.with_columns(race_efs_key.alias("race_group_efs"))

# Assign every row the 1-based number of the fold in which it is held out.
skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.SEED)
fold_array = np.zeros(train.height)
for fold_no, split in enumerate(skf.split(train, train[CFG.stratified_col]), start=1):
    holdout_rows = split[1]
    fold_array[holdout_rows] = fold_no
fold_series = pl.Series(fold_array, dtype=pl.Int8)
train = train.with_columns(fold_series.alias("fold"))

# Plain (non-stratified) KFold alternative:
# fold_array = np.zeros(train.height)
# kf = KFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.SEED)
# for fold, (_, val_idx) in enumerate(kf.split(train), start=1):
#     fold_array[val_idx] = fold
# train = train.with_columns(pl.Series(fold_array, dtype=pl.Int8).alias("fold"))


In [7]:
# Switch from polars to pandas; every later cell uses the pandas API.
train, test = train.to_pandas(), test.to_pandas()


In [8]:
# ====================================================
# Set categorical columns etc. (pandas operation from here)
# ====================================================
# Columns that must never be used as model inputs: identifiers, survival labels,
# derived targets and fold bookkeeping. "y_cox" is listed so that re-running this
# cell after the CoxPHFitter cell has added it to `train` cannot leak the training
# target into FEATURES.
RMV = ["ID", "efs", "efs_time", "y_kaplan", "y_nelson", "y_cox", "fold", "race_group_efs"]
FEATURES = [c for c in train.columns if c not in RMV]
print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")


There are 57 FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'hla_match_c_high', 'hla_high_res_8', 'tbi_status', 'arrhythmia', 'hla_low_res_6', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'hla_high_res_6', 'cmv_status', 'hla_high_res_10', 'hla_match_dqb1_high', 'tce_imm_match', 'hla_nmdp_6', 'hla_match_c_low', 'rituximab', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'year_hct', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hla_match_a_high', 'hepatic_severe', 'donor_age', 'prior_tumor', 'hla_match_b_low', 'peptic_ulcer', 'age_at_hct', 'hla_match_a_low', 'gvhd_proph', 'rheum_issue', 'sex_match', 'hla_match_b_high', 'race_group', 'comorbidity_score', 'karnofsky_score', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'hla_low_res_8', 'cardiac', 'hla_match_drb1_high', 'pulm_moderate', 'hla_low_res_10']


In [9]:
# Object-dtype feature columns are treated as categorical.
CATS = [c for c in FEATURES if train[c].dtype == "object"]
# Give missing values an explicit "NAN" level in both frames.
for c in CATS:
    train[c] = train[c].fillna("NAN")
    test[c] = test[c].fillna("NAN")
print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")


In these features, there are 35 CATEGORICAL FEATURES: ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate']


In [10]:
# Stack train and test so both share one integer code per categorical level.
combined = pd.concat([train, test], axis=0, ignore_index=True)
# print("Combined data shape:", combined.shape )

print("We LABEL ENCODE the CATEGORICAL FEATURES: ", end="")
for c in FEATURES:
    if c in CATS:
        # Categorical: factorize, shift so the smallest code is 0, store as int32-backed category.
        print(f"{c}, ", end="")
        level_codes = pd.Series(combined[c].factorize()[0], index=combined.index)
        level_codes = level_codes - level_codes.min()
        combined[c] = level_codes.astype("int32").astype("category")
    elif combined[c].dtype == "float64":
        # Numerical: halve the precision to save memory.
        combined[c] = combined[c].astype("float32")
    elif combined[c].dtype == "int64":
        combined[c] = combined[c].astype("int32")

# Split the stacked frame back into its train and test parts.
n_train_rows = len(train)
train = combined.iloc[:n_train_rows].copy()
test = combined.iloc[n_train_rows:].reset_index(drop=True).copy()


We LABEL ENCODE the CATEGORICAL FEATURES: dri_score, psych_disturb, cyto_score, diabetes, tbi_status, arrhythmia, graft_type, vent_hist, renal_issue, pulm_severe, prim_disease_hct, cmv_status, tce_imm_match, rituximab, prod_type, cyto_score_detail, conditioning_intensity, ethnicity, obesity, mrd_hct, in_vivo_tcd, tce_match, hepatic_severe, prior_tumor, peptic_ulcer, gvhd_proph, rheum_issue, sex_match, race_group, hepatic_mild, tce_div_match, donor_related, melphalan_dose, cardiac, pulm_moderate, 

In [11]:
# ====================================================
# CoxPHFitter
# ====================================================
# Fit a penalized Cox proportional-hazards model on the training rows and store its
# partial hazard as the regression target "y_cox" for the boosters.

# Design matrix: everything in train except identifier / fold bookkeeping columns.
data = train.drop(columns=["ID", "fold", "race_group_efs"])
# On a re-run `train` already carries the previous "y_cox"; drop it so the old target
# is never fed back into the Cox fit as a covariate (keeps this cell idempotent).
data = data.drop(columns=["y_cox"], errors="ignore")

# Fill missing values in the non-categorical columns with -1.
for c in data.columns:
    if c not in CATS:
        data[c] = data[c].fillna(-1)

# One-hot encode the categorical columns (first level dropped).
data = pd.get_dummies(data, columns=CATS, drop_first=True)

# Drop constant columns if they exist
data = data.loc[:, data.nunique() > 1]

cph = CoxPHFitter(penalizer=0.01)
cph.fit(data, duration_col="efs_time", event_col="efs")

train["y_cox"] = cph.predict_partial_hazard(data)



In [12]:
# Score the CoxPHFitter target itself with the competition metric.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_target = train[["ID"]].assign(prediction=train["y_cox"])
m = score(y_true.copy(), y_target.copy(), "ID")
print("\nScore of target created by CoxPHFitter =", m)



Score of target created by CoxPHFitter = 0.6611743782604956


In [13]:
# ====================================================
# Training functions
# ====================================================
def lightgbm_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    categorical_features: list,
):
    """Fit one LightGBM regressor with early stopping on the validation fold.

    Hyper-parameters come from ``CFG.regression_lgb_params``; the round budget,
    early-stopping patience and log period come from ``CFG`` as well.

    Returns:
        (model, valid_pred): the fitted ``LGBMRegressor`` and its predictions
        for ``x_valid``.
    """
    fit_callbacks = [
        lgb.early_stopping(stopping_rounds=CFG.early_stopping_round),
        lgb.log_evaluation(CFG.verbose),
    ]
    regressor = LGBMRegressor(n_estimators=CFG.num_boost_round, **CFG.regression_lgb_params)
    regressor.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        categorical_feature=categorical_features,
        callbacks=fit_callbacks,
    )
    return regressor, regressor.predict(x_valid)


def xgboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
):
    """Fit one XGBoost regressor with early stopping on the validation fold.

    Hyper-parameters come from ``CFG.regression_xgb_params``. Early stopping is
    configured on the estimator rather than in ``fit()``, because recent XGBoost
    releases no longer accept ``early_stopping_rounds`` as a ``fit()`` argument.

    Returns:
        (model, valid_pred): the fitted ``XGBRegressor`` and its predictions
        for ``x_valid``.
    """
    model = XGBRegressor(
        **CFG.regression_xgb_params,
        n_estimators=CFG.num_boost_round,
        early_stopping_rounds=CFG.early_stopping_round,
    )
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=CFG.verbose,
    )
    # Predict validation
    valid_pred = model.predict(x_valid)
    return model, valid_pred


def catboost_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    categorical_features: list,
):
    """Fit one CatBoost regressor with early stopping on the validation fold.

    Both folds are wrapped in ``Pool`` objects that declare the categorical
    columns; hyper-parameters come from ``CFG.regression_cat_params``.

    Returns:
        (model, valid_pred): the fitted ``CatBoostRegressor`` and its
        predictions for ``x_valid``.
    """
    train_pool = Pool(data=x_train, label=y_train, cat_features=categorical_features)
    valid_pool = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
    fit_options = {
        "eval_set": [valid_pool],
        "early_stopping_rounds": CFG.early_stopping_round,
        "verbose": CFG.verbose,
        "use_best_model": True,
    }
    regressor = CatBoostRegressor(**CFG.regression_cat_params)
    regressor.fit(train_pool, **fit_options)
    return regressor, regressor.predict(x_valid)


# Cox models
def xgboost_cox_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
):
    """Fit one XGBoost model using the Cox parameter set, with early stopping.

    Hyper-parameters are read from ``CFG.regression_xgb_cox_params``.
    NOTE(review): that attribute is not defined on ``CFG`` in this notebook, so it
    must be added before this function can run.

    Early stopping is configured on the estimator rather than in ``fit()``, because
    recent XGBoost releases no longer accept ``early_stopping_rounds`` as a
    ``fit()`` argument.

    Returns:
        (model, valid_pred): the fitted ``XGBRegressor`` and its predictions
        for ``x_valid``.
    """
    model = XGBRegressor(
        **CFG.regression_xgb_cox_params,
        n_estimators=CFG.num_boost_round,
        early_stopping_rounds=CFG.early_stopping_round,
    )
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=CFG.verbose,
    )
    # Predict validation
    valid_pred = model.predict(x_valid)
    return model, valid_pred


def catboost_cox_training(
    x_train: pd.DataFrame,
    y_train: pd.DataFrame,
    x_valid: pd.DataFrame,
    y_valid: pd.DataFrame,
    categorical_features: list,
):
    """Fit one CatBoost model using the Cox parameter set, with early stopping.

    Hyper-parameters are read from ``CFG.regression_cat_cox_params``.
    NOTE(review): that attribute is not defined on ``CFG`` in this notebook, so it
    must be added before this function can run.

    Returns:
        (model, valid_pred): the fitted ``CatBoostRegressor`` and its
        predictions for ``x_valid``.
    """
    train_pool = Pool(data=x_train, label=y_train, cat_features=categorical_features)
    valid_pool = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)
    fit_options = {
        "eval_set": [valid_pool],
        "early_stopping_rounds": CFG.early_stopping_round,
        "verbose": CFG.verbose,
        "use_best_model": True,
    }
    regressor = CatBoostRegressor(**CFG.regression_cat_cox_params)
    regressor.fit(train_pool, **fit_options)
    return regressor, regressor.predict(x_valid)


def gradient_boosting_model_cv_training(
    method: str, train_df: pd.DataFrame, target_col_list: list, features: list, categorical_features: list
):
    """Cross-validated training of one boosting method, for each target column.

    For every target and every fold 1..CFG.n_folds, trains on the rows whose
    ``fold`` differs from the current fold, validates on the rows whose ``fold``
    matches, pickles the fitted model under ``CFG.MODEL_PATH`` and collects the
    out-of-fold predictions. Per target, the OOF predictions are written to a CSV
    under ``CFG.OUTPUT_DIR`` and scored with ``score``; the result is printed.

    Args:
        method: one of "lightgbm", "xgboost", "catboost", "xgboost_cox", "catboost_cox".
        train_df: training frame; must contain "ID", "fold", "efs", "efs_time",
            "race_group", the feature columns and every target column.
        target_col_list: names of the target columns to train on.
        features: feature column names.
        categorical_features: categorical feature names (used by LightGBM/CatBoost).

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    for target_col in target_col_list:
        # Out-of-fold predictions for this target, aligned with train_df's row order.
        oof_predictions = np.zeros(len(train_df))
        for fold in range(CFG.n_folds):
            fold_id = fold + 1
            print("-" * 50)
            print(f"{method} training fold {fold_id} {target_col}")
            # Compute the validation mask once and reuse it for every split and for the OOF write.
            valid_mask = (train_df["fold"] == fold_id).to_numpy()
            x_train = train_df.loc[~valid_mask, features]
            y_train = train_df.loc[~valid_mask, target_col]
            x_valid = train_df.loc[valid_mask, features]
            y_valid = train_df.loc[valid_mask, target_col]
            if method == "lightgbm":
                model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid, categorical_features)
            elif method == "xgboost":
                model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid)
            elif method == "catboost":
                model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid, categorical_features)
            # Cox models
            elif method == "xgboost_cox":
                model, valid_pred = xgboost_cox_training(x_train, y_train, x_valid, y_valid)
            elif method == "catboost_cox":
                model, valid_pred = catboost_cox_training(x_train, y_train, x_valid, y_valid, categorical_features)
            else:
                raise ValueError(f"Unknown method: {method}")

            # Save best model; the context manager guarantees the file handle is closed.
            save_model_path = (
                CFG.MODEL_PATH / f"{method}_{target_col}_fold{fold_id}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
            )
            save_model_path.parent.mkdir(parents=True, exist_ok=True)
            with open(save_model_path, "wb") as model_file:
                pickle.dump(model, model_file)

            # Add to out of folds array
            oof_predictions[valid_mask] = valid_pred
            del x_train, x_valid, y_train, y_valid, model, valid_pred
            gc.collect()

        # Create a dataframe to store out of folds predictions
        oof_predictions_df = pd.DataFrame()
        oof_predictions_df["ID"] = train_df["ID"].values
        oof_predictions_df["prediction"] = oof_predictions
        oof_path = CFG.OUTPUT_DIR / f"oof_{method}_{target_col}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.csv"
        # MODEL_PATH is not necessarily inside OUTPUT_DIR, so make sure the target directory exists.
        oof_path.parent.mkdir(parents=True, exist_ok=True)
        oof_predictions_df.to_csv(oof_path, index=False)

        # Compute out of folds metric
        y_true = train_df[["ID", "efs", "efs_time", "race_group"]].copy()
        m = score(y_true.copy(), oof_predictions_df.copy(), "ID")
        print("=" * 50)
        print(f"{method} our out of folds CV score is {m}")
        print("=" * 50)


In [None]:
# ====================================================
# Training
# ====================================================
# Train every method listed in CFG.METHOD_LIST. predicting() iterates the same list
# when loading fold models, so driving training from it keeps the two in sync.
for method in CFG.METHOD_LIST:
    gradient_boosting_model_cv_training(method, train, CFG.target_col_list, FEATURES, CATS)

# Cox models (disabled): these need CFG.cox_target_col_list and the Cox parameter
# dicts to be defined on CFG first.
# for method in ["xgboost_cox", "catboost_cox"]:
#     gradient_boosting_model_cv_training(method, train, CFG.cox_target_col_list, FEATURES, CATS)


--------------------------------------------------
lightgbm training fold 1 y_cox
[LightGBM] [Info] Total Bins 840
[LightGBM] [Info] Number of data points in the train set: 25920, number of used features: 57
[LightGBM] [Info] Start training from score 1.252319
Training until validation scores don't improve for 500 rounds
[500]	valid_0's l2: 0.0552781
[1000]	valid_0's l2: 0.032401
[1500]	valid_0's l2: 0.0259233
[2000]	valid_0's l2: 0.0229976
[2500]	valid_0's l2: 0.0214667
[3000]	valid_0's l2: 0.0204999
[3500]	valid_0's l2: 0.0197787
[4000]	valid_0's l2: 0.0192604


In [None]:
# ====================================================
# Inference functions
# ====================================================
def lightgbm_inference(x_test: pd.DataFrame, target_col: str):
    """Average the predictions of the saved LightGBM fold models for ``target_col``.

    Loads one pickled model per fold (1..CFG.n_folds) from ``CFG.MODEL_PATH``,
    using the same file-name pattern the training loop saves under, and returns
    the mean prediction over folds as a NumPy array.

    NOTE: pickle executes arbitrary code on load; only load model files this
    notebook produced.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_PATH / f"lightgbm_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
        # Context manager so the file handle is closed after each fold.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        # Predict
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds


def xgboost_inference(x_test: pd.DataFrame, target_col: str):
    """Average the predictions of the saved XGBoost fold models for ``target_col``.

    Loads one pickled model per fold (1..CFG.n_folds) from ``CFG.MODEL_PATH``,
    using the same file-name pattern the training loop saves under, and returns
    the mean prediction over folds as a NumPy array.

    NOTE: pickle executes arbitrary code on load; only load model files this
    notebook produced.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_PATH / f"xgboost_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
        # Context manager so the file handle is closed after each fold.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        # Predict
        # pred = model.predict(xgb.DMatrix(x_test, enable_categorical=True))
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds


def catboost_inference(x_test: pd.DataFrame, target_col: str):
    """Average the predictions of the saved CatBoost fold models for ``target_col``.

    Loads one pickled model per fold (1..CFG.n_folds) from ``CFG.MODEL_PATH``,
    using the same file-name pattern the training loop saves under, and returns
    the mean prediction over folds as a NumPy array.

    NOTE: pickle executes arbitrary code on load; only load model files this
    notebook produced.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_PATH / f"catboost_{target_col}_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
        # Context manager so the file handle is closed after each fold.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        # Predict
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds


# Cox models
def xgboost_cox_inference(x_test: pd.DataFrame, target_col: str):
    """Average the predictions of the saved XGBoost Cox fold models.

    Loads one pickled model per fold (1..CFG.n_folds) from ``CFG.MODEL_PATH`` and
    returns the mean prediction over folds as a NumPy array.

    ``target_col`` is accepted for signature parity with the other inference
    helpers but is NOT used: the file name is hard-coded to the ``efs_time2``
    target.

    NOTE: pickle executes arbitrary code on load; only load model files this
    notebook produced.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_PATH / f"xgboost_cox_efs_time2_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
        # Context manager so the file handle is closed after each fold.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        # Predict
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds


def catboost_cox_inference(x_test: pd.DataFrame, target_col: str):
    """Average the predictions of the saved CatBoost Cox fold models.

    Loads one pickled model per fold (1..CFG.n_folds) from ``CFG.MODEL_PATH`` and
    returns the mean prediction over folds as a NumPy array.

    ``target_col`` is accepted for signature parity with the other inference
    helpers but is NOT used: the file name is hard-coded to the ``efs_time2``
    target.

    NOTE: pickle executes arbitrary code on load; only load model files this
    notebook produced.
    """
    test_pred = np.zeros(len(x_test))
    for fold in range(CFG.n_folds):
        model_path = CFG.MODEL_PATH / f"catboost_cox_efs_time2_fold{fold + 1}_seed{CFG.SEED}_ver{CFG.EXP_NAME}.pkl"
        # Context manager so the file handle is closed after each fold.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)
        # Predict
        test_pred += model.predict(x_test)
    return test_pred / CFG.n_folds


def gradient_boosting_model_inference(method: str, test_df: pd.DataFrame, features: list, target_col: str):
    """Dispatch to the fold-averaging inference helper for ``method``.

    Args:
        method: one of "lightgbm", "xgboost", "catboost", "xgboost_cox", "catboost_cox".
        test_df: frame holding at least the ``features`` columns.
        features: feature column names to feed to the models.
        target_col: target name forwarded to the selected inference helper.

    Returns:
        The array returned by the selected inference helper.

    Raises:
        ValueError: if ``method`` is not a supported name (mirrors the training
            dispatcher instead of failing later with an unbound local).
    """
    x_test = test_df[features]
    if method == "lightgbm":
        test_pred = lightgbm_inference(x_test, target_col)
    elif method == "xgboost":
        test_pred = xgboost_inference(x_test, target_col)
    elif method == "catboost":
        test_pred = catboost_inference(x_test, target_col)
    # Cox models
    elif method == "xgboost_cox":
        test_pred = xgboost_cox_inference(x_test, target_col)
    elif method == "catboost_cox":
        test_pred = catboost_cox_inference(x_test, target_col)
    else:
        raise ValueError(f"Unknown method: {method}")
    return test_pred


def predicting(input_df: pd.DataFrame, features: list):
    """Return a copy of ``input_df`` with one prediction column per (method, target).

    For each target in ``CFG.target_col_list`` and each method in
    ``CFG.METHOD_LIST``, the fold-averaged prediction is stored in a column named
    ``"{method}_pred_{target_col}"``. ``input_df`` itself is not modified.
    """
    result_df = input_df.copy()
    for target_name in CFG.target_col_list:
        # result_df[target_name] = 0
        for method_name in CFG.METHOD_LIST:
            method_pred = gradient_boosting_model_inference(method_name, input_df, features, target_name)
            result_df[f"{method_name}_pred_{target_name}"] = method_pred
            # result_df[target_name] += CFG.model_weight_dict[method_name] * result_df[f"{method_name}_pred_{target_name}"]
    return result_df


In [None]:
# # ====================================================
# # Inference
# # ====================================================
# NOTE(review): this cell is disabled. Before re-enabling it, fix the column names:
# predicting() writes columns named "{method}_pred_{target_col}", so with
# CFG.target_col_list = ["y_cox"] the keys are e.g. "lightgbm_pred_y_cox", not
# "lightgbm_pred_y". The Cox columns only exist if "xgboost_cox" / "catboost_cox"
# are added to CFG.METHOD_LIST.
# output_df = predicting(test, FEATURES)
# pred_lgb = output_df["lightgbm_pred_y"]
# pred_xgb = output_df["xgboost_pred_y"]
# pred_cat = output_df["catboost_pred_y"]
# # Cox models
# pred_cox_xgb = output_df["xgboost_cox_pred_y"]
# pred_cox_cat = output_df["catboost_cox_pred_y"]

# submission = pd.read_csv(CFG.DATA_PATH / "sample_submission.csv")
# submission["prediction"] = (
#     rankdata(pred_lgb) + rankdata(pred_xgb) + rankdata(pred_cat) + rankdata(pred_cox_xgb) + rankdata(pred_cox_cat)
# )
# submission.to_csv(CFG.OUTPUT_DIR / "submission.csv", index=False)
# print("Sub shape:", submission.shape)
# submission.head()


Sub shape: (3, 2)


Unnamed: 0,ID,prediction
0,28800,10.0
1,28801,15.0
2,28802,5.0
