In [None]:
# Download lifelines (and, by pip's default, its dependencies) as distribution
# files into the current directory; the next cell installs from those files.
!pip download lifelines -q

In [None]:
!pip install autograd-1.7.0-py3-none-any.whl -q
!pip install autograd-gamma-0.5.0.tar.gz -q
!pip install interface_meta-1.3.0-py3-none-any.whl -q
!pip install formulaic-1.1.1-py3-none-any.whl -q
!pip install lifelines-0.30.0-py3-none-any.whl -q
!pip install catboost -q
!pip install kaggle -q
!pip install --upgrade xgboost==2.0.3
!pip install --upgrade lightgbm==4.2.0
# !pip install --upgrade tensorflow==2.16.1 -q

In [None]:
# Place the Kaggle API token where the kaggle CLI looks for it
# (<home>/.kaggle/kaggle.json). Done in Python so it behaves the same on
# Windows and POSIX shells; the original `copy` had no destination argument.
import shutil
from pathlib import Path

kaggle_dir = Path.home() / ".kaggle"
kaggle_dir.mkdir(parents=True, exist_ok=True)
# NOTE(review): machine-specific source path kept from the original cell —
# adjust it to wherever your kaggle.json lives.
shutil.copy(r"C:\Users\Admin\Desktop\mert\kaggle.json", kaggle_dir / "kaggle.json")

In [None]:
# Download the competition archive with the Kaggle CLI into the current
# working directory (requires the API token to be configured beforehand).
!kaggle competitions download -c equity-post-HCT-survival-predictions

In [None]:
# Create the data directory and unpack the competition archive into it; the
# later read_csv calls load train/test from data\cibmtr.
# NOTE(review): this mixes a POSIX-style `mkdir -p` with Windows paths and an
# absolute archive location — confirm the shell/machine this is meant to run on.
!mkdir -p ./data/cibmtr
!tar -xf "C:\Projects\CIBMTR\equity-post-HCT-survival-predictions.zip" -C ".\data\cibmtr"

In [None]:
# @title score
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


"""
To evaluate the equitable prediction of transplant survival outcomes,
we use the concordance index (C-index) between a series of event
times and a predicted score across each race group.

It represents the global assessment of the model discrimination power:
this is the model’s ability to correctly provide a reliable ranking
of the survival times based on the individual risk scores.

The concordance index is a value between 0 and 1 where:

0.5 is the expected result from random predictions,
1.0 is perfect concordance (with no censoring, otherwise <1.0),
0.0 is perfect anti-concordance (with no censoring, otherwise >0.0)

"""

import pandas as pd
import pandas.api.types
import numpy as np
from lifelines.utils import concordance_index

class ParticipantVisibleError(Exception):
    """Raised by `score` when the submission fails validation (e.g. a non-numeric column)."""


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Stratified concordance index: mean minus standard deviation of the
    per-race-group C-index.

    Parameters:
        solution: frame with the row id column plus 'efs', 'efs_time' and 'race_group'.
        submission: frame with the row id column plus a numeric 'prediction'
            (higher prediction = higher risk; it is negated before being
            passed to `concordance_index`).
        row_id_column_name: name of the id column present in both frames.

    Returns:
        float: mean(per-group C-index) - sqrt(var(per-group C-index)).

    Raises:
        ParticipantVisibleError: if any submission column (other than the id) is not numeric.

    The input frames are not modified.

    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_pred = {'prediction': {0: 1.0, 1: 0.0, 2: 1.0}}
    >>> y_pred = pd.DataFrame(y_pred)
    >>> y_pred.insert(0, row_id_column_name, range(len(y_pred)))
    >>> y_true = { 'efs': {0: 1.0, 1: 0.0, 2: 0.0}, 'efs_time': {0: 25.1234,1: 250.1234,2: 2500.1234}, 'race_group': {0: 'race_group_1', 1: 'race_group_1', 2: 'race_group_1'}}
    >>> y_true = pd.DataFrame(y_true)
    >>> y_true.insert(0, row_id_column_name, range(len(y_true)))
    >>> score(y_true.copy(), y_pred.copy(), row_id_column_name)
    0.75
    """

    # Work on id-less copies instead of `del`-ing the column from the caller's
    # frames, so callers no longer need to pass defensive copies.
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'
    for col in submission.columns:
        if not pandas.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')
    # Place solution and submission side by side (aligned on their index), then
    # reset to a positional index so group labels can be used with .iloc below.
    merged_df = pd.concat([solution, submission], axis=1)
    merged_df = merged_df.reset_index()
    merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
    metric_list = []
    for race in merged_df_race_dict.keys():
        # Rows belonging to this race group, in ascending row order
        indices = sorted(merged_df_race_dict[race])
        merged_df_race = merged_df.iloc[indices]
        # Concordance index for this group; predictions are negated because
        # concordance_index expects larger scores for longer survival.
        c_index_race = concordance_index(
                        merged_df_race[interval_label],
                        -merged_df_race[prediction_label],
                        merged_df_race[event_label])
        metric_list.append(c_index_race)
    # Penalise uneven performance across groups: mean minus (population) std.
    return float(np.mean(metric_list)-np.sqrt(np.var(metric_list)))


In [None]:
import pickle

In [None]:
import os

# Output directories used by this notebook.
catboost_dir = "catboost_models"
lgbm_dir = "lgbm_models"
# The out-of-fold prediction pickles are written under "gbdt-models"; create it
# here too so those open(..., 'wb') calls do not fail on a fresh checkout.
gbdt_dir = "gbdt-models"

# Create the directories if they do not exist yet (no-op when they do).
for model_dir in (catboost_dir, lgbm_dir, gbdt_dir):
    os.makedirs(model_dir, exist_ok=True)

In [None]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import rankdata


# Let pandas display up to 500 columns / rows before truncating output.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Paths are relative to the notebook's working directory and use Windows
# separators (they point into the directory the archive was extracted to).
test = pd.read_csv(r"data\cibmtr\test.csv")
print("Test shape:", test.shape )

train = pd.read_csv(r"data\cibmtr\train.csv")
print("Train shape:",train.shape)
# Preview the first rows of the training data.
train.head()

In [None]:
import polars

# Names of the HLA matching columns: per-locus low/high resolution matches
# followed by the aggregate columns that recalculate_hla_sums recomputes.
# NOTE(review): not referenced elsewhere in the visible part of the notebook.
HLA_COLUMNS = [

    'hla_match_a_low', 'hla_match_a_high',
    'hla_match_b_low', 'hla_match_b_high',
    'hla_match_c_low', 'hla_match_c_high',

    'hla_match_dqb1_low', 'hla_match_dqb1_high',
    'hla_match_drb1_low', 'hla_match_drb1_high',
    'hla_nmdp_6',
    'hla_low_res_6', 'hla_high_res_6',
    'hla_low_res_8', 'hla_high_res_8',
    'hla_low_res_10', 'hla_high_res_10'
]

def recalculate_hla_sums(df):
    """Recompute the aggregate HLA match columns from the per-locus columns.

    Every aggregate is the sum of its component columns, with missing
    components counted as 0. Takes a pandas DataFrame and returns a pandas
    DataFrame; the arithmetic itself is expressed with polars.
    """
    # (aggregate column, component columns in summation order)
    recipes = [
        ("hla_nmdp_6",
         ["hla_match_a_low", "hla_match_b_low", "hla_match_drb1_high"]),
        ("hla_low_res_6",
         ["hla_match_a_low", "hla_match_b_low", "hla_match_drb1_low"]),
        ("hla_high_res_6",
         ["hla_match_a_high", "hla_match_b_high", "hla_match_drb1_high"]),
        ("hla_low_res_8",
         ["hla_match_a_low", "hla_match_b_low", "hla_match_c_low", "hla_match_drb1_low"]),
        ("hla_high_res_8",
         ["hla_match_a_high", "hla_match_b_high", "hla_match_c_high", "hla_match_drb1_high"]),
        ("hla_low_res_10",
         ["hla_match_a_low", "hla_match_b_low", "hla_match_c_low", "hla_match_drb1_low",
          "hla_match_dqb1_low"]),
        ("hla_high_res_10",
         ["hla_match_a_high", "hla_match_b_high", "hla_match_c_high", "hla_match_drb1_high",
          "hla_match_dqb1_high"]),
    ]

    def null_safe_sum(columns):
        # Left-to-right sum of the columns, each with nulls replaced by 0.
        total = polars.col(columns[0]).fill_null(0)
        for name in columns[1:]:
            total = total + polars.col(name).fill_null(0)
        return total

    frame = polars.from_pandas(df)
    frame = frame.with_columns(
        *[null_safe_sum(parts).alias(target) for target, parts in recipes]
    )
    return frame.to_pandas()

# Overwrite the aggregate HLA columns in both frames with the recomputed sums.
# Note: this rebinds `train`/`test` to the frames returned by the function.
train = recalculate_hla_sums(train)
test = recalculate_hla_sums(test)

In [None]:
# Overlay the distribution of efs_time for rows with an observed event (efs=1)
# and for censored rows (efs=0).
plt.hist(train.loc[train.efs==1,"efs_time"],bins=100,label="efs=1, Yes Event")
plt.hist(train.loc[train.efs==0,"efs_time"],bins=100,label="efs=0, Maybe Event")
plt.xlabel("Time of Observation, efs_time")
# hist() is called without density=True, so the bars are raw counts.
plt.ylabel("Count")
plt.title("Times of Observation. Either time to event, or time observed without event.")
plt.legend()
plt.show()

In [None]:
# Frequency of each cyto_score value in the training data (value_counts
# excludes missing values by default).
train['cyto_score'].value_counts()

In [None]:
# Pin scikit-learn to 1.6.1 before the sklearn imports in the following cell.
# NOTE(review): if sklearn was already imported in this kernel, a restart may
# be needed for the upgrade to take effect — confirm.
!pip install --upgrade scikit-learn==1.6.1 -q

In [None]:
from sklearn.preprocessing import OneHotEncoder, quantile_transform, power_transform, FunctionTransformer, PolynomialFeatures, StandardScaler
from lifelines import CoxPHFitter
import warnings

def transform_partial_hazard(time, event):
    """Target transform: partial hazard from a Cox model fitted on the labels themselves.

    A CoxPHFitter is fitted with `time` as duration and `event` as event
    indicator; copies of both ('efs_time', 'efs') are the only covariates.
    Returns the fitted model's partial hazard for every row. Fitting warnings
    are suppressed.

    From https://www.kaggle.com/code/andreasbis/cibmtr-eda-ensemble-model
    """
    frame = pd.DataFrame({'efs_time': time, 'efs': event, 'time': time, 'event': event})
    fitter = CoxPHFitter()
    with warnings.catch_warnings():
        # Silence fitting warnings for this deliberately degenerate model.
        warnings.simplefilter("ignore")
        fitter.fit(frame, duration_col='time', event_col='event')
    partial_hazard = fitter.predict_partial_hazard(frame)
    return partial_hazard

def transform_separate(time, event):
    """Target transform that ranks times and pushes censored rows past the events.

    Censored times are first shifted by (latest event time - earliest censored
    time), everything is ranked, censored ranks get an extra len // 2, and the
    result is scaled by its maximum and negated.

    From https://www.kaggle.com/code/mtinti/cibmtr-lofo-feature-importance-gpu-accelerated"""
    is_event = event == 1
    is_censored = event == 0

    shifted = time.values.copy()
    last_event_time = shifted[is_event].max()          # last patient who dies
    first_censored_time = shifted[is_censored].min()   # first patient who survives
    shifted[is_censored] = time[is_censored] + last_event_time - first_censored_time

    ranks = rankdata(shifted)
    ranks[is_censored] += len(ranks) // 2
    scaled = ranks / ranks.max()
    return - scaled

def transform_rank_log(time, event):
    """Target transform: negative log of scaled ranks, with censored rows pushed far out.

    Censored times are shifted by (latest event time - earliest censored time),
    everything is ranked, censored ranks get an extra 2 * len, and the ranks
    are divided by their maximum before taking -log.

    From https://www.kaggle.com/code/cdeotte/nn-mlp-baseline-cv-670-lb-676"""
    is_event = event == 1
    is_censored = event == 0

    shifted = time.values.copy()
    last_event_time = shifted[is_event].max()          # last patient who dies
    first_censored_time = shifted[is_censored].min()   # first patient who survives
    shifted[is_censored] = time[is_censored] + last_event_time - first_censored_time

    ranks = rankdata(shifted)
    ranks[is_censored] += len(ranks) * 2
    scaled = ranks / ranks.max()
    log_scaled = np.log(scaled)
    return - log_scaled

def transform_quantile(time, event):
    """Target transform: quantile-transform event times, park censored rows below them.

    Rows with an event receive `quantile_transform` of their negated time (so
    earlier events map to larger values); every censored row receives the
    smallest event value minus 0.3.

    From https://www.kaggle.com/code/ambrosm/esp-eda-which-makes-sense"""
    had_event = event == 1
    event_scores = quantile_transform(- time[had_event].values.reshape(-1, 1)).ravel()

    target = np.full(len(time), np.nan)
    target[had_event] = event_scores
    target[event == 0] = event_scores.min() - 0.3
    return target

In [None]:
# Identifier / target columns that must not be used as model inputs.
RMV = ["ID","efs","efs_time","y"]
FEATURES = [col for col in train.columns if col not in RMV]
print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")

In [None]:
# Collect the object-dtype features (judged on train) as categoricals and
# replace their missing values with the literal string "NAN" in both frames.
CATS = []
for c in FEATURES:
    if train[c].dtype != "object":
        continue
    CATS.append(c)
    train[c] = train[c].fillna("NAN")
    test[c] = test[c].fillna("NAN")

print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")

# Everything that is not categorical is treated as numerical.
num_cols = [c for c in FEATURES if c not in CATS]
print(f"In these features, there are {len(num_cols)} NUMERICAL FEATURES: {num_cols}")

In [None]:
# Frequency of each karnofsky_score value in the training data (value_counts
# excludes missing values by default).
train['karnofsky_score'].value_counts()

In [None]:
# Stack train and test so categorical codes are assigned consistently across
# both; ignore_index gives the combined frame a fresh 0..n-1 index.
combined = pd.concat([train,test],axis=0,ignore_index=True)
#print("Combined data shape:", combined.shape )

# LABEL ENCODE CATEGORICAL FEATURES
print("We LABEL ENCODE the CATEGORICAL FEATURES: ",end="")
for c in FEATURES:

    # LABEL ENCODE CATEGORICAL AND CONVERT TO INT32 CATEGORY
    if c in CATS:
        print(f"{c}, ",end="")
        # factorize() maps each distinct value to an integer code
        combined[c],_ = combined[c].factorize()
        # shift so the smallest code is 0
        combined[c] -= combined[c].min()
        combined[c] = combined[c].astype("int32")
        combined[c] = combined[c].astype("category")

    # REDUCE PRECISION OF NUMERICAL TO 32BIT TO SAVE MEMORY
    else:
        if combined[c].dtype=="float64":
            combined[c] = combined[c].astype("float32")
        if combined[c].dtype=="int64":
            combined[c] = combined[c].astype("int32")

# Split back into train / test. Order matters: `train` is rebound first and
# keeps its original length, so len(train) is still the split point for `test`.
train = combined.iloc[:len(train)].copy()
test = combined.iloc[len(train):].reset_index(drop=True).copy()

In [None]:
# Visualise the four candidate target transformations (later used as
# regression targets for the XGBoost MSE models): one histogram per
# transformation, split by event status.
for transformation in [
                      transform_partial_hazard,
                       transform_separate,
                       transform_rank_log,
                       transform_quantile
                      ]:
    plt.figure(figsize=(6, 1.5))
    target = transformation(time=train.efs_time, event=train.efs)
    # Pad the plotted range by 9% of the target's span on each side.
    vmin, vmax = 1.09 * target.min() - 0.09 * target.max(), 1.09 * target.max() - 0.09 * target.min()
    # 30 equal-width bins shared by both histograms so they are comparable.
    plt.hist(target[train.efs == 0], bins=np.linspace(vmin, vmax, 31), density=True, label='efs=0: patient still lives at this time', alpha=0.5)
    plt.hist(target[train.efs == 1], bins=np.linspace(vmin, vmax, 31), density=True, label='efs=1: patient dies at this time', alpha=0.5)
    plt.xlim(vmin, vmax)
    plt.yticks([])
    plt.title('Target histogram: ' + transformation.__name__)
    plt.show()

    print(transformation.__name__)

In [None]:
from lightgbm import LGBMRegressor
import lightgbm as lgb
print("Using LightGBM version",lgb.__version__) # expected: 4.2.0 (the version pinned in the install cell)
from sklearn.model_selection import StratifiedKFold


In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from xgboost import XGBRegressor, XGBClassifier
import xgboost as xgb
print("Using XGBoost version",xgb.__version__) # expected: 2.0.3 (the version pinned in the install cell)

In [None]:
import pandas as pd

def transform_time_qcut(time, event, q=4):
    """
    Ordinal target from quantile bins of the event times.

    Rows with an event get the index of the quantile bin (0 = earliest) that
    their time falls in, with bins computed from event rows only; censored
    rows get the value q. The result is negated before being returned.
    """
    times = np.array(time)
    flags = np.array(event)
    had_event = flags == 1

    ordinal = np.zeros_like(times, dtype=float)
    if had_event.any():
        # Quantile bin index of every event time; duplicate bin edges are merged.
        ordinal[had_event] = pd.qcut(times[had_event], q=q, duplicates="drop", labels=False)
    # Censored observations always receive the highest ordinal value.
    ordinal[flags == 0] = q
    return -ordinal

def transform_time_weighted_risk(time, event):
    """
    Risk-style target combining event status and time.

    Event rows get exp(-time / mean event time); censored rows get
    0.1 * time / max(time) over all rows. Accepts pandas Series or any
    array-like for both arguments and returns a float numpy array.
    """
    # Unwrap pandas objects, otherwise build an array from the input.
    if hasattr(time, 'values'):
        times = time.values
    else:
        times = np.array(time)
    if hasattr(event, 'values'):
        flags = event.values
    else:
        flags = np.array(event)

    risk = np.zeros_like(times, dtype=float)

    # Events: exponential decay relative to the mean event time.
    is_event = flags == 1
    if is_event.any():
        event_times = times[is_event]
        risk[is_event] = np.exp(-event_times / event_times.mean())

    # Censored: linear in time, capped at 0.1 for the longest observation.
    is_censored = flags == 0
    longest = times.max()
    risk[is_censored] = 0.1 * (times[is_censored] / longest)

    return risk


from lifelines import KaplanMeierFitter, NelsonAalenFitter
import numpy as np

def transform_neg_log_survival(time, event):
    """Target from the Kaplan-Meier survival estimate at each row's own time.

    Fits a KaplanMeierFitter on (time, event), evaluates the survival function
    at every observed time, forms -log(S + 1e-8) and returns its negation,
    i.e. the returned values equal log(S + 1e-8).
    """
    km = KaplanMeierFitter()
    km.fit(time, event)
    # Survival probability at each observed time
    survival = km.survival_function_at_times(time).values
    # The small epsilon keeps the log finite when the estimate reaches 0.
    neg_log_survival = -np.log(survival + 1e-8)
    return -neg_log_survival

def transform_time_buckets(time, event):
    """Bucketed risk target.

    Event rows are weighted 1.0 / 0.75 / 0.5 / 0.25 according to which
    quartile of the event times they fall in (earliest quartile = 1.0);
    censored rows get 0.1 * time / max(time) over all rows.
    """
    if hasattr(time, 'values'):
        times = time.values
    else:
        times = np.array(time)
    if hasattr(event, 'values'):
        flags = event.values
    else:
        flags = np.array(event)

    is_event = flags == 1
    is_censored = flags == 0

    # Quartile boundaries are computed from event times only.
    q25, q50, q75 = np.percentile(times[is_event], [25, 50, 75])

    weights = np.zeros_like(times, dtype=float)

    # (inclusive lower bound, exclusive upper bound, weight); None = unbounded
    buckets = [
        (None, q25, 1.0),
        (q25, q50, 0.75),
        (q50, q75, 0.5),
        (q75, None, 0.25),
    ]
    for lower, upper, weight in buckets:
        in_bucket = is_event.copy()
        if lower is not None:
            in_bucket = in_bucket & (times >= lower)
        if upper is not None:
            in_bucket = in_bucket & (times < upper)
        weights[in_bucket] = weight

    # Censored rows: small weight growing linearly with observation time.
    weights[is_censored] = 0.1 * (times[is_censored] / times.max())

    return weights


import numpy as np
import pandas as pd
from lifelines import CoxPHFitter
from lifelines import NelsonAalenFitter

def transform_martingale_residuals(time, event):
    """
    Martingale residuals of a covariate-free model.

    The cumulative hazard H(t) is estimated with the Nelson-Aalen estimator
    fitted on (time, event); the residual of each observation is
        event - H(time).

    Parameters:
        time (array-like): Observed times.
        event (array-like): Event indicators (1 for event, 0 for censored).

    Returns:
        numpy.ndarray: One residual per observation.
    """
    durations = np.array(time)
    observed = np.array(event)

    estimator = NelsonAalenFitter()
    estimator.fit(durations, observed)

    # Cumulative hazard at every observed time, flattened to a 1D array.
    cumulative_hazard = estimator.cumulative_hazard_at_times(durations).values.flatten()

    # Observed event indicator minus the estimated cumulative hazard.
    return observed - cumulative_hazard

### XGBOOST TRANSFORM TIME QCUT

In [None]:
%%time
from sklearn.model_selection import StratifiedKFold
# 10-fold cross-validated XGBoost regression on the transform_time_qcut target.
FOLDS = 10
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Stratify on the combination of race group and event flag.
stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

# Out-of-fold predictions for train rows / running sum of test predictions.
oof_xgb_qcut = np.zeros(len(train))
pred_xgb_qcut = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): features are selected with label-based .loc while targets
    # use positional .iloc; this assumes `train` has a default 0..n-1 index.
    # The target transform is computed separately on the train and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_time_qcut(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_time_qcut(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    # Early stopping is commented out, so all 2000 trees are always fitted;
    # the eval_set below is only used for the metric log printed every 500 rounds.
    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        max_cat_to_onehot=15,
        min_child_weight=100,
        #early_stopping_rounds=25,
    )
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_qcut[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_qcut += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_qcut /= FOLDS

In [None]:
# 0.6796 with max_cat_to_onehot=15

# Score the out-of-fold predictions with the competition metric.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_xgb_qcut
# score() deletes the id column from the frames it receives, hence the copies.
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for XGBoost oof_xgb_qcut =",m)


# Option 1: Pickle
# Persist the OOF predictions (the gbdt-models directory must already exist).
import pickle
with open(r'gbdt-models\oof_xgb_qcut.pkl', 'wb') as f:
    pickle.dump(oof_xgb_qcut, f)
# Pickle
# Read the array straight back from the file that was just written.
with open(r'gbdt-models\oof_xgb_qcut.pkl', 'rb') as f:
    oof_xgb_qcut = pickle.load(f)

### CATBOOST TIME QCUT

In [None]:
from catboost import CatBoostRegressor, CatBoostClassifier
import catboost as cb
# Record the CatBoost version in use (it is installed unpinned in the setup cell).
print("Using CatBoost version",cb.__version__)

In [None]:
%%time
# 10-fold cross-validated CatBoost regression (Lossguide growth) on the
# transform_time_qcut target.
FOLDS = 10

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train rows / running sum of test predictions.
oof_cat_qcut = np.zeros(len(train))
pred_cat_qcut = np.zeros(len(test))

# Stratify on the combination of race group and event flag.
stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): .loc with positional fold indices assumes a default 0..n-1
    # index on `train`. Targets are transformed separately per split.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_time_qcut(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_time_qcut(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    # Early stopping is commented out, so all 2000 iterations are always run.
    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Lossguide',
        objective="RMSE",
        iterations=2000,
        depth=4,
        l2_leaf_reg=200,
        #early_stopping_rounds=25,
    )
    # CATS names the label-encoded categorical columns; progress is logged
    # every 250 iterations.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_qcut[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_qcut += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_qcut /= FOLDS

In [None]:
# NOTE(review): the note below mentions max_cat_to_onehot, an option set only
# in the XGBoost cells of this notebook — it looks copied from there; confirm
# whether the figure applies to this CatBoost run.
# 0.6796 with max_cat_to_onehot=15

# Score the out-of-fold predictions with the competition metric.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_qcut
# score() deletes the id column from the frames it receives, hence the copies.
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for oof_cat_qcut =",m)


# Option 1: Pickle
# Persist the OOF predictions (the gbdt-models directory must already exist).
import pickle
with open(r'gbdt-models\oof_cat_qcut.pkl', 'wb') as f:
    pickle.dump(oof_cat_qcut, f)
# Pickle
# Read the array straight back from the file that was just written.
with open(r'gbdt-models\oof_cat_qcut.pkl', 'rb') as f:
    oof_cat_qcut = pickle.load(f)

In [None]:
%%time
# Same setup as the previous CatBoost run on the transform_time_qcut target,
# but with grow_policy='Depthwise' instead of 'Lossguide'.
FOLDS = 10

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train rows / running sum of test predictions.
oof_cat_qcut_2 = np.zeros(len(train))
pred_cat_qcut_2 = np.zeros(len(test))

# Stratify on the combination of race group and event flag.
stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): .loc with positional fold indices assumes a default 0..n-1
    # index on `train`. Targets are transformed separately per split.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_time_qcut(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_time_qcut(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    # Early stopping is commented out, so all 2000 iterations are always run.
    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Depthwise',
        objective="RMSE",
        iterations=2000,
        depth=4,
        l2_leaf_reg=200,
        #early_stopping_rounds=25,
    )
    # CATS names the label-encoded categorical columns; progress is logged
    # every 250 iterations.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_qcut_2[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_qcut_2 += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_qcut_2 /= FOLDS

In [None]:
# NOTE(review): the note below mentions max_cat_to_onehot, an option set only
# in the XGBoost cells of this notebook — confirm whether the figure applies here.
# 0.6796 with max_cat_to_onehot=15

# Score the out-of-fold predictions of the Depthwise CatBoost run.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_qcut_2
# score() deletes the id column from the frames it receives, hence the copies.
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label fixed: this cell scores oof_cat_qcut_2, not oof_cat_qcut.
print("\nOverall CV for oof_cat_qcut_2 =",m)


# Option 1: Pickle
# Persist the OOF predictions (the gbdt-models directory must already exist).
import pickle
with open(r'gbdt-models\oof_cat_qcut_2.pkl', 'wb') as f:
    pickle.dump(oof_cat_qcut_2, f)
# Pickle
# Read the array straight back from the file that was just written.
with open(r'gbdt-models\oof_cat_qcut_2.pkl', 'rb') as f:
    oof_cat_qcut_2 = pickle.load(f)

# XGBOOST WITH MARTINGALE RESIDUALS, TIME WEIGHTED RISK, NEG LOG SURVIVAL AND TIME BUCKETS

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from xgboost import XGBRegressor, XGBClassifier
import xgboost as xgb
print("Using XGBoost version",xgb.__version__) # expected: 2.0.3 (the version pinned in the install cell)

In [None]:
%%time
# 10-fold cross-validated XGBoost regression on the
# transform_martingale_residuals target.
FOLDS = 10
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Stratify on the combination of race group and event flag.
stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

# Out-of-fold predictions for train rows / running sum of test predictions.
oof_xgb_martingale = np.zeros(len(train))
pred_xgb_martingale = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): .loc with positional fold indices assumes a default 0..n-1
    # index on `train`. Targets are transformed separately per split.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_martingale_residuals(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_martingale_residuals(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    # Early stopping is commented out, so all 2000 trees are always fitted;
    # the eval_set below is only used for the metric log printed every 500 rounds.
    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        max_cat_to_onehot=15,
        min_child_weight=100,
        #early_stopping_rounds=25,
    )
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_martingale[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_martingale += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_martingale /= FOLDS

In [None]:
# NOTE(review): the same 0.6796 note appears above every scoring cell in this
# notebook — confirm it is this model's score.
# 0.6796 with max_cat_to_onehot=15

# Score the out-of-fold predictions with the competition metric.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_xgb_martingale
# score() deletes the id column from the frames it receives, hence the copies.
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for XGBoost martingale residuals RMSE =",m)

# Option 1: Pickle
# Persist the OOF predictions (the gbdt-models directory must already exist).
import pickle
with open(r'gbdt-models\oof_xgb_martingale.pkl', 'wb') as f:
    pickle.dump(oof_xgb_martingale, f)
# Pickle
# Read the array straight back from the file that was just written.
with open(r'gbdt-models\oof_xgb_martingale.pkl', 'rb') as f:
    oof_xgb_martingale = pickle.load(f)

In [None]:
%%time
# 10-fold cross-validated XGBoost regression on the
# transform_time_weighted_risk target.
FOLDS = 10
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Stratify on the combination of race group and event flag.
stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

# Out-of-fold predictions for train rows / running sum of test predictions.
oof_xgb_twr_rmse = np.zeros(len(train))
pred_xgb_twr_rmse = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): .loc with positional fold indices assumes a default 0..n-1
    # index on `train`. Targets are transformed separately per split.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_time_weighted_risk(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_time_weighted_risk(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    # Early stopping is commented out, so all 2000 trees are always fitted;
    # the eval_set below is only used for the metric log printed every 500 rounds.
    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        max_cat_to_onehot=15,
        min_child_weight=100,
        #early_stopping_rounds=25,
    )
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_twr_rmse[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_twr_rmse += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_twr_rmse /= FOLDS

In [None]:
# NOTE(review): the same 0.6796 note appears above every scoring cell in this
# notebook — confirm it is this model's score.
# 0.6796 with max_cat_to_onehot=15

# Score the out-of-fold predictions of the time-weighted-risk XGBoost run.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_xgb_twr_rmse
# score() deletes the id column from the frames it receives, hence the copies.
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label fixed: this run uses transform_time_weighted_risk, not the quantile transform.
print("\nOverall CV for XGBoost Time Weighted Risk RMSE =",m)

In [None]:
# Option 1: Pickle
# Persist the time-weighted-risk OOF predictions (the gbdt-models directory
# must already exist).
import pickle
with open(r'gbdt-models\oof_xgb_twr_rmse.pkl', 'wb') as f:
    pickle.dump(oof_xgb_twr_rmse, f)
# Pickle
# Read the array straight back from the file that was just written.
with open(r'gbdt-models\oof_xgb_twr_rmse.pkl', 'rb') as f:
    oof_xgb_twr_rmse = pickle.load(f)

In [None]:
%%time
# 10-fold cross-validated XGBoost regression on the
# transform_neg_log_survival target.
FOLDS = 10
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Stratify on the combination of race group and event flag.
stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

# Out-of-fold predictions for train rows / running sum of test predictions.
oof_xgb_nls_rmse = np.zeros(len(train))
pred_xgb_nls_rmse = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): .loc with positional fold indices assumes a default 0..n-1
    # index on `train`. Targets are transformed separately per split.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_neg_log_survival(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_neg_log_survival(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    # Early stopping is commented out, so all 2000 trees are always fitted;
    # the eval_set below is only used for the metric log printed every 500 rounds.
    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        max_cat_to_onehot=15,
        min_child_weight=100,
        #early_stopping_rounds=25,
    )
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_nls_rmse[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_nls_rmse += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_nls_rmse /= FOLDS

In [None]:
# NOTE(review): the same 0.6796 note appears above every scoring cell in this
# notebook — confirm it is this model's score.
# 0.6796 with max_cat_to_onehot=15

# Score the out-of-fold predictions of the neg-log-survival XGBoost run.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_xgb_nls_rmse
# score() deletes the id column from the frames it receives, hence the copies.
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label fixed: this run uses transform_neg_log_survival, not the quantile transform.
print("\nOverall CV for XGBoost Neg Log Survival RMSE =",m)

In [None]:
# Option 1: Pickle
# Persist the neg-log-survival OOF predictions (the gbdt-models directory
# must already exist).
import pickle
with open(r'gbdt-models\oof_xgb_nls_rmse.pkl', 'wb') as f:
    pickle.dump(oof_xgb_nls_rmse, f)
# Pickle
# Read the array straight back from the file that was just written.
with open(r'gbdt-models\oof_xgb_nls_rmse.pkl', 'rb') as f:
    oof_xgb_nls_rmse = pickle.load(f)

In [None]:
%%time
# 10-fold cross-validated XGBoost regression on the transform_time_buckets target.
FOLDS = 10
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Stratify on the combination of race group and event flag.
stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

# Out-of-fold predictions for train rows / running sum of test predictions.
oof_xgb_tb_rmse = np.zeros(len(train))
pred_xgb_tb_rmse = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): .loc with positional fold indices assumes a default 0..n-1
    # index on `train`. Targets are transformed separately per split.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_time_buckets(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_time_buckets(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    # Early stopping is commented out, so all 2000 trees are always fitted;
    # the eval_set below is only used for the metric log printed every 500 rounds.
    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        max_cat_to_onehot=15,
        min_child_weight=100,
        #early_stopping_rounds=25,
    )
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_tb_rmse[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_tb_rmse += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_tb_rmse /= FOLDS

In [None]:
# NOTE(review): the same 0.6796 note appears above every scoring cell in this
# notebook — confirm it is this model's score.
# 0.6796 with max_cat_to_onehot=15

# Score the out-of-fold predictions of the time-buckets XGBoost run.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_xgb_tb_rmse
# score() deletes the id column from the frames it receives, hence the copies.
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label fixed: this run uses transform_time_buckets, not the quantile transform.
print("\nOverall CV for XGBoost Time Buckets RMSE =",m)

In [None]:
# Option 1: Pickle
# Persist the time-buckets OOF predictions (the gbdt-models directory must
# already exist).
import pickle
with open(r'gbdt-models\oof_xgb_tb_rmse.pkl', 'wb') as f:
    pickle.dump(oof_xgb_tb_rmse, f)
# Pickle
# Read the array straight back from the file that was just written.
with open(r'gbdt-models\oof_xgb_tb_rmse.pkl', 'rb') as f:
    oof_xgb_tb_rmse = pickle.load(f)

# XGBOOST WITH QUANTILE TRANSFORM

In [None]:
%%time
# 10-fold cross-validated XGBoost regression on the transform_quantile target.
# Unlike the neighbouring cells this one uses a plain (unstratified) KFold.
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train rows / running sum of test predictions.
oof_xgb_quantile_rmse = np.zeros(len(train))
pred_xgb_quantile_rmse = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): .loc with positional fold indices assumes a default 0..n-1
    # index on `train`. Targets are transformed separately per split.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_quantile(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_quantile(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    # Early stopping is commented out, so all 2000 trees are always fitted;
    # the eval_set below is only used for the metric log printed every 500 rounds.
    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        max_cat_to_onehot=15,
        min_child_weight=100,
        #early_stopping_rounds=25,
    )
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_quantile_rmse[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_quantile_rmse += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_quantile_rmse /= FOLDS

In [None]:
# NOTE(review): the same 0.6796 note appears above every scoring cell in this
# notebook — confirm it is this model's score.
# 0.6796 with max_cat_to_onehot=15

# Score the out-of-fold predictions of the plain-KFold quantile-transform run.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_xgb_quantile_rmse
# score() deletes the id column from the frames it receives, hence the copies.
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for XGBoost Quantile Transformation RMSE =",m)

In [None]:
# Option 1: Pickle
# Persist the quantile-transform OOF predictions (the gbdt-models directory
# must already exist).
import pickle
with open(r'gbdt-models\oof_xgb_quantile_rmse.pkl', 'wb') as f:
    pickle.dump(oof_xgb_quantile_rmse, f)
# Pickle
# Read the array straight back from the file that was just written.
with open(r'gbdt-models\oof_xgb_quantile_rmse.pkl', 'rb') as f:
    oof_xgb_quantile_rmse = pickle.load(f)

In [None]:
%%time
# XGBoost regressor on transform_quantile targets, 10-fold StratifiedKFold (seed 42).
# Folds are stratified on the concatenation of race_group and efs.
# Out-of-fold predictions go to oof_xgb_quantile_rmse_skf; test predictions are
# summed per fold into pred_xgb_quantile_rmse_skf and averaged after the loop.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

oof_xgb_quantile_rmse_skf = np.zeros(len(train))
pred_xgb_quantile_rmse_skf = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_quantile(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_quantile(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        max_cat_to_onehot=15,
        min_child_weight=100,
        #early_stopping_rounds=25,
    )
    # Early stopping is commented out above, so eval_set presumably only feeds
    # the periodic progress log.
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_quantile_rmse_skf[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_quantile_rmse_skf += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_quantile_rmse_skf /= FOLDS

In [None]:
# Score the out-of-fold predictions of the stratified-fold (race_group x efs)
# XGBoost quantile-target run with score().
# NOTE(review): the "0.6796 with max_cat_to_onehot=15" note previously here is
# identical to the one on the plain-KFold run — re-record after a rerun.

y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_xgb_quantile_rmse_skf
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label now names the CV scheme so this line is distinguishable from the
# plain-KFold run in the output.
print("\nOverall CV for XGBoost Quantile Transformation with Stratified KFold RMSE =",m)

In [None]:
# Round-trip the stratified-fold XGBoost quantile OOF predictions through a
# pickle file (presumably so they can be reloaded later without retraining).
oof_file = r'gbdt-models\oof_xgb_quantile_rmse_skf.pkl'
with open(oof_file, mode='wb') as pkl_out:
    pickle.dump(oof_xgb_quantile_rmse_skf, pkl_out)
with open(oof_file, mode='rb') as pkl_in:
    oof_xgb_quantile_rmse_skf = pickle.load(pkl_in)

In [None]:
%%time
# XGBoost regressor on transform_quantile targets, 10-fold StratifiedKFold (seed 42).
# Folds are stratified on the concatenation of race_group and dri_score.
# Out-of-fold predictions go to oof_xgb_quantile_rmse_skf_dri; test predictions are
# summed per fold into pred_xgb_quantile_rmse_skf_dri and averaged after the loop.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

oof_xgb_quantile_rmse_skf_dri = np.zeros(len(train))
pred_xgb_quantile_rmse_skf_dri = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_quantile(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_quantile(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        max_cat_to_onehot=15,
        min_child_weight=100,
        #early_stopping_rounds=25,
    )
    # Early stopping is commented out above, so eval_set presumably only feeds
    # the periodic progress log.
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_quantile_rmse_skf_dri[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_quantile_rmse_skf_dri += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_quantile_rmse_skf_dri /= FOLDS

In [None]:
# Score the out-of-fold predictions of the XGBoost quantile-target run whose
# folds are stratified on race_group x dri_score.
# NOTE(review): the "0.6796 with max_cat_to_onehot=15" note previously here is
# identical to the one on the plain-KFold run — re-record after a rerun.

y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_xgb_quantile_rmse_skf_dri
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label now names the stratification so this line is distinguishable from the
# other quantile-target runs in the output.
print("\nOverall CV for XGBoost Quantile Transformation with Stratified KFold (dri_score) RMSE =",m)

In [None]:
# Round-trip the dri_score-stratified XGBoost quantile OOF predictions through
# a pickle file (presumably so they can be reloaded later without retraining).
oof_file = r'gbdt-models\oof_xgb_quantile_rmse_skf_dri.pkl'
with open(oof_file, mode='wb') as pkl_out:
    pickle.dump(oof_xgb_quantile_rmse_skf_dri, pkl_out)
with open(oof_file, mode='rb') as pkl_in:
    oof_xgb_quantile_rmse_skf_dri = pickle.load(pkl_in)



In [None]:
%%time
# XGBoost regressor with the absolute-error objective on transform_quantile
# targets, 10-fold shuffled KFold (seed 42).
# Out-of-fold predictions go to oof_xgb_quantile_mae; test predictions are summed
# per fold into pred_xgb_quantile_mae and averaged after the loop.
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_xgb_quantile_mae = np.zeros(len(train))
pred_xgb_quantile_mae = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_quantile(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_quantile(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    # NOTE(review): unlike the sibling XGBoost cells, max_cat_to_onehot is not
    # set here — confirm that is intended.
    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        min_child_weight=100,
        objective="reg:absoluteerror",
        #early_stopping_rounds=25,
    )
    # Early stopping is commented out above, so eval_set presumably only feeds
    # the periodic progress log.
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_quantile_mae[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_quantile_mae += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_quantile_mae /= FOLDS

In [None]:
# 0.6712  (author's note, presumably the CV score)

# Evaluate the out-of-fold XGBoost quantile-target MAE predictions with score().
truth_columns = ["ID", "efs", "efs_time", "race_group"]
y_true = train.loc[:, truth_columns].copy()
y_pred = train[["ID"]].assign(prediction=oof_xgb_quantile_mae)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for XGBoost Quantile Transformation MAE =", m)

In [None]:
# Round-trip the XGBoost quantile MAE out-of-fold predictions through a pickle
# file (presumably so they can be reloaded later without retraining).
oof_file = r'gbdt-models\oof_xgb_quantile_mae.pkl'
with open(oof_file, mode='wb') as pkl_out:
    pickle.dump(oof_xgb_quantile_mae, pkl_out)
with open(oof_file, mode='rb') as pkl_in:
    oof_xgb_quantile_mae = pickle.load(pkl_in)



# CATBOOST WITH TIME WEIGHTED RISK, NEGATIVE LOG SURVIVAL AND TIME BUCKET TARGETS

In [None]:
# Bring CatBoost into the kernel and report which version is in use.
import catboost as cb
from catboost import CatBoostRegressor, CatBoostClassifier

print(f"Using CatBoost version {cb.__version__}")

In [None]:
%%time
# CatBoost regressor (Lossguide growth, RMSE) on transform_time_weighted_risk
# targets, 10-fold StratifiedKFold (seed 42) stratified on race_group x efs.
# Out-of-fold predictions go to oof_cat_twr_rmse_skf; test predictions are summed
# per fold into pred_cat_twr_rmse_skf and averaged after the loop.
FOLDS = 10

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_cat_twr_rmse_skf = np.zeros(len(train))
pred_cat_twr_rmse_skf = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_time_weighted_risk(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_time_weighted_risk(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Lossguide',
        objective="RMSE",
        iterations=2000,
        depth=3,
        l2_leaf_reg=200,
        #early_stopping_rounds=25,
    )
    # NOTE(review): CatBoost reportedly enables use_best_model by default when an
    # eval_set is supplied, which would let the validation fold choose the final
    # tree count used for the OOF predictions — confirm this is intended.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_twr_rmse_skf[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_twr_rmse_skf += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_twr_rmse_skf /= FOLDS

In [None]:
# Score the out-of-fold predictions of the CatBoost time-weighted-risk run
# (Lossguide) with score(), then persist them.
# NOTE(review): the "0.6773- 0.6775" note previously here is repeated verbatim
# on several other cells — re-record the score after a rerun.

y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_twr_rmse_skf
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label fixed: this run trains on transform_time_weighted_risk targets, not the
# quantile transform the old message claimed.
print("\nOverall CV for CatBoost TIME WEIGHTED RISK (Lossguide) with RMSE =",m)

# Write the OOF array to disk and read it straight back.
with open(r'gbdt-models\oof_cat_twr_rmse_skf.pkl', 'wb') as f:
    pickle.dump(oof_cat_twr_rmse_skf, f)
with open(r'gbdt-models\oof_cat_twr_rmse_skf.pkl', 'rb') as f:
    oof_cat_twr_rmse_skf = pickle.load(f)

In [None]:
%%time
# CatBoost regressor (Depthwise growth, RMSE) on transform_time_weighted_risk
# targets, 10-fold StratifiedKFold (seed 42) stratified on race_group x efs.
# Out-of-fold predictions go to oof_cat_twr_rmse_skf_2; test predictions are summed
# per fold into pred_cat_twr_rmse_skf_2 and averaged after the loop.
FOLDS = 10

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_cat_twr_rmse_skf_2 = np.zeros(len(train))
pred_cat_twr_rmse_skf_2 = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_time_weighted_risk(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_time_weighted_risk(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Depthwise',
        objective="RMSE",
        iterations=2000,
        depth=3,
        l2_leaf_reg=200,
        #early_stopping_rounds=25,
    )
    # NOTE(review): CatBoost reportedly enables use_best_model by default when an
    # eval_set is supplied, which would let the validation fold choose the final
    # tree count used for the OOF predictions — confirm this is intended.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_twr_rmse_skf_2[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_twr_rmse_skf_2 += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_twr_rmse_skf_2 /= FOLDS

In [None]:
# Score the out-of-fold predictions of the CatBoost time-weighted-risk run
# (Depthwise) with score(), then persist them.
# NOTE(review): the "0.6773- 0.6775" note previously here is repeated verbatim
# on several other cells — re-record the score after a rerun.

y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_twr_rmse_skf_2
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label fixed: this run trains on transform_time_weighted_risk targets, not the
# quantile transform the old message claimed.
print("\nOverall CV for CatBoost TIME WEIGHTED RISK (Depthwise) with RMSE =",m)

# Write the OOF array to disk and read it straight back.
with open(r'gbdt-models\oof_cat_twr_rmse_skf_2.pkl', 'wb') as f:
    pickle.dump(oof_cat_twr_rmse_skf_2, f)
with open(r'gbdt-models\oof_cat_twr_rmse_skf_2.pkl', 'rb') as f:
    oof_cat_twr_rmse_skf_2 = pickle.load(f)

In [None]:
%%time
# CatBoost regressor (Lossguide growth, RMSE) on transform_neg_log_survival
# targets, 10-fold StratifiedKFold (seed 42) stratified on race_group x efs.
# Out-of-fold predictions go to oof_cat_nl_rmse_skf; test predictions are summed
# per fold into pred_cat_nl_rmse_skf and averaged after the loop.
FOLDS = 10

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_cat_nl_rmse_skf = np.zeros(len(train))
pred_cat_nl_rmse_skf = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_neg_log_survival(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_neg_log_survival(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Lossguide',
        objective="RMSE",
        iterations=2000,
        depth=3,
        l2_leaf_reg=200,
        #early_stopping_rounds=25,
    )
    # NOTE(review): CatBoost reportedly enables use_best_model by default when an
    # eval_set is supplied, which would let the validation fold choose the final
    # tree count used for the OOF predictions — confirm this is intended.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_nl_rmse_skf[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_nl_rmse_skf += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_nl_rmse_skf /= FOLDS

In [None]:
# Score the out-of-fold predictions of the CatBoost negative-log-survival run
# (Lossguide) with score(), then persist them.
# NOTE(review): the "0.6773- 0.6775" note previously here is repeated verbatim
# on several other cells — re-record the score after a rerun.

y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_nl_rmse_skf
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label fixed: this run trains on transform_neg_log_survival targets, not the
# quantile transform the old message claimed.
print("\nOverall CV for CatBoost NEG LOG SURVIVAL (Lossguide) with RMSE =",m)

# Write the OOF array to disk and read it straight back.
with open(r'gbdt-models\oof_cat_nl_rmse_skf.pkl', 'wb') as f:
    pickle.dump(oof_cat_nl_rmse_skf, f)
with open(r'gbdt-models\oof_cat_nl_rmse_skf.pkl', 'rb') as f:
    oof_cat_nl_rmse_skf = pickle.load(f)

In [None]:
%%time
# CatBoost regressor (Depthwise growth, RMSE) on transform_neg_log_survival
# targets, 10-fold StratifiedKFold (seed 42) stratified on race_group x efs.
# Out-of-fold predictions go to oof_cat_nl_rmse_skf_2; test predictions are summed
# per fold into pred_cat_nl_rmse_skf_2 and averaged after the loop.
FOLDS = 10

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_cat_nl_rmse_skf_2 = np.zeros(len(train))
pred_cat_nl_rmse_skf_2 = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_neg_log_survival(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_neg_log_survival(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Depthwise',
        objective="RMSE",
        iterations=2000,
        depth=3,
        l2_leaf_reg=200,
        #early_stopping_rounds=25,
    )
    # NOTE(review): CatBoost reportedly enables use_best_model by default when an
    # eval_set is supplied, which would let the validation fold choose the final
    # tree count used for the OOF predictions — confirm this is intended.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_nl_rmse_skf_2[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_nl_rmse_skf_2 += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_nl_rmse_skf_2 /= FOLDS

In [None]:
# Score the out-of-fold predictions of the CatBoost negative-log-survival run
# (Depthwise) with score(), then persist them.
# NOTE(review): the "0.6773- 0.6775" note previously here is repeated verbatim
# on several other cells — re-record the score after a rerun.

y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_nl_rmse_skf_2
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label fixed: this run trains on transform_neg_log_survival targets, not the
# quantile transform the old message claimed.
print("\nOverall CV for CatBoost NEG LOG SURVIVAL (Depthwise) with RMSE =",m)

# Write the OOF array to disk and read it straight back.
with open(r'gbdt-models\oof_cat_nl_rmse_skf_2.pkl', 'wb') as f:
    pickle.dump(oof_cat_nl_rmse_skf_2, f)
with open(r'gbdt-models\oof_cat_nl_rmse_skf_2.pkl', 'rb') as f:
    oof_cat_nl_rmse_skf_2 = pickle.load(f)

In [None]:
%%time
# CatBoost regressor (Lossguide growth, RMSE) on transform_time_buckets
# targets, 10-fold StratifiedKFold (seed 42) stratified on race_group x efs.
# Out-of-fold predictions go to oof_cat_tb_rmse_skf; test predictions are summed
# per fold into pred_cat_tb_rmse_skf and averaged after the loop.
FOLDS = 10

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_cat_tb_rmse_skf = np.zeros(len(train))
pred_cat_tb_rmse_skf = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_time_buckets(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_time_buckets(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Lossguide',
        objective="RMSE",
        iterations=2000,
        depth=3,
        l2_leaf_reg=200,
        #early_stopping_rounds=25,
    )
    # NOTE(review): CatBoost reportedly enables use_best_model by default when an
    # eval_set is supplied, which would let the validation fold choose the final
    # tree count used for the OOF predictions — confirm this is intended.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_tb_rmse_skf[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_tb_rmse_skf += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_tb_rmse_skf /= FOLDS

In [None]:
# Score the out-of-fold predictions of the CatBoost time-bucket run
# (Lossguide) with score(), then persist them.
# NOTE(review): the "0.6773- 0.6775" note previously here is repeated verbatim
# on several other cells — re-record the score after a rerun.

y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_tb_rmse_skf
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label fixed: this run trains on transform_time_buckets targets, not the
# quantile transform the old message claimed.
print("\nOverall CV for CatBoost TIME BUCKETS (Lossguide) with RMSE =",m)

# Write the OOF array to disk and read it straight back.
with open(r'gbdt-models\oof_cat_tb_rmse_skf.pkl', 'wb') as f:
    pickle.dump(oof_cat_tb_rmse_skf, f)
with open(r'gbdt-models\oof_cat_tb_rmse_skf.pkl', 'rb') as f:
    oof_cat_tb_rmse_skf = pickle.load(f)

In [None]:
%%time
# CatBoost regressor (Depthwise growth, RMSE) on transform_time_buckets
# targets, 10-fold StratifiedKFold (seed 42) stratified on race_group x efs.
# Out-of-fold predictions go to oof_cat_tb_rmse_skf_2; test predictions are summed
# per fold into pred_cat_tb_rmse_skf_2 and averaged after the loop.
FOLDS = 10

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_cat_tb_rmse_skf_2 = np.zeros(len(train))
pred_cat_tb_rmse_skf_2 = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_time_buckets(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_time_buckets(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Depthwise',
        objective="RMSE",
        iterations=2000,
        depth=3,
        l2_leaf_reg=200,
        #early_stopping_rounds=25,
    )
    # NOTE(review): CatBoost reportedly enables use_best_model by default when an
    # eval_set is supplied, which would let the validation fold choose the final
    # tree count used for the OOF predictions — confirm this is intended.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_tb_rmse_skf_2[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_tb_rmse_skf_2 += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_tb_rmse_skf_2 /= FOLDS

In [None]:
# Score the out-of-fold predictions of the CatBoost time-bucket run
# (Depthwise) with score(), then persist them.
# NOTE(review): the "0.6773- 0.6775" note previously here is repeated verbatim
# on several other cells — re-record the score after a rerun.

y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_tb_rmse_skf_2
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label fixed: this run trains on transform_time_buckets targets, not the
# quantile transform the old message claimed.
print("\nOverall CV for CatBoost TIME BUCKETS (Depthwise) with RMSE =",m)

# Write the OOF array to disk and read it straight back.
with open(r'gbdt-models\oof_cat_tb_rmse_skf_2.pkl', 'wb') as f:
    pickle.dump(oof_cat_tb_rmse_skf_2, f)
with open(r'gbdt-models\oof_cat_tb_rmse_skf_2.pkl', 'rb') as f:
    oof_cat_tb_rmse_skf_2 = pickle.load(f)

# CATBOOST WITH QUANTILE TRANSFORM

In [None]:
%%time
# CatBoost regressor (Lossguide growth, RMSE) on transform_quantile targets,
# 10-fold StratifiedKFold (seed 42) stratified on race_group x efs.
# Out-of-fold predictions go to oof_cat_quantile_rmse_skf; test predictions are
# summed per fold into pred_cat_quantile_rmse_skf and averaged after the loop.
FOLDS = 10

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_cat_quantile_rmse_skf = np.zeros(len(train))
pred_cat_quantile_rmse_skf = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_quantile(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_quantile(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.075,
        grow_policy='Lossguide',
        objective="RMSE",
        iterations=2000,
        depth=4,
        l2_leaf_reg=150,
        #early_stopping_rounds=25,
    )
    # NOTE(review): CatBoost reportedly enables use_best_model by default when an
    # eval_set is supplied, which would let the validation fold choose the final
    # tree count used for the OOF predictions — confirm this is intended.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_quantile_rmse_skf[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_quantile_rmse_skf += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_quantile_rmse_skf /= FOLDS

In [None]:
# 0.67866  (author's note, presumably the CV score)

# Score the out-of-fold predictions of the stratified-fold (race_group x efs)
# CatBoost quantile-target run with score().
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_quantile_rmse_skf
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label now names the CV scheme; previously it was identical to the message
# printed for the plain-KFold CatBoost quantile run.
print("\nOverall CV for CatBoost QUANTILE TRANSFORMER with Stratified KFold RMSE =",m)

In [None]:
# Round-trip the stratified-fold CatBoost quantile OOF predictions through a
# pickle file (presumably so they can be reloaded later without retraining).
oof_file = r'gbdt-models\oof_cat_quantile_rmse_skf.pkl'
with open(oof_file, mode='wb') as pkl_out:
    pickle.dump(oof_cat_quantile_rmse_skf, pkl_out)
with open(oof_file, mode='rb') as pkl_in:
    oof_cat_quantile_rmse_skf = pickle.load(pkl_in)

In [None]:
%%time
# CatBoost regressor (Depthwise growth, RMSE, 3000 iterations) on
# transform_quantile targets with 10 stratified folds (seed 42).
# Fix: this run is named *_skf and builds stratify_labels, but it previously
# created a plain KFold and never passed the labels to split(), so the folds
# were not stratified. It now uses StratifiedKFold on race_group x efs like
# the sibling *_skf cells.
# Out-of-fold predictions go to oof_cat_quantile_rmse_2_skf; test predictions are
# summed per fold into pred_cat_quantile_rmse_2_skf and averaged after the loop.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_cat_quantile_rmse_2_skf = np.zeros(len(train))
pred_cat_quantile_rmse_2_skf = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_quantile(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_quantile(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.075,
        grow_policy='Depthwise',
        objective="RMSE",
        iterations=3000,
        depth=4,
        l2_leaf_reg=200,
        #early_stopping_rounds=25,
    )
    # NOTE(review): CatBoost reportedly enables use_best_model by default when an
    # eval_set is supplied, which would let the validation fold choose the final
    # tree count used for the OOF predictions — confirm this is intended.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_quantile_rmse_2_skf[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_quantile_rmse_2_skf += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_quantile_rmse_2_skf /= FOLDS

In [None]:
# Evaluate the out-of-fold predictions of the second CatBoost quantile-target
# configuration with score(), then round-trip them through a pickle file.
truth_columns = ["ID", "efs", "efs_time", "race_group"]
y_true = train.loc[:, truth_columns].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_quantile_rmse_2_skf)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost QUANTILE TRANSFORMER with RMSE 2 =", m)

oof_file = r'gbdt-models\oof_cat_quantile_rmse_2_skf.pkl'
with open(oof_file, mode='wb') as pkl_out:
    pickle.dump(oof_cat_quantile_rmse_2_skf, pkl_out)
with open(oof_file, mode='rb') as pkl_in:
    oof_cat_quantile_rmse_2_skf = pickle.load(pkl_in)


In [None]:
%%time
# CatBoost regressor (Lossguide growth, RMSE) on transform_quantile targets,
# 10-fold StratifiedKFold (seed 42) stratified on race_group x dri_score.
# Out-of-fold predictions go to oof_cat_quantile_rmse_skf_dri; test predictions are
# summed per fold into pred_cat_quantile_rmse_skf_dri and averaged after the loop.
FOLDS = 10

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_cat_quantile_rmse_skf_dri = np.zeros(len(train))
pred_cat_quantile_rmse_skf_dri = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_quantile(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_quantile(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()
    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.075,
        grow_policy='Lossguide',
        objective="RMSE",
        iterations=2000,
        depth=4,
        l2_leaf_reg=150,
        #early_stopping_rounds=25,
    )
    # NOTE(review): CatBoost reportedly enables use_best_model by default when an
    # eval_set is supplied, which would let the validation fold choose the final
    # tree count used for the OOF predictions — confirm this is intended.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_quantile_rmse_skf_dri[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_quantile_rmse_skf_dri += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_quantile_rmse_skf_dri /= FOLDS

In [None]:
# Score the out-of-fold predictions of the CatBoost quantile-target run whose
# folds are stratified on race_group x dri_score, then persist them.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_quantile_rmse_skf_dri
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label now names the stratification; previously it was identical to the
# message printed for the other CatBoost quantile runs.
print("\nOverall CV for CatBoost QUANTILE TRANSFORMER with Stratified KFold (dri_score) RMSE =",m)
# Write the OOF array to disk and read it straight back.
with open(r'gbdt-models\oof_cat_quantile_rmse_skf_dri.pkl', 'wb') as f:
    pickle.dump(oof_cat_quantile_rmse_skf_dri, f)
with open(r'gbdt-models\oof_cat_quantile_rmse_skf_dri.pkl', 'rb') as f:
    oof_cat_quantile_rmse_skf_dri = pickle.load(f)


In [None]:
%%time
# CatBoost regressor (Depthwise growth, RMSE) on transform_quantile targets,
# 10-fold StratifiedKFold (seed 42) stratified on race_group x dri_score.
# Out-of-fold predictions go to oof_cat_quantile_rmse_2_skf_dri; test predictions are
# summed per fold into pred_cat_quantile_rmse_2_skf_dri and averaged after the loop.
FOLDS = 10

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_cat_quantile_rmse_2_skf_dri = np.zeros(len(train))
pred_cat_quantile_rmse_2_skf_dri = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_quantile(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_quantile(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    # NOTE(review): iterations, depth and l2_leaf_reg are not set here (unlike the
    # sibling CatBoost cells), so library defaults apply — confirm this is intended.
    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Depthwise',
        objective="RMSE",

        #early_stopping_rounds=25,
    )
    # NOTE(review): CatBoost reportedly enables use_best_model by default when an
    # eval_set is supplied, which would let the validation fold choose the final
    # tree count used for the OOF predictions — confirm this is intended.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_quantile_rmse_2_skf_dri[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_quantile_rmse_2_skf_dri += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_quantile_rmse_2_skf_dri /= FOLDS

In [None]:
# Score the out-of-fold predictions of the second CatBoost quantile-target
# configuration (Depthwise) whose folds are stratified on race_group x
# dri_score, then persist them.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_quantile_rmse_2_skf_dri
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label now names the stratification; previously it was identical to the
# message printed for the other "RMSE 2" CatBoost quantile runs.
print("\nOverall CV for CatBoost QUANTILE TRANSFORMER with Stratified KFold (dri_score) RMSE 2 =",m)
# Write the OOF array to disk and read it straight back.
with open(r'gbdt-models\oof_cat_quantile_rmse_2_skf_dri.pkl', 'wb') as f:
    pickle.dump(oof_cat_quantile_rmse_2_skf_dri, f)
with open(r'gbdt-models\oof_cat_quantile_rmse_2_skf_dri.pkl', 'rb') as f:
    oof_cat_quantile_rmse_2_skf_dri = pickle.load(f)


In [None]:
%%time
# CatBoost regressor (Lossguide growth, RMSE) on transform_quantile targets,
# 10-fold shuffled KFold (seed 42).
# Out-of-fold predictions go to oof_cat_quantile_rmse; test predictions are summed
# per fold into pred_cat_quantile_rmse and averaged after the loop.
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_cat_quantile_rmse = np.zeros(len(train))
pred_cat_quantile_rmse = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_quantile(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_quantile(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.08,
        grow_policy='Lossguide',
        objective="RMSE",
        iterations=2000,
        depth=4,
        l2_leaf_reg=150,
        #early_stopping_rounds=25,
    )
    # NOTE(review): CatBoost reportedly enables use_best_model by default when an
    # eval_set is supplied, which would let the validation fold choose the final
    # tree count used for the OOF predictions — confirm this is intended.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_quantile_rmse[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_quantile_rmse += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_quantile_rmse /= FOLDS

In [None]:
# 0.6773- 0.6775  (author's note, presumably the CV score range)

# Evaluate the out-of-fold CatBoost quantile-target predictions with score().
truth_columns = ["ID", "efs", "efs_time", "race_group"]
y_true = train.loc[:, truth_columns].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_quantile_rmse)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost QUANTILE TRANSFORMER with RMSE =", m)

In [None]:
# Round-trip the CatBoost quantile out-of-fold predictions through a pickle
# file (presumably so they can be reloaded later without retraining).
oof_file = r'gbdt-models\oof_cat_quantile_rmse.pkl'
with open(oof_file, mode='wb') as pkl_out:
    pickle.dump(oof_cat_quantile_rmse, pkl_out)
with open(oof_file, mode='rb') as pkl_in:
    oof_cat_quantile_rmse = pickle.load(pkl_in)



In [None]:
%%time
# CatBoost regressor (Depthwise growth, RMSE, 3000 iterations) on
# transform_quantile targets, 10-fold shuffled KFold (seed 42).
# Out-of-fold predictions go to oof_cat_quantile_rmse_2; test predictions are summed
# per fold into pred_cat_quantile_rmse_2 and averaged after the loop.
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_cat_quantile_rmse_2 = np.zeros(len(train))
pred_cat_quantile_rmse_2 = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_quantile(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_quantile(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.08,
        grow_policy='Depthwise',
        objective="RMSE",
        iterations=3000,
        depth=4,
        l2_leaf_reg=150,
        #early_stopping_rounds=25,
    )
    # NOTE(review): CatBoost reportedly enables use_best_model by default when an
    # eval_set is supplied, which would let the validation fold choose the final
    # tree count used for the OOF predictions — confirm this is intended.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_quantile_rmse_2[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_quantile_rmse_2 += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_quantile_rmse_2 /= FOLDS

In [None]:
# 0.6778  (author's note, presumably the CV score)

# Evaluate the out-of-fold predictions of the second CatBoost quantile-target
# configuration with score().
truth_columns = ["ID", "efs", "efs_time", "race_group"]
y_true = train.loc[:, truth_columns].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_quantile_rmse_2)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost QUANTILE TRANSFORMER with RMSE 2 =", m)

In [None]:
# Round-trip the second CatBoost quantile configuration's OOF predictions through
# a pickle file (presumably so they can be reloaded later without retraining).
oof_file = r'gbdt-models\oof_cat_quantile_rmse_2.pkl'
with open(oof_file, mode='wb') as pkl_out:
    pickle.dump(oof_cat_quantile_rmse_2, pkl_out)
with open(oof_file, mode='rb') as pkl_in:
    oof_cat_quantile_rmse_2 = pickle.load(pkl_in)



# XGBOOST WITH PH TRANSFORM

In [None]:
                      # transform_partial_hazard,
                      #  transform_separate,
                      #  transform_rank_log,
                      #  transform_quantile,

# XGBoost regressor on transform_partial_hazard targets, 10-fold shuffled KFold
# (seed 42).
# Out-of-fold predictions go to oof_xgb_ph_rmse; test predictions are summed per
# fold into pred_xgb_ph_rmse and averaged after the loop.
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_xgb_ph_rmse = np.zeros(len(train))
pred_xgb_ph_rmse = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_partial_hazard(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_partial_hazard(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        max_cat_to_onehot=10,
        min_child_weight=100,
        #early_stopping_rounds=25,
    )
    # Early stopping is commented out above, so eval_set presumably only feeds
    # the periodic progress log.
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_ph_rmse[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_ph_rmse += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_ph_rmse /= FOLDS

In [None]:
# 0.674  (author's note, presumably the CV score)

# Evaluate the out-of-fold XGBoost partial-hazard-target predictions with score().
truth_columns = ["ID", "efs", "efs_time", "race_group"]
y_true = train.loc[:, truth_columns].copy()
y_pred = train[["ID"]].assign(prediction=oof_xgb_ph_rmse)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for XGBoost PH Transformation RMSE =", m)

In [None]:
# Round-trip the XGBoost partial-hazard out-of-fold predictions through a pickle
# file (presumably so they can be reloaded later without retraining).
oof_file = r'gbdt-models\oof_xgb_ph_rmse.pkl'
with open(oof_file, mode='wb') as pkl_out:
    pickle.dump(oof_xgb_ph_rmse, pkl_out)
with open(oof_file, mode='rb') as pkl_in:
    oof_xgb_ph_rmse = pickle.load(pkl_in)



In [None]:
                      # transform_partial_hazard,
                      #  transform_separate,
                      #  transform_rank_log,
                      #  transform_quantile,

# XGBoost regressor on transform_partial_hazard targets, 10-fold StratifiedKFold
# (seed 42) stratified on race_group x efs.
# Out-of-fold predictions go to oof_xgb_ph_rmse_skf; test predictions are summed
# per fold into pred_xgb_ph_rmse_skf and averaged after the loop.
FOLDS = 10

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_xgb_ph_rmse_skf = np.zeros(len(train))
pred_xgb_ph_rmse_skf = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_partial_hazard(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_partial_hazard(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        max_cat_to_onehot=20,
        min_child_weight=100,
        #early_stopping_rounds=25,
    )
    # Early stopping is commented out above, so eval_set presumably only feeds
    # the periodic progress log.
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_ph_rmse_skf[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_ph_rmse_skf += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_ph_rmse_skf /= FOLDS

In [None]:
# Evaluate the out-of-fold predictions of the stratified-fold XGBoost
# partial-hazard run with score(), then round-trip them through a pickle file.
truth_columns = ["ID", "efs", "efs_time", "race_group"]
y_true = train.loc[:, truth_columns].copy()
y_pred = train[["ID"]].assign(prediction=oof_xgb_ph_rmse_skf)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for XGBoost PH Transformation with Stratified KFold RMSE =", m)

oof_file = r'gbdt-models\oof_xgb_ph_rmse_skf.pkl'
with open(oof_file, mode='wb') as pkl_out:
    pickle.dump(oof_xgb_ph_rmse_skf, pkl_out)
with open(oof_file, mode='rb') as pkl_in:
    oof_xgb_ph_rmse_skf = pickle.load(pkl_in)


In [None]:
                      # transform_partial_hazard,
                      #  transform_separate,
                      #  transform_rank_log,
                      #  transform_quantile,

# XGBoost regressor on transform_partial_hazard targets, 10-fold StratifiedKFold
# (seed 42) stratified on race_group x dri_score.
# Out-of-fold predictions go to oof_xgb_ph_rmse_skf_dri; test predictions are summed
# per fold into pred_xgb_ph_rmse_skf_dri and averaged after the loop.
FOLDS = 10

skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_xgb_ph_rmse_skf_dri = np.zeros(len(train))
pred_xgb_ph_rmse_skf_dri = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): the same fold indices feed both .loc (label-based) and .iloc
    # (positional); presumably train has a default RangeIndex — confirm.
    # Targets are transformed separately for the training and validation rows.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = transform_partial_hazard(time=train.iloc[train_index].efs_time, event=train.iloc[train_index].efs)
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = transform_partial_hazard(time=train.iloc[test_index].efs_time, event=train.iloc[test_index].efs)
    x_test = test[FEATURES].copy()

    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        max_cat_to_onehot=20,
        min_child_weight=100,
        #early_stopping_rounds=25,
    )
    # Early stopping is commented out above, so eval_set presumably only feeds
    # the periodic progress log.
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_ph_rmse_skf_dri[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_ph_rmse_skf_dri += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_ph_rmse_skf_dri /= FOLDS


In [None]:
# Score the out-of-fold predictions of the XGBoost partial-hazard run whose
# folds are stratified on race_group x dri_score, then persist them.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_xgb_ph_rmse_skf_dri
m = score(y_true.copy(), y_pred.copy(), "ID")
# Label now names the stratification; previously it was identical to the
# message printed for the race_group x efs stratified run.
print("\nOverall CV for XGBoost PH Transformation with Stratified KFold (dri_score) RMSE =",m)
# Write the OOF array to disk and read it straight back.
with open(r'gbdt-models\oof_xgb_ph_rmse_skf_dri.pkl', 'wb') as f:
    pickle.dump(oof_xgb_ph_rmse_skf_dri, f)
with open(r'gbdt-models\oof_xgb_ph_rmse_skf_dri.pkl', 'rb') as f:
    oof_xgb_ph_rmse_skf_dri = pickle.load(f)



# XGBOOST WITH KAPLAN-MEIER

In [None]:
import numpy as np
from lifelines import KaplanMeierFitter

def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Return the Kaplan-Meier survival estimate evaluated at each row's own time.

    Args:
        df: frame holding the duration and event columns.
        time_col: name of the duration column.
        event_col: name of the event-observed column.

    Returns:
        1-D array with one survival probability per row of ``df``, in row order.
    """
    durations = df[time_col]
    fitter = KaplanMeierFitter()
    fitter.fit(durations, event_observed=df[event_col])
    at_own_time = fitter.survival_function_at_times(durations)
    return at_own_time.values.flatten()

def update_target_with_survival_probabilities(df, time_col='efs_time', event_col='efs', censor_penalty=0.15):
    """Build a per-race-group Kaplan-Meier regression target.

    For every value of ``df['race_group']`` a separate Kaplan-Meier curve is
    fitted and evaluated at each row's own time; rows with ``event_col == 0``
    are then shifted down by ``censor_penalty``.

    The Kaplan-Meier fit is done inline rather than through
    ``transform_survival_probability``: that name is redefined later in the
    notebook as a Nelson-Aalen transform, so calling it here would make the
    result depend on which cells have already been executed.

    Args:
        df: frame with ``race_group``, ``time_col`` and ``event_col`` columns.
            NOTE: it is modified in place - a ``'target'`` column is written.
        time_col: name of the duration column.
        event_col: name of the event-observed column.
        censor_penalty: amount subtracted from the target of rows whose
            ``event_col`` equals 0 (default 0.15).

    Returns:
        The ``df['target']`` Series.
    """
    for race in sorted(df['race_group'].unique()):
        in_group = df['race_group'] == race
        group = df.loc[in_group]
        kmf = KaplanMeierFitter()
        kmf.fit(group[time_col], event_observed=group[event_col])
        # Only 'target' is written, so fitting and assigning group by group
        # gives the same values as fitting all groups first.
        df.loc[in_group, 'target'] = kmf.survival_function_at_times(group[time_col]).values.flatten()

    df.loc[df[event_col] == 0, 'target'] -= censor_penalty

    return df['target']


# Build the Kaplan-Meier based target; the training cells below read it as train["y"].
train["y"] = update_target_with_survival_probabilities(train, time_col='efs_time', event_col='efs')

# Overlay the target distribution for efs==1 and efs==0 rows;
# alpha keeps both histograms visible where they overlap.
plt.hist(train.loc[train.efs==1,"y"],bins=100,alpha=0.6,label="efs=1, Yes Event")
plt.hist(train.loc[train.efs==0,"y"],bins=100,alpha=0.6,label="efs=0, Maybe Event")
plt.xlabel("Transformed Target y")
# plt.hist draws raw counts unless density=True is passed, so label the axis as counts.
plt.ylabel("Count")
plt.title("KaplanMeier Transformed Target y using both efs and efs_time.")
plt.legend()
plt.show()

In [None]:
# Preview the first rows of train.
train.head()

In [None]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_xgb_km_rmse = np.zeros(len(train))
pred_xgb_km_rmse = np.zeros(len(test))

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["y"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["y"]

    # Alternative configuration kept for reference: max_depth=3, learning_rate=0.02,
    # min_child_weight=80, no max_cat_to_onehot (other arguments unchanged).
    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        max_cat_to_onehot=15,
        min_child_weight=100,
        #early_stopping_rounds=25,
    )
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_km_rmse[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_km_rmse += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_km_rmse /= FOLDS

In [None]:
# 0.674

# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train.loc[:, ["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_xgb_km_rmse)
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for XGBoost KaplanMeier RMSE =",m)

In [None]:

# Write the OOF vector to disk and read it straight back.
oof_path = r'gbdt-models\oof_xgb_km_rmse.pkl'
with open(oof_path, 'wb') as f:
    pickle.dump(oof_xgb_km_rmse, f)
with open(oof_path, 'rb') as f:
    oof_xgb_km_rmse = pickle.load(f)

In [None]:
%%time
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_xgb_km_rmse_skf = np.zeros(len(train))
pred_xgb_km_rmse_skf = np.zeros(len(test))

# Folds are stratified on the race_group / efs combination.
stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["y"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["y"]

    # Alternative configuration kept for reference: max_depth=3, learning_rate=0.02,
    # min_child_weight=80, no max_cat_to_onehot (other arguments unchanged).
    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        max_cat_to_onehot=15,
        min_child_weight=100,
        #early_stopping_rounds=25,
    )
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_km_rmse_skf[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_km_rmse_skf += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_km_rmse_skf /= FOLDS


In [None]:
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train.loc[:, ["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_xgb_km_rmse_skf)
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for XGBoost KaplanMeier with Stratified KFold RMSE =",m)

# Write the OOF vector to disk and read it straight back.
oof_path = r'gbdt-models\oof_xgb_km_rmse_skf.pkl'
with open(oof_path, 'wb') as f:
    pickle.dump(oof_xgb_km_rmse_skf, f)
with open(oof_path, 'rb') as f:
    oof_xgb_km_rmse_skf = pickle.load(f)

In [None]:
%%time
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_xgb_km_rmse_skf_dri = np.zeros(len(train))
pred_xgb_km_rmse_skf_dri = np.zeros(len(test))

# Folds are stratified on the race_group / dri_score combination.
stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["y"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["y"]

    # Alternative configuration kept for reference: max_depth=3, learning_rate=0.02,
    # min_child_weight=80, no max_cat_to_onehot (other arguments unchanged).
    model_xgb = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        max_cat_to_onehot=15,
        min_child_weight=100,
        #early_stopping_rounds=25,
    )
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_km_rmse_skf_dri[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb_km_rmse_skf_dri += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_km_rmse_skf_dri /= FOLDS


In [None]:
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_xgb_km_rmse_skf_dri
m = score(y_true.copy(), y_pred.copy(), "ID")
# The label names the dri_score stratification so this line cannot be confused
# with the output of the race_group/efs-stratified run.
print("\nOverall CV for XGBoost KaplanMeier with Stratified KFold (race_group + dri_score) RMSE =",m)

# Write the OOF vector to disk and read it straight back.
with open(r'gbdt-models\oof_xgb_km_rmse_skf_dri.pkl', 'wb') as f:
    pickle.dump(oof_xgb_km_rmse_skf_dri, f)
with open(r'gbdt-models\oof_xgb_km_rmse_skf_dri.pkl', 'rb') as f:
    oof_xgb_km_rmse_skf_dri = pickle.load(f)


# CATBOOST WITH KAPLAN-MEIER

In [None]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_cat_km_rmse = np.zeros(len(train))
pred_cat_km_rmse = np.zeros(len(test))

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["y"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["y"]

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Lossguide',
        objective="RMSE",
        #early_stopping_rounds=25,
    )
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_km_rmse[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_km_rmse += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_km_rmse /= FOLDS

In [None]:
# 0.6728 - 0.673
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train.loc[:, ["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_km_rmse)
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for CatBoost KaplanMeier RMSE =",m)

# Write the OOF vector to disk and read it straight back.
oof_path = r'gbdt-models\oof_cat_km_rmse.pkl'
with open(oof_path, 'wb') as f:
    pickle.dump(oof_cat_km_rmse, f)
with open(oof_path, 'rb') as f:
    oof_cat_km_rmse = pickle.load(f)



In [None]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_cat_km_rmse_2 = np.zeros(len(train))
pred_cat_km_rmse_2 = np.zeros(len(test))

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["y"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["y"]

    # Variant "2" of the CatBoost KaplanMeier model: Depthwise tree growth.
    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Depthwise',
        objective="RMSE",
        #early_stopping_rounds=25,
    )
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_km_rmse_2[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_km_rmse_2 += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_km_rmse_2 /= FOLDS

In [None]:
# 0.6728 - 0.673
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train.loc[:, ["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_km_rmse_2)
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for CatBoost KaplanMeier RMSE 2 =",m)

# Write the OOF vector to disk and read it straight back.
oof_path = r'gbdt-models\oof_cat_km_rmse_2.pkl'
with open(oof_path, 'wb') as f:
    pickle.dump(oof_cat_km_rmse_2, f)
with open(oof_path, 'rb') as f:
    oof_cat_km_rmse_2 = pickle.load(f)



In [None]:
%%time
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_cat_km_rmse_skf = np.zeros(len(train))
pred_cat_km_rmse_skf = np.zeros(len(test))

# Folds are stratified on the race_group / efs combination.
stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["y"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["y"]

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Lossguide',
        objective="RMSE",
        #early_stopping_rounds=25,
    )
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_km_rmse_skf[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_km_rmse_skf += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_km_rmse_skf /= FOLDS

In [None]:
# 0.6746

# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train.loc[:, ["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_km_rmse_skf)
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for CatBoost KaplanMeier with Stratified KFold RMSE =",m)

# Write the OOF vector to disk and read it straight back.
oof_path = r'gbdt-models\oof_cat_km_rmse_skf.pkl'
with open(oof_path, 'wb') as f:
    pickle.dump(oof_cat_km_rmse_skf, f)
with open(oof_path, 'rb') as f:
    oof_cat_km_rmse_skf = pickle.load(f)


In [None]:
%%time
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_cat_km_rmse_2_skf = np.zeros(len(train))
pred_cat_km_rmse_2_skf = np.zeros(len(test))

# Folds are stratified on the race_group / efs combination.
stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["y"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["y"]

    # Variant "2" of the CatBoost KaplanMeier model: Depthwise tree growth.
    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Depthwise',
        objective="RMSE",
        #early_stopping_rounds=25,
    )
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # Persist this fold's fitted model (one .cbm file per fold) under catboost_dir.
    model_filename = os.path.join(catboost_dir, f"oof_cat_km_rmse_2_skf{i+1}.cbm")
    model_cat.save_model(model_filename)
    print(f"Model for fold {i+1} saved as {model_filename}")

    # INFER OOF
    oof_cat_km_rmse_2_skf[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_km_rmse_2_skf += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_km_rmse_2_skf /= FOLDS

In [None]:
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train.loc[:, ["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_km_rmse_2_skf)
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for CatBoost KaplanMeier with Stratified KFold RMSE 2 =",m)

# Write the OOF vector to disk and read it straight back.
oof_path = r'gbdt-models\oof_cat_km_rmse_2_skf.pkl'
with open(oof_path, 'wb') as f:
    pickle.dump(oof_cat_km_rmse_2_skf, f)
with open(oof_path, 'rb') as f:
    oof_cat_km_rmse_2_skf = pickle.load(f)


In [None]:
%%time
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_cat_km_rmse_skf_dri = np.zeros(len(train))
pred_cat_km_rmse_skf_dri = np.zeros(len(test))

# Folds are stratified on the race_group / dri_score combination.
stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["y"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["y"]

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Lossguide',
        objective="RMSE",
        #early_stopping_rounds=25,
    )
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_km_rmse_skf_dri[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_km_rmse_skf_dri += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_km_rmse_skf_dri /= FOLDS


In [None]:
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_km_rmse_skf_dri
m = score(y_true.copy(), y_pred.copy(), "ID")
# The label names the dri_score stratification so this line cannot be confused
# with the output of the race_group/efs-stratified run.
print("\nOverall CV for CatBoost KaplanMeier with Stratified KFold (race_group + dri_score) RMSE =",m)

# Write the OOF vector to disk and read it straight back.
with open(r'gbdt-models\oof_cat_km_rmse_skf_dri.pkl', 'wb') as f:
    pickle.dump(oof_cat_km_rmse_skf_dri, f)
with open(r'gbdt-models\oof_cat_km_rmse_skf_dri.pkl', 'rb') as f:
    oof_cat_km_rmse_skf_dri = pickle.load(f)


In [None]:
%%time
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_cat_km_rmse_2_skf_dri = np.zeros(len(train))
pred_cat_km_rmse_2_skf_dri = np.zeros(len(test))

# Folds are stratified on the race_group / dri_score combination.
stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["y"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["y"]

    # Variant "2" of the CatBoost KaplanMeier model: Depthwise tree growth.
    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Depthwise',
        objective="RMSE",
        #early_stopping_rounds=25,
    )
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # INFER OOF
    oof_cat_km_rmse_2_skf_dri[test_index] = model_cat.predict(x_valid)
    # INFER TEST
    pred_cat_km_rmse_2_skf_dri += model_cat.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_km_rmse_2_skf_dri /= FOLDS


In [None]:
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_km_rmse_2_skf_dri
m = score(y_true.copy(), y_pred.copy(), "ID")
# The label names the dri_score stratification so this line cannot be confused
# with the output of the race_group/efs-stratified "RMSE 2" run.
print("\nOverall CV for CatBoost KaplanMeier with Stratified KFold (race_group + dri_score) RMSE 2 =",m)

# Write the OOF vector to disk and read it straight back.
with open(r'gbdt-models\oof_cat_km_rmse_2_skf_dri.pkl', 'wb') as f:
    pickle.dump(oof_cat_km_rmse_2_skf_dri, f)
with open(r'gbdt-models\oof_cat_km_rmse_2_skf_dri.pkl', 'rb') as f:
    oof_cat_km_rmse_2_skf_dri = pickle.load(f)


# XGBOOST WITH SURVIVAL:COX

In [None]:
# Target for the Cox objectives: efs and efs_time folded into one signed column.
# Rows with efs == 0 get the negated efs_time; all other rows keep efs_time as is.
train["efs_time2"] = train.efs_time.where(train.efs != 0, -train.efs_time)

In [None]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_xgb_cox = np.zeros(len(train))
pred_xgb_cox = np.zeros(len(test))

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["efs_time2"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["efs_time2"]

    model_xgb_cox = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        min_child_weight=100,
        max_cat_to_onehot=10,
        objective='survival:cox',
        eval_metric='cox-nloglik',
    )
    model_xgb_cox.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_cox[test_index] = model_xgb_cox.predict(x_valid)
    # INFER TEST
    pred_xgb_cox += model_xgb_cox.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_cox /= FOLDS

In [None]:
# 0.6737
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train.loc[:, ["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_xgb_cox)
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for XGBoost Survival:Cox =",m)

# Write the OOF vector to disk and read it straight back.
oof_path = r'gbdt-models\oof_xgb_cox.pkl'
with open(oof_path, 'wb') as f:
    pickle.dump(oof_xgb_cox, f)
with open(oof_path, 'rb') as f:
    oof_xgb_cox = pickle.load(f)



In [None]:
%%time
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_xgb_cox_skf = np.zeros(len(train))
pred_xgb_cox_skf = np.zeros(len(test))

# Folds are stratified on the race_group / efs combination.
stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["efs_time2"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["efs_time2"]

    model_xgb_cox = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        min_child_weight=100,
        max_cat_to_onehot=10,
        objective='survival:cox',
        eval_metric='cox-nloglik',
    )
    model_xgb_cox.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_cox_skf[test_index] = model_xgb_cox.predict(x_valid)
    # INFER TEST
    pred_xgb_cox_skf += model_xgb_cox.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_cox_skf /= FOLDS



In [None]:
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train.loc[:, ["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_xgb_cox_skf)
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for XGBoost Survival:Cox with Stratified KFold =",m)

# Write the OOF vector to disk and read it straight back.
oof_path = r'gbdt-models\oof_xgb_cox_skf.pkl'
with open(oof_path, 'wb') as f:
    pickle.dump(oof_xgb_cox_skf, f)
with open(oof_path, 'rb') as f:
    oof_xgb_cox_skf = pickle.load(f)



In [None]:
%%time
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_xgb_cox_skf_dri = np.zeros(len(train))
pred_xgb_cox_skf_dri = np.zeros(len(test))

# Folds are stratified on the race_group / dri_score combination.
stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["efs_time2"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["efs_time2"]

    model_xgb_cox = XGBRegressor(
        device="cuda",
        max_depth=4,
        colsample_bytree=0.5,
        subsample=0.8,
        n_estimators=2000,
        learning_rate=0.025,
        enable_categorical=True,
        min_child_weight=100,
        max_cat_to_onehot=10,
        objective='survival:cox',
        eval_metric='cox-nloglik',
    )
    model_xgb_cox.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],
        verbose=500
    )

    # INFER OOF
    oof_xgb_cox_skf_dri[test_index] = model_xgb_cox.predict(x_valid)
    # INFER TEST
    pred_xgb_cox_skf_dri += model_xgb_cox.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb_cox_skf_dri /= FOLDS




In [None]:
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_xgb_cox_skf_dri
m = score(y_true.copy(), y_pred.copy(), "ID")
# The label names the dri_score stratification so this line cannot be confused
# with the output of the race_group/efs-stratified run.
print("\nOverall CV for XGBoost Survival:Cox with Stratified KFold (race_group + dri_score) =",m)

# Write the OOF vector to disk and read it straight back.
with open(r'gbdt-models\oof_xgb_cox_skf_dri.pkl', 'wb') as f:
    pickle.dump(oof_xgb_cox_skf_dri, f)
with open(r'gbdt-models\oof_xgb_cox_skf_dri.pkl', 'rb') as f:
    oof_xgb_cox_skf_dri = pickle.load(f)



# CATBOOST WITH SURVIVAL:COX

In [None]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_cat_cox = np.zeros(len(train))
pred_cat_cox = np.zeros(len(test))

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["efs_time2"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["efs_time2"]

    model_cat_cox = CatBoostRegressor(
        loss_function="Cox",
        #task_type="GPU",
        iterations=400,
        learning_rate=0.1,
        grow_policy='Lossguide',
        use_best_model=False,
    )
    model_cat_cox.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=100)

    # INFER OOF
    oof_cat_cox[test_index] = model_cat_cox.predict(x_valid)
    # INFER TEST
    pred_cat_cox += model_cat_cox.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_cox /= FOLDS

In [None]:
# 0.6707
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train.loc[:, ["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_cox)
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for CatBoost Survival:Cox =",m)

# Write the OOF vector to disk and read it straight back.
oof_path = r'gbdt-models\oof_cat_cox.pkl'
with open(oof_path, 'wb') as f:
    pickle.dump(oof_cat_cox, f)
with open(oof_path, 'rb') as f:
    oof_cat_cox = pickle.load(f)



In [None]:
%%time
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_cat_cox_skf = np.zeros(len(train))
pred_cat_cox_skf = np.zeros(len(test))

# Folds are stratified on the race_group / efs combination.
stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["efs_time2"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["efs_time2"]

    model_cat_cox = CatBoostRegressor(
        loss_function="Cox",
        #task_type="GPU",
        iterations=400,
        learning_rate=0.1,
        grow_policy='Lossguide',
        use_best_model=False,
    )
    model_cat_cox.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=100)

    # INFER OOF
    oof_cat_cox_skf[test_index] = model_cat_cox.predict(x_valid)
    # INFER TEST
    pred_cat_cox_skf += model_cat_cox.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_cox_skf /= FOLDS


In [None]:
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train.loc[:, ["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_cox_skf)
m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for CatBoost Survival:Cox with Stratified KFold =",m)

# Write the OOF vector to disk and read it straight back.
oof_path = r'gbdt-models\oof_cat_cox_skf.pkl'
with open(oof_path, 'wb') as f:
    pickle.dump(oof_cat_cox_skf, f)
with open(oof_path, 'rb') as f:
    oof_cat_cox_skf = pickle.load(f)


In [None]:
%%time
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_cat_cox_skf_dri = np.zeros(len(train))
pred_cat_cox_skf_dri = np.zeros(len(test))

# Folds are stratified on the race_group / dri_score combination.
stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["efs_time2"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["efs_time2"]

    model_cat_cox = CatBoostRegressor(
        loss_function="Cox",
        #task_type="GPU",
        iterations=400,
        learning_rate=0.1,
        grow_policy='Lossguide',
        use_best_model=False,
    )
    model_cat_cox.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=100)

    # INFER OOF
    oof_cat_cox_skf_dri[test_index] = model_cat_cox.predict(x_valid)
    # INFER TEST
    pred_cat_cox_skf_dri += model_cat_cox.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_cox_skf_dri /= FOLDS


In [None]:
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_cox_skf_dri
m = score(y_true.copy(), y_pred.copy(), "ID")
# The label names the dri_score stratification so this line cannot be confused
# with the output of the race_group/efs-stratified run.
print("\nOverall CV for CatBoost Survival:Cox with Stratified KFold (race_group + dri_score) =",m)

# Write the OOF vector to disk and read it straight back.
with open(r'gbdt-models\oof_cat_cox_skf_dri.pkl', 'wb') as f:
    pickle.dump(oof_cat_cox_skf_dri, f)
with open(r'gbdt-models\oof_cat_cox_skf_dri.pkl', 'rb') as f:
    oof_cat_cox_skf_dri = pickle.load(f)


In [None]:
%%time
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_cat_cox_2 = np.zeros(len(train))
pred_cat_cox_2 = np.zeros(len(test))

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["efs_time2"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["efs_time2"]

    # Variant "2" of the CatBoost Cox model: Depthwise tree growth.
    model_cat_cox = CatBoostRegressor(
        loss_function="Cox",
        #task_type="GPU",
        iterations=400,
        learning_rate=0.1,
        grow_policy='Depthwise',
        use_best_model=False,
    )
    model_cat_cox.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=100)

    # INFER OOF
    oof_cat_cox_2[test_index] = model_cat_cox.predict(x_valid)
    # INFER TEST
    pred_cat_cox_2 += model_cat_cox.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_cox_2 /= FOLDS

In [None]:
# 0.6707
# NOTE(review): the value above is identical to the note on the Lossguide
# CatBoost Cox score cell - confirm it was recorded for this Depthwise run.

# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_cox_2
m = score(y_true.copy(), y_pred.copy(), "ID")
# "Cox 2" marks the Depthwise variant (same "2" suffix as the KaplanMeier labels),
# so the line cannot be confused with the output of the Lossguide Cox run.
print("\nOverall CV for CatBoost Survival:Cox 2 =",m)

# Write the OOF vector to disk and read it straight back.
with open(r'gbdt-models\oof_cat_cox_2.pkl', 'wb') as f:
    pickle.dump(oof_cat_cox_2, f)
with open(r'gbdt-models\oof_cat_cox_2.pkl', 'rb') as f:
    oof_cat_cox_2 = pickle.load(f)

In [None]:
%%time
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_cat_cox_2_skf = np.zeros(len(train))
pred_cat_cox_2_skf = np.zeros(len(test))

# Folds are stratified on the race_group / efs combination.
stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["efs_time2"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["efs_time2"]

    # Variant "2" of the CatBoost Cox model: Depthwise tree growth.
    model_cat_cox = CatBoostRegressor(
        loss_function="Cox",
        #task_type="GPU",
        iterations=400,
        learning_rate=0.1,
        grow_policy='Depthwise',
        use_best_model=False,
    )
    model_cat_cox.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=100)

    # INFER OOF
    oof_cat_cox_2_skf[test_index] = model_cat_cox.predict(x_valid)
    # INFER TEST
    pred_cat_cox_2_skf += model_cat_cox.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_cox_2_skf /= FOLDS

In [None]:
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_cox_2_skf
m = score(y_true.copy(), y_pred.copy(), "ID")
# "Cox 2" marks the Depthwise variant so the line cannot be confused with
# the output of the Lossguide stratified run.
print("\nOverall CV for CatBoost Survival:Cox 2 with Stratified KFold =",m)

# Write the OOF vector to disk and read it straight back.
with open(r'gbdt-models\oof_cat_cox_2_skf.pkl', 'wb') as f:
    pickle.dump(oof_cat_cox_2_skf, f)
with open(r'gbdt-models\oof_cat_cox_2_skf.pkl', 'rb') as f:
    oof_cat_cox_2_skf = pickle.load(f)


In [None]:
%%time
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions for train and the running sum of per-fold test predictions.
oof_cat_cox_2_skf_dri = np.zeros(len(train))
pred_cat_cox_2_skf_dri = np.zeros(len(test))

# Folds are stratified on the race_group / dri_score combination.
stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

# The test features do not change between folds, so slice them once.
x_test = test[FEATURES].copy()

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # split() yields positional indices, so rows are taken with .iloc;
    # label-based .loc only agrees while train has a default RangeIndex.
    fold_train = train.iloc[train_index]
    fold_valid = train.iloc[test_index]
    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["efs_time2"]
    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["efs_time2"]

    # Variant "2" of the CatBoost Cox model: Depthwise tree growth.
    model_cat_cox = CatBoostRegressor(
        loss_function="Cox",
        #task_type="GPU",
        iterations=400,
        learning_rate=0.1,
        grow_policy='Depthwise',
        use_best_model=False,
    )
    model_cat_cox.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=100)

    # INFER OOF
    oof_cat_cox_2_skf_dri[test_index] = model_cat_cox.predict(x_valid)
    # INFER TEST
    pred_cat_cox_2_skf_dri += model_cat_cox.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_cat_cox_2_skf_dri /= FOLDS

In [None]:
# Score the out-of-fold predictions with the competition metric defined at the top of the notebook.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_cat_cox_2_skf_dri
m = score(y_true.copy(), y_pred.copy(), "ID")
# "Cox 2" marks the Depthwise variant and the suffix names the dri_score
# stratification, so the line is unique among the CatBoost Cox outputs.
print("\nOverall CV for CatBoost Survival:Cox 2 with Stratified KFold (race_group + dri_score) =",m)

# Write the OOF vector to disk and read it straight back.
with open(r'gbdt-models\oof_cat_cox_2_skf_dri.pkl', 'wb') as f:
    pickle.dump(oof_cat_cox_2_skf_dri, f)
with open(r'gbdt-models\oof_cat_cox_2_skf_dri.pkl', 'rb') as f:
    oof_cat_cox_2_skf_dri = pickle.load(f)


# XGBOOST WITH NELSON-AALEN

In [None]:
from lifelines import NelsonAalenFitter
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Return the negated Nelson-Aalen cumulative hazard at each row's own time.

    A NelsonAalenFitter is fitted on the whole of ``df`` (durations from
    ``time_col``, event indicator from ``event_col``) and then evaluated at
    every value of ``df[time_col]``.

    Note: despite the function name, the result is ``-H(t)`` (a negated
    cumulative hazard), not a probability.

    Returns:
        numpy array with one value per row of ``df``.
    """
    fitter = NelsonAalenFitter()
    fitter.fit(df[time_col], df[event_col])
    cumulative_hazard = fitter.cumulative_hazard_at_times(df[time_col])
    return -cumulative_hazard.values

# Regression target for the Nelson-Aalen models: negated cumulative hazard at each row's efs_time.
train["y_naf"] = transform_survival_probability(train, time_col='efs_time', event_col='efs')


# Compare the target distribution of rows with an event (efs==1) and without one (efs==0).
plt.hist(train.loc[train.efs==1,"y_naf"],bins=100,label="efs=1, Yes Event")
plt.hist(train.loc[train.efs==0,"y_naf"],bins=100,label="efs=0, Maybe Event")
plt.xlabel("Transformed Target y")
# plt.hist draws raw bin counts unless density=True is passed, so the axis shows counts.
plt.ylabel("Count")
plt.title("NELSON AALEN Transformed Target y using both efs and efs_time.")
plt.legend()
plt.show()

In [None]:
%%time
# 10-fold CV of an XGBoost squared-error regressor on the Nelson-Aalen target y_naf.
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_xgb_naf_rmse = np.zeros(len(train))
pred_xgb_naf_rmse = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_naf"]
    y_valid = train.loc[test_index, "y_naf"]
    x_test = test[FEATURES].copy()

    model_xgb = XGBRegressor(
        objective='reg:squarederror',
        n_estimators=2000,
        learning_rate=0.025,
        max_depth=4,
        min_child_weight=100,
        subsample=0.8,
        colsample_bytree=0.5,
        enable_categorical=True,
        max_cat_to_onehot=10,
        device="cuda",
        #early_stopping_rounds=25,
    )

    # The validation fold is only monitored here: early stopping is commented out above.
    model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=500)

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_xgb_naf_rmse[test_index] = model_xgb.predict(x_valid)
    pred_xgb_naf_rmse += model_xgb.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_xgb_naf_rmse /= FOLDS

In [None]:
# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_xgb_naf_rmse)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for XGBoost NelsonAalen RMSE =", m)


In [None]:
# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_xgb_naf_rmse.pkl', 'wb') as f:
    pickle.dump(oof_xgb_naf_rmse, file=f)
with open(r'gbdt-models\oof_xgb_naf_rmse.pkl', 'rb') as f:
    oof_xgb_naf_rmse = pickle.load(file=f)


In [None]:
%%time
# 10-fold stratified CV of an XGBoost squared-error regressor on y_naf;
# folds are balanced on race_group x efs.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_xgb_naf_rmse_skf = np.zeros(len(train))
pred_xgb_naf_rmse_skf = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_naf"]
    y_valid = train.loc[test_index, "y_naf"]
    x_test = test[FEATURES].copy()

    model_xgb = XGBRegressor(
        objective='reg:squarederror',
        n_estimators=2000,
        learning_rate=0.025,
        max_depth=4,
        min_child_weight=100,
        subsample=0.8,
        colsample_bytree=0.5,
        enable_categorical=True,
        max_cat_to_onehot=10,
        device="cuda",
        #early_stopping_rounds=25,
    )

    # The validation fold is only monitored here: early stopping is commented out above.
    model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=500)

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_xgb_naf_rmse_skf[test_index] = model_xgb.predict(x_valid)
    pred_xgb_naf_rmse_skf += model_xgb.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_xgb_naf_rmse_skf /= FOLDS

In [None]:
# 0.6782

# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_xgb_naf_rmse_skf)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for XGBoost NelsonAalen RMSE =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_xgb_naf_rmse_skf.pkl', 'wb') as f:
    pickle.dump(oof_xgb_naf_rmse_skf, f)
with open(r'gbdt-models\oof_xgb_naf_rmse_skf.pkl', 'rb') as f:
    oof_xgb_naf_rmse_skf = pickle.load(f)


In [None]:
%%time
# 10-fold stratified CV of an XGBoost squared-error regressor on y_naf;
# folds are balanced on race_group x dri_score.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_xgb_naf_rmse_skf_dri = np.zeros(len(train))
pred_xgb_naf_rmse_skf_dri = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_naf"]
    y_valid = train.loc[test_index, "y_naf"]
    x_test = test[FEATURES].copy()

    model_xgb = XGBRegressor(
        objective='reg:squarederror',
        n_estimators=2000,
        learning_rate=0.025,
        max_depth=4,
        min_child_weight=100,
        subsample=0.8,
        colsample_bytree=0.5,
        enable_categorical=True,
        max_cat_to_onehot=10,
        device="cuda",
        #early_stopping_rounds=25,
    )

    # The validation fold is only monitored here: early stopping is commented out above.
    model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=500)

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_xgb_naf_rmse_skf_dri[test_index] = model_xgb.predict(x_valid)
    pred_xgb_naf_rmse_skf_dri += model_xgb.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_xgb_naf_rmse_skf_dri /= FOLDS

In [None]:
# 0.6775

# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_xgb_naf_rmse_skf_dri)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for XGBoost NelsonAalen RMSE =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_xgb_naf_rmse_skf_dri.pkl', 'wb') as f:
    pickle.dump(oof_xgb_naf_rmse_skf_dri, f)
with open(r'gbdt-models\oof_xgb_naf_rmse_skf_dri.pkl', 'rb') as f:
    oof_xgb_naf_rmse_skf_dri = pickle.load(f)



# CATBOOST WITH NELSON-AALEN

In [None]:
%%time
# 10-fold CV of a CatBoost regressor (Lossguide trees) on the Nelson-Aalen target y_naf.
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_cat_naf = np.zeros(len(train))
pred_cat_naf = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_naf"]
    y_valid = train.loc[test_index, "y_naf"]
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        grow_policy='Lossguide',
        learning_rate=0.1,
        task_type="GPU",
        #early_stopping_rounds=25,
    )
    model_cat.fit(
        x_train, y_train,
        eval_set=(x_valid, y_valid),
        cat_features=CATS,
        verbose=250,
    )

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_cat_naf[test_index] = model_cat.predict(x_valid)
    pred_cat_naf += model_cat.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_cat_naf /= FOLDS

In [None]:
# 0.6755
# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_naf)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost NelsonAalen =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_cat_naf.pkl', 'wb') as f:
    pickle.dump(oof_cat_naf, f)
with open(r'gbdt-models\oof_cat_naf.pkl', 'rb') as f:
    oof_cat_naf = pickle.load(f)



In [None]:
%%time
# 10-fold stratified CV of a CatBoost regressor (Lossguide trees) on y_naf;
# folds are balanced on race_group x efs.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_cat_naf_skf = np.zeros(len(train))
pred_cat_naf_skf = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_naf"]
    y_valid = train.loc[test_index, "y_naf"]
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        grow_policy='Lossguide',
        learning_rate=0.1,
        task_type="GPU",
        #early_stopping_rounds=25,
    )
    model_cat.fit(
        x_train, y_train,
        eval_set=(x_valid, y_valid),
        cat_features=CATS,
        verbose=250,
    )

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_cat_naf_skf[test_index] = model_cat.predict(x_valid)
    pred_cat_naf_skf += model_cat.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_cat_naf_skf /= FOLDS

In [None]:
# 0.6755
# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_naf_skf)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost NelsonAalen =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_cat_naf_skf.pkl', 'wb') as f:
    pickle.dump(oof_cat_naf_skf, f)
with open(r'gbdt-models\oof_cat_naf_skf.pkl', 'rb') as f:
    oof_cat_naf_skf = pickle.load(f)



In [None]:
%%time
# 10-fold stratified CV of a CatBoost regressor (Lossguide trees) on y_naf;
# folds are balanced on race_group x dri_score.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_cat_naf_skf_dri = np.zeros(len(train))
pred_cat_naf_skf_dri = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_naf"]
    y_valid = train.loc[test_index, "y_naf"]
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        grow_policy='Lossguide',
        learning_rate=0.1,
        task_type="GPU",
        #early_stopping_rounds=25,
    )
    model_cat.fit(
        x_train, y_train,
        eval_set=(x_valid, y_valid),
        cat_features=CATS,
        verbose=250,
    )

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_cat_naf_skf_dri[test_index] = model_cat.predict(x_valid)
    pred_cat_naf_skf_dri += model_cat.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_cat_naf_skf_dri /= FOLDS

In [None]:
# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_naf_skf_dri)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost NelsonAalen =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_cat_naf_skf_dri.pkl', 'wb') as f:
    pickle.dump(oof_cat_naf_skf_dri, f)
with open(r'gbdt-models\oof_cat_naf_skf_dri.pkl', 'rb') as f:
    oof_cat_naf_skf_dri = pickle.load(f)


In [None]:
%%time
# 10-fold CV of a CatBoost regressor (Depthwise trees) on the Nelson-Aalen target y_naf.
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_cat_naf_2 = np.zeros(len(train))
pred_cat_naf_2 = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_naf"]
    y_valid = train.loc[test_index, "y_naf"]
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        grow_policy='Depthwise',
        learning_rate=0.1,
        task_type="GPU",
        #early_stopping_rounds=25,
    )
    model_cat.fit(
        x_train, y_train,
        eval_set=(x_valid, y_valid),
        cat_features=CATS,
        verbose=250,
    )

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_cat_naf_2[test_index] = model_cat.predict(x_valid)
    pred_cat_naf_2 += model_cat.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_cat_naf_2 /= FOLDS

In [None]:
# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_naf_2)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost NelsonAalen =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_cat_naf_2.pkl', 'wb') as f:
    pickle.dump(oof_cat_naf_2, f)
with open(r'gbdt-models\oof_cat_naf_2.pkl', 'rb') as f:
    oof_cat_naf_2 = pickle.load(f)


In [None]:
%%time
# 10-fold stratified CV of a CatBoost regressor (Depthwise trees) on y_naf;
# folds are balanced on race_group x efs. Each fold's model is also saved to disk.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_cat_naf_2_skf = np.zeros(len(train))
pred_cat_naf_2_skf = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_naf"]
    y_valid = train.loc[test_index, "y_naf"]
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        grow_policy='Depthwise',
        learning_rate=0.1,
        task_type="GPU",
        #early_stopping_rounds=25,
    )
    model_cat.fit(
        x_train, y_train,
        eval_set=(x_valid, y_valid),
        cat_features=CATS,
        verbose=250,
    )

    # Save this fold's model under catboost_dir (defined elsewhere in the notebook).
    model_filename = os.path.join(catboost_dir, f"oof_cat_naf_2_skf{i+1}.cbm")
    model_cat.save_model(model_filename)
    print(f"Model for fold {i+1} saved as {model_filename}")

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_cat_naf_2_skf[test_index] = model_cat.predict(x_valid)
    pred_cat_naf_2_skf += model_cat.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_cat_naf_2_skf /= FOLDS


In [None]:
# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_naf_2_skf)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost NelsonAalen =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_cat_naf_2_skf.pkl', 'wb') as f:
    pickle.dump(oof_cat_naf_2_skf, f)
with open(r'gbdt-models\oof_cat_naf_2_skf.pkl', 'rb') as f:
    oof_cat_naf_2_skf = pickle.load(f)


In [None]:
%%time
# 10-fold stratified CV of a CatBoost regressor (Depthwise trees) on y_naf;
# folds are balanced on race_group x dri_score.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_cat_naf_2_skf_dri = np.zeros(len(train))
pred_cat_naf_2_skf_dri = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_naf"]
    y_valid = train.loc[test_index, "y_naf"]
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        grow_policy='Depthwise',
        learning_rate=0.1,
        task_type="GPU",
        #early_stopping_rounds=25,
    )
    model_cat.fit(
        x_train, y_train,
        eval_set=(x_valid, y_valid),
        cat_features=CATS,
        verbose=250,
    )

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_cat_naf_2_skf_dri[test_index] = model_cat.predict(x_valid)
    pred_cat_naf_2_skf_dri += model_cat.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_cat_naf_2_skf_dri /= FOLDS


In [None]:
# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_naf_2_skf_dri)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost NelsonAalen =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_cat_naf_2_skf_dri.pkl', 'wb') as f:
    pickle.dump(oof_cat_naf_2_skf_dri, f)
with open(r'gbdt-models\oof_cat_naf_2_skf_dri.pkl', 'rb') as f:
    oof_cat_naf_2_skf_dri = pickle.load(f)


# XGB WITH SURVIVAL:AFT

In [None]:
%%time


# 10-fold CV of an XGBoost accelerated-failure-time (survival:aft) model.
#
# Changes from the previous version of this cell:
#   * test inference is enabled: it was commented out (and referenced a
#     different model, model_xgb_cox), so pred_xgb_aft stayed all zeros;
#   * test predictions are negated exactly like the OOF predictions, so both
#     arrays have the same orientation;
#   * 'enable_categorical' is no longer passed as a booster parameter; it is
#     set on each DMatrix, which is where categorical support is enabled;
#   * the unused efs_time2 y_train / y_valid assignments were removed;
#   * the validation DMatrix is passed as `evals`, so its label bounds and the
#     'aft-nloglik' eval_metric are actually used (monitoring only).
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_xgb_aft = np.zeros(len(train))
pred_xgb_aft = np.zeros(len(test))

# Fold-independent booster parameters.
xgboost_aft_params = {'learning_rate': 0.025, 'max_depth': 3,
                      'colsample_bytree': 0.5, 'subsample': 0.8,
                      'min_child_weight': 100,
                      'objective': 'survival:aft',
                      'aft_loss_distribution_scale': 0.9,
                      'aft_loss_distribution': 'normal',
                      'eval_metric': 'aft-nloglik'}

# The test matrix does not depend on the fold, so it is built once.
x_test = test[FEATURES].copy()
d_test = xgb.DMatrix(x_test, enable_categorical=True)

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): features are selected with .loc and labels with .iloc; the two
    # agree only if train has a default RangeIndex - confirm.
    x_train = train.loc[train_index,FEATURES].copy()
    x_valid = train.loc[test_index,FEATURES].copy()

    # AFT labels are intervals [lower, upper]: both bounds equal efs_time when the
    # event was observed (efs != 0); upper is +inf for censored rows (efs == 0).
    d_tr = xgb.DMatrix(x_train, enable_categorical=True)
    d_tr.set_float_info('label_lower_bound', train.efs_time.iloc[train_index])
    d_tr.set_float_info('label_upper_bound', np.where(train.efs.iloc[train_index]==0, np.inf,
                                                      train.efs_time.iloc[train_index]))
    d_val = xgb.DMatrix(x_valid, enable_categorical=True)
    d_val.set_float_info('label_lower_bound', train.efs_time.iloc[test_index])
    d_val.set_float_info('label_upper_bound', np.where(train.efs.iloc[test_index]==0, np.inf,
                                                      train.efs_time.iloc[test_index]))

    # No early stopping is requested: all 2000 rounds are trained; evals only logs.
    model_xgb_aft = xgb.train(xgboost_aft_params, d_tr, num_boost_round=2000,
                              evals=[(d_val, "valid")], verbose_eval=500)

    # INFER OOF (negated, as in the original cell)
    oof_xgb_aft[test_index] = -(model_xgb_aft.predict(d_val))
    # INFER TEST (same negation as OOF so both share one orientation)
    pred_xgb_aft += -(model_xgb_aft.predict(d_test))

# COMPUTE AVERAGE TEST PREDS
pred_xgb_aft /= FOLDS

In [None]:
# 0.6698

# Evaluate the out-of-fold AFT predictions with the competition metric `score`.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_xgb_aft
m = score(y_true.copy(), y_pred.copy(), "ID")
# The label previously said "PartialHazard", but oof_xgb_aft comes from the survival:aft model.
print("\nOverall CV for XGBoost AFT =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_xgb_aft.pkl', 'wb') as f:
    pickle.dump(oof_xgb_aft, f)
with open(r'gbdt-models\oof_xgb_aft.pkl', 'rb') as f:
    oof_xgb_aft = pickle.load(f)



# CATBOOST WITH SURVIVAL:AFT

In [None]:
%%time

# 10-fold CV of a CatBoost SurvivalAft model.
#
# Two-column interval label: column 0 is efs_time; column 1 is efs_time when
# efs == 1 and -1 otherwise (presumably CatBoost's marker for an open upper
# bound on censored rows - confirm against the SurvivalAft documentation).
y_aft = np.column_stack([train.efs_time,
                     np.where(train.efs == 1, train.efs_time, -1)])

FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_cat_aft = np.zeros(len(train))
pred_cat_aft = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): x_* are selected by label (.loc) and y_* by position (y_aft[...]);
    # the two agree only if train has a default RangeIndex - confirm.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = y_aft[train_index]
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = y_aft[test_index]
    x_test = test[FEATURES].copy()

    model_cat_aft = CatBoostRegressor(
        loss_function='SurvivalAft:dist=Normal',
        eval_metric='SurvivalAft',
        #task_type="GPU",
        iterations=600,
        learning_rate=0.1,
        grow_policy='SymmetricTree',
        use_best_model=False,
    )
    model_cat_aft.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=100)

    # INFER OOF (negated, as in the original cell)
    oof_cat_aft[test_index] = -(model_cat_aft.predict(x_valid))
    # INFER TEST
    # Fix: the test predictions were accumulated without the negation applied to the
    # OOF predictions, so the two arrays had opposite orientation.
    # NOTE(review): if a later cell already negates pred_cat_aft, drop that negation.
    pred_cat_aft += -(model_cat_aft.predict(x_test))

# COMPUTE AVERAGE TEST PREDS
pred_cat_aft /= FOLDS

In [None]:
# 0.6682
# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_aft)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost AFT =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_cat_aft.pkl', 'wb') as f:
    pickle.dump(oof_cat_aft, f)
with open(r'gbdt-models\oof_cat_aft.pkl', 'rb') as f:
    oof_cat_aft = pickle.load(f)


# XGBOOST WITH BreslowFlemingHarringtonFitter

In [None]:
from lifelines import BreslowFlemingHarringtonFitter
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Return the Breslow-Fleming-Harrington survival estimate at each row's own time.

    A BreslowFlemingHarringtonFitter is fitted on the whole of ``df``
    (durations from ``time_col``, event indicator from ``event_col``) and then
    evaluated at every value of ``df[time_col]``.

    Note: this redefines a function of the same name declared earlier in the
    notebook; after this cell runs, the name refers to this version.

    Returns:
        numpy array with one value per row of ``df``.
    """
    fitter = BreslowFlemingHarringtonFitter()
    fitter.fit(df[time_col], df[event_col])
    survival = fitter.survival_function_at_times(df[time_col])
    return survival.values
# Regression target for the BFHF models: survival estimate at each row's efs_time.
train["y_bfhf"] = transform_survival_probability(train, time_col='efs_time', event_col='efs')

# Compare the target distribution of rows with an event (efs==1) and without one (efs==0).
plt.hist(train.loc[train.efs==1,"y_bfhf"],bins=100,label="efs=1, Yes Event")
plt.hist(train.loc[train.efs==0,"y_bfhf"],bins=100,label="efs=0, Maybe Event")
plt.xlabel("Transformed Target y")
# plt.hist draws raw bin counts unless density=True is passed, so the axis shows counts.
plt.ylabel("Count")
plt.title("bfhf Transformed Target y using both efs and efs_time.")
plt.legend()
plt.show()

In [None]:
%%time
# 10-fold CV of an XGBoost regressor (default objective) on the BFHF target y_bfhf.
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_xgb_bfhf_rmse = np.zeros(len(train))
pred_xgb_bfhf_rmse = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_bfhf"]
    y_valid = train.loc[test_index, "y_bfhf"]
    x_test = test[FEATURES].copy()

    # An alternative configuration is kept for reference only:
    # max_depth=3, learning_rate=0.02, min_child_weight=80 (other settings unchanged).
    model_xgb = XGBRegressor(
        n_estimators=2000,
        learning_rate=0.025,
        max_depth=4,
        min_child_weight=100,
        subsample=0.8,
        colsample_bytree=0.5,
        enable_categorical=True,
        device="cuda",
        #early_stopping_rounds=25,
    )

    # The validation fold is only monitored here: early stopping is commented out above.
    model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=500)

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_xgb_bfhf_rmse[test_index] = model_xgb.predict(x_valid)
    pred_xgb_bfhf_rmse += model_xgb.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_xgb_bfhf_rmse /= FOLDS

In [None]:
# 0.6745

# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_xgb_bfhf_rmse)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for XGBoost BFHF RMSE =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_xgb_bfhf_rmse.pkl', 'wb') as f:
    pickle.dump(oof_xgb_bfhf_rmse, f)
with open(r'gbdt-models\oof_xgb_bfhf_rmse.pkl', 'rb') as f:
    oof_xgb_bfhf_rmse = pickle.load(f)



In [None]:
%%time
# 10-fold stratified CV of an XGBoost regressor (default objective) on y_bfhf;
# folds are balanced on race_group x efs.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_xgb_bfhf_rmse_skf = np.zeros(len(train))
pred_xgb_bfhf_rmse_skf = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_bfhf"]
    y_valid = train.loc[test_index, "y_bfhf"]
    x_test = test[FEATURES].copy()

    # An alternative configuration is kept for reference only:
    # max_depth=3, learning_rate=0.02, min_child_weight=80 (other settings unchanged).
    model_xgb = XGBRegressor(
        n_estimators=2000,
        learning_rate=0.025,
        max_depth=4,
        min_child_weight=100,
        subsample=0.8,
        colsample_bytree=0.5,
        enable_categorical=True,
        device="cuda",
        #early_stopping_rounds=25,
    )

    # The validation fold is only monitored here: early stopping is commented out above.
    model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=500)

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_xgb_bfhf_rmse_skf[test_index] = model_xgb.predict(x_valid)
    pred_xgb_bfhf_rmse_skf += model_xgb.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_xgb_bfhf_rmse_skf /= FOLDS


In [None]:
# Evaluate the out-of-fold predictions with the competition metric `score`.
# y_true is built here, as in every other scoring cell, instead of silently reusing
# whatever an earlier cell left in the kernel.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_xgb_bfhf_rmse_skf
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for XGBoost BFHF RMSE =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_xgb_bfhf_rmse_skf.pkl', 'wb') as f:
    pickle.dump(oof_xgb_bfhf_rmse_skf, f)
with open(r'gbdt-models\oof_xgb_bfhf_rmse_skf.pkl', 'rb') as f:
    oof_xgb_bfhf_rmse_skf = pickle.load(f)


In [None]:
%%time
# 10-fold stratified CV of an XGBoost regressor (default objective) on y_bfhf;
# folds are balanced on race_group x dri_score.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_xgb_bfhf_rmse_skf_dri = np.zeros(len(train))
pred_xgb_bfhf_rmse_skf_dri = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_bfhf"]
    y_valid = train.loc[test_index, "y_bfhf"]
    x_test = test[FEATURES].copy()

    # An alternative configuration is kept for reference only:
    # max_depth=3, learning_rate=0.02, min_child_weight=80 (other settings unchanged).
    model_xgb = XGBRegressor(
        n_estimators=2000,
        learning_rate=0.025,
        max_depth=4,
        min_child_weight=100,
        subsample=0.8,
        colsample_bytree=0.5,
        enable_categorical=True,
        device="cuda",
        #early_stopping_rounds=25,
    )

    # The validation fold is only monitored here: early stopping is commented out above.
    model_xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=500)

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_xgb_bfhf_rmse_skf_dri[test_index] = model_xgb.predict(x_valid)
    pred_xgb_bfhf_rmse_skf_dri += model_xgb.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_xgb_bfhf_rmse_skf_dri /= FOLDS



In [None]:
# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_xgb_bfhf_rmse_skf_dri)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for XGBoost BFHF RMSE =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_xgb_bfhf_rmse_skf_dri.pkl', 'wb') as f:
    pickle.dump(oof_xgb_bfhf_rmse_skf_dri, f)
with open(r'gbdt-models\oof_xgb_bfhf_rmse_skf_dri.pkl', 'rb') as f:
    oof_xgb_bfhf_rmse_skf_dri = pickle.load(f)



# CATBOOST WITH BFHF

In [None]:

# 10-fold CV of a regularised CatBoost RMSE regressor (Lossguide trees) on y_bfhf.
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_cat_bfhf_rmse = np.zeros(len(train))
pred_cat_bfhf_rmse = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_bfhf"]
    y_valid = train.loc[test_index, "y_bfhf"]
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        objective="RMSE",
        grow_policy='Lossguide',
        bootstrap_type='Bernoulli',
        iterations=2000,
        learning_rate=0.05,
        l2_leaf_reg=200,
        task_type="GPU",
        #early_stopping_rounds=25,
    )
    model_cat.fit(
        x_train, y_train,
        eval_set=(x_valid, y_valid),
        cat_features=CATS,
        verbose=250,
    )

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_cat_bfhf_rmse[test_index] = model_cat.predict(x_valid)
    pred_cat_bfhf_rmse += model_cat.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_cat_bfhf_rmse /= FOLDS

In [None]:
# 0.6732
# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_bfhf_rmse)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost BFHF RMSE =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_cat_bfhf_rmse.pkl', 'wb') as f:
    pickle.dump(oof_cat_bfhf_rmse, f)
with open(r'gbdt-models\oof_cat_bfhf_rmse.pkl', 'rb') as f:
    oof_cat_bfhf_rmse = pickle.load(f)



In [None]:
%%time
# 10-fold stratified CV of a CatBoost RMSE regressor (Lossguide trees) on y_bfhf;
# folds are balanced on race_group x efs. Each fold's model is also saved to disk.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_cat_bfhf_rmse_skf = np.zeros(len(train))
pred_cat_bfhf_rmse_skf = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_bfhf"]
    y_valid = train.loc[test_index, "y_bfhf"]
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        objective="RMSE",
        grow_policy='Lossguide',
        learning_rate=0.1,
        task_type="GPU",
        #early_stopping_rounds=25,
    )
    model_cat.fit(
        x_train, y_train,
        eval_set=(x_valid, y_valid),
        cat_features=CATS,
        verbose=250,
    )

    # Save this fold's model under catboost_dir (defined elsewhere in the notebook).
    model_filename = os.path.join(catboost_dir, f"cat_bfhf_rmse_skf_fold_{i+1}.cbm")
    model_cat.save_model(model_filename)
    print(f"Model for fold {i+1} saved as {model_filename}")

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_cat_bfhf_rmse_skf[test_index] = model_cat.predict(x_valid)
    pred_cat_bfhf_rmse_skf += model_cat.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_cat_bfhf_rmse_skf /= FOLDS

In [None]:
# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_bfhf_rmse_skf)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost BFHF RMSE =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_cat_bfhf_rmse_skf.pkl', 'wb') as f:
    pickle.dump(oof_cat_bfhf_rmse_skf, f)
with open(r'gbdt-models\oof_cat_bfhf_rmse_skf.pkl', 'rb') as f:
    oof_cat_bfhf_rmse_skf = pickle.load(f)


In [None]:
%%time
# 10-fold stratified CV of a CatBoost RMSE regressor (Lossguide trees) on y_bfhf;
# folds are balanced on race_group x dri_score.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_cat_bfhf_rmse_skf_dri = np.zeros(len(train))
pred_cat_bfhf_rmse_skf_dri = np.zeros(len(test))

stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_bfhf"]
    y_valid = train.loc[test_index, "y_bfhf"]
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        objective="RMSE",
        grow_policy='Lossguide',
        learning_rate=0.1,
        task_type="GPU",
        #early_stopping_rounds=25,
    )
    model_cat.fit(
        x_train, y_train,
        eval_set=(x_valid, y_valid),
        cat_features=CATS,
        verbose=250,
    )

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_cat_bfhf_rmse_skf_dri[test_index] = model_cat.predict(x_valid)
    pred_cat_bfhf_rmse_skf_dri += model_cat.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_cat_bfhf_rmse_skf_dri /= FOLDS

In [None]:
# Evaluate the out-of-fold predictions with the competition metric `score`.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_bfhf_rmse_skf_dri)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost BFHF RMSE =", m)

# Write the OOF vector to disk, then read it straight back.
with open(r'gbdt-models\oof_cat_bfhf_rmse_skf_dri.pkl', 'wb') as f:
    pickle.dump(oof_cat_bfhf_rmse_skf_dri, f)
with open(r'gbdt-models\oof_cat_bfhf_rmse_skf_dri.pkl', 'rb') as f:
    oof_cat_bfhf_rmse_skf_dri = pickle.load(f)




In [None]:
%%time
# 10-fold CV of a CatBoost RMSE regressor (Depthwise trees) on the BFHF target y_bfhf.
FOLDS = 10
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Out-of-fold predictions (one slot per train row) and the running sum of test predictions.
oof_cat_bfhf_rmse_2 = np.zeros(len(train))
pred_cat_bfhf_rmse_2 = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train)):

    print("#"*25, f"### Fold {i+1}", "#"*25, sep="\n")

    # NOTE(review): split() yields row positions while .loc selects by label;
    # this assumes train has a default RangeIndex - confirm.
    x_train = train.loc[train_index, FEATURES].copy()
    x_valid = train.loc[test_index, FEATURES].copy()
    y_train = train.loc[train_index, "y_bfhf"]
    y_valid = train.loc[test_index, "y_bfhf"]
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        objective="RMSE",
        grow_policy='Depthwise',
        learning_rate=0.1,
        task_type="GPU",
        #early_stopping_rounds=25,
    )
    model_cat.fit(
        x_train, y_train,
        eval_set=(x_valid, y_valid),
        cat_features=CATS,
        verbose=250,
    )

    # Fill this fold's out-of-fold slots, then accumulate the test predictions.
    oof_cat_bfhf_rmse_2[test_index] = model_cat.predict(x_valid)
    pred_cat_bfhf_rmse_2 += model_cat.predict(x_test)

# Turn the per-fold sum into the fold average.
pred_cat_bfhf_rmse_2 /= FOLDS

In [None]:
# Score the out-of-fold (OOF) predictions of this model with the competition
# metric, then persist the OOF vector so it can be reused without retraining.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_bfhf_rmse_2)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost BFHF RMSE =", m)

# Round-trip through pickle: write the array, then read it straight back so the
# in-memory variable is exactly what is stored on disk.
oof_cat_bfhf_rmse_2_pkl = r'gbdt-models\oof_cat_bfhf_rmse_2.pkl'
with open(oof_cat_bfhf_rmse_2_pkl, 'wb') as f:
    pickle.dump(oof_cat_bfhf_rmse_2, f)
with open(oof_cat_bfhf_rmse_2_pkl, 'rb') as f:
    oof_cat_bfhf_rmse_2 = pickle.load(f)


In [None]:
%%time
# 10-fold stratified cross-validation of a CatBoost RMSE regressor on the
# `y_bfhf` target (Depthwise growth). Each fold's model is also saved to disk.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# OOF predictions for every training row, and the running sum of the per-fold
# test predictions (divided by FOLDS after the loop).
oof_cat_bfhf_rmse_skf_2 = np.zeros(len(train))
pred_cat_bfhf_rmse_skf_2 = np.zeros(len(test))

# Stratify on the combination of race group and event indicator.
stratify_labels = train["race_group"].astype(str) + "_" + train["efs"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): `split` yields positional indices while `.loc` is label-based;
    # this assumes `train` has a default 0..n-1 index — confirm.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = train.loc[train_index,"y_bfhf"]
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = train.loc[test_index,"y_bfhf"]
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Depthwise',
        objective="RMSE",
        #early_stopping_rounds=25,
    )
    # NOTE(review): an eval_set is supplied without early stopping; CatBoost may
    # still pick the best iteration on this fold (use_best_model default), which
    # would make the OOF score below optimistic — confirm this is intended.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)
    
    # Persist the fold model; `catboost_dir` is defined elsewhere in the notebook.
    model_filename = os.path.join(catboost_dir, f"cat_bfhf_rmse_skf_2_fold_{i+1}.cbm")
    model_cat.save_model(model_filename)
    print(f"Model for fold {i+1} saved as {model_filename}")

    # Store this fold's predictions at the held-out row positions.
    oof_cat_bfhf_rmse_skf_2[test_index] = model_cat.predict(x_valid)
    # Accumulate this fold's test predictions.
    pred_cat_bfhf_rmse_skf_2 += model_cat.predict(x_test)

# Average the accumulated test predictions over all folds.
pred_cat_bfhf_rmse_skf_2 /= FOLDS


In [None]:
# Score the out-of-fold (OOF) predictions of this model with the competition
# metric, then persist the OOF vector so it can be reused without retraining.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_bfhf_rmse_skf_2)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost BFHF RMSE =", m)

# Round-trip through pickle: write the array, then read it straight back so the
# in-memory variable is exactly what is stored on disk.
oof_cat_bfhf_rmse_skf_2_pkl = r'gbdt-models\oof_cat_bfhf_rmse_skf_2.pkl'
with open(oof_cat_bfhf_rmse_skf_2_pkl, 'wb') as f:
    pickle.dump(oof_cat_bfhf_rmse_skf_2, f)
with open(oof_cat_bfhf_rmse_skf_2_pkl, 'rb') as f:
    oof_cat_bfhf_rmse_skf_2 = pickle.load(f)



In [None]:
%%time
# 10-fold stratified cross-validation of a CatBoost RMSE regressor on the
# `y_bfhf` target (Depthwise growth), stratified on race group x DRI score.
FOLDS = 10
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

# OOF predictions for every training row, and the running sum of the per-fold
# test predictions (divided by FOLDS after the loop).
oof_cat_bfhf_rmse_skf_dri_2 = np.zeros(len(train))
pred_cat_bfhf_rmse_skf_dri_2 = np.zeros(len(test))

# Stratify on the combination of race group and `dri_score`.
stratify_labels = train["race_group"].astype(str) + "_" + train["dri_score"].astype(str)

for i, (train_index, test_index) in enumerate(skf.split(train, stratify_labels)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # NOTE(review): `split` yields positional indices while `.loc` is label-based;
    # this assumes `train` has a default 0..n-1 index — confirm.
    x_train = train.loc[train_index,FEATURES].copy()
    y_train = train.loc[train_index,"y_bfhf"]
    x_valid = train.loc[test_index,FEATURES].copy()
    y_valid = train.loc[test_index,"y_bfhf"]
    x_test = test[FEATURES].copy()

    model_cat = CatBoostRegressor(
        task_type="GPU",
        learning_rate=0.1,
        grow_policy='Depthwise',
        objective="RMSE",
        #early_stopping_rounds=25,
    )
    # NOTE(review): an eval_set is supplied without early stopping; CatBoost may
    # still pick the best iteration on this fold (use_best_model default), which
    # would make the OOF score below optimistic — confirm this is intended.
    model_cat.fit(x_train,y_train,
              eval_set=(x_valid, y_valid),
              cat_features=CATS,
              verbose=250)

    # Store this fold's predictions at the held-out row positions.
    oof_cat_bfhf_rmse_skf_dri_2[test_index] = model_cat.predict(x_valid)
    # Accumulate this fold's test predictions.
    pred_cat_bfhf_rmse_skf_dri_2 += model_cat.predict(x_test)

# Average the accumulated test predictions over all folds.
pred_cat_bfhf_rmse_skf_dri_2 /= FOLDS

In [None]:
# Score the out-of-fold (OOF) predictions of this model with the competition
# metric, then persist the OOF vector so it can be reused without retraining.
y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_cat_bfhf_rmse_skf_dri_2)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for CatBoost BFHF RMSE =", m)

# Round-trip through pickle: write the array, then read it straight back so the
# in-memory variable is exactly what is stored on disk.
oof_cat_bfhf_rmse_skf_dri_2_pkl = r'gbdt-models\oof_cat_bfhf_rmse_skf_dri_2.pkl'
with open(oof_cat_bfhf_rmse_skf_dri_2_pkl, 'wb') as f:
    pickle.dump(oof_cat_bfhf_rmse_skf_dri_2, f)
with open(oof_cat_bfhf_rmse_skf_dri_2_pkl, 'rb') as f:
    oof_cat_bfhf_rmse_skf_dri_2 = pickle.load(f)


# ENSEMBLE

In [None]:
# 0.6823

# Rank-average ensemble: every member's OOF vector is converted to ranks so that
# models with different output scales contribute equally, then ranks are summed.
ensemble_oofs = [
    oof_xgb_km_rmse,
    oof_cat_km_rmse,
    oof_cat_naf,
    oof_cat_quantile_rmse,
    oof_xgb_quantile_rmse,
    oof_xgb_quantile_mae,
    oof_xgb_naf_rmse,
    oof_xgb_cox,
    oof_cat_cox,
    oof_cat_aft,
    oof_xgb_ph_rmse,
    oof_cat_quantile_rmse_2,
    oof_cat_km_rmse_2,
    oof_cat_cox_2,
]

y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = sum(rankdata(member_oof) for member_oof in ensemble_oofs)

m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for Ensemble =", m)

# NN PART

In [None]:
!pip download -q lightning pytorch-tabular


In [None]:
!pip install -q /content/pytorch_lightning-2.4.0-py3-none-any.whl
!pip install -q /content/scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -q /content/torchmetrics-1.5.2-py3-none-any.whl
!pip install -q /content/pytorch_tabnet-4.1.0-py3-none-any.whl
!pip install -q /content/einops-0.7.0-py3-none-any.whl
!pip install -q /content/pytorch_tabular-1.1.1-py2.py3-none-any.whl

In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import TensorDataset


def get_X_cat(df, cat_cols, transformers=None):
    """
    Encode the categorical columns of `df` as an integer matrix.

    df: frame holding the columns listed in `cat_cols`.
    cat_cols: categorical column names, in the order they should appear in the output.
    transformers: one fitted encoder per column (anything exposing `.transform`).
        When None, a fresh LabelEncoder is fitted on each column of `df`.

    Returns (transformers, X_cat) where X_cat has one row per sample and one
    column per entry of `cat_cols`.
    """
    if transformers is None:
        transformers = []
        for col in cat_cols:
            transformers.append(LabelEncoder().fit(df[col]))

    encoded_columns = []
    for col, transformer in zip(cat_cols, transformers):
        encoded_columns.append(transformer.transform(df[col]))

    # Columns were stacked as rows; transpose to (n_samples, n_columns).
    return transformers, np.array(encoded_columns).T


def preprocess_data(train, val):
    """
    Turn a (train, val) pair of frames into model-ready arrays and dataloaders.

    Categorical columns are label-encoded via `get_categoricals`. Numerical
    columns are mean-imputed (with missing-value indicator columns appended)
    and then standardized; both the imputer and the scaler are fitted on
    `train` only and re-applied to `val`.

    Returns (X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers).
    """
    X_cat_train, X_cat_val, numerical, transformers = get_categoricals(train, val)

    imputer = SimpleImputer(missing_values=np.nan, strategy='mean', add_indicator=True)
    scaler = StandardScaler()

    # Fit on the training fold first, then apply the fitted steps to validation.
    X_num_train = scaler.fit_transform(imputer.fit_transform(train[numerical]))
    X_num_val = scaler.transform(imputer.transform(val[numerical]))

    # Only the training loader shuffles.
    dl_train = init_dl(X_cat_train, X_num_train, train, training=True)
    dl_val = init_dl(X_cat_val, X_num_val, val)
    return X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers


def get_categoricals(train, val):
    """
    Label-encode the categorical columns of `train` and `val`.

    Categorical columns that are constant in `train` are dropped. Any value of
    `val` that never occurs in the corresponding `train` column is replaced by
    NaN before encoding.

    The masking is done on a private copy of `val`, so the caller's frame is
    left untouched. This matters when the same frame (e.g. the test set) is
    passed in for several folds: masking in place would carry one fold's NaNs
    over into the next fold.

    Returns (X_cat_train, X_cat_val, numerical, transformers), where
    `numerical` is the list of numerical column names and `transformers` holds
    one fitted encoder per retained categorical column.
    """
    categorical_cols, numerical = get_feature_types(train)

    # Never write into the caller's frame (it may be a slice or be reused later).
    val = val.copy()

    constant_cols = []
    for col in categorical_cols:
        if train[col].nunique() == 1:
            constant_cols.append(col)
        # Categories unseen in train cannot be encoded; mask them as missing.
        unseen = ~val[col].isin(train[col])
        if unseen.any():
            val.loc[unseen, col] = np.nan

    categorical_cols = [col for col in categorical_cols if col not in constant_cols]
    transformers, X_cat_train = get_X_cat(train, categorical_cols)
    _, X_cat_val = get_X_cat(val, categorical_cols, transformers)
    return X_cat_train, X_cat_val, numerical, transformers


def init_dl(X_cat, X_num, df, training=False, batch_size=2048):
    """
    Build a DataLoader over four aligned tensors: encoded categoricals (long),
    numerical features (float32), log-transformed `efs_time` (float32) and the
    `efs` event indicator (long).

    X_cat: integer-encoded categorical matrix.
    X_num: numerical feature matrix.
    df: frame providing the `efs_time` and `efs` columns, row-aligned with X_cat / X_num.
    training: when True the loader shuffles; otherwise order is preserved.
    batch_size: samples per batch (defaults to the previously hard-coded 2048).

    Returns a `torch.utils.data.DataLoader`.
    """
    dataset = TensorDataset(
        torch.tensor(X_cat, dtype=torch.long),
        torch.tensor(X_num, dtype=torch.float32),
        # The time target is modelled on the log scale.
        torch.tensor(df.efs_time.values, dtype=torch.float32).log(),
        torch.tensor(df.efs.values, dtype=torch.long),
    )
    return torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, pin_memory=True, shuffle=training
    )


def get_feature_types(train):
    """
    Split the columns of `train` into categorical and numerical names.

    A column counts as categorical when its dtype is object or when it has
    between 3 and 24 distinct values. Numerical columns are the remaining
    feature columns, i.e. everything except the identifier / target columns
    ("ID", "efs", "efs_time", "y") and the categorical ones.

    Returns (categorical_cols, numerical). Also prints the feature list.
    """
    excluded = ("ID", "efs", "efs_time", "y")

    categorical_cols = []
    for col in train.columns:
        series = train[col]
        if series.dtype == "object" or 2 < series.nunique() < 25:
            categorical_cols.append(col)

    feature_cols = [col for col in train.columns if col not in excluded]
    print(f"There are {len(feature_cols)} FEATURES: {feature_cols}")

    numerical = [col for col in feature_cols if col not in categorical_cols]
    return categorical_cols, numerical


def add_features(df):
    """
    Add engineered columns to `df` in place and return the same frame.

    * `is_cyto_score_same`: 1 where `cyto_score` equals `cyto_score_detail`, else 0.
    * `year_hct`: shifted so that the year 2000 becomes 0.
    """
    cyto_matches = df['cyto_score'].eq(df['cyto_score_detail'])
    df['is_cyto_score_same'] = cyto_matches.astype(int)

    df['year_hct'] -= 2000
    return df


def load_data(data_dir=None):
    """
    Load the competition CSVs and add the engineered features.

    data_dir: directory containing `test.csv` and `train.csv`. Defaults to the
        relative directory `data/cibmtr`. The path is built with `os.path.join`
        so it resolves on both Windows and POSIX systems (a hard-coded
        backslash path only works on Windows).

    Returns (test, train) — note the order.
    """
    import os  # local import: keeps this cell runnable on its own

    if data_dir is None:
        data_dir = os.path.join("data", "cibmtr")

    test = pd.read_csv(os.path.join(data_dir, "test.csv"))
    test = add_features(test)
    print("Test shape:", test.shape)
    train = pd.read_csv(os.path.join(data_dir, "train.csv"))
    train = add_features(train)
    print("Train shape:", train.shape)
    return test, train

In [None]:
import functools
from typing import List

import pytorch_lightning as pl
import numpy as np
import torch
from lifelines.utils import concordance_index
from pytorch_lightning.cli import ReduceLROnPlateau
from pytorch_tabular.models.common.layers import ODST
from torch import nn
from pytorch_lightning.utilities import grad_norm


class CatEmbeddings(nn.Module):
    """
    Embeds every categorical feature separately, concatenates the embeddings
    and projects the result with a small two-layer network.
    """
    def __init__(
        self,
        projection_dim: int,
        categorical_cardinality: List[int],
        embedding_dim: int
    ):
        """
        projection_dim: size of the vector produced by `forward`.
        categorical_cardinality: number of distinct categories of each categorical feature, one entry per feature.
        embedding_dim: embedding size used for every categorical feature.

        Attributes created:
        self.embeddings: one `nn.Embedding` per categorical feature.
        self.projection: Linear -> GELU -> Linear mapping the concatenated embeddings to `projection_dim`.
        """
        super().__init__()

        embedding_layers = []
        for n_categories in categorical_cardinality:
            embedding_layers.append(nn.Embedding(n_categories, embedding_dim))
        self.embeddings = nn.ModuleList(embedding_layers)

        concat_dim = embedding_dim * len(categorical_cardinality)
        self.projection = nn.Sequential(
            nn.Linear(concat_dim, projection_dim),
            nn.GELU(),
            nn.Linear(projection_dim, projection_dim),
        )

    def forward(self, x_cat):
        """
        Look up the embedding of each column of `x_cat`, concatenate them along
        the feature axis and return the projected result.
        """
        embedded = []
        for col_idx, layer in enumerate(self.embeddings):
            embedded.append(layer(x_cat[:, col_idx]))
        return self.projection(torch.cat(embedded, dim=1))


class NN(nn.Module):
    """
    Network combining projected categorical embeddings with continuous features.
    """
    def __init__(
            self,
            continuous_dim: int,
            categorical_cardinality: List[int],
            embedding_dim: int,
            projection_dim: int,
            hidden_dim: int,
            dropout: float = 0
    ):
        """
        continuous_dim: number of continuous input features.
        categorical_cardinality: number of distinct categories of each categorical feature.
        embedding_dim: embedding size for every categorical feature.
        projection_dim: size of the projected categorical representation.
        hidden_dim: output width of the ODST layer.
        dropout: dropout probability (used on the input and inside the MLP).

        Attributes created:
        self.embeddings: `CatEmbeddings` module for the categorical inputs.
        self.mlp: ODST layer followed by batch normalization and dropout.
        self.out: linear head mapping the hidden representation to one value.
        self.dropout: dropout applied to the concatenated input.

        Every `nn.Linear` in the module tree gets Xavier-normal weights and zero biases.
        """
        super().__init__()
        self.embeddings = CatEmbeddings(projection_dim, categorical_cardinality, embedding_dim)
        self.mlp = nn.Sequential(
            ODST(projection_dim + continuous_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.Dropout(dropout),
        )
        self.out = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

        # Re-initialize all linear layers (including those inside submodules).
        for module in self.modules():
            if not isinstance(module, nn.Linear):
                continue
            nn.init.xavier_normal_(module.weight)
            nn.init.zeros_(module.bias)

    def forward(self, x_cat, x_cont):
        """
        Embed the categorical inputs, concatenate them with the continuous
        inputs, apply dropout and the MLP.

        Returns (prediction, hidden) where `prediction` is the output of the
        linear head and `hidden` is the MLP output it was computed from.
        """
        features = torch.cat([self.embeddings(x_cat), x_cont], dim=1)
        hidden = self.mlp(self.dropout(features))
        return self.out(hidden), hidden


@functools.lru_cache
def combinations(N, device=None):
    """
    Return all index pairs (i, j) with 0 <= i < j < N as an (N*(N-1)/2, 2) tensor.

    N: number of items to pair up.
    device: device for the returned tensor. When None, CUDA is used if it is
        available and the CPU otherwise, so the function no longer crashes on
        machines without a GPU.

    Results are memoized per (N, device) with `functools.lru_cache`; callers
    must therefore not modify the returned tensor in place.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    index_pairs = torch.combinations(torch.arange(N), r=2)
    return index_pairs.to(device)


class LitNN(pl.LightningModule):
    """
    Lightning module wrapping `NN` and training it with a pairwise ranking
    (hinge) loss on survival data, a per-race-group dispersion penalty and an
    auxiliary regression head.
    """
    def __init__(
            self,
            continuous_dim: int,
            categorical_cardinality: List[int],
            embedding_dim: int,
            projection_dim: int,
            hidden_dim: int,
            lr: float = 1e-3,
            dropout: float = 0.2,
            weight_decay: float = 1e-3,
            aux_weight: float = 0.1,
            margin: float = 0.5,
            race_index: int = 0
    ):
        """
        continuous_dim: number of continuous input features.
        categorical_cardinality: number of distinct categories of each categorical feature.
        embedding_dim: embedding size for the categorical features.
        projection_dim: size of the projected categorical representation.
        hidden_dim: width of the hidden representation produced by `NN`.
        lr: learning rate for the optimizer.
        dropout: dropout probability.
        weight_decay: weight decay passed to the optimizer.
        aux_weight: weight of the auxiliary regression loss in the training objective.
        margin: margin of the pairwise hinge loss.
        race_index: column of `x_cat` that holds the encoded race group.
        """
        super(LitNN, self).__init__()
        self.save_hyperparameters()

        # Backbone network.
        self.model = NN(
            continuous_dim=self.hparams.continuous_dim,
            categorical_cardinality=self.hparams.categorical_cardinality,
            embedding_dim=self.hparams.embedding_dim,
            projection_dim=self.hparams.projection_dim,
            hidden_dim=self.hparams.hidden_dim,
            dropout=self.hparams.dropout
        )
        # Per-batch [y, y_hat, efs, race] collected during validation / test.
        self.targets = []

        # Auxiliary head: predicts a single value from the hidden representation.
        self.aux_cls = nn.Sequential(
            nn.Linear(self.hparams.hidden_dim, self.hparams.hidden_dim // 3),
            nn.GELU(),
            nn.Linear(self.hparams.hidden_dim // 3, 1)
        )

    def on_before_optimizer_step(self, optimizer):
        """
        Log the gradient 2-norms of the backbone before each optimizer step.
        """
        norms = grad_norm(self.model, norm_type=2)
        self.log_dict(norms)

    def forward(self, x_cat, x_cont):
        """
        Return (prediction, hidden): the 1-D prediction per sample and the
        hidden representation it was computed from.
        """
        x, emb = self.model(x_cat, x_cont)
        return x.squeeze(1), emb

    def training_step(self, batch, batch_idx):
        """
        Compute the training objective for one batch.

        A batch is (x_cat, x_cont, y, efs), where `y` is the time target and
        `efs` the event indicator. The objective is the ranking loss (including
        the race-group penalty) plus `aux_weight` times the auxiliary loss. The
        auxiliary loss is the mean squared error between the auxiliary head and
        `y`, averaged over samples with efs == 1 only.
        """
        x_cat, x_cont, y, efs = batch
        y_hat, emb = self(x_cat, x_cont)
        aux_pred = self.aux_cls(emb).squeeze(1)
        loss, race_loss = self.get_full_loss(efs, x_cat, y, y_hat)
        aux_loss = nn.functional.mse_loss(aux_pred, y, reduction='none')
        aux_mask = efs == 1
        # Guard against a batch without any event: 0/0 would yield NaN and
        # poison the gradients; with the clamp the auxiliary loss is 0 instead.
        n_events = aux_mask.sum().clamp(min=1)
        aux_loss = (aux_loss * aux_mask).sum() / n_events
        self.log("train_loss", loss, on_epoch=True, prog_bar=True, logger=True)
        self.log("race_loss", race_loss, on_epoch=True, prog_bar=True, logger=True, on_step=False)
        self.log("aux_loss", aux_loss, on_epoch=True, prog_bar=True, logger=True, on_step=False)
        return loss + aux_loss * self.hparams.aux_weight

    def get_full_loss(self, efs, x_cat, y, y_hat):
        """
        Return (loss, race_loss): the ranking loss on the whole batch plus
        0.1 times the race-group dispersion penalty, and the penalty itself.
        """
        loss = self.calc_loss(y, y_hat, efs)
        race_loss = self.get_race_losses(efs, x_cat, y, y_hat)
        loss += 0.1 * race_loss
        return loss, race_loss

    def get_race_losses(self, efs, x_cat, y, y_hat):
        """
        Compute the ranking loss separately for every race group present in the
        batch and return the (population) standard deviation of those losses.
        """
        races = torch.unique(x_cat[:, self.hparams.race_index])
        race_losses = []
        for race in races:
            ind = x_cat[:, self.hparams.race_index] == race
            race_losses.append(self.calc_loss(y[ind], y_hat[ind], efs[ind]))
        race_loss = sum(race_losses) / len(race_losses)
        races_loss_std = sum((r - race_loss)**2 for r in race_losses) / len(race_losses)
        return torch.sqrt(races_loss_std)

    def calc_loss(self, y, y_hat, efs):
        """
        Pairwise margin-based hinge loss for survival ranking.

        Steps:
        * enumerate all sample pairs with `combinations`
        * keep pairs in which at least one sample has an event (efs == 1)
        * label each pair +1 when the left time is larger, otherwise -1
        * apply a hinge loss with margin `self.hparams.margin` to the prediction difference
        * average over the pairs that `get_mask` marks as comparable

        Returns 0 (instead of NaN) when no comparable pair exists, e.g. for a
        group with a single sample.
        """
        N = y.shape[0]
        # Keep the index pairs on the same device as the data they index.
        comb = combinations(N).to(y.device)
        comb = comb[(efs[comb[:, 0]] == 1) | (efs[comb[:, 1]] == 1)]
        pred_left = y_hat[comb[:, 0]]
        pred_right = y_hat[comb[:, 1]]
        y_left = y[comb[:, 0]]
        y_right = y[comb[:, 1]]
        y = 2 * (y_left > y_right).int() - 1
        loss = nn.functional.relu(-y * (pred_left - pred_right) + self.hparams.margin)
        mask = self.get_mask(comb, efs, y_left, y_right)
        # clamp(min=1) only changes the result when there are no valid pairs.
        n_valid = mask.sum().clamp(min=1)
        loss = (loss.double() * (mask.double())).sum() / n_valid
        return loss

    def get_mask(self, comb, efs, y_left, y_right):
        """
        Return a boolean mask that is True for pairs that can be ranked.

        A pair is excluded when one sample has an event, the other is censored,
        and the censored sample's time is not later than the event time:
        * left has an event, right is censored, and y_left >= y_right
        * right has an event, left is censored, and y_right >= y_left
        """
        left_outlived = y_left >= y_right
        left_1_right_0 = (efs[comb[:, 0]] == 1) & (efs[comb[:, 1]] == 0)
        mask2 = (left_outlived & left_1_right_0)
        right_outlived = y_right >= y_left
        right_1_left_0 = (efs[comb[:, 1]] == 1) & (efs[comb[:, 0]] == 0)
        mask2 |= (right_outlived & right_1_left_0)
        mask2 = ~mask2
        mask = mask2
        return mask

    def validation_step(self, batch, batch_idx):
        """
        Compute and log the validation loss for one batch and store the batch's
        targets and predictions for the end-of-epoch concordance index.
        """
        x_cat, x_cont, y, efs = batch
        y_hat, emb = self(x_cat, x_cont)
        loss, race_loss = self.get_full_loss(efs, x_cat, y, y_hat)
        self.targets.append([y, y_hat.detach(), efs, x_cat[:, self.hparams.race_index]])
        self.log("val_loss", loss, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def on_validation_epoch_end(self):
        """
        Log the race-stratified and the global concordance index, then clear
        the stored batches.
        """
        cindex, metric = self._calc_cindex()
        self.log("cindex", metric, on_epoch=True, prog_bar=True, logger=True)
        self.log("cindex_simple", cindex, on_epoch=True, prog_bar=True, logger=True)
        self.targets.clear()

    def _calc_cindex(self):
        """
        Concatenate the stored batches and return (cindex, metric): the global
        concordance index and the race-stratified metric from `_metric`.
        """
        y = torch.cat([t[0] for t in self.targets]).cpu().numpy()
        y_hat = torch.cat([t[1] for t in self.targets]).cpu().numpy()
        efs = torch.cat([t[2] for t in self.targets]).cpu().numpy()
        races = torch.cat([t[3] for t in self.targets]).cpu().numpy()
        metric = self._metric(efs, races, y, y_hat)
        cindex = concordance_index(y, y_hat, efs)
        return cindex, metric

    def _metric(self, efs, races, y, y_hat):
        """
        Compute the concordance index per race group and return the mean of the
        per-group values minus their standard deviation.
        """
        metric_list = []
        for race in np.unique(races):
            y_ = y[races == race]
            y_hat_ = y_hat[races == race]
            efs_ = efs[races == race]
            metric_list.append(concordance_index(y_, y_hat_, efs_))
        metric = float(np.mean(metric_list) - np.sqrt(np.var(metric_list)))
        return metric

    def test_step(self, batch, batch_idx):
        """
        Same as `validation_step`, but logs the loss as `test_loss`.
        """
        x_cat, x_cont, y, efs = batch
        y_hat, emb = self(x_cat, x_cont)
        loss, race_loss = self.get_full_loss(efs, x_cat, y, y_hat)
        self.targets.append([y, y_hat.detach(), efs, x_cat[:, self.hparams.race_index]])
        self.log("test_loss", loss)
        return loss

    def on_test_epoch_end(self) -> None:
        """
        Log the race-stratified and the global concordance index for the test
        run, then clear the stored batches.
        """
        cindex, metric = self._calc_cindex()
        self.log("test_cindex", metric, on_epoch=True, prog_bar=True, logger=True)
        self.log("test_cindex_simple", cindex, on_epoch=True, prog_bar=True, logger=True)
        self.targets.clear()


    def configure_optimizers(self):
        """
        Configure the optimizer and the learning-rate schedule:
        * Optimizer: Adam with weight decay.
        * Scheduler: cosine annealing over 45 epochs down to 6e-3, stepped once per epoch.

        Note that the floor of 6e-3 is above the constructor's default `lr` of
        1e-3; the schedule is only a decay when a larger `lr` is passed in.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=self.hparams.weight_decay)
        scheduler_config = {
            "scheduler": torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer,
                T_max=45,
                eta_min=6e-3
            ),
            "interval": "epoch",
            "frequency": 1,
            "strict": False,
        }

        return {"optimizer": optimizer, "lr_scheduler": scheduler_config}

In [None]:
import json
import pytorch_lightning as pl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from pytorch_lightning.callbacks import LearningRateMonitor, TQDMProgressBar
from pytorch_lightning.callbacks import StochasticWeightAveraging
from sklearn.model_selection import StratifiedKFold

# Fix the random seed through Lightning's helper before any fold is trained.
pl.seed_everything(42)

def main(hparams):
    """
    Train the NN with 5-fold cross-validation and produce test and OOF predictions.

    hparams: dict of model hyperparameters forwarded to `train_final`, or None
        to use the defaults defined there.

    Steps:
    * load the data and set `efs` / `efs_time` of the test set to 1 (placeholders)
    * split the training data with StratifiedKFold on race_group x (age_at_hct == 0.044)
    * per fold: preprocess, train, predict the validation rows (OOF) and the test set
    * average the test predictions over the folds
    * write `submission.csv` and `oof_predictions.csv`

    The model output is negated before it is written and returned.

    Returns (-test_pred, -oof_pred).
    """
    import os  # local import: keeps this cell runnable on its own

    # 1) Load data
    test, train_original = load_data()
    test['efs_time'] = 1
    test['efs'] = 1

    # 2) Initialize arrays to store predictions
    test_pred = np.zeros(test.shape[0])  # For final test predictions
    oof_pred = np.zeros(train_original.shape[0])  # For OOF predictions

    # 3) Get feature columns
    categorical_cols, numerical = get_feature_types(train_original)

    # 4) Set up folds
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    stratify_col = train_original.race_group.astype(str) + (train_original.age_at_hct == 0.044).astype(str)

    for i, (train_index, val_index) in enumerate(kf.split(train_original, stratify_col)):
        print(f"\n=== Fold {i+1} / {kf.n_splits} ===")

        # Independent copies per fold, so preprocessing can never write into
        # `train_original` and pandas does not warn about writing to a slice.
        train_df = train_original.iloc[train_index].copy()
        val_df = train_original.iloc[val_index].copy()

        # 5) Preprocess train/val
        X_cat_val, X_num_train, X_num_val, dl_train, dl_val, transformers = preprocess_data(train_df, val_df)

        # 6) Train the model on this fold (the caller's hparams are forwarded;
        #    previously they were silently ignored).
        # NOTE(review): `categorical_cols` comes from the full training frame,
        # while `transformers` excludes columns that are constant in the fold;
        # the race column index only lines up if no such column precedes
        # "race_group" — confirm.
        model = train_final(
            X_num_train, dl_train, dl_val, transformers,
            hparams=hparams, categorical_cols=categorical_cols
        )

        # 7) Inference on validation set => store OOF
        with torch.no_grad():
            val_preds, _ = model.cuda().eval()(
                torch.tensor(X_cat_val, dtype=torch.long).cuda(),
                torch.tensor(X_num_val, dtype=torch.float32).cuda()
            )
        oof_pred[val_index] = val_preds.detach().cpu().numpy()

        # 8) Inference on test set => accumulate for final submission.
        #    The test set is preprocessed against this fold's training data; a
        #    copy is passed so one fold's masking of unseen categories cannot
        #    leak into the next fold.
        X_cat_test, _, X_num_test, _, _, _ = preprocess_data(train_df, test.copy())
        with torch.no_grad():
            fold_test_pred, _ = model.cuda().eval()(
                torch.tensor(X_cat_test, dtype=torch.long).cuda(),
                torch.tensor(X_num_test, dtype=torch.float32).cuda()
            )
        test_pred += fold_test_pred.detach().cpu().numpy()

    # 9) Average the test predictions across folds
    test_pred /= kf.n_splits

    # 10) Save final submission. The sample file is read from the same relative
    #     data directory that `load_data` uses.
    subm_data = pd.read_csv(os.path.join("data", "cibmtr", "sample_submission.csv"))
    subm_data['prediction'] = -test_pred
    subm_data.to_csv('submission.csv', index=False)
    display(subm_data.head())

    # 11) Save the OOF predictions alongside the training data
    train_original["oof_pred"] = -oof_pred
    train_original.to_csv("oof_predictions.csv", index=False)

    return -test_pred, -oof_pred  # Return them if desired



def train_final(X_num_train, dl_train, dl_val, transformers, hparams=None, categorical_cols=None):
    """
    Build a `LitNN`, fit it on `dl_train`, run a test pass on `dl_val` and
    return the model in eval mode.

    X_num_train: numerical training matrix; only its column count is used.
    dl_train: training dataloader.
    dl_val: dataloader evaluated with `trainer.test` after fitting.
    transformers: fitted encoders, one per categorical column; their
        `classes_` lengths give the embedding cardinalities.
    hparams: dict of `LitNN` keyword arguments; when None the defaults below are used.
    categorical_cols: list of categorical column names; must contain
        "race_group" (the default of None would fail on `.index`).
    """
    if hparams is None:
        hparams = {
            "embedding_dim": 16,
            "projection_dim": 112,
            "hidden_dim": 56,
            "lr": 0.06464861983337984,
            "dropout": 0.05463240181423116,
            "aux_weight": 0.26545778308743806,
            "margin": 0.2588153271003354,
            "weight_decay": 0.0002773544957610778
        }
    # NOTE(review): race_index is looked up in `categorical_cols`, while the
    # model's categorical inputs follow the order of `transformers`; the two
    # must list the same columns in the same order — confirm with the caller.
    model = LitNN(
        continuous_dim=X_num_train.shape[1],
        categorical_cardinality=[len(t.classes_) for t in transformers],
        race_index=categorical_cols.index("race_group"),
        **hparams
    )

    # NOTE(review): this callback monitors "val_loss", but `trainer.fit` below
    # receives no validation dataloader, so that metric is presumably never
    # logged during fitting — confirm whether any checkpoint is selected.
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        monitor="val_loss",
        save_top_k=1,
        mode='min'
    )
    trainer = pl.Trainer(
        accelerator='cuda',
        max_epochs=60,
        callbacks=[
            checkpoint_callback,
            LearningRateMonitor(logging_interval='epoch'),
            TQDMProgressBar(),
            StochasticWeightAveraging(swa_lrs=1e-5, swa_epoch_start=45, annealing_epochs=15)
        ],
    )
    # Fit on the training loader only; the validation fold is used for the
    # post-training test pass.
    trainer.fit(model, dl_train)
    trainer.test(model, dl_val)
    return model.eval()


# Run the full cross-validated NN training with the default hyperparameters
# (hparams=None) and keep the test and OOF predictions for the ensemble.
hparams = None
test_pred_nn, oof_pred_nn = main(hparams)
print("done")

In [None]:
# 0.6820

# Rank-average ensemble of the GBDT models plus the neural network: every
# member's OOF vector is converted to ranks so that models with different
# output scales contribute equally, then ranks are summed.
ensemble_oofs = [
    oof_xgb_km_rmse,
    oof_cat_km_rmse,
    oof_cat_naf,
    oof_cat_quantile_rmse,
    oof_xgb_quantile_rmse,
    oof_xgb_quantile_mae,
    oof_xgb_naf_rmse,
    oof_xgb_cox,
    oof_cat_cox,
    oof_cat_aft,
    oof_xgb_ph_rmse,
    oof_cat_quantile_rmse_2,
    oof_cat_km_rmse_2,
    oof_cat_cox_2,
    oof_pred_nn,
]

y_true = train[["ID", "efs", "efs_time", "race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = sum(rankdata(member_oof) for member_oof in ensemble_oofs)

m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for Ensemble =", m)

# TABM FIRST MODEL

In [None]:
import locale
def getpreferredencoding(do_setlocale = True):
    """Replacement for `locale.getpreferredencoding` that always reports UTF-8.

    The `do_setlocale` argument is accepted for signature compatibility and ignored.
    """
    return "UTF-8"
# Monkey-patch the stdlib function for the whole process.
# NOTE(review): presumably a workaround for hosted notebook environments where
# shell magics fail under a non-UTF-8 locale — confirm it is still needed.
locale.getpreferredencoding = getpreferredencoding

In [None]:
!pip download hillclimbers -q
!pip download scikit-learn==1.4.0 -q
!pip download ydf -q
!pip download rtdl_num_embeddings -q
!pip download delu -q

In [None]:
!pip install scikit-learn==1.4.0 -q --no-index --find-links=/content/scikit_learn-1.4.0-1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install rtdl_num_embeddings -q --no-index --find-links=/content/rtdl_num_embeddings-0.0.11-py3-none-any.whl
!pip install delu -q --no-index --find-links=/content/delu-0.0.26-py3-none-any.whl

In [None]:
# @title tabm_reference
# %% [code]
# License: https://github.com/yandex-research/tabm/blob/main/LICENSE

# NOTE
# The minimum required versions of the dependencies are specified in README.md.

import itertools
from typing import Any, Literal

import rtdl_num_embeddings
import torch
import torch.nn as nn
from torch import Tensor


# ======================================================================================
# Initialization
# ======================================================================================
def init_rsqrt_uniform_(x: Tensor, d: int) -> Tensor:
    """Fill `x` in place with samples from U(-1/sqrt(d), 1/sqrt(d)) and return it.

    `d` must be positive.
    """
    assert d > 0
    bound = d ** -0.5
    return nn.init.uniform_(x, a=-bound, b=bound)


@torch.inference_mode()
def init_random_signs_(x: Tensor) -> Tensor:
    """Fill `x` in place with random signs (-1 or +1, each with probability 0.5) and return it."""
    x.bernoulli_(0.5)  # 0 or 1
    x.mul_(2)          # 0 or 2
    x.add_(-1)         # -1 or +1
    return x


# ======================================================================================
# Modules
# ======================================================================================
class NLinear(nn.Module):
    """A bank of ``n`` independent linear maps, one per slice of the input.

    **Shape**

    - Input: ``(B, N, in_features)``
    - Output: ``(B, N, out_features)``

    Slice ``i`` of the input (a ``(B, in_features)`` matrix) is multiplied by the
    ``i``-th weight matrix and, when enabled, shifted by the ``i``-th bias row.

    This is a reduced form of delu.nn.NLinear
    (https://yura52.github.io/delu/stable/api/generated/delu.nn.NLinear.html):
    only 3D inputs with exactly one batch dimension are accepted, whereas
    delu.nn.NLinear supports any number of batch dimensions.
    """

    def __init__(
        self, n: int, in_features: int, out_features: int, bias: bool = True
    ) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.empty(n, in_features, out_features))
        if bias:
            self.bias = nn.Parameter(torch.empty(n, out_features))
        else:
            self.bias = None
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw weight and bias uniformly with bound ``in_features ** -0.5``."""
        fan_in = self.weight.shape[-2]
        for tensor in (self.weight, self.bias):
            if tensor is not None:
                init_rsqrt_uniform_(tensor, fan_in)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the i-th linear map to ``x[:, i]`` for every ``i``."""
        assert x.ndim == 3
        # The trailing (N, in_features) of the input must match the weight bank.
        assert x.shape[1:] == self.weight.shape[:-1]

        # (B, N, in) -> (N, B, in) so the batched matmul pairs slice i with weight i.
        out = torch.matmul(x.transpose(0, 1), self.weight).transpose(0, 1)
        if self.bias is None:
            return out
        return out + self.bias


class OneHotEncoding0d(nn.Module):
    """Concatenated one-hot encoding of integer-coded categorical features.

    Input:  (*, n_cat_features=len(cardinalities))
    Output: (*, sum(cardinalities))
    """

    def __init__(self, cardinalities: list[int]) -> None:
        super().__init__()
        self._cardinalities = cardinalities

    def forward(self, x: Tensor) -> Tensor:
        assert x.ndim >= 1
        assert x.shape[-1] == len(self._cardinalities)

        encoded = []
        for column, cardinality in enumerate(self._cardinalities):
            # Each feature is encoded with one extra class which is then dropped.
            # As a result the index `cardinality` (the slot the upstream code
            # reserves for out-of-vocabulary values) becomes an all-zeros row,
            # while indices in `range(cardinality)` get the standard one-hot row.
            # This is a quick hack rather than a principled treatment of unknown
            # categories.
            one_hot = nn.functional.one_hot(x[..., column], cardinality + 1)
            encoded.append(one_hot[..., :-1])
        return torch.cat(encoded, -1)


class ScaleEnsemble(nn.Module):
    """Element-wise scaling by a learnable ``(k, d)`` weight.

    ``init`` selects how the weight is initialized: all ones, standard normal,
    or random +/-1 signs.
    """

    def __init__(
        self,
        k: int,
        d: int,
        *,
        init: Literal['ones', 'normal', 'random-signs'],
    ) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.empty(k, d))
        self._weight_init = init
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """(Re)initialize the weight according to the ``init`` mode."""
        mode = self._weight_init
        if mode == 'ones':
            nn.init.ones_(self.weight)
            return
        if mode == 'normal':
            nn.init.normal_(self.weight)
            return
        if mode == 'random-signs':
            init_random_signs_(self.weight)
            return
        raise ValueError(f'Unknown weight_init: {self._weight_init}')

    def forward(self, x: Tensor) -> Tensor:
        """Multiply ``x`` by the weight, broadcasting over leading dimensions."""
        assert x.ndim >= 2
        return x * self.weight


class LinearEfficientEnsemble(nn.Module):
    """
    This layer is a more configurable version of the "BatchEnsemble" layer
    from the paper
    "BatchEnsemble: An Alternative Approach to Efficient Ensemble and Lifelong Learning"
    (link: https://arxiv.org/abs/2002.06715).

    First, this layer allows to select only some of the "ensembled" parts:
    - the input scaling  (r_i in the BatchEnsemble paper)
    - the output scaling (s_i in the BatchEnsemble paper)
    - the output bias    (not mentioned in the BatchEnsemble paper,
                          but is presented in public implementations)

    Second, the initialization of the scaling weights is configurable
    through the `scaling_init` argument.

    NOTE
    The term "adapter" is used in the TabM paper only to tell the story.
    The original BatchEnsemble paper does NOT use this term. So this class also
    avoids the term "adapter".
    """

    r: None | Tensor
    s: None | Tensor
    bias: None | Tensor

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        *,
        k: int,
        ensemble_scaling_in: bool,
        ensemble_scaling_out: bool,
        ensemble_bias: bool,
        scaling_init: Literal['ones', 'random-signs'],
    ):
        assert k > 0
        if ensemble_bias:
            assert bias
        super().__init__()

        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        self.register_parameter(
            'r',
            (
                nn.Parameter(torch.empty(k, in_features))
                if ensemble_scaling_in
                else None
            ),  # type: ignore[code]
        )
        self.register_parameter(
            's',
            (
                nn.Parameter(torch.empty(k, out_features))
                if ensemble_scaling_out
                else None
            ),  # type: ignore[code]
        )
        self.register_parameter(
            'bias',
            (
                nn.Parameter(torch.empty(out_features))  # type: ignore[code]
                if bias and not ensemble_bias
                else nn.Parameter(torch.empty(k, out_features))
                if ensemble_bias
                else None
            ),
        )

        self.in_features = in_features
        self.out_features = out_features
        self.k = k
        self.scaling_init = scaling_init

        self.reset_parameters()

    def reset_parameters(self):
        init_rsqrt_uniform_(self.weight, self.in_features)
        scaling_init_fn = {'ones': nn.init.ones_, 'random-signs': init_random_signs_}[
            self.scaling_init
        ]
        if self.r is not None:
            scaling_init_fn(self.r)
        if self.s is not None:
            scaling_init_fn(self.s)
        if self.bias is not None:
            bias_init = torch.empty(
                # NOTE: the shape of bias_init is (out_features,) not (k, out_features).
                # It means that all biases have the same initialization.
                # This is similar to having one shared bias plus
                # k zero-initialized non-shared biases.
                self.out_features,
                dtype=self.weight.dtype,
                device=self.weight.device,
            )
            bias_init = init_rsqrt_uniform_(bias_init, self.in_features)
            with torch.inference_mode():
                self.bias.copy_(bias_init)

    def forward(self, x: Tensor) -> Tensor:
        # x.shape == (B, K, D)
        assert x.ndim == 3

        # >>> The equation (5) from the BatchEnsemble paper (arXiv v2).
        if self.r is not None:
            x = x * self.r
        x = x @ self.weight.T
        if self.s is not None:
            x = x * self.s
        # <<<

        if self.bias is not None:
            x = x + self.bias
        return x


class MLP(nn.Module):
    def __init__(
        self,
        *,
        d_in: None | int = None,
        d_out: None | int = None,
        n_blocks: int,
        d_block: int,
        dropout: float,
        activation: str = 'ReLU',
    ) -> None:
        super().__init__()

        d_first = d_block if d_in is None else d_in
        self.blocks = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Linear(d_first if i == 0 else d_block, d_block),
                    getattr(nn, activation)(),
                    nn.Dropout(dropout),
                )
                for i in range(n_blocks)
            ]
        )
        self.output = None if d_out is None else nn.Linear(d_block, d_out)

    def forward(self, x: Tensor) -> Tensor:
        for block in self.blocks:
            x = block(x)
        if self.output is not None:
            x = self.output(x)
        return x


def make_efficient_ensemble(module: nn.Module, **kwargs) -> None:
    """Recursively swap every torch.nn.Linear in `module` for LinearEfficientEnsemble.

    The replacement is done in place; `kwargs` are forwarded to each
    LinearEfficientEnsemble. Replacement layers are freshly initialized.

    NOTE
    The paper has no experiments with networks containing normalization layers.
    Their trainable affine weights might also need "ensemblification", as in the
    "FiLM-Ensemble" paper; more experiments are needed to draw conclusions.
    """
    # Snapshot the children first because the loop re-registers some of them.
    for child_name, child in list(module.named_children()):
        if not isinstance(child, nn.Linear):
            make_efficient_ensemble(child, **kwargs)
            continue
        replacement = LinearEfficientEnsemble(
            in_features=child.in_features,
            out_features=child.out_features,
            bias=child.bias is not None,
            **kwargs,
        )
        module.add_module(child_name, replacement)


def _get_first_ensemble_layer(backbone: MLP) -> LinearEfficientEnsemble:
    """Return the first layer of the first block of an MLP backbone.

    Raises RuntimeError for any other backbone type.
    """
    if not isinstance(backbone, MLP):
        raise RuntimeError(f'Unsupported backbone: {backbone}')
    first_block = backbone.blocks[0]
    return first_block[0]


@torch.inference_mode()
def _init_first_adapter(
    weight: Tensor,
    distribution: Literal['normal', 'random-signs'],
    init_sections: list[int],
) -> None:
    """Initialize the first adapter.

    NOTE
    The `init_sections` argument is a historical artifact that accidentally leaked
    from irrelevant experiments to the final models. Perhaps, the code related
    to `init_sections` can be simply removed, but this was not tested.
    """
    assert weight.ndim == 2
    assert weight.shape[1] == sum(init_sections)

    if distribution == 'normal':
        init_fn_ = nn.init.normal_
    elif distribution == 'random-signs':
        init_fn_ = init_random_signs_
    else:
        raise ValueError(f'Unknown distribution: {distribution}')

    section_bounds = [0, *torch.tensor(init_sections).cumsum(0).tolist()]
    for i in range(len(init_sections)):
        # NOTE
        # As noted above, this section-based initialization is an arbitrary historical
        # artifact. Consider the first adapter of one ensemble member.
        # This adapter vector is implicitly split into "sections",
        # where one section corresponds to one feature. The code below ensures that
        # the adapter weights in one section are initialized with the same random value
        # from the given distribution.
        w = torch.empty((len(weight), 1), dtype=weight.dtype, device=weight.device)
        init_fn_(w)
        weight[:, section_bounds[i] : section_bounds[i + 1]] = w


# Registry of non-torch.nn module classes, keyed by class name
# (https://docs.python.org/3/library/stdtypes.html#definition.__name__).
# `make_module` falls back to this mapping when a name is not found in torch.nn.
_CUSTOM_MODULES = {
    module_cls.__name__: module_cls
    for module_cls in (
        rtdl_num_embeddings.LinearEmbeddings,
        rtdl_num_embeddings.LinearReLUEmbeddings,
        rtdl_num_embeddings.PeriodicEmbeddings,
        rtdl_num_embeddings.PiecewiseLinearEmbeddings,
        MLP,
    )
}


def make_module(type: str, *args, **kwargs) -> nn.Module:
    """Instantiate a module by class name.

    The name is looked up in ``torch.nn`` first and in ``_CUSTOM_MODULES``
    otherwise (raising KeyError when it is in neither). Remaining arguments are
    forwarded to the constructor.
    """
    builtin = getattr(nn, type, None)
    factory = _CUSTOM_MODULES[type] if builtin is None else builtin
    return factory(*args, **kwargs)


# ======================================================================================
# Optimization
# ======================================================================================
def default_zero_weight_decay_condition(
    module_name: str, module: nn.Module, parameter_name: str, parameter: nn.Parameter
):
    """Return True when a parameter should be excluded from weight decay.

    That is the case for any parameter whose name ends with 'bias' and for any
    parameter reached through a normalization layer or one of the listed
    numerical-embedding modules. `module_name` and `parameter` are unused.
    """
    from rtdl_num_embeddings import _Periodic

    del module_name, parameter
    if parameter_name.endswith('bias'):
        return True
    no_decay_types = (
        nn.BatchNorm1d,
        nn.LayerNorm,
        nn.InstanceNorm1d,
        rtdl_num_embeddings.LinearEmbeddings,
        rtdl_num_embeddings.LinearReLUEmbeddings,
        _Periodic,
    )
    return isinstance(module, no_decay_types)


def make_parameter_groups(
    module: nn.Module,
    zero_weight_decay_condition=default_zero_weight_decay_condition,
    custom_groups: None | list[dict[str, Any]] = None,
) -> list[dict[str, Any]]:
    """Split the parameters of `module` into optimizer parameter groups.

    Returns, in order: the default group, a group with ``weight_decay=0.0`` for
    parameters selected by `zero_weight_decay_condition`, then `custom_groups`
    unchanged. Parameters listed in `custom_groups` are excluded from the first
    two groups and must not appear in more than one custom group.
    """
    custom_groups = [] if custom_groups is None else custom_groups

    custom_params = frozenset(
        param for group in custom_groups for param in group['params']
    )
    n_custom_listed = sum(len(group['params']) for group in custom_groups)
    assert (
        len(custom_params) == n_custom_listed
    ), 'Parameters in custom_groups must not intersect'

    # named_parameters() recurses, so a parameter is tested once per module on
    # its ancestor chain; the set collapses the repeats.
    no_decay_hits = []
    for mn, m in module.named_modules():
        for pn, p in m.named_parameters():
            if p not in custom_params and zero_weight_decay_condition(mn, m, pn, p):
                no_decay_hits.append(p)
    zero_wd_params = frozenset(no_decay_hits)

    default_params = []
    for p in module.parameters():
        if p not in custom_params and p not in zero_wd_params:
            default_params.append(p)

    return [
        {'params': default_params},
        {'params': list(zero_wd_params), 'weight_decay': 0.0},
        *custom_groups,
    ]


# ======================================================================================
# The model
# ======================================================================================
class Model(nn.Module):
    """MLP & TabM.

    The input is built from optional numerical features (raw or passed through
    a numerical-embedding module) and optional categorical features (one-hot
    encoded), flattened and concatenated. For every `arch_type` except 'plain'
    the flat input is replicated `k` times and processed as `k` ensemble
    members; `forward` returns one prediction per member.
    """

    def __init__(
        self,
        *,
        n_num_features: int,
        cat_cardinalities: list[int],
        n_classes: None | int,
        backbone: dict,
        bins: None | list[Tensor],  # For piecewise-linear encoding/embeddings.
        num_embeddings: None | dict = None,
        arch_type: Literal[
            # Plain feed-forward network without any kind of ensembling.
            'plain',
            #
            # TabM-mini
            'tabm-mini',
            #
            # TabM-mini. The first adapter is initialized from the normal distribution.
            # This is used in Section 5.1 of the paper.
            'tabm-mini-normal',
            #
            # TabM
            'tabm',
            #
            # TabM. The first adapter is initialized from the normal distribution.
            # This variation is not used in the paper, but there is a preliminary
            # evidence that may be a better default strategy.
            'tabm-normal',
        ],
        k: None | int = None,
    ) -> None:
        """Build the feature modules, the backbone and the output layer.

        Args:
            n_num_features: number of numerical features (may be 0).
            cat_cardinalities: cardinality of each categorical feature; an empty
                list disables the categorical branch.
            n_classes: output width; None means a single output.
            backbone: keyword arguments for `make_module`, including 'type' and
                'd_block'; `d_in` is supplied here.
            bins: bins forwarded to the numerical-embedding module; only allowed
                together with `num_embeddings` of a PiecewiseLinearEmbeddings type.
            num_embeddings: keyword arguments for `make_module` describing the
                numerical embeddings (must contain 'type' and 'd_embedding'),
                or None to feed numerical features as is.
            arch_type: which architecture variant to build (see the annotation).
            k: number of ensemble members; must be None for 'plain' and a
                positive integer otherwise.
        """
        # >>> Validate arguments.
        assert n_num_features >= 0
        assert n_num_features or cat_cardinalities
        if arch_type == 'plain':
            assert k is None
        else:
            assert k is not None
            assert k > 0

        super().__init__()

        # >>> Continuous (numerical) features
        first_adapter_sections = []  # See the comment in `_init_first_adapter`.

        if n_num_features == 0:
            assert bins is None
            self.num_module = None
            d_num = 0

        elif num_embeddings is None:
            assert bins is None
            self.num_module = None
            d_num = n_num_features
            # Raw numerical features: one adapter section of width 1 per feature.
            first_adapter_sections.extend(1 for _ in range(n_num_features))

        else:
            if bins is None:
                self.num_module = make_module(
                    **num_embeddings, n_features=n_num_features
                )
            else:
                assert num_embeddings['type'].startswith('PiecewiseLinearEmbeddings')
                self.num_module = make_module(**num_embeddings, bins=bins)
            d_num = n_num_features * num_embeddings['d_embedding']
            # Embedded numerical features: one section of width d_embedding each.
            first_adapter_sections.extend(
                num_embeddings['d_embedding'] for _ in range(n_num_features)
            )

        # >>> Categorical features
        self.cat_module = (
            OneHotEncoding0d(cat_cardinalities) if cat_cardinalities else None
        )
        # One section per categorical feature, as wide as its one-hot encoding.
        first_adapter_sections.extend(cat_cardinalities)
        d_cat = sum(cat_cardinalities)

        # >>> Backbone
        d_flat = d_num + d_cat
        self.minimal_ensemble_adapter = None
        # Any backbone can be here but we provide only MLP
        self.backbone = make_module(d_in=d_flat, **backbone)

        if arch_type != 'plain':
            assert k is not None
            first_adapter_init = (
                'normal'
                if arch_type in ('tabm-mini-normal', 'tabm-normal')
                # For other arch_types, the initialization depends
                # on the presence of num_embeddings.
                else 'random-signs'
                if num_embeddings is None
                else 'normal'
            )

            if arch_type in ('tabm-mini', 'tabm-mini-normal'):
                # Minimal ensemble
                self.minimal_ensemble_adapter = ScaleEnsemble(
                    k,
                    d_flat,
                    init='random-signs' if num_embeddings is None else 'normal',
                )
                # Overwrites the ScaleEnsemble initialization with the
                # section-wise one.
                _init_first_adapter(
                    self.minimal_ensemble_adapter.weight,  # type: ignore[code]
                    first_adapter_init,
                    first_adapter_sections,
                )

            elif arch_type in ('tabm', 'tabm-normal'):
                # Like BatchEnsemble, but all multiplicative adapters,
                # except for the very first one, are initialized with ones.
                make_efficient_ensemble(
                    self.backbone,
                    k=k,
                    ensemble_scaling_in=True,
                    ensemble_scaling_out=True,
                    ensemble_bias=True,
                    scaling_init='ones',
                )
                _init_first_adapter(
                    _get_first_ensemble_layer(self.backbone).r,  # type: ignore[code]
                    first_adapter_init,
                    first_adapter_sections,
                )

            else:
                raise ValueError(f'Unknown arch_type: {arch_type}')

        # >>> Output
        d_block = backbone['d_block']
        d_out = 1 if n_classes is None else n_classes
        # Plain networks get one shared head; ensembles get one head per member.
        self.output = (
            nn.Linear(d_block, d_out)
            if arch_type == 'plain'
            else NLinear(k, d_block, d_out)  # type: ignore[code]
        )

        # >>>
        self.arch_type = arch_type
        self.k = k

    def forward(
        self, x_num: None | Tensor = None, x_cat: None | Tensor = None
    ) -> Tensor:
        """Compute predictions for a batch.

        Args:
            x_num: numerical features, or None when the model has none.
            x_cat: integer-coded categorical features; must be given exactly
                when the model was built with categorical features.

        Returns:
            A tensor of shape (B, K, D_OUT) for ensembles, or (B, 1, D_OUT) for
            plain networks.
        """
        x = []
        if x_num is not None:
            x.append(x_num if self.num_module is None else self.num_module(x_num))
        if x_cat is None:
            assert self.cat_module is None
        else:
            assert self.cat_module is not None
            x.append(self.cat_module(x_cat).float())
        # Flatten every part to (B, -1) and concatenate along the feature axis.
        x = torch.column_stack([x_.flatten(1, -1) for x_ in x])

        if self.k is not None:
            x = x[:, None].expand(-1, self.k, -1)  # (B, D) -> (B, K, D)
            if self.minimal_ensemble_adapter is not None:
                x = self.minimal_ensemble_adapter(x)
        else:
            assert self.minimal_ensemble_adapter is None

        x = self.backbone(x)
        x = self.output(x)
        if self.k is None:
            # Adjust the output shape for plain networks to make them compatible
            # with the rest of the script (loss, metrics, predictions, ...).
            # (B, D_OUT) -> (B, 1, D_OUT)
            x = x[:, None]
        return x

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import rtdl_num_embeddings

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, root_mean_squared_error, roc_auc_score, root_mean_squared_log_error, mean_squared_log_error

from IPython.display import clear_output

import warnings
warnings.filterwarnings('ignore')

import joblib
from torch.utils.data import TensorDataset, DataLoader, Dataset, ConcatDataset

import delu
import math

from collections import OrderedDict
from tqdm import tqdm

In [None]:
# Load the competition data from the local ./data/cibmtr directory.
# Forward slashes are used so the same relative path resolves on Windows, Linux
# and macOS; the previous backslash-separated paths only resolved on Windows.
train = pd.read_csv('data/cibmtr/train.csv')
test = pd.read_csv('data/cibmtr/test.csv')

In [None]:
from lifelines import KaplanMeierFitter
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Return the Kaplan-Meier survival estimate at each row's own time.

    A KaplanMeierFitter is fitted on ``df[time_col]`` / ``df[event_col]`` and
    then evaluated at ``df[time_col]``; the result is returned as an array with
    one value per row of ``df``.
    """
    durations = df[time_col]
    events = df[event_col]
    estimator = KaplanMeierFitter()
    estimator.fit(durations, events)
    return estimator.survival_function_at_times(durations).values

# Fit the Kaplan-Meier estimator once and reuse the result for both target
# columns (it was previously fitted twice on identical inputs).
km_survival = transform_survival_probability(train, time_col='efs_time', event_col='efs')

# "y": the Kaplan-Meier survival estimate at each row's own efs_time.
train["y"] = km_survival

# "label": the same estimate, shifted down by 0.2 on rows with efs == 0.
# A copy is assigned so that the in-place shift can never touch "y".
train["label"] = km_survival.copy()
train.loc[train['efs']==0, 'label'] -= 0.2

# "efs_time2": efs_time with the sign flipped on rows with efs == 0.
train["efs_time2"] = train.efs_time.copy()
train.loc[train.efs==0,"efs_time2"] *= -1

In [None]:
# Stack train and test so that feature statistics and encodings are computed
# over both frames.
combined = pd.concat([train, test], axis=0)

# Identifier and target-derived columns that must not be used as features.
RMV = ["ID","efs","efs_time", "label", "y", "efs_time2"]
FEATURES = [c for c in train.columns if not c in RMV]
print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")

# A feature with fewer than 100 distinct values across train+test is treated as
# categorical; its missing values are replaced by the sentinel 999 in both frames.
CATS = []
for c in FEATURES:
    num_unique = combined[c].nunique()
    if num_unique < 100:
        CATS.append(c)
        train[c] = train[c].fillna(999)
        test[c] = test[c].fillna(999)
print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")

# Everything else is treated as numerical.
NUMS = [c for c in FEATURES if not c in CATS]

# Rebuild the stacked frame now that the categorical NaNs are filled; the index
# is reset so that rows can be split back by position below.
combined = pd.concat([train,test],axis=0,ignore_index=True)
#print("Combined data shape:", combined.shape )

# LABEL ENCODE CATEGORICAL FEATURES
print("We LABEL ENCODE the CATEGORICAL FEATURES: ",end="")
for c in FEATURES:

    # LABEL ENCODE CATEGORICAL AND CONVERT TO INT32 CATEGORY
    if c in CATS:
        print(f"{c}, ",end="")
        combined[c],_ = combined[c].factorize()
        # Shift so that the smallest code is 0.
        # NOTE(review): presumably a no-op here because missing values were
        # already filled above -- confirm.
        combined[c] -= combined[c].min()
        combined[c] = combined[c].astype("int32")
        combined[c] = combined[c].astype("category")

    # REDUCE PRECISION OF NUMERICAL TO 32BIT TO SAVE MEMORY
    else:
        if combined[c].dtype=="float64":
            combined[c] = combined[c].astype("float32")
        if combined[c].dtype=="int64":
            combined[c] = combined[c].astype("int32")



# Number of distinct codes per categorical feature, in CATS order.
cat_unique = combined[CATS].nunique().to_list()


# Missing numerical values are replaced by 0.
for c in NUMS:
    combined[c] = combined[c].fillna(0)

# Split the stacked frame back into train and test by row position.
train = combined.iloc[:len(train)].copy()
test = combined.iloc[len(train):].reset_index(drop=True).copy()

In [None]:
# Column positions of the categorical features inside FEATURES.
cats_index = [train[FEATURES].columns.get_loc(cat) for cat in CATS]


# Standardize the numerical features: statistics are fitted on train and
# re-applied to test.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train[NUMS] = scaler.fit_transform(train[NUMS])
test[NUMS] = scaler.transform(test[NUMS])

# Single source of truth for the number of CV folds (used for the splitter and
# for the per-fold test prediction buffer below).
folds = 5
train['kfold'] = -1

target = 'label'
kf = KFold(n_splits=folds, random_state=42, shuffle=True)
groups = train['efs'].astype(str)  # not used by KFold; kept for other cells
kfold_col = train.columns.get_loc('kfold')
for fold, (train_idx, val_idx) in enumerate(kf.split(X=train)):
    # KFold yields row positions, so assign positionally rather than by label.
    train.iloc[val_idx, kfold_col] = fold

# Per-row bookkeeping frame used to score out-of-fold predictions.
oof_metric = train[['kfold','ID','efs','efs_time','label','race_group']].copy()
oof_metric['prediction'] = 0.0

# Out-of-fold predictions (one per train row) and per-fold test predictions.
oof_tabm = np.zeros(train.shape[0])
test_tabm = np.zeros((folds, test.shape[0]))

X_num = train[NUMS].values
X_cat = train[CATS].values

X_num_test = test[NUMS].values
X_cat_test = test[CATS].values

y = train[target].values


# Test loader: fixed order so predictions line up with the rows of `test`.
test_dl = DataLoader(TensorDataset(torch.tensor(X_num_test, dtype=torch.float32), torch.tensor(X_cat_test, dtype=torch.int64)), batch_size=1024, shuffle=False)

n_cont_features = len(NUMS)
n_cat_features = len(CATS)
n_classes = None  # Model then uses a single output per ensemble member
cat_cardinalities = cat_unique

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# TabM
# arch_type = 'tabm'
# bins = None

# TabM-mini with the piecewise-linear embeddings.
arch_type = 'tabm-mini'

class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: the square root of ``nn.MSELoss``."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, y_pred, y_true):
        """Return sqrt(mean((y_pred - y_true) ** 2)) as a scalar tensor."""
        mean_squared_error_value = self.mse(y_pred, y_true)
        return mean_squared_error_value.sqrt()


# Loss instance shared by the training and validation code.
loss_fn = RMSELoss()

In [None]:
import copy  # stdlib; used to snapshot the best-epoch weights

n_epochs = 100  # upper bound on the number of epochs per fold
patience = 15   # epochs without C-index improvement before stopping

val_rmse_scores = []
val_cindex_scores = []

for i, (train_index, val_index) in enumerate(kf.split(train[FEATURES])):
    best = {
        "val": -math.inf,
        "epoch": -1,
    }
    # Ground truth / prediction frames for this fold, in the format `score` expects.
    ds_true = oof_metric.loc[oof_metric.kfold==i, ["ID","efs","efs_time","race_group"]].copy().reset_index(drop=True)
    ds_pred = oof_metric.loc[oof_metric.kfold==i, ["ID"]].copy().reset_index(drop=True)

    X_num_train = X_num[train_index]
    X_cat_train = X_cat[train_index]
    y_train = y[train_index]

    X_num_val = X_num[val_index]
    X_cat_val = X_cat[val_index]
    y_val_all = y[val_index]

    train_dl = DataLoader(
        TensorDataset(
            torch.tensor(X_num_train, dtype=torch.float32),
            torch.tensor(X_cat_train, dtype=torch.int64),
            torch.tensor(y_train, dtype=torch.float32),
        ),
        batch_size=32,
        shuffle=True,
    )
    valid_dl = DataLoader(
        TensorDataset(
            torch.tensor(X_num_val, dtype=torch.float32),
            torch.tensor(X_cat_val, dtype=torch.int64),
            torch.tensor(y_val_all, dtype=torch.float32),
        ),
        batch_size=32,
        shuffle=False,
    )

    # Bins for the piecewise-linear embeddings are computed on this fold's
    # training rows only.
    bins = rtdl_num_embeddings.compute_bins(torch.tensor(X_num_train, dtype=torch.float32))

    model = Model(
        n_num_features=n_cont_features,
        cat_cardinalities=cat_cardinalities,
        n_classes=n_classes,
        backbone={
            'type': 'MLP',
            'n_blocks': 3 ,
            'd_block': 512,
            'dropout': 0.1,
        },
        bins=bins,
        num_embeddings=(
            None
            if bins is None
            else {
                'type': 'PiecewiseLinearEmbeddings',
                'd_embedding': 64,
                'activation': True,
                'version': 'B',
            }
        ),
        arch_type=arch_type,
        k=32,
    ).to(device)

    optimizer = torch.optim.AdamW(
        # Instead of model.parameters(),
        make_parameter_groups(model),
        lr=1e-4,
        weight_decay=1e-3 ,
    )

    early_stopping = delu.tools.EarlyStopping(patience, mode="max")

    for epoch in range(n_epochs):
        model.train()
        with tqdm(train_dl, total=len(train_dl), leave=True) as phar :
            # Batch tensors get their own names so that the fold-level arrays
            # (X_num_train, ...) are not overwritten.
            for x_num_b, x_cat_b, y_b in phar:
                optimizer.zero_grad()
                x_num_b, x_cat_b, y_b = x_num_b.to(device), x_cat_b.to(device), y_b.to(device)

                # (B, K, 1) -> (B, K): one prediction per ensemble member.
                output = model(x_num_b, x_cat_b).squeeze(-1)
                # Every member is trained against the same target, so each
                # target is repeated K times to match the flattened (B*K,) output.
                loss = loss_fn(output.flatten(0, 1), y_b.repeat_interleave(model.k))
                loss.backward()
                optimizer.step()

                phar.set_postfix(
                    OrderedDict(
                        epoch=f'{epoch+1}/{n_epochs}',
                        loss=f'{loss.item():.6f}'
                    )
                )
                # No manual phar.update(): iterating over the bar already
                # advances it, so an extra update would count every step twice.


        model.eval()
        valid_pred_batches = []
        with torch.no_grad():
            for x_num_b, x_cat_b, _ in valid_dl:
                output = model(x_num_b.to(device), x_cat_b.to(device)).squeeze(-1)
                # The ensemble prediction is the mean over the K members.
                valid_pred_batches.append(output.mean(1).cpu().numpy())

        valid_pred = np.concatenate(valid_pred_batches)

        ds_pred["prediction"] = valid_pred
        val_cindex = score(ds_true.copy(), ds_pred.copy(), "ID")

        if val_cindex > best["val"]:
            print("🌸 New best epoch! 🌸 with cindex: ", val_cindex)
            best = {
                "val": val_cindex,
                "epoch": epoch,
                'pred' : valid_pred,
                # Snapshot of the weights that produced these predictions;
                # deepcopy is required because state_dict() returns references
                # to the live tensors.
                'state': copy.deepcopy(model.state_dict()),
            }

        early_stopping.update(val_cindex)
        if early_stopping.should_stop():
            print("Early stopping")
            break

    oof_tabm[val_index] = best['pred']
    val_rmse = root_mean_squared_error(y_val_all, best['pred'])
    val_rmse_scores.append(val_rmse)

    ds_pred["prediction"] = best['pred']
    val_cindex = score(ds_true.copy(), ds_pred.copy(), "ID")

    val_cindex_scores.append(val_cindex)

    # predict test
    # Restore the best-epoch weights first so that the test predictions come
    # from the same model as the out-of-fold predictions (otherwise the model
    # from the last epoch, up to `patience` epochs past the best, would be used).
    model.load_state_dict(best['state'])
    model.eval()
    test_pred_batches = []
    with torch.no_grad():
        # Dedicated batch names keep the notebook-level X_num_test / X_cat_test
        # arrays intact.
        for x_num_b, x_cat_b in test_dl:
            output = model(x_num_b.to(device), x_cat_b.to(device)).squeeze(-1)
            test_pred_batches.append(output.mean(1).cpu().numpy())

    test_pred = np.concatenate(test_pred_batches)
    test_tabm[i] = test_pred

    print(" *************************************************************************************** ")
    print(f"Fold {i+1} RMSE: {val_rmse:.6f}", f"Fold {i+1} C-Index: {val_cindex:.6f}")
    print("\n")
    print(" *************************************************************************************** ")

In [None]:
# Summary of the per-fold validation metrics and the overall out-of-fold RMSE.
print("Mean Validation RMSE: {:.6f}".format(np.mean(val_rmse_scores)))
print("Mean Validation C-Index: {:.6f}".format( np.mean(val_cindex_scores)))
print("OOF RMSE: {:.6f}".format(root_mean_squared_error(train[target], oof_tabm)))

# The fold labels are derived from the number of recorded scores, so the table
# stays valid if the number of folds changes.
results_df = pd.DataFrame({
        'Fold': np.arange(1, len(val_rmse_scores) + 1),
        'Validation RMSE': val_rmse_scores,
        'Validation C-Index': val_cindex_scores
    })


print("\n=== KFold RMSE Results ===")
print(results_df)

# Overall competition metric on the out-of-fold predictions of all train rows.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_tabm
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for TabM KaplanMeier =",m)

In [None]:
# 0.6833

y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()

# Out-of-fold predictions of every ensemble member, in summation order.
ensemble_members = [
    oof_xgb_km_rmse,
    oof_cat_km_rmse,
    oof_cat_naf,
    oof_cat_quantile_rmse,
    oof_xgb_quantile_rmse,
    oof_xgb_quantile_mae,
    oof_xgb_naf_rmse,
    oof_xgb_cox,
    oof_cat_cox,
    oof_cat_aft,
    oof_xgb_ph_rmse,
    oof_cat_quantile_rmse_2,
    oof_cat_km_rmse_2,
    oof_cat_cox_2,
    oof_pred_nn,
    oof_tabm,
]
# Rank-average ensemble: each member contributes the ranks of its predictions,
# so members with different output scales are weighted equally.
y_pred["prediction"] = sum(rankdata(member) for member in ensemble_members)

m = score(y_true.copy(), y_pred.copy(), "ID")
print(f"\nOverall CV for Ensemble =",m)

In [None]:
# Average the TabM test predictions over axis 0 of test_tabm (one row per fold),
# giving a single prediction per test row.
test_mean = np.mean( test_tabm , axis=0)


# TABM PAIRWISE RANKING LOSS

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import rtdl_num_embeddings

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from scipy.stats import rankdata

from sklearn.model_selection import KFold, StratifiedGroupKFold, StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

import joblib
from torch.utils.data import TensorDataset, DataLoader, Dataset, ConcatDataset

import math
from collections import OrderedDict
from tqdm import tqdm

import functools

from pytorch_lightning.callbacks import LearningRateMonitor, TQDMProgressBar
from pytorch_lightning.callbacks import StochasticWeightAveraging

from lifelines.utils import concordance_index

from typing import List
from pytorch_lightning.cli import ReduceLROnPlateau
from torch import nn
from pytorch_lightning.utilities import grad_norm
import pytorch_lightning as pl

import os

# Fix the global random seed for this section through Lightning's helper.
# NOTE(review): per the Lightning docs this seeds Python, NumPy and torch RNGs --
# confirm for the installed version.
pl.seed_everything(42)

In [None]:
# @title tabm_reference

# License: https://github.com/yandex-research/tabm/blob/main/LICENSE

# NOTE
# The minimum required versions of the dependencies are specified in README.md.

import itertools
from typing import Any, Literal

import rtdl_num_embeddings
import torch
import torch.nn as nn
from torch import Tensor

import math


# ======================================================================================
# Initialization
# ======================================================================================
def init_rsqrt_uniform_(x: Tensor, d: int) -> Tensor:
    """Fill ``x`` in place with samples from U(-d^-0.5, d^-0.5) and return it.

    ``d`` must be strictly positive (checked with an assertion).
    """
    assert d > 0
    bound = d**-0.5
    return nn.init.uniform_(x, a=-bound, b=bound)


@torch.inference_mode()
def init_random_signs_(x: Tensor) -> Tensor:
    """Overwrite ``x`` in place with random +1 / -1 entries and return it."""
    x.bernoulli_(0.5)  # 0 or 1 with equal probability
    x.mul_(2)          # -> 0 or 2
    x.add_(-1)         # -> -1 or +1
    return x


# ======================================================================================
# Modules
# ======================================================================================
class NLinear(nn.Module):
    """A bank of N independent linear layers evaluated with one batched matmul.

    **Shape**

    - Input: ``(B, N, in_features)``
    - Output: ``(B, N, out_features)``

    Layer ``i`` only ever sees the ``(B, in_features)`` slice at position ``i``
    of the second axis.

    This is a reduced form of delu.nn.NLinear
    (https://yura52.github.io/delu/stable/api/generated/delu.nn.NLinear.html):
    it accepts only 3D inputs with a single batch dimension, whereas
    delu.nn.NLinear accepts any number of batch dimensions.
    """

    def __init__(
        self, n: int, in_features: int, out_features: int, bias: bool = True
    ) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.empty(n, in_features, out_features))
        if bias:
            self.bias = nn.Parameter(torch.empty(n, out_features))
        else:
            self.bias = None
        self.reset_parameters()

    def reset_parameters(self):
        """Re-draw weight (and bias, if present) from U(-1/sqrt(in), 1/sqrt(in))."""
        fan_in = self.weight.shape[-2]
        init_rsqrt_uniform_(self.weight, fan_in)
        if self.bias is None:
            return
        init_rsqrt_uniform_(self.bias, fan_in)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        assert x.ndim == 3
        n_trailing = self.weight.ndim - 1
        assert x.shape[-n_trailing:] == self.weight.shape[:-1]

        # (B, N, in) -> (N, B, in) so that matmul pairs slice i with weight i,
        # then swap the first two axes back.
        out = (x.transpose(0, 1) @ self.weight).transpose(0, 1)
        return out if self.bias is None else out + self.bias


class OneHotEncoding0d(nn.Module):
    """Concatenated one-hot encoding of several integer-coded categorical features."""

    # Input:  (*, n_cat_features=len(cardinalities))
    # Output: (*, sum(cardinalities))

    def __init__(self, cardinalities: list[int]) -> None:
        super().__init__()
        self._cardinalities = cardinalities

    def forward(self, x: Tensor) -> Tensor:
        assert x.ndim >= 1
        assert x.shape[-1] == len(self._cardinalities)

        encoded = []
        for feature_idx, n_categories in enumerate(self._cardinalities):
            # NOTE
            # Quick hack for out-of-vocabulary categories.
            #
            # lib.data.transform_cat encodes categorical features like this:
            # - in-vocabulary values get indices from `range(cardinality)`;
            # - every out-of-vocabulary value (a category that shows up in
            #   validation/test data but not in the training data) gets the
            #   index `cardinality`.
            #
            # Encoding with one extra class and then dropping the last column
            # therefore yields the standard one-hot vector for known categories
            # and an all-zeros vector for unknown ones. Not necessarily the best
            # treatment of unknown values, but sufficient here.
            padded = nn.functional.one_hot(x[..., feature_idx], n_categories + 1)
            encoded.append(padded[..., :-1])
        return torch.cat(encoded, -1)

class SinPositionEncoding(nn.Module):
    """Fixed (non-trainable) sinusoidal encoding of integer category codes.

    Args:
        d_embedding: list with one encoding width per categorical feature,
            ``[width_cat1, width_cat2, ...]``.
        base: frequency base of the sinusoids (10000 by default).

    **Shape**

    - Input: ``(batch_size, num_cat)``
    - Output: ``(batch_size, sum(d_embedding))``
    """

    def __init__(self, d_embedding, base=10000):
        super().__init__()
        self.d_embedding = d_embedding
        self.base = base

    def pos_single(self, pos, d_embedding):
        """Encode the codes of a single categorical feature.

        Args:
            pos: tensor of category codes; it is flattened to 1D first.
            d_embedding: width of the encoding produced for this feature.

        Returns:
            Tensor of shape ``(len(pos), d_embedding)`` living on the same
            device as ``pos``. Column ``i`` holds ``sin`` (even ``i``) or
            ``cos`` (odd ``i``) of ``pos / base ** (i / d_embedding)``.
        """
        pos = pos.flatten()  # Ensure input is a 1D array
        # Allocate on the input's device so GPU batches do not hit a
        # CPU/GPU mismatch when the columns are written.
        PE = torch.zeros((len(pos), d_embedding), device=pos.device)
        for i in range(d_embedding):
            # Honour the configured base instead of a hard-coded 10000.
            angle = pos / (self.base ** (i / d_embedding))
            PE[:, i] = torch.sin(angle) if i % 2 == 0 else torch.cos(angle)
        return PE

    def forward(self, x):
        """Encode every categorical column of ``x`` and concatenate the results.

        x : (batch_size, num_cat)
        """
        encode_data = [
            self.pos_single(x[:, i], d_embedding)
            for i, d_embedding in enumerate(self.d_embedding)
        ]
        # Concatenate the per-feature encodings into (batch_size, sum(self.d_embedding)).
        return torch.cat(encode_data, dim=1)

class TrainablePositionEncoding(nn.Module):
    """Learned embeddings for categorical features, one table per feature.

    Args:
        cardinality: list ``[card1, card2, ...]`` with the number of rows of
            each embedding table.
        d_embedding: list ``[width1, width2, ...]`` with the embedding width
            of each feature.

    **Shape**

    - Input: ``(batch_size, num_cat)`` integer codes
    - Output: ``(batch_size, sum(d_embedding))``
    """

    def __init__(self, cardinality, d_embedding):
        super().__init__()
        self.d_embedding = d_embedding
        self.cardinality = cardinality
        # One embedding table per categorical feature.
        tables = []
        for n_categories, width in zip(cardinality, d_embedding):
            tables.append(nn.Embedding(n_categories, width))
        self.embedding = nn.ModuleList(tables)

    def forward(self, x):
        n_features = len(self.d_embedding)
        # Look up column j in table j, then join along the feature axis:
        # result is (batch_size, sum(self.d_embedding)).
        return torch.cat(
            [self.embedding[j](x[:, j]) for j in range(n_features)], dim=1
        )

class ScaleEnsemble(nn.Module):
    def __init__(
        self,
        k: int,
        d: int,
        *,
        init: Literal['ones', 'normal', 'random-signs'],
    ) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.empty(k, d))
        self._weight_init = init
        self.reset_parameters()

    def reset_parameters(self) -> None:
        if self._weight_init == 'ones':
            nn.init.ones_(self.weight)
        elif self._weight_init == 'normal':
            nn.init.normal_(self.weight)
        elif self._weight_init == 'random-signs':
            init_random_signs_(self.weight)
        else:
            raise ValueError(f'Unknown weight_init: {self._weight_init}')

    def forward(self, x: Tensor) -> Tensor:
        assert x.ndim >= 2
        return x * self.weight


class LinearEfficientEnsemble(nn.Module):
    """
    This layer is a more configurable version of the "BatchEnsemble" layer
    from the paper
    "BatchEnsemble: An Alternative Approach to Efficient Ensemble and Lifelong Learning"
    (link: https://arxiv.org/abs/2002.06715).

    First, this layer allows to select only some of the "ensembled" parts:
    - the input scaling  (r_i in the BatchEnsemble paper)
    - the output scaling (s_i in the BatchEnsemble paper)
    - the output bias    (not mentioned in the BatchEnsemble paper,
                          but is presented in public implementations)

    Second, the initialization of the scaling weights is configurable
    through the `scaling_init` argument.

    A single weight matrix of shape ``(out_features, in_features)`` is shared
    by all ``k`` ensemble members; only ``r``, ``s`` and (optionally) ``bias``
    carry a leading ``k`` dimension.

    **Shape**

    - Input: ``(B, K, in_features)``
    - Output: ``(B, K, out_features)``

    NOTE
    The term "adapter" is used in the TabM paper only to tell the story.
    The original BatchEnsemble paper does NOT use this term. So this class also
    avoids the term "adapter".
    """

    # Per-member input scaling, shape (k, in_features); None when disabled.
    r: None | Tensor
    # Per-member output scaling, shape (k, out_features); None when disabled.
    s: None | Tensor
    # Shared bias (out_features,), per-member bias (k, out_features), or None.
    bias: None | Tensor

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        *,
        k: int,
        ensemble_scaling_in: bool,
        ensemble_scaling_out: bool,
        ensemble_bias: bool,
        scaling_init: Literal['ones', 'random-signs'],
    ):
        assert k > 0
        # A per-member bias only makes sense when a bias is requested at all.
        if ensemble_bias:
            assert bias
        super().__init__()

        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        # Each optional tensor is registered even when it is None, so the
        # attribute always exists and `forward` can simply test `is not None`.
        self.register_parameter(
            'r',
            (
                nn.Parameter(torch.empty(k, in_features))
                if ensemble_scaling_in
                else None
            ),  # type: ignore[code]
        )
        self.register_parameter(
            's',
            (
                nn.Parameter(torch.empty(k, out_features))
                if ensemble_scaling_out
                else None
            ),  # type: ignore[code]
        )
        # Three cases: shared bias, per-member bias, or no bias at all.
        self.register_parameter(
            'bias',
            (
                nn.Parameter(torch.empty(out_features))  # type: ignore[code]
                if bias and not ensemble_bias
                else nn.Parameter(torch.empty(k, out_features))
                if ensemble_bias
                else None
            ),
        )

        self.in_features = in_features
        self.out_features = out_features
        self.k = k
        self.scaling_init = scaling_init

        self.reset_parameters()

    def reset_parameters(self):
        """Initialise the shared weight, the scaling vectors and the bias."""
        init_rsqrt_uniform_(self.weight, self.in_features)
        # Raises KeyError for a `scaling_init` other than the two listed here.
        scaling_init_fn = {'ones': nn.init.ones_, 'random-signs': init_random_signs_}[
            self.scaling_init
        ]
        if self.r is not None:
            scaling_init_fn(self.r)
        if self.s is not None:
            scaling_init_fn(self.s)
        if self.bias is not None:
            bias_init = torch.empty(
                # NOTE: the shape of bias_init is (out_features,) not (k, out_features).
                # It means that all biases have the same initialization.
                # This is similar to having one shared bias plus
                # k zero-initialized non-shared biases.
                self.out_features,
                dtype=self.weight.dtype,
                device=self.weight.device,
            )
            bias_init = init_rsqrt_uniform_(bias_init, self.in_features)
            # copy_ broadcasts the (out_features,) vector over the k rows when
            # the bias is per-member.
            with torch.inference_mode():
                self.bias.copy_(bias_init)

    def forward(self, x: Tensor) -> Tensor:
        """Apply input scaling, the shared linear map, output scaling and bias."""
        # x.shape == (B, K, D)
        assert x.ndim == 3

        # >>> The equation (5) from the BatchEnsemble paper (arXiv v2).
        if self.r is not None:
            x = x * self.r
        x = x @ self.weight.T
        if self.s is not None:
            x = x * self.s
        # <<<

        if self.bias is not None:
            x = x + self.bias
        return x


class MLP(nn.Module):
    def __init__(
        self,
        *,
        d_in: None | int = None,
        d_out: None | int = None,
        n_blocks: None | int = None,
        d_block: list[int],
        dropout: float,
        activation: str = 'ReLU',
    ) -> None:
        super().__init__()
        if n_blocks is None:
            n_blocks = len(d_block)
        else:
            assert len(d_block) == n_blocks
        d_first = d_block[0] if d_in is None else d_in
        self.blocks = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Linear(d_first if i == 0 else d_block[i - 1], d_block[i]),
                    getattr(nn, activation)(),
                    nn.Dropout(dropout),
                )
                for i in range(n_blocks)
            ]
        )
        self.output = None if d_out is None else nn.Linear(d_block, d_out)

    def forward(self, x: Tensor) -> Tensor:

        for block in self.blocks:
            x = block(x)

        if self.output is not None:
            x = self.output(x)
        return x


def make_efficient_ensemble(module: nn.Module, **kwargs) -> None:
    """Swap every torch.nn.Linear under ``module`` for a LinearEfficientEnsemble.

    The replacement happens in place and recursively; ``kwargs`` are forwarded
    to each LinearEfficientEnsemble together with the replaced layer's
    ``in_features`` / ``out_features`` and whether it had a bias.

    NOTE
    The paper contains no experiments with networks that have normalization
    layers. Their trainable weights (the affine transformations) may also need
    "ensemblification", as in the "FiLM-Ensemble" paper; additional experiments
    are required to draw conclusions.
    """
    # Snapshot the children first: the loop mutates the module's child table.
    for child_name, child in tuple(module.named_children()):
        if not isinstance(child, nn.Linear):
            make_efficient_ensemble(child, **kwargs)
            continue
        replacement = LinearEfficientEnsemble(
            in_features=child.in_features,
            out_features=child.out_features,
            bias=child.bias is not None,
            **kwargs,
        )
        module.add_module(child_name, replacement)


def _get_first_ensemble_layer(backbone: MLP) -> LinearEfficientEnsemble:
    """Return the first layer of the first block of an MLP backbone.

    Raises:
        RuntimeError: if ``backbone`` is not an ``MLP``.
    """
    if not isinstance(backbone, MLP):
        raise RuntimeError(f'Unsupported backbone: {backbone}')
    first_block = backbone.blocks[0]
    return first_block[0]  # type: ignore[code]


@torch.inference_mode()
def _init_first_adapter(
    weight: Tensor,
    distribution: Literal['normal', 'random-signs'],
    init_sections: list[int],
) -> None:
    """Initialize the first adapter.

    NOTE
    The `init_sections` argument is a historical artifact that accidentally leaked
    from irrelevant experiments to the final models. Perhaps, the code related
    to `init_sections` can be simply removed, but this was not tested.
    """
    assert weight.ndim == 2
    assert weight.shape[1] == sum(init_sections)

    if distribution == 'normal':
        init_fn_ = nn.init.normal_
    elif distribution == 'random-signs':
        init_fn_ = init_random_signs_
    else:
        raise ValueError(f'Unknown distribution: {distribution}')

    section_bounds = [0, *torch.tensor(init_sections).cumsum(0).tolist()]
    for i in range(len(init_sections)):
        # NOTE
        # As noted above, this section-based initialization is an arbitrary historical
        # artifact. Consider the first adapter of one ensemble member.
        # This adapter vector is implicitly split into "sections",
        # where one section corresponds to one feature. The code below ensures that
        # the adapter weights in one section are initialized with the same random value
        # from the given distribution.
        w = torch.empty((len(weight), 1), dtype=weight.dtype, device=weight.device)
        init_fn_(w)
        weight[:, section_bounds[i] : section_bounds[i + 1]] = w


# Registry of modules that are not part of ``torch.nn``, keyed by class name.
# `make_module` falls back to it when a type name is not found in ``torch.nn``.
_CUSTOM_MODULES = dict(
    # https://docs.python.org/3/library/stdtypes.html#definition.__name__
    (module_cls.__name__, module_cls)
    for module_cls in (
        rtdl_num_embeddings.LinearEmbeddings,
        rtdl_num_embeddings.LinearReLUEmbeddings,
        rtdl_num_embeddings.PeriodicEmbeddings,
        rtdl_num_embeddings.PiecewiseLinearEmbeddings,
        MLP,
        SinPositionEncoding,
        TrainablePositionEncoding,
    )
)


def make_module(type: str, *args, **kwargs) -> nn.Module:
    """Instantiate a module by class name.

    ``torch.nn`` is searched first; names it does not provide are looked up in
    ``_CUSTOM_MODULES`` (KeyError if missing there too). Remaining arguments
    are forwarded to the constructor.
    """
    builtin = getattr(nn, type, None)
    factory = _CUSTOM_MODULES[type] if builtin is None else builtin
    return factory(*args, **kwargs)


# ======================================================================================
# Optimization
# ======================================================================================
def default_zero_weight_decay_condition(
    module_name: str, module: nn.Module, parameter_name: str, parameter: nn.Parameter
):
    """Decide whether a parameter should be excluded from weight decay.

    Returns True for any parameter whose name ends with ``'bias'`` and for any
    parameter reached through a normalization layer or one of the listed
    numerical-embedding modules. ``module_name`` and ``parameter`` are part of
    the callback signature but are not consulted.
    """
    from rtdl_num_embeddings import _Periodic

    del module_name, parameter
    if parameter_name.endswith('bias'):
        return True
    no_decay_modules = (
        nn.BatchNorm1d,
        nn.LayerNorm,
        nn.InstanceNorm1d,
        rtdl_num_embeddings.LinearEmbeddings,
        rtdl_num_embeddings.LinearReLUEmbeddings,
        _Periodic,
    )
    return isinstance(module, no_decay_modules)


def make_parameter_groups(
    module: nn.Module,
    zero_weight_decay_condition=default_zero_weight_decay_condition,
    custom_groups: None | list[dict[str, Any]] = None,
) -> list[dict[str, Any]]:
    """Split the parameters of ``module`` into optimizer parameter groups.

    Returns, in this order:
    1. the default group (everything not claimed below),
    2. a group with ``weight_decay=0.0`` holding every parameter for which
       ``zero_weight_decay_condition(module_name, module, param_name, param)``
       is true for at least one (module, parameter) pair,
    3. the caller-supplied ``custom_groups``, unchanged.

    Parameters listed in ``custom_groups`` never appear in groups 1 and 2, and
    must not be shared between custom groups (checked with an assertion).
    """
    groups = [] if custom_groups is None else custom_groups

    claimed = frozenset(p for group in groups for p in group['params'])
    n_listed = sum(len(group['params']) for group in groups)
    assert len(claimed) == n_listed, 'Parameters in custom_groups must not intersect'

    # Walk every (module, parameter) pair; named_parameters() recurses, so a
    # parameter is also tested against each of its ancestor modules.
    no_decay_hits = []
    for mod_name, mod in module.named_modules():
        for param_name, param in mod.named_parameters():
            if param in claimed:
                continue
            if zero_weight_decay_condition(mod_name, mod, param_name, param):
                no_decay_hits.append(param)
    no_decay = frozenset(no_decay_hits)

    regular = []
    for param in module.parameters():
        if param not in claimed and param not in no_decay:
            regular.append(param)

    result = [{'params': regular}, {'params': list(no_decay), 'weight_decay': 0.0}]
    result.extend(groups)
    return result


# ======================================================================================
# The model
# ======================================================================================
class Model(nn.Module):
    """MLP & TabM.

    The model is assembled from three parts:

    1. optional feature modules: ``num_module`` for continuous features and
       ``cat_module`` for categorical features (one-hot by default, or the
       module described by ``cat_embeddings``);
    2. a backbone built with ``make_module`` from the ``backbone`` dict,
       optionally converted into an efficient ensemble of ``k`` members;
    3. an output head: ``nn.Linear`` for ``arch_type == 'plain'`` and
       ``NLinear`` (one head per ensemble member) otherwise.

    ``forward`` returns a tensor of shape ``(B, K, d_out)``, where ``K`` is
    ``k`` for ensembles and 1 for the plain architecture.
    """

    def __init__(
        self,
        *,
        n_num_features: int,
        cat_cardinalities: list[int],
        n_classes: None | int,
        backbone: dict,
        bins: None | list[Tensor],  # For piecewise-linear encoding/embeddings.
        num_embeddings: None | dict = None,
        cat_dmodel: None | list[int] = None, # Must have exactly one entry per element of cat_cardinalities
        cat_embeddings: None | dict = None, # One-hot encoding is used by default (when None)
        arch_type: Literal[
            # Plain feed-forward network without any kind of ensembling.
            'plain',
            #
            # TabM-mini
            'tabm-mini',
            #
            # TabM-mini. The first adapter is initialized from the normal distribution.
            # This is used in Section 5.1 of the paper.
            'tabm-mini-normal',
            #
            # TabM
            'tabm',
            #
            # TabM. The first adapter is initialized from the normal distribution.
            # This variation is not used in the paper, but there is a preliminary
            # evidence that may be a better default strategy.
            'tabm-normal',
        ],
        k: None | int = None,
    ) -> None:
        # >>> Validate arguments.
        assert n_num_features >= 0
        assert n_num_features or cat_cardinalities
        if arch_type == 'plain':
            assert k is None
        else:
            assert k is not None
            assert k > 0

        super().__init__()

        # >>> Continuous (numerical) features
        first_adapter_sections = []  # See the comment in `_init_first_adapter`.

        if n_num_features == 0:
            assert bins is None
            self.num_module = None
            d_num = 0

        elif num_embeddings is None:
            # Raw numerical features are passed through unchanged:
            # one adapter section of width 1 per feature.
            assert bins is None
            self.num_module = None
            d_num = n_num_features
            first_adapter_sections.extend(1 for _ in range(n_num_features))

        else:
            if bins is None:
                self.num_module = make_module(
                    **num_embeddings, n_features=n_num_features
                )
            else:
                assert num_embeddings['type'].startswith('PiecewiseLinearEmbeddings')
                self.num_module = make_module(**num_embeddings, bins=bins)
            d_num = n_num_features * num_embeddings['d_embedding']
            first_adapter_sections.extend(
                num_embeddings['d_embedding'] for _ in range(n_num_features)
            )

        # >>> Categorical features
        if cat_embeddings is None:
            self.cat_module = (
                OneHotEncoding0d(cat_cardinalities) if cat_cardinalities else None
            )
            first_adapter_sections.extend(cat_cardinalities)
            d_cat = sum(cat_cardinalities)
        else:
            assert cat_dmodel is not None
            self.cat_module = make_module(
                **cat_embeddings
            )
            # cat_module is expected to return shape (batch_size, sum(d_embedding)),
            # so the categorical width is taken from cat_dmodel.
            first_adapter_sections.extend(cat_dmodel)
            d_cat = sum(cat_dmodel)

        # >>> Backbone
        d_flat = d_num + d_cat


        self.minimal_ensemble_adapter = None
        # Any backbone can be here but we provide only MLP
        self.backbone = make_module(d_in=d_flat, **backbone)

        if arch_type != 'plain':
            assert k is not None
            first_adapter_init = (
                'normal'
                if arch_type in ('tabm-mini-normal', 'tabm-normal')
                # For other arch_types, the initialization depends
                # on the presence of num_embeddings.
                else 'random-signs'
                if num_embeddings is None
                else 'normal'
            )

            if arch_type in ('tabm-mini', 'tabm-mini-normal'):
                # Minimal ensemble
                self.minimal_ensemble_adapter = ScaleEnsemble(
                    k,
                    d_flat,
                    init='random-signs' if num_embeddings is None else 'normal',
                )
                # Overwrite the adapter with the section-wise initialization.
                _init_first_adapter(
                    self.minimal_ensemble_adapter.weight,  # type: ignore[code]
                    first_adapter_init,
                    first_adapter_sections,
                )

            elif arch_type in ('tabm', 'tabm-normal'):
                # Like BatchEnsemble, but all multiplicative adapters,
                # except for the very first one, are initialized with ones.
                make_efficient_ensemble(
                    self.backbone,
                    k=k,
                    ensemble_scaling_in=True,
                    ensemble_scaling_out=True,
                    ensemble_bias=True,
                    scaling_init='ones',
                )
                _init_first_adapter(
                    _get_first_ensemble_layer(self.backbone).r,  # type: ignore[code]
                    first_adapter_init,
                    first_adapter_sections,
                )

            else:
                raise ValueError(f'Unknown arch_type: {arch_type}')


        # >>> Output
        d_block = backbone['d_block'][-1]
        d_out = 1 if n_classes is None else n_classes
        self.output = (
            nn.Linear(d_block, d_out)
            if arch_type == 'plain'
            else NLinear(k, d_block, d_out)  # type: ignore[code]
        )

        # >>>
        self.arch_type = arch_type
        self.k = k

    def forward(
        self, x_num: None | Tensor = None, x_cat: None | Tensor = None
    ) -> Tensor:
        """Encode the features, run the backbone and apply the output head.

        Args:
            x_num: continuous features, or None when the model has none.
            x_cat: categorical features; must be None exactly when the model
                has no ``cat_module`` (checked with assertions).

        Returns:
            Tensor of shape ``(B, K, d_out)``; ``K == 1`` for plain networks.
        """
        x = []
        if x_num is not None:
            x.append(x_num if self.num_module is None else self.num_module(x_num))
        if x_cat is None:
            assert self.cat_module is None
        else:
            assert self.cat_module is not None
            x.append(self.cat_module(x_cat).float())
        # Flatten every part to (B, -1) and join them along the feature axis.
        x = torch.column_stack([x_.flatten(1, -1) for x_ in x])

        if self.k is not None:
            x = x[:, None].expand(-1, self.k, -1)  # (B, D) -> (B, K, D)
            if self.minimal_ensemble_adapter is not None:
                x = self.minimal_ensemble_adapter(x)
        else:
            assert self.minimal_ensemble_adapter is None

        x = self.backbone(x)


        x = self.output(x)
        if self.k is None:
            # Adjust the output shape for plain networks to make them compatible
            # with the rest of the script (loss, metrics, predictions, ...).
            # (B, D_OUT) -> (B, 1, D_OUT)
            x = x[:, None]
        return x

In [None]:
# @title custom_score
def custom_score(solution, submission, row_id_column_name, prediction_label='prediction', print_info=True):
    """Stratified concordance index: mean minus std of the per-race-group C-index.

    Args:
        solution: DataFrame with the row-id column plus ``efs``, ``efs_time``
            and ``race_group``.
        submission: DataFrame with the row-id column plus the prediction
            column(s); every remaining column must be numeric.
        row_id_column_name: name of the id column, ignored for scoring.
        prediction_label: name of the prediction column in ``submission``.
        print_info: accepted for call compatibility; currently unused.

    Returns:
        ``(c_score, metric_dict)`` where ``metric_dict`` maps each race group
        to its concordance index and ``c_score`` is ``mean - std`` over them.

    Raises:
        ParticipantVisibleError: if a submission column is not numeric.
        KeyError: if the id column is missing from either frame.
    """
    # Drop the id column on copies rather than with `del`, so the caller's
    # frames are left intact and the function can be called repeatedly.
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])

    event_label = 'efs'
    interval_label = 'efs_time'

    for col in submission.columns:
        if not pd.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')
    # Align solution and submission side by side on their index, then switch
    # to a fresh positional index so group labels can be used with .iloc.
    merged_df = pd.concat([solution, submission], axis=1)
    merged_df = merged_df.reset_index()
    merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
    metric_dict = {}
    for race in sorted(merged_df_race_dict.keys()):
        # Rows belonging to this race group
        indices = sorted(merged_df_race_dict[race])
        merged_df_race = merged_df.iloc[indices]
        # Predictions are negated before being handed to concordance_index.
        c_index_race = concordance_index(
                        merged_df_race[interval_label],
                        -merged_df_race[prediction_label],
                        merged_df_race[event_label])

        metric_dict[race] = c_index_race

    race_c_index = list(metric_dict.values())
    c_score = float(np.mean(race_c_index) - np.std(race_c_index))

    return c_score, metric_dict

In [None]:
def recalculate_hla_sums(df):
    """Fill missing per-locus HLA match counts with 0 and rebuild the aggregates.

    The frame is modified in place and also returned. The seven aggregate
    columns (``hla_nmdp_6``, ``hla_low_res_*``, ``hla_high_res_*``) are
    overwritten with the sum of their per-locus components.
    """
    # Missing per-locus values are replaced by 0.
    locus_columns = (
        'hla_match_a_low',
        'hla_match_b_low',
        'hla_match_drb1_high',
        'hla_match_drb1_low',
        'hla_match_a_high',
        'hla_match_b_high',
        'hla_match_c_low',
        'hla_match_c_high',
        'hla_match_dqb1_low',
        'hla_match_dqb1_high',
    )
    for column in locus_columns:
        df[column] = df[column].fillna(0)

    # Aggregate column -> the per-locus columns that are summed into it.
    recipes = (
        ('hla_nmdp_6', ('hla_match_a_low', 'hla_match_b_low', 'hla_match_drb1_high')),
        ('hla_low_res_6', ('hla_match_a_low', 'hla_match_b_low', 'hla_match_drb1_low')),
        ('hla_high_res_6', ('hla_match_a_high', 'hla_match_b_high', 'hla_match_drb1_high')),
        ('hla_low_res_8', ('hla_match_a_low', 'hla_match_b_low', 'hla_match_c_low', 'hla_match_drb1_low')),
        ('hla_high_res_8', ('hla_match_a_high', 'hla_match_b_high', 'hla_match_c_high', 'hla_match_drb1_high')),
        ('hla_low_res_10', ('hla_match_a_low', 'hla_match_b_low', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low')),
        ('hla_high_res_10', ('hla_match_a_high', 'hla_match_b_high', 'hla_match_c_high', 'hla_match_drb1_high', 'hla_match_dqb1_high')),
    )
    for target, parts in recipes:
        first, *rest = parts
        total = df[first]
        for part in rest:
            total = total + df[part]
        df[target] = total

    return df

# Load the competition data. Forward slashes are used because they resolve on
# every platform, whereas a backslash is only a path separator on Windows.
train = pd.read_csv('data/cibmtr/train.csv')
test = pd.read_csv('data/cibmtr/test.csv')

In [None]:
# Fill missing per-locus HLA match counts and rebuild the aggregate HLA columns.
train = recalculate_hla_sums(train)
test = recalculate_hla_sums(test)

# "race_group" itself is listed in RMV below, so this copy is what ends up
# among the model features.
train['race_group_copy'] = train['race_group'].copy()
test['race_group_copy'] = test['race_group'].copy()

# Express the transplant year relative to 2000.
train['year_hct'] -= 2000
test['year_hct'] -= 2000

# 1 when cyto_score and cyto_score_detail hold equal values, else 0
# (NaN never compares equal, so rows with a missing value get 0).
train['is_cyto_score_same'] = (train['cyto_score'] == train['cyto_score_detail']).astype(int)
test['is_cyto_score_same'] = (test['cyto_score'] == test['cyto_score_detail']).astype(int)

# Columns that must never be used as model inputs (ids, targets, group label, ...).
RMV = ["ID","efs","efs_time", "y", "efs_time2", "race_group", "KM", "NA", "BFH"]

FEATURES = [c for c in train.columns if not c in RMV]
print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")

# A feature is treated as categorical when it has fewer than 100 distinct
# values in train; 'cos_year' / 'sin_year' are always kept numerical.
CATS = []
for c in FEATURES:
    if c == 'cos_year' or c == 'sin_year':
        continue
    num_unique = train[c].nunique()
    if num_unique < 100:
        CATS.append(c)

print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")

def update(df):
    """Normalise the categorical columns of ``df`` to cleaned strings.

    For every column named in the module-level ``CATS`` list, each value is
    converted with ``str``, the characters ``, [ ] { } : " \\ <`` are removed,
    and values that end up as the string ``'nan'`` (which includes originally
    missing values) are turned back into NaN. The frame is modified in place
    and returned.

    All characters are stripped in a single pass per column; this gives the
    same result as stripping them one at a time, without re-scanning every
    column once per character.
    """
    j_ch=',[]{}:"\\<'
    # Translation table that deletes every character of j_ch.
    strip_table = str.maketrans('', '', j_ch)
    for c in CATS:
        cleaned = df[c].apply(lambda x: str(x).translate(strip_table))
        df[c] = cleaned.replace('nan', np.nan)
    return df


# Clean the categorical columns of both frames (values become strings / NaN).
train = update(train)
test = update(test)

# Every feature that is not categorical is numerical.
NUMS = [c for c in FEATURES if not c in CATS]

# --- Label-encode train -------------------------------------------------------
print("We LABEL ENCODE the CATEGORICAL FEATURES: ",end="")
# Per column: original value -> integer code, reused below to encode test.
category_mappings = {}
for c in FEATURES:
    if c in CATS:
        print(f"{c}, ", end="")
        # Missing values become their own category (the value 0).
        train[c] = train[c].fillna(0)
        train[c], uniques = train[c].factorize()
        # Shift the codes so that the smallest one is 0.
        train[c] -= train[c].min()
        train[c] = train[c].astype("int32")
        train[c] = train[c].astype("category")

        if c not in ['race_group_copy']:
            train[c] = train[c].cat.add_categories(-1) # for new category
        category_mappings[c] = {cat: code for code, cat in enumerate(uniques)}
    else:
        # Downcast numerical columns to 32-bit types.
        if train[c].dtype == "float64":
            train[c] = train[c].astype("float32")
        if train[c].dtype == "int64":
            train[c] = train[c].astype("int32")

# --- Encode test with the mappings learned on train ---------------------------
for c in FEATURES:
    if c in CATS:
        print(f"{c}, ", end="")
        test[c] = test[c].fillna(0)
        # Values never seen in train have no mapping and are encoded as -1.
        # NOTE(review): these -1 codes end up in X_cat_test; confirm that the
        # categorical module of the model can handle a negative index.
        test[c] = test[c].map(category_mappings[c]).fillna(-1).astype("int32")
        test[c] = test[c].astype("category")
    else:
        if test[c].dtype == "float64":
            test[c] = test[c].astype("float32")
        if test[c].dtype == "int64":
            test[c] = test[c].astype("int32")

# Number of distinct codes observed in train for each categorical column.
cat_unique = train[CATS].nunique().to_list()

train[CATS] = train[CATS].astype("category")
test[CATS] = test[CATS].astype("category")


# Model inputs are built on copies so `train` / `test` keep the unscaled values.
train_input = train[FEATURES].copy()
test_input = test[FEATURES].copy()

# Recomputed from the input frame; missing numerical values are filled with 0.
NUMS = [c for c in train_input.columns if c not in CATS]
for c in NUMS:
    train_input[c] = train_input[c].fillna(0)
    test_input[c] = test_input[c].fillna(0)

from sklearn.preprocessing import StandardScaler

# Standardise numerical features: statistics are fitted on train only and then
# applied to test.
scaler = StandardScaler()
train_input[NUMS] = scaler.fit_transform(train_input[NUMS])
test_input[NUMS] = scaler.transform(test_input[NUMS])


# One slot per training row, initialised to zero
# (by its name, a buffer for out-of-fold predictions).
oof_tabm_pl = np.zeros(train.shape[0])

X_num = train_input[NUMS].values
X_cat = train_input[CATS].values

X_num_test = test_input[NUMS].values
X_cat_test = test_input[CATS].values

# Natural log of the event/censoring time.
y = np.log(train['efs_time'].values)


# Test batches are (categorical codes, numerical features), in file order.
test_dl = DataLoader(TensorDataset(torch.tensor(X_cat_test, dtype=torch.int64), torch.tensor(X_num_test, dtype=torch.float32)), batch_size=2048, shuffle=False)


In [None]:
# Model-size constants derived from the preprocessed feature lists.
n_cont_features = len(NUMS)  # number of numerical input features
n_cat_features = len(CATS)   # number of categorical input features
n_classes = 1                # width of the model output
cat_cardinalities = cat_unique  # per-column count of distinct train codes


@functools.lru_cache
def combinations(N, device=None):
    """Return all index pairs ``(i, j)`` with ``0 <= i < j < N``.

    Args:
        N: number of items to pair up.
        device: device for the result. When None, CUDA is used if it is
            available and the CPU otherwise, so the function also works on
            machines without a GPU.

    Returns:
        Integer tensor of shape ``(N * (N - 1) / 2, 2)``. Results are memoised
        per ``(N, device)``, so the returned tensor is shared between calls and
        must not be modified in place.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    ind = torch.arange(N)
    comb = torch.combinations(ind, r=2)
    return comb.to(device)

def get_mask(comb, efs, y_left, y_right):
    """Boolean mask of the pairs in ``comb`` that are kept.

    ``comb[:, 0]`` / ``comb[:, 1]`` index the left / right member of each pair
    in ``efs``; ``y_left`` / ``y_right`` are the per-pair values being compared.
    A pair is dropped (False) when exactly one member has ``efs == 1``, the
    other has ``efs == 0``, and the ``efs == 1`` member's value is greater than
    or equal to the other's. Every other pair is kept (True).
    """
    efs_left = efs[comb[:, 0]]
    efs_right = efs[comb[:, 1]]

    drop_left = (y_left >= y_right) & ((efs_left == 1) & (efs_right == 0))
    drop_right = (y_right >= y_left) & ((efs_right == 1) & (efs_left == 0))

    return ~(drop_left | drop_right)

In [None]:
class LitNN(pl.LightningModule):
    """Lightning module that trains ``Model`` with a pairwise ranking loss.

    The wrapped network returns several predictions per row (one per column
    after the trailing axis is squeezed). Each column, plus the mean over
    columns, is trained with a softplus pairwise loss over the pairs selected
    by ``get_mask``. Validation logs the same loss plus two concordance
    indices.

    Args:
        config: Dict of hyper-parameters. Keys read by this class are
            ``backbone``, ``num_embeddings``, ``arch_type``, ``k``,
            ``margin``, ``lr`` and ``weight_decay``.
        bins: Bins forwarded to ``Model``. When ``None`` the numeric
            embeddings config is passed as ``None`` as well.

    The constructor also reads the module-level names ``n_cont_features``,
    ``cat_cardinalities`` and ``n_classes``.
    """

    def __init__(
            self,
            config,
            bins,
    ):
        super().__init__()
        self.save_hyperparameters()
        self.config = config
        self.model = Model(
                n_num_features=n_cont_features,
                cat_cardinalities=cat_cardinalities,
                n_classes=n_classes,
                backbone=self.config['backbone'],
                bins=bins,
                num_embeddings=(
                    None
                    if bins is None
                    else self.config['num_embeddings']
                ),
                # The conditional must wrap the whole dict. Written inside the
                # braces it only selects between None and 'type' as the first
                # *key*, and the remaining entries still call len() on None.
                cat_embeddings=(
                    None
                    if cat_cardinalities is None
                    else {
                        'type': 'TrainablePositionEncoding',
                        'd_embedding' : [8] * len(cat_cardinalities),
                        'cardinality' : cat_cardinalities,
                    }
                ),
                # NOTE(review): assumes Model accepts None here when there are
                # no categorical features - confirm against Model's signature.
                cat_dmodel=(
                    None
                    if cat_cardinalities is None
                    else [8] * len(cat_cardinalities)
                ),
                arch_type=self.config['arch_type'],
                k=self.config['k'],
            )

        # Per-batch validation outputs, consumed in on_validation_epoch_end.
        self.targets = []

    def on_before_optimizer_step(self, optimizer):
        """Log the 2-norm of the gradients of every layer of ``self.model``."""
        # Compute the 2-norm for each layer
        # If using mixed precision, the gradients are already unscaled here
        norms = grad_norm(self.model, norm_type=2)
        self.log_dict(norms)

    def forward(self, x_cat, x_cont):
        """Run the network and centre its output over the batch.

        Note the argument order: this method takes ``(x_cat, x_cont)`` while
        the wrapped model is called with ``(x_cont, x_cat)``.
        """
        x = self.model(x_cont, x_cat)
        x = x.squeeze(-1)
        # Subtract the (detached) mean over the batch axis. The offset is
        # therefore computed per batch, at inference time too.
        x = (x - x.mean(dim=0, keepdim=True).detach())
        return x

    def training_step(self, batch, batch_idx):
        """Compute and log the ranking loss for one training batch."""
        x_cat, x_cont, y, efs = batch
        y_hat = self(x_cat, x_cont)

        loss = self.get_full_loss(efs, y, y_hat)

        self.log("train_loss", loss, on_epoch=True, prog_bar=True, logger=True)
        return loss


    def get_full_loss(self, efs, y, y_hat):
        """Append the mean over prediction columns as an extra column, then score all columns."""
        y_hat = torch.cat([y_hat, y_hat.mean(dim=1, keepdim=True)], dim=1)
        loss = self.calc_loss(y, y_hat, efs)
        return loss

    def calc_loss(self, y, y_hat, efs):
        """Pairwise softplus ranking loss, combined over the columns of ``y_hat``.

        Args:
            y: Target value per row of the batch.
            y_hat: 2-D predictions; each column is scored separately.
            efs: Event indicator per row (compared against 0 and 1).

        Returns:
            Scalar tensor: the sum of the per-column losses divided by
            ``len(losses) / 32``.
        """
        N = y.shape[0]
        comb = combinations(N)
        # Only pairs in which at least one side has efs == 1 are considered.
        comb = comb[(efs[comb[:, 0]] == 1) | (efs[comb[:, 1]] == 1)]
        y_left = y[comb[:, 0]]
        y_right = y[comb[:, 1]]
        # +1 when the left target is strictly larger, -1 otherwise (ties included).
        y = 2 * (y_left > y_right).int() - 1
        mask = get_mask(comb, efs, y_left, y_right)
        losses = []
        # Clamp so a batch without any usable pair yields 0 instead of 0/0 = NaN.
        mask_sum = mask.sum().clamp(min=1)
        for i in range(y_hat.shape[1]):
            pred_left = y_hat[comb[:, 0], i]
            pred_right = y_hat[comb[:, 1], i]
            loss = nn.functional.softplus(-y * (pred_left - pred_right) + self.config['margin'])
            loss = (loss.double() * (mask.double())).sum() / mask_sum
            losses.append(loss)

        # NOTE(review): `loss` is already the scalar from the last iteration,
        # so this re-appends that last term (the mean column when called via
        # get_full_loss), up to rounding. Kept unchanged to preserve the
        # objective the hyper-parameters were run with - confirm it is intended.
        loss = (loss.double() * (mask.double())).sum() / mask_sum
        losses.append(loss)

        return sum(losses) / (len(losses) / 32)

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and stash outputs for the epoch-level metrics."""
        if self.trainer.sanity_checking:
            return
        x_cat, x_cont, y, efs = batch
        y_hat = self(x_cat, x_cont)
        loss = self.get_full_loss(efs, y, y_hat)
        # The second-to-last categorical column is stored as the grouping key
        # for _metric - presumably race_group; verify against the CATS order.
        self.targets.append([y, y_hat.mean(1).detach(), efs, x_cat[:, -2]])
        self.log("val_loss", loss, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def on_validation_epoch_end(self):
        """Log both concordance metrics and reset the per-epoch buffer."""
        if self.trainer.sanity_checking:
            return
        metric, metricv2 = self._calc_cindex()
        self.log("val_cindex", metric, on_epoch=True, prog_bar=True, logger=True)
        self.log("val_cindexv2", metricv2, on_epoch=True, prog_bar=True, logger=True)
        self.targets.clear()

    def _calc_cindex(self):
        """Return ``(group-wise metric, overall concordance index)`` for the buffered epoch."""
        y = torch.cat([t[0] for t in self.targets]).cpu().numpy()
        y_hat = torch.cat([t[1] for t in self.targets]).cpu().numpy()
        efs = torch.cat([t[2] for t in self.targets]).cpu().numpy()
        races = torch.cat([t[3] for t in self.targets]).cpu().numpy()
        metric = self._metric(efs, races, y, y_hat)
        metricv2 = concordance_index(y, y_hat, efs)
        return metric, metricv2

    def _metric(self, efs, races, y, y_hat):
        """Mean minus standard deviation of the concordance index computed per group in ``races``."""
        metric_list = []
        for race in np.unique(races):
            y_ = y[races == race]
            y_hat_ = y_hat[races == race]
            efs_ = efs[races == race]
            metric_list.append(concordance_index(y_, y_hat_, efs_))
        metric = float(np.mean(metric_list) - np.sqrt(np.var(metric_list)))
        return metric


    def configure_optimizers(self):
        """AdamW over ``make_parameter_groups(self)`` with an epoch-wise cosine schedule."""
        optimizer = torch.optim.AdamW(make_parameter_groups(self), lr=self.config['lr'], weight_decay=self.config['weight_decay'])
        # NOTE(review): eta_min (6e-3) is larger than the lr set in my_config
        # (about 7.5e-4), so this schedule moves the rate up towards eta_min
        # rather than annealing it down - confirm this is intended.
        scheduler_config = {
            "scheduler": torch.optim.lr_scheduler.CosineAnnealingLR(
                optimizer,
                T_max=45,
                eta_min=6e-3
            ),
            "interval": "epoch",
            "frequency": 1,
            "strict": False,
        }

        return {"optimizer": optimizer, "lr_scheduler": scheduler_config}

In [None]:
arch_type = 'tabm-mini'

# Hyper-parameters for the Lightning TabM run. LitNN reads lr, weight_decay,
# k, num_embeddings, backbone, arch_type and margin; the CV loop reads
# batch_size, folds and n_bins.
my_config = dict(
    lr=0.0007535669042995948,
    weight_decay=0.0392,
    k=32,
    num_embeddings=dict(
        type='PiecewiseLinearEmbeddings',
        d_embedding=48,
        activation=True,
        version='A',
    ),
    backbone=dict(
        type='MLP',
        n_blocks=3,
        d_block=[304, 304, 304],
        dropout=0.35425621416312014,
        activation='GELU',
    ),
    batch_size=1024,
    arch_type=arch_type,
    folds=5,
    n_bins=34,
    aux_weight=0.26545778308743806,
    margin=0.2588153271003354,
)

# Running sum of the per-fold test predictions.
test_tabm_pl = np.zeros(len(test))

In [None]:
import os

# Device for the manual inference passes below; the Trainer places training itself.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

val_cindex_scores = []


def _predict_member_mean(net, loader):
    """Return ``net``'s predictions over ``loader``, averaged over output axis 1.

    Only the first two tensors of each batch (categorical, numeric) are fed
    to the network, so the helper works for the validation loader (which
    also carries targets) and for the test loader alike.
    """
    chunks = []
    with torch.no_grad():
        for batch in loader:
            x_cat, x_num = (t.to(device) for t in batch[:2])
            chunks.append(net(x_cat, x_num).mean(1).cpu().numpy())
    return np.concatenate(chunks)


# Seeded so the fold assignment is reproducible across kernel restarts.
kf = StratifiedKFold(n_splits=my_config['folds'], shuffle=True, random_state=42)
# Stratify on race_group combined with the flag `age_at_hct == 0.044`.
for i, (train_index, val_index) in enumerate(kf.split(train[FEATURES], train.race_group.astype(str) + (train.age_at_hct == 0.044).astype(str))):
    # NOTE: .loc with the positional indices from kf.split assumes `train`
    # carries a default 0..n-1 index.
    ds_true = train.loc[val_index, ["ID","efs","efs_time","race_group"]].copy().reset_index(drop=True)
    ds_pred = train.loc[val_index, ["ID"]].copy().reset_index(drop=True)

    X_num_train_all = torch.tensor(X_num[train_index], dtype=torch.float32)
    X_cat_train_all = torch.tensor(X_cat[train_index], dtype=torch.int64)
    y_train_all = torch.tensor(y[train_index], dtype=torch.float32)
    efs_train_all = torch.tensor(train.loc[train_index, 'efs'].values, dtype=torch.int64)

    X_num_val_all = torch.tensor(X_num[val_index], dtype=torch.float32)
    X_cat_val_all = torch.tensor(X_cat[val_index], dtype=torch.int64)
    y_val_all = torch.tensor(y[val_index], dtype=torch.float32)
    efs_val_all = torch.tensor(train.loc[val_index, 'efs'].values, dtype=torch.int64)

    train_dl = DataLoader(TensorDataset(X_cat_train_all, X_num_train_all, y_train_all, efs_train_all), batch_size=my_config['batch_size'], shuffle=True, num_workers=4)
    valid_dl = DataLoader(TensorDataset(X_cat_val_all, X_num_val_all, y_val_all, efs_val_all), batch_size=1024, shuffle=False)

    # Bins are computed from this fold's training rows only.
    bins = rtdl_num_embeddings.compute_bins(X_num_train_all, my_config['n_bins'])

    model = LitNN(config=my_config, bins=bins)
    checkpoint_callback = pl.callbacks.ModelCheckpoint(monitor="val_cindex", save_top_k=1, mode="max")
    early_stopping = pl.callbacks.EarlyStopping(monitor="val_cindex", patience=25, mode="max")
    trainer = pl.Trainer(
        accelerator='cuda',
        max_epochs=44,
        callbacks=[
            checkpoint_callback,
            early_stopping,
            LearningRateMonitor(logging_interval='epoch'),
            TQDMProgressBar(),
            StochasticWeightAveraging(swa_lrs=1e-5, swa_epoch_start=33, annealing_epochs=11)
        ],
    )
    trainer.fit(model, train_dl, valid_dl)

    # Reload the checkpoint ModelCheckpoint ranked best for *this* fold.
    # Taking the first file of a fixed directory can return a checkpoint
    # written by an earlier fold, which was trained on rows that are in the
    # current validation split.
    checkpoint_path = checkpoint_callback.best_model_path
    val_model = LitNN.load_from_checkpoint(checkpoint_path)

    # Put the model on the same device the batches are sent to.
    val_model.eval().to(device)

    valid_pred = _predict_member_mean(val_model, valid_dl)
    oof_tabm_pl[val_index] = valid_pred

    # The competition metric is fed the negated model output.
    ds_pred["prediction"] = - valid_pred
    val_cindex = score(ds_true.copy(), ds_pred.copy(), "ID")
    val_cindex_scores.append(val_cindex)

    # predict test
    # Accumulated as a sum over folds (not averaged); the ensemble cells
    # only use the rank of this array.
    test_pred = _predict_member_mean(val_model, test_dl)
    test_tabm_pl += test_pred


    print("\n")
    print(" *************************************************************************************** ")
    print("\n")
    print(f"Fold {i+1} C-Index: {val_cindex:.6f}")
    print("\n")
    print(" *************************************************************************************** ")

In [None]:
print("Mean Validation C-Index: {:.6f}".format( np.mean(val_cindex_scores)))

# Size the Fold column from the scores actually collected, so the table can
# still be built when the CV loop was interrupted before the last fold.
results_df = pd.DataFrame({
        'Fold': np.arange(1, len(val_cindex_scores) + 1),
        'Validation C-Index': val_cindex_scores,
    })


# The table holds C-index values, so the header says so.
print("\n=== KFold C-Index Results ===")
print(results_df)

# Score the full out-of-fold vector; predictions are negated as in the CV loop.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = - oof_tabm_pl
m = score(y_true.copy(), y_pred.copy(), "ID")
# custom_score is defined elsewhere in the notebook; element [1] of its
# return value is printed below as the per-race breakdown.
race_dict = custom_score(y_true.copy(), y_pred.copy(), "ID", print_info=False)[1]


print(f"\nOverall CV for TabM = {m:.6f}")
print(race_dict)

In [None]:
# 0.6820

# Out-of-fold predictions of every ensemble member, in blending order.
# oof_tabm_pl enters negated, matching how it is scored on its own.
oof_members = [
    oof_xgb_km_rmse, oof_cat_km_rmse, oof_cat_naf, oof_cat_quantile_rmse,
    oof_xgb_quantile_rmse, oof_xgb_quantile_mae, oof_xgb_naf_rmse, oof_xgb_cox, oof_cat_cox,
    oof_cat_aft, oof_xgb_ph_rmse, oof_cat_quantile_rmse_2, oof_cat_km_rmse_2, oof_cat_cox_2,
    oof_pred_nn, oof_tabm, -oof_tabm_pl,
]

y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
# Rank-sum blend: each member contributes only the rank of its predictions.
y_pred["prediction"] = sum(rankdata(member) for member in oof_members)

m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for Ensemble =", m)

In [None]:
# Test-set predictions of every ensemble member, in blending order.
# NOTE(review): test_mean is presumably the test-side counterpart of oof_tabm
# used in the OOF blend - verify.
test_members = [
    pred_xgb_km_rmse, pred_cat_km_rmse, pred_cat_naf, pred_cat_quantile_rmse,
    pred_xgb_quantile_rmse, pred_xgb_quantile_mae, pred_xgb_naf_rmse, pred_xgb_cox,
    pred_cat_cox, pred_cat_aft, pred_xgb_ph_rmse,
    pred_cat_quantile_rmse_2,
    pred_cat_km_rmse_2, pred_cat_cox_2, test_pred_nn, test_mean, -test_tabm_pl,
]

sub = pd.read_csv("/content/data/cibmtr/sample_submission.csv")
# Same rank-sum blend as for the out-of-fold predictions.
sub.prediction = sum(rankdata(member) for member in test_members)

sub.to_csv("submission.csv", index=False)
print("Sub shape:", sub.shape)
sub.head()

# CHRIS NN MLP BASELINE

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Forward slashes resolve on Windows as well as on POSIX systems, whereas a
# backslash-separated relative path only works on Windows.
test = pd.read_csv("data/cibmtr/test.csv")
print("Test shape:", test.shape )

train = pd.read_csv("data/cibmtr/train.csv")
print("Train shape:",train.shape)
train.head()

In [None]:
# Build a single regression target `y` from efs_time and efs.
train["y"] = train.efs_time.values
mx = train.loc[train.efs==1,"efs_time"].max()
mn = train.loc[train.efs==0,"efs_time"].min()
# Shift the efs==0 rows by (largest efs==1 time - smallest efs==0 time), so
# every efs==0 value ends up at or above the largest efs==1 time.
train.loc[train.efs==0,"y"] = train.loc[train.efs==0,"y"] + mx - mn
# Replace the values by their ranks, then push the efs==0 ranks a further
# 2*len(train) away to leave a gap between the two groups.
train.y = train.y.rank()
train.loc[train.efs==0,"y"] += 2*len(train)
# Scale to (0, 1], take the log, centre on the mean and flip the sign, so
# that rows with a small rank get the largest y.
train.y = train.y / train.y.max()
train.y = np.log( train.y )
train.y -= train.y.mean()
train.y *= -1.0

# Overlay the distribution of y for the two efs groups.
# NOTE: plt.hist is called without density=True, so the bars are counts even
# though the y-axis is labelled "Density".
plt.hist(train.loc[train.efs==1,"y"],bins=100,label="efs=1, Yes Event")
plt.hist(train.loc[train.efs==0,"y"],bins=100,label="efs=0, Maybe Event")
plt.xlim((-5,5))
plt.xlabel("Transformed Target y")
plt.ylabel("Density")
plt.title("Transformed Target y using both efs and efs_time.")
plt.legend()
plt.show()

In [None]:
# Identifier and target columns that must not be used as model inputs.
RMV = ["ID","efs","efs_time","y"]
FEATURES = [c for c in train.columns if c not in RMV]
print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")

# Object columns, and every other column whose name lacks "age", become
# categorical. Only the remaining "age" columns stay numeric.
CATS = []
for c in FEATURES:
    if train[c].dtype == "object":
        # Missing strings get an explicit "NAN" level.
        train[c], test[c] = train[c].fillna("NAN"), test[c].fillna("NAN")
    elif "age" not in c:
        # Non-object columns are stringified so they can be label encoded.
        train[c], test[c] = train[c].astype("str"), test[c].astype("str")
    else:
        continue
    CATS.append(c)
print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")

In [None]:
# Per-categorical-feature vocabulary size and embedding width (read by
# build_model), plus the list of numeric feature names.
CAT_SIZE = []
CAT_EMB = []
NUMS = []

# Train and test are stacked so that encoding and scaling statistics are
# computed over both together.
combined = pd.concat([train,test],axis=0,ignore_index=True)
#print("Combined data shape:", combined.shape )

print("We LABEL ENCODE the CATEGORICAL FEATURES: ")

for c in FEATURES:
    if c in CATS:
        # LABEL ENCODE
        # factorize() maps each distinct value to an integer code; the codes
        # are then shifted so the smallest one is 0.
        combined[c],_ = combined[c].factorize()
        combined[c] -= combined[c].min()
        combined[c] = combined[c].astype("int32")
        #combined[c] = combined[c].astype("category")

        n = combined[c].nunique()
        mn = combined[c].min()
        mx = combined[c].max()
        print(f'{c} has ({n}) unique values')

        # Embedding table size is max code + 1; embedding width is the
        # rounded-up square root of that size.
        CAT_SIZE.append(mx+1)
        CAT_EMB.append( int(np.ceil( np.sqrt(mx+1))) )
    else:
        # Downcast 64-bit numerics to 32-bit.
        if combined[c].dtype=="float64":
            combined[c] = combined[c].astype("float32")
        if combined[c].dtype=="int64":
            combined[c] = combined[c].astype("int32")

        # Standardise with the combined mean/std, then set missing values to
        # 0 (i.e. the mean after standardisation).
        m = combined[c].mean()
        s = combined[c].std()
        combined[c] = (combined[c]-m)/s
        combined[c] = combined[c].fillna(0)

        NUMS.append(c)

# Split back into train and test. len(train) is unchanged by the first
# assignment, so the second slice starts at the right row.
train = combined.iloc[:len(train)].copy()
test = combined.iloc[len(train):].reset_index(drop=True).copy()

In [None]:
import tensorflow as tf
# NOTE: this rebinds the module-level name `Model`, which the LitNN cell
# earlier in this notebook also calls. Re-running that cell after this
# import would construct the Keras class instead.
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Input, Embedding
from tensorflow.keras.layers import Concatenate, BatchNormalization
import tensorflow.keras.backend as K
from sklearn.model_selection import KFold

print('TF Version',tf.__version__)

In [None]:
EPOCHS = 4
# One learning rate per epoch: two epochs at 0.01, then 0.001, then 0.0001.
LRS = [0.01, 0.01, 0.001, 0.0001]


def lrfn(epoch):
    """Return the learning rate stored in ``LRS`` for the given epoch index."""
    return LRS[epoch]


# Plot the schedule so it can be checked visually before training.
rng = list(range(EPOCHS))
lr_y = list(map(lrfn, rng))
plt.figure(figsize=(10, 4))
plt.plot(rng, lr_y, '-o')
print("Learning rate schedule: {:.3g} to {:.3g} to {:.3g}".format(lr_y[0], max(lr_y), lr_y[-1]))
plt.xlabel("Epoch")
plt.ylabel("Learning Rate")
plt.title("Learning Rate Schedule")
plt.show()

# Keras callback that applies lrfn during model.fit.
lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=False)

In [None]:
def build_model():
    """Build the Keras MLP: one embedding per categorical feature plus raw numeric inputs.

    Reads the module-level ``CATS``, ``NUMS``, ``CAT_SIZE`` and ``CAT_EMB``.

    Returns:
        An uncompiled ``Model`` taking ``[categorical, numeric]`` inputs and
        producing a single linear output.
    """
    # CATEGORICAL FEATURES: column j is looked up in its own embedding table.
    cat_inputs = Input(shape=(len(CATS),))
    embedded = []
    for j, _ in enumerate(CATS):
        table = tf.keras.layers.Embedding(CAT_SIZE[j], CAT_EMB[j])
        looked_up = table(cat_inputs[:, j])
        embedded.append(tf.keras.layers.Flatten()(looked_up))

    # NUMERICAL FEATURES
    num_inputs = Input(shape=(len(NUMS),))

    # COMBINE: concatenate everything, two hidden ReLU layers, linear head.
    hidden = tf.keras.layers.Concatenate(axis=-1)(embedded + [num_inputs])
    for _ in range(2):
        hidden = Dense(256, activation='relu')(hidden)
    output = Dense(1, activation='linear')(hidden)

    return Model(inputs=[cat_inputs, num_inputs], outputs=output)

In [None]:
%%time

# Repeated K-fold training of the Keras MLP on the transformed target `y`.
REPEATS = 3
FOLDS = 5
# NOTE: random_state is fixed, so presumably every repeat sees the same fold
# split and the repeats differ only in model initialisation and training
# randomness - confirm.
kf = KFold(n_splits=FOLDS, random_state=42, shuffle=True)

# Accumulators: out-of-fold predictions per training row and summed test predictions.
oof_nn = np.zeros( len(train) )
pred_nn = np.zeros( len(test) )

#directory = "checkpoints"
#if not os.path.exists(directory):
#    os.makedirs(directory)

for r in range(REPEATS):
    # Only the first repeat prints per-fold banners and Keras progress.
    VERBOSE = r==0
    print("#"*25)
    print(f"### REPEAT {r+1} ###")
    print("#"*25)

    for i, (train_index, test_index) in enumerate(kf.split(train)):

        X_train_cats = train.loc[train_index,CATS].values
        X_train_nums = train.loc[train_index,NUMS].values
        y_train = train.loc[train_index,"y"].values
        # y_train2 / y_valid2 (the raw efs flag) are not used further in this cell.
        y_train2 = train.loc[train_index,"efs"].values

        X_valid_cats = train.loc[test_index,CATS].values
        X_valid_nums = train.loc[test_index,NUMS].values
        y_valid = train.loc[test_index,"y"].values
        y_valid2 = train.loc[test_index,"efs"].values

        X_test_cats = test[CATS].values
        X_test_nums = test[NUMS].values

        if VERBOSE:
            print(" ","#"*25)
            print(" ",f"### Fold {i+1} ###")
            print(" ","#"*25)

        # TRAIN MODEL
        # A fresh model is built for every fold after clearing the Keras session.
        K.clear_session()
        model = build_model()
        model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                      loss="mean_squared_error",
                     )
        v = 2 if VERBOSE else 0
        model.fit([X_train_cats,X_train_nums], [y_train],
                  validation_data = ([X_valid_cats,X_valid_nums], [y_valid]),
                  callbacks = [lr_callback],
                  batch_size=512, epochs=EPOCHS, verbose=v)
        #model.save_weights(f'{directory}/NN_f{i}_r{r}.weights.h5')

        # INFER OOF
        oof_nn[test_index] += model.predict([X_valid_cats,X_valid_nums], verbose=v, batch_size=512).flatten()
        # INFER TEST
        pred_nn += model.predict([X_test_cats,X_test_nums], verbose=v, batch_size=512).flatten()

# Each training row is predicted once per repeat; the test set is predicted
# once per fold of every repeat.
oof_nn /= REPEATS
pred_nn /= (FOLDS*REPEATS)

In [None]:
# Score the averaged out-of-fold NN predictions with the competition metric.
y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].assign(prediction=oof_nn)
m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for NN =", m)

In [None]:
# 0.6820

# Out-of-fold predictions of every ensemble member, in blending order;
# the Keras MLP (oof_nn) is the last one.
oof_members = [
    oof_xgb_km_rmse, oof_cat_km_rmse, oof_cat_naf, oof_cat_quantile_rmse,
    oof_xgb_quantile_rmse, oof_xgb_quantile_mae, oof_xgb_naf_rmse, oof_xgb_cox, oof_cat_cox,
    oof_cat_aft, oof_xgb_ph_rmse, oof_cat_quantile_rmse_2, oof_cat_km_rmse_2, oof_cat_cox_2,
    oof_pred_nn, oof_nn,
]

y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
# Rank-sum blend: each member contributes only the rank of its predictions.
y_pred["prediction"] = sum(rankdata(member) for member in oof_members)

m = score(y_true.copy(), y_pred.copy(), "ID")
print("\nOverall CV for Ensemble =", m)