# Setup and Load Data

In [1]:
%%capture

# Installations
!pip install /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

In [2]:
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt

from lifelines import KaplanMeierFitter
from lifelines.utils import concordance_index

from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

def wgt_gradient(t, cap, g):
    return 1 if t<=cap else math.exp(-g * (t - cap))

# def get_target(df, fitter, time_col='efs_time', event_col='efs', **kwargs):
#     ''' create target val '''
#     fitter.fit(df[time_col], df[event_col], **kwargs)
#     return fitter.survival_function_at_times(df[time_col]).values

    numerical_df = df[numerical_variables]
    categorical_df = df[categorical_variables]

    # not my fav...
    if fit:
        tmp_tf_categorical_df = pd.DataFrame(encoder.fit_transform(categorical_df), columns=categorical_df.columns)
    else:
        tmp_tf_categorical_df = pd.DataFrame(encoder.transform(categorical_df), columns=categorical_df.columns)

def mode_fn(x):
    ''' function to get mode '''
    return x.mode().iloc[0] if not x.mode().empty else None
    
def get_simple_imputations(df:pd.DataFrame, categorical_variables:list[str], numerical_variables:list[str]) -> pd.DataFrame:
    '''
    perform basic imputations on dataset 
    '''
    feng_df = df.copy(deep=True)
    eng_cols = []
    ## Categorical transformations
        # imputation flll as mode of col grouped by race_group
    for categorical_variable in categorical_variables:
        eng_col = f'{categorical_variable}_cModeGrpFill'
        feng_df[eng_col] = feng_df[categorical_variable].fillna(feng_df.groupby('race_group')[categorical_variable].transform(mode_fn))
        eng_cols.append(eng_col)

    ## Numerical transformations
        # imputation fill as mode of col grouped by race_group
    for numerical_variable in numerical_variables:
        eng_col = f'{numerical_variable}_nModeGrpFill'
        feng_df[eng_col] = feng_df[numerical_variable].fillna(feng_df.groupby('race_group')[numerical_variable].transform(mode_fn))
        eng_cols.append(eng_col)

    return feng_df[eng_cols]


def get_model_imputation(
    df:pd.DataFrame, 
    categorical_variables:list[str], 
    numerical_variables:list[str], 
    encoder, 
    imputer,
    fit:bool=False
) -> pd.DataFrame:

    numerical_df = df[numerical_variables]
    categorical_df = df[categorical_variables]

    # not my fav...
    if fit:
        tmp_tf_categorical_df = pd.DataFrame(encoder.fit_transform(categorical_df), columns=categorical_df.columns)
    else:
        tmp_tf_categorical_df = pd.DataFrame(encoder.transform(categorical_df), columns=categorical_df.columns)
    
    data = pd.concat([numerical_df, tmp_tf_categorical_df], axis=1)
    
    # again, not my fav........
    if fit:
        tf_data = imputer.fit_transform(data)
    else:
        tf_data = imputer.transform(data)

    tf_data_df = pd.DataFrame(tf_data, columns=data.columns)

    tf_categorical_df = pd.DataFrame(
        encoder.inverse_transform(tf_data_df[categorical_variables]), columns=categorical_df.columns
    )
    tf_numerical_df = tf_data_df[numerical_df.columns]
    
    tf_categorical_df.columns = [f'{c}_NNFill' for c in tf_categorical_df]
    tf_numerical_df.columns = [f'{c}_NNFill' for c in tf_numerical_df]

    return pd.concat([tf_categorical_df, tf_numerical_df], axis=1)

def apply_kmeans(train_df, test_df, n_clusters, cat_vars, num_vars, **kwargs):

    combined_data = pd.concat([train_df, test_df], keys=['train', 'test'])

    combined_data = pd.concat(
        [
            pd.get_dummies(combined_data[cat_vars]),
            combined_data[num_vars]
        ],
        axis=1
    )

    train_data = combined_data.xs('train')
    test_data = combined_data.xs('test')
    
    # KMeans doesn't work with blanks.. prob could do this in the concat above but its fine
    for var in num_vars:
        for df in [train_data, test_data]:
            df[var] = df[var].fillna(df[var].mean())

    cluster = KMeans(n_clusters=n_clusters, **kwargs).fit(train_data)

    train_pred = cluster.predict(train_data)
    test_pred = cluster.predict(test_data)
        
    return train_pred, test_pred

def create_yesno_group_map(var):
    ''' create tuple for yes,no group var for variables with Yes, No, Not done levels '''
    return (
        var,
        'yes,no',
        {'No':'No', 'Not done':'No', 'Yes':'Yes'},
        'No'
    )

def stack_imputations(raw_df, tf_df, suffix:str, train:bool, train_only_cols=None):

    train_only_cols = [] if train_only_cols is None else train_only_cols
    
    n = len(raw_df) // len(tf_df)
    og_raw_df = raw_df.iloc[:len(raw_df) //n]
    tmp_df = pd.concat([og_raw_df.ID, tf_df], axis=1)

    if train:
        tmp_df = pd.concat(
            [tmp_df,og_raw_df[train_only_cols]],
            axis=1
        )
            
    tmp_df.columns = ['_'.join(c.split('_')[:-1]) if suffix in c else c for c in tmp_df.columns]

    if train:
        col_order = list(raw_df.columns)
    else:
        col_order = [c for c in raw_df if c not in train_only_cols]
    
    tmp_df = tmp_df[col_order]
    return pd.concat([raw_df, tmp_df])

def avg_predictions(predictions, n_stacks):
    ''' add the first half of items to the second half of items '''
    if len(predictions) % n_stacks != 0:
        raise ValueError("predictions should be even")
    mid = len(predictions) // n_stacks
    return [(predictions[i] + predictions[mid + i])/n_stacks for i in range(mid)]

def sum_predictions(predictions, n_stacks):
    ''' add the first half of items to the second half of items '''
    if len(predictions) % n_stacks != 0:
        raise ValueError("predictions should be even")
    mid = len(predictions) // n_stacks
    return [predictions[i] + predictions[mid + i] for i in range(mid)]

def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_pred = {'prediction': {0: 1.0, 1: 0.0, 2: 1.0}}
    >>> y_pred = pd.DataFrame(y_pred)
    >>> y_pred.insert(0, row_id_column_name, range(len(y_pred)))
    >>> y_true = { 'efs': {0: 1.0, 1: 0.0, 2: 0.0}, 'efs_time': {0: 25.1234,1: 250.1234,2: 2500.1234}, 'race_group': {0: 'race_group_1', 1: 'race_group_1', 2: 'race_group_1'}}
    >>> y_true = pd.DataFrame(y_true)
    >>> y_true.insert(0, row_id_column_name, range(len(y_true)))
    >>> score(y_true.copy(), y_pred.copy(), row_id_column_name)
    0.75
    """
    
    del solution[row_id_column_name]
    del submission[row_id_column_name]

    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'
    for col in submission.columns:
        if not pd.api.types.is_numeric_dtype(submission[col]):
            raise ValueError(f'Submission column {col} must be a number')
    # Merging solution and submission dfs on ID
    merged_df = pd.concat([solution, submission], axis=1)
    merged_df.reset_index(inplace=True)
    merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
    metric_list = []
    for race in merged_df_race_dict.keys():
        # Retrieving values from y_test based on index
        indices = sorted(merged_df_race_dict[race])
        merged_df_race = merged_df.iloc[indices]
        # Calculate the concordance index
        c_index_race = concordance_index(
                        merged_df_race[interval_label],
                        -merged_df_race[prediction_label],
                        merged_df_race[event_label])
        metric_list.append(c_index_race)
    return float(np.mean(metric_list)-np.sqrt(np.var(metric_list)))

In [3]:
data_dict = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/data_dictionary.csv')
train_raw = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/train.csv')
test_raw = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/test.csv')

In [4]:
sub_df = test_raw[['ID']]

In [5]:
# NOTE: Dev only

# from sklearn.model_selection import train_test_split

# X = train_raw.copy()
# y = [0]*len(train_raw)

# train_raw, test_raw, _, _ = train_test_split(X, y, test_size=0.25, random_state=42)

# # train_raw['tgt'] = tmp_train_y
# # test_raw['tgt'] = tmp_test_y

# train_raw = train_raw.reset_index(drop=True)
# test_raw = test_raw.reset_index(drop=True)

In [6]:
for df in [train_raw, test_raw]:
    df['donor_age'] = round(df['donor_age']*2)/2
    df['age_at_hct'] = round(df['age_at_hct']*2)/2

In [7]:
cat_vars = [c for c in data_dict[data_dict.type == 'Categorical'].variable if c != 'efs']
num_vars = [c for c in data_dict[data_dict.type == 'Numerical'].variable if c != 'efs_time']

# Clustering

In [8]:
for df in [train_raw, test_raw]:
    for cat_var in cat_vars:
        df[cat_var] = df[cat_var].fillna('Unknown')

In [9]:
# Data cleaning and grouping for more accurate clusters

cat_var_group_maps = ( # (var, map, fill_val)
    (
        'dri_score',
        'high,other',
        {
            'High':'High', 
            'High - TED AML case <missing cytogenetics':'High', 
            'Very high':'High'
        }, 
        'Other'
    ),
    (
        'cyto_score',
        'poor,other',
        {
            'Poor':'Poor'
        },
        'Other'
    ),
    (
        'cyto_score_detail',
        'favorable,intermediate,poor',
        {
            'Favorable':'Favorable', 
            'Poor':'Poor',
        }, 
        'Intermediate'
    ),
    (
        'conditioning_intensity',
        'mac,nma,ric,other',
        {
            'MAC':'MAC',
            'NMA':'NMA',
            'RIC':'RIC'
        },
        'Other'
    ),
    create_yesno_group_map('psych_disturb'),
    create_yesno_group_map('diabetes'),
    create_yesno_group_map('arrhythmia'),
    create_yesno_group_map('pulm_severe'),
    create_yesno_group_map('hepatic_severe'),
    create_yesno_group_map('prior_tumor'),
)

for var, grp_name, map_, fill_val in cat_var_group_maps:
    train_raw[f'{var}_grp_{grp_name}'] = train_raw[var].map(map_).fillna(fill_val)
    test_raw[f'{var}_grp_{grp_name}'] = test_raw[var].map(map_).fillna(fill_val)

In [10]:
kmeans_variables = [
    # 'comorbidity_score',
    # 'karnofsky_score',
    'tbi_status',
    'prim_disease_hct',
    'age_at_hct',
    'dri_score_grp_high,other',
    'cyto_score_grp_poor,other',
    'cyto_score_detail_grp_favorable,intermediate,poor',
    'conditioning_intensity_grp_mac,nma,ric,other',
    'psych_disturb_grp_yes,no',
    'diabetes_grp_yes,no',
    'arrhythmia_grp_yes,no',
    'pulm_severe_grp_yes,no',
    'hepatic_severe_grp_yes,no',
    'prior_tumor_grp_yes,no'
]

num_kmeans_variables = [c for c in train_raw[kmeans_variables] if c in num_vars]
cat_kmeans_variables = [c for c in train_raw[kmeans_variables] if c in cat_vars]

grp_vars = [c for c in train_raw if '_grp_' in c]
cat_kmeans_variables +=grp_vars

In [11]:
train_raw['cluster_n2'], test_raw['cluster_n2'] = apply_kmeans(
    train_raw, test_raw, 2, cat_kmeans_variables, num_kmeans_variables, random_state=42
)

train_raw['cluster_n3'], test_raw['cluster_n3'] = apply_kmeans(
    train_raw, test_raw, 3, cat_kmeans_variables, num_kmeans_variables, random_state=42
)

train_raw['cluster_n4'], test_raw['cluster_n4'] = apply_kmeans(
    train_raw, test_raw, 4, cat_kmeans_variables, num_kmeans_variables, random_state=42
)

In [12]:
train_raw = train_raw[[c for c in train_raw if not '_grp_' in c]]
test_raw = test_raw[[c for c in test_raw if not '_grp_' in c]]

In [13]:
cat_vars.extend(['cluster_n2', 'cluster_n3', 'cluster_n4'])

In [14]:
display(train_raw.shape)
display(test_raw.shape)

(28800, 63)

(3, 61)

## Calculate Target

In [15]:
from lifelines import KaplanMeierFitter

In [16]:
kmf = KaplanMeierFitter()
kmf.fit(train_raw['efs_time'], train_raw['efs'])
train_raw['kmf'] = kmf.survival_function_at_times(train_raw['efs_time']).values

# Imputation - Numerical: NN Imputation & Union

In [17]:
%%time
# Model Imputations
encoder = OrdinalEncoder()
imputer = KNNImputer(n_neighbors=2)

train_nn_imp = get_model_imputation(train_raw, cat_vars, num_vars, encoder, imputer, fit=True)
test_nn_imp = get_model_imputation(test_raw, cat_vars, num_vars, encoder, imputer)

CPU times: user 55.8 s, sys: 5.43 s, total: 1min 1s
Wall time: 57.7 s


In [18]:
display(len(train_nn_imp) == len(train_raw))
display(len(test_nn_imp) == len(test_raw))

True

True

In [19]:
display(train_raw.head(2))
display(train_nn_imp.head(2))

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10,efs,efs_time,cluster_n2,cluster_n3,cluster_n4,kmf
0,0,N/A - non-malignant indication,No,Unknown,No,,,No TBI,No,6.0,...,No,2.0,No,10.0,0.0,42.356,1,0,1,0.458687
1,1,Intermediate,No,Intermediate,No,2.0,8.0,"TBI +- Other, >cGy",No,6.0,...,No,2.0,Yes,10.0,1.0,4.672,0,2,3,0.847759


Unnamed: 0,dri_score_NNFill,psych_disturb_NNFill,cyto_score_NNFill,diabetes_NNFill,tbi_status_NNFill,arrhythmia_NNFill,graft_type_NNFill,vent_hist_NNFill,renal_issue_NNFill,pulm_severe_NNFill,...,donor_age_NNFill,hla_match_b_low_NNFill,age_at_hct_NNFill,hla_match_a_low_NNFill,hla_match_b_high_NNFill,comorbidity_score_NNFill,karnofsky_score_NNFill,hla_low_res_8_NNFill,hla_match_drb1_high_NNFill,hla_low_res_10_NNFill
0,N/A - non-malignant indication,No,Unknown,No,No TBI,No,Bone marrow,No,No,No,...,42.5,2.0,10.0,2.0,2.0,0.0,90.0,8.0,2.0,10.0
1,Intermediate,No,Intermediate,No,"TBI +- Other, >cGy",No,Peripheral blood,No,No,No,...,72.5,2.0,43.5,2.0,2.0,3.0,90.0,8.0,2.0,10.0


In [20]:
stacks = 1
train_cols = ['efs', 'efs_time', 'kmf']

## V-Stack
train_raw = stack_imputations(train_raw, train_nn_imp, 'NNFill', True, train_cols)
test_raw = stack_imputations(test_raw, test_nn_imp, 'NNFill', False, train_cols)

stacks += 1

## H-Stack
# train_raw = pd.concat([train_raw, train_nn_imp])
# train_raw = pd.concat([test_raw, test_nn_imp])

In [21]:
display(train_raw.shape)
display(test_raw.shape)

(57600, 64)

(6, 61)

In [22]:
tmp_train = train_raw.iloc[:len(train_raw)//2]
tmp_test = test_raw.iloc[:len(test_raw)//2]

train_simp_imp = get_simple_imputations(tmp_train, cat_vars, num_vars)
test_simp_imp = get_simple_imputations(tmp_test, cat_vars, num_vars)

## V-Stack
train_raw = stack_imputations(train_raw, train_simp_imp, 'ModeGrpFill', True, train_cols)
test_raw = stack_imputations(test_raw, test_simp_imp, 'ModeGrpFill', False, train_cols)

stacks += 1

In [23]:
display(train_raw.shape)
display(test_raw.shape)

(86400, 64)

(9, 61)

In [24]:
stacks

3

# Models

## Setup

In [25]:
import xgboost as xgb
import h2o
from h2o.automl import H2OAutoML

In [26]:
predictors = num_vars + cat_vars

## H2O

In [27]:
%%capture

# basic init should be fine
h2o.init()

In [28]:
train_h2o = h2o.H2OFrame(train_raw[predictors+['kmf']])
test_h2o = h2o.H2OFrame(test_raw[predictors])

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [29]:
# H2O likes to (annoyingly) assign data values based on first record sometimes

for h2o_df in [train_h2o, test_h2o]:
    for col in predictors:
        if col in cat_vars:
            if h2o_df[col].types[col] == 'real':  # Check H2O column type
                h2o_df[col] = h2o_df[col].ascharacter().asfactor()
                h2o_df[col] = h2o_df[col].ascharacter().asfactor()
            else:
                h2o_df[col] = h2o_df[col].asfactor()
                h2o_df[col] = h2o_df[col].asfactor()
        else:
            h2o_df[col] = h2o_df[col].asnumeric()
            h2o_df[col] = h2o_df[col].asnumeric()

In [30]:
# AutoML training
h2o_aml = H2OAutoML(
    max_models=5000,
    exclude_algos=['GLM'],
    seed=42,
    nfolds=10,
    max_runtime_secs=4000,
)

h2o_aml.train(
    x=predictors,
    y='kmf',
    training_frame=train_h2o,
    # weights_column = "wgt"
)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees
,110.0

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid,cv_7_valid,cv_8_valid,cv_9_valid,cv_10_valid
aic,,0.0,,,,,,,,,,
loglikelihood,,0.0,,,,,,,,,,
mae,0.1296125,0.0013125,0.1285602,0.1318537,0.1284756,0.1308741,0.1293751,0.1311845,0.1280319,0.1296093,0.1297507,0.1284103
mean_residual_deviance,0.0247068,0.0005018,0.0244389,0.0257092,0.024348,0.0249659,0.0246565,0.0253409,0.0240638,0.0244461,0.024719,0.024379
mse,0.0247068,0.0005018,0.0244389,0.0257092,0.024348,0.0249659,0.0246565,0.0253409,0.0240638,0.0244461,0.024719,0.024379
r2,0.2085118,0.0163575,0.2074753,0.196428,0.2204298,0.214894,0.2108452,0.1804764,0.2155448,0.1893167,0.2119158,0.2377918
residual_deviance,0.0247068,0.0005018,0.0244389,0.0257092,0.024348,0.0249659,0.0246565,0.0253409,0.0240638,0.0244461,0.024719,0.024379
rmse,0.1571766,0.00159,0.1563295,0.160341,0.1560385,0.1580059,0.157024,0.1591884,0.1551252,0.1563526,0.1572228,0.1561378
rmsle,0.0945672,0.0009113,0.0941753,0.0963321,0.0938318,0.0949592,0.0945349,0.0957646,0.0933273,0.0943036,0.0945617,0.0938819

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2025-03-04 02:46:29,12 min 19.320 sec,0.0,0.2061617,0.1444795,0.0425026
,2025-03-04 02:46:31,12 min 21.115 sec,5.0,0.1670388,0.1397607,0.0279020
,2025-03-04 02:46:32,12 min 21.635 sec,10.0,0.1633288,0.1386887,0.0266763
,2025-03-04 02:46:32,12 min 22.158 sec,15.0,0.1615430,0.1370562,0.0260961
,2025-03-04 02:46:33,12 min 22.732 sec,20.0,0.1603685,0.1354049,0.0257181
,2025-03-04 02:46:33,12 min 23.274 sec,25.0,0.1592893,0.1341383,0.0253731
,2025-03-04 02:46:34,12 min 23.819 sec,30.0,0.1584703,0.1331206,0.0251128
,2025-03-04 02:46:34,12 min 24.384 sec,35.0,0.1577732,0.1322333,0.0248924
,2025-03-04 02:46:35,12 min 24.948 sec,40.0,0.1571284,0.1313125,0.0246893
,2025-03-04 02:46:36,12 min 25.513 sec,45.0,0.1566953,0.1308728,0.0245534

variable,relative_importance,scaled_importance,percentage
dri_score.High,146.3498077,1.0,0.1359283
conditioning_intensity.Unknown,129.8757629,0.8874338,0.1206274
comorbidity_score,88.1275253,0.6021704,0.0818520
year_hct,78.4778671,0.5362348,0.0728895
karnofsky_score,67.8612442,0.4636921,0.0630289
cyto_score_detail.Poor,61.1437721,0.4177920,0.0567898
donor_age,31.4043159,0.2145839,0.0291680
age_at_hct,22.0313740,0.1505391,0.0204625
cyto_score.Poor,19.0091934,0.1298887,0.0176556
dri_score.Intermediate,18.7307072,0.1279859,0.0173969


In [31]:
h2o_aml_predictions = sum_predictions(
    [i[0] for i in h2o_aml.leader.predict(test_h2o).as_data_frame(use_multi_thread=True).values],
    stacks
)
h2o_aml_predictions[:3]

xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
Export File progress: |██████████████████████████████████████████████████████████| (done) 100%


[1.0071579217910767, 1.3010954856872559, 0.8768056333065033]

In [32]:
# test_df = test_raw.iloc[:len(test_raw)//stacks]

# test_sol = test_df[['ID', 'efs', 'efs_time', 'race_group']]
# test_sub = test_df[['ID']]
# test_sub['prediction'] = h2o_aml_predictions

# score(test_sol.copy(), test_sub.copy(), 'ID')

In [33]:
sub_df['prediction'] = h2o_aml_predictions
display(sub_df['prediction'])
sub_df.to_csv("submission.csv", index=False)

0    1.007158
1    1.301095
2    0.876806
Name: prediction, dtype: float64