In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from lifelines import KaplanMeierFitter

In [None]:
# Read the data dictionary plus the train and test tables, in that order.
dic, train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('data_dictionary.csv', 'train.csv', 'test.csv')
)

In [None]:
# Identifier / outcome / derived-target columns that must not be used as inputs.
RMV = ["ID","efs","efs_time","y", 'efs_time2','log_survival_probability']
FEATURES = [c for c in train.columns if c not in RMV]
print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")

# Feature columns stored as object dtype in the training table are treated as
# categorical; their missing values become the explicit level "NAN".
CATS = [c for c in FEATURES if train[c].dtype == "object"]
for c in CATS:
    train[c] = train[c].fillna("NAN")
    test[c] = test[c].fillna("NAN")
print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")

# Encode on the stacked frame so train and test rows share one code per level.
combined = pd.concat([train, test], axis=0, ignore_index=True)

# 64-bit numeric dtypes and the 32-bit dtype each one is narrowed to.
DOWNCASTS = (("float64", "float32"), ("int64", "int32"))

print("We LABEL ENCODE the CATEGORICAL FEATURES: ",end="")
for c in FEATURES:
    if c in CATS:
        # Integer codes, shifted so the smallest code is 0, stored as an
        # int32-backed pandas category.
        print(f"{c}, ",end="")
        codes = pd.Series(combined[c].factorize()[0], index=combined.index)
        combined[c] = (codes - codes.min()).astype("int32").astype("category")
        continue

    # Numeric column: halve the precision to save memory.
    for wide, narrow in DOWNCASTS:
        if combined[c].dtype == wide:
            combined[c] = combined[c].astype(narrow)

# Undo the stacking: the first len(train) rows are the training rows, the
# remainder are the rows to predict for the submission.
n_train_rows = len(train)
train_df_split0 = combined.iloc[:n_train_rows].copy()
submit_df_split = combined.iloc[n_train_rows:].reset_index(drop=True).copy()

In [None]:
# Signed time target: rows with efs == 0 get their efs_time negated, all other
# rows keep it unchanged. (This column is later the label of the
# "survival:cox" model.)
no_event = train_df_split0["efs"] == 0
signed_time = train_df_split0["efs_time"].copy()
signed_time[no_event] = signed_time[no_event] * -1
train_df_split0["efs_time2"] = signed_time

In [None]:
# Keep 20% of the labelled rows as a hold-out set (fixed seed for reproducibility).
train_df_split, test_df_split = train_test_split(
    train_df_split0,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# One Kaplan-Meier estimator per race_group value, fitted on the training
# split only (durations = efs_time, event indicator = efs).
kmf_models = {}
race_column = train_df_split['race_group']
for race in race_column.unique():
    group_rows = train_df_split[race_column == race]
    estimator = KaplanMeierFitter()
    estimator.fit(group_rows['efs_time'], group_rows['efs'], label=race)
    kmf_models[race] = estimator

def get_survival_probability(row):
    """Return the time-discounted Kaplan-Meier survival value for one row.

    Looks up the estimator stored in ``kmf_models`` for the row's
    ``race_group``, evaluates its survival function at the row's
    ``efs_time`` and divides the result by ``efs_time + 1``.

    Args:
        row: A record exposing ``race_group`` and ``efs_time``.

    Returns:
        The survival-function value at ``efs_time`` divided by ``efs_time + 1``.

    Raises:
        KeyError: If ``kmf_models`` has no entry for the row's race_group.
    """
    elapsed = row['efs_time']
    group_model = kmf_models[row['race_group']]
    survival_at_time = group_model.survival_function_at_times(elapsed).iloc[0]
    return survival_at_time / (elapsed + 1)

# Build the regression target for both splits. Each split has to be scored
# from its OWN rows: applying the function to train_df_split and assigning the
# result to test_df_split would align on the (disjoint) row index and leave
# the whole hold-out column as NaN.
# NOTE(review): the per-race estimators were fitted on the training split, so
# a race_group value that occurs only in the hold-out split would raise
# KeyError here -- confirm every group is present in the training split.
train_df_split['survival_probability'] = train_df_split.apply(get_survival_probability, axis=1)
test_df_split['survival_probability'] = test_df_split.apply(get_survival_probability, axis=1)

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Map the target onto a normal distribution. The transformer is fitted on the
# training split only and then re-used for the hold-out split; `qt` is kept so
# predictions can be mapped back with inverse_transform later.
qt = QuantileTransformer(n_quantiles=1000, output_distribution='normal', random_state=42)

# scikit-learn expects a 2-D (n_samples, 1) array for a single feature.
train_target_column = train_df_split['survival_probability'].values.reshape(-1, 1)
train_df_split['q_survival_probability'] = qt.fit_transform(train_target_column)

test_target_column = test_df_split['survival_probability'].values.reshape(-1, 1)
test_df_split['q_survival_probability'] = qt.transform(test_target_column)

In [None]:
# Model inputs: every column except the identifier, the raw outcomes and the
# targets derived from them.
FEATURES = train_df_split.drop(['ID', 'efs', 'efs_time','survival_probability','q_survival_probability','efs_time2'], axis=1).columns

# Give every frame a clean 0..n-1 index so positional fold indices line up
# with row labels. drop=True discards the old index instead of inserting it as
# an extra "index" column: without it, re-running this cell would pull that
# column into FEATURES, and a further run would fail because "level_0" already
# exists. Reassigning (rather than inplace=True) keeps the cell idempotent.
train_df_split = train_df_split.reset_index(drop=True)
test_df_split = test_df_split.reset_index(drop=True)
submit_df_split = submit_df_split.reset_index(drop=True)

# xgboost kaplan meier

In [None]:
from sklearn.model_selection import KFold
from xgboost import XGBRegressor, XGBClassifier
import xgboost
from sklearn.ensemble import StackingRegressor

In [None]:
%%time
FOLDS = 5
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

# Settings shared by the three XGBoost models fitted in every fold; only the
# tree depth, objective and evaluation metric differ between them.
XGB_SHARED_PARAMS = dict(
    colsample_bytree=0.5,
    subsample=0.8,
    n_estimators=300,
    learning_rate=0.1,
    early_stopping_rounds=25,
    enable_categorical=True,
    min_child_weight=5,
)

# oof_xgb    : out-of-fold stacked predictions for the training split
# pred_xgb   : fold-averaged stacked predictions for the hold-out split
# submit_xgb : fold-averaged stacked predictions for the submission rows
oof_xgb = np.zeros(len(train_df_split))
pred_xgb = np.zeros(len(test_df_split))
submit_xgb = np.zeros(len(submit_df_split))

for i, (train_index, test_index) in enumerate(kf.split(train_df_split)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)

    # KFold.split yields POSITIONAL indices, so rows are taken with .iloc.
    # Label-based .loc only gives the same rows when the index is exactly
    # 0..n-1, i.e. when the index-reset cell has been run beforehand.
    fold_train = train_df_split.iloc[train_index]
    fold_valid = train_df_split.iloc[test_index]

    x_train = fold_train[FEATURES].copy()
    y_train = fold_train["q_survival_probability"]
    y_train2 = fold_train["efs_time2"]

    x_valid = fold_valid[FEATURES].copy()
    y_valid = fold_valid["q_survival_probability"]
    y_valid2 = fold_valid["efs_time2"]

    # Fresh copies each fold: the base-model predictions are appended to these
    # frames as extra columns further down.
    x_test = test_df_split[FEATURES].copy()
    x_submit = submit_df_split[FEATURES].copy()

    # Base model 1: regress the quantile-normalised Kaplan-Meier target.
    model_kaplan = XGBRegressor(
        max_depth=5,
        eval_metric="rmse",
        **XGB_SHARED_PARAMS,
    )
    model_kaplan.fit(
        x_train, y_train,
        eval_set=[(x_train, y_train), (x_valid, y_valid)],
        verbose=100
    )

    # Base model 2: Cox objective on the signed time target efs_time2.
    model_cox = XGBRegressor(
        max_depth=3,
        objective="survival:cox",
        eval_metric="cox-nloglik",
        **XGB_SHARED_PARAMS,
    )
    model_cox.fit(
        x_train, y_train2,
        eval_set=[(x_valid, y_valid2)],
        verbose=100
    )

    # Append both base-model predictions as meta-features. Both predictions
    # are computed BEFORE either column is added, because each base model was
    # fitted on the plain FEATURES columns only.
    # NOTE(review): for x_train these are in-sample predictions (the base
    # models were fitted on the same rows), which can make the stacker trust
    # them more than it should -- consider out-of-fold base predictions.
    for frame in (x_train, x_valid, x_test, x_submit):
        cox_pred = model_cox.predict(frame)
        kaplan_pred = model_kaplan.predict(frame)
        frame["cox"] = cox_pred
        frame["kaplan"] = kaplan_pred

    # Stacking model: same target as model_kaplan, original features plus the
    # two meta-features.
    stacking_model = XGBRegressor(
        max_depth=5,
        eval_metric="rmse",
        **XGB_SHARED_PARAMS,
    )
    stacking_model.fit(
        x_train, y_train,
        eval_set=[(x_train, y_train), (x_valid, y_valid)],
        verbose=100
    )

    # INFER OOF
    oof_xgb[test_index] = stacking_model.predict(x_valid)
    # INFER TEST
    pred_xgb += stacking_model.predict(x_test)
    # INFER SUBMIT
    submit_xgb += stacking_model.predict(x_submit)

# COMPUTE AVERAGE TEST PREDS
pred_xgb /= FOLDS
submit_xgb /= FOLDS

In [None]:
# Map the quantile-normalised predictions back to the original target scale.
oof_xgbR = qt.inverse_transform(oof_xgb.reshape(-1, 1))
pred_xgbR = qt.inverse_transform(pred_xgb.reshape(-1, 1))
# NOTE(review): unlike the two lines above, this overwrites its own input, so
# re-running the cell applies the inverse transform a second time.
submit_xgb = qt.inverse_transform(submit_xgb.reshape(-1, 1))

from metric import score

# Columns handed to `score` as the ground-truth frame.
TRUTH_COLUMNS = ["ID","efs","efs_time","race_group",'survival_probability']


def _truth_and_prediction(frame, predictions):
    """Build the (truth, prediction) frame pair that is passed to ``score``."""
    truth = frame[TRUTH_COLUMNS].copy()
    predicted = frame[["ID"]].copy()
    predicted["prediction"] = predictions
    return truth, predicted


# Out-of-fold score on the training split.
y_trueCV, y_predCV = _truth_and_prediction(train_df_split, oof_xgbR)
m, ar0 = score(y_trueCV.copy(), y_predCV.copy(), "ID")

# Score on the hold-out split (fold-averaged predictions).
y_true, y_pred = _truth_and_prediction(test_df_split, pred_xgbR)
n, ar1 = score(y_true.copy(), y_pred.copy(), "ID")

print(f"CV: {m} | Test: {n}")
# ar0 / ar1 are the second value returned by `score`; uncomment to inspect:
#print(f"c-indexes: CV: {ar0} | Test: {ar1}")

In [None]:
# Submission table: one row per submission ID with its final prediction.
y_submit = submit_df_split.loc[:, ["ID"]].copy()
y_submit["prediction"] = submit_xgb
