In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
# Estimate a Kaplan-Meier curve per race group and use each subject's survival probability at their own efs_time as the target variable

In [None]:
# Load the data dictionary and the train/test tables from the working directory.
dic, train, test = map(pd.read_csv, ('data_dictionary.csv', 'train.csv', 'test.csv'))

In [None]:
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

# A single fitter instance is re-fitted for each race group and plotted
# immediately, so only the most recent fit survives in `kmf` after the loop.
kmf = KaplanMeierFitter()

for race in train['race_group'].unique():
    race_rows = train.loc[train['race_group'] == race]
    kmf.fit(race_rows['efs_time'], race_rows['efs'], label=race)
    kmf.plot()

plt.title('Kaplan-Meier Curves by Race Group')
plt.xlabel('Time')
plt.ylabel('Survival Probability')
plt.show()


In [None]:
# Fit one Kaplan-Meier estimator per race group so that every subject can be
# scored against the survival curve of their own group.
kmf_models = {}
for race in train['race_group'].unique():
    kmf = KaplanMeierFitter()
    mask = train['race_group'] == race
    kmf.fit(train.loc[mask, 'efs_time'], train.loc[mask, 'efs'], label=race)
    kmf_models[race] = kmf


def get_survival_probability(row):
    """Return the Kaplan-Meier survival estimate for a single subject.

    Parameters
    ----------
    row : mapping (e.g. one DataFrame row)
        Must provide 'race_group' (a key of ``kmf_models``) and 'efs_time'.

    Returns
    -------
    float
        Value of the subject's race-group survival function evaluated at the
        subject's own ``efs_time``.
    """
    race = row['race_group']
    time = row['efs_time']
    return kmf_models[race].survival_function_at_times(time).iloc[0]


# Vectorised equivalent of ``train.apply(get_survival_probability, axis=1)``:
# each group's curve is queried once on its distinct times and the results are
# scattered back to the rows, instead of one lifelines lookup per row.
survival_probability = np.full(len(train), np.nan)
for race, race_kmf in kmf_models.items():
    in_group = (train['race_group'] == race).to_numpy()
    # Query distinct times only; `inverse` maps each row back to its time.
    unique_times, inverse = np.unique(
        train.loc[in_group, 'efs_time'].to_numpy(), return_inverse=True
    )
    group_probs = np.ravel(
        np.asarray(race_kmf.survival_function_at_times(unique_times), dtype=float)
    )
    survival_probability[in_group] = group_probs[np.ravel(inverse)]

train['survival_probability'] = survival_probability

In [None]:
# Log of the survival probability expressed in percent.
# NOTE(review): np.log returns -inf for any row whose survival probability is exactly 0 — confirm none occur.
train['log_survival_probability'] = np.log(100 * train['survival_probability'])

In [None]:
from sklearn.preprocessing import QuantileTransformer
# Map the survival probabilities onto a normal-shaped target through their
# empirical quantiles. The transformer is fitted on every row of `train`.
qt = QuantileTransformer(output_distribution='normal', n_quantiles=1000, random_state=42)
train['q_survival_probability'] = qt.fit_transform(train[['survival_probability']].to_numpy())

In [None]:
# Distributions of the raw, log and quantile-normalised targets.
train.loc[:, ['survival_probability', 'log_survival_probability', 'q_survival_probability']].hist(bins=100)

In [None]:
# Log target against the observed time.
train.plot.scatter(x='efs_time', y='log_survival_probability')

# XGBoost regression on the quantile-transformed Kaplan-Meier target

In [None]:
from sklearn.model_selection import KFold
from xgboost import XGBRegressor, XGBClassifier
import xgboost

In [None]:
# Columns that must never reach the model: identifiers, the raw outcome, and
# every target derived from the outcome. Names missing from `train` are ignored.
RMV = ["ID","efs","efs_time","y", 'efs_time2',
       'survival_probability','log_survival_probability','q_survival_probability']
FEATURES = [c for c in train.columns if c not in RMV]
print(f"There are {len(FEATURES)} FEATURES: {FEATURES}")

# Object-dtype features are treated as categoricals. Missing values become an
# explicit "NAN" level in both frames so they are encoded like any other level.
CATS = []
for c in FEATURES:
    if train[c].dtype=="object":
        CATS.append(c)
        train[c] = train[c].fillna("NAN")
        test[c] = test[c].fillna("NAN")
print(f"In these features, there are {len(CATS)} CATEGORICAL FEATURES: {CATS}")

# Encode on the stacked frame so train and test share one integer code per level.
combined = pd.concat([train,test],axis=0,ignore_index=True)

# LABEL ENCODE CATEGORICAL FEATURES
print("We LABEL ENCODE the CATEGORICAL FEATURES: ",end="")
for c in CATS:
    # LABEL ENCODE CATEGORICAL AND CONVERT TO INT32 CATEGORY
    print(f"{c}, ",end="")
    combined[c],_ = combined[c].factorize()
    # Shift so the smallest code is 0.
    combined[c] -= combined[c].min()
    combined[c] = combined[c].astype("int32").astype("category")

# REDUCE PRECISION OF NUMERICAL FEATURES TO 32BIT TO SAVE MEMORY
# (columns listed in RMV, including the targets, keep their original dtype)
for c in FEATURES:
    if c in CATS:
        continue
    if combined[c].dtype=="float64":
        combined[c] = combined[c].astype("float32")
    elif combined[c].dtype=="int64":
        combined[c] = combined[c].astype("int32")

# Only the rows that came from `train` are carried forward from here.
train_df_split0 = combined.iloc[:len(train)].copy()

In [None]:
# Hold out 20% of the labelled rows; `test_df_split` is this hold-out, not the `test` table.
train_df_split, test_df_split = train_test_split(
    train_df_split0,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import QuantileTransformer
# Per-race transformers are only *fitted* here and kept in `qtDict` for later
# inverse transforms. They must not write into 'q_survival_probability',
# because that column is fully overwritten by the global transformer below.
qtDict = {}
for r in train_df_split['race_group'].unique():
    mask = train_df_split['race_group'] == r
    race_qt = QuantileTransformer(n_quantiles=1000, output_distribution='normal', random_state=42)
    race_qt.fit(train_df_split.loc[mask,'survival_probability'].values.reshape(-1, 1))
    qtDict[r] = race_qt

# Modelling target: one transformer fitted across all race groups of the
# training split. `qt` is reused later to map predictions back.
qt = QuantileTransformer(n_quantiles=1000, output_distribution='normal', random_state=42)
train_df_split['q_survival_probability'] = qt.fit_transform(train_df_split['survival_probability'].values.reshape(-1, 1))

In [None]:
# Model inputs: every column except the identifier, the raw outcome and the
# targets derived from it.
FEATURES = train_df_split.drop(
    columns=['ID', 'efs', 'efs_time', 'survival_probability',
             'log_survival_probability', 'q_survival_probability']
).columns

# Renumber rows 0..n-1 so positional fold indices can be used with .loc.
# drop=True discards the old index instead of inserting it as an 'index'
# column, and plain reassignment keeps the cell safe to re-run: an 'index'
# column would otherwise be picked up by FEATURES on a second execution.
train_df_split = train_df_split.reset_index(drop=True)
test_df_split = test_df_split.reset_index(drop=True)

In [None]:
%%time
FOLDS = 5
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    
oof_xgb = np.zeros(len(train_df_split))
pred_xgb = np.zeros(len(test_df_split))

for i, (train_index, test_index) in enumerate(kf.split(train_df_split)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)
    
    x_train = train_df_split.loc[train_index,FEATURES].copy()
    y_train = train_df_split.loc[train_index,"q_survival_probability"]
    x_valid = train_df_split.loc[test_index,FEATURES].copy()
    y_valid = train_df_split.loc[test_index,"q_survival_probability"]
    x_test = test_df_split[FEATURES].copy()

    dtrain = xgboost.DMatrix(x_train, label=y_train, enable_categorical=True)
    dvalid = xgboost.DMatrix(x_valid, label=y_valid, enable_categorical=True)

    model_xgb = XGBRegressor(
        max_depth=3,  
        colsample_bytree=0.5, 
        subsample=0.8, 
        n_estimators=10_000,  
        learning_rate=0.1, 
        early_stopping_rounds=25,
        #objective='reg:logistic',
        enable_categorical=True,
        min_child_weight=5,
        eval_metric= "mae",
    )
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_train, y_train), (x_valid, y_valid)],  
        verbose=100 
    )

    # INFER OOF
    oof_xgb[test_index] = model_xgb.predict(x_valid)
    # INFER TEST
    pred_xgb += model_xgb.predict(x_test)

# COMPUTE AVERAGE TEST PREDS
pred_xgb /= FOLDS

In [None]:
from metric import score

# Score the out-of-fold predictions (CV) and the fold-averaged hold-out
# predictions with the project metric.
# `oof_xgb` / `pred_xgb` are the arrays produced by the cross-validation cell;
# the previously referenced `oof_xgbR` / `pred_xgbR` are never defined in this
# notebook and raised NameError on a fresh kernel.
# NOTE(review): the prediction direction expected by `score` (higher = more
# risk, or the reverse) is not visible here — confirm against metric.py.
y_trueCV = train_df_split[["ID","efs","efs_time","race_group",'survival_probability']].copy()
y_predCV = train_df_split[["ID"]].copy()
y_predCV["prediction"] = oof_xgb
m, ar0 = score(y_trueCV.copy(), y_predCV.copy(), "ID")

y_true = test_df_split[["ID","efs","efs_time","race_group",'survival_probability']].copy()
y_pred = test_df_split[["ID"]].copy()
y_pred["prediction"] = pred_xgb
n, ar1 = score(y_true.copy(), y_pred.copy(), "ID")

print(f"CV: {m} | Test: {n}")

In [None]:
# Importances are read from `model_xgb`, i.e. whichever model was fitted last.
feature_importance = model_xgb.feature_importances_
importance_df = pd.DataFrame(
    {"Feature": FEATURES, "Importance": feature_importance}
).sort_values(by="Importance", ascending=False)

fig, ax = plt.subplots(figsize=(10, 15))
ax.barh(importance_df["Feature"], importance_df["Importance"])
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("XGBoost KaplanMeier Feature Importance")
ax.invert_yaxis()  # most important feature at the top
plt.show()

# Investigation of the transformed target

In [None]:
# Diagnostic frames: observed outcome columns next to the model output
# ('prediction') and the output mapped back through the global transformer
# `qt` ('predictionR').
diagnostic_columns = ["ID","efs","efs_time","race_group",'survival_probability','q_survival_probability']

y_trueCV = train_df_split[diagnostic_columns].assign(
    prediction=oof_xgb,
    predictionR=qt.inverse_transform(oof_xgb.reshape(-1,1)).ravel(),
)

y_true = test_df_split[diagnostic_columns].assign(
    prediction=pred_xgb,
    predictionR=qt.inverse_transform(pred_xgb.reshape(-1,1)).ravel(),
)

In [None]:
# Target distribution versus out-of-fold prediction distribution.
y_trueCV.loc[:, ['q_survival_probability', 'prediction']].hist(bins=100)

In [None]:
# Per race group: overwrite 'predictionR' using that group's transformer from
# `qtDict`, then compare target and prediction distributions.
# NOTE(review): 'prediction' is on the scale of the 'q_survival_probability'
# target the model was trained on — confirm that inverting it with the
# per-race transformers is intended.
for r in y_trueCV['race_group'].unique():
    mask = y_trueCV['race_group'] == r
    race_predictions = y_trueCV.loc[mask, 'prediction'].values.reshape(-1, 1)
    y_trueCV.loc[mask, 'predictionR'] = qtDict[r].inverse_transform(race_predictions)
    print(r)
    y_trueCV.loc[mask, ['q_survival_probability', 'prediction']].hist(bins=100)

In [None]:
# Rows whose transformed target lies below -3.
y_trueCV[y_trueCV['q_survival_probability'].lt(-3)]

In [None]:
# Survival probability against time for the rows with race_group code 2 (training split).
y_trueCV[y_trueCV['race_group'] == 2].plot.scatter(x='efs_time', y='survival_probability')

In [None]:
# Same view for the hold-out split.
y_true[y_true['race_group'] == 2].plot.scatter(x='efs_time', y='survival_probability')

In [None]:
# Observed survival probabilities versus back-transformed predictions.
y_trueCV.loc[:, ['survival_probability', 'predictionR']].hist(bins=100)