# Load packages and preprocess data

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sksurv.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.utils import resample
from sksurv.util import Surv
from sksurv.kernels import clinical_kernel
from sklearn import set_config

from sklearn.metrics import make_scorer
from sksurv.metrics import concordance_index_censored
from sksurv.metrics import concordance_index_ipcw
from sksurv.metrics import cumulative_dynamic_auc
from sksurv.metrics import integrated_brier_score

from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from sksurv.svm import FastKernelSurvivalSVM, FastSurvivalSVM
from sksurv.ensemble import RandomSurvivalForest
from sksurv.tree import SurvivalTree
import shap
import pickle
import time
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import  RandomForestRegressor

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
#dt = pd.read_csv('data_right.csv')
#dt = dt[(dt['event_14d'] != 1) | (dt['length_14d'] != 14)]
#dt[dt.columns[10:29]] = dt[dt.columns[10:29]].astype('category')
#dt_train, dt_val_test = train_test_split(dt, test_size = 0.2, random_state =42)

In [None]:
# Impute missing values. The imputer is fitted on the training set only;
# the fitted imputer is then applied to the validation and test sets.

#imputer = IterativeImputer(random_state = 42)
#train_imputed = imputer.fit_transform(dt_train)
#val_test_imputed = imputer.transform(dt_val_test)
#dt_train_imputed = pd.DataFrame(train_imputed, columns = dt_train.columns)
#dt_val_test_imputed = pd.DataFrame(val_test_imputed, columns = dt_val_test.columns)
#dt_val_imputed, dt_test_imputed = train_test_split(dt_val_test_imputed, test_size = 0.5, random_state =42)

In [None]:
# Save the imputed datasets for reproducibility

#dt_train_imputed.to_csv('dt_train_imputed.csv', index=False)
#dt_val_test_imputed.to_csv('dt_val_imputed_test.csv', index=False)
#dt_val_imputed.to_csv('dt_val_imputed.csv', index=False)
#dt_test_imputed.to_csv('dt_test_imputed.csv', index=False)

In [None]:
def _prepare_split(frame):
    """Return `frame` with categorical dtypes set and day-14 events removed.

    Columns at positions 10..28 are cast to the pandas ``category`` dtype, and
    rows that have ``event_14d == 1`` together with ``length_14d == 14`` are
    dropped (all other rows are kept).
    """
    categorical = frame.columns[10:29]
    frame[categorical] = frame[categorical].astype('category')
    keep = (frame['event_14d'] != 1) | (frame['length_14d'] != 14)
    return frame[keep]


def _make_target(frame):
    """Build the structured survival target with fields 'event' and 'time'."""
    return Surv.from_arrays(frame['event_14d'], frame['length_14d'],
                            name_event='event', name_time='time')


# Load the pre-imputed splits and apply identical preprocessing to each one.
dt_train_imputed = _prepare_split(pd.read_csv('dt_train_imputed.csv'))
dt_val_imputed = _prepare_split(pd.read_csv('dt_val_imputed.csv'))
dt_test_imputed = _prepare_split(pd.read_csv('dt_test_imputed.csv'))

# Predictors are every column from position 10 onwards.
# NOTE(review): assumes 'event_14d' / 'length_14d' sit in the first 10 columns,
# otherwise the outcome would leak into the features -- confirm against the CSV.
X_train = dt_train_imputed.iloc[:, 10:]
X_val = dt_val_imputed.iloc[:, 10:]
X_test = dt_test_imputed.iloc[:, 10:]

y_train = _make_target(dt_train_imputed)
y_val = _make_target(dt_val_imputed)
y_test = _make_target(dt_test_imputed)

# One-hot encode the categorical predictors; the encoder is fitted on the
# training split only and then re-used for validation and test.
categorical_columns = X_train.select_dtypes(include=['category']).columns
encoder = OneHotEncoder()
X_train_encoded = encoder.fit_transform(X_train[categorical_columns])
X_val_encoded = encoder.transform(X_val[categorical_columns])
X_test_encoded = encoder.transform(X_test[categorical_columns])

# Re-attach the non-categorical predictors next to the encoded columns
# (pd.concat with axis=1 aligns the two parts on the row index).
X_train_encoded = pd.concat([X_train.drop(columns=categorical_columns), X_train_encoded], axis=1)
X_val_encoded = pd.concat([X_val.drop(columns=categorical_columns), X_val_encoded], axis=1)
X_test_encoded = pd.concat([X_test.drop(columns=categorical_columns), X_test_encoded], axis=1)


# Define several functions used in the project

In [None]:
def score_survival_model(model, X, y):
    """Return the concordance index of `model` on `(X, y)`.

    The signature matches what scikit-learn expects from a ``scoring``
    callable: ``(estimator, X, y) -> float``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(X)``.
    X : feature matrix passed straight to ``model.predict``.
    y : structured target indexable by ``"event"`` and ``"time"``.

    Returns
    -------
    The first element of the tuple returned by
    ``concordance_index_censored`` (the concordance index itself).
    """
    risk = model.predict(X)
    cindex, *_rest = concordance_index_censored(y["event"], y["time"], risk)
    return cindex

from sklearn.model_selection import KFold
from sklearn.utils import resample
from sksurv.metrics import concordance_index_censored, cumulative_dynamic_auc, integrated_brier_score

In [None]:
# Shared 5-fold splitter (shuffled, fixed seed) passed as `cv` to the grid searches.
cv = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
# Evaluation time points: 1, 2, ..., 13 (np.arange excludes the stop value 14).
y_times = np.arange(1, 14)

# 1. Hyperparameter tuning (cross-validated on the training set) and evaluation on the validation set

## CoxnetSurvivalAnalysis

In [None]:
import warnings

# Globally ignore RuntimeWarning from this point on.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Candidate settings for the CoxNet model.
param_grid = dict(
    alpha_min_ratio=[0.01, 0.1, 1],
    l1_ratio=[0.1, 0.5, 0.9],
)

# refit=False: only best_params_ is read from the search; the winning model
# is rebuilt and fitted separately.
grid_search_coxnet = GridSearchCV(
    CoxnetSurvivalAnalysis(),
    param_grid,
    scoring=score_survival_model,
    cv=cv,
    n_jobs=-1,
    refit=False,
)
grid_search_coxnet.fit(X_train_encoded, y_train)

# Print the best hyperparameters
print(grid_search_coxnet.best_params_)

In [None]:
# Rebuild CoxNet with the selected hyperparameters. fit_baseline_model=True is
# needed so that predict_survival_function can be called below.
coxnet_best = CoxnetSurvivalAnalysis(**grid_search_coxnet.best_params_, fit_baseline_model=True)
coxnet_best.fit(X_train_encoded, y_train)

# Evaluation time points (days 1..13), defined before first use so the cell
# does not depend on a value left over from an earlier cell.
y_times = np.arange(1, 14)

# Concordance index on the validation set.
print(coxnet_best.score(X_val_encoded, y_val))

# Integrated Brier score: one row of survival probabilities per validation sample.
# np.vstack replaces np.row_stack, which is deprecated in NumPy 2.0.
prob = np.vstack([fn(y_times) for fn in coxnet_best.predict_survival_function(X_val_encoded)])
print(integrated_brier_score(y_train, y_val, prob, y_times))

# Mean cumulative/dynamic AUC over the evaluation time points.
risk_scores = coxnet_best.predict(X_val_encoded)
auc, mean_auc = cumulative_dynamic_auc(y_train, y_val, risk_scores, y_times)
print(mean_auc)

## Survival tree

In [None]:
warnings.filterwarnings("ignore", category=RuntimeWarning)
from sksurv.tree import SurvivalTree

# Candidate tree-shape settings for the survival tree.
param_grid = dict(
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# refit=False: only best_params_ is read from the search; the winning model
# is rebuilt and fitted separately.
grid_search_SVT = GridSearchCV(
    SurvivalTree(random_state=42),
    param_grid,
    scoring=score_survival_model,
    cv=cv,
    n_jobs=-1,
    refit=False,
)
grid_search_SVT.fit(X_train_encoded, y_train)

# Print the best hyperparameters
print(grid_search_SVT.best_params_)

In [None]:
# Rebuild the survival tree with the selected hyperparameters.
SVT_best = SurvivalTree(**grid_search_SVT.best_params_, random_state=42)
SVT_best.fit(X_train_encoded, y_train)

# Evaluation time points (days 1..13), defined before first use so the cell
# does not depend on a value left over from an earlier cell.
y_times = np.arange(1, 14)

# Concordance index on the validation set.
print(SVT_best.score(X_val_encoded, y_val))

# Integrated Brier score: one row of survival probabilities per validation sample.
# np.vstack replaces np.row_stack, which is deprecated in NumPy 2.0.
prob = np.vstack([fn(y_times) for fn in SVT_best.predict_survival_function(X_val_encoded)])
print(integrated_brier_score(y_train, y_val, prob, y_times))

# Mean cumulative/dynamic AUC over the evaluation time points.
risk_scores = SVT_best.predict(X_val_encoded)
auc, mean_auc = cumulative_dynamic_auc(y_train, y_val, risk_scores, y_times)
print(mean_auc)

## Gradient Boosting Survival analysis

In [None]:
import warnings

# Globally ignore UserWarning from this point on.
warnings.filterwarnings("ignore", category=UserWarning)

# Candidate settings for the GradientBoostingSurvivalAnalysis model.
param_grid = dict(
    learning_rate=[0.01, 0.1, 1],
    n_estimators=[50, 100, 500],
    max_depth=[1, 2, 3],
    min_samples_leaf=[6, 10, 15, 20],
)

gbsa = GradientBoostingSurvivalAnalysis(random_state=42)

# refit=False: only best_params_ is read from the search; the winning model
# is rebuilt and fitted separately.
grid_search_gbsa = GridSearchCV(
    gbsa,
    param_grid,
    scoring=score_survival_model,
    cv=cv,
    n_jobs=-1,
    refit=False,
)
grid_search_gbsa.fit(X_train_encoded, y_train)

print(grid_search_gbsa.best_params_)

In [None]:
# Rebuild the gradient-boosting model with the selected hyperparameters.
gbsa_best = GradientBoostingSurvivalAnalysis(**grid_search_gbsa.best_params_, random_state=42)
gbsa_best.fit(X_train_encoded, y_train)

# Evaluation time points (days 1..13), defined before first use so the cell
# does not depend on a value left over from an earlier cell.
y_times = np.arange(1, 14)

# Concordance index on the validation set.
print(gbsa_best.score(X_val_encoded, y_val))

# Integrated Brier score: one row of survival probabilities per validation sample.
# np.vstack replaces np.row_stack, which is deprecated in NumPy 2.0.
prob = np.vstack([fn(y_times) for fn in gbsa_best.predict_survival_function(X_val_encoded)])
print(integrated_brier_score(y_train, y_val, prob, y_times))

# Mean cumulative/dynamic AUC over the evaluation time points.
risk_scores = gbsa_best.predict(X_val_encoded)
auc, mean_auc = cumulative_dynamic_auc(y_train, y_val, risk_scores, y_times)
print(mean_auc)

## Random Survival Forest

In [None]:
import warnings
from sksurv.ensemble import RandomSurvivalForest

# Globally ignore RuntimeWarning from this point on.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Candidate settings for the random survival forest.
# 'auto' is intentionally not listed under max_features: for this estimator it
# was a legacy alias of 'sqrt' and is no longer accepted by current releases,
# so it only duplicated (or failed) one third of the grid.
param_grid = {
    'n_estimators': [100, 500, 700, 1000],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [6, 10, 15, 20],
    'max_features': ['sqrt', 'log2'],
}

RFV = RandomSurvivalForest(random_state=42)

# refit=False: only best_params_ is read from the search; the winning model
# is rebuilt and fitted separately.
grid_search_RSF = GridSearchCV(RFV, param_grid, cv=cv, scoring=score_survival_model,
                               n_jobs=-1, refit=False)

grid_search_RSF.fit(X_train_encoded, y_train)

print(grid_search_RSF.best_params_)

In [None]:
# Rebuild the random survival forest with the selected hyperparameters.
RVF_best = RandomSurvivalForest(**grid_search_RSF.best_params_, random_state=42)
RVF_best.fit(X_train_encoded, y_train)

# Evaluation time points (days 1..13), defined before first use so the cell
# does not depend on a value left over from an earlier cell.
y_times = np.arange(1, 14)

# Concordance index on the validation set.
print(RVF_best.score(X_val_encoded, y_val))

# Integrated Brier score: one row of survival probabilities per validation sample.
# np.vstack replaces np.row_stack, which is deprecated in NumPy 2.0.
prob = np.vstack([fn(y_times) for fn in RVF_best.predict_survival_function(X_val_encoded)])
print(integrated_brier_score(y_train, y_val, prob, y_times))

# Mean cumulative/dynamic AUC over the evaluation time points.
risk_scores = RVF_best.predict(X_val_encoded)
auc, mean_auc = cumulative_dynamic_auc(y_train, y_val, risk_scores, y_times)
print(mean_auc)

In [None]:
from sksurv.metrics import concordance_index_ipcw

# Tuned models and their display names, in matching order.
models = [coxnet_best, SVT_best, gbsa_best, RVF_best]
model_names = ['CoxNet', 'Survival tree', 'Gradient Boosting Survival', 'Random Survival Forest']

# Evaluation time points (days 1..13) for the time-dependent AUC.
y_times = np.arange(1, 14)

fig, ax = plt.subplots(figsize=(10, 6))
for model, model_name in zip(models, model_names):
    # Each model was already fitted on (X_train_encoded, y_train) in the cell
    # that created it, so it is not refitted here.
    risk_scores = model.predict(X_val_encoded)
    auc, mean_auc = cumulative_dynamic_auc(y_train, y_val, risk_scores, y_times)

    # Calculate concordance index IPCW; the first tuple element is the index itself.
    c_index_value = concordance_index_ipcw(y_train, y_val, risk_scores)[0]
    print(f"Concordance Index IPCW for {model_name}: {c_index_value:.3f}")

    ax.plot(y_times, auc, marker="o", label=f"{model_name}")

ax.set_xlabel("Days from discharge")
ax.set_ylabel("Time-dependent AUC")
ax.axhline(0.5, linestyle="--", color='gray', label="Random")
ax.legend()
# The curves show AUC over time (not ROC curves), so the title says so.
ax.set_title("Time-dependent AUC for Survival Models")
ax.grid(True)
plt.show()


# 2. Internal validation

In [None]:
# The random survival forest is used as the final model and is fitted on the
# training split before being scored on the held-out test split.
final = RVF_best
final.fit(X_train_encoded, y_train)

# Time-dependent AUC on the test set at days 1..13.
y_times = np.arange(1, 14)
risk_scores = final.predict(X_test_encoded)
auc, mean_auc = cumulative_dynamic_auc(y_train, y_test, risk_scores, y_times)

# Per-day AUC as a line, with the mean AUC as a dashed horizontal reference.
# NOTE(review): the x-axis label here differs from the "Days from discharge"
# label used in the model-comparison plot -- confirm which time origin is right.
fig, ax = plt.subplots()
ax.plot(y_times, auc, marker="o")
ax.axhline(mean_auc, linestyle="--")
ax.set_xlabel("days from enrollment")
ax.set_ylabel("time-dependent AUC")
ax.grid(True)

In [None]:
# Concordance index of the final model on the test set.
print(final.score(X_test_encoded, y_test))

# Integrated Brier score: one row of survival probabilities per test sample.
# np.vstack replaces np.row_stack, which is deprecated in NumPy 2.0.
prob = np.vstack([fn(y_times) for fn in final.predict_survival_function(X_test_encoded)])
print(integrated_brier_score(y_train, y_test, prob, y_times))

# Mean time-dependent AUC computed in the previous cell.
print(mean_auc)