In [35]:
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import concordance_index_censored
from sklearn.model_selection import train_test_split
from sksurv.util import Surv
import pandas as pd
import numpy as np

In [36]:
# Load the prepared table from a CSV file located relative to the
# notebook's working directory.
dataset = pd.read_csv(
    "data_ready_45.csv",
)

In [37]:
# Drop every row whose follow-up time is exactly 30 or exactly 365.
# NOTE(review): the reason for excluding these two values is not visible
# in this notebook — confirm and document it.
dataset = dataset[
    (dataset['time_frame'] != 30) & (dataset['time_frame'] != 365)
]

In [38]:
# Independent snapshot of the filtered table (outcome columns still present).
data_kp = dataset.copy(deep=True)

In [39]:
# Second independent snapshot of the filtered table; it keeps the outcome
# columns, which the Cox model cell needs later on.
data_cox = dataset.copy(deep=True)

In [40]:
# Structured survival target: 'GRF_STAT_PA' is the event indicator and
# 'time_frame' the observed time.
y = Surv.from_dataframe(event='GRF_STAT_PA', time='time_frame', data=dataset)

# Every remaining column is a feature. `dataset` is rebound to the
# feature-only frame, so this cell cannot be re-run without first
# re-running the load/filter cells (the outcome columns are gone).
X = dataset = dataset.drop(columns=['GRF_STAT_PA', 'time_frame'])

In [41]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [42]:
# Random survival forest configuration.
# random_state is pinned so that the bootstrap samples and feature
# sub-sampling — and therefore every downstream metric (C-index, Brier
# score, AUC, permutation importance) — are reproducible across runs.
# The seed matches the one used for the train/test split.
rsf = RandomSurvivalForest(
    n_estimators=100,
    max_depth=12,
    min_samples_split=10,
    min_samples_leaf=15,
    random_state=42,
)

In [43]:
# Train the forest on the training partition; the fitted estimator is the
# cell's displayed value.
rsf.fit(X=X_train, y=y_train)

In [57]:
from sklearn.inspection import permutation_importance

# Permutation importance of every feature, measured on the held-out set
# with 10 shuffles per feature and a fixed seed.
result = permutation_importance(
    rsf,
    X_test,
    y_test,
    n_repeats=10,
    random_state=42,
)

# One row per feature, most important first.
pd.DataFrame(
    {
        "importances_mean": result["importances_mean"],
        "importances_std": result["importances_std"],
    },
    index=X_test.columns,
).sort_values("importances_mean", ascending=False)

In [None]:
# Risk scores predicted by the forest for the held-out rows.
risk_scores_test = rsf.predict(X_test)

# concordance_index_censored returns a 5-tuple:
#   (cindex, concordant, discordant, tied_risk, tied_time)
# `c_index` keeps the full tuple so the pair counts stay available.
c_index = concordance_index_censored(
    y_test['GRF_STAT_PA'],
    y_test['time_frame'],
    risk_scores_test,
)

# Report only the concordance value (element 0) instead of dumping the
# whole tuple under a label that promises a single number.
print(f"Test Concordance Index: {c_index[0]}")

In [None]:
import matplotlib.pyplot as plt
from sksurv.nonparametric import kaplan_meier_estimator

# y_train is a structured array with fields 'GRF_STAT_PA' (event flag) and
# 'time_frame' (observed time). Inverting the event flag makes censoring
# the "event", so the estimator yields the censoring survival curve.
time, prob_censoring = kaplan_meier_estimator(
    event=~y_train['GRF_STAT_PA'],
    time_exit=y_train['time_frame'],
)

# Step plot of the censoring curve.
plt.step(time, prob_censoring, where="post")
plt.title("Kaplan-Meier Curve for Censoring Survival Function")
plt.xlabel("Time")
plt.ylabel("Probability of Not Being Censored")
plt.grid(True)
plt.show()


In [None]:
import numpy as np
from sksurv.nonparametric import kaplan_meier_estimator

# Censoring survival curve on the training targets: the event flag
# ('GRF_STAT_PA') is inverted so that censoring is treated as the event.
time, prob_censoring = kaplan_meier_estimator(
    event=~y_train['GRF_STAT_PA'],
    time_exit=y_train['time_frame'],
)

# Keep only the time points at which the censoring curve is still positive.
valid_times = time[0 < prob_censoring]

# Last such time point, backed off by one time unit.
max_valid_time = valid_times[-1] - 1
print(f"Maximum valid time point where censoring is > 0: {max_valid_time}")


In [9]:
import pandas as pd
from lifelines import CoxPHFitter
from lifelines.utils import concordance_index
from sklearn.model_selection import train_test_split

# data_cox still contains the outcome columns 'time_frame' (duration) and
# 'GRF_STAT_PA' (event flag) next to the covariates.

# 80/20 split with a fixed seed.
df_train, df_test = train_test_split(
    data_cox,
    test_size=0.2,
    random_state=42,
)

# Penalised Cox proportional hazards model fitted on the training rows.
cox_model = CoxPHFitter(penalizer=0.1)
cox_model.fit(
    df_train,
    duration_col='time_frame',
    event_col='GRF_STAT_PA',
)

# Coefficient table and fit statistics.
cox_model.print_summary()

# In-sample concordance as stored on the fitted model.
c_index_train = cox_model.concordance_index_
print(f"Concordance Index (Training Set): {c_index_train}")

# Partial hazards for the held-out rows (larger value = higher risk).
test_predictions = cox_model.predict_partial_hazard(df_test)

# The hazards are negated before scoring so that a larger score goes with
# a longer time-to-event.
c_index_test = concordance_index(
    event_times=df_test['time_frame'],
    predicted_scores=-test_predictions,
    event_observed=df_test['GRF_STAT_PA'],
)
print(f"Concordance Index (Test Set): {c_index_test}")


0,1
model,lifelines.CoxPHFitter
duration col,'time_frame'
event col,'GRF_STAT_PA'
penalizer,0.1
l1 ratio,0.0
baseline estimation,breslow
number of observations,26296
number of events observed,7679
partial log-likelihood,-70471.65
time fit was run,2024-09-15 13:16:36 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
GENDER,0.03,1.03,0.02,-0.02,0.07,0.98,1.08,0.0,1.24,0.21,2.22
PERIP_VASC,0.02,1.02,0.04,-0.05,0.09,0.95,1.09,0.0,0.56,0.58,0.79
AGE_DIAB,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.25,0.80,0.32
CREAT_TRR,-0.01,0.99,0.0,-0.01,-0.0,0.99,1.0,0.0,-2.19,0.03,5.12
AMIS,0.01,1.01,0.02,-0.02,0.05,0.98,1.05,0.0,0.64,0.52,0.94
BMIS,0.02,1.02,0.02,-0.02,0.06,0.98,1.07,0.0,1.17,0.24,2.04
DRMIS,-0.01,0.99,0.02,-0.04,0.03,0.96,1.03,0.0,-0.33,0.74,0.43
HLAMIS,0.01,1.01,0.01,-0.02,0.03,0.98,1.03,0.0,0.52,0.60,0.74
NPKID,0.19,1.21,0.03,0.13,0.24,1.14,1.28,0.0,6.44,<0.005,32.92
NPPAN,0.2,1.22,0.04,0.13,0.27,1.14,1.31,0.0,5.64,<0.005,25.81

0,1
Concordance,0.67
Partial AIC,141091.29
log-likelihood ratio test,1517.51 on 74 df
-log2(p) of ll-ratio test,888.25


Concordance Index (Training Set): 0.6731820352074273
Concordance Index (Test Set): 0.6698539373914603


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Pairwise correlation matrix of every column in data_cox.
corr_matrix = data_cox.corr()

# Enumerate each unordered column pair exactly once by walking the strict
# upper triangle (k=1 skips the diagonal). Unstacking the full matrix would
# list every pair twice — (A, B) and (B, A) — and halve the number of
# distinct pairs visible in the top-N listing below.
row_idx, col_idx = np.triu_indices(len(corr_matrix.columns), k=1)
corr_pairs = pd.DataFrame(
    {
        'Feature1': corr_matrix.index[row_idx],
        'Feature2': corr_matrix.columns[col_idx],
        'Correlation': corr_matrix.to_numpy()[row_idx, col_idx],
    }
)

# Rank by absolute correlation, strongest first; the helper column is
# dropped from the sorted copy without mutating anything in place.
corr_pairs['abs_corr'] = corr_pairs['Correlation'].abs()
sorted_corr_pairs = (
    corr_pairs
    .sort_values(by='abs_corr', ascending=False)
    .drop(columns=['abs_corr'])
)

# Show the top correlations
print(sorted_corr_pairs.head(20))  # Change the number to show more or fewer correlations


In [None]:
import numpy as np
from sksurv.metrics import brier_score, integrated_brier_score

# Evaluation horizons, expressed in the same unit as 'time_frame'.
times = [30, 365, 365*5, 365*10]

# One survival step function per held-out row.
surv_funcs = rsf.predict_survival_function(X_test)

# Evaluate every step function at all horizons at once; the result has one
# row per sample and one column per horizon.
preds = np.asarray([fn(times) for fn in surv_funcs])

# Brier score at each horizon (training targets are passed alongside the
# test targets, as the metric requires both).
brier_scores = brier_score(y_train, y_test, preds, times)
print(f"Brier Scores: {brier_scores}")

# Brier score integrated over the same horizons.
ibs = integrated_brier_score(y_train, y_test, preds, times)
print(f"Integrated Brier Score (IBS): {ibs}")


In [None]:
import numpy as np
# Imported here so the cell does not depend on another cell having been run.
from sksurv.metrics import brier_score, integrated_brier_score

# Define the time points at which to compute the Brier score (30 days, 1 year, 5 years, 10 years)
times = [30, 365, 365 * 5, 365 * 10]

# Ask lifelines for the survival probabilities at exactly these horizons.
# Without `times`, the returned frame is indexed by the durations seen during
# fitting, and `.loc[t]` raises KeyError for any horizon that is not one of
# them — which is guaranteed here for 30 and 365, because rows with those
# `time_frame` values were removed from the data before modelling.
surv_funcs = cox_model.predict_survival_function(df_test, times=times)

# Rows of `surv_funcs` are horizons, columns are samples; transpose to the
# (n_samples, n_times) layout expected by the sksurv metrics.
preds = surv_funcs.loc[times].to_numpy().T

# Build sksurv-style structured targets from the Cox train/test frames.
# They get their own names so the `y_train` / `y_test` arrays used by the
# random-survival-forest cells are not overwritten with arrays that carry
# different field names.
y_test_cox = np.array(
    [(status == 1, time) for status, time in zip(df_test['GRF_STAT_PA'], df_test['time_frame'])],
    dtype=[('event', '?'), ('time', '<f8')],
)
y_train_cox = np.array(
    [(status == 1, time) for status, time in zip(df_train['GRF_STAT_PA'], df_train['time_frame'])],
    dtype=[('event', '?'), ('time', '<f8')],
)

# Compute Brier Score at specific times
brier_scores = brier_score(y_train_cox, y_test_cox, preds, times)
print(f"Brier Scores: {brier_scores}")

# Compute the Integrated Brier Score (IBS) over the specified time points
ibs = integrated_brier_score(y_train_cox, y_test_cox, preds, times)
print(f"Integrated Brier Score (IBS): {ibs}")

In [46]:
import numpy as np
from sksurv.metrics import cumulative_dynamic_auc
from sksurv.util import Surv
import pandas as pd

# Use the existing structured survival arrays directly. Rebinding
# `y_train` / `y_test` to DataFrames (as this cell used to do) made every
# earlier cell that consumes them — model fitting, C-index, Brier score —
# fail when re-run, and the DataFrame round-trip added nothing: the metric
# accepts the structured arrays as they are.
y_train_surv = y_train
y_test_surv = y_test

# Risk score per held-out row: the mean of the predicted cumulative hazard
# function's values.
cum_hazards = rsf.predict_cumulative_hazard_function(X_test)
risk_scores = [np.mean(hazard.y) for hazard in cum_hazards]

# Define time points where you want to compute AUC (in days)
time_points = [30, 365]

# Calculate time-dependent AUC
auc_values, mean_auc = cumulative_dynamic_auc(
    survival_train=y_train_surv,
    survival_test=y_test_surv,
    estimate=risk_scores,
    times=time_points,
)

# Print time-dependent AUC values
print("AUC at different time points:", auc_values)

# Calculate iAUC: trapezoidal area under the AUC curve, normalised by the
# width of the evaluated time window.
iAUC = np.trapz(auc_values, time_points) / (time_points[-1] - time_points[0])

print(f"Integrated AUC (iAUC): {iAUC}")


AUC at different time points: [0.7585972 0.6985981]
Integrated AUC (iAUC): 0.7285976523929951
