This notebook performs survival analysis (Kaplan–Meier curves and Cox regression) on the CVD/CAD patient data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter, plotting, CoxPHFitter

plt.style.use('bmh')
mpl.rcParams.update({
    "grid.linestyle" : "dashed",
    "axes.facecolor" : "white",
    "axes.spines.top" : False,
    "axes.spines.right" : False,
    "legend.frameon" : True,
    "figure.figsize" : (8, 5),
    "figure.dpi" : 500,
})

%matplotlib inline

In [None]:
# Base CVD table; the two date columns are parsed from "YYYY/MM/DD" text.
df = pd.read_csv("./data/cad_old/data_cvd.csv", index_col=0, decimal='.')
df["Follow Up Data"] = pd.to_datetime(df["Follow Up Data"], format="%Y/%m/%d")
df["Data prelievo"] = pd.to_datetime(df["Data prelievo"], format="%Y/%m/%d")

# Extra columns come from the feature table and are attached by index;
# rows of df with no match in df_7y get NaN for both columns.
df_7y = pd.read_csv("./data/cad/data_feat.csv", index_col=0, decimal='.')
extra_cols = df_7y[["Creatinina", "Survive7Y"]]
df = df.merge(extra_cols, how="left", left_index=True, right_index=True)

# (x + 1) % 2 flips a 0/1 value — assumes Survive7Y is a 0/1 flag, TODO confirm.
df["Death7Y"] = (df["Survive7Y"] + 1) % 2

In [None]:
# Birth event: "Data prelievo"
# Death event: "Death7Y"
# Right-censoring occurs if the patient is still alive at "Follow Up Data"

# Duration and observation features
df_surv = pd.DataFrame()
df_surv["Death7Y"] = df["Death7Y"]
# Follow-up length in started years: 0-364 days -> 1, 365-729 days -> 2, ...
df_surv["duration"] = (df["Follow Up Data"] - df["Data prelievo"]).dt.days // 365 + 1
df_surv["observed"] = df["CVD Death"]

# Random IDs in [0, 1e6). A seeded generator makes them identical on every
# run, and sampling without replacement guarantees they are unique (the
# previous unseeded randint could produce duplicate IDs). Raises ValueError
# if there are more than 1,000,000 rows.
id_rng = np.random.default_rng(seed=0)
df_surv["ID"] = id_rng.choice(1_000_000, size=len(df_surv), replace=False)

# Plot duration: share of patients per follow-up length (in years)
df_surv["duration"].value_counts(normalize=True).sort_index().plot(kind='bar')

# Attach the survival columns to the feature table (left join on the index:
# every row of df_feat is kept, unmatched rows get NaN survival columns).
df_feat = pd.read_csv("./data/cad/data_feat.csv", index_col=0, decimal='.')
df_surv = df_feat.merge(df_surv, how="left", left_index=True, right_index=True)
# df_surv.to_csv("data/cad/data_surv.csv")

### Survival Analysis on all the data
The plots show the estimated fraction of CVD patients still alive after n years of follow-up. For example, the probability of surviving more than 7 years is ~92%.

In [None]:
fig = plt.figure(figsize=(20, 5), dpi=500)
fig_dims = (1, 2)
fig.subplots_adjust(hspace=0.2, wspace=0.2)

# One Kaplan-Meier panel per event definition:
# (grid cell, event column, panel title, y-axis label)
panels = [
    ((0, 0), "observed", 'Survival Function of CVD Patients', "Survive"),
    ((0, 1), "Death7Y", 'Survival Function of CVD Patients (7 Years)', "Survive7Y"),
]

fitters = []
for grid_cell, event_col, panel_title, y_label in panels:
    plt.subplot2grid(fig_dims, grid_cell)
    km = KaplanMeierFitter()
    km.fit(df_surv["duration"], event_observed=df_surv[event_col], label="CVD")
    km.plot_survival_function()
    plt.title(panel_title)
    plt.xlabel("Years")
    plt.ylabel(y_label)
    plt.ylim(0, 1)
    # Adds the at-risk table under the axes that are current at this point.
    plotting.add_at_risk_counts(km)
    fitters.append(km)

# Keep the fitted estimators available under their usual names.
kmf1, kmf2 = fitters

# plt.savefig("")
# kmf1.median_survival_time_
# kmf1.confidence_interval_

### Survival Analysis Univariate

In [None]:
# Rows from test.csv, sorted by index, with the survival columns attached
# (left join on the index) and the hyperlipemia column given a short name.
df_test = (
    pd.read_csv("./data/cad/test.csv", index_col=0, decimal='.')
    .sort_index()
    .merge(
        df_surv[["duration", "observed"]],
        how="left",
        left_index=True,
        right_index=True,
    )
    .rename(columns={"Hyperlipemia\nHistoty of hyperlipemia": "Dyslipidemia"})
)

In [None]:
fig = plt.figure(figsize=(12, 3.5), dpi=400)
fig_dims = (1, 2)
fig.subplots_adjust(hspace=0.10, wspace=0.20)

# One panel per covariate (left: FE, right: Age). Patients are split at the
# covariate's mean and a Kaplan-Meier curve is drawn for each side. Only the
# left panel carries the y-axis label, the y tick labels and an explicitly
# placed legend; the right panel hides the y tick labels.
for grid_cell, column, is_left_panel in (((0, 0), "FE", True), ((0, 1), "Age", False)):
    plt.subplot2grid(fig_dims, grid_cell)

    mean = df_test[column].mean()
    df_under_mean = df_test[df_test[column] <= mean]
    df_above_mean = df_test[df_test[column] > mean]

    # Solid line: at or below the mean.
    kmf1 = KaplanMeierFitter()
    kmf1.fit(
        df_under_mean["duration"],
        event_observed=df_under_mean["observed"],
        label=f"{column} <= {int(mean)}",
    )
    kmf1.plot_survival_function()

    # Dashed line: above the mean.
    kmf2 = KaplanMeierFitter()
    kmf2.fit(
        df_above_mean["duration"],
        event_observed=df_above_mean["observed"],
        label=f"{column} > {int(mean)}",
    )
    kmf2.plot_survival_function(linestyle="--")

    plt.title(f'Survival Function of CAD Patients ({column})')
    plt.xlabel("Years")
    if is_left_panel:
        plt.ylabel("Survivals")
    else:
        plt.tick_params(labelleft=False)
    plt.ylim(0, 1)
    if is_left_panel:
        plt.legend(loc="lower left")
    plotting.add_at_risk_counts(kmf1, kmf2)

### Survival Analysis Model 

In [None]:
def plot_kmf(df_pred, threshold, variable, hide_y_axis=None):
    """Plot Kaplan-Meier curves for the two groups split by model output.

    Rows of ``df_pred`` are divided into ``ModelOutput <= threshold`` (solid
    line) and ``ModelOutput > threshold`` (dashed line). Each non-empty group
    is fitted with its own ``KaplanMeierFitter`` and drawn on the current
    matplotlib axes, followed by an at-risk table for the fitted groups.

    Parameters
    ----------
    df_pred : pandas.DataFrame
        Must contain the columns "ModelOutput", "duration" and "observed".
    threshold : float
        Cut-off applied to "ModelOutput"; also used in the legend labels.
    variable : str
        Name shown in parentheses in the panel title.
    hide_y_axis : bool, optional
        Whether to blank the y-axis label and hide the y tick labels (useful
        for right-hand panels of a grid). ``None`` (default) keeps the
        historical behaviour: hidden exactly when ``threshold == 0.7``.
    """
    scores = df_pred["ModelOutput"]
    groups = (
        (df_pred[scores <= threshold], f"<= {threshold}", {}),
        (df_pred[scores > threshold], f"> {threshold}", {"linestyle": "--"}),
    )

    fitters = []
    for group, label, line_kwargs in groups:
        if group.empty:
            # A threshold outside the score range leaves one side empty;
            # lifelines rejects empty input, so skip that side and still
            # draw the other one.
            continue
        kmf = KaplanMeierFitter()
        kmf.fit(group["duration"], event_observed=group["observed"], label=label)
        kmf.plot_survival_function(**line_kwargs)
        fitters.append(kmf)

    plt.title(f'Survival Function of CAD Patients ({variable})')
    plt.xlabel("Years")
    plt.ylabel("Survivals")

    if hide_y_axis is None:
        # Legacy default: existing callers use 0.7 for the right-hand column.
        hide_y_axis = threshold == 0.7
    if hide_y_axis:
        plt.ylabel("")
        plt.tick_params(labelleft=False)

    plt.ylim(0, 1)
    if fitters:
        plotting.add_at_risk_counts(*fitters)

In [None]:
df_out = pd.read_csv("./data/cad/extra_test_output.csv", index_col=0, decimal='.')
df_out7 = pd.read_csv("./data/cad/extra_test_output_top.csv", index_col=0, decimal='.')

# Model 18 features: drop the patients that have no row in df_out, then
# attach the model output columns by index.
no_pred18 = set(df_surv.index).difference(df_out.index)
df_test_pred = (
    df_surv
    .drop(index=list(no_pred18))
    .merge(df_out, how="left", left_index=True, right_index=True)
)

# Model 7 features: same procedure with the second output file.
no_pred7 = set(df_surv.index).difference(df_out7.index)
df_test7_pred = (
    df_surv
    .drop(index=list(no_pred7))
    .merge(df_out7, how="left", left_index=True, right_index=True)
)

In [None]:
# 2x2 grid of Kaplan-Meier panels: one row per model, one column per threshold.
fig = plt.figure(figsize=(12, 10), dpi=400)
fig_dims = (2, 2)
fig.subplots_adjust(hspace=1, wspace=0.2)

model_rows = [(df_test_pred, "Model18"), (df_test7_pred, "Model7")]
for row, (pred_frame, model_name) in enumerate(model_rows):
    for col, cutoff in enumerate((0.6, 0.7)):
        plt.subplot2grid(fig_dims, (row, col))
        plot_kmf(pred_frame, threshold=cutoff, variable=model_name)

### COX Regression and p-value 

In [None]:
# Rebuild the prediction frames for the Cox section from the two output files.
df_out = pd.read_csv("./data/cad/extra_test_output.csv", index_col=0, decimal='.')
df_out7 = pd.read_csv("./data/cad/extra_test_output_top.csv", index_col=0, decimal='.')

# Model 18 all features: drop the patients that have no row in df_out, then
# attach the model output columns by index.
no_pred18 = set(df_surv.index).difference(df_out.index)
df_test_pred = (
    df_surv
    .drop(index=list(no_pred18))
    .merge(df_out, how="left", left_index=True, right_index=True)
)

# Model 7 top features: same procedure with the second output file.
no_pred7 = set(df_surv.index).difference(df_out7.index)
df_test7_pred = (
    df_surv
    .drop(index=list(no_pred7))
    .merge(df_out7, how="left", left_index=True, right_index=True)
)

In [None]:
# Column names of the seven covariates used by the "Top 7" Cox models in the
# cells below. The strings are used verbatim as DataFrame column labels
# (including the embedded "\n" and the "Histoty" spelling); presumably they
# match the headers of the feature CSV — confirm before editing them.
top = [
    "Hyperlipemia\nHistoty of hyperlipemia",
    "FE",
    "Previous CABG",
    "Diabetes\nHistory of diabetes",
    "Previous Myocardial Infarction",
    "Smoke\nHistory of smoke",
    "Documented resting \nor exertional ischemia",
]

Univariate

In [None]:
# Single-covariate Cox model: "Angina" against the follow-up duration/event.
univariate_cols = ["Angina", "duration", "observed"]
cox_input = df_test_pred[univariate_cols]

cph = CoxPHFitter()
cph.fit(cox_input, duration_col='duration', event_col='observed')
# Alternative covariate (7-feature model score):
# cph.fit(df_test7_pred.loc[:, ["ModelOutput", "duration", "observed"]], duration_col='duration', event_col='observed')
cph.print_summary()

Multivariate Top 7 variables

In [None]:
# Cox model on the seven `top` covariates only.
cox_input = df_test_pred[[*top, "duration", "observed"]]

cph = CoxPHFitter()
cph.fit(cox_input, duration_col='duration', event_col='observed')
cph.print_summary()

Multivariate Top 7 variables + Model 7

In [None]:
# Cox model on the seven `top` covariates plus the "ModelOutput" column of
# the 7-feature prediction frame.
cox_input = df_test7_pred[[*top, "ModelOutput", "duration", "observed"]]

cph = CoxPHFitter()
cph.fit(cox_input, duration_col='duration', event_col='observed')
cph.print_summary()

Multivariate Top 7 variables + Model 18

In [None]:
# Cox model on the seven `top` covariates plus the "ModelOutput" column of
# the 18-feature prediction frame.
cox_input = df_test_pred[[*top, "ModelOutput", "duration", "observed"]]

cph = CoxPHFitter()
cph.fit(cox_input, duration_col='duration', event_col='observed')
cph.print_summary()

Multivariate all variables

In [None]:
# Every remaining column is used as a covariate; the outcome flags, the
# random ID and the model score are excluded first.
non_covariates = ["Survive7Y", "Death7Y", "ID", "ModelOutput"]
df_feat = df_test_pred.drop(columns=non_covariates)

cph = CoxPHFitter()
cph.fit(df_feat, duration_col='duration', event_col='observed')
cph.print_summary()