In [None]:
#Install lifelines (supplies the KaplanMeierFitter, datasets and log-rank
#helpers imported below); %pip installs into the running kernel's environment
%pip install lifelines

In [None]:
#import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from lifelines import KaplanMeierFitter
from lifelines.utils import median_survival_times
from lifelines.statistics import pairwise_logrank_test
from lifelines import datasets
from google.colab import auth, drive


In [None]:
#Load the Stanford heart transplant dataset from lifelines.datasets into a dataframe
df = datasets.load_stanford_heart_transplants()

In [None]:
#Preview the first few rows to see which columns are available
df.head()

In [None]:
#Profile the data: column names, dtypes and non-null counts
df.info()

In [None]:
#List the distinct values found in each column, one column at a time.
#Only reasonable because this dataset is small; avoid on larger data.
for column_name, column_values in df.items():
    print(column_name)
    print(column_values.unique())

In [None]:
#Summary statistics for the numeric columns
df.describe()

In [None]:
#Instantiate the Kaplan-Meier fitter; it is fitted to the data in the cells below
kmf = KaplanMeierFitter()

In [None]:
#Fit the Kaplan-Meier estimate of the survival function on the full dataset:
#  durations      <- 'stop'  column
#  event_observed <- 'event' column
#  entry          <- 'start' column (time at which each row comes under observation)
#API reference for fit():
##https://lifelines.readthedocs.io/en/latest/fitters/univariate/KaplanMeierFitter.html#lifelines.fitters.kaplan_meier_fitter.KaplanMeierFitter.fit

kmf.fit(
    df['stop'],
    event_observed=df['event'],
    entry=df['start'],
)

In [None]:
#Plot the K-M survival curve (point estimate only) straight from the
#fitted survival_function_ attribute
kmf.survival_function_.plot()
plt.show()

In [None]:
#Plot the K-M Survival Curve w/ 95% CIs
#(the trailing ';' keeps the title's text repr out of the cell output)
kmf.plot_survival_function()
plt.title('K-M Survival Curve w/ 95% CIs');

In [None]:
#Median survival time of the curve currently held by kmf
kmf.median_survival_time_


In [None]:
#Stratify the survival by transplant status
#
#Each group is fitted with its own KaplanMeierFitter, so this cell does not
#overwrite the fit held in `kmf` (previously `kmf` ended up holding the
#"Non Transplant" curve after this cell ran).

#Set up an explicit figure/axis rather than relying on pyplot's current axes
fig, ax = plt.subplots()

#Split the rows by transplant status
trans_df = df[df['transplant'] == 1]
nontrans_df = df[df['transplant'] == 0]

#Fit and draw one curve per group on the shared axis
for group_label, group_df in [("Transplant", trans_df),
                              ("Non Transplant", nontrans_df)]:
    group_kmf = KaplanMeierFitter()
    group_kmf.fit(durations=group_df['stop'],
                  event_observed=group_df['event'],
                  entry=group_df['start'],
                  label=group_label)
    group_kmf.plot_survival_function(ax=ax)

ax.set_title("K-M Survival Plot, stratified by Transplant Status");

In [None]:
#Stratify the survival by surgery status
#
#Each group is fitted with its own KaplanMeierFitter, so this cell does not
#overwrite the fit held in `kmf` (previously `kmf` ended up holding the
#"Non Surgery" curve after this cell ran).

#Set up an explicit figure/axis rather than relying on pyplot's current axes
fig, ax = plt.subplots()

#Split the rows by surgery status
surg_df = df[df['surgery'] == 1]
nonsurg_df = df[df['surgery'] == 0]

#Fit and draw one curve per group on the shared axis
for group_label, group_df in [("Surgery", surg_df),
                              ("Non Surgery", nonsurg_df)]:
    group_kmf = KaplanMeierFitter()
    group_kmf.fit(durations=group_df['stop'],
                  event_observed=group_df['event'],
                  entry=group_df['start'],
                  label=group_label)
    group_kmf.plot_survival_function(ax=ax)

ax.set_title("K-M Survival Plot, stratified by Surgery Status");


In [None]:
#Stratify the survival by surgery status
#and add an at-risk summary table beneath the plot

from lifelines.plotting import add_at_risk_counts

#Set up an explicit figure/axis object
fig, ax = plt.subplots()

#Curve for non-surgery patients
nonsurg_df = df[df['surgery'] == 0]

kmf_nonsurgery = KaplanMeierFitter()
kmf_nonsurgery.fit(durations=nonsurg_df['stop'],
                   event_observed=nonsurg_df['event'],
                   entry=nonsurg_df['start'],
                   label="Non Surgery")
kmf_nonsurgery.plot_survival_function(ax=ax)

#Curve for surgery patients
surg_df = df[df['surgery'] == 1]

kmf_surgery = KaplanMeierFitter()
kmf_surgery.fit(durations=surg_df['stop'],
                event_observed=surg_df['event'],
                entry=surg_df['start'],
                label="Surgery")
kmf_surgery.plot_survival_function(ax=ax)

#Title the main axis directly, so it does not depend on which axes is
#"current" once the at-risk table has been attached
ax.set_title("K-M Survival Plot, stratified by Surgery Status")

#At-risk table: pass BOTH fitters so each group gets its own rows
#(the non-surgery fitter used to be passed twice, leaving surgery out)
add_at_risk_counts(kmf_nonsurgery, kmf_surgery, ax=ax)

#Re-flow the figure so the table below the axis is not clipped
plt.tight_layout()

In [None]:
#Let's look at the log rank test results

#import log rank package
#link here - https://lifelines.readthedocs.io/en/latest/lifelines.statistics.html#lifelines.statistics.logrank_test
from lifelines.statistics import logrank_test

#df is in start/stop format, so one patient can span several rows.
#logrank_test is given one duration per observation and no entry time, so
#using (stop - start) per row would count every interval as a separate
#subject whose clock restarts at zero. Collapse to one row per patient first.
#NOTE(review): assumes df has an 'id' column identifying the patient and that
#each patient's follow-up starts at time 0 - confirm against df.head().

#surgery must be constant within a patient for this grouping to be valid
assert df.groupby('id')['surgery'].nunique().eq(1).all(), \
    "surgery varies within a patient; cannot stratify patients by it"

patient_df = (
    df.groupby('id', as_index=False)
      .agg(duration=('stop', 'max'),     #total follow-up = last stop time
           event=('event', 'max'),       #1 if any of the patient's rows is flagged as an event
           surgery=('surgery', 'first'))
)

surg_patients = patient_df[patient_df['surgery'] == 1]
nonsurg_patients = patient_df[patient_df['surgery'] == 0]

#run the lrt (A = non-surgery, B = surgery)
lrt_results = logrank_test(durations_A=nonsurg_patients['duration'],
                           durations_B=surg_patients['duration'],
                           event_observed_A=nonsurg_patients['event'],
                           event_observed_B=surg_patients['event'])

lrt_results.print_summary()
