In [7]:
import os
import pandas as pd
import numpy as np
from lifelines import CoxPHFitter
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler

from plotly.subplots import make_subplots

# Set export path

In [10]:
# Directory that receives the exported figures. The trailing slash is kept
# on purpose: file names are appended directly to this string further down.
path = '../Results/Survival analysis/'
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(path, exist_ok=True)

# Load the survival dataset

In [11]:
# Read the survival table from CSV (path is relative to the notebook's
# working directory). The functions below read the `cohort`,
# `days_from_wave1` and `dead` columns plus whichever covariates are requested.
survival_data = pd.read_csv('../Datasets/survival_data.csv')
# A bare name as the last expression makes the notebook render the frame.
survival_data

Unnamed: 0,part_id,cohort,dead,age_days,Fitness,Gradient,norm_gradient,Max initial vaf,Max last vaf,days_from_wave1,sex
0,LBC0001A,21.0,1.0,33807.0,0.186712,0.034300,0.005009,0.0130,0.2135,13.557837,0
1,LBC0031R,21.0,1.0,33778.0,0.196813,0.003919,0.000022,0.0364,0.0168,13.478439,1
2,LBC0040V,21.0,1.0,32474.0,,0.329211,0.006518,0.0774,0.8262,9.908282,0
3,LBC0046H,21.0,0.0,36526.0,0.244285,0.003593,0.000027,0.0465,0.0196,21.002053,0
4,LBC0047K,21.0,1.0,31552.0,,0.033925,0.012244,0.3609,0.4440,7.383984,1
...,...,...,...,...,...,...,...,...,...,...,...
84,LBC361133,36.0,0.0,30940.0,,0.004215,0.000043,0.0136,0.0174,14.707734,1
85,LBC361172,36.0,0.0,30977.0,0.056521,0.005348,0.000153,0.0287,0.0418,14.809035,0
86,LBC361214,36.0,0.0,30773.0,0.321540,0.007467,0.000470,0.0629,0.0853,14.250513,1
87,LBC361225,36.0,0.0,30890.0,0.113655,0.025767,0.006140,0.2383,0.3156,14.570842,1


# Cox proportional hazards analysis
### Helper functions (shared by every cohort and covariate analysed below)

In [12]:
def survival_analysis(keep_columns, cohort, data=None):
    """Fit a Cox proportional hazards model on z-scored covariates.

    Rows are restricted to the requested cohort(s), rows with a missing value
    in any model column are dropped, and every covariate is centred and scaled
    to unit standard deviation (population form, ddof=0) before fitting.

    Parameters
    ----------
    keep_columns : sequence of str
        Names of the columns used as covariates.
    cohort : iterable
        Values of the ``cohort`` column to keep (e.g. ``[21]`` or ``[21, 36]``).
    data : pandas.DataFrame, optional
        Table to analyse. It must contain ``cohort``, ``days_from_wave1``,
        ``dead`` and the covariate columns. Defaults to the notebook-level
        ``survival_data`` frame, so existing two-argument calls are unchanged.

    Returns
    -------
    CoxPHFitter
        The fitted model; its summary is printed as a side effect.

    Raises
    ------
    ValueError
        If no complete rows remain after filtering, or if a covariate has zero
        variance and therefore cannot be standardised.
    """
    if data is None:
        # Fall back to the frame loaded earlier in the notebook.
        data = survival_data

    covariates = list(keep_columns)  # also accepts tuples / other sequences
    model_columns = covariates + ['days_from_wave1', 'dead']

    # Select cohort rows and model columns, then drop incomplete rows.
    # The final copy() guarantees the assignments below never touch `data`.
    cox_data = data.loc[data['cohort'].isin(cohort), model_columns].dropna().copy()
    if cox_data.empty:
        raise ValueError(
            f'No complete rows left for cohort {list(cohort)} and columns {covariates}')

    # Normalise columns used for regression (z-score with population std)
    for column in covariates:
        centred = cox_data[column] - cox_data[column].mean()
        spread = centred.std(ddof=0)
        # `not spread > 0` also catches NaN, which a plain `== 0` would miss
        if not spread > 0:
            raise ValueError(
                f'Covariate {column!r} has zero variance and cannot be standardised')
        cox_data[column] = centred / spread

    # Train Cox proportional hazard model
    cph = CoxPHFitter()
    cph.fit(cox_data, duration_col='days_from_wave1', event_col='dead')

    # Individual results remain accessible afterwards through cph.summary
    cph.print_summary()

    return cph

In [13]:
def plot_hr_analysis(model, covariate):
    """Plot a covariate's hazard ratio next to partial-effect survival curves.

    The left panel shows the hazard ratio of ``covariate`` with asymmetric
    error bars taken from the exponentiated confidence interval. The right
    panel shows the curves produced by
    ``model.plot_partial_effects_on_outcome`` for covariate values -2, 0 and 2.
    The trace labels ('-2 SD', 'Mean', '2 SD') assume the covariates were
    standardised before fitting, as ``survival_analysis`` in this notebook does.

    Parameters
    ----------
    model : fitted lifelines CoxPHFitter
        Must expose ``params_``, ``hazard_ratios_``, ``confidence_intervals_``
        and ``plot_partial_effects_on_outcome``.
    covariate : str
        Covariate used for the title, legend and axis labels. If it is one of
        the model's covariates its own hazard ratio is plotted; otherwise the
        model's first covariate is used.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with one row and two columns of subplots.
    """
    covariate_names = list(model.params_.index)
    # Row of the requested covariate in the fitted model (first row as fallback).
    row = covariate_names.index(covariate) if covariate in covariate_names else 0

    # .iloc: positional `[]` on a label-indexed Series is deprecated in pandas.
    hazard_ratio = model.hazard_ratios_.iloc[row]
    # confidence_intervals_ holds log-scale bounds; exponentiate to the HR scale.
    ci_lower, ci_upper = np.exp(
        np.asarray(model.confidence_intervals_, dtype=float)[row, :2])

    fig = make_subplots(rows=1, cols=2, column_widths=[0.3, 0.7],
                        subplot_titles=('Estimated hazard ratio', 'Survival stratification'))

    fig.add_trace(
        go.Scatter(
            y=[hazard_ratio],
            x=[covariate],
            marker_symbol='diamond',
            marker_size=15,
            showlegend=False,
            # One point, so exactly one upper and one lower error length.
            error_y=dict(
                type='data',
                symmetric=False,
                array=[ci_upper - hazard_ratio],
                arrayminus=[hazard_ratio - ci_lower])
            ), row=1, col=1)

    # Plot covariate effect. The loop variable is deliberately NOT called
    # `covariate`, so the labels below keep using the caller's covariate.
    values = [-2, 0, 2]
    # (index into partial_ax.lines, legend name, plotly line style)
    # NOTE(review): assumes the first three lines of the returned axes follow
    # the order of `values` -- confirm against the installed lifelines version.
    curve_specs = (
        (1, 'Mean', dict(dash='dash', shape='hv')),
        (0, '-2 SD', dict(shape='hv')),
        (2, '2 SD', dict(shape='hv')),
    )
    for model_covariate in covariate_names:
        partial_ax = model.plot_partial_effects_on_outcome(
            covariates=model_covariate, values=values, cmap='coolwarm')

        # Copy the matplotlib curves into the plotly figure
        for line_index, label, line_style in curve_specs:
            curve = partial_ax.lines[line_index]
            fig.add_trace(
                go.Scatter(x=curve.get_xdata(),
                           y=curve.get_ydata(),
                           mode='lines', line=line_style,
                           name=label), row=1, col=2)

    fig.update_layout(template='simple_white',
                      title=f'Effect of {covariate} on survival',
                      legend_title_text=f'{covariate}')

    # Hazard-ratio axis spans the interval rounded outwards to whole numbers
    y_range_hazards = [np.floor(ci_lower), np.ceil(ci_upper)]
    fig.update_yaxes(title_text="Hazard Ratio (95% CI)", range=y_range_hazards, row=1, col=1, dtick=1)
    fig.update_yaxes(title_text="Survivors (proportion)", row=1, col=2, dtick=0.2)
    fig.update_xaxes(title_text=covariate, showticklabels=False, tickvals=[0], row=1, col=1)
    fig.update_xaxes(title_text="Years", row=1, col=2)
    return fig

# Max initial VAF
## LBC21

In [14]:
# LBC21 cohort, covariate: 'Max initial vaf'
cohort = [21]
survival_columns = ['Max initial vaf']

# Fit the model, build the two-panel figure, then display and export it
cph = survival_analysis(keep_columns=survival_columns, cohort=cohort)
fig = plot_hr_analysis(model=cph, covariate=survival_columns[0])
fig.show()
fig.write_image(f'{path}LBC21_init_vaf.svg', width=1000)

0,1
model,lifelines.CoxPHFitter
duration col,'days_from_wave1'
event col,'dead'
baseline estimation,breslow
number of observations,46
number of events observed,39
partial log-likelihood,-120.87
time fit was run,2021-09-03 11:42:49 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
Max initial vaf,0.65,1.92,0.21,0.24,1.07,1.27,2.91,3.07,<0.005,8.86

0,1
Concordance,0.49
Partial AIC,243.74
log-likelihood ratio test,7.12 on 1 df
-log2(p) of ll-ratio test,7.03


## LBC36

In [15]:
# LBC36 cohort, covariate: 'Max initial vaf'
cohort = [36]
survival_columns = ['Max initial vaf']

# Fit the model, build the two-panel figure and display it (no export here)
cph = survival_analysis(keep_columns=survival_columns, cohort=cohort)
fig = plot_hr_analysis(model=cph, covariate=survival_columns[0])
fig.show()
fig.show()

0,1
model,lifelines.CoxPHFitter
duration col,'days_from_wave1'
event col,'dead'
baseline estimation,breslow
number of observations,43
number of events observed,10
partial log-likelihood,-36.11
time fit was run,2021-09-03 11:42:55 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
Max initial vaf,-0.29,0.75,0.36,-1.0,0.42,0.37,1.52,-0.81,0.42,1.25

0,1
Concordance,0.57
Partial AIC,74.23
log-likelihood ratio test,0.73 on 1 df
-log2(p) of ll-ratio test,1.35


# Max gradient
## LBC21

In [16]:
# LBC21 cohort, covariate: 'Gradient'
cohort = [21]
survival_columns = ['Gradient']
# Fit the model, build the two-panel figure and display it (no export here)
cph = survival_analysis(keep_columns=survival_columns, cohort=cohort)
fig = plot_hr_analysis(model=cph, covariate=survival_columns[0])
fig.show()

0,1
model,lifelines.CoxPHFitter
duration col,'days_from_wave1'
event col,'dead'
baseline estimation,breslow
number of observations,42
number of events observed,35
partial log-likelihood,-107.87
time fit was run,2021-09-03 11:42:57 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
Gradient,0.36,1.43,0.17,0.02,0.69,1.02,1.99,2.08,0.04,4.74

0,1
Concordance,0.51
Partial AIC,217.73
log-likelihood ratio test,2.76 on 1 df
-log2(p) of ll-ratio test,3.37


## LBC36

In [17]:
# LBC36 cohort, covariate: 'Gradient'
cohort = [36]
survival_columns = ['Gradient']
# Fit the model, build the two-panel figure and display it (no export here)
cph = survival_analysis(keep_columns=survival_columns, cohort=cohort)
fig = plot_hr_analysis(model=cph, covariate=survival_columns[0])
fig.show()

0,1
model,lifelines.CoxPHFitter
duration col,'days_from_wave1'
event col,'dead'
baseline estimation,breslow
number of observations,41
number of events observed,10
partial log-likelihood,-35.56
time fit was run,2021-09-03 11:42:59 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
Gradient,0.26,1.29,0.28,-0.29,0.81,0.75,2.24,0.92,0.36,1.48

0,1
Concordance,0.50
Partial AIC,73.11
log-likelihood ratio test,0.77 on 1 df
-log2(p) of ll-ratio test,1.40


## Combined cohorts

In [18]:
# Both cohorts pooled, covariate: 'Gradient'
cohort = [21, 36]
survival_columns = ['Gradient']
# Fit the model, build the two-panel figure, then display and export it
cph = survival_analysis(keep_columns=survival_columns, cohort=cohort)
fig = plot_hr_analysis(model=cph, covariate=survival_columns[0])
fig.show()
fig.write_image(f'{path}LBC_gradient.svg', width=1000)

0,1
model,lifelines.CoxPHFitter
duration col,'days_from_wave1'
event col,'dead'
baseline estimation,breslow
number of observations,83
number of events observed,45
partial log-likelihood,-162.37
time fit was run,2021-09-03 11:43:08 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
Gradient,0.28,1.32,0.13,0.03,0.52,1.03,1.69,2.2,0.03,5.18

0,1
Concordance,0.50
Partial AIC,326.74
log-likelihood ratio test,3.05 on 1 df
-log2(p) of ll-ratio test,3.63
