In [2]:
import pickle
import pandas as pd
import numpy as np
import plotly.express as px

from scipy.stats import f_oneway

from ARCH import basic

In [3]:
# Load the LBC variant table (tab-separated) and derive the two views used
# below: the participant objects built by `basic.load` and the array of
# distinct participant ids present in the table.
full_data = pd.read_csv(
    r'Datasets/LBC_ARCHER.1PCT_VAF.Mar21.non-synonymous.tsv',
    sep='\t',
)
lbc = basic.load(full_data)
part_list = full_data['participant_id'].unique()

In [4]:
# Load the neutral fitted trajectories stored in a pickle file.
# The context manager guarantees the file handle is closed even when
# unpickling raises, which the manual open()/close() pair did not.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted run of this project.
with open('pickles/neutral_filtered_trajectories.pkl', 'rb') as infile:
    model = pickle.load(infile)

In [5]:
# Read the three auxiliary tables in one pass:
#   df21     - extra variables for ids stored in `studyno`
#   df36     - extra variables for ids stored in `lbc36no`
#   extended - metadata table (provides `ID` and `sex` further down)
df21, df36, extended = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Datasets/LBC1921_ExtraVariables.csv',
        'Datasets/LBC1936_ExtraVariables.csv',
        'Datasets/lbc_meta.csv',
    )
)

In [6]:
# Build the survival table: one record per participant with the cohort, a
# death indicator and the age (in days) at death or at last censoring.
#
# Records are collected in a plain list and turned into a DataFrame once.
# Growing a frame with DataFrame.append inside a loop copies the whole frame
# on every iteration, and the method was removed in pandas 2.0.
survival_records = []

for _, row in df21.iterrows():
    # A participant counts as alive only when `dead` is exactly 0; any other
    # value (including a missing one) is treated as dead, as before.
    died = not (row.dead == 0)
    survival_records.append({
        'part_id': row.studyno,
        'cohort': 21,
        'dead': 1 if died else 0,
        'age_days': row.agedays_death if died else row.agedaysApx_LastCensor,
    })

for _, row in df36.iterrows():
    # NOTE(review): here `dead` is compared against the *string* '1', unlike
    # the numeric comparison used for df21 -- presumably this column is read
    # as text; confirm against the CSV. Note the censoring column is also
    # capitalised differently (AgedaysApx_LastCensor).
    died = row.dead == '1'
    survival_records.append({
        'part_id': row.lbc36no,
        'cohort': 36,
        'dead': 1 if died else 0,
        'age_days': row.agedays_death if died else row.AgedaysApx_LastCensor,
    })

survival_data = pd.DataFrame(
    survival_records, columns=['part_id', 'cohort', 'dead', 'age_days'])
# Keep cohort/dead as floats so the table (and the exported CSV) is unchanged
# from what the append-based construction produced.
survival_data = survival_data.astype({'cohort': float, 'dead': float})

# Restrict to participants that appear in the variant data.
survival_data = survival_data[survival_data.part_id.isin(part_list)].copy()

In [7]:
def find_fitness(part_id):
    """Return the largest fitness among a participant's fitted trajectories.

    Iterates over the global `model` and considers every trajectory whose
    `id` equals `part_id`. Returns None when no trajectory matches.
    """
    matching_fitness = (fitted.fitness for fitted in model if fitted.id == part_id)
    return max(matching_fitness, default=None)

def find_gradient(part_id):
    """Return the largest trajectory gradient observed for a participant.

    Looks through every participant in the global `lbc` whose `id` equals
    `part_id` and collects the `gradient` of each of its trajectories.
    Returns None when nothing matches.
    """
    gradients = [
        traj.gradient
        for part in lbc
        if part.id == part_id
        for traj in part.trajectories
    ]
    return max(gradients, default=None)

def find_normalised_gradient(part_id):
    """Return the largest gradient-times-initial-AF value for a participant.

    For each trajectory of every participant in the global `lbc` whose `id`
    equals `part_id`, the trajectory gradient is multiplied by the first AF
    value of its data. Returns None when nothing matches.
    """
    # NOTE(review): "normalised" here means gradient * first AF (a product,
    # not a ratio) -- confirm this is the intended definition.
    weighted = [
        traj.gradient * traj.data.AF.iloc[0]
        for part in lbc
        if part.id == part_id
        for traj in part.trajectories
    ]
    return max(weighted, default=None)

def find_vaf_last(part_id):
    """Return the maximum AF at the participant's last wave.

    Rows of the global `full_data` belonging to `part_id` are grouped by
    `wave`, the per-wave maximum AF is taken, and the final row of that
    table is returned (a one-element Series holding 'AF').
    """
    belongs_to_participant = full_data.participant_id == part_id
    per_wave_max = (
        full_data.loc[belongs_to_participant, ['wave', 'AF']]
        .groupby(by='wave')
        .max()
    )
    return per_wave_max.iloc[-1]

def find_vaf_init(part_id):
    """Return the maximum AF at the participant's first wave.

    Rows of the global `full_data` belonging to `part_id` are grouped by
    `wave`, the per-wave maximum AF is taken, and the first row of that
    table is returned (a one-element Series holding 'AF').
    """
    belongs_to_participant = full_data.participant_id == part_id
    per_wave_max = (
        full_data.loc[belongs_to_participant, ['wave', 'AF']]
        .groupby(by='wave')
        .max()
    )
    return per_wave_max.iloc[0]

# Attach one summary column per lookup function. Each lookup receives the
# participant id of the row and is evaluated row by row, in this order.
feature_lookups = {
    'Fitness': find_fitness,
    'Gradient': find_gradient,
    'norm_gradient': find_normalised_gradient,
    'Max initial vaf': find_vaf_init,
    'Max last vaf': find_vaf_last,
}
for column_name, lookup in feature_lookups.items():
    # Bind the lookup as a default argument so each lambda uses its own function.
    survival_data[column_name] = survival_data.apply(
        lambda row, fn=lookup: fn(row.part_id), axis=1)
survival_data

Unnamed: 0,part_id,cohort,dead,age_days,Fitness,Gradient,norm_gradient,Max initial vaf,Max last vaf
0,LBC0001A,21.0,1.0,33807.00,0.186712,0.034300,0.005009,0.0130,0.2135
28,LBC0031R,21.0,1.0,33778.00,0.196813,0.003919,0.000022,0.0364,0.0168
36,LBC0040V,21.0,1.0,32474.00,,0.329211,0.006518,0.0774,0.8262
41,LBC0046H,21.0,0.0,36526.00,0.244285,0.003593,0.000027,0.0465,0.0196
42,LBC0047K,21.0,1.0,31552.00,,0.033925,0.012244,0.3609,0.4440
...,...,...,...,...,...,...,...,...,...
1518,LBC361133,36.0,0.0,30940.00,,0.004215,0.000043,0.0136,0.0174
1545,LBC361172,36.0,0.0,30977.00,0.056521,0.005348,0.000153,0.0287,0.0418
1578,LBC361214,36.0,0.0,30773.00,0.321540,0.007467,0.000470,0.0629,0.0853
1586,LBC361225,36.0,0.0,30890.00,0.113655,0.025767,0.006140,0.2383,0.3156


In [8]:
def from_wave1(row):
    """Return whole days between the wave-1 baseline age and `row.age_days`.

    The baseline is 79 years for ids containing 'LBC0' and 70 years for all
    other ids, with a year counted as 365.25 days. The difference is
    truncated towards zero by `int`.
    """
    baseline_years = 79 if 'LBC0' in row.part_id else 70
    baseline_days = baseline_years * 365.25
    return int(row.age_days - baseline_days)

def find_sex(row):
    """Return the first distinct `sex` value recorded for `row.part_id`.

    The value is looked up in the global `extended` table by matching its
    `ID` column; indexing fails if the participant has no entry there.
    """
    recorded_sex = extended.loc[extended.ID == row.part_id, 'sex']
    return recorded_sex.unique()[0]

# Make sure ages are numeric before doing arithmetic on them.
survival_data['age_days'] = survival_data['age_days'].astype(float)

# Time since wave 1. Despite the column name, the stored value is in years:
# the whole-day difference is divided by 365.25.
survival_data['days_from_wave1'] = survival_data.apply(from_wave1, axis=1) / 365.25

# Look up each participant's sex and encode it numerically (M -> 0, F -> 1).
survival_data['sex'] = (
    survival_data.apply(find_sex, axis=1).replace(['M', 'F'], [0, 1])
)

In [9]:
# Display the table again, now including the days_from_wave1 and sex columns.
survival_data

Unnamed: 0,part_id,cohort,dead,age_days,Fitness,Gradient,norm_gradient,Max initial vaf,Max last vaf,days_from_wave1,sex
0,LBC0001A,21.0,1.0,33807.0,0.186712,0.034300,0.005009,0.0130,0.2135,13.557837,0
28,LBC0031R,21.0,1.0,33778.0,0.196813,0.003919,0.000022,0.0364,0.0168,13.478439,1
36,LBC0040V,21.0,1.0,32474.0,,0.329211,0.006518,0.0774,0.8262,9.908282,0
41,LBC0046H,21.0,0.0,36526.0,0.244285,0.003593,0.000027,0.0465,0.0196,21.002053,0
42,LBC0047K,21.0,1.0,31552.0,,0.033925,0.012244,0.3609,0.4440,7.383984,1
...,...,...,...,...,...,...,...,...,...,...,...
1518,LBC361133,36.0,0.0,30940.0,,0.004215,0.000043,0.0136,0.0174,14.707734,1
1545,LBC361172,36.0,0.0,30977.0,0.056521,0.005348,0.000153,0.0287,0.0418,14.809035,0
1578,LBC361214,36.0,0.0,30773.0,0.321540,0.007467,0.000470,0.0629,0.0853,14.250513,1
1586,LBC361225,36.0,0.0,30890.0,0.113655,0.025767,0.006140,0.2383,0.3156,14.570842,1


In [10]:
# Persist the assembled survival table; the row index is not written.
survival_csv_path = 'Datasets/survival_data.csv'
survival_data.to_csv(survival_csv_path, index=False)