In [10]:
# Make '../ARCH_package' importable before importing the local modules below.
# The entry is a relative path, so it is resolved against the directory the
# kernel is running in.
# NOTE(review): presumably `plot` and `modelling` live in ARCH_package -- confirm.
import sys
sys.path.append('../ARCH_package')

import plot, modelling

In [11]:
import dill
import os
import json

import plotly.express as px

import numpy as np
from scipy.stats import pearsonr

# Load dataset and create directory paths

In [12]:
# Load the trajectories previously exported to the Exports directory.
# NOTE(security): dill.load can execute arbitrary code while unpickling, so
# only open export files that were produced by this project.
with open('../Exports/threshold_trajectories.dill', 'rb') as infile:
    model = dill.load(infile)

# Group the trajectories by participant id. Dict keys keep their insertion
# order, so participants are listed in first-seen order on every run; going
# through set() instead can yield a different order in each kernel session.
part_dict = {}
for traj in model:
    part_dict.setdefault(traj.id, []).append(traj)

# Unique participant ids, in the same order as the keys of part_dict
part_list = list(part_dict)

In [13]:
# Directory that receives the figures exported by this notebook.
# exist_ok=True keeps the cell safe to re-run and removes the gap between
# checking for the directory and creating it.
path = '../Results/Threshold/'
os.makedirs(path, exist_ok=True)

# From here on `path` is a file-name prefix rather than a directory:
# exports are written as '../Results/Threshold/Threshold-<name>'.
path = path + 'Threshold-'

In [14]:
# Pearson correlation between the observed and the fitted VAF values of every
# trajectory. The result is stored on the trajectory; element [0] is the
# correlation coefficient that the filters below rely on.
# NOTE(review): assumes data_vaf and model_vaf hold their values in the same
# order -- confirm against the fitting code.
for traj in model:
    traj.pearson = pearsonr(list(traj.data_vaf.values()),
                            list(traj.model_vaf.values()))

# Coefficients of all trajectories (for the overview box plot) and the ids of
# participants owning at least one trajectory with a negative coefficient
pearson = [traj.pearson[0] for traj in model]
exclude_pearson = [traj.id for traj in model if traj.pearson[0] < 0]

px.box(y=pearson, points='all').show()

# Show every flagged participant once, in first-seen order. The loop variable
# is deliberately not called `id`, which would shadow the builtin for the rest
# of the notebook session.
for part_id in dict.fromkeys(exclude_pearson):
    plot.participant_model_plot(model, part_id).show()

In [15]:
# Fitting error (chi-square) of every participant, read from the fit attached
# to the participant's first trajectory.
# NOTE(review): presumably all trajectories of a participant share one fit
# object -- confirm in the fitting code.
error = [part[0].fit.chisqr for part in part_dict.values()]

# Flag participants whose error exceeds the standard deviation of all errors.
# The threshold is computed once instead of on every loop iteration.
# NOTE(review): the cut-off is std(error) itself, not mean + std -- confirm
# this is the intended criterion.
error_threshold = np.std(error)
exclude = [part_id for part_id, part in part_dict.items()
           if part[0].fit.chisqr > error_threshold]
px.box(y=error, points='all').show()

# Exclude participants with a clinical record.
# NOTE(review): only the first listed id is added to `exclude`; the other two
# ids stay in the analysis -- confirm this is intentional.
clinical_records = ['LBC360021','LBC360725', 'LBC360914']
exclude.append(clinical_records[0])

# Show excluded participants, each one once even if it was flagged twice.
# The loop variable is not called `id` to avoid shadowing the builtin.
for part_id in dict.fromkeys(exclude):
    plot.participant_model_plot(model, part_id).show()

# Keep the trajectories of non-excluded participants whose Pearson coefficient
# is strictly positive (coefficients equal to zero or NaN are dropped as well)
excluded_ids = set(exclude)
model_filtered = [traj for traj in model
                  if traj.id not in excluded_ids and traj.pearson[0] > 0]

# Cohort overview

In [16]:
# Box plot built from the filtered trajectories: export it as SVG, then
# display it. gene_dict is returned alongside the figure and reused below.
box, gene_dict = plot.gene_box(model_filtered)
gene_box_file = path + 'gene_box.svg'
box.write_image(gene_box_file, width=1000)
box.show()

# Heatmap for the chosen statistic ('kruskal-wallis' or 'anova'):
# display it first, then export it as SVG.
heatmap, statistic_df = plot.gene_statistic(
    gene_dict,
    statistic='kruskal-wallis',
    filter=True,
)
heatmap.show()
heatmap_file = path + 'gene_specific_differences.svg'
heatmap.write_image(heatmap_file, height=300)

# Trajectories

## by participant

In [17]:
# Participants singled out for a closer look at their fitted trajectories
interesting_part = ['LBC0251K', 'LBC360636']
for part in interesting_part:
    participant_fig = plot.participant_model_plot(model_filtered, part)
    participant_fig.show()

## by gene

In [18]:
# Sub-directory for the per-gene trajectory figures. `path` already ends with
# the 'Threshold-' file prefix, so the directory created here is
# '../Results/Threshold/Threshold-gene trajectories/'.
# exist_ok=True keeps the cell safe to re-run.
path_2 = path + 'gene trajectories/'
os.makedirs(path_2, exist_ok=True)

# File-name prefix for every exported gene figure.
# NOTE(review): 'NGF-' looks like a leftover from another notebook; the
# prefix is kept unchanged so existing output file names do not move.
path_2 = path_2 + 'NGF-'

# Genes whose figure is displayed inline in addition to being exported
genes_to_show = {'DNMT3A', 'JAK2', 'ASXL1', 'TET2'}

# Export one SVG per gene found in gene_dict
for gene in gene_dict:
    fig = modelling.gene_trajectories(model_filtered, gene)
    fig.write_image(path_2 + f'{gene}.svg', width=1000)
    if gene in genes_to_show:
        fig.show()

# Protein damage prediction

In [19]:
# Figure and accompanying result returned by modelling.damage_class
fig, df = modelling.damage_class(model_filtered)

# Y axis of the subplot at row 2, col 1: titled 'fitness', log-scaled,
# with fixed tick values
fitness_axis_style = dict(
    title='fitness',
    linewidth=2,
    type='log',
    tickvals=[0.05, 0.1, 0.2, 0.4],
)
fig.update_yaxes(row=2, col=1, **fitness_axis_style)

# Same axis line width for the remaining styled axes
fig.update_yaxes(row=1, col=1, linewidth=2)
fig.update_xaxes(row=2, col=1, linewidth=2)

# Display the figure, then export it as SVG
fig.show()
damage_box_file = path + 'damage_box.svg'
fig.write_image(damage_box_file)

# Export filtered trajectories

In [20]:
# Serialise the filtered trajectories with dill so that other notebooks can
# reload them
export_file = '../Exports/threshold_filtered_trajectories.dill'
with open(export_file, 'wb') as outfile:
    dill.dump(model_filtered, outfile)