In [1]:
from ARCH import basic, modelling, plot

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
colors = px.colors.qualitative.Plotly
pio.templates.default = "simple_white"

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)

import os
import pickle

# Load fitted trajectories and create directory path

In [2]:
# Load the fitted trajectories of each cohort from their pickle files.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by this project.
# The `with` blocks close each file even if unpickling raises.
with open('pickles/neutral_fit_bulk.pkl', 'rb') as infile:
    model_neutral = pickle.load(infile)

with open('pickles/threshold_fit_bulk.pkl', 'rb') as infile:
    model_threshold = pickle.load(infile)

In [3]:
# Output directory for the figures written by this notebook.
# The trailing slash matters: file names are appended with plain `+`.
path = 'Results/Filter comparison/'
# exist_ok=True creates the directory if needed without a separate
# exists() check (which could race with another process).
os.makedirs(path, exist_ok=True)

 # Filtering comparisons
 
 ## Fitness

In [4]:
# Identify the variants found in only one of the two cohorts.
# A variant is keyed by its (id, mutation) pair.
list1 = [(fitted.id, fitted.mutation) for fitted in model_neutral]
list2 = [(fitted.id, fitted.mutation) for fitted in model_threshold]

neutral_keys = set(list1)
threshold_keys = set(list2)

# Present in the neutral cohort but not in the threshold cohort, and vice versa
ngf_unique = neutral_keys.difference(threshold_keys)
chip_unique = threshold_keys.difference(neutral_keys)

In [12]:
# Fitness of every fitted trajectory in each cohort
ngf_fitness = [fitted.fitness for fitted in model_neutral]
chip_fitness = [fitted.fitness for fitted in model_threshold]

# Fitness restricted to the (id, mutation) pairs found in only one cohort
ngf_unique_fitness = [fitted.fitness for fitted in model_neutral
                      if (fitted.id, fitted.mutation) in ngf_unique]
chip_unique_fitness = [fitted.fitness for fitted in model_threshold
                       if (fitted.id, fitted.mutation) in chip_unique]

_grey = 'rgb(102,102,102)'
_transparent = 'rgba(0,0,0,0)'


def _add_box_and_points(figure, values, label):
    """Add two traces sharing `label`: a box and an invisible box with points.

    The first trace draws the box from the strictly positive values only and
    shows no points. The second trace uses every value, shows all points, and
    has a transparent outline and fill so only its points are visible.
    """
    figure.add_trace(go.Box(y=[value for value in values if value > 0],
                            boxpoints=False,
                            name=label,
                            marker_color=_grey,
                            showlegend=False))
    figure.add_trace(go.Box(y=values,
                            boxpoints='all',
                            name=label,
                            marker_color=_grey,
                            line=dict(color=_transparent),
                            fillcolor=_transparent,
                            showlegend=False))


fig = go.Figure()
_add_box_and_points(fig, ngf_fitness, 'ngf growth filter')
_add_box_and_points(fig, chip_fitness, 'CHIP filter')

# NOTE(review): unlike the other groups, this one is a single trace built from
# all values (no positive-only box, no transparent overlay) - confirm intended.
fig.add_trace(go.Box(y=ngf_unique_fitness,
                     boxpoints='all',
                     name='Neutral growth unique',
                     marker_color=_grey,
                     showlegend=False))

_add_box_and_points(fig, chip_unique_fitness, 'CHIP unique')

# Logarithmic fitness axis
fig.update_layout(template='simple_white', yaxis_title='Fitness')
fig.update_yaxes(type='log', dtick=1)
fig.write_image(path + 'Box plot all.svg')
fig

In [6]:
def gene_dict(model_list):
    """Count how many trajectories in `model_list` belong to each gene.

    Parameters:
    * model_list: iterable of trajectory objects exposing a `gene` attribute.

    Returns:
    * dict mapping gene -> number of trajectories, ordered by descending
      count. Genes with equal counts keep the order in which they first
      appear in `model_list`. An empty `model_list` gives an empty dict.
    """
    # Count on the same attribute that defines the keys. Incrementing through
    # the first token of `traj.mutation` instead would raise KeyError (or
    # miscount) whenever that token differs from `traj.gene`.
    counts = {}
    for traj in model_list:
        counts[traj.gene] = counts.get(traj.gene, 0) + 1

    # sorted() is stable, so ties keep their first-seen order
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked)

In [7]:
# Per-gene trajectory counts for the threshold and neutral cohorts
threshold_gene = gene_dict(model_threshold)
syn_gene = gene_dict(model_neutral)

In [8]:
# Bar chart of per-gene trajectory counts, one bar series per filter
gene_count_series = (
    ('0.02 VAF filter', threshold_gene),
    ('Neutral gradient filter', syn_gene),
)

fig = go.Figure()
for series_name, counts in gene_count_series:
    fig.add_trace(go.Bar(x=list(counts),
                         y=list(counts.values()),
                         orientation='v',
                         name=series_name))

fig.update_layout(title='Comparison of filtered mutations',
                  template="simple_white",
                  xaxis_tickangle=-45)
fig.write_image(path + 'Gene counts.svg', width=1200, scale=10)
fig

## Fitness by predicted damage class

In [9]:
# Tag every trajectory with the label of the cohort it was loaded from.
# The threshold cohort is labelled first, then the neutral cohort.
for cohort_label, cohort_models in (('CHIP', model_threshold),
                                    ('NGF', model_neutral)):
    for fitted in cohort_models:
        fitted.cohort = cohort_label

In [10]:
def damage_class(model, fs_ter='fs - ter', color_list=None,
                 dataset='Datasets/new_kristina_variants.xlsx'):
    """ Histogram and box plot of predicted protein damage class ~ Fitness.

    Parameters:
    * model: iterable of trajectories exposing `p_key`, `fitness` and
      `cohort` attributes. Trajectories whose `p_key` is not a string are
      ignored.
    * fs_ter: label given to trajectories that are absent from the dataset
      but whose `p_key` contains 'fs' or '*'. Also used as the last plot
      category.
    * color_list: optional sequence of at least two colors, used for the
      'CHIP' and 'NGF' cohorts in that order. Defaults to
      ['Lightseagreen', 'Orange'].
    * dataset: path of the Excel workbook with damage predictions. Every
      sheet is read; keys are built as '<sheet name> p.<first column>' and
      the label comes from the 'Likely damaging' column.

    Returns:
    * Two-row plotly figure: trajectory counts per damage class (row 1) and
      fitness per damage class on a log axis (row 2).

    Side effects:
    * Sets `damage_class` on every trajectory with a string `p_key`
      (None when no class could be assigned).

    Raises:
    * ValueError if `color_list` holds fewer than two colors.
    """
    damaging_dict = {1: 'likely damaging',
                     0: 'possibly damaging',
                     -1: 'likely benign'}

    # Build a p_key -> damage label lookup once, instead of scanning a
    # DataFrame for every trajectory. setdefault keeps the first label
    # seen for a duplicated key.
    damage_lookup = {}
    with pd.ExcelFile(dataset) as xls:
        for name in xls.sheet_names:
            df_temp = pd.read_excel(xls, name)
            # replace integer codes with their damage class label
            labels = df_temp['Likely damaging'].replace(damaging_dict)
            p_keys = name + ' p.' + df_temp.iloc[:, 0]
            for p_key, label in zip(p_keys, labels):
                damage_lookup.setdefault(p_key, label)

    # Exclude trajectories in model without a p_key
    damage_model = [traj for traj in model if isinstance(traj.p_key, str)]

    # Assign damage to trajectories
    for traj in damage_model:
        if traj.p_key in damage_lookup:
            traj.damage_class = damage_lookup[traj.p_key]
        elif 'fs' in traj.p_key or '*' in traj.p_key:
            traj.damage_class = fs_ter
        else:
            traj.damage_class = None

    damage_model = [traj for traj in damage_model
                    if traj.damage_class is not None]

    # Collect all rows first and build the frame once. DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called in a loop.
    records = [{'damage_class': traj.damage_class,
                'fitness': traj.fitness * 1,
                'cohort': traj.cohort}
               for traj in damage_model]
    df = pd.DataFrame(records, columns=['damage_class', 'fitness', 'cohort'])

    # Use the fs_ter argument as the last category so a custom label is not
    # turned into NaN. dict.fromkeys drops a duplicate if fs_ter equals one
    # of the fixed labels.
    categories = list(dict.fromkeys(
        ['likely benign', 'possibly damaging', 'likely damaging', fs_ter]))
    df['damage_class'] = pd.Categorical(df['damage_class'],
                                        categories=categories,
                                        ordered=True)
    df = df.sort_values('damage_class')

    plot_color = (['Lightseagreen', 'Orange'] if color_list is None
                  else list(color_list))
    if len(plot_color) < 2:
        raise ValueError('color_list must contain at least two colors '
                         '(CHIP, NGF)')

    # Start subplot figure with shared xaxis
    fig = make_subplots(rows=2, cols=1,
                        row_heights=[0.5, 0.5],
                        shared_xaxes=True,
                        vertical_spacing=0.05)

    # Name traces from the cohort label itself, so an empty cohort does not
    # raise IndexError.
    for i, cohort in enumerate(['CHIP', 'NGF']):
        data = df[df['cohort'] == cohort]

        fig.add_trace(
            go.Histogram(x=data['damage_class'],
                         name=cohort,
                         marker_color=plot_color[i]),
            row=1, col=1)

        fig.add_trace(
            go.Box(x=data['damage_class'],
                   y=data['fitness'],
                   boxpoints='all',
                   name=cohort,
                   showlegend=False,
                   marker_color=plot_color[i]),
            row=2, col=1)

    fig.update_layout(
        boxmode='group')
    fig.update_yaxes(title='Trajectory counts', row=1, col=1)
    fig.update_yaxes(title='Fitness',
                     type='log', dtick=1, row=2, col=1)

    return fig

In [11]:
# Build the damage-class figure from both cohorts together, then save and show it
combined_models = model_threshold + model_neutral
fig = damage_class(combined_models)
fig.write_image(path + 'Damage_class.svg')
fig.show()