In [1]:
import sys
sys.path.append("../ARCH_package")

import plot, damage, clonal_model

import dill

import plotly.express as px
import numpy as np
import os

In [2]:
# Load the non-synonymous mutations as exported in LiFT.py.
# NOTE(review): dill.load can execute arbitrary code while unpickling;
# only open export files that come from a trusted run.
with open('../Exports/LBC_non-synonymous_LiFT_fitted.dill', 'rb') as export_file:
    lbc = dill.load(export_file)

# Gene plots

In [3]:
# Create the directory that receives the LiFT figures.
# exist_ok=True makes the call a no-op when the directory is already there,
# so no separate (and racy) os.path.exists() check is needed.
path = '../Results/LiFT/'
os.makedirs(path, exist_ok=True)

# NOTE(review): `part` is rebound by the `for part in lbc` loops in later
# cells and this prefix is not read before that happens.
part = path + "LiFT-"

In [4]:
# Gene fitness summary figure: display inline, then export as PNG and SVG.
fig = plot.gene_fitness_plot(lbc)
fig.show()
fig.write_image(f'{path}gene_fitness_summary.png', width=1200, scale=10)
fig.write_image(f'{path}gene_fitness_summary.svg', width=1200)

In [5]:
# plot.gene_statistic returns a figure plus a second object, kept here as
# gene_statistic_df. The figure is displayed and exported as PNG and SVG.
fig, gene_statistic_df = plot.gene_statistic(lbc)
fig.show()

fig.write_image(f'{path}gene_statistic.png', height=350, scale=10)
fig.write_image(f'{path}gene_statistic.svg', height=350)

# Gene categories

In [6]:
# Group the fitness of every trajectory by gene: gene -> list of fitness values.
gene_dict = {}
for part in lbc:
    for traj in part.trajectories:
        gene_dict.setdefault(traj.gene, []).append(traj.fitness)

# Trajectories of all participants whose fitness is above 0.02.
cohort = [
    traj
    for part in lbc
    for traj in part.trajectories
    if traj.fitness > 0.02
]

In [7]:
import numpy as np
import plotly.graph_objects as go
import pandas as pd
from itertools import combinations
from scipy import stats

# Map each functional category to the genes that belong to it.
category_to_gene = {
    'histone_regulation': ['EZH2', 'ASXL1', 'KMT2A', 'KDM6A', 'BCOR', 'BCORL1'],
    'splicing': ['SF3B1', 'U2AF1', 'SRSF2', 'U2AF2', 'ZRSR2', 'LUC7L2', 'DDX41'],
    'dna_damage': ['TP53', 'CDKN2A', 'PPM1D', 'ATRX'],
    'mitogenic': ['KRAS', 'NF1', 'JAK2', 'JAK3', 'SH2B3', 'PTEN', 'PTPN11', 'NRAS'],
    'cohesin': ['RAD21', 'STAG2'],
    'tf_development': ['GATA2', 'RUNX1', 'ETV6', 'CUX1', 'NOTCH1'],
    'dna_methylation': ['TET2', 'DNMT3A'],
}

# Reverse lookup: gene -> category.
gene_to_category = {
    gene: category
    for category, genes in category_to_gene.items()
    for gene in genes
}

# Create fitness distribution by category for the trajectories in `cohort`.
# A gene that is missing from `category_to_gene` raises KeyError here.
category_fitness = {category: [] for category in category_to_gene}
for traj in cohort:
    category_fitness[gene_to_category[traj.gene]].append(traj.fitness)

# Sort categories by decreasing mean fitness. A category without any
# trajectory has no mean: np.mean([]) is NaN (with a RuntimeWarning) and NaN
# sort keys leave the order undefined, so empty categories are ranked last
# explicitly instead.
category_fitness = dict(sorted(
    category_fitness.items(),
    key=lambda item: np.mean(item[1]) if item[1] else -np.inf,
    reverse=True))

# Box plot of the fitness distribution, one box (with all points) per category.
fig = go.Figure()
for category, fitness_values in category_fitness.items():
    box = go.Box(y=fitness_values, name=category,
                 boxpoints='all', showlegend=False)
    fig.add_trace(box)

# Axis styling: tilted category labels and a log-scaled fitness axis with
# fixed tick positions.
fig.update_xaxes(linewidth=2, tickangle=-45)
fig.update_layout(yaxis_title='Fitness', template='simple_white')
fig.update_yaxes(linewidth=2, type='log', tickvals=[0.05, 0.1, 0.2, 0.4])
fig.show()
fig.write_image(f'{path}category_fitness.svg')

# Compute Kruskal Wallis test

def gene_statistic(gene_dict, statistic='kruskal-wallis', filter=True):
    """Test pairwise for significant differences between fitness distributions.

    Every pair of groups in `gene_dict` holding more than one value is
    compared, and the test statistic is stored only where p < 0.05.

    Parameters:
    * gene_dict: dict mapping a group name (e.g. gene or category) to a list
      of fitness values.
    * statistic: 'kruskal-wallis' (alias: 'kruskal') or 'anova'.
    * filter: if True, drop rows and columns without any significant result.

    Returns:
    * heatmap with significant statistical differences.
    * dataframe of the significant test statistics (NaN elsewhere).

    Raises:
    * ValueError if `statistic` is not recognised.
    """
    # The docstring has always advertised 'kruskal'; accept it as an alias.
    if statistic == 'kruskal':
        statistic = 'kruskal-wallis'

    # Fail loudly on an unknown statistic: returning an error string would
    # break callers that unpack the (figure, dataframe) result.
    if statistic not in ('kruskal-wallis', 'anova'):
        raise ValueError(
            f"Statistic not recognised: {statistic!r}. "
            "Use 'kruskal-wallis' or 'anova'.")
    test_function = (stats.kruskal if statistic == 'kruskal-wallis'
                     else stats.f_oneway)

    # Only groups with more than one observation take part in the tests.
    gene_list = [gene for gene, values in gene_dict.items()
                 if len(values) > 1]

    # Create dataframe to store statistics; only cells (gene1, gene2) that
    # follow the order of combinations() are ever filled.
    test_df = pd.DataFrame(index=gene_list, columns=gene_list)
    for gene1, gene2 in combinations(gene_list, 2):
        # compute statistic for each possible combination of genes
        stat, pvalue = test_function(gene_dict[gene1], gene_dict[gene2])
        # if statistic is significant store value in dataframe
        if pvalue < 0.05:
            test_df.loc[gene1, gene2] = stat

    # Clean dataset from nan: remove all-NaN columns, then all-NaN rows.
    if filter is True:
        test_df = test_df.dropna(how='all', axis=1)
        test_df = test_df.dropna(how='all', axis=0)

    # Reverse the row order before plotting.
    test_df = test_df.reindex(index=test_df.index[::-1])
    y = test_df.index
    x = test_df.columns
    fig = go.Figure(data=go.Heatmap(
                    z=np.array(test_df),
                    x=x,
                    y=y,
                    colorscale='Cividis',
                    colorbar=dict(title=f'{statistic} score')))
    fig.update_xaxes(side="top", mirror=True)
    fig.update_yaxes(side='top', mirror=True)
    fig.update_layout(template='simple_white')

    return fig, test_df


# Pairwise comparison of the category fitness distributions using the default
# statistic of gene_statistic; the heatmap is shown and exported as SVG.
category_fig, category_dic = gene_statistic(category_fitness)
category_fig.update_layout(xaxis_tickangle=-45)
category_fig.show()
category_fig.write_image(f'{path}category_fitness_statistics.svg', width=500)

# Gene trajectories

In [8]:
# Create the export directory for the per-participant figures.
# exist_ok=True makes the call a no-op when the directory is already there,
# so no separate (and racy) os.path.exists() check is needed.
path = '../Results/LiFT/fitness posterior/'
os.makedirs(path, exist_ok=True)

print("Exporting posterior fitness")

# For each participant, export the posterior fitness plot and the contour plot
# of its optimal model (SVG and PNG), named after the participant id.
for part in lbc:
    prefix = f"{path}{part.id}"

    fig = part.optimal_model.posterior_fitness_plot
    fig.update_layout(margin=dict(pad=4))
    fig.write_image(f"{prefix}_fitness_distribution.svg", width=500)
    fig.write_image(f"{prefix}_fitness_distribution.png", width=500, scale=5)

    fig = part.optimal_model.contour_plot
    fig.write_image(f"{prefix}_contour_plot.svg")
    fig.write_image(f"{prefix}_contour_plot.png", scale=5)


# Second pass: export each participant's deterministic plot to the same folder.
for part in lbc:
    prefix = f"{path}{part.id}"
    fig = part.deterministic_plot
    fig.write_image(f"{prefix}_deterministic_fit.svg")
    fig.write_image(f"{prefix}_deterministic_fit.png", scale=5)

Exporting posterior fitness


In [9]:
# Keep every trajectory, across all participants, whose fitness is above -1.
cohort = [
    traj
    for part in lbc
    for traj in part.trajectories
    if traj.fitness > -1
]
# Distinct genes present in that cohort.
gene_set = {traj.gene for traj in cohort}

# Create the export directory for the per-gene figures.
# exist_ok=True makes the call a no-op when the directory is already there,
# so no separate (and racy) os.path.exists() check is needed.
path = '../Results/LiFT/Genes/'
os.makedirs(path, exist_ok=True)

# Export deterministic fit for all trajectories: one figure per gene in
# gene_set, written as SVG and PNG.
for gene in gene_set:
    fig = plot.gene_trajectories(cohort, gene)
    fig.write_image(f"{path}{gene}_deterministic_fit.svg")
    fig.write_image(f"{path}{gene}_deterministic_fit.png", scale=5)

# Model evidence

In [10]:

# Point the exports back at the top-level LiFT results directory.
# exist_ok=True makes the call a no-op when the directory is already there,
# so no separate (and racy) os.path.exists() check is needed.
path = '../Results/LiFT/'
os.makedirs(path, exist_ok=True)



model_evidence = 0.5


def _weighted_probability(model):
    """Return model_probability scaled by 1 + model_evidence * (len(clonal_map) - 1)."""
    return model.model_probability*(1 + model_evidence*(len(model.clonal_map)-1))


# For every participant with more than one clonal model, store the ratio of
# the weighted probability of the optimal model to that of the last entry of
# clonal_models.
difference = []
for part in lbc:
    if len(part.clonal_models) <= 1:
        continue
    optimal_weighted = _weighted_probability(part.optimal_model)
    split_weighted = _weighted_probability(part.clonal_models[-1])
    difference.append(optimal_weighted/split_weighted)

# Bin the evidence ratios into [1, 1.001), [1.001, 2), [2, 10) and [10, inf].
# The last bin is open-ended so that very large ratios are counted under
# ">10" instead of being silently dropped by a finite upper edge.
# Ratios below 1 still fall outside every bin and are not counted.
counts, bins = np.histogram(difference, bins=[1, 1.001, 2, 10, np.inf])

# One label per bin, in bin order.
labels = ['mutations on separate <br> clones favoured', "1-2", "2-10", ">10"]

fig = px.bar(x=labels, y=counts, labels={'x':'relative evidence', 'y':'counts'})
fig.show()
fig.write_image(path + 'model_evidence_comparison_split.png', scale=10)
fig.write_image(path + 'model_evidence_comparison_split.svg')
