In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append("../analysis/")

# 3A

In [None]:
sys.path.append("../signatures/")
from sig_ml import doMLsignatures
from clonality import return_clon_dict, load_clonality_treatment, boxplot_clonality
import pickle
import gzip

sys.path.append("../config/")
from config import ALL_MUTS, SIG_ALL_AML_EXPOSURES, SIG_ALL_AML_PROCESSES, ML_SIGNATURES

In [None]:
# Example of how to generate the ML signatures. The same call has to be repeated
# for each of the three datasets; this cell uses the AML inputs.
# For background, see "The mutational footprints of cancer therapies",
# Nature Genetics, 2019.

# Mutation type handed to doMLsignatures.
type_mut = 'SNV'

doMLsignatures(ALL_MUTS, SIG_ALL_AML_PROCESSES, SIG_ALL_AML_EXPOSURES, type_mut, ML_SIGNATURES, annotate=True)

In [None]:
import pandas as pd
import json
from config import AML_ALL_HEMATO_ML, TREATMENT_DIC

# tAML platin
# Table holding the ML signature labels (column 'ML') for the AML cohort.
df_ml = pd.read_csv(AML_ALL_HEMATO_ML, sep="\t")

sig_cols = df_ml['ML'].unique().tolist()
signature_treatment = '1_0.97_SBS31-0.95'

# Get treated samples
# The context manager closes the file handle, which the bare
# json.load(open(...)) form left open until garbage collection.
with open(TREATMENT_DIC) as treatment_handle:
    dic_treatment = json.load(treatment_handle)

# Keep every sample whose treatment entry mentions a platinum-based drug.
platinum_based = [
    str(sample_id)
    for sample_id, treatments in dic_treatment.items()
    if 'Platinum-based Drug' in treatments
]

# NOTE(review): sample 864484 is added by hand on top of what TREATMENT_DIC
# reports; the reason is not recorded in this notebook - confirm and document.
platinum_based.append(str(864484))

dic_clon = return_clon_dict(df_ml, sig_cols, platinum_based, signature_treatment)
# Entry of the treatment signature; first cohort passed to boxplot_clonality below.
healthy = dic_clon[signature_treatment]

In [None]:
from config import ML_OVARY, ML_UT, SIG_TREATMENT_DRUG_LVL, FIG_3

# Load HMF treatment data
# NOTE(security): pickle can execute arbitrary code while loading; only point
# SIG_TREATMENT_DRUG_LVL at a file you trust.
# The context manager closes the gzip handle, which the bare
# pickle.load(gzip.open(...)) form left open.
with gzip.open(SIG_TREATMENT_DRUG_LVL) as treatment_handle:
    hartwig_treated = pickle.load(treatment_handle)

# Ovary
sig_treated = '6_1.0_SBS31-0.92'
# Samples treated with either platinum drug (assumes the values are sets, so
# that `|` is a union - confirm against the pickled structure).
all_treat = hartwig_treated['Ovary']['Carboplatin'] | hartwig_treated['Ovary']['Cisplatin']
ovary_t = load_clonality_treatment(ML_OVARY, sig_treated, all_treat)

# Urinary Tract
sig_treated = '7_1.0_SBS31-0.97'
all_treat = hartwig_treated['Urinary-tract']['Carboplatin'] | hartwig_treated['Urinary-tract']['Cisplatin']
urinary_t = load_clonality_treatment(ML_UT, sig_treated, all_treat)

In [None]:
# Draw the clonality boxplot from the three cohorts computed in the cells above
# (tAML, ovary, urinary tract). FIG_3 is passed as the last argument -
# presumably the output location; confirm in clonality.py.
boxplot_clonality(healthy, ovary_t, urinary_t, FIG_3 )

# 3B


In [None]:
sys.path.append("../signatures/")

from profiles import plot_snvs
from config import BOXTEL

In [None]:
# Columns of the BOXTEL table that are averaged into a single profile.
hcss = [
    'AC41HSC3',
    'AC63HSCc2',
    'ACC55HSC15',
    'ACHSC19',
    'ACHSC23',
    'ACHSC7',
    'ACHSCc6',
    'ACHSCc8',
    'BCHHSC4',
]

df_boxtel = pd.read_csv(BOXTEL, sep='\t')

# Row-wise mean over the selected columns, rescaled so the profile sums to 1.
mean_counts = df_boxtel.loc[:, hcss].mean(axis=1)
normalized = mean_counts.div(mean_counts.sum())

plot_snvs(normalized, "HSC", FIG_3)

# 3C



In [None]:
import sys
import pandas as pd 
import json

sys.path.append("../analysis/")

from age_corr import do_plot
from config import OSORIO_METADATA, LIST_TTYPE, SAMPLE_2_AGE,  FIG_3

In [None]:
# Exposures table, transposed after reading so that signature names become columns.
df_exp = pd.read_csv(SIG_ALL_AML_EXPOSURES, sep='\t', index_col=0).T
# Keep only the rows with a non-zero exposure to signature '2_0.97_NA'.
exposures = df_exp[df_exp['2_0.97_NA'] > 0]

# Sample list handed to do_plot in the next cell.
high_mut_platinum = [
    'AP0390_AP0395',
    'AP0388_AP0389',
    'SRR1802228_SRR1800776',
    'SRR388810_SRR388850',
    'SRR1802754_SRR1802621',
    'PATIENT_WTS_1_TUM1_PATIENT_WTS_1_BUC',
    'PATIENT_WTS_3_TUM1_PATIENT_WTS_3_BUC',
    'SRR1802711_SRR1802494',
]

# Context managers close the JSON file handles, which the bare
# json.load(open(...)) form left open.
with open(LIST_TTYPE) as ttype_handle:
    dic_samples_ttypes = json.load(ttype_handle)
with open(SAMPLE_2_AGE) as age_handle:
    dic_aging_sample_level = json.load(age_handle)

# Identifier -> age in years, taken from the OSORIO metadata table.
meta_hsc = pd.read_csv(OSORIO_METADATA, sep='\t')
dic_HSC = dict(zip(meta_hsc['Identifier'], meta_hsc['Age (years)']))

In [None]:
# Pass do_plot a one-row frame (signature '2_0.97_NA' across the retained
# samples) together with the sample list and the lookup dictionaries built in
# the previous cell.
do_plot(exposures.T.loc[['2_0.97_NA']], 'SigProfiler', high_mut_platinum, FIG_3, 
       dic_samples_ttypes, dic_aging_sample_level, dic_HSC)

# 3D

In [None]:
from config import OHSU_DATA_CLINICAL, OHSU_DATA_MUTATIONS, AML_INTOGEN

from beatAML import do_barplot

In [None]:
# The data used here comes from cBioPortal. The IntOGen run itself was done on
# the raw data, because cBioPortal does not have synonymous mutations.


do_barplot(OHSU_DATA_MUTATIONS, OHSU_DATA_CLINICAL, AML_INTOGEN, FIG_3)

# 3F

In [None]:
from beatAML import odds_ratio_tAMLs

In [None]:
# The data used here comes from cBioPortal. The IntOGen run itself was done on
# the raw data, because cBioPortal does not have synonymous mutations.

odds_ratio_tAMLs(OHSU_DATA_MUTATIONS, OHSU_DATA_CLINICAL, AML_INTOGEN, FIG_3)

# Supplementary Figures

## 4A

In [None]:
from collections import defaultdict

# Import modules
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
import itertools
import seaborn as sns
import numpy as np
import random

sys.path.append("../config/")
from config_plot import config_plot_params

In [None]:
# generate order of SBS
def snvs_order():
    """Return the 96 SBS channel labels in a fixed order.

    Each label is formatted as ``X[P>M]Y`` where ``P`` is ``C`` or ``T``,
    ``M`` is any base other than ``P``, and ``X`` / ``Y`` each run over
    ``A, C, G, T``. Labels are grouped by ``P`` first, then ``M``, then ``X``,
    with ``Y`` varying fastest.

    Returns:
        list[str]: the 96 labels, from ``'A[C>A]A'`` to ``'T[T>G]T'``.
    """
    bases = ['A', 'C', 'G', 'T']
    return [
        '{}[{}>{}]{}'.format(left, ref, alt, right)
        for ref in ('C', 'T')
        for alt in bases
        if alt != ref
        for left in bases
        for right in bases
    ]

# Mutation table restricted to the rows labelled "clonal" in the Clonal1 column.
df_m = pd.read_csv(ALL_MUTS, sep="\t")
clonal = df_m.loc[df_m["Clonal1"] == "clonal"]

order = snvs_order()

# One inner dict per sample: channel label -> number of clonal mutations,
# with channels that do not occur in the sample filled in as 0.
samples_dict = defaultdict(dict)
for sample_id, sample_muts in clonal.groupby(by='SAMPLE'):
    observed = sample_muts['VARIANT_CLASS'].value_counts().to_dict()
    samples_dict[sample_id] = {channel: observed.get(channel, 0) for channel in order}

# Columns are samples; rows follow the snvs_order() ordering.
matrix = pd.DataFrame.from_dict(samples_dict).loc[order]

In [None]:
# Samples plotted as the 'platinum-treated tAML' group in the boxplot below.
plat_treated = [
    'SRR388810_SRR388850',
    'SRR1802228_SRR1800776',
    'SRR1802754_SRR1802621',
    'AP0388_AP0389',
    'PATIENT_WTS_1_TUM1_PATIENT_WTS_1_BUC',
    'AP0390_AP0395',
    'PATIENT_WTS_3_TUM1_PATIENT_WTS_3_BUC',
]

In [None]:
# Path with output
# NOTE(review): absolute, machine-specific path; move it into config next to
# the other inputs so the notebook can run outside the original workspace.
exp = "/workspace/projects/reverse_calling/rebuttle/extractions/SigProfiler/results/exposures/input_snvs_clonals/input_snvs_clonals.exposures.tsv"

# Transposed after reading, so rows are indexed by sample and columns by signature.
dfexp = pd.read_csv(exp, sep="\t", index_col=0).T
sample_suspicious = "SRR1802711_SRR1802494"

# Drop the suspicious sample before comparing the two groups.
new_index = [s for s in dfexp.index.tolist() if s != sample_suspicious]
dfexp = dfexp.loc[new_index]

dic_exposures = dfexp.to_dict()

# Exposure of every remaining sample to the SBS31-like signature, split into
# platinum-treated samples and all the others.
signature_exposure = dic_exposures['6_0.97_SBS31-0.94']
plat_dec = [signature_exposure[s] for s in plat_treated]
noplat_dec = [value for s, value in signature_exposure.items() if s not in plat_treated]

# config_plot_params is the name imported from config_plot; the previous call
# to config_params referenced a name that is never imported or defined and
# raised NameError on a fresh kernel.
# NOTE(review): assumes config_plot_params accepts the same single argument.
config_plot_params(5)
fig, ax = plt.subplots(1, 1, figsize=(1, 1.3))

v = [noplat_dec, plat_dec]

print(mannwhitneyu(noplat_dec, plat_dec, alternative='two-sided'))

# One colour per point, in the same order as the flattened x / y values.
cols = ['#0088aaff'] * len(noplat_dec) + ['brown'] * len(plat_dec)

# Group index (0 = other samples, 1 = platinum-treated) for every point, plus a
# small horizontal jitter so points do not overlap. The array is built
# explicitly instead of relying on `list += ndarray` coercion, and a local
# seeded generator makes the figure identical on every run.
x = np.array([group for group, values in enumerate(v) for _ in values], dtype=float)
jitter_rng = np.random.RandomState(0)
x = x + jitter_rng.normal(0, 0.05, size=len(x))
y = list(itertools.chain.from_iterable(v))

ax = sns.boxplot(data=v, showfliers=False, color='white',
                 medianprops={'color': 'black'}, linewidth=0.4, ax=ax)
ax.scatter(x, y, s=1, alpha=1, color=cols)

ax.set_xticks([0, 1])
ax.set_xticklabels(['other samples',
                    'platinum-treated tAML'], rotation=90)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.set_ylabel('Clonal cisplatin-related\nmutations\n')
plt.show()

## 4C

To reproduce this panel, remove the samples with high exposure to artefactual signatures, run a de novo extraction on the remaining samples, and rerun the main Figure 3C panel on that extraction. The cell below shows how to filter the samples.

In [None]:
# Get the extracted signatures too
from config import SIG_ALL_AML_EXPOSURES

# Exposures, transposed after reading so rows are samples and columns are signatures.
ext = pd.read_csv(SIG_ALL_AML_EXPOSURES, sep="\t", index_col=0).T

# Remove this sample before filtering.
ext = ext.drop("SRR1802711_SRR1802494", axis=0)

# Keep the samples whose exposure to each of these three signatures is below 300.
artefact_signatures = ['5_1.0_SBS43-0.91', "6_1.0_SBS58-0.87", "4_0.81_NA"]
below_threshold = (ext[artefact_signatures] < 300).all(axis=1)
ext_no_art = ext[below_threshold].index.tolist()

# NOTE(review): df_in is not defined anywhere in the visible notebook. It has
# to be a table with one column per sample id before this cell can run -
# confirm where it is meant to be loaded from.
new_df = df_in[ext_no_art]

# Change it to your path
new_df.to_csv('extractions/input_snvs_noartifacts.dlm',
              sep='\t', index=False, header=True)

## 4D

In [None]:
import sys
import pandas as pd 
import json

from beatAML import do_heatmap

In [None]:
# The data used here comes from cBioPortal. The IntOGen run itself was done on
# the raw data, because cBioPortal does not have synonymous mutations.

do_heatmap(OHSU_DATA_MUTATIONS, OHSU_DATA_CLINICAL, AML_INTOGEN, FIG_3)