# Parse patient A7 data

## First use the paper's reported clusters (from SciClone) to generate input files 
Adapted from MACHNA (El-Kebir et. al.)

In [1]:
confidence=0.95

In [2]:
def get_id(line):
    return "_".join(map(str, [line['chromosome_name'], line['start'], line['stop']]))

In [3]:
import pandas as pd
import os

REPO_DIR = os.path.join(os.getcwd(), "../") 
HOADLEY_DATA_DIR = os.path.join(REPO_DIR, 'data', 'hoadley_breast_cancer_2016/')
table = pd.read_table(os.path.join(HOADLEY_DATA_DIR,"A7/A7_raw.tsv"))
table['id']=table.apply(get_id, axis=1)
table = table.set_index('id')
table.columns

Index(['chromosome_name', 'start', 'stop', 'reference', 'variant', 'type',
       'gene_name', 'transcript_name', 'transcript_species',
       'transcript_source', 'transcript_version', 'strand',
       'transcript_status', 'trv_type', 'c_position', 'amino_acid_change',
       'ucsc_cons', 'domain', 'all_domains', 'deletion_substructures',
       'transcript_error', 'brain.rcnt.llr3_ref', 'brain.rcnt.llr3_var',
       'brain.rcnt.llr3_VAF', 'kidney.rcnt.llr3_ref', 'kidney.rcnt.llr3_var',
       'kidney.rcnt.llr3_VAF', 'liver.rcnt.llr3_ref', 'liver.rcnt.llr3_var',
       'liver.rcnt.llr3_VAF', 'lung.rcnt.llr3_ref', 'lung.rcnt.llr3_var',
       'lung.rcnt.llr3_VAF', 'rib.rcnt.llr3_ref', 'rib.rcnt.llr3_var',
       'rib.rcnt.llr3_VAF', 'tumor.rcnt.llr3_ref', 'tumor.rcnt.llr3_var',
       'tumor.rcnt.llr3_VAF', 'cluster'],
      dtype='object')

### Reassign clusters of mutations 7_12163423_12163423 and 7_57562948_57562948

#### details for why in MACHINA paper

In [4]:
table.loc['7_12163423_12163423', 'cluster']=2
table.loc['7_57562948_57562948', 'cluster']=2

In [5]:
def label_snv(row):
    label = []
    if row['gene_name'] != "-":
        label.append(row['gene_name'])
    label += [str(row['chromosome_name']), str(row['start']), str(row['stop'])]
    return (":").join(label)
table['character_label'] = table.apply(lambda row: label_snv(row), axis=1)

In [6]:
raw_table = table.copy()
raw_table.head()

Unnamed: 0_level_0,chromosome_name,start,stop,reference,variant,type,gene_name,transcript_name,transcript_species,transcript_source,...,lung.rcnt.llr3_var,lung.rcnt.llr3_VAF,rib.rcnt.llr3_ref,rib.rcnt.llr3_var,rib.rcnt.llr3_VAF,tumor.rcnt.llr3_ref,tumor.rcnt.llr3_var,tumor.rcnt.llr3_VAF,cluster,character_label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1_2866642_2866642,1,2866642,2866642,G,A,SNP,ENSG00000177133,ENST00000321399,human,ensembl,...,57,33.33,102,85,45.45,125,42,25.15,1,ENSG00000177133:1:2866642:2866642
1_2984058_2984058,1,2984058,2984058,G,A,SNP,PRDM16,NM_022114.2,human,genbank,...,1,0.56,212,0,0.0,205,0,0.0,3,PRDM16:1:2984058:2984058
1_3814510_3814510,1,3814510,3814510,A,C,SNP,C1orf174,NM_207356.2,human,genbank,...,3,1.96,82,58,41.43,170,0,0.0,6,C1orf174:1:3814510:3814510
1_4293949_4293949,1,4293949,4293949,G,T,SNP,-,-,-,-,...,27,28.12,51,45,46.88,100,45,31.03,1,1:4293949:4293949
1_4367117_4367117,1,4367117,4367117,A,C,SNP,-,-,-,-,...,0,0.0,179,0,0.0,187,0,0.0,9,1:4367117:4367117


## Generate migration history input file

In [7]:
ref_cols = ['tumor.rcnt.llr3_ref','brain.rcnt.llr3_ref', 'kidney.rcnt.llr3_ref', 'liver.rcnt.llr3_ref', 'lung.rcnt.llr3_ref', 'rib.rcnt.llr3_ref']
var_cols = ['tumor.rcnt.llr3_var','brain.rcnt.llr3_var', 'kidney.rcnt.llr3_var', 'liver.rcnt.llr3_var', 'lung.rcnt.llr3_var', 'rib.rcnt.llr3_var']

cols = ['breast', 'brain', 'kidney', 'liver', 'lung', 'rib']
table = table[['cluster']+ref_cols+var_cols]
table.columns = ['cluster']+['ref-'+c for c in cols] + ['var-'+c for c in cols]

In [8]:
ctable = table.groupby('cluster').sum()
ctable.head()

global corrected_confidence
nsamples = len([c for c in ctable.columns if c.startswith('ref')])
nclusters = len(ctable)
corrected_confidence = 1-((1.-confidence)/(nsamples*nclusters))
print(corrected_confidence)

assert(corrected_confidence > confidence)
assert(corrected_confidence < 1.0)


0.9991666666666666


## Get intervals

In [9]:
import numpy
from scipy.stats import beta
from scipy.stats import norm

def binomial_hpdr(n, N, pct, a=1, b=1, n_pbins=1e3):
    """
    Function computes the posterior mode along with the upper and lower bounds of the
    **Highest Posterior Density Region**.

    Parameters
    ----------
    n: number of successes 
    N: sample size 
    pct: the size of the confidence interval (between 0 and 1)
    a: the alpha hyper-parameter for the Beta distribution used as a prior (Default=1)
    b: the beta hyper-parameter for the Beta distribution used as a prior (Default=1)
    n_pbins: the number of bins to segment the p_range into (Default=1e3)

    Returns
    -------
    A tuple that contains the mode as well as the lower and upper bounds of the interval
    (mode, lower, upper)

    """
    # fixed random variable object for posterior Beta distribution
    rv = beta(n+a, N-n+b)
    # determine the mode and standard deviation of the posterior
    stdev = rv.stats('v')**0.5
    mode = (n+a-1.)/(N+a+b-2.)
    # compute the number of sigma that corresponds to this confidence
    # this is used to set the rough range of possible success probabilities
    n_sigma = numpy.ceil(norm.ppf( (1+pct)/2. ))+1
    # set the min and max values for success probability 
    max_p = mode + n_sigma * stdev
    if max_p > 1:
        max_p = 1.
    min_p = mode - n_sigma * stdev
    if min_p > 1:
        min_p = 1.
    # make the range of success probabilities
    p_range = numpy.linspace(min_p, max_p, int(n_pbins+1))
    # construct the probability mass function over the given range
    if mode > 0.5:
        sf = rv.sf(p_range)
        pmf = sf[:-1] - sf[1:]
    else:
        cdf = rv.cdf(p_range)
        pmf = cdf[1:] - cdf[:-1]
    # find the upper and lower bounds of the interval 
    sorted_idxs = numpy.argsort( pmf )[::-1]
    cumsum = numpy.cumsum( numpy.sort(pmf)[::-1] )
    j = numpy.argmin( numpy.abs(cumsum - pct) )
    upper = p_range[ (sorted_idxs[:j+1]).max()+1 ]
    lower = p_range[ (sorted_idxs[:j+1]).min() ]    

    return (mode, lower, upper)

In [10]:
#### 
def get_ub(row, sam):
    v=binomial_hpdr(row['var-'+sam], row['var-'+sam]+row['ref-'+sam], corrected_confidence)
    return v[2]
    

def get_lb(row, sam):
    v=binomial_hpdr(row['var-'+sam], row['var-'+sam]+row['ref-'+sam], corrected_confidence)
    mval = v[1]
    #if mval < 0.01: mval = 0
    return mval

def get_mean(row, sam):
    v=binomial_hpdr(row['var-'+sam], row['var-'+sam]+row['ref-'+sam], corrected_confidence)
    mval = v[0]
    return mval

ctable = table.groupby('cluster').sum()
for sam in cols:
    ctable['ub-'+sam]= ctable.apply(get_ub, args=[sam], axis=1)
    ctable['lb-'+sam]= ctable.apply(get_lb, args=[sam], axis=1)
    ctable[sam]= ctable.apply(get_mean, args=[sam], axis=1)

In [11]:
def get_ub(row, sam):
    v=binomial_hpdr(row['var-'+sam], row['var-'+sam]+row['ref-'+sam], corrected_confidence)
    return v[2]
    

def get_lb(row, sam):
    v=binomial_hpdr(row['var-'+sam], row['var-'+sam]+row['ref-'+sam], corrected_confidence)
    mval = v[1]
    if mval < 0.01: mval = 0
    return mval

def get_mean(row, sam):
    v=binomial_hpdr(row['var-'+sam], row['var-'+sam]+row['ref-'+sam], corrected_confidence)
    mval = v[0]
    return mval

ctable_cutoff = table.groupby('cluster').sum()
for sam in cols:
    ctable_cutoff['ub-'+sam]= ctable.apply(get_ub, args=[sam], axis=1)
    ctable_cutoff['lb-'+sam]= ctable.apply(get_lb, args=[sam], axis=1)
    ctable_cutoff[sam]= ctable.apply(get_mean, args=[sam], axis=1)

In [12]:
def get_vaf(row, sam):
    return float(row['var-'+sam])/float(row['var-'+sam]+row['ref-'+sam])

vafs = pd.DataFrame()
for sam in cols:
    vafs[sam] = table.apply(get_vaf, args=[sam], axis=1)
vafs['cluster'] = table['cluster']

In [13]:
def add_char_label(row): 
    return "_".join(list(raw_table[raw_table['cluster']==row.name]['character_label']))

ctable_cutoff['character_label'] = ctable_cutoff.apply(lambda row: add_char_label(row), axis=1)
ctable_cutoff

Unnamed: 0_level_0,ref-breast,ref-brain,ref-kidney,ref-liver,ref-lung,ref-rib,var-breast,var-brain,var-kidney,var-liver,...,ub-liver,lb-liver,liver,ub-lung,lb-lung,lung,ub-rib,lb-rib,rib,character_label
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,19957,24490,34894,17945,17201,18757,7090,11892,9405,12987,...,0.429257,0.410512,0.419856,0.357974,0.338386,0.348151,0.374843,0.356134,0.36546,ENSG00000177133:1:2866642:2866642_1:4293949:42...
2,2833,3627,3483,1747,2649,1862,5,39,935,1409,...,0.476082,0.416998,0.446451,0.02858,0.011115,0.018525,0.387565,0.328122,0.357488,ENSG00000189015:1:28295035:28295035_1:56623876...
3,5860,7725,7242,3535,5652,6225,4,68,1931,2844,...,0.466682,0.425119,0.445838,0.002973,0.0,0.000884,0.001296,0.0,0.000161,PRDM16:1:2984058:2984058_FHAD1:1:15514080:1551...
4,6681,5938,10865,7874,4371,7259,5,2826,13,4,...,0.001918,0.0,0.000508,0.345283,0.306408,0.325671,0.00319,0.0,0.001238,1:5200289:5200289_KCNAB2:1:6078316:6078316_PRD...
5,1601,2064,1894,996,1483,1620,1,0,490,631,...,0.428625,0.348122,0.38783,0.005428,0.0,0.000674,0.00497,0.0,0.000617,PRDM2:1:14012851:14012851_ZSCAN20:1:33685697:3...
6,15478,20498,24541,18141,14737,10482,7,5,17,10,...,0.001362,0.0,0.000551,0.017513,0.011068,0.01405,0.356147,0.331045,0.343521,C1orf174:1:3814510:3814510_1:5461427:5461427_K...
7,1786,2503,3143,2338,1189,1911,0,3,2,3,...,0.005545,0.0,0.001282,0.331037,0.257056,0.293103,0.005563,0.0,0.001045,ENSG00000219178:1:29727073:29727073_ENSG000002...
8,3346,4457,4158,4056,3258,3556,4,1,1105,3,...,0.0032,0.0,0.000739,0.003267,0.0,0.000613,0.002267,0.0,0.000281,AADACL3:1:12659643:12659643_KIAA1026:1:1516176...
9,13466,18097,21928,12496,13187,14786,5,28,12,2886,...,0.198293,0.177266,0.187622,0.001145,0.0,0.000303,0.001022,0.0,0.00027,1:4367117:4367117_CLCN6:1:11799962:11799962_PA...
10,915,1029,1394,1015,855,903,0,227,1,0,...,0.004912,0.0,0.0,0.005828,0.0,0.0,0.005519,0.0,0.0,AGBL4:1:49111131:49111131_AGBL4:1:49210849:492...


In [14]:
rows = ["6 #anatomical sites\n6 #samples\n10 #mutations\n#sample_index\tsample_label\tanatomical_site_index\tanatomical_site_label\tcharacter_index\tcharacter_label\tf_lb\tf_ub\tref\tvar\n",]
def print_char(row, sam):
    return "\t".join(map(str,[i, sam, i, sam, row.name-1, row["character_label"], max(row['lb-'+sam] * 2, 0), min(1, 2 * row['ub-'+sam]), int(row['ref-'+sam]), int(row['var-'+sam])]))+"\n"

for i, sam in enumerate(cols):
    rows += list(ctable_cutoff.apply(print_char, args=[sam], axis=1))

with open(os.path.join(HOADLEY_DATA_DIR,"A7/A7_"+str(confidence)+".tsv"), 'w') as f:
    for line in rows:
        f.write(line)

## Prep inputs from raw data for PyClone clustering

In [15]:
pyclone_dir = os.path.join(HOADLEY_DATA_DIR, "pyclone_analysis", "A7")
if not os.path.exists(pyclone_dir): 
    os.makedirs(pyclone_dir)
    
raw_sample_names = ['tumor.rcnt.llr3','brain.rcnt.llr3', 'kidney.rcnt.llr3', 'liver.rcnt.llr3', 'lung.rcnt.llr3', 'rib.rcnt.llr3']
final_sample_names = ['breast', 'brain', 'kidney', 'liver', 'lung', 'rib']

# put df in format used in PyClone, with columns:
#'mutation_id', 'ref_counts','var_counts', 'normal_cn', 'minor_cn', 'major_cn'
for raw_sample, final_sample in zip(raw_sample_names, final_sample_names):
    data = []
    for i, row in raw_table.iterrows():
        data.append([row['character_label'], row[f'{raw_sample}_ref'], row[f'{raw_sample}_var'], 2, 1, 1])
    sample_subset = pd.DataFrame(data, columns=['mutation_id', 'ref_counts','var_counts', 'normal_cn', 'minor_cn', 'major_cn'])
    sample_subset.to_csv(os.path.join(pyclone_dir,  f"A7_{final_sample}.tsv"), index=False, sep="\t")    


## Prep inputs for orchard tree inference

In [16]:
from metient.util import data_extraction_util as dutil
import json
cluster_id_to_mut_names, mutation_names = dutil.load_pyclone_clusters(os.path.join(pyclone_dir, "tables", "loci.tsv"))
pairtree_dir = os.path.join(HOADLEY_DATA_DIR, "orchard_trees")
header = ["id", "name", "var_reads", "total_reads", "var_read_prob"]

mut_name_to_mut_id = {}
with open(os.path.join(pairtree_dir, f"A7.ssm"), "w") as f:
    f.write("\t".join(header))
    f.write("\n")  
    for i, mut in enumerate(mutation_names):
        mut_name_to_mut_id[mut] = f"m{i}"
        row = [f"m{i}", mut]
        
        mut_row = raw_table[raw_table['character_label']==mut]
        var_reads = []
        total_reads = []
        var_read_probs = []
        for sample in raw_sample_names:
            var = mut_row[f'{sample}_var'].item()
            ref = mut_row[f'{sample}_ref'].item()
            var_reads.append(str(var))
            total_reads.append(str(var+ref))
            var_read_probs.append(str(0.5))

        row += [",".join(var_reads), ",".join(total_reads), ",".join(var_read_probs)]
        f.write("\t".join(row))
        f.write("\n")
json_data = {"samples": final_sample_names, "clusters": [], "garbage": []}
for x in range(0,len(cluster_id_to_mut_names)):
    json_data["clusters"].append([mut_name_to_mut_id[t] for t in cluster_id_to_mut_names[x]])

with open(os.path.join(pairtree_dir, f"A7.params.json"), 'w', encoding='utf-8') as f:
    json.dump(json_data, f, ensure_ascii=False)

CUDA GPU: False


## Prepare data for migration history inference (tsvs)

In [17]:
# Need a tsv for each patient with ['anatomical_site_index','anatomical_site_label', 'character_index', 'character_label', 'ref', 'var']
import re
import numpy as np


cols = ['anatomical_site_index','anatomical_site_label', 'character_index', 'character_label', 'ref', 'var', 'var_read_prob', 'site_category']

mut_name_to_clstr_id = {}
clstr_id_to_name = {}
for cid in cluster_id_to_mut_names:
    for mut in cluster_id_to_mut_names[cid]:
        mut_name_to_clstr_id[mut] = cid
    clstr_id_to_name[cid] = ";".join(cluster_id_to_mut_names[cid])

data = []
for midx, mut in enumerate(mutation_names):

    mut_row = raw_table[raw_table['character_label']==mut]
    for sidx, sample in enumerate(raw_sample_names):
        var = mut_row[f'{sample}_var'].item()
        ref = mut_row[f'{sample}_ref'].item()
        var_read_probs.append(str(0.5))
        site_category = 'primary' if final_sample_names[sidx] == 'breast' else 'metastasis'
        data.append([sidx, final_sample_names[sidx], midx, mut, ref, var, 0.5, site_category])
            
patient_df = pd.DataFrame(data, columns=cols)

dutil.write_pooled_tsv_from_clusters(patient_df, mut_name_to_clstr_id, clstr_id_to_name, {}, 
                                     os.path.join(HOADLEY_DATA_DIR,"pyclone_clustered_tsvs"), "A7")


In [18]:
# Add extra required fields
tsv_fn = os.path.join(HOADLEY_DATA_DIR,"pyclone_clustered_tsvs", f"A7_clustered_SNVs.tsv")
df = pd.read_csv(tsv_fn, sep="\t")
df['num_mutations'] = df.apply(lambda row: len(row['character_label'].split(";")), axis=1)
df['full_label'] = df['character_label']
df['character_label'] = df.apply(lambda row:dutil.get_pruned_mut_label(row['character_label'], ";", ":"), axis=1)
df.to_csv(tsv_fn, sep="\t")