# Parse A1
## Adapted from MACHINA (El-Kebir et al.)

In [1]:
# Nominal (uncorrected) level for the frequency intervals; it is tightened
# for multiple intervals further down and also appears in the output filename.
confidence = 0.95

In [2]:
def get_id(line):
    """Build a row identifier of the form ``<chromosome_name>_<start>_<stop>``.

    ``line`` is any row-like object indexable by the keys ``chromosome_name``,
    ``start`` and ``stop``; each value is converted with ``str`` before joining.
    """
    fields = (line['chromosome_name'], line['start'], line['stop'])
    return "_".join(str(value) for value in fields)

In [3]:
import pandas as pd
import os

# Root directory of the Hoadley breast-cancer data. The original absolute path
# is kept as the default so existing runs are unchanged, but it can be
# overridden on other machines by setting the HOADLEY_DATA_DIR environment variable.
HOADLEY_DATA_DIR = os.environ.get(
    'HOADLEY_DATA_DIR',
    '/Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/met_history_prediction/src/data/hoadley_breast_cancer_2016/',
)

# Load the raw A1 variant table and index it by "<chromosome>_<start>_<stop>".
table = pd.read_table(os.path.join(HOADLEY_DATA_DIR, "A1/A1_raw.tsv"))
table['id'] = table.apply(get_id, axis=1)
table = table.set_index('id')
table.columns

Index(['chromosome_name', 'start', 'stop', 'reference', 'variant', 'type',
       'gene_name', 'transcript_name', 'transcript_species',
       'transcript_source', 'transcript_version', 'strand',
       'transcript_status', 'trv_type', 'c_position', 'amino_acid_change',
       'ucsc_cons', 'domain', 'all_domains', 'deletion_substructures',
       'transcript_error', 'adrenalmet.rcnt.llr3_ref',
       'adrenalmet.rcnt.llr3_var', 'adrenalmet.rcnt.llr3_VAF',
       'livermet.rcnt.llr3_ref', 'livermet.rcnt.llr3_var',
       'livermet.rcnt.llr3_VAF', 'lungmet.rcnt.llr3_ref',
       'lungmet.rcnt.llr3_var', 'lungmet.rcnt.llr3_VAF',
       'spinalmet.rcnt.llr3_ref', 'spinalmet.rcnt.llr3_var',
       'spinalmet.rcnt.llr3_VAF', 'tumor.rcnt.llr3_ref', 'tumor.rcnt.llr3_var',
       'tumor.rcnt.llr3_VAF', 'cluster'],
      dtype='object')

In [4]:
def label_snv(row):
    """Return a ``:``-separated label ``[gene_name:]chromosome_name:start:stop``.

    The gene name is prepended only when it is not the placeholder ``"-"``.
    """
    gene = row['gene_name']
    prefix = [] if gene == "-" else [gene]
    position = [str(row[key]) for key in ('chromosome_name', 'start', 'stop')]
    return ":".join(prefix + position)
# Attach the gene/position label to every variant row.
table['character_label'] = table.apply(label_snv, axis=1)

In [5]:
# Snapshot the full annotated table: `table` is narrowed to the cluster and
# read-count columns below, while `raw_table` keeps `character_label`.
raw_table = table.copy(deep=True)
raw_table.head()

Unnamed: 0_level_0,chromosome_name,start,stop,reference,variant,type,gene_name,transcript_name,transcript_species,transcript_source,...,lungmet.rcnt.llr3_var,lungmet.rcnt.llr3_VAF,spinalmet.rcnt.llr3_ref,spinalmet.rcnt.llr3_var,spinalmet.rcnt.llr3_VAF,tumor.rcnt.llr3_ref,tumor.rcnt.llr3_var,tumor.rcnt.llr3_VAF,cluster,character_label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1_110359957_110359957,1,110359957,110359957,C,T,SNP,AHCYL1,NM_006621,human,genbank,...,0,0.0,142,0,0.0,150,4,2.58,7,AHCYL1:1:110359957:110359957
6_48902663_48902663,6,48902663,48902663,T,G,SNP,ENSG00000221175,ENST00000408248,human,ensembl,...,1,0.68,178,0,0.0,189,0,0.0,6,ENSG00000221175:6:48902663:48902663
6_49276467_49276467,6,49276467,49276467,G,C,SNP,-,-,-,-,...,0,0.0,63,17,21.25,117,1,0.85,9,6:49276467:49276467
6_50173697_50173697,6,50173697,50173697,C,G,SNP,DEFB112,NM_001037498,human,genbank,...,0,0.0,115,30,20.69,179,0,0.0,9,DEFB112:6:50173697:50173697
6_50179508_50179508,6,50179508,50179508,A,T,SNP,-,-,-,-,...,0,0.0,128,0,0.0,141,0,0.0,6,6:50179508:50179508


## Generate migration history input file

In [6]:
# Short sample names, in the same order as the raw column prefixes below
# ('tumor' is renamed to 'breast'; the '*met' columns lose their suffix).
cols = ['breast', 'adrenal', 'liver', 'lung', 'spinal']
_raw_prefixes = ['tumor', 'adrenalmet', 'livermet', 'lungmet', 'spinalmet']

# Raw reference / variant read-count columns, one per sample.
ref_cols = [prefix + '.rcnt.llr3_ref' for prefix in _raw_prefixes]
var_cols = [prefix + '.rcnt.llr3_var' for prefix in _raw_prefixes]

# Keep only the cluster id and the counts, then rename to 'ref-<sample>' / 'var-<sample>'.
table = table[['cluster'] + ref_cols + var_cols]
table.columns = ['cluster'] + [kind + '-' + name for kind in ('ref', 'var') for name in cols]

## Get intervals

In [7]:
# Pool read counts over all variants assigned to the same cluster.
ctable = table.groupby('cluster').sum()

# One interval is computed per (sample, cluster) pair, so the allowed error
# (1 - confidence) is split evenly across nsamples * nclusters intervals
# (a Bonferroni-style correction).
# NOTE: the original cell declared `global corrected_confidence` here; at module
# level that statement has no effect, so it has been dropped.
nsamples = len([c for c in ctable.columns if c.startswith('ref')])
nclusters = len(ctable)
corrected_confidence = 1-((1.-confidence)/(nsamples*nclusters))
print(corrected_confidence)

# Sanity checks: the corrected level must be stricter than the nominal one and below 1.
assert corrected_confidence > confidence, "corrected confidence must exceed the nominal confidence"
assert corrected_confidence < 1.0, "corrected confidence must stay below 1"
ctable

0.9988888888888889


Unnamed: 0_level_0,ref-breast,ref-adrenal,ref-liver,ref-lung,ref-spinal,var-breast,var-adrenal,var-liver,var-lung,var-spinal
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,657,571,551,612,698,267,307,437,147,170
2,5458,3263,2850,4572,5357,322,2287,2256,70,12
3,6948,8762,8990,6248,6861,2070,7,8,1092,1663
4,14589,10980,10950,12169,14195,486,3274,3625,47,10
5,4613,5563,6004,4323,5600,1337,8,11,720,26
6,4255,3175,3715,3465,4020,2,884,5,4,2
7,5529,5354,4489,4693,5212,152,3,1256,4,0
8,2156,2528,3007,2144,2608,543,1,1,13,0
9,12221,11841,11226,9962,9242,10,8,3,12,2129


In [8]:
import numpy
from scipy.stats import beta
from scipy.stats import norm

def binomial_hpdr(n, N, pct, a=1, b=1, n_pbins=1e3):
    """
    Function computes the posterior mode along with the upper and lower bounds of the
    **Highest Posterior Density Region** of a binomial success probability under a
    Beta(a, b) prior. The region is approximated on a regular grid of ``n_pbins``
    bins, so the bounds are accurate to about one bin width.

    Parameters
    ----------
    n: number of successes
    N: sample size
    pct: the size of the confidence interval (between 0 and 1)
    a: the alpha hyper-parameter for the Beta distribution used as a prior (Default=1)
    b: the beta hyper-parameter for the Beta distribution used as a prior (Default=1)
    n_pbins: the number of bins to segment the p_range into (Default=1e3)

    Returns
    -------
    A tuple that contains the mode as well as the lower and upper bounds of the interval
    (mode, lower, upper). Both bounds lie in [0, 1].

    """
    # fixed random variable object for posterior Beta distribution
    rv = beta(n+a, N-n+b)
    # determine the mode and standard deviation of the posterior
    stdev = rv.stats('v')**0.5
    mode = (n+a-1.)/(N+a+b-2.)
    # compute the number of sigma that corresponds to this confidence
    # this is used to set the rough range of possible success probabilities
    n_sigma = numpy.ceil(norm.ppf( (1+pct)/2. ))+1
    # set the min and max values for success probability, clamped to [0, 1]
    max_p = mode + n_sigma * stdev
    if max_p > 1:
        max_p = 1.
    min_p = mode - n_sigma * stdev
    # The lower end must be clamped at 0 (the original tested `min_p > 1`, which
    # never fires): otherwise the grid extends below 0, bins are wasted on a
    # zero-density region, and the returned lower bound can be negative.
    if min_p < 0:
        min_p = 0.
    # make the range of success probabilities
    p_range = numpy.linspace(min_p, max_p, int(n_pbins+1))
    # construct the probability mass of each bin over the given range;
    # the survival function is used on the upper half (presumably for better
    # numerical precision in the upper tail)
    if mode > 0.5:
        sf = rv.sf(p_range)
        pmf = sf[:-1] - sf[1:]
    else:
        cdf = rv.cdf(p_range)
        pmf = cdf[1:] - cdf[:-1]
    # rank bins from most to least probable and accumulate their mass
    sorted_idxs = numpy.argsort( pmf )[::-1]
    cumsum = numpy.cumsum( pmf[sorted_idxs] )
    # number of top-ranked bins whose total mass is closest to the requested level
    j = numpy.argmin( numpy.abs(cumsum - pct) )
    # the interval spans from the left edge of the left-most selected bin
    # to the right edge of the right-most selected bin
    upper = p_range[ (sorted_idxs[:j+1]).max()+1 ]
    lower = p_range[ (sorted_idxs[:j+1]).min() ]

    return (mode, lower, upper)

In [9]:
def get_ub(row, sam):
    """Upper bound of the posterior interval for sample ``sam`` in ``row``.

    Uses the 'var-<sam>' count as successes and 'var-<sam>' + 'ref-<sam>' as the
    sample size, at the module-level ``corrected_confidence``.
    """
    n_var = row['var-'+sam]
    depth = n_var + row['ref-'+sam]
    _, _, upper = binomial_hpdr(n_var, depth, corrected_confidence)
    return upper
    

def get_lb(row, sam):
    """Lower bound of the posterior interval for sample ``sam`` in ``row``.

    No small-value cutoff is applied; the bound is returned exactly as computed.
    """
    n_var = row['var-'+sam]
    depth = n_var + row['ref-'+sam]
    _, lower, _ = binomial_hpdr(n_var, depth, corrected_confidence)
    return lower

def get_mean(row, sam):
    """Point estimate for sample ``sam`` in ``row``.

    Despite the name, this returns the first element of ``binomial_hpdr``'s
    result, which is the posterior mode.
    """
    n_var = row['var-'+sam]
    depth = n_var + row['ref-'+sam]
    estimate, _, _ = binomial_hpdr(n_var, depth, corrected_confidence)
    return estimate

# Per-cluster pooled read counts, extended with interval bounds and a point
# estimate for every sample (columns 'ub-<sam>', 'lb-<sam>' and '<sam>').
ctable = table.groupby('cluster').sum()

for sam in cols:
    for _column, _estimator in (('ub-'+sam, get_ub), ('lb-'+sam, get_lb), (sam, get_mean)):
        ctable[_column] = ctable.apply(_estimator, args=(sam,), axis=1)
    

In [10]:
def get_ub(row, sam):
    """Return the upper interval bound for sample ``sam`` of the cluster row ``row``.

    Successes are the 'var-<sam>' reads and the sample size is
    'var-<sam>' + 'ref-<sam>'; the level is ``corrected_confidence``.
    """
    successes = row['var-'+sam]
    total = successes + row['ref-'+sam]
    return binomial_hpdr(successes, total, corrected_confidence)[2]
    

def get_lb(row, sam, cutoff=0.01):
    """Return the lower interval bound for sample ``sam``, zeroing small values.

    Parameters
    ----------
    row: cluster row providing the 'var-<sam>' and 'ref-<sam>' read counts
    sam: sample name
    cutoff: lower bounds strictly below this value are reported as 0
        (Default=0.01, the threshold previously hard-coded in this function)

    Returns
    -------
    The lower bound from ``binomial_hpdr`` at ``corrected_confidence``, or 0 when
    that bound is below ``cutoff``.
    """
    v=binomial_hpdr(row['var-'+sam], row['var-'+sam]+row['ref-'+sam], corrected_confidence)
    mval = v[1]
    if mval < cutoff:
        mval = 0
    return mval

def get_mean(row, sam):
    """Return the point estimate for sample ``sam`` of the cluster row ``row``.

    This is element 0 of ``binomial_hpdr``'s result, i.e. the posterior mode
    (the function name is historical).
    """
    successes = row['var-'+sam]
    total = successes + row['ref-'+sam]
    return binomial_hpdr(successes, total, corrected_confidence)[0]

# Same per-cluster table as before, but built with the cutoff variant of get_lb.
ctable_cutoff = table.groupby('cluster').sum()
for sam in cols:
    # Apply over ctable_cutoff itself. The original applied over `ctable` from the
    # previous cell, which gave the same values only because both frames share the
    # same counts and index, and made this cell depend on that cell's state.
    ctable_cutoff['ub-'+sam]= ctable_cutoff.apply(get_ub, args=[sam], axis=1)
    ctable_cutoff['lb-'+sam]= ctable_cutoff.apply(get_lb, args=[sam], axis=1)
    ctable_cutoff[sam]= ctable_cutoff.apply(get_mean, args=[sam], axis=1)

In [11]:
def get_vaf(row, sam):
    """Variant allele frequency of sample ``sam`` for one row.

    Computed as var / (var + ref) from the 'var-<sam>' and 'ref-<sam>' counts.
    When the total read count is zero the frequency is undefined and NaN is
    returned (previously this raised ZeroDivisionError).
    """
    var_reads = float(row['var-'+sam])
    depth = float(row['var-'+sam]+row['ref-'+sam])
    if depth == 0:
        return float('nan')
    return var_reads/depth

# Per-variant VAFs, one column per sample, plus each variant's cluster id.
vafs = pd.DataFrame()
for sam in cols:
    vafs[sam] = table.apply(lambda row: get_vaf(row, sam), axis=1)
vafs['cluster'] = table['cluster']

In [12]:
def add_char_label(row):
    """Join with ``_`` the ``character_label`` of every ``raw_table`` variant
    whose ``cluster`` equals this row's index label (``row.name``)."""
    in_cluster = raw_table['cluster'] == row.name
    labels = raw_table.loc[in_cluster, 'character_label']
    return "_".join(labels)

# Label each cluster with the concatenated labels of its member variants.
ctable_cutoff['character_label'] = ctable_cutoff.apply(add_char_label, axis=1)
ctable_cutoff

Unnamed: 0_level_0,ref-breast,ref-adrenal,ref-liver,ref-lung,ref-spinal,var-breast,var-adrenal,var-liver,var-lung,var-spinal,...,ub-liver,lb-liver,liver,ub-lung,lb-lung,lung,ub-spinal,lb-spinal,spinal,character_label
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,657,571,551,612,698,267,307,437,147,170,...,0.494057,0.391347,0.442308,0.243143,0.149944,0.193676,0.242172,0.154515,0.195853,INVS:9:101950362:101950362_9:105825009:1058250...
2,5458,3263,2850,4572,5357,322,2287,2256,70,12,...,0.464552,0.419253,0.441833,0.021687,0.0,0.01508,0.005058,0.0,0.002235,6:50519104:50519104_ENSG00000214604:6:55918529...
3,6948,8762,8990,6248,6861,2070,7,8,1092,1663,...,0.002358,0.0,0.000889,0.162648,0.135564,0.148774,0.209346,0.181362,0.195096,6:50534952:50534952_RAB23:6:57180847:57180847_...
4,14589,10980,10950,12169,14195,486,3274,3625,47,10,...,0.260493,0.237149,0.248714,0.005975,0.0,0.003847,0.001707,0.0,0.000704,ICK:6:53035144:53035144_LOC727842:6:58602457:5...
5,4613,5563,6004,4323,5600,1337,8,11,720,26,...,0.004273,0.0,0.001829,0.159327,0.127203,0.142772,0.008232,0.0,0.004621,6:50568504:50568504_6:99144022:99144022_ROS1:6...
6,4255,3175,3715,3465,4020,2,884,5,4,2,...,0.004415,0.0,0.001344,0.004243,0.0,0.001153,0.002648,0.0,0.000497,ENSG00000221175:6:48902663:48902663_6:50179508...
7,5529,5354,4489,4693,5212,152,3,1256,4,0,...,0.236727,0.201177,0.218625,0.003134,0.0,0.000852,0.000959,0.0,0.0,AHCYL1:1:110359957:110359957_6:97912586:979125...
8,2156,2528,3007,2144,2608,543,1,1,13,0,...,0.00268,0.0,0.000332,0.013246,0.0,0.006027,0.001915,0.0,0.0,TFAP2B:6:50909762:50909762_ENPP3:6:132075413:1...
9,12221,11841,11226,9962,9242,10,8,3,12,2129,...,0.001156,0.0,0.000267,0.002727,0.0,0.001203,0.199339,0.175488,0.187231,6:49276467:49276467_DEFB112:6:50173697:5017369...


In [13]:
# Header of the output file. The counts are derived from the data rather than
# hard-coded: every sample is written as its own anatomical site (len(cols)),
# and each cluster row of ctable_cutoff is emitted as one mutation/character.
rows = [
    "{} #anatomical sites\n{} #samples\n{} #mutations\n".format(len(cols), len(cols), len(ctable_cutoff))
    + "#sample_index\tsample_label\tanatomical_site_index\tanatomical_site_label\tcharacter_index\tcharacter_label\tf_lb\tf_ub\tref\tvar\n",
]
def print_char(row, sam):
    """Format one tab-separated output line for cluster ``row`` in sample ``sam``.

    Fields, in order: sample index, sample label, anatomical site index,
    anatomical site label, character index (``row.name - 1``), character label,
    frequency lower bound, frequency upper bound, ref count, var count.

    The sample/site index is the position of ``sam`` in the module-level ``cols``
    list. The original read the loop variable ``i`` leaked at module scope, which
    was only correct while called from inside that specific loop.
    Raises ValueError if ``sam`` is not in ``cols``.
    """
    idx = cols.index(sam)
    fields = [
        idx, sam, idx, sam,
        # assumes cluster ids are 1-based so this yields a 0-based index -- TODO confirm
        row.name-1,
        str(row['character_label']),
        # Bounds are doubled and clipped to [0, 1]; presumably this converts a VAF
        # into a cell fraction for heterozygous diploid mutations -- TODO confirm.
        max(row['lb-'+sam] * 2, 0),
        min(1, 2 * row['ub-'+sam]),
        int(row['ref-'+sam]),
        int(row['var-'+sam]),
    ]
    return "\t".join(map(str, fields))+"\n"

# Append one line per cluster for every sample, sample by sample.
for i, sam in enumerate(cols):
    rows.extend(ctable_cutoff.apply(print_char, args=(sam,), axis=1))

# Write header and data lines to A1/A1_<confidence>.tsv under the data directory.
_out_path = os.path.join(HOADLEY_DATA_DIR, "A1/A1_"+str(confidence)+".tsv")
with open(_out_path, 'w') as f:
    f.writelines(rows)