# Parse patient A7 data

## Adapted from MACHINA (El-Kebir et al.)

In [10]:
# Nominal confidence level for the frequency intervals. It is tightened by a
# multiple-testing correction before use and is also embedded in the name of
# the output file.
confidence = 0.95

In [11]:
def get_id(line):
    """Build a mutation identifier of the form ``<chromosome>_<start>_<stop>``.

    ``line`` is anything indexable by the keys ``'chromosome_name'``,
    ``'start'`` and ``'stop'``; each value is converted with ``str``.
    """
    parts = [str(line[key]) for key in ('chromosome_name', 'start', 'stop')]
    return "_".join(parts)

In [12]:
import pandas as pd
import os

# Directory holding the hoadley_2016 data set. The built-in default is the
# original author's local checkout; set the HOADLEY_DATA_DIR environment
# variable to run this on another machine without editing the code.
HOADLEY_DATA_DIR = os.environ.get(
    'HOADLEY_DATA_DIR',
    '/Users/divyakoyyalagunta/Desktop/Cornell_Research/Morris_Lab/machina/data/hoadley_2016/',
)
table = pd.read_table(os.path.join(HOADLEY_DATA_DIR, "A7/raw/A7.tsv"))
# Index every mutation by its "<chromosome>_<start>_<stop>" identifier.
table['id'] = table.apply(get_id, axis=1)
table = table.set_index('id')
# Notebook display of the available columns.
table.columns

Index(['chromosome_name', 'start', 'stop', 'reference', 'variant', 'type',
       'gene_name', 'transcript_name', 'transcript_species',
       'transcript_source', 'transcript_version', 'strand',
       'transcript_status', 'trv_type', 'c_position', 'amino_acid_change',
       'ucsc_cons', 'domain', 'all_domains', 'deletion_substructures',
       'transcript_error', 'brain.rcnt.llr3_ref', 'brain.rcnt.llr3_var',
       'brain.rcnt.llr3_VAF', 'kidney.rcnt.llr3_ref', 'kidney.rcnt.llr3_var',
       'kidney.rcnt.llr3_VAF', 'liver.rcnt.llr3_ref', 'liver.rcnt.llr3_var',
       'liver.rcnt.llr3_VAF', 'lung.rcnt.llr3_ref', 'lung.rcnt.llr3_var',
       'lung.rcnt.llr3_VAF', 'rib.rcnt.llr3_ref', 'rib.rcnt.llr3_var',
       'rib.rcnt.llr3_VAF', 'tumor.rcnt.llr3_ref', 'tumor.rcnt.llr3_var',
       'tumor.rcnt.llr3_VAF', 'cluster'],
      dtype='object')

## Generate MACHINA input file

In [14]:
# Read-count columns of the raw table, one (ref, var) pair per sample. The raw
# 'tumor' columns come first and are relabelled 'breast' below.
ref_cols = [site + '.rcnt.llr3_ref'
            for site in ('tumor', 'brain', 'kidney', 'liver', 'lung', 'rib')]
var_cols = [name[:-len('ref')] + 'var' for name in ref_cols]

# Sample labels, in the same order as ref_cols / var_cols.
cols = ['breast', 'brain', 'kidney', 'liver', 'lung', 'rib']

# Keep only the cluster assignment and the counts, then rename the count
# columns to 'ref-<sample>' / 'var-<sample>'.
table = table[['cluster', *ref_cols, *var_cols]]
short_names = ['cluster']
for prefix in ('ref-', 'var-'):
    short_names.extend(prefix + label for label in cols)
table.columns = short_names

### Reassign clusters of mutations 7_12163423_12163423 and 7_57562948_57562948

#### See the MACHINA paper for the rationale behind this reassignment

In [15]:
# Per-cluster read-count totals.
# NOTE(review): this is computed before the two reassignments below, so
# `nclusters` counts the clusters as they were before the reassignment. The
# order is kept from the original analysis -- confirm that this is intended.
ctable = table.groupby('cluster').sum()

print(table.loc['7_12163423_12163423'])
print(table.loc['7_57562948_57562948'])

# Move both mutations to cluster 2. A single .loc[row, column] assignment is
# required here: the chained form table.loc[row]['cluster'] = 2 writes into a
# temporary row object and can leave `table` itself unchanged.
table.loc['7_12163423_12163423', 'cluster'] = 2
table.loc['7_57562948_57562948', 'cluster'] = 2

# Bonferroni-style correction: split the allowed error (1 - confidence) evenly
# over every (sample, cluster) pair.
nsamples = len([c for c in ctable.columns if c.startswith('ref')])
nclusters = len(ctable)
corrected_confidence = 1 - ((1. - confidence) / (nsamples * nclusters))
print(corrected_confidence)

# Sanity checks: the corrected level is stricter than the nominal one but
# still a valid probability below 1.
assert(corrected_confidence > confidence)
assert(corrected_confidence < 1.0)


cluster         3
ref-breast    161
ref-brain     218
ref-kidney    185
ref-liver     102
ref-lung      132
ref-rib       114
var-breast      0
var-brain       6
var-kidney     60
var-liver     104
var-lung        0
var-rib        47
Name: 7_12163423_12163423, dtype: int64
cluster         3
ref-breast    196
ref-brain     286
ref-kidney    306
ref-liver     132
ref-lung      235
ref-rib       146
var-breast      1
var-brain       4
var-kidney     67
var-liver     121
var-lung        0
var-rib        81
Name: 7_57562948_57562948, dtype: int64
0.9991666666666666


## Get intervals

In [16]:
import numpy
from scipy.stats import beta
from scipy.stats import norm

def binomial_hpdr(n, N, pct, a=1, b=1, n_pbins=1e3):
    """
    Compute the posterior mode and the lower and upper bounds of the
    **Highest Posterior Density Region** of a binomial success probability.

    The posterior is Beta(n + a, N - n + b). The region is located on an
    evenly spaced grid of ``n_pbins`` bins that spans a few posterior standard
    deviations around the mode and is clipped to the interval [0, 1]; bins are
    added in order of decreasing probability mass until their total is as
    close as possible to ``pct``.

    Parameters
    ----------
    n: number of successes
    N: sample size
    pct: the size of the confidence interval (between 0 and 1)
    a: the alpha hyper-parameter for the Beta distribution used as a prior (Default=1)
    b: the beta hyper-parameter for the Beta distribution used as a prior (Default=1)
    n_pbins: the number of bins to segment the p_range into (Default=1e3)

    Returns
    -------
    A tuple that contains the mode as well as the lower and upper bounds of the interval
    (mode, lower, upper)

    """
    # fixed random variable object for posterior Beta distribution
    rv = beta(n+a, N-n+b)
    # determine the mode and standard deviation of the posterior
    stdev = rv.stats('v')**0.5
    mode = (n+a-1.)/(N+a+b-2.)
    # compute the number of sigma that corresponds to this confidence
    # this is used to set the rough range of possible success probabilities
    n_sigma = numpy.ceil(norm.ppf( (1+pct)/2. ))+1
    # set the min and max values for success probability, clipped to [0, 1]
    max_p = mode + n_sigma * stdev
    if max_p > 1:
        max_p = 1.
    min_p = mode - n_sigma * stdev
    # The lower end must be clamped at 0 (not at 1): otherwise the grid extends
    # into negative probabilities and the reported lower bound can be negative.
    if min_p < 0:
        min_p = 0.
    # make the range of success probabilities
    p_range = numpy.linspace(min_p, max_p, int(n_pbins+1))
    # construct the probability mass function over the given range; the
    # survival function is differenced for modes above 0.5, the CDF otherwise
    if mode > 0.5:
        sf = rv.sf(p_range)
        pmf = sf[:-1] - sf[1:]
    else:
        cdf = rv.cdf(p_range)
        pmf = cdf[1:] - cdf[:-1]
    # find the upper and lower bounds of the interval: take bins from the most
    # to the least probable until the accumulated mass is closest to pct
    sorted_idxs = numpy.argsort( pmf )[::-1]
    cumsum = numpy.cumsum( numpy.sort(pmf)[::-1] )
    j = numpy.argmin( numpy.abs(cumsum - pct) )
    # bin k covers p_range[k]..p_range[k+1], hence the +1 for the upper edge
    upper = p_range[ (sorted_idxs[:j+1]).max()+1 ]
    lower = p_range[ (sorted_idxs[:j+1]).min() ]

    return (mode, lower, upper)

In [17]:
#### 
def get_ub(row, sam):
    """Return the upper interval bound of the variant frequency for ``sam``.

    ``row`` supplies the read counts ``'var-<sam>'`` and ``'ref-<sam>'``. The
    interval comes from ``binomial_hpdr`` evaluated at the module-level
    ``corrected_confidence``.
    """
    n_var = row['var-' + sam]
    depth = n_var + row['ref-' + sam]
    _mode, _lower, upper = binomial_hpdr(n_var, depth, corrected_confidence)
    return upper
    

def get_lb(row, sam):
    """Return the lower interval bound of the variant frequency for ``sam``.

    ``row`` supplies the read counts ``'var-<sam>'`` and ``'ref-<sam>'``. The
    bound from ``binomial_hpdr`` is returned as is; no small-value cutoff is
    applied in this variant.
    """
    n_var = row['var-' + sam]
    depth = n_var + row['ref-' + sam]
    interval = binomial_hpdr(n_var, depth, corrected_confidence)
    return interval[1]

def get_mean(row, sam):
    """Return the point estimate of the variant frequency for ``sam``.

    Despite the name, this is the first element returned by
    ``binomial_hpdr``, i.e. the posterior mode rather than the mean.
    """
    n_var = row['var-' + sam]
    depth = n_var + row['ref-' + sam]
    interval = binomial_hpdr(n_var, depth, corrected_confidence)
    return interval[0]

# Per-cluster read-count totals, extended for every sample with the upper
# bound ('ub-<sample>'), the lower bound ('lb-<sample>') and the point
# estimate ('<sample>') of the pooled variant frequency.
ctable = table.groupby('cluster').sum()
for sam in cols:
    for column, estimator in (('ub-' + sam, get_ub),
                              ('lb-' + sam, get_lb),
                              (sam, get_mean)):
        ctable[column] = ctable.apply(estimator, axis=1, args=[sam])

In [18]:
def get_ub(row, sam):
    """Upper bound of the interval on the variant frequency of ``sam``.

    Reads ``row['var-<sam>']`` and ``row['ref-<sam>']`` and evaluates
    ``binomial_hpdr`` at the module-level ``corrected_confidence``.
    """
    variant_reads = row['var-' + sam]
    total_reads = variant_reads + row['ref-' + sam]
    return binomial_hpdr(variant_reads, total_reads, corrected_confidence)[2]
    

def get_lb(row, sam):
    """Lower bound of the interval on the variant frequency of ``sam``.

    Reads ``row['var-<sam>']`` and ``row['ref-<sam>']`` and evaluates
    ``binomial_hpdr`` at the module-level ``corrected_confidence``. Bounds
    below 0.01 are reported as 0.
    """
    variant_reads = row['var-' + sam]
    total_reads = variant_reads + row['ref-' + sam]
    lower = binomial_hpdr(variant_reads, total_reads, corrected_confidence)[1]
    return 0 if lower < 0.01 else lower

def get_mean(row, sam):
    """Point estimate of the variant frequency of ``sam``.

    Returns the first element of the ``binomial_hpdr`` result, which is the
    posterior mode (not the mean, despite this function's name).
    """
    variant_reads = row['var-' + sam]
    total_reads = variant_reads + row['ref-' + sam]
    return binomial_hpdr(variant_reads, total_reads, corrected_confidence)[0]

# Per-cluster read-count totals, extended for every sample with the upper
# bound ('ub-<sample>'), the lower bound with the 0.01 cutoff ('lb-<sample>')
# and the point estimate ('<sample>') of the pooled variant frequency.
ctable_cutoff = table.groupby('cluster').sum()
for sam in cols:
    # Rows are taken from ctable_cutoff itself rather than from the separately
    # built `ctable`, so the result does not depend on the state of another
    # cell's table.
    ctable_cutoff['ub-'+sam] = ctable_cutoff.apply(get_ub, args=[sam], axis=1)
    ctable_cutoff['lb-'+sam] = ctable_cutoff.apply(get_lb, args=[sam], axis=1)
    ctable_cutoff[sam] = ctable_cutoff.apply(get_mean, args=[sam], axis=1)

In [19]:
def get_vaf(row, sam):
    """Return the variant allele frequency of sample ``sam`` in ``row``.

    Computed as ``var / (var + ref)`` from the ``'var-<sam>'`` and
    ``'ref-<sam>'`` read counts, using float division.
    """
    variant_reads = row['var-' + sam]
    total_reads = variant_reads + row['ref-' + sam]
    return float(variant_reads) / float(total_reads)

# Per-mutation variant allele frequencies, one column per sample, plus the
# cluster each mutation belongs to.
vafs = pd.DataFrame(
    {sam: table.apply(get_vaf, args=[sam], axis=1) for sam in cols}
)
vafs['cluster'] = table['cluster']

In [20]:
# Header of the output file. The counts are derived from what is actually
# written below (one line per sample and per cluster; every sample is its own
# anatomical site) instead of being hard-coded.
rows = [
    "{nsam} #anatomical sites\n{nsam} #samples\n{nmut} #mutations\n".format(
        nsam=len(cols), nmut=len(ctable_cutoff))
    + "#sample_index\tsample_label\tanatomical_site_index\tanatomical_site_label\tcharacter_index\tcharacter_label\tf_lb\tf_ub\tref\tvar\n",
]
def print_char(row, sam, sam_idx=None):
    """Format one tab-separated output line for cluster ``row`` in sample ``sam``.

    Parameters
    ----------
    row: per-cluster record providing ``'lb-<sam>'``, ``'ub-<sam>'``,
        ``'ref-<sam>'`` and ``'var-<sam>'``; ``row.name`` is the cluster label
    sam: sample label
    sam_idx: index written for both the sample and the anatomical site.
        When omitted it is looked up as the position of ``sam`` in the
        module-level ``cols`` list, so the function no longer depends on a
        loop variable left behind in module scope.

    Returns
    -------
    The line, terminated by a newline, with the fields: sample index, sample
    label, site index, site label, character index (``row.name - 1``),
    character label, lower bound, upper bound, ref count, var count.
    """
    if sam_idx is None:
        sam_idx = cols.index(sam)
    # The frequency bounds are doubled and clipped to [0, 1].
    # NOTE(review): presumably converts a VAF into a cell fraction -- confirm.
    lower = max(row['lb-' + sam] * 2, 0)
    upper = min(1, 2 * row['ub-' + sam])
    fields = [sam_idx, sam, sam_idx, sam, row.name - 1, str(row.name),
              lower, upper, int(row['ref-' + sam]), int(row['var-' + sam])]
    return "\t".join(map(str, fields)) + "\n"

# Append one line per (sample, cluster) pair. `i` must stay the name of the
# loop index: print_char reads it while formatting each line.
for i, sam in enumerate(cols):
    rows.extend(ctable_cutoff.apply(print_char, args=[sam], axis=1))

# Write the header and all data lines; the confidence level is part of the
# file name.
out_path = "../A7_MACHINA_" + str(confidence) + ".tsv"
with open(out_path, 'w') as f:
    f.writelines(rows)