BASE CALLING SCRATCH CODE
=====

In [1]:
import itertools

bases = ['A','T','G','C']
genotypes = list(itertools.combinations_with_replacement(bases,bases))
#genotypes = [('A','A'),('A','T')]

In [2]:
from math import log10, factorial
from functools import reduce
import operator
from collections import defaultdict

# HELPER FUNCTIONS

# product of list
def prod(iterable): # credit to: http://stackoverflow.com/questions/7948291/is-there-a-built-in-product-in-python
    return reduce(operator.mul, iterable, 1)

# sum of two probabilities that are in log form
def addlogs(a,b):
    if a > b:
        return a + log10(1 + pow(10, b - a))
    else:
        return b + log10(1 + pow(10, a - b))

# difference of two probabilities that are in log form
def subtractlogs(a,b):
    if a > b:
        return a + log10(1 - pow(10, b - a))
    else:
        return b + log10(1 - pow(10, a - b))

p_null = 0.5 # probability that strand is not sampled in chamber. hardcoded until we can estimate it

# helper data structures that pre-compute probabilities of strand configurations

ch_priors = [log10((1-p_null)/24)]*24 + [log10(p_null)]
poss_ch = list(range(0,25)) # the last element is "no chamber"
het_config_probs = dict()

for c1,c2,c3,c4 in itertools.product(poss_ch,poss_ch,poss_ch,poss_ch):
    # because of symmetry we ignore cases where we can flip the strands for the same result
    # e.g. we ignore cases where strand 1 is in a later chamber than strand 2 and say that
    # the cases where it is in an earlier chamber are twice as likely. Same for strands 3,4
    if c1 > c2 and c3 > c4:
        perm = log10(4)
    elif (c1 > c2 and c3 == c4) or (c1 == c2 and c3 > c4):
        perm = log10(2)
    elif c1 == c2 and c3 == c4:
        perm = log10(1)
    else:
        continue

    het_config_probs[(c1,c2,c3,c4)] = perm+ch_priors[c1]+ch_priors[c2]+ch_priors[c3]+ch_priors[c4] # probability of configuration
    
possible_chambers = list(range(0,25))

hom_config_probs = dict()

for c1,c2,c3,c4 in itertools.product(poss_ch,poss_ch,poss_ch,poss_ch):
    
    lst  = sorted([c1,c2,c3,c4])
    if tuple(lst) in hom_config_probs:
        continue
    uniq = list(set(lst))
    counts  = [lst.count(u) for u in uniq]
    perm = log10(factorial(len(lst))/prod([factorial(c) for c in counts]))
    
    hom_config_probs[tuple(lst)] = perm+ch_priors[c1]+ch_priors[c2]+ch_priors[c3]+ch_priors[c4] # probability of configuration    
    
# INPUT
# G: a tuple that specifies genotype e.g. ('A','T')
# nonzero_chambers: a list containing the indices of chambers 0..23 (or 24, unsampled) where strands may be (have read coverage)
# OUTPUT
# a list of configurations represented by tuples.
# the tuples contain (sl,prob) where sl is a 4-tuple of strand chambers (0..23 or 24) and prob is log probability of config occuring
def singlecell_config(G,nonzero_chambers):
    g1,g2 = G
    nz = nonzero_chambers + [24]
    nzs = set(nonzero_chambers)
    
    configs = []
    for c1,c2,c3,c4 in itertools.product(nz,nz,nz,nz):
        if set.intersection({c1,c2,c3,c4},nzs) != nzs:
            continue

        # c1..c4 represent the chamber placements of strands 1..4
        sl = (c1,c2,c3,c4)
        if (g1 == g2 and sl not in hom_config_probs) or (g1 != g2 and sl not in het_config_probs):
            continue
        
        if g1 == g2:
            prob = hom_config_probs[(c1,c2,c3,c4)]
        else:
            prob = het_config_probs[(c1,c2,c3,c4)]
            
        configs.append((sl,prob))

    total = configs[0][1]
    for i in range(1,len(configs)):
        total = addlogs(total,configs[i][1])
    
    for i in range(len(configs)):
        configs[i] = (configs[i][0], configs[i][1] - total)
        
    return configs

# INPUT
# G: a tuple that specifies genotype e.g. ('A','T')
# nonzero_chambers: a list containing one list per cell.
# the inner list should contain the indices of chambers 0..23 (or 24, unsampled) where strands are found
# OUTPUT
# a list of configurations represented by tuples.
# the tuple contains one inner tuple per cell.
# these inner tuples contain (sl,prob) where sl is a 4-tuple of strand chambers (0..23 or 24) and prob is log probability of config occuring
def multicell_config(G,nonzero_chambers):
    g1,g2 = G
    
    config_sets = [singlecell_config(G,nz) for nz in nonzero_chambers]
    return itertools.product(*config_sets)
    


In [15]:
-

In [16]:
mixed_alleles = list(set([tuple(sorted(list(x))) for x in itertools.product(alleles,alleles)]))

In [17]:
mixed_alleles

[('C', 'T'),
 ('G', 'G'),
 ('C', 'C'),
 ('A', 'G'),
 ('A', 'C'),
 ('T', 'T'),
 ('A', 'A'),
 ('C', 'G'),
 ('G', 'T'),
 ('A', 'T')]

In [18]:
nz = [0,1,2,24]
it = itertools.product(nz,nz,nz,nz)

In [19]:
next(it)

(0, 0, 0, 0)

In [23]:
total = 0
for k,v in mixed_allele_priors[4].items():
    print("{}\t{}".format(k,10**v))
    total += 10**v
print(total)

('G',)	0.24999999999999986
('C', 'T')	0.0
('A', 'T')	0.0
('T',)	0.2499999999999998
('A', 'G')	0.0
('G', 'T')	0.0
('A', 'C')	0.0
('C',)	0.24999999999999967
('A',)	0.2499999999999998
('C', 'G')	0.0
0.9999999999999991


defaultdict(float,
            {0.001949317738791423: 0.010000000000000002,
             0.0038910505836575876: 0.020000000000000004,
             0.007751937984496124: 0.030000000000000006,
             0.015384615384615385: 0.04000000000000001,
             0.030303030303030304: 0.05000000000000001,
             0.058823529411764705: 0.06000000000000001,
             0.1111111111111111: 0.07,
             0.2: 0.08000000000000002,
             0.3333333333333333: 0.09000000000000002,
             0.5: 0.10000000000000003,
             0.6666666666666666: 0.09000000000000002,
             0.8: 0.08000000000000002,
             0.8888888888888888: 0.07,
             0.9411764705882353: 0.06000000000000001,
             0.9696969696969697: 0.05000000000000001,
             0.9846153846153847: 0.04000000000000001,
             0.9922480620155039: 0.030000000000000006,
             0.9961089494163424: 0.020000000000000004,
             0.9980506822612085: 0.010000000000000002})

In [31]:
P_plst = list(P_p.items())

In [33]:
P_plst.sort()

In [34]:
P_plst

[(0.001949317738791423, 0.010000000000000002),
 (0.0038910505836575876, 0.020000000000000004),
 (0.007751937984496124, 0.030000000000000006),
 (0.015384615384615385, 0.04000000000000001),
 (0.030303030303030304, 0.05000000000000001),
 (0.058823529411764705, 0.06000000000000001),
 (0.1111111111111111, 0.07),
 (0.2, 0.08000000000000002),
 (0.3333333333333333, 0.09000000000000002),
 (0.5, 0.10000000000000003),
 (0.6666666666666666, 0.09000000000000002),
 (0.8, 0.08000000000000002),
 (0.8888888888888888, 0.07),
 (0.9411764705882353, 0.06000000000000001),
 (0.9696969696969697, 0.05000000000000001),
 (0.9846153846153847, 0.04000000000000001),
 (0.9922480620155039, 0.030000000000000006),
 (0.9961089494163424, 0.020000000000000004),
 (0.9980506822612085, 0.010000000000000002)]

In [12]:
import pickle
from collections import defaultdict

#P_parent1_lst = [(0.001,log10(0.495)),(0.5,log10(0.01)),(0.999,log10(0.495))]
cov_frac_dist_raw = pickle.load(open( "parameters/cov_frac_dist.p", "rb"))
cov_frac_dist = defaultdict(list)

lim = 30
for i in range(1,lim+1):
    cov_frac_dist[i] = cov_frac_dist_raw[i]

def chunks(l, n): # credit to http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

# given a list with length N*binsize, sum consectutive binsize-sized chunks into a length N list
def condense(l, binsize):
    return [sum(x) for x in chunks(l,binsize)]
    
for i in range(lim,lim):
    binsize = int(i / lim)
    lst = condense(cov_frac_dist_raw[i], binsize)
    for j in range(i,i+lim):
        cov_frac_dist[j] = lst

for i in range(lim,2000):
    print("{} {} {}".format(i,len(cov_frac_dist[i]), lim))
    assert(len(cov_frac_dist[i]) == lim)

30 30 30


AssertionError: 

In [7]:
lst

[0.0,
 0.0,
 0.10433744096411704,
 0.056285398910663552,
 0.048079184988147589,
 0.035367243906411161,
 0.03527676745743083,
 0.025758645024700072,
 0.029106273636972296,
 0.026301503718582052,
 0.025785787959394169,
 0.025034833432857426,
 0.026310551363480088,
 0.024817689955304634,
 0.02479959466550857,
 0.021334346669561913,
 0.025025785787959394,
 0.024808642310406602,
 0.026708647738993539,
 0.024464831804281346,
 0.026337694298174185,
 0.026455313681848613,
 0.028138175632882761,
 0.025686263865515806,
 0.036398675424786926,
 0.036244865461520365,
 0.048902520673868591,
 0.057588259775980311,
 0.10464506089065016,
 0.0]

In [9]:
list(range(2,20,4))

[2, 6, 10, 14, 18]

In [None]:
def estimate