BASE CALLING SCRATCH CODE
=====

In [1]:
import itertools

# The four nucleotide letters, and every ordered pair of them (16 in total)
# used as the candidate genotypes.
bases = list('ATGC')
genotypes = list(itertools.product(bases, repeat=2))
# reduced genotype set, handy for quick experiments:
#genotypes = [('A','A'),('A','T')]

In [2]:
from math import log10, factorial
from functools import reduce
import operator

# HELPER FUNCTIONS

# product of all items of an iterable (1 when the iterable is empty)
# credit to: http://stackoverflow.com/questions/7948291/is-there-a-built-in-product-in-python
def prod(iterable):
    """Multiply the items of ``iterable`` together, starting from 1."""
    result = 1
    for factor in iterable:
        result = result * factor
    return result

# sum of two probabilities that are in log form
def addlogs(a,b):
    """Return log10(10**a + 10**b) for two log10-probabilities.

    The larger argument is factored out so that only 10**(smaller - larger),
    which is at most 1, is ever exponentiated.
    """
    hi, lo = (a, b) if a > b else (b, a)
    return hi + log10(1 + pow(10, lo - hi))

# difference of two probabilities that are in log form
def subtractlogs(a,b):
    """Return log10(|10**a - 10**b|) for two log10-probabilities.

    The result does not depend on argument order: the smaller value is
    always subtracted from the larger one. Equal arguments lead to
    log10(0), which math.log10 rejects with ValueError.
    """
    hi, lo = (a, b) if a > b else (b, a)
    return hi + log10(1 - pow(10, lo - hi))

# probability that strand is not sampled in chamber.
# hardcoded until we can estimate it
p_null = 0.5

# helper data structures that pre-compute probabilities of strand configurations

# log10 prior for where a single strand ends up: indices 0..23 share
# (1 - p_null) equally, index 24 ("no chamber") gets p_null.
ch_priors = [log10((1 - p_null) / 24)] * 24 + [log10(p_null)]
poss_ch = list(range(25))  # the last element is "no chamber"
het_config_probs = {}

for c1, c2, c3, c4 in itertools.product(poss_ch, repeat=4):
    # Swapping strands 1 and 2, or strands 3 and 4, gives an equivalent
    # configuration. Only the orientation with c1 >= c2 and c3 >= c4 is
    # stored; every pair whose two chambers differ doubles the weight of the
    # stored entry to account for the mirrored placement that is skipped.
    if c1 < c2 or c3 < c4:
        continue

    n_unequal_pairs = (c1 != c2) + (c3 != c4)
    perm = log10((1, 2, 4)[n_unequal_pairs])

    # log10 probability of configuration
    het_config_probs[(c1, c2, c3, c4)] = perm + ch_priors[c1] + ch_priors[c2] + ch_priors[c3] + ch_priors[c4]
    
possible_chambers = list(range(0,25))

# Maps a sorted 4-tuple of strand positions (0..23, or 24 for "no chamber")
# to the log10 probability of that placement when all four strands are
# interchangeable.
hom_config_probs = dict()

# combinations_with_replacement yields each non-decreasing 4-tuple exactly
# once, in lexicographic order, so there is no need to walk all 25**4 ordered
# tuples and discard the ones whose sorted form was already seen.
for c1,c2,c3,c4 in itertools.combinations_with_replacement(possible_chambers, 4):

    lst = [c1,c2,c3,c4]  # already sorted
    uniq = list(set(lst))
    counts = [lst.count(u) for u in uniq]
    # number of distinct orderings of this multiset: 4! / (product of count!)
    perm = log10(factorial(len(lst))/prod([factorial(c) for c in counts]))

    hom_config_probs[tuple(lst)] = perm+ch_priors[c1]+ch_priors[c2]+ch_priors[c3]+ch_priors[c4] # probability of configuration
    
# INPUT
# G: a tuple that specifies genotype e.g. ('A','T')
# nonzero_chambers: a list containing the indices of chambers 0..23 (or 24, unsampled) where strands may be (have read coverage)
# OUTPUT
# a list of configurations represented by tuples.
# the tuples contain (sl,prob) where sl is a 4-tuple of strand chambers (0..23 or 24) and prob is the
# log10 probability of the config occurring, normalised so that the returned configs sum to probability 1.
# an empty list is returned when no configuration places a strand in every nonzero chamber.
def singlecell_config(G,nonzero_chambers):
    g1, g2 = G

    # Candidate positions for each strand: the chambers with reads plus
    # "no chamber" (24). De-duplicate while keeping first-seen order, so a 24
    # (or any repeated index) in the input does not make the product below
    # emit the same configuration several times.
    nz = list(dict.fromkeys(list(nonzero_chambers) + [24]))
    nzs = set(nonzero_chambers)

    # Homozygous genotypes use the table keyed by sorted tuples; heterozygous
    # ones use the table keyed by (c1 >= c2, c3 >= c4) tuples.
    table = hom_config_probs if g1 == g2 else het_config_probs

    configs = []
    # sl = (c1,c2,c3,c4): the chamber placements of strands 1..4
    for sl in itertools.product(nz, repeat=4):
        # every chamber with reads must hold at least one strand
        if not nzs.issubset(sl):
            continue

        prob = table.get(sl)
        if prob is None:
            # a symmetric duplicate that the table deliberately leaves out
            continue

        configs.append((sl, prob))

    if not configs:
        # e.g. more than four distinct chambers cannot be covered by 4 strands
        return []

    # log10 of the summed probability of all retained configurations
    total = configs[0][1]
    for _, prob in configs[1:]:
        total = addlogs(total, prob)

    # normalise in log space
    return [(sl, prob - total) for sl, prob in configs]

# INPUT
# G: a tuple that specifies genotype e.g. ('A','T')
# nonzero_chambers: a list containing one list per cell.
# each inner list should contain the indices of chambers 0..23 (or 24, unsampled) where strands are found
# OUTPUT
# an itertools.product iterator over joint configurations.
# each item it yields is a tuple containing one entry per cell, and each entry is
# an element of that cell's singlecell_config(G, ...) result.
def multicell_config(G,nonzero_chambers):
    # The two alleles are not used here; the unpacking only keeps the
    # requirement that G unpacks into exactly two items.
    _allele_1, _allele_2 = G

    per_cell_configs = []
    for cell_chambers in nonzero_chambers:
        per_cell_configs.append(singlecell_config(G, cell_chambers))

    return itertools.product(*per_cell_configs)
    


In [15]:
epsilon = log10(0.0001)

# log10 prior of each genotype: the constant 1/16 for every entry of `genotypes`
genotype_priors = {}
for g in genotypes:
    genotype_priors[g] = log10(1 / 16)

n_cells = 3
alleles = list('ATGC')

# every distinct set of one or two alleles, as an alphabetically sorted tuple,
# e.g. ('A',) or ('A','T'); the list order comes from set iteration
mixed_alleles = list(set(
    tuple(sorted(set(x))) for x in itertools.product(alleles, repeat=2)
))

# accessed as mixed_allele_priors[x][y]
# where x is the number of chambers with reads
# and y is the allele mixture as an alphabetically sorted tuple
# every entry starts at -1e100, which acts as "log10 of zero" for addlogs
mixed_allele_priors = {}
for i in range(1, 5):
    mixed_allele_priors[i] = {}
    for mixed_allele in mixed_alleles:
        mixed_allele_priors[i][mixed_allele] = -1e100

for i in range(1, 5):
    for G in genotypes:
        # chambers 0..i-1 are the ones taken to have reads
        nz = list(range(i))
        cfgs = singlecell_config(G, nz)

        for (c1, c2, c3, c4), p in cfgs:
            occupied = {c1, c2, c3, c4}

            # probability of configuration, weighted by the genotype prior
            # and divided evenly among the distinct positions it occupies
            # NOTE(review): `occupied` can include 24 ("no chamber") -- confirm
            # that it is meant to be counted like a real chamber here
            p += genotype_priors[G] - log10(len(occupied))

            # strands 1,2 carry G[0]; strands 3,4 carry G[1]
            strand_alleles = ((c1, G[0]), (c2, G[0]), (c3, G[1]), (c4, G[1]))

            for j in occupied:
                # distinct alleles whose strands sit at position j
                alleles_present = tuple(sorted({a for c, a in strand_alleles if c == j}))

                mixed_allele_priors[i][alleles_present] = addlogs(mixed_allele_priors[i][alleles_present], p)
                                                
# compute probability that allele is present in chamber
def pr_allele(allele, chamber, base_data, qual_data, configurations):
    """Return posterior probabilities for every allele mixture in mixed_alleles.

    The result is a list of (mixture, probability) tuples, one per entry of
    the module-level ``mixed_alleles``, with the probability in linear (not
    log) space.

    NOTE(review): the ``allele`` argument is not used; posteriors are computed
    for all mixtures. The original loop variable shadowed it, so it has been
    renamed here to make that explicit.
    """
    # log-likelihood of the data under each candidate mixture
    probs = [
        pr_all_chamber_data(candidate, chamber, base_data, qual_data, configurations)
        for candidate in mixed_alleles
    ]

    # NOTE(review): allele_priors is not defined in the visible source. It is
    # presumably a sequence of log10 priors aligned with mixed_alleles --
    # confirm and define it before calling this function.
    joint = [p + ap for p, ap in zip(probs, allele_priors)]

    # denominator for bayes rule posterior calculation
    total = -1e100
    for log_joint in joint:
        total = addlogs(total, log_joint)

    res = []
    for a, log_joint in zip(mixed_alleles, joint):
        # Bayes rule: likelihood * prior / evidence. The numerator must carry
        # the same prior term that was summed into the denominator.
        posterior = log_joint - total
        # list.append takes a single argument, so the pair goes in as a tuple
        res.append((a, 10**posterior))

    return res

def pr_all_chamber_data(allele, chamber, base_data, qual_data, configs):
    """Return the log10 of a sum over genotypes and configurations.

    For every genotype in ``genotypes`` and every entry of ``configs`` the
    genotype prior, the per-cell configuration log-probabilities and the
    per-chamber data terms (for chambers whose allele mixture equals
    ``allele``) are added in log space; the terms are combined with addlogs.

    NOTE(review): ``chamber`` is not used. pr_one_chamber_data exists only as
    a commented-out stub in the visible source and must be defined before
    this function can run.
    """
    # configs may be a one-shot iterator (multicell_config returns an
    # itertools.product object); materialise it so that it is not exhausted
    # after the first genotype of the outer loop.
    configs = list(configs)

    total = -1e100  # stands in for log10(0)

    for G in genotypes:
        for config in configs:

            p = genotype_priors[G]

            # each element of config is ((c1,c2,c3,c4), log-probability) for one cell
            for (c1, c2, c3, c4), p_cell_cfg in config:
                p += p_cell_cfg

            # NOTE(review): c1..c4 are those of the LAST cell in config only;
            # confirm whether the chamber loop below should run once per cell.
            # strands 1,2 carry G[0]; strands 3,4 carry G[1]
            strand_alleles = ((c1, G[0]), (c2, G[0]), (c3, G[1]), (c4, G[1]))

            for i in range(24):
                # distinct alleles whose strands sit in chamber i
                alleles_present = tuple(sorted({a for c, a in strand_alleles if c == i}))

                if alleles_present == allele:
                    # index the data with the chamber loop variable; the
                    # original used `j`, which is not defined in this function
                    p += pr_one_chamber_data(alleles_present, base_data[i], qual_data[i])

            total = addlogs(total, p)

    # the caller (pr_allele) adds this value to a prior, so it must be returned
    return total
            
            
#def pr_one_chamber_data(alleles_present, base_data, qual_data):
    
    #if len(alleles_present) == 1           

In [16]:
# Unordered allele pairs as alphabetically sorted 2-tuples. Repeated alleles
# stay as pairs such as ('A','A') because x is sorted without being collapsed
# through set() first.
mixed_alleles = list(set(tuple(sorted(x)) for x in itertools.product(alleles, repeat=2)))

In [17]:
# display the current allele pairs (built from a set, so the order is arbitrary)
mixed_alleles

[('C', 'T'),
 ('G', 'G'),
 ('C', 'C'),
 ('A', 'G'),
 ('A', 'C'),
 ('T', 'T'),
 ('A', 'A'),
 ('C', 'G'),
 ('G', 'T'),
 ('A', 'T')]

In [18]:
# scratch check: chambers 0..2 plus "no chamber" (24), and a lazy iterator
# over every ordered 4-tuple of strand placements drawn from them
nz = [0, 1, 2] + [24]
it = itertools.product(nz, repeat=4)

In [19]:
# pull the first tuple from the product iterator (this consumes it from `it`)
next(it)

(0, 0, 0, 0)

In [23]:
# sanity check: print the 4-chamber priors in linear space and their sum
total = 0
for k, v in mixed_allele_priors[4].items():
    prob = 10**v  # stored values are log10 probabilities
    print("{}\t{}".format(k, prob))
    total += prob
print(total)

('G',)	0.24999999999999986
('C', 'T')	0.0
('A', 'T')	0.0
('T',)	0.2499999999999998
('A', 'G')	0.0
('G', 'T')	0.0
('A', 'C')	0.0
('C',)	0.24999999999999967
('A',)	0.2499999999999998
('C', 'G')	0.0
0.9999999999999991
