# Hidden Markov Models

### Mixed Genome Problem
- given a set of alleles from the alphabet {A,C,G,T,H,N} (where H and N indicate probe errors) from 8 founder genomes and a 9th hybrid genome, determine the founder genome that most likely contributed to each allele in the hybrid genome
    - i.e. determine the most likely set of mutations

In [1]:
# Load the genotype table. The first line is the header; every other
# non-blank line is one comma-separated probe row.
with open("GenomeData.csv", 'r') as fp:     # closed even if read() raises
    data = fp.read().split('\n')            # break file into lines
header = data.pop(0).split(',')             # first line is header
column = header.index("owenmc")
# Drop blank lines (e.g. the empty string left behind by a trailing newline);
# they have no position field and would raise IndexError below.
data = [line for line in data if line.strip()]
for i, line in enumerate(data):             # make a list from each row
    field = line.split(',')
    field[1] = int(field[1])                # convert position to integer
    # After this loop `base` is the first value in field[2:10] that differs
    # from field[3], or the last of those values when none differ.
    # NOTE(review): `column` (the index of the "owenmc" header) is never used,
    # and the value appended below comes from the founder columns rather than
    # from that column -- confirm whether field[column] was intended.
    for base in field[2:10]:
        if base != field[3]:
            break
    data[i] = field[0:10] + [base]

# Preview the first rows; bounded so short files do not raise IndexError.
for i in range(min(10, len(data))):
    print("data[%d] = %s" % (i, data[i]))

FileNotFoundError: [Errno 2] No such file or directory: 'GenomeData.csv'

In [None]:
from math import exp, log10

# Forward (scoring) pass of a Viterbi-style search over `Nstates` founder
# states, one step per row of `data`.
Nstates = 8
prevpos = 1
# state[t][j] = (best log10 score of a path ending in founder j after t rows,
#                founder index that path was in at the previous row).
# The starting score is identical for every founder, so it does not bias
# the max() comparisons below.
state = [[(float(len(data)), i) for i in range(Nstates)]]   # (log(p), PathToHere)
for row in data:
    founders = row[2:2 + Nstates]
    # Count how many founders carry each possible call at this probe
    count = {call: founders.count(call) for call in "ACGTHN"}
    # Get the target genotype at this probe (last field of the row)
    observed = row[-1]
    # Compute emission weights using the 0.99 / 0.01 split coded below
    if count[observed] == 0:
        # No founder carries the observed call: every founder equally likely
        emission = [1.0 / Nstates for _ in range(Nstates)]
    else:
        # Founders matching the observation share 0.99; each non-matching
        # founder gets 0.01 / (Nstates - count of its own call).
        emission = [0.99 / count[call] if call == observed
                    else 0.01 / (Nstates - count[call])
                    for call in founders]
    # Compute transition probability from the distance to the previous probe
    position = row[1]
    delta = position - prevpos
    prevpos = position
    # NOTE(review): 23604450.50 is presumably a distance scale in the same
    # units as `position` -- confirm its origin.
    stay = ((Nstates - 1.0) * exp(-delta / 23604450.50) + 1.0) / Nstates
    switch = (1.0 - stay) / (Nstates - 1.0)
    # Take the logs once per row instead of once per (j, k) pair.
    log_stay = log10(stay)
    # delta == 0 gives stay == 1.0 and switch == 0.0; log10(0.0) would raise
    # ValueError, so score a switch as impossible (-inf) in that case.
    # A negative `switch` (positions going backwards) still raises as before.
    log_switch = log10(switch) if switch != 0.0 else float("-inf")
    # Update state probabilities for all paths leading to this row
    path = []
    for j in range(Nstates):
        log_emit = log10(emission[j])
        choices = [(log_emit + (log_stay if k == j else log_switch) + state[-1][k][0], k)
                   for k in range(Nstates)]
        # choices is a list of (score, previous founder) tuples; max() keeps
        # the best score and breaks exact ties toward the larger index.
        path.append(max(choices))
    state.append(path)
print("Length of paths:", len(state))