# Hidden Markov Models

# Generalized Viterbi Decoding Algorithm
- states --> tuple of strings representing Q, the set of all possible hidden states
- obs --> the observed sequence of emission characters, each drawn from the emission alphabet Σ
- s_pro --> starting probability dictionary that maps each state to a probability
- t_pro --> transition probability that maps each state to another dict that maps each state to a probability, represents A
- e_pro --> emission probability that maps each state to another dict that maps each emission character to a probability, represents E

In [None]:
def Viterbi(obs, states, s_pro, t_pro, e_pro):
    """Return the most probable hidden-state path for ``obs`` and its probability.

    Args:
        obs: indexable sequence of emitted symbols (e.g. a string or list).
        states: re-iterable collection of hidden-state labels.
        s_pro: dict mapping each state to its starting probability.
        t_pro: dict mapping each state to a dict of next-state probabilities.
        e_pro: dict mapping each state to a dict of symbol probabilities.

    Returns:
        A tuple ``(path, probability)``. ``path`` is a list holding one state
        per observation and ``probability`` is the joint probability of that
        path and ``obs``. An empty ``obs`` yields ``([], 1.0)``.
    """
    if len(obs) == 0:
        return ([], 1.0)

    # Probability of the best path ending in each state after the first symbol.
    curr_pro = {s: s_pro[s] * e_pro[s][obs[0]] for s in states}

    # back_pointers[i - 1][s] is the best predecessor of state s at position i.
    back_pointers = []
    for i in range(1, len(obs)):
        last_pro = curr_pro
        curr_pro = {}
        pointers = {}
        for curr_state in states:
            # Tuples compare by probability first; equal probabilities fall
            # back to comparing the state labels.
            max_pro, last_sta = max(
                (last_pro[last_state] * t_pro[last_state][curr_state]
                 * e_pro[curr_state][obs[i]], last_state)
                for last_state in states)
            curr_pro[curr_state] = max_pro
            pointers[curr_state] = last_sta
        back_pointers.append(pointers)

    # Choose the best final state; the earliest state in `states` wins ties.
    max_pro = -1
    best_state = None
    for s in states:
        if curr_pro[s] > max_pro:
            best_state = s
            max_pro = curr_pro[s]
    if best_state is None:
        return (None, max_pro)

    # Follow the back-pointers from the final state to the first position.
    # Each step must look up the predecessor of the state actually chosen,
    # not the predecessor of the final state.
    max_path = [best_state]
    for pointers in reversed(back_pointers):
        max_path.append(pointers[max_path[-1]])
    max_path.reverse()
    return (max_path, max_pro)

### Mixed Genome Problem
- given a set of alleles from the alphabet {A,C,G,T,H,N} (where H and N indicate probe errors) from 8 founder genomes and a 9th hybrid genome, determine the founder genome that most likely contributed to each allele in the hybrid genome
    - i.e. determine the most likely sequence of mutations

#### Read the Genome data

In [139]:
# Load the genotype table. After the header, each row holds chromosome,
# position and the eight founder calls in columns 2-9.
with open("GenomeData.csv", 'r') as fp:   # closed even if the read fails
    data = fp.read().split('\n')          # break file into lines
header = data.pop(0).split(',')           # first line is the header
column = header.index("owenmc")           # only used to label the last printed column

# Drop blank lines (e.g. from a trailing newline); they would otherwise fail
# the integer conversion of the position field below.
data = [line for line in data if line.strip()]

for i, line in enumerate(data):           # make a list from each row
    field = line.split(',')
    field[1] = int(field[1])              # convert position to integer
    founders = field[2:10]
    # The appended call is the first founder allele that differs from
    # column 3, or the last founder's allele when none differs.
    # NOTE(review): the value is not read from field[column] - confirm that
    # synthesising the hybrid call this way is intended.
    base = next((call for call in founders if call != field[3]), founders[-1])
    data[i] = field[0:10] + [base]

print(header[0:10] + [header[column]])

# Preview up to the first 10 parsed rows.
for row in data[:10]:
    print(row)

['Chromosome', 'Position', 'A/J', 'C57BL/6J', '129S1/SvImJ', 'NOD/ShiLtJ', 'NZO/HlLtJ', 'CAST/EiJ', 'PWK/PhJ', 'WSB/EiJ', 'owenmc']
['13', 3561535, 'A', 'C', 'A', 'C', 'C', 'C', 'C', 'C', 'A']
['13', 4029227, 'A', 'A', 'A', 'A', 'G', 'G', 'A', 'A', 'G']
['13', 4752533, 'C', 'T', 'C', 'T', 'C', 'T', 'C', 'T', 'C']
['13', 5392085, 'T', 'G', 'T', 'T', 'G', 'T', 'G', 'T', 'T']
['13', 5945123, 'G', 'G', 'G', 'A', 'G', 'A', 'G', 'G', 'A']
['13', 6304055, 'G', 'A', 'G', 'G', 'G', 'G', 'G', 'G', 'G']
['13', 7088753, 'T', 'T', 'T', 'C', 'C', 'T', 'C', 'T', 'C']
['13', 7553047, 'A', 'A', 'A', 'C', 'C', 'A', 'C', 'C', 'C']
['13', 7999300, 'G', 'G', 'G', 'T', 'T', 'G', 'T', 'G', 'T']
['13', 8618531, 'T', 'T', 'T', 'C', 'C', 'C', 'C', 'C', 'C']


#### Viterbi Decoding Algorithm Dynamic Program

In [15]:
from math import exp, log10

Nstates = 8
prevpos = 1
# state[t][j] = (score, back-pointer) of the best path ending in founder j
# after t probes. Every founder starts with the same score, so the initial
# value is only a constant offset on all later scores.
state = [[(float(len(data)),i) for i in range(Nstates)]]
for row in data:
    founders = row[2:2+Nstates]
    # Count expected genotypes
    count = {call: founders.count(call) for call in "ACGTHN"}
    # Get the target genotype at this probe
    observed = row[-1]
    # Compute emission probability: 0.99 is shared among founders matching the
    # observed call, 0.01 among the others (i.e. a 1% error rate).
    if count[observed] == 0:
        # unexpected call: no founder explains it, so all are equally likely
        emission = [1.0/Nstates for _ in range(Nstates)]
    else:
        emission = [0.99/count[call] if call == observed else 0.01/(Nstates - count[call])
                    for call in founders]
    # Compute transition probability from the distance to the previous probe
    position = row[1]
    delta = position - prevpos
    prevpos = position
    # NOTE(review): 23604450.50 is presumably a length scale in the same units
    # as the positions - confirm its origin.
    stay = ((Nstates - 1.0)*exp(-delta/23604450.50) + 1.0)/Nstates
    switch = (1.0 - stay)/(Nstates - 1.0)
    log_stay = log10(stay)
    log_switch = log10(switch)
    # Update state probabilities for all paths leading to this probe
    path = []
    for j in range(Nstates):
        log_emit = log10(emission[j])
        # choices is a list of tuples of (score, from_whence_I_arrived)
        choices = [(log_emit+(log_stay if (k==j) else log_switch)+state[-1][k][0],k)
                   for k in range(Nstates)]
        path.append(max(choices))
    state.append(path)
print("Length of paths:", len(state))

Length of paths: 261


#### Backtrack to find solution: position, allele, most likely founder genome

In [16]:
# Select the founder whose best path has the highest final score
# (the lowest index wins ties).
path = state[-1]
maxi = max(range(Nstates), key=lambda k: path[k][0])
maxp = path[maxi][0]
print(maxi, path[maxi], header[2+maxi])

# Walk the back-pointers from the last probe to the first, labelling each
# data row with the founder chosen for it.
for j in reversed(range(len(state) - 1)):
    data[j].append(header[2 + maxi])
    maxi = state[j + 1][maxi][1]

# Show position, observed call and assigned founder for the first 10 rows.
for row in data[:10]:
    print(row[1], row[-2], row[-1])

2 (-13.933684392229354, 2) 129S1/SvImJ
3561535 A NZO/HlLtJ
4029227 G NZO/HlLtJ
4752533 C NZO/HlLtJ
5392085 T NOD/ShiLtJ
5945123 A NOD/ShiLtJ
6304055 G NOD/ShiLtJ
7088753 C NOD/ShiLtJ
7553047 C NOD/ShiLtJ
7999300 T NOD/ShiLtJ
8618531 C NOD/ShiLtJ


# Fair Bet Casino Problem
- Σ = {H, T}
- Q = {F, B}
- A = [[0.9, 0.1],[0.1, 0.9]]
- E = [[0.5, 0.5],[0.75,0.25]]

- the unfair casino has fair and biased coins. The dealer swaps coins after each flip with probability of 0.1. The fair coin has 50-50 odds. The biased coin has 0.75, 0.25 odds for heads and tails, respectively.

- given a sequence of coin flips, use an HMM to determine the most likely sequence of coins used

In [131]:
# Observed coin-flip sequence for the casino example (H = heads, T = tails).
# NOTE(review): this rebinds `data`, which the genome cells use for their table.
data = "HTHHHHHHHHHTHTTTHTHTHHHHTTHHTHTTTHHTHHHHHHHTHHHTHHTTTHHHHTTHHHHHHHTTHTHHHHHHHTTH"

In [132]:
# Hidden states: F = fair coin, B = biased coin.
states = ('F', 'B')

# Emission alphabet: H = heads, T = tails.
observations = ('H', 'T')

# Either coin is equally likely to be the first one used.
start_probability = dict(F=0.5, B=0.5)

# The same coin is kept with probability 0.9 and swapped with probability 0.1.
transition_probability = dict(
    F=dict(F=0.9, B=0.1),
    B=dict(B=0.9, F=0.1),
)

# The fair coin is 50-50; the biased coin shows heads with probability 0.75.
emission_probability = dict(
    F=dict(H=0.5, T=0.5),
    B=dict(H=0.75, T=0.25),
)


def Viterbi(obs, states, s_pro, t_pro, e_pro):
    """Return the most probable hidden-state path for ``obs`` and its probability.

    Args:
        obs: indexable sequence of emitted symbols (e.g. a string or list).
        states: re-iterable collection of hidden-state labels.
        s_pro: dict mapping each state to its starting probability.
        t_pro: dict mapping each state to a dict of next-state probabilities.
        e_pro: dict mapping each state to a dict of symbol probabilities.

    Returns:
        A tuple ``(path, probability)``. ``path`` is a list holding one state
        per observation and ``probability`` is the joint probability of that
        path and ``obs``. An empty ``obs`` yields ``([], 1.0)``.
    """
    if len(obs) == 0:
        return ([], 1.0)

    # Probability of the best path ending in each state after the first symbol.
    curr_pro = {s: s_pro[s] * e_pro[s][obs[0]] for s in states}

    # back_pointers[i - 1][s] is the best predecessor of state s at position i.
    back_pointers = []
    for i in range(1, len(obs)):
        last_pro = curr_pro
        curr_pro = {}
        pointers = {}
        for curr_state in states:
            # Tuples compare by probability first; equal probabilities fall
            # back to comparing the state labels.
            max_pro, last_sta = max(
                (last_pro[last_state] * t_pro[last_state][curr_state]
                 * e_pro[curr_state][obs[i]], last_state)
                for last_state in states)
            curr_pro[curr_state] = max_pro
            pointers[curr_state] = last_sta
        back_pointers.append(pointers)

    # Choose the best final state; the earliest state in `states` wins ties.
    max_pro = -1
    best_state = None
    for s in states:
        if curr_pro[s] > max_pro:
            best_state = s
            max_pro = curr_pro[s]
    if best_state is None:
        return (None, max_pro)

    # Follow the back-pointers from the final state to the first position.
    # Each step must look up the predecessor of the state actually chosen,
    # not the predecessor of the final state.
    max_path = [best_state]
    for pointers in reversed(back_pointers):
        max_path.append(pointers[max_path[-1]])
    max_path.reverse()
    return (max_path, max_pro)

(['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B'], 4.829216754076891e-27)


### Coding and non Coding DNA Regions
- H state --> high GC content --> coding DNA region
- L state --> low GC content --> non coding DNA region

In [124]:
# Hidden states: H = high GC content, L = low GC content.
states = ('H', 'L')

# Emission alphabet: the four nucleotides.
observations = ('A', 'C', 'G', 'T')

# Both regions are equally likely at the first base.
start_probability = dict(H=0.5, L=0.5)

# Probability of moving from the outer state to each inner state.
transition_probability = dict(
    H=dict(H=0.5, L=0.5),
    L=dict(L=0.6, H=0.4),
)

# H favours C/G (0.3 each); L favours A/T (0.3 each).
emission_probability = dict(
    H=dict(A=0.2, C=0.3, G=0.3, T=0.2),
    L=dict(A=0.3, C=0.2, G=0.2, T=0.3),
)


def Viterbi(obs, states, s_pro, t_pro, e_pro):
    """Return the most probable hidden-state path for ``obs`` and its probability.

    Args:
        obs: indexable sequence of emitted symbols (e.g. a string or list).
        states: re-iterable collection of hidden-state labels.
        s_pro: dict mapping each state to its starting probability.
        t_pro: dict mapping each state to a dict of next-state probabilities.
        e_pro: dict mapping each state to a dict of symbol probabilities.

    Returns:
        A tuple ``(path, probability)``. ``path`` is a list holding one state
        per observation and ``probability`` is the joint probability of that
        path and ``obs``. An empty ``obs`` yields ``([], 1.0)``.
    """
    if len(obs) == 0:
        return ([], 1.0)

    # Probability of the best path ending in each state after the first symbol.
    curr_pro = {s: s_pro[s] * e_pro[s][obs[0]] for s in states}

    # back_pointers[i - 1][s] is the best predecessor of state s at position i.
    back_pointers = []
    for i in range(1, len(obs)):
        last_pro = curr_pro
        curr_pro = {}
        pointers = {}
        for curr_state in states:
            # Tuples compare by probability first; equal probabilities fall
            # back to comparing the state labels.
            max_pro, last_sta = max(
                (last_pro[last_state] * t_pro[last_state][curr_state]
                 * e_pro[curr_state][obs[i]], last_state)
                for last_state in states)
            curr_pro[curr_state] = max_pro
            pointers[curr_state] = last_sta
        back_pointers.append(pointers)

    # Choose the best final state; the earliest state in `states` wins ties.
    max_pro = -1
    best_state = None
    for s in states:
        if curr_pro[s] > max_pro:
            best_state = s
            max_pro = curr_pro[s]
    if best_state is None:
        return (None, max_pro)

    # Follow the back-pointers from the final state to the first position.
    # Each step must look up the predecessor of the state actually chosen,
    # not the predecessor of the final state.
    max_path = [best_state]
    for pointers in reversed(back_pointers):
        max_path.append(pointers[max_path[-1]])
    max_path.reverse()
    return (max_path, max_pro)

In [130]:
# Decode a short DNA string, one nucleotide per observation.
obs = list("GGCACTGAA")
print(
    Viterbi(
        obs,
        states,
        start_probability,
        transition_probability,
        emission_probability,
    )
)

(['H', 'H', 'H', 'L', 'L', 'L', 'L', 'L', 'L'], 4.251527999999999e-08)
