In [1]:
import numpy as np
from numpy import log2
import pandas as pd
from hmmlearn.hmm import MultinomialHMM

#### Setting up a toy HMM

Inspired by [this lecture](https://www.csb.pitt.edu/ComputationalGenomics/Lectures/Lec5.pdf)
from the University of Pittsburgh (starting at slide 11).

Consider two hidden states ("H" for high GC and "L" for low GC) which can each "emit" the nucleotides A, C, G and T.

We are using ["MultinomialHMM"](https://hmmlearn.readthedocs.io/en/latest/api.html#hmmlearn.hmm.MultinomialHMM)
because the emissions are discrete states.

In "hmmlearn" speak, **states** are **components** and emitted **symbols** are **features**.

The `emission_p` matrix gives the probability of a particular hidden state (high or low GC) emitting a particular nucleotide. For example, the probability of seeing an "A" given an underlying hidden state of "H" is 0.2 or 20%.

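The `initial_p` vector holds the probability of each state being found at the start of a sequence. When estimated from data, it is obtained by counting the number of times each state is found at the start of a sequence and dividing those counts by the total so that they sum to 1. In this model, the initial states are split evenly (50% and 50%).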

Finally, the `transition_p` matrix holds the probability of transitioning from one hidden state to another. In this model, for example, the probability of staying in state "H" is 0.5 or 50%.

In [2]:
# Hidden states and emitted symbols. A symbol's position in its list is the
# integer code used for it everywhere else (H=0, L=1; A=0, C=1, G=2, T=3).
states = ['H', 'L']
nucleotides = ['A', 'C', 'G', 'T']

# Emission probabilities, shape (len(states), len(nucleotides)):
# one row per hidden state, one column per nucleotide.
emission_p = np.array([
    [0.2, 0.3, 0.3, 0.2],   # emitted from H
    [0.3, 0.2, 0.2, 0.3],   # emitted from L
])

# Transition probabilities: row = current state, column = next state.
transition_p = np.array([
    [0.5, 0.5],   # leaving H
    [0.4, 0.6],   # leaving L
])

# Probability of starting a sequence in each state.
initial_p = np.array([0.5, 0.5])

`assert` statements are little tests to make sure that things are working the way we expect. They are *very* helpful for catching silly bugs.

In [3]:
# Spot-check that the transition and emission matrices were entered with the
# intended orientation: the row index is the current hidden state (0 = H, 1 = L).
# Exact == is fine here because the values are read back unchanged from the literals.
assert transition_p[0, 0] == 0.5 # P(next = H | current = H) = 0.5
assert transition_p[1, 0] == 0.4 # P(next = H | current = L) = 0.4
assert emission_p[0, 0] == 0.2   # P(emit A | state H) = 0.2
assert emission_p[1, 2] == 0.2   # P(emit G | state L) = 0.2

#### Build model

In [4]:
# Build the HMM directly from the hand-specified parameters; nothing is fitted.
# init_params='' names no parameters for initialisation -- presumably this stops
# hmmlearn from overwriting the values assigned below; confirm in the hmmlearn docs.
# NOTE(review): newer hmmlearn releases reportedly turned MultinomialHMM into a
# true multinomial model and moved this one-symbol-per-step behaviour to
# CategoricalHMM -- confirm against the installed version before re-running.
hmm = MultinomialHMM(n_components=len(states), 
                     verbose=False,  
                     init_params='')
hmm.transmat_ = transition_p     # state-to-state transition probabilities
hmm.emissionprob_ = emission_p   # per-state symbol emission probabilities
hmm.startprob_ = initial_p       # initial state probabilities
# Number of distinct symbols (4). Set by hand here -- presumably because it would
# otherwise only be inferred while fitting; verify.
hmm.n_features = len(nucleotides)

In [5]:
# Draw a 10-step sequence from the model with a fixed seed.
# The first returned array holds the emitted symbol codes, the second the
# hidden-state codes that go with them.
sample_dna, sample_hid = hmm.sample(n_samples=10, random_state=21)

In [6]:
# Flatten both arrays to 1-D so symbols and hidden states display side by side.
sample_dna.ravel(), sample_hid.ravel()

(array([1, 0, 0, 2, 2, 3, 0, 3, 3, 1]), array([0, 1, 0, 0, 0, 0, 0, 0, 1, 1]))

#### Some helper functions

You'll notice that the hmmlearn model works with integers ^^^.

The following functions encode sequences into integers, and then decode them back into sequences. We include some more assert statements as little tests to make sure things are working as we expect.

In [7]:
def encode_seq(symbols, seqtype='dna'):
    """Convert a sequence of symbols into a NumPy array of integer codes.

    Each symbol is replaced by its position in the relevant alphabet:
    `nucleotides` when `seqtype` is 'dna', `states` for any other value.
    A symbol that is not in the alphabet raises ValueError (from list.index).
    """
    alphabet = nucleotides if seqtype == 'dna' else states
    return np.array(list(map(alphabet.index, symbols)))

# Fixtures shared by the encode/decode checks.
test_hl = 'HHHLLL'
test_nuc = 'GGGAAA'

# The first and last encoded values must equal the position of the matching
# symbol in its alphabet.
assert encode_seq(test_hl, seqtype='states')[0] == states.index(test_hl[0])
assert encode_seq(test_hl, seqtype='states')[-1] == states.index(test_hl[-1])
assert encode_seq(test_nuc, seqtype='dna')[0] == nucleotides.index(test_nuc[0])
assert encode_seq(test_nuc, seqtype='dna')[-1] == nucleotides.index(test_nuc[-1])

def decode_seq(num_array, seqtype='dna'):
    """Convert a sequence of integer codes back into a string of symbols.

    Each code indexes into the relevant alphabet: `nucleotides` when
    `seqtype` is 'dna', `states` for any other value. The looked-up symbols
    are concatenated into one string.
    """
    alphabet = nucleotides if seqtype == 'dna' else states
    return ''.join(alphabet[code] for code in num_array)

# Round-trip checks: decoding an encoded sequence must give back the original.
assert decode_seq(encode_seq(test_nuc)) == test_nuc
# Any seqtype other than 'dna' selects the hidden-state alphabet; 'states' is
# used here so the label matches the encode_seq checks and says what it means.
assert decode_seq(encode_seq(test_hl, seqtype='states'), seqtype='states') == test_hl

In [8]:
# Show the sampled symbols and their hidden states as readable strings.
(
    decode_seq(sample_dna.reshape(-1), seqtype='dna'),
    decode_seq(sample_hid.reshape(-1), seqtype='hid'),
)

('CAAGGTATTC', 'HLHHHHHHLL')

The Viterbi algorithm returns the most probable sequence of hidden states (H or L) to explain the observations (DNA).

In [9]:
# The observed sequence that the following cells decode.
observed_dna = 'GGCACTGAA'
# Integer code (position in `nucleotides`) for each observed symbol.
obs_dna_e = encode_seq(observed_dna, seqtype='dna')
obs_dna_e.ravel()

array([2, 2, 1, 0, 1, 3, 2, 0, 0])

In [10]:
# predict() is given the observations as a single-column 2-D array (one row per
# position), hence the reshape(-1, 1).
# NOTE(review): presumably this runs hmmlearn's default (Viterbi) decoder --
# confirm against the hmmlearn docs.
mle_hid_indices = hmm.predict(obs_dna_e.reshape(-1, 1))
# Any seqtype other than 'dna' makes decode_seq use the hidden-state alphabet.
mle_hid = decode_seq(mle_hid_indices, seqtype='hid')
mle_hid

'HHHLLLLLL'

#### How Viterbi works

Assume that the probability at position *pos* depends only on the symbol emitted at *pos* and on the hidden state at position *pos-1* (the Markov assumption).

Start by calculating the probability of each hidden state given the first symbol "G" at position 0 and--simultaneously--the probability of starting in either hidden state.

Determine the most likely hidden state at position 0 by comparing the two probabilities.

Next, determine the probability of each hidden state at position 1, given the maximum probability of each hidden state previously calculated, and--simultaneously--emitting the second symbol "G".

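Determine the most likely hidden state at position 1 by comparing the two probabilities. (Strictly speaking, the Viterbi algorithm recovers the final path by tracing back from the most probable state at the last position; comparing the two values position by position is a simplification that gives the same answer for this example.)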


In [11]:
# A function to visually track our progress
# through Viterbi
def print_cum_prob(cprob, dna):
    """Print the Viterbi table as tab-separated rows, one per position.

    Each row shows the symbol from `dna` followed by the row-0 (H) and
    row-1 (L) entries of `cprob` for that position, to three decimals.
    `cprob` is indexed as cprob[row, column].
    """
    print('DNA\tp(H)\tp(L)')
    for col, symbol in enumerate(dna):
        p_high = cprob[0, col]
        p_low = cprob[1, col]
        print(f'{symbol}\t{p_high:0.3f}\t{p_low:0.3f}')

In [12]:
# Viterbi initialisation with raw (non-log) probabilities.
# cum_prob has one row per hidden state (0 = H, 1 = L) and one column per
# observed position; only column 0 is filled in here.
cum_prob = np.zeros((len(states), len(observed_dna)))
# Look the first symbol up from the sequence itself instead of hard-coding 'G',
# so this cell stays correct if observed_dna is changed.
gi = nucleotides.index(observed_dna[0])
# Score at position 0 = P(state emits the first symbol) * P(start in state).
cum_prob[0, 0] = emission_p[0, gi] * initial_p[0]
cum_prob[1, 0] = emission_p[1, gi] * initial_p[1]

print_cum_prob(cum_prob, observed_dna)

DNA	p(H)	p(L)
G	0.150	0.100
G	0.000	0.000
C	0.000	0.000
A	0.000	0.000
C	0.000	0.000
T	0.000	0.000
G	0.000	0.000
A	0.000	0.000
A	0.000	0.000


In [13]:
# One Viterbi step (raw probabilities) for position 1.
pos = 1
pi = nucleotides.index(observed_dna[pos])  # column of the symbol observed at pos (not the constant pi)
# Best score of a path ending in H at pos: emit the symbol from H, times the
# better of (H at pos-1 -> H) and (L at pos-1 -> H).
cum_prob[0, pos] = emission_p[0, pi] * max(cum_prob[0, pos-1]*transition_p[0, 0], cum_prob[1, pos-1]*transition_p[1, 0])
# Same for a path ending in L: the better of (H at pos-1 -> L) and (L at pos-1 -> L).
cum_prob[1, pos] = emission_p[1, pi] * max(cum_prob[0, pos-1]*transition_p[0, 1], cum_prob[1, pos-1]*transition_p[1, 1])

print_cum_prob(cum_prob, observed_dna)

DNA	p(H)	p(L)
G	0.150	0.100
G	0.022	0.015
C	0.000	0.000
A	0.000	0.000
C	0.000	0.000
T	0.000	0.000
G	0.000	0.000
A	0.000	0.000
A	0.000	0.000


`max(cum_prob[0, pos-1]*transition_p[0, 0], cum_prob[1, pos-1]*transition_p[1, 0])`

^ This is where things can start to seem confusing, but it becomes intuitive once you take a moment to work through it.

For both H and L, we are selecting the most likely of two possibilities (i.e., the max() operation).
For H, the two possibilities are an H at the previous step transitioning to an H at this step 

OR 

an L at the previous step transitioning to an H at this step.

Similarly for L.

We've already calculated the probability of an H or an L at the previous step, so we multiply those values with the transition probabilities.

In [14]:
# The same Viterbi step, now for position 2; it reads the column for position 1
# of cum_prob, so the cell that fills position 1 must have been run first.
pos = 2
pi = nucleotides.index(observed_dna[pos])  # column of the symbol observed at pos
# Path ending in H: emission from H times the better of the two ways of arriving in H.
cum_prob[0, pos] = emission_p[0, pi] * max(cum_prob[0, pos-1]*transition_p[0, 0], cum_prob[1, pos-1]*transition_p[1, 0])
# Path ending in L: emission from L times the better of the two ways of arriving in L.
cum_prob[1, pos] = emission_p[1, pi] * max(cum_prob[0, pos-1]*transition_p[0, 1], cum_prob[1, pos-1]*transition_p[1, 1])

print_cum_prob(cum_prob, observed_dna)

DNA	p(H)	p(L)
G	0.150	0.100
G	0.022	0.015
C	0.003	0.002
A	0.000	0.000
C	0.000	0.000
T	0.000	0.000
G	0.000	0.000
A	0.000	0.000
A	0.000	0.000


And so on ...

The only additional detail is that we generally operate using logarithms instead of the raw probabilities to avoid underflow (notice how the probabilities are getting progressively smaller as the algorithm progresses^^^). Recall that using logs, multiplications become summations.

The whole thing generalizes to:

In [15]:
# Viterbi in log2 space, with back-pointers and a traceback.
#
# cum_prob[s, pos] is the log2 score of the best path that ends in state s at
# position pos. Choosing the larger entry independently at every position does
# not, in general, give the best *path*, so the best predecessor of every cell
# is recorded and the path is recovered by walking back from the best final
# state.
n_states, n_obs = len(states), len(observed_dna)
log_initial = log2(initial_p)
log_transition = log2(transition_p)   # [from_state, to_state]
log_emission = log2(emission_p)       # [state, symbol]

# Fresh tables, so the cell does not depend on values left by earlier cells.
cum_prob = np.zeros((n_states, n_obs))
best_prev = np.zeros((n_states, n_obs), dtype=int)   # column 0 is never read

# Initialization
pos = 0
pi = nucleotides.index(observed_dna[pos])
cum_prob[:, pos] = log_emission[:, pi] + log_initial

# Continuation
for pos in range(1, n_obs):
    pi = nucleotides.index(observed_dna[pos])
    # via[i, j]: score of the best path ending in state i at pos-1, extended by
    # the transition i -> j (products become sums in log space).
    via = cum_prob[:, pos-1, np.newaxis] + log_transition
    # For each destination state j, remember which predecessor i was best.
    # argmax returns the first maximum, so exact ties resolve to the lower index (H).
    best_prev[:, pos] = via.argmax(axis=0)
    cum_prob[:, pos] = log_emission[:, pi] + via.max(axis=0)

# Termination and traceback: start from the best final state and follow the
# stored predecessors back to position 0.
state_path = [int(cum_prob[:, -1].argmax())]
for pos in range(n_obs - 1, 0, -1):
    state_path.append(int(best_prev[state_path[-1], pos]))
state_path.reverse()
mle = [states[s] for s in state_path]

print('Cumulative log probabilities and MLE')
print('DNA\tlog_p(H) log_p(L) MLE')
for i in range(cum_prob.shape[1]):
    print(f'{observed_dna[i]}\t{cum_prob[0, i]:.2f}\t {cum_prob[1, i]:.2f}\t  {mle[i]}')

Cumulative log probabilities and MLE
DNA	log_p(H) log_p(L) MLE
G	-2.74	 -3.32	  H
G	-5.47	 -6.06	  H
C	-8.21	 -8.80	  H
A	-11.53	 -10.95	  L
C	-14.01	 -14.01	  L
T	-17.33	 -16.48	  L
G	-19.54	 -19.54	  L
A	-22.86	 -22.01	  L
A	-25.66	 -24.49	  L


Thus, the most likely sequence of hidden states, given the model parameters and the observed DNA, is HHHLLLLLL. 

#### Sample training data for RNN

As a final step, we use the model to generate 100 training sequences of length 500. These training sequences include both randomly generated DNA and its matching hidden states. These will be used for training a recurrent neural network later in notebook 2.

In [16]:
# Write N sampled sequences of length L to a tab-separated file, one sequence
# per row: the decoded DNA string, then the matching hidden-state string.
N = 100   # number of sequences to generate
L = 500   # length of each sequence
outfile = 'rnn_toy_training.tsv'
with open(outfile, 'w') as out:
    out.write('dna\thidden_state\n')   # header row
    for i in range(N):
        # The loop index is also the seed, so every draw uses a different,
        # fixed random_state.
        # Note: this reuses (and overwrites) the sample_dna / sample_hid names
        # from the earlier 10-step sampling cell.
        sample_dna, sample_hid = hmm.sample(n_samples=L, random_state=i)
        dna = decode_seq(sample_dna.reshape(-1), seqtype='dna')
        # Any seqtype other than 'dna' makes decode_seq use the hidden-state alphabet.
        hid = decode_seq(sample_hid.reshape(-1), seqtype='hid')
        out.write(f'{dna}\t{hid}\n')