In [44]:
import numpy as np
import string
import codecs
from itertools import product
from pprint import pprint

### from HMM packet 
## Problem 2a

For this problem, use the same model $\lambda$ and observation sequence $O$ given in Problem 1.

The model is $λ = (A, B, π)$ where
$A = 
\begin{bmatrix}
0.7 & 0.3 \\
0.4 & 0.6
\end{bmatrix}$
, $B =
\begin{bmatrix}
0.1 &0.4& 0.5 \\
0.7 &0.2 &0.1
\end{bmatrix}$
, $π =
\begin{bmatrix}
0.0& 1.0
\end{bmatrix}$.

Furthermore, suppose the hidden states correspond to $H$ and $C$, respectively, and the observations are $S$, $M$, and $L$, respectively. In this problem, we consider the observation sequence $O = (O_0, O_1, O_2) = (M, S, L)$.


a) Determine the “best” hidden state sequence $(X_0, X_1, X_2)$ in the dynamic programming sense.

In [45]:
# Dynamic-programming (Viterbi-style) recursion for Problem 2a.
# Convention used in THIS cell: A1[j, i] is the probability of moving from
# state j to state i (rows sum to 1) and B1[i, k] is the probability that
# state i emits observation k.
A1 = np.array([[.7,.3],
              [.4,.6]])
B1 = np.array([[.1,.4,.5],
              [.7,.2,.1]])
pi1 = np.array([0., 1.])
Obs = np.array([1,0,2])   # O = (M, S, L) with the encoding S=0, M=1, L=2
states = ['H','C']
path_probs = dict()

# Read the sizes off the arrays so they cannot drift out of sync with the model.
N, M = B1.shape   # number of hidden states, number of observation symbols
T = len(Obs)      # length of the observation sequence

# delta[t, i] = probability of the best state path that ends in state i at time t
delta = np.zeros((T,N))
for i in range(N):
    delta[0,i] = pi1[i]*B1[i,Obs[0]]
    path_probs[states[i]+'0'] = delta[0,i]

# Recurse over every remaining time step (previously hard-coded as range(1,3)).
for t in range(1,T):
    for i in range(N):
        # best predecessor j maximizes delta[t-1, j] * A1[j, i]
        delta[t,i] = np.max(delta[t-1]*A1[:,i]*B1[i,Obs[t]])
        path_probs[states[i]+str(t)] = delta[t,i]
prob = delta[-1].max()
print(prob)
pprint(path_probs)

0.0168
{'C0': 0.20000000000000001,
 'C1': 0.083999999999999991,
 'C2': 0.0050399999999999993,
 'H0': 0.0,
 'H1': 0.0080000000000000019,
 'H2': 0.016799999999999999}


So optimal path is CCH

## Problem 4

Write the re-estimation formulae (9), (10) and (11) directly in terms of $\alpha$ and $\beta$.

(9) $\pi_i = \gamma_0(i) = \frac{\alpha_0(i)\beta_0(i)}{P(O|\lambda)}$ <br>
(10) $a_{ij} = \sum_{t=0}^{T-2}\gamma_t(i,j) / \sum_{t=0}^{T-2}\gamma_t(i) = \sum_{t=0}^{T-2}\alpha_t(i)a_{ij}b_j(O_{t+1})\beta_{t+1}(j) / \sum_{t=0}^{T-2}\sum_{j=0}^{N-1}\alpha_t(i)a_{ij}b_j(O_{t+1})\beta_{t+1}(j) $ <br>
(11) $b_j(k) = \sum_{t\in\{0,1,...,T-1\};\, O_t=k} \gamma_t(j) / \sum_{t=0}^{T-1} \gamma_t(j) = \sum_{t\in\{0,1,...,T-1\};\, O_t=k} \alpha_t(j)\beta_t(j) / \sum_{t=0}^{T-1} \alpha_t(j)\beta_t(j)$

### from Discrete Hidden Markov Models (lab 18)

In [34]:
class hmm():
    """
    Discrete hidden Markov model trained with the scaled Baum-Welch algorithm.

    Conventions shared by every method (N hidden states, M observation symbols):
    A : ndarray of shape (N,N), column-stochastic
        A[j,i] is the probability of moving from state i to state j
    B : ndarray of shape (M,N), column-stochastic
        B[k,i] is the probability that state i emits observation k
    pi : ndarray of shape (N,)
        The initial state distribution
    """
    def __init__(self, A=None, B=None, pi=None):
        self.A = A
        self.B = B
        self.pi = pi
        # Log-probability of the training sequence from the most recent
        # forward pass run by fit(); None until fit() has been called.
        self.logprob = None

    def _forward(self, obs):
        """
        Compute the scaled forward probability matrix and scaling factors.

        Parameters
        ----------
        obs : ndarray of shape (T,)
            The observation sequence (integer symbol indices)

        Returns
        -------
        alpha : ndarray of shape (T,N)
            The scaled forward probability matrix (each row sums to 1)
        c : ndarray of shape (T,)
            The scaling factors c = [c_1,c_2,...,c_T]; the log-probability
            of obs is -sum(log(c))
        """
        A = self.A
        B = self.B
        pi = self.pi
        T = len(obs)
        N = self.A.shape[0]
        alpha = np.zeros((T,N))
        c = np.zeros(T)
        c[0] = 1./np.dot(pi,B[obs[0]])
        alpha[0] = c[0]*(pi*B[obs[0]])
        for t in range(1,T):
            # propagate one step, weight by the emission, then rescale
            unscaled = A.dot(alpha[t-1])*B[obs[t]]
            c[t] = 1./unscaled.sum()
            alpha[t] = c[t]*unscaled
        return alpha, c

    def _backward(self, obs, c):
        """
        Compute the scaled backward probability matrix.

        Parameters
        ----------
        obs : ndarray of shape (T,)
            The observation sequence
        c : ndarray of shape (T,)
            The scaling factors from the forward pass

        Returns
        -------
        beta : ndarray of shape (T,N)
            The scaled backward probability matrix
        """
        A = self.A
        B = self.B
        T = len(obs)
        N = A.shape[0]
        beta = np.zeros((T,N))
        beta[-1] = c[-1]
        for t in range(T-2,-1,-1):
            beta[t] = c[t]*(A.T).dot(B[obs[t+1]]*beta[t+1])
        return beta

    def _delta(self, obs, alpha, beta):
        """
        Compute the delta (pairwise) and gamma (single-state) probabilities.

        Parameters
        ----------
        obs : ndarray of shape (T,)
            The observation sequence
        alpha : ndarray of shape (T,N)
            The scaled forward probability matrix from the forward pass
        beta : ndarray of shape (T,N)
            The scaled backward probability matrix from the backward pass

        Returns
        -------
        delta : ndarray of shape (T-1,N,N)
            delta[t,i,j] is the posterior probability of being in state i at
            time t and state j at time t+1
        gamma : ndarray of shape (T,N)
            gamma[t,i] is the posterior probability of being in state i at time t
        """
        A = self.A
        B = self.B
        T = len(obs)
        N = A.shape[0]
        delta = np.zeros((T-1, N, N))
        gamma = np.zeros((T,N))
        for t in range(0,T-1):
            # joint[i,j] = alpha[t,i] * A[j,i] * B[obs[t+1],j] * beta[t+1,j]
            joint = alpha[t][:,np.newaxis]*A.T*(B[obs[t+1]]*beta[t+1])
            delta[t] = joint/joint.sum()
            gamma[t] = delta[t].sum(axis=1)
        # the last time step has no successor, so gamma comes from alpha*beta
        gamma[-1] = alpha[-1]*beta[-1]/alpha[-1].dot(beta[-1])
        return delta, gamma

    def _estimate(self, obs, delta, gamma):
        """
        Re-estimate self.A, self.B and self.pi from the posterior probabilities.

        Parameters
        ----------
        obs : ndarray of shape (T,)
            The observation sequence
        delta : ndarray of shape (T-1,N,N)
            The delta probability array
        gamma : ndarray of shape (T,N)
            The gamma probability array
        """
        # the boolean mask obs==k below needs an ndarray, not a list
        obs = np.asarray(obs)
        T = len(obs)
        # A[j,i] = (expected i->j transitions) / (expected visits to i before T-1)
        self.A = delta.sum(axis=0).T/gamma[:T-1].sum(axis=0)
        state_mass = gamma.sum(axis=0)
        # allocate as float so an integer-typed initial B cannot truncate values
        new_B = np.zeros(self.B.shape)
        for k in range(new_B.shape[0]):
            new_B[k] = gamma[obs==k,:].sum(axis=0)/state_mass
        self.B = new_B
        self.pi = gamma[0]

    def fit(self, obs, A, B, pi, max_iter=100, tol=1e-3):
        """
        Fit the model parameters to a given observation sequence.

        Iteration stops when the log-probability of obs changes by less than
        tol between two successive iterations, or after max_iter iterations.
        The log-probability from the last forward pass is kept in self.logprob.

        Parameters
        ----------
        obs : ndarray of shape (T,)
            Observation sequence on which to train the model.
        A : stochastic ndarray of shape (N,N)
            Initialization of state transition matrix
        B : stochastic ndarray of shape (M,N)
            Initialization of state observation matrix
        pi : stochastic ndarray of shape (N,)
            Initialization of initial state distribution
        max_iter : integer
            The maximum number of iterations to take
        tol : float
            The convergence threshold for change in log-probability
        """
        obs = np.asarray(obs)
        self.A, self.B, self.pi = A, B, pi

        previous_logprob = None  # nothing to compare against on the first pass
        for _ in range(max_iter):
            alpha, c = self._forward(obs)
            # log P(obs | model) is minus the sum of the LOGS of the scaling factors
            logprob = -np.log(c).sum()
            self.logprob = logprob
            if previous_logprob is not None and np.abs(logprob - previous_logprob) < tol:
                break
            # remember this value so the next iteration measures the actual change
            previous_logprob = logprob
            beta = self._backward(obs,c)
            delta, gamma = self._delta(obs, alpha, beta)
            self._estimate(obs, delta, gamma)
        

In [22]:
# toy HMM example to be used to check answers
# Small reference model used to check each method against known values.
# A and B are column-stochastic: every column sums to 1.
A = np.array([[.7, .4],
              [.3, .6]])
B = np.array([[.1,.7],
              [.4, .2],
              [.5, .1]])
pi = np.array([.6, .4])
obs = np.array([0, 1, 0, 2])

# Hand the parameters to the constructor, which stores them as attributes.
h = hmm(A=A, B=B, pi=pi)

In [23]:
alpha, c = h._forward(obs)
# the log-probability of the observations is minus the sum of log scaling factors
toy_logprob = -1*(np.log(c)).sum()
print(toy_logprob)
# Expected output should be -4.6429135909

-4.6429135909


In [24]:
# Scaled backward pass on the toy model, reusing the scaling factors c
# returned by the forward pass above.
beta = h._backward(obs, c)
print(beta)
# Expected output:
# [[ 3.1361635 2.89939354]
# [ 2.86699344 4.39229044]
# [ 3.898812 2.66760821]
# [ 3.56816483 3.56816483]]

[[ 3.1361635   2.89939354]
 [ 2.86699344  4.39229044]
 [ 3.898812    2.66760821]
 [ 3.56816483  3.56816483]]


# Problem 4

Added methods to compute $\delta$ and $\gamma$ to my class.

In [25]:
# Posterior probabilities for the toy model: delta has shape (T-1,N,N) and
# gamma has shape (T,N), as documented on hmm._delta.
delta, gamma = h._delta(obs, alpha, beta)
print(delta)
print(gamma)

[[[ 0.14166321  0.0465066 ]
  [ 0.37776855  0.43406164]]

 [[ 0.17015868  0.34927307]
  [ 0.05871895  0.4218493 ]]

 [[ 0.21080834  0.01806929]
  [ 0.59317106  0.17795132]]]
[[ 0.18816981  0.81183019]
 [ 0.51943175  0.48056825]
 [ 0.22887763  0.77112237]
 [ 0.8039794   0.1960206 ]]


# Problem 5

Added a parameter updating function.

In [26]:
# One re-estimation step; this overwrites h.A, h.B and h.pi.
h._estimate(obs,delta,gamma)
# Print the three updated parameters separated by blank lines.
for shown in (h.A, '', h.B, '', h.pi):
    print(shown)

[[ 0.55807991  0.49898142]
 [ 0.44192009  0.50101858]]

[[ 0.23961928  0.70056364]
 [ 0.29844534  0.21268397]
 [ 0.46193538  0.08675238]]

[ 0.18816981  0.81183019]


## Problem 6

Implement the learning algorithm by adding the following method to your hmm class:

In [29]:
def initialize_random(N,M,sigma = .01):
    """
    Draw a random, nearly uniform starting point for hmm.fit.

    Each entry is the uniform value plus Gaussian noise of standard deviation
    sigma; the absolute value keeps entries non-negative and the final
    normalization makes every distribution sum to exactly 1. A and B are
    normalized down their columns, matching the column-stochastic convention
    the hmm class uses (A.dot(alpha), B[obs[t]]).

    Parameters
    ----------
    N : integer
        Number of hidden states
    M : integer
        Number of distinct observation symbols
    sigma : float
        Standard deviation of the noise added to the uniform values

    Returns
    -------
    A : ndarray of shape (N,N)
        Column-stochastic state transition matrix
    B : ndarray of shape (M,N)
        Column-stochastic state observation matrix
    pi : ndarray of shape (N,)
        Initial state distribution
    """
    A = np.abs(np.random.randn(N,N)*sigma + 1./N)
    A /= A.sum(axis=0)
    # each column of B is a distribution over M symbols, so centre it on 1/M
    B = np.abs(np.random.randn(M,N)*sigma + 1./M)
    B /= B.sum(axis=0)
    pi = np.abs(np.random.randn(N)*sigma + 1./N)
    pi /= pi.sum()
    return A,B,pi

In [30]:
# added the fit method to my class

## Problem 7


You are now ready to train a HMM using the Declaration of Independence data.
Use N = 2 states and M =len(set(obs))= 27 observation values (26 lower case characters
and 1 whitespace character), and run for 200 iterations with the default value for tol. Generally
speaking, if you converge to a log probability greater than $-21550$,
then you have reached an
acceptable set of parameters for this dataset.
Once the learning algorithm converges, analyze the state observation matrix B. Note
which rows correspond to the largest and smallest probability values in each column of B, and
check the corresponding characters. The code below displays typical results for a well-converged
HMM. Note that the u before the " indicates that the string should be unicode, which will be
required for languages other than English

In [28]:
def vec_translate(a, my_dict):
    """
    Map every element of a NumPy array through a dictionary.

    Used to translate symbols to state numbers or vice versa. Every element
    of a must be a key of my_dict; the result has the same shape as a.
    """
    lookup = np.vectorize(my_dict.__getitem__)
    return lookup(a)

def prep_data(filename):
    """
    Read a UTF-8 text file and encode it as a sequence of symbol indices.

    The text is lower-cased, and ASCII punctuation, newlines and carriage
    returns are deleted (not replaced by spaces).

    Parameters
    ----------
    filename : str
        Path of the text file to read

    Returns
    -------
    symbols : list
        The distinct characters left in the text, sorted
    obs_sequence : ndarray
        For each character of the text, its index in symbols
    """
    # read the whole file as one lower-case string
    with codecs.open(filename, encoding='utf-8') as handle:
        text = handle.read().lower()

    # translation table that maps each unwanted character to None (= delete)
    strip_table = dict.fromkeys(map(ord, string.punctuation+"\n\r"))
    text = text.translate(strip_table)

    # sorted alphabet of the cleaned text, and each symbol's position in it
    symbols = sorted(set(text))
    symbol_index = dict((ch, idx) for idx, ch in enumerate(symbols))

    # one array element per character, then swap characters for their indices
    char_array = np.array(list(text))
    obs_sequence = vec_translate(char_array, symbol_index)

    return symbols, obs_sequence

     
# Encode the Declaration text; this rebinds the module-level names `symbols`
# and `obs` that the training cells below read.
symbols, obs = prep_data('declaration.txt')

In [31]:
# N is the number of hidden states; M is the number of distinct observation
# values that occur in obs.
N = 2
M = len(set(obs))

In [36]:
# Train a 2-state model on the Declaration text from a random starting point.
A,B,pi = initialize_random(N,M,sigma=.01)
declaration1 = hmm()
declaration1.fit(obs,A,B,pi,max_iter=300)

# One line per symbol: the symbol, then its emission probability in each state.
row_format = u"{0}, {1:0.4f}, {2:0.4f}"
for symbol, emission in zip(symbols, declaration1.B):
    print(row_format.format(symbol, emission[0], emission[1]))

 , 0.0027, 0.3308
a, 0.0000, 0.1231
b, 0.0240, 0.0000
c, 0.0466, 0.0000
d, 0.0638, 0.0000
e, 0.0000, 0.2217
f, 0.0455, 0.0000
g, 0.0329, 0.0000
h, 0.0741, 0.0145
i, 0.0000, 0.1159
j, 0.0040, 0.0000
k, 0.0030, 0.0005
l, 0.0570, 0.0007
m, 0.0364, 0.0000
n, 0.1213, 0.0009
o, 0.0010, 0.1315
p, 0.0349, 0.0000
q, 0.0015, 0.0000
r, 0.1075, 0.0000
s, 0.1209, 0.0000
t, 0.1619, 0.0000
u, 0.0000, 0.0540
v, 0.0187, 0.0000
w, 0.0245, 0.0000
x, 0.0023, 0.0000
y, 0.0143, 0.0063
z, 0.0010, 0.0000


What do you notice about the second column of B? It seems that the HMM has detected a vowel
state and a consonant state, without any prior input from an English speaker. Interestingly,
the whitespace character is grouped together with the vowels. A HMM can also detect the
vowel/consonant distinction in other languages.

Indeed, it looks like the second column corresponds to the vowels.

## Problem 8

Repeat the previous calculation with 3 hidden states and again with 4 hidden
states. Interpret/explain your results

In [37]:
# Same experiment as before, now with 3 hidden states.
N = 3
M = len(set(obs))
A,B,pi = initialize_random(N,M,sigma=.01)
declaration2 = hmm()
declaration2.fit(obs,A,B,pi,max_iter=300)

# One line per symbol: the symbol, then its emission probability in each state.
row_format = u"{0}, {1:0.4f}, {2:0.4f}, {3:0.4f}"
for symbol, emission in zip(symbols, declaration2.B):
    print(row_format.format(symbol, emission[0], emission[1], emission[2]))

 , 0.0000, 0.3785, 0.0426
a, 0.0000, 0.0602, 0.1616
b, 0.0001, 0.0000, 0.0540
c, 0.0433, 0.0000, 0.0344
d, 0.0882, 0.0000, 0.0000
e, 0.0614, 0.1955, 0.0312
f, 0.0571, 0.0000, 0.0096
g, 0.0435, 0.0000, 0.0033
h, 0.0006, 0.1080, 0.0000
i, 0.0000, 0.1074, 0.0590
j, 0.0056, 0.0000, 0.0000
k, 0.0040, 0.0006, 0.0004
l, 0.0508, 0.0014, 0.0447
m, 0.0385, 0.0000, 0.0194
n, 0.0497, 0.0000, 0.1945
o, 0.0000, 0.0970, 0.1146
p, 0.0207, 0.0000, 0.0449
q, 0.0000, 0.0018, 0.0000
r, 0.1029, 0.0065, 0.0629
s, 0.1311, 0.0018, 0.0558
t, 0.2233, 0.0007, 0.0000
u, 0.0000, 0.0337, 0.0573
v, 0.0259, 0.0000, 0.0000
w, 0.0281, 0.0000, 0.0096
x, 0.0013, 0.0016, 0.0001
y, 0.0225, 0.0052, 0.0000
z, 0.0014, 0.0000, 0.0000


Here, my hypothesis is that the first column corresponds to the last letters of words, because r, s, t, and y are all fairly probable (along with other characters). The second column is the vowels (but "a" is somewhat "missing"). Is the third column just everything else? Beginning and middle consonants plus "a"?

In [38]:
# Same experiment as before, now with 4 hidden states.
N = 4
M = len(set(obs))
A,B,pi = initialize_random(N,M,sigma=.01)
declaration3 = hmm()
declaration3.fit(obs,A,B,pi,max_iter=300)

# One line per symbol: the symbol, then its emission probability in each state.
row_format = u"{0}, {1:0.4f}, {2:0.4f}, {3:0.4f}, {4:0.4f}"
for symbol, emission in zip(symbols, declaration3.B):
    print(row_format.format(symbol, emission[0], emission[1],
                            emission[2], emission[3]))

 , 0.0925, 0.0412, 0.0541, 0.2984
a, 0.0000, 0.0000, 0.0000, 0.1381
b, 0.0000, 0.0220, 0.0392, 0.0000
c, 0.0326, 0.0293, 0.0646, 0.0000
d, 0.0995, 0.0709, 0.0082, 0.0000
e, 0.0237, 0.0000, 0.0000, 0.2404
f, 0.0012, 0.1053, 0.0000, 0.0000
g, 0.0559, 0.0185, 0.0213, 0.0000
h, 0.2632, 0.0188, 0.0000, 0.0000
i, 0.0299, 0.0000, 0.0000, 0.1195
j, 0.0074, 0.0041, 0.0000, 0.0000
k, 0.0045, 0.0038, 0.0000, 0.0006
l, 0.0383, 0.0920, 0.0176, 0.0000
m, 0.0000, 0.0831, 0.0003, 0.0008
n, 0.0122, 0.0107, 0.2940, 0.0051
o, 0.0561, 0.0098, 0.0000, 0.1241
p, 0.0000, 0.0794, 0.0008, 0.0006
q, 0.0016, 0.0024, 0.0000, 0.0000
r, 0.0165, 0.1971, 0.0483, 0.0000
s, 0.0810, 0.1072, 0.1274, 0.0033
t, 0.1391, 0.0197, 0.2884, 0.0042
u, 0.0000, 0.0000, 0.0000, 0.0605
v, 0.0046, 0.0404, 0.0000, 0.0000
w, 0.0047, 0.0239, 0.0345, 0.0000
x, 0.0000, 0.0041, 0.0014, 0.0000
y, 0.0355, 0.0136, 0.0000, 0.0044
z, 0.0000, 0.0024, 0.0000, 0.0000


Last column is vowels, and whitespace stuck with the vowels again. Again, I hypothesize that the first column is ending letter of word. No strong current beliefs about the second and third columns.

## Problem 9

Repeat the calculations for 2, and 3 hidden states for WarAndPeace.txt. Interpret/explain
your results. Which Cyrillic characters appear to be vowels?

In [41]:
# Switch to the Russian text; this rebinds `symbols` and `obs`.
symbols, obs = prep_data('WarAndPeace.txt')
N = 2
M = len(set(obs))
A,B,pi = initialize_random(N,M,sigma=.01)
warandpeace1 = hmm()
warandpeace1.fit(obs,A,B,pi,max_iter=300)

# One line per symbol: the symbol, then its emission probability in each state.
row_format = u"{0}, {1:0.4f}, {2:0.4f}"
for symbol, emission in zip(symbols, warandpeace1.B):
    print(row_format.format(symbol, emission[0], emission[1]))

 , 0.1610, 0.1669
а, 0.0725, 0.0681
б, 0.0156, 0.0145
в, 0.0417, 0.0370
г, 0.0164, 0.0191
д, 0.0239, 0.0224
е, 0.0742, 0.0615
ж, 0.0095, 0.0073
з, 0.0160, 0.0144
и, 0.0546, 0.0525
й, 0.0085, 0.0095
к, 0.0289, 0.0317
л, 0.0429, 0.0434
м, 0.0239, 0.0219
н, 0.0625, 0.0544
о, 0.0893, 0.1029
п, 0.0208, 0.0258
р, 0.0361, 0.0356
с, 0.0392, 0.0448
т, 0.0432, 0.0504
у, 0.0243, 0.0229
ф, 0.0012, 0.0012
х, 0.0062, 0.0070
ц, 0.0031, 0.0028
ч, 0.0122, 0.0110
ш, 0.0072, 0.0059
щ, 0.0035, 0.0022
ъ, 0.0003, 0.0002
ы, 0.0159, 0.0141
ь, 0.0169, 0.0188
э, 0.0021, 0.0032
ю, 0.0060, 0.0054
я, 0.0202, 0.0214
ё, 0.0000, 0.0000


In [40]:
# Russian text again, now with 3 hidden states; rebinds `symbols` and `obs`.
symbols, obs = prep_data('WarAndPeace.txt')
N = 3
M = len(set(obs))
A,B,pi = initialize_random(N,M,sigma=.01)
warandpeace2 = hmm()
warandpeace2.fit(obs,A,B,pi,max_iter=300)

# One line per symbol: the symbol, then its emission probability in each state.
row_format = u"{0}, {1:0.4f}, {2:0.4f}, {3:0.4f}"
for symbol, emission in zip(symbols, warandpeace2.B):
    print(row_format.format(symbol, emission[0], emission[1], emission[2]))

 , 0.0979, 0.0426, 0.4196
а, 0.0000, 0.1968, 0.0000
б, 0.0327, 0.0000, 0.0103
в, 0.0716, 0.0000, 0.0466
г, 0.0372, 0.0000, 0.0142
д, 0.0453, 0.0010, 0.0217
е, 0.0000, 0.1535, 0.0486
ж, 0.0188, 0.0000, 0.0050
з, 0.0272, 0.0000, 0.0186
и, 0.0000, 0.1427, 0.0095
й, 0.0000, 0.0000, 0.0336
к, 0.0581, 0.0000, 0.0316
л, 0.0920, 0.0000, 0.0322
м, 0.0433, 0.0000, 0.0249
н, 0.1307, 0.0000, 0.0349
о, 0.0000, 0.2672, 0.0026
п, 0.0464, 0.0000, 0.0218
р, 0.0865, 0.0000, 0.0124
с, 0.0434, 0.0002, 0.0960
т, 0.1042, 0.0000, 0.0286
у, 0.0000, 0.0646, 0.0018
ф, 0.0021, 0.0000, 0.0016
х, 0.0122, 0.0000, 0.0077
ц, 0.0076, 0.0000, 0.0004
ч, 0.0202, 0.0000, 0.0149
ш, 0.0147, 0.0000, 0.0038
щ, 0.0076, 0.0000, 0.0000
ъ, 0.0002, 0.0006, 0.0000
ы, 0.0000, 0.0420, 0.0000
ь, 0.0000, 0.0500, 0.0000
э, 0.0000, 0.0000, 0.0098
ю, 0.0000, 0.0029, 0.0175
я, 0.0000, 0.0359, 0.0298
ё, 0.0000, 0.0001, 0.0000


It seems that Cyrillic text needs three hidden states, because both times I ran the N=2 model it did not come up with results that made any sense. With N=3, however, the second column appears to have found the vowels.