# HMM with TensorFlow

In [1]:
import tensorflow as tf
import numpy as np
import sys

## A HMM as a Special Case of a Recurrent Neural Network
We use RNN notation similar to that in [Dive into Deep Learning](https://d2l.ai/chapter_recurrent-neural-networks/bptt.html). $h_t$ is a size-$n$ vector of RNN "hidden states" (these are real numbers, not to be confused with the hidden states of the HMM, which are elements of $Q$).  
$$ h_t = f(x_t, h_{t-1}; A, B)$$
We choose the outputs
$$ o_t = \text{sum}(h_t) = h_t[0] + \cdots + h_t[n-1] \in [0,1]$$
so that the final output $o_T$ is just the likelihood of the sequence $P(Y)$.
This RNN does not need to produce intermediate outputs $o_t$ for $t<T$, as they are not used yet. However, they could be used in conjunction with a backward pass.

### HMMCell
As a template we use the code for [tf.keras.layers.SimpleRNNCell](https://github.com/tensorflow/tensorflow/blob/v2.4.1/tensorflow/python/keras/layers/recurrent.py#L1222-L1420).

In [2]:
from HMMCell import HMMCell

## Test the HMM cell

In [3]:
n = 3  # dimension of the square transition matrix defined below

# Parameters of the test HMM, stored as logarithms: taking ln cancels out the
# softmax that is applied to obtain a stochastic matrix.
A_init = np.log(np.array([[[7, 2, 1], [3, 5, 2], [2, 6, 2]]]) / 10.0)
B_init = np.log(np.array([[[4, 6], [8, 2], [9, 1]]]) / 10.0)
I_init = np.log(np.array([[1, 1e-10, 1e-10]]))  # start with X1=sun (very likely)

A_initializer = tf.keras.initializers.Constant(A_init)
B_initializer = tf.keras.initializers.Constant(B_init)
I_initializer = tf.keras.initializers.Constant(I_init)

# A single emission vector; in general something like
# np.random.random([batch_size, s]).astype(np.float32).
yi = np.array([[1.0, 0.0]], dtype=np.float32)
# A hand-picked float state to feed into the cell; it has shape (1, 1, n).
states = np.array([[[0.4, 0.0, 0.0]]], dtype=np.float32)

hmmC = HMMCell(
    units=1,
    n=n,
    transition_initializer=A_initializer,
    emission_initializer=B_initializer,
    init_initializer=I_initializer,
)

# The cell is called with a three-component state list; only `states` carries
# non-trivial values here.
output = hmmC(yi, [0, states, [0.]])
print("output:\n", output[0], "\n")

# Show the cell's parameters with five decimals and without scientific notation.
with np.printoptions(precision=5, suppress=True):
    print("transition matrix A:\n", hmmC.A.numpy())
    print("emission matrix B:\n", hmmC.B.numpy())
    print("initial distribution I:\n", hmmC.I.numpy())


output:
 tf.Tensor([[-1.551169]], shape=(1, 1), dtype=float32) 

transition matrix A:
 [[[0.7 0.2 0.1]
  [0.3 0.5 0.2]
  [0.2 0.6 0.2]]]
emission matrix B:
 [[[0.4 0.6]
  [0.8 0.2]
  [0.9 0.1]]]
initial distribution I:
 [[1. 0. 0.]]


In [4]:
# One sequence (a batch of size one) made of three identical emissions [1, 0].
inputs = np.array([[[1, 0]] * 3], dtype=np.float32)
hmm = tf.keras.layers.RNN(hmmC, return_sequences=True, return_state=True)

# With return_state=True the layer returns the per-step outputs followed by the
# final cell states, so `loglik` holds that whole list and not only the
# log-likelihood values.
loglik = hmm(inputs)

with np.printoptions(precision=5, suppress=True):
    print("\nlog-likelihood=", loglik)


log-likelihood= [<tf.Tensor: shape=(1, 3, 1), dtype=float32, numpy=
array([[[-0.91629],
        [-1.55117],
        [-2.03409]]], dtype=float32)>, <tf.Tensor: shape=(1,), dtype=int8, numpy=array([0], dtype=int8)>, <tf.Tensor: shape=(1, 1, 3), dtype=float32, numpy=array([[[0.32049, 0.46483, 0.21468]]], dtype=float32)>, <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-2.03409]], dtype=float32)>]


### Training

In [5]:
from dishonest_casino import get_casino_dataset
batch_size = 32
# Endlessly repeating dataset, grouped into batches of `batch_size` sequences.
ds = get_casino_dataset().repeat().batch(batch_size)

# Fetch exactly one batch; it is inspected here and reused by the cells below.
# next(iter(...)) raises StopIteration if the dataset yields nothing, whereas a
# `for inputs in ...: pass` loop would silently leave whatever `inputs` was
# bound to by an earlier cell.
inputs = next(iter(ds.take(1)))

inputs.shape

TensorShape([32, 100, 6])

In [6]:
n=2  # NOTE(review): presumably the number of hidden HMM states per model — confirm against HMMCell
u=2  # passed as HMMCell's first argument; NOTE(review): presumably the number of units — confirm
s = inputs.shape[-1]  # size of the last axis of `inputs`
dcc = HMMCell(u, n)

# test HMMCell and initialize emission alphabet size:
# call the cell once on time step 0 of every sequence in the batch. The state
# list has three components, of shapes [batch_size], [batch_size, u, n] and
# [batch_size, u].
Q = dcc(inputs[:,0,:], [tf.zeros(batch_size, dtype=tf.int8),
                    tf.ones([batch_size, u, n], dtype=tf.float32), 
                    tf.zeros([batch_size, u], dtype=tf.float32)])

In [7]:
# the model: a Keras RNN layer that steps `dcc` along each input sequence.
# return_sequences is left at its default, so only the last step's output is
# returned, followed (because of return_state=True) by the final cell states.
F = tf.keras.layers.RNN(dcc, return_state = True)
# number of tensors the layer returns for one batch
len(F(inputs))

4

In [8]:
def print_pars(cell):
    """Print the parameters `A`, `B` and `I` of `cell`.

    Each of the three attributes is converted with ``.numpy()`` and printed
    under its own heading, using five decimals and no scientific notation.
    NumPy's print options are only changed for the duration of the call.

    Args:
        cell: object exposing attributes ``A``, ``B`` and ``I``, each of which
            provides a ``numpy()`` method.

    Returns:
        None. The parameters are written to standard output.
    """
    headed_attributes = (
        ("transition matrices A:\n", "A"),
        ("emission matrices B:\n", "B"),
        ("initial distributions I:\n", "I"),
    )
    with np.printoptions(precision=5, suppress=True):
        # Attributes are looked up one at a time, right before they are printed.
        for heading, attribute in headed_attributes:
            print(heading, getattr(cell, attribute).numpy())

In [9]:
# Show the parameters of `dcc` before any optimizer step has been applied.
print_pars(dcc)

transition matrices A:
 [[[0.50019 0.49981]
  [0.50203 0.49797]]

 [[0.5049  0.4951 ]
  [0.49818 0.50182]]]
emission matrices B:
 [[[0.16853 0.16243 0.16842 0.16354 0.16395 0.17314]
  [0.16437 0.17153 0.16509 0.16757 0.16828 0.16315]]

 [[0.17257 0.16865 0.15846 0.16822 0.16189 0.17021]
  [0.171   0.1614  0.17441 0.16672 0.16194 0.16453]]]
initial distributions I:
 [[0.48994 0.51006]
 [0.4927  0.5073 ]]


In [10]:
def loss(model, y):
    """Return the negated mean of the log-likelihoods `model` computes for `y`.

    Args:
        model: callable that, applied to `y`, returns exactly four values; the
            fourth one is taken as the log-likelihoods.
        y: batch of input sequences, passed to `model` unchanged.

    Returns:
        The negative of ``tf.reduce_mean`` over the log-likelihoods.
    """
    # Only the last of the four returned values is needed. The four-way unpack
    # is kept so that a model returning a different number of values fails.
    _, _, _, loglik = model(y)
    mean_loglik = tf.reduce_mean(loglik)
    return -mean_loglik

# Smoke test: evaluate the loss of the model `F` once on the batch `inputs`.
L = loss(F, inputs)
print("Loss test: {}".format(L))

Loss test: 179.1325225830078


In [11]:
def grad(model, inputs):
    """Compute the loss on `inputs` and its gradients for `model`.

    Args:
        model: object accepted by `loss` that exposes `trainable_variables`.
        inputs: batch handed to `loss` together with `model`.

    Returns:
        A pair ``(loss_value, gradients)`` where ``gradients`` is the result of
        differentiating ``loss_value`` with respect to
        ``model.trainable_variables``.
    """
    # Record the forward pass so it can be differentiated afterwards.
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs)
    # The variables are read only after the forward pass has run.
    gradients = tape.gradient(loss_value, model.trainable_variables)
    return loss_value, gradients

In [12]:
# Adam optimizer used by the training cells below.
opt = tf.keras.optimizers.Adam(learning_rate=0.01)

In [13]:
model = F  # the RNN layer `F` is trained under the name `model` from here on
# Loss and gradients on the batch `inputs`; no optimizer update is applied in this cell.
loss_value, grads = grad(model, inputs)

# opt.iterations is the optimizer's step counter.
print("Step: {}, Initial Loss: {}".format(opt.iterations.numpy(),
                                          loss_value.numpy()))


Step: 0, Initial Loss: 179.1325225830078


In [14]:
# Keep results for plotting
train_loss_results = []

num_epochs = 41
m = 10 # training batches

for epoch in range(num_epochs):
    # Running mean of the batch losses of this epoch. Only the loss is tracked:
    # the loop below receives sequences `y` without labels, so there is
    # nothing to compute an accuracy against.
    epoch_loss_avg = tf.keras.metrics.Mean()

    # Training loop - using batches
    # NOTE(review): ds.take(m) is iterated afresh in every epoch; unless the
    # dataset generates or shuffles its data per iteration, each epoch sees the
    # same m batches — confirm against get_casino_dataset.
    for y in ds.take(m):
        # Optimize the model
        loss_value, grads = grad(model, y)
        opt.apply_gradients(zip(grads, model.trainable_variables))

        # Track progress
        epoch_loss_avg.update_state(loss_value)  # Add current batch loss

    # End epoch
    epoch_loss = epoch_loss_avg.result()
    train_loss_results.append(epoch_loss)

    if epoch % 2 == 0:
        print("Epoch {:03d}: Loss: {:.3f}".format(epoch, epoch_loss))
    if epoch % 10 == 0:
        print_pars(dcc)

Epoch 000: Loss: 178.562
transition matrices A:
 [[[0.54892 0.45108]
  [0.55017 0.44983]]

 [[0.55359 0.44641]
  [0.54661 0.45339]]]
emission matrices B:
 [[[0.16306 0.15654 0.16213 0.15843 0.1577  0.20215]
  [0.15948 0.16573 0.15928 0.16281 0.16218 0.19053]]

 [[0.16633 0.16278 0.15274 0.1627  0.15661 0.19884]
  [0.16504 0.15607 0.16832 0.16155 0.15692 0.1921 ]]]
initial distributions I:
 [[0.46595 0.53405]
 [0.48945 0.51055]]
Epoch 002: Loss: 177.140
Epoch 004: Loss: 176.980
Epoch 006: Loss: 177.186
Epoch 008: Loss: 177.049
Epoch 010: Loss: 177.308
transition matrices A:
 [[[0.63618 0.36382]
  [0.59637 0.40363]]

 [[0.63257 0.36743]
  [0.59144 0.40856]]]
emission matrices B:
 [[[0.14936 0.14634 0.1517  0.14884 0.15103 0.25273]
  [0.14815 0.15786 0.15223 0.15495 0.1563  0.23051]]

 [[0.14834 0.15201 0.146   0.15079 0.15251 0.25034]
  [0.14909 0.14899 0.16187 0.15169 0.15368 0.23467]]]
initial distributions I:
 [[0.1861  0.8139 ]
 [0.27128 0.72872]]
Epoch 012: Loss: 176.877
Epoch 014: 

In [15]:
# Show the parameters of `dcc` as they are after the training cell above has run.
print_pars(dcc)

transition matrices A:
 [[[0.87524 0.12476]
  [0.06454 0.93546]]

 [[0.87832 0.12168]
  [0.06558 0.93442]]]
emission matrices B:
 [[[0.10853 0.10944 0.11404 0.1136  0.11018 0.44421]
  [0.166   0.1722  0.17183 0.17223 0.17046 0.14727]]

 [[0.10989 0.11095 0.11536 0.11483 0.11147 0.4375 ]
  [0.16642 0.17244 0.17205 0.17244 0.17083 0.14582]]]
initial distributions I:
 [[0.04397 0.95603]
 [0.0462  0.9538 ]]
