In [3]:
import numpy as np
import h5py
from scipy.stats import norm

In [4]:
# Loading the data
with h5py.File('../HAR/preprocessed.hdf5','r') as hf:
    x_train = np.array(hf.get('x_train'))
    y_train = np.array(hf.get('y_train'))
    s_train = np.array(hf.get('s_train'))
    x_test = np.array(hf.get('x_test'))
    y_test = np.array(hf.get('y_test'))
    s_test = np.array(hf.get('s_test'))
    x_train_with_past = np.array(hf.get('x_train_with_past'))
    y_train_with_past = np.array(hf.get('y_train_with_past'))
    x_test_with_past = np.array(hf.get('x_test_with_past'))
    y_test_with_past = np.array(hf.get('y_test_with_past'))

In [8]:
s_train

array([ 1,  1,  1, ..., 30, 30, 30])

In [9]:
y_train

array([5, 5, 5, ..., 2, 2, 2])

In [10]:
np.sum(s_train == 1)

347

In [182]:
# Learning a one component Gaussian over all the features
def compute_transition(y, alpha=0.1):
    '''
    Compute the transition matrice.
    Rows: states to
    cols: states from
    States are indexed starting from 1
    '''
    num_state = np.max(y)
    transition = alpha*np.ones((num_state, num_state))
    for i in xrange(y.shape[0]-1):
        transition[y[i+1]-1, y[i]-1] += 1
    # Normalisation
    transition /= np.sum(transition, axis=1)[:, np.newaxis]
    return transition

def compute_emission(x, y):
    '''
    Compute the parameters of the gaussian distribution
    of the emission given each state.
    We assume each emission distribution is independent,
    the covariance matrix is diagonal then.
    States are indexed starting from 1
    '''
    num_state = np.max(y)
    
    sigma_diag = np.zeros((num_state, x.shape[1]))
    mu = np.zeros((num_state, x.shape[1]))
    for s in xrange(num_state):
        x_s = x[(y == s+1), :]
        # Computing mu_s
        mu[s] = np.mean(x_s, axis=0)
        # Computing sigma_s (by column)
        sigma_diag[s] = np.std(x_s, axis=0)

    return mu, sigma_diag

def compute_logscore(data, log_transition, mu, sigma, C):
    y = np.zeros((C, C))
    for j in xrange(C):
        y[j, :] = np.sum([norm.logpdf(d, loc=mu[j, i], scale=sigma[j, i]) for i, d in enumerate(data)])

    return y + log_transition

def viterbi(inputs, init, log_transition, mu, sigma, C):
    '''
    Evaluates the highest scoring sequence
    '''
    y = np.zeros((C, C))
    initial = np.zeros(C)

    initial[init] = 1
    initial = np.log(initial)

    n = inputs.shape[0]
    # To store the maxes
    max_table = np.zeros((n, C))
    backpointer_table = np.zeros((n, C))

    # first timestep
    # the initial most likely paths are the initial state distribution
    state_init = initial + compute_logscore(inputs[0,:], log_transition, mu, sigma, C)
    maxes = np.max(state_init, axis=1)
    backpointers = np.argmax(state_init, axis=1)
    max_table[0, :] = maxes

    for i in xrange(1, n):
        # COmputing the score
        y = compute_logscore(inputs[i, :], log_transition, mu, sigma, C)
        scores = y + np.repeat(maxes.reshape(1, C), C, axis=0)

        # compute new maxes
        maxes = np.max(scores, axis=1)
        backpointers = np.argmax(scores, axis=1)

        max_table[i, :] = maxes
        backpointer_table[i, :] = backpointers

    # follow backpointers to recover max path
    classes = np.zeros(n)
    classes[n-1] = np.argmax(maxes, axis=0)
    for i in xrange(n-1, 0, -1):
        classes[i-1] = backpointer_table[i, classes[i]]

    return classes

def standardize(x):
    '''
    Standardize each column of x
    '''
    x_std = np.std(x, axis=0)
    x_mu = np.mean(x, axis=0)
    
    return (x - x_mu)/x_std[np.newaxis, :]

def compute_accuracy(pred_classes, true_classes):
    '''
    Compute accuracy
    '''
    return np.sum(pred_classes == true_classes) /(1.*len(pred_classes))


In [176]:
C=6
init = 4
y = np.zeros((C, C))
initial = np.zeros(C)

initial[init] = 1
initial = np.log(initial)
print(initial)

state_init = initial + compute_logscore(x_standard[0,:], log_transition_train, mu, sigma_diag, C)
print(state_init)
print(np.max(state_init, axis=1))

[-inf -inf -inf -inf   0. -inf]
[[        -inf         -inf         -inf         -inf -10.65349168
          -inf]
 [        -inf         -inf         -inf         -inf -14.8395688
          -inf]
 [        -inf         -inf         -inf         -inf -12.82739194
          -inf]
 [        -inf         -inf         -inf         -inf  -7.88384383
          -inf]
 [        -inf         -inf         -inf         -inf  -0.84924053
          -inf]
 [        -inf         -inf         -inf         -inf -20.8956498
          -inf]]
[-10.65349168 -14.8395688  -12.82739194  -7.88384383  -0.84924053
 -20.8956498 ]




## 1) Experiment on sample

In [190]:
# We retain 6 features (known to be independent)

x = np.concatenate((x_train[:, :3], x_train[:, 41:44]), axis=1)
x_sub_test = np.concatenate((x_test[:, :3], x_test[:, 41:44]), axis=1)
print(x.shape)

(7352, 6)


In [178]:
# Learning the HMM

# standardization
x_standard = standardize(x)
print(x_standard.shape)

# ### TRANSITION
transition_train = compute_transition(y_train)
log_transition_train = np.log(transition_train)
print(transition_train.shape)

# ### EMISSION
mu, sigma_diag = compute_emission(x_standard, y_train)
print(mu.shape)
print(sigma_diag.shape)

(7352, 6)
(6, 6)
(6, 6)
(6, 6)


In [194]:
%time
# Sequence prediction
C = 6
sample_size = 3000
seq_pred = viterbi(x_standard[:sample_size,:], 4, log_transition_train, mu, sigma_diag, C)
# Shifting the index of 1
seq_pred += 1
print 'ACCURACY train: {}'.format(compute_accuracy(seq_pred, y_train[:sample_size]))



CPU times: user 3 µs, sys: 7 µs, total: 10 µs
Wall time: 20 µs
ACCURACY train: 0.876666666667


In [195]:
%time
x_sub_test_standard = standardize(x_sub_test)
seq_pred_test = viterbi(x_sub_test_standard[:sample_size,:], 4, log_transition_train, mu, sigma_diag, C)
seq_pred_test += 1
print 'ACCURACY test: {}'.format(compute_accuracy(seq_pred_test, y_test[:sample_size]))



CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.01 µs
ACCURACY test: 0.768917543264


In [193]:
print seq_pred_test[:100]

[ 5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.
  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.
  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.  5.
  5.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.
  6.  6.  6.  6.  6.  6.  6.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
