In [172]:
import numpy as np
import theano.tensor as T
from theano import function, printing
import theano

from theano import config
config.compute_test_value = 'raise'

# updates = OrderedDict()

In [None]:
class Evolution:

    def calculate_AER(self, S, P, A):
        s_a, p_a, len_s, len_a = 0, 0, 0, 0
        for s, p, a in zip(S, P, A):
            s_a += len(list(set(s).intersection(a)))
            p_a += len(list(set(p).intersection(a)))
            len_s += len(s)
            len_a += len(a)
        print ("s_a", s_a)
        p_a += s_a
        print ("p_a", p_a)
        aer = (s_a + p_a) / (len_s + len_a)
        print ("aer", 1.-aer)

        return 1. - aer 


    def calculate_one_AER(self, S, P, A):
        s_a = len(list(set(S).intersection(A)))
        print ("s_a", s_a)
        p_a = len(list(set(P).intersection(A))) + s_a
        print ("p_a", p_a)
        aer = (s_a + p_a) / (len(S) + len(A))
        print ("aer", 1.-aer)

        return 1. - aer 

In [192]:
X = np.arange(9)
np.split(X, 3)

[array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8])]

In [177]:
class EmissionModel:
    """ Simple emission model without CNN
    word embedding layer -> ReLU layer -> softmax layer
    """
    
    def init_weights_bias(self, input_size, layer_size, output_size, seed=1402):
        random_state = np.random.RandomState(seed)
        
        size_list = np.concatenate(([input_size[0]], layer_size, [output_size]), axis=0)
        w = []
        b = []
        
        for i in range(len(size_list) - 1):
            w.append(
                theano.shared(
                    value=np.asarray(
                        random_state.uniform(low=-1.0, high=1.0, size=(size_list[i+1], size_list[i])), 
                        dtype=theano.config.floatX
                    ), 
                    borrow=True
                )
            )
            b.append(
                theano.shared(
                    value=np.asarray(
                        random_state.uniform(low=-1.0, high=1.0, size=(size_list[i+1], 1)), 
                        dtype=theano.config.floatX
                    ), 
                    borrow=True,
                    broadcastable=(False,True)
                )
            )
        
        return w, b
    
    #[7,512]
    def __init__(self, input_size, layer_size, output_size, epoch=1, batch=1, learning_rate = .01, seed=1412):
        
        self.epoch = epoch
        self.batch = batch
        self.learning_rate = learning_rate
        self.seed = seed
        self.posteriors = []
        
        x_training_input = T.matrix().astype(config.floatX)
        x_training_input.tag.test_value = np.asarray([
            [ 0.,  0.,  0.,  0.,  0.],
            [ 0.,  0.,  0.,  1.,  0.],
            [ 0.,  0.,  1.,  0.,  0.],
            [ 0.,  0.,  0.,  0.,  1.],
            [ 0.,  0.,  0.,  0.,  0.],
            [ 0.,  0.,  0.,  0.,  0.],
            [ 1.,  0.,  0.,  0.,  0.],
            [ 0.,  0.,  0.,  0.,  0.],
            [ 0.,  1.,  0.,  0.,  0.],
            [ 0.,  0.,  0.,  0.,  0.]
        ]).astype(x_training_input.dtype)
        
        self.w, self.b = self.init_weights_bias(input_size, layer_size, output_size, seed)
        
        # word embedding layer
        word_embedding_layer = T.dot(self.w[0], x_training_input) # [7, 10] * [10, 5] = [7, 5]
        
        # ReLU layer
        z_relu_layer = T.dot(self.w[1], word_embedding_layer) + self.b[1] # [512, 7] * [7, 5] = [512, 5]
        z_relu_layer_shape = T.shape(z_relu_layer)
        z_reshaped_relu_layer = T.reshape(z_relu_layer, [z_relu_layer_shape[0]*z_relu_layer_shape[1], 1])
        relu_layer = T.nnet.relu(z_reshaped_relu_layer)
        relu_layer_reshaped = T.reshape(relu_layer, z_relu_layer_shape) # [512, 5]
        
        # Softmax layer
        z_softmax_layer = T.dot(self.w[2], relu_layer_reshaped) + self.b[2] # [9, 512] * [512, 5] = [9, 5]
        softmax_layer = T.transpose(T.nnet.softmax(T.transpose(z_softmax_layer))) # [9, 5]
        
        # calculate new gradient
        posteriors = T.matrix().astype(config.floatX)
        posteriors.tag.test_value = np.asarray([
            [-0.15,  0.04, -0.26, -0.61, -0.93, -0.72, -0.15, -0.62,  0.62],
            [ 0.07,  0.42,  0.11,  0.95, -0.86, -0.17, -0.22, -0.69, -0.55],
            [-0.79,  0.3 ,  0.06, -0.79,  0.71,  0.86, -0.58,  0.38,  0.05],
            [ 0.92, -0.33, -0.63,  0.99,  0.67, -0.79, -0.08,  0.64, -0.51],
            [-0.08, -0.29,  0.87,  0.6 ,  0.31,  0.75,  0.38, -0.42,  0.11]
        ]).astype(posteriors.dtype)
        
        cost = T.sum(T.transpose(posteriors) * T.log(softmax_layer))
        # TODO: use dw[] and db[] abstractly 
        dw0,dw1,dw2,db1,db2 = T.grad(
            cost=cost, wrt=[self.w[0],self.w[1],self.w[2],self.b[1],self.b[2]]
        )

        # Update w and b
        updates = [
            (self.w[0], self.w[0] - self.learning_rate * dw0), 
            (self.w[1], self.w[1] - self.learning_rate * dw1), 
            (self.b[1], self.b[1] - self.learning_rate * db1),
            (self.w[2], self.w[2] - self.learning_rate * dw2), 
            (self.b[2], self.b[2] - self.learning_rate * db2)
        ]
        
        # Compile model
        self.test = theano.function(
            inputs=[x_training_input, posteriors], 
            outputs=[dw1, softmax_layer]
        ) 
        self.train_mini_batch = theano.function(
            inputs=[x_training_input, posteriors], 
            outputs=[dw2, self.w[2], softmax_layer], 
            updates=updates
        )
        self.test_values = theano.function(
            inputs=[x_training_input], 
            outputs=[softmax_layer]
        )
        
    def train_model(inputs):
        for i in range(self.epoch):
            for x_input in np.split(inputs, self.batch):
                self.posteriors = 
                self.train_mini_batch(x_input, posteriors)
            # TODO: create train_batch function 
    
x = np.asarray([
        [ 0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  1.,  0.],
        [ 0.,  0.,  1.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  1.],
        [ 0.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.],
        [ 1.,  0.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.,  0.],
        [ 0.,  0.,  0.,  0.,  0.]
]).astype(config.floatX)

posteriors = np.asarray([
    [ 0.65, -0.32,  0.44, -0.04, -0.36, -0.81,  0.38, -0.84, -0.93],
    [-0.41, -0.05,  0.96,  0.71,  0.08,  0.85,  0.12,  0.43, -0.08],
    [-0.45,  0.04, -0.94,  0.41,  0.04, -0.3 ,  0.89, -0.09, -0.42],
    [-0.19,  0.32,  0.  ,  0.02, -0.66, -0.41,  0.11, -0.05,  0.76],
    [-0.32,  0.86,  0.09, -0.41, -0.57, -0.55, -0.85, -0.09, -0.27]
]).astype(config.floatX)

input_size = np.shape(x)
d_embedding = 7
layer_size = [d_embedding, 512]
output_size = 9

model = EmissionModel(input_size=input_size, layer_size=layer_size, output_size=output_size)

result = model.train_mini_batch(x, posteriors)
print(np.shape(result[0]))
print(np.shape(result[1]))
print(np.shape(result[2]))
# print(np.shape(model.evaluate_model(x)))
# print(model.calculate_gradient(posteriors))


(9, 512)
(9, 512)
(9, 5)


In [166]:
print(model.test(x, posteriors)[0])
print("")
print(model.test(x, posteriors)[1])

[[ -4.25463021e-01  -1.74731529e+00  -3.40455741e-01 ...,   8.40714455e-01
    1.16684544e+00  -1.74363041e+00]
 [ -1.73396075e+00  -3.52505970e+00  -1.80466461e+00 ...,   1.56047273e+00
    2.06868815e+00  -2.42936778e+00]
 [ -5.50276101e-01   5.80202416e-02  -1.51815784e+00 ...,   9.53709185e-01
    2.68967301e-01   3.69657218e-01]
 ..., 
 [ -9.68661189e-01   5.42034388e-01   1.15126371e+00 ...,  -8.52006793e-01
   -6.70304239e-01   4.77848887e-01]
 [  3.67000699e-01   4.21549737e-01   8.97033513e-03 ...,   2.57458866e-01
    1.96427971e-01   7.65285119e-02]
 [ -4.27518426e-05  -1.21131372e-02   3.77787501e-02 ...,  -8.82423893e-02
    4.54101712e-03   5.87681048e-02]]

[[ -1.84710197e+01  -7.45869827e+01  -6.41599178e+00  -2.60746651e+01
   -1.36473475e+01]
 [ -3.23260193e+01  -8.30433655e+01  -8.10556221e+00  -3.15460014e+01
   -2.47860146e+01]
 [ -4.28678932e+01  -1.31970825e+02  -5.94393015e+00  -4.57771158e+00
   -1.53612356e+01]
 [ -3.80866966e+01  -1.26840454e+02  -1.99077091e

In [167]:
posteriors = np.asarray([
    [ 0.65, -0.32,  0.44, -0.04, -0.36, -0.81,  0.38, -0.84, -0.93],
    [-0.41, -0.05,  0.96,  0.71,  0.08,  0.85,  0.12,  0.43, -0.08],
    [-0.45,  0.04, -0.94,  0.41,  0.04, -0.3 ,  0.89, -0.09, -0.42],
    [-0.19,  0.32,  0.  ,  0.02, -0.66, -0.41,  0.11, -0.05,  0.76],
    [-0.32,  0.86,  0.09, -0.41, -0.57, -0.55, -0.85, -0.09, -0.27]
]).astype(config.floatX)

print(model.test(x, posteriors))

[array([[ -4.25463021e-01,  -1.74731529e+00,  -3.40455741e-01, ...,
          8.40714455e-01,   1.16684544e+00,  -1.74363041e+00],
       [ -1.73396075e+00,  -3.52505970e+00,  -1.80466461e+00, ...,
          1.56047273e+00,   2.06868815e+00,  -2.42936778e+00],
       [ -5.50276101e-01,   5.80202416e-02,  -1.51815784e+00, ...,
          9.53709185e-01,   2.68967301e-01,   3.69657218e-01],
       ..., 
       [ -9.68661189e-01,   5.42034388e-01,   1.15126371e+00, ...,
         -8.52006793e-01,  -6.70304239e-01,   4.77848887e-01],
       [  3.67000699e-01,   4.21549737e-01,   8.97033513e-03, ...,
          2.57458866e-01,   1.96427971e-01,   7.65285119e-02],
       [ -4.27518426e-05,  -1.21131372e-02,   3.77787501e-02, ...,
         -8.82423893e-02,   4.54101712e-03,   5.87681048e-02]], dtype=float32), array([[ -1.84710197e+01,  -7.45869827e+01,  -6.41599178e+00,
         -2.60746651e+01,  -1.36473475e+01],
       [ -3.23260193e+01,  -8.30433655e+01,  -8.10556221e+00,
         -3.15460014

# Alignment with Unsupervised neural hidden markov model

In [200]:
# Initlize non negative set
sentence_length = 4
max_distance = 2
non_negative_set = np.random.randint(low=1, high=100, size=[max_distance+max_distance+3])
po = 0.3
global_transition_matrix = generate_transition_distant_matrix(sentence_length, max_distance, non_negative_set, po)
global_transition_matrix

[[ 63.  69.  76.  58.   0.   0.   0.   0.]
 [ 87.  63.  69.  76.   0.   0.   0.   0.]
 [ 93.  87.  63.  69.   0.   0.   0.   0.]
 [ 53.  93.  87.  63.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.]]


array([[ 0.21283784,  0.22115385,  0.25762712,  0.21804511,  0.3       ,
         0.        ,  0.        ,  0.        ],
       [ 0.29391892,  0.20192308,  0.23389831,  0.28571429,  0.        ,
         0.3       ,  0.        ,  0.        ],
       [ 0.31418919,  0.27884615,  0.21355932,  0.2593985 ,  0.        ,
         0.        ,  0.3       ,  0.        ],
       [ 0.17905405,  0.29807692,  0.29491525,  0.23684211,  0.        ,
         0.        ,  0.        ,  0.3       ],
       [ 0.21283784,  0.22115385,  0.25762712,  0.21804511,  0.3       ,
         0.        ,  0.        ,  0.        ],
       [ 0.29391892,  0.20192308,  0.23389831,  0.28571429,  0.        ,
         0.3       ,  0.        ,  0.        ],
       [ 0.31418919,  0.27884615,  0.21355932,  0.2593985 ,  0.        ,
         0.        ,  0.3       ,  0.        ],
       [ 0.17905405,  0.29807692,  0.29491525,  0.23684211,  0.        ,
         0.        ,  0.        ,  0.3       ]])

Transition matrix and "Baum Welch Algorithm"

Compute forward messages: alpha <br>
Compute backward messages: beta <br>
Compute posteriors: <br>
    p(z|x) = alpha * beta <br>
    p(z_i, z_i+1 | x) <br>

In [202]:
class BaumWelchModel:
    
    def generate_transition_distant_matrix(
            self, sentence_length, po=0.):
        """ Generate a transition matrix based on jump distance in the latent sentence.
        We extend the latent sentence for 2*length in which each word has 
        an empty word to represent no-alignment state.
        where [sentence_length:end] elements are empty words considered as 
        latent words having no direct aligment.

        Input
        -----
        sentence_length: the length of latent sentence
                      int value
        non_negative_set: random non-negative set as max_distance size
        po: default value for A->A_empty_word

        Output
        ------
        trans_distant_matrix
        """
        if po==0.:
            po = self.po
        trans_distant_matrix = np.zeros((2*sentence_length, 2*sentence_length))

        for i in range(sentence_length):
            for j in range(sentence_length):
                indice = i - j + self.max_distance + 1
                if indice < 0:
                    p_ = self.non_negative_set[0]
                elif (indice > self.max_distance + self.max_distance + 2):
                    p_ = self.non_negative_set[-1]
                else:
                    p_ = self.non_negative_set[indice]
                trans_distant_matrix[i][j] = p_

        print(trans_distant_matrix)

        for i in range(sentence_length):
            trans_distant_matrix[i+sentence_length][i+sentence_length] = po
            trans_distant_matrix[i][i+sentence_length] = po

            sum_d = np.sum(trans_distant_matrix[:sentence_length, i])
            trans_distant_matrix[:sentence_length, i] = \
                    np.divide(
                        trans_distant_matrix[:sentence_length, i], 
                        sum_d
                    )
            trans_distant_matrix[sentence_length:, i] = \
                    np.copy(trans_distant_matrix[:sentence_length, i])

        return trans_distant_matrix 
    
    def __init__(self, max_distance, po=0.3, seed=1402):
        np.random.seed(seed)
        self.max_distance = max_distance
        self.non_negative_set = np.random.randint(
                                    low=1, high=100, 
                                    size=[max_distance + max_distance + 3]
        )
        self.po = po
        
    def _calc_forward_messages(self, unary_matrix, transition_matrix, emission_matrix,
                              latent_indice_len, observation_sentence):
        """Calcualte the forward messages ~ alpha values.

        Input
        -----
        unary_matrix: marginal probabilities ~ initial matrix.

        Return
        ------
        alpha
        """

        # TODO: verify matrix length
        observation_len = len(observation_sentence)

        alpha = np.zeros( (observation_len, latent_indice_len) )
        alpha[0] = np.multiply(emission_matrix[0], unary_matrix)

        for t in np.arange(1, observation_len):
            for j in range(latent_indice_len):
                sum_al = 0.0;
    #             print("alpha : ", t, j, " :: ", emission_matrix[ observation_sentence[t] ][ j ])
                for i in range(latent_indice_len):
                    sum_al += alpha[t-1][i] * transition_matrix[i][j]
    #                 print("   sum_al: ", alpha[t-1][i], transition_matrix[i][j])

                alpha[t][j] = emission_matrix[ observation_sentence[t] ][ j ] * sum_al


        return alpha
    
    
    def _calc_backward_messages(self, transition_matrix, emission_matrix,
                               latent_indice_len, observation_sentence):
        """Calcualte the backward messages ~ beta values.

        Return
        ------
        beta
        """
        # TODO: verify matrix length
        observation_len = len(observation_sentence)

        beta = np.zeros( (observation_len, latent_indice_len) )
        beta[-1] = [1]*latent_indice_len

        for t in reversed(range(observation_len-1)):
            for i in range(latent_indice_len):
    #             print("beta ", t, i)
                for j in range(latent_indice_len):
                    beta[t][i] += \
                            beta[t+1][j] \
                            * transition_matrix[i][j] \
                            * emission_matrix[ observation_sentence[t+1] ][j]
    #                 print("    ", beta[t+1][j], transition_matrix[i][j], emission_matrix[ observation_sentence[t+1] ][j], beta[t][i])

        return beta

    def _calc_posterior_matrix(self, alpha, beta, latent_indice_len, observation_sentence):
        """Calcualte the gama and epsilon values in order to reproduce 
        better transition and emission matrix.

        Return
        ------
        unary_matrix, posterior_gamma, posterior_epsilon
        """
        # TODO: verify matrix length
        observation_len = len(observation_sentence)


        gamma = np.multiply(alpha, beta)
        epsilon = np.zeros( (observation_len-1, latent_indice_len, latent_indice_len) )

        # Normalization on rows
        for i in range(len(gamma)):
            sum_gamma = np.sum(gamma[i])
            gamma[i] = np.divide(gamma[i], sum_gamma)

        for t in range(observation_len - 1):   
            for i in range(latent_indice_len):
                for j in range(latent_indice_len):
                    epsilon[t][i][j] = \
                            alpha[t][i] * \
                            transition_matrix[j][i] * \
                            beta[t+1][j] * \
                            emission_matrix[ observation_sentence[t+1] ][j]
            # Normalization
            sum_ep = np.sum(epsilon[t])
            epsilon[t] = np.divide(epsilon[t], sum_ep)

        print("gamma: ", gamma)
        print("epsilon: ", epsilon)

        # Update unary matrix
        new_unary_matrix = np.copy(gamma[0])
        #Normalization unary
        sum_unary = np.sum(new_unary_matrix)
        new_unary_matrix = np.divide(new_unary_matrix, sum_unary)

        new_transition_matrix = np.zeros( (latent_indice_len, latent_indice_len) )
        new_emission_matrix = np.zeros( (observation_len, latent_indice_len) )

        # Update transition matrix
        for i in range(latent_indice_len):
            sum_gamma = np.sum(gamma.T[i][:-1])
            for j in range(latent_indice_len):
                sum_ep = np.sum( epsilon.T[j][i][:-1] )
                new_transition_matrix[i][j] = sum_ep/sum_gamma
            # Normalization
            sum_trans = np.sum(new_transition_matrix[i])
            new_transition_matrix[i] = np.divide(new_transition_matrix[i], sum_trans)

        # Update emission matrix
        sum_gamma = [np.sum(gamma.T[i]) for i in range(latent_indice_len)]   
        for i in range(latent_indice_len):
            new_emission_matrix.T[i] = np.divide(gamma.T[i], sum_gamma[i])

        return new_unary_matrix, gamma, epsilon


    def calculate_baum_welch_posteriors(self, sentence_length, 
                                        unary_matrix, emission_matrix,
                                        latent_indice_len, observation_sentence):
        transition_matrix = self.generate_transition_distant_matrix(sentence_length, max_distance)
        
        alpha = calc_forward_messages(
            unary_matrix, transition_matrix, emission_matrix,
            latent_indice_len, observation_sentence
        )
        beta = calc_backward_messages(
            transition_matrix, emission_matrix, latent_indice_len, observation_sentence
        )

        new_unary_matrix, emission_postorior, transition__postorior = calc_posterior_matrix(
                                                alpha, beta, latent_indice_len, observation_sentence
        )

        return gamma, epsilon

In [203]:
# Initlize non negative set
sentence_length = 4
max_distance = 2

baum_welch_model = BaumWelchModel(max_distance)
baum_welch_model.generate_transition_distant_matrix(sentence_length)

[[ 10.  56.  48.  58.   0.   0.   0.   0.]
 [ 59.  10.  56.  48.   0.   0.   0.   0.]
 [ 11.  59.  10.  56.   0.   0.   0.   0.]
 [ 62.  11.  59.  10.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.   0.   0.   0.]]


array([[ 0.07042254,  0.41176471,  0.27745665,  0.3372093 ,  0.3       ,
         0.        ,  0.        ,  0.        ],
       [ 0.41549296,  0.07352941,  0.32369942,  0.27906977,  0.        ,
         0.3       ,  0.        ,  0.        ],
       [ 0.07746479,  0.43382353,  0.05780347,  0.3255814 ,  0.        ,
         0.        ,  0.3       ,  0.        ],
       [ 0.43661972,  0.08088235,  0.34104046,  0.05813953,  0.        ,
         0.        ,  0.        ,  0.3       ],
       [ 0.07042254,  0.41176471,  0.27745665,  0.3372093 ,  0.3       ,
         0.        ,  0.        ,  0.        ],
       [ 0.41549296,  0.07352941,  0.32369942,  0.27906977,  0.        ,
         0.3       ,  0.        ,  0.        ],
       [ 0.07746479,  0.43382353,  0.05780347,  0.3255814 ,  0.        ,
         0.        ,  0.3       ,  0.        ],
       [ 0.43661972,  0.08088235,  0.34104046,  0.05813953,  0.        ,
         0.        ,  0.        ,  0.3       ]])

In [None]:
unary_matrix = [.85, .15]
transition_matrix = [
    [.3, .7], 
    [.1, .9]
]
emission_matrix = [
    [.4, .5],
    [.6, .5]
]
latent_indice_len = 2
observation_sentence = [0,1,1,0]


baum_welch_model.calculate_baum_welch_posteriors(sentence_length, unary_matrix, emission_matrix,
                                        latent_indice_len, observation_sentence)