# Conditional Language models with attention

Step by step guide to implementing an Attention model from first principles in Tensorflow based on the notes from the [Oxford Deep NLP course](https://github.com/oxford-cs-deepnlp-2017/lectures/blob/master/Lecture%208%20-%20Conditional%20Language%20Modeling%20with%20Attention.pdf).

In [20]:
# import libraries
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import math

In [4]:
tf.reset_default_graph()

### Generate test data

To start with we need to create some input and output data, taking into account the fact that this data might have different lengths. Let us randomly generate some inputs and outputs. These will represent sentences which have already been processed so that words are represented by integer word ids.

In [18]:
# generate arrays of random numbers where the array length is a random number between 5 and 11
source_inputs = [np.random.randint(2,101,n) for n in  np.random.randint(5,11,25)]
target_inputs = [np.random.randint(2,101,n) for n in  np.random.randint(5,11,25)]

print(source_inputs)
print(target_inputs)

[array([58, 73, 16, 73, 54, 69]), array([91, 18, 85, 84, 16,  5]), array([41, 35,  7, 13, 98, 47, 58]), array([38,  5, 72, 71, 41]), array([38, 37, 83, 51, 98, 26, 10, 66, 92, 13]), array([27, 66, 92, 95, 13, 49]), array([67, 41, 47, 63, 99, 89, 24]), array([18, 48, 79, 71, 27, 16, 87]), array([100,  39,  21,  50,  25]), array([ 6, 84, 99, 77,  4, 29, 94, 89, 67, 25]), array([51, 75, 55,  8, 25, 64]), array([93, 58, 20, 81, 25]), array([31, 49, 86, 76, 84, 64]), array([89, 57, 39, 43, 64, 89, 48]), array([81, 70, 26, 89, 81, 19, 60, 97,  4]), array([69, 21, 80, 83,  2, 61, 63, 84]), array([17, 59, 73, 65, 47, 79, 80]), array([92, 91, 84, 44, 87]), array([ 34,  54,  69,  17,  27,  52,  33, 100,  41]), array([11, 27, 46, 32, 56, 99]), array([39, 41, 75, 52, 58, 26]), array([73,  3,  7, 83, 24, 13, 41, 15,  2]), array([  2, 100,  67,  52,  58]), array([94, 96, 21,  6,  7, 29,  4, 75, 34, 16]), array([76, 80, 39, 56, 37, 90, 45])]
[array([92,  6, 64, 44, 49]), array([10, 74, 84, 31, 40,  2

For use in the RNN, let us calculate sequence lengths

In [8]:
# check lengths of inputs and targets
source_lens = list(map(len,source_inputs))
target_lens = list(map(len,target_inputs))

print(source_lens, target_lens)

([7, 8, 6, 6, 6, 6, 8, 5, 5, 9, 8, 10, 8, 8, 10, 6, 9, 9, 8, 5, 5, 9, 5, 8, 7], [8, 5, 6, 6, 7, 5, 10, 7, 5, 5, 8, 10, 6, 7, 9, 9, 10, 8, 7, 6, 6, 8, 5, 9, 8])


### Pad the input and output arrays so they are of length 10

In [13]:
padded_inputs = np.array([np.concatenate((i,
            [0 for j in range(10-len(i))])) for i in source_inputs])
padded_inputs[0:5]

array([[ 29.,  77.,  41.,  18.,  79.,   5.,  46.,   0.,   0.,   0.],
       [ 13.,  45.,  82.,  93.,   9.,  95.,  75.,  42.,   0.,   0.],
       [ 37.,  92.,  39.,  64.,   4.,  74.,   0.,   0.,   0.,   0.],
       [ 99.,  84.,  59.,  18.,  32.,  73.,   0.,   0.,   0.,   0.],
       [  8.,  83.,   7.,  21.,  10.,  78.,   0.,   0.,   0.,   0.]])

In [15]:
padded_outputs = np.array([np.concatenate((i,
            [0 for j in range(10-len(i))])) for i in target_inputs])
padded_outputs[:5]

array([[ 13.,  73.,  66.,  15.,  58.,  16.,  34.,  14.,   0.,   0.],
       [ 88.,  94.,  16.,  32.,  59.,   0.,   0.,   0.,   0.,   0.],
       [ 93.,  14.,  26.,  18.,  62.,  83.,   0.,   0.,   0.,   0.],
       [  9.,  51.,  96.,  22.,  86.,  98.,   0.,   0.,   0.,   0.],
       [ 37.,  46.,  80.,   2.,  84.,  14.,  30.,   0.,   0.,   0.]])

### Generate a mask to apply to the source and target

The mask is used in the calculation of loss. The mask is 1 when the input is > 0 and 0 otherwise. When there are sentences which are shorter than the maximum length we pad them with 0s. So the RNN outputs that correspond to these 0s should not be included in the calculation of the loss. 

In [14]:
source_mask_array = 1.0*(padded_inputs>0)
source_mask_array[:5]

array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.]])

In [17]:
target_mask_array = 1.0*(padded_outputs>0)
target_mask_array[:5]

array([[ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.]])

## Bi-directional RNN to calculate the input matrix representation
The first step is to run a bidirectional RNN and get all the states which are used to create the matrix representation of the input sentence. 

In the output matrix there will be one column per word which has two halves concatenated together:
- a “forward representation”, i.e., a word and its left context (sentence read from left to right)
- a “reverse representation”, i.e., a word and its right context (sentence read from right to left)

We will use tensorflow's birectional_dynamic_RNN for this purpose.

### Define Parameters for model

In [871]:
batch_size = 3
source_vocab_size = 100
target_vocab_size = 100
embed_size = 20
hidden_size = 25
align_size = 15

### Define placeholder for the input

In [872]:
source_ids = tf.placeholder(dtype=tf.int32,shape=[None,None])
source_seq_lens = tf.placeholder(dtype=tf.int32,shape=[None])
source_mask = tf.placeholder(dtype=tf.float32,shape=[None,None])

### Operation to look up embeddings for input

In [873]:
source_embeddings = tf.get_variable('source_embedding_matrix',
                            [source_vocab_size+1, embed_size])
enc_inputs = tf.nn.embedding_lookup(source_embeddings, source_ids)

###  Define RNN

We will use GRU cells for both directions of the RNN and use tensorflow's bidirectional_dynamic_rnn to dynamically unroll the network.

In [875]:
enc_fw_cell = tf.contrib.rnn.GRUCell(hidden_size)
enc_bw_cell = tf.contrib.rnn.GRUCell(hidden_size)

enc_outputs, states = \
tf.nn.bidirectional_dynamic_rnn(cell_fw = enc_fw_cell, 
                                         cell_bw = enc_bw_cell,
                                         inputs = enc_inputs,
                                         sequence_length = source_seq_lens,
                                         dtype = tf.float32)

# concatenate the forward and backward representations
concat_enc_outputs = tf.concat(enc_outputs, axis=2)

So far this is not much different from running an ordinary RNN as you might do for a language modelling task. However now we need to implement a decoder with attention. Whilst tensorflow has functions that can simplify this process, let us go through it step-by-step and create all the necessary intermediary variables. 

First we need to calculate an initial state for the decoder. 

Dimensions of the different matrices.

- X - batch_size x max_sequence_length
- rnn_inputs - batch_size x max_sequence_length x source_embed_size
- Each output - batch_size x max_sequence_length x encoder_state_size
- F = concat_outputs - batch_size x (max_sequence_length\*2) x encoder_state_size
- U - encoder_state_size x decoder_state_size
- bw_output1 - batch_size x encoder_state_size

- s_0 = decoder_state = tf.matmul(U,tf.transpose(bw_output1))

- s_0 = decoder_state - batch_size x decoder_state_size

- W - [dim] x (max_sequence_length\*2)

- X - batch_size x [dim] x encoder_state_size

- V - [dim] x 


- c_t - batch_size x target_embed_size
- P - vocab_size x decoder_state_size
- b - vocab_size

$U$ is called the attention energy. It is calculated by taking the dot product with every column in the source matrix

In [28]:
bw_enc_output1 = enc_outputs[-1][:,1,:] # this is the last hidden state of the encoder in the reverse direction
U = tf.get_variable('U',[hidden_size,hidden_size])

$s_0$ this is the last hidden state of the encoder

In [None]:
decoder_state = tf.matmul(bw_enc_output1,U)

$W$ is a learned parameter of the model - a weight matrix applied to the input

In [881]:
batch_size = tf.placeholder(tf.int32)
W = tf.get_variable('W',[1,2*hidden_size,align_size],dtype=tf.float32)
W_rep = tf.tile(W,[batch_size,1,1])

We do the calculation for $WF$ in advance as it does not depend on the output dimensions.

In [882]:
X = tf.matmul(concat_enc_outputs,W_rep) # concat_enc_outputs is F

$V$ is a learned parameter of the model - weight matrix applied for transforming the previous hidden state

In [883]:
V = tf.get_variable('V',[hidden_size,align_size])
v = tf.get_variable('v',[1,align_size,1])
v_rep = tf.tile(v,[batch_size,1,1])

$P$ and $b$ are also learned parameters of the model - the weights and the biases for the output layer

In [886]:
P = tf.get_variable('P',[hidden_size,target_vocab_size+2])
b = tf.get_variable('b',[target_vocab_size+2])

An embedding matrix for the target vocabulary

In [887]:
target_embeddings = tf.get_variable('target_embedding_matrix',
                            [target_vocab_size+2, embed_size])

Create an RNN cell for the decoder

In [888]:
with tf.variable_scope('RNN'):
    decoder_cell = tf.contrib.rnn.GRUCell(hidden_size)
    
decoder_embed = tf.nn.embedding_lookup(target_embeddings, 
                                       tf.ones([batch_size],tf.int32))

decoder_output = tf.placeholder(tf.int32,[None,None])
decoder_length = 10

target_mask = tf.placeholder(dtype=tf.float32,shape=[None,None])
target_seq_lens = tf.placeholder(dtype=tf.int32,shape=[None])

total_loss = 0

## Define the decoder operations

Here we will the operations of the decoder as outlined in slide 72 of Lecture 8. For each time step of the decoder, we calculate the attention weights and then apply this to the input to calculate the output at the current time step.

In [895]:
for t in range(decoder_length):
    with tf.variable_scope('RNN'):
        logits = []
        if t > 0:
            tf.get_variable_scope().reuse_variables()
            
        # compute attention
        r_t = tf.matmul(decoder_state,V)
        
        tanh_input = X + tf.expand_dims(r_t,axis=1)
        u_t = tf.matmul(tanh_input,v_rep)
        u_t = tf.squeeze(u_t,axis=2)
        
        # for this time step calculate the attention weight for each word of the input
        exp_u_t = tf.exp(u_t)
        softmax_denom = tf.reduce_sum(exp_u_t*source_mask,axis=1,keep_dims=True)
        a_t = exp_u_t/softmax_denom
        a_t = a_t*source_mask
        a_expn = tf.expand_dims(a_t,axis=1)
        
        # apply attention weights to the input matrix F
        c_t = tf.matmul(a_expn,concat_enc_outputs)
        c_t = tf.squeeze(c_t,axis=1)
        
        # use the attented input to calculate the 
        decoder_input = tf.concat([decoder_embed,c_t],
                                  axis=1)
        decoder_state,_ = decoder_cell(decoder_input,
                                       decoder_state) 
        
        logit = tf.nn.xw_plus_b(decoder_state,P,b)
        
        # choose the output word based on the argmax of the output logits 
        logits.append(tf.argmax(logit, 1))
        
        # transform the output ids into one hot vectors of length vocab size in order to calculate cross entropy
        onehot = tf.one_hot(decoder_output[:,t],
                            depth=target_vocab_size+2)
        
        # compute cross entropy loss 
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                        logits = logit,
                        labels = onehot
                    )
        
        xe_mask = tf.cast(tf.greater(decoder_output[:,t],0),tf.float32)
        
        cross_entropy = xe_mask*cross_entropy
        
        loss = tf.reduce_sum(cross_entropy)

        total_loss += loss
    
avg_loss = total_loss / tf.reduce_sum(target_mask)
train_op = tf.train.AdamOptimizer(1e-4).minimize(avg_loss)

## Run the decoder 

Here we define a TensorFlow session, initialise the variables for the source and target, and execute the operations of the decoder.

In [21]:
# set the parameters for the decoder
num_examples = len(padded_inputs)
batch_size_ = 6
start = 0

In [23]:
for i in range(math.ceil(num_examples/batch_size_)):
    start = i*batch_size_
    end = start + batch_size_
    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)
        logits_,avg_loss_,_ = sess.run([logits,avg_loss,train_op],
                            feed_dict={
                                       batch_size:min(num_examples, end) - start,
                                       source_mask:source_mask_array[start:end,:],
                                       target_mask:target_mask_array[start:end,:],
                                       source_ids:padded_inputs[start:end], 
                                       source_seq_lens:source_lens[start:end],
                                       target_seq_lens:target_lens[start:end],
                                       decoder_output:padded_outputs[start:end,:],
                                      })
    print(logits_,avg_loss_)