## Implementing RNN with Attention for the task of date normalization

The technique used is outlined in the paper <i>Neural Machine Translation by Jointly Learning to Align and Translate</i>. This project is a component of the deeplearning.ai deep learning course, and the helper functions used to create the fake date data were ported from their tools (as was much of the architecture). Here, the model is tasked with translating dates such as "25th of october 1990" to "1990-10-25". The output of this model is of fixed length, so there is no need for a token indicating the end of the output sequence. In addition, I built a standard encoder-decoder network in order to compare performance. 

In [1]:
from tensorflow.keras import backend as K
from tensorflow.keras.layers import RepeatVector,LSTM,Bidirectional,Dense,ReLU,Softmax
from tensorflow.keras.layers import Input,Concatenate,Dot,Activation,Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import numpy as np
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.insert(1,'../helpers/')
from nmt_utils import *

In [2]:
# Vocabularies returned by the loader:
#   human_vocab       - characters, numbers, and certain symbols
#   machine_vocab     - numbers, and the "-" symbol
#   inv_machine_vocab - maps a model prediction argmax back to its character
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m=10000)

# The output is always length 10; inputs are assumed to be at most 30 long.
X, Y, Xoh, Yoh = preprocess_data(
    dataset,
    human_vocab,
    machine_vocab,
    Tx=30,
    Ty=10,
)
print(Xoh.shape, Yoh.shape)  # shapes of the one-hot encodings

(10000, 30, 37) (10000, 10, 11)


In [3]:
dataset[:3] # peek at the first three (human_input, machine_output) pairs

[('9 may 1998', '1998-05-09'),
 ('10.11.19', '2019-11-10'),
 ('9/10/70', '1970-09-10')]

In [4]:
# to run keras functions with numpy data
# sess = tf.InteractiveSession()
# a = np.array([1.0,2,3,4,5])
# out = softmax(a)
# out.eval()
# sess.close()

### RNN With Attention

In [6]:
# These layers are created once, as globals, so that their parameters are
# shared across decoding steps instead of being re-initialized inside the
# model's inner loop.
repeat = RepeatVector(n=30)  # 30 is the max size of the input
concatenate = Concatenate()
dense_layer = Dense(units=1, activation="relu")  # single-layer DNN used to calculate the attention weights
softmax = Softmax(axis=1)  # if axis is not 1 the softmax won't work, because the shape is (None,30,1)
dot_product = Dot(axes=1)
post_activation_lstm_cell = LSTM(units=128, activation="tanh", return_state=True)  # returns the states in addition to the output
output_layer = Dense(units=11, activation="softmax")

In [7]:
def one_step_attention(a,s):
    """ Computes the attention context vector for a single decoder timestep
    args:
        a: output from bidirectional rnn
        s: rnn state at the previous timestep; (,128)
    returns:
        the context tensor produced by weighting `a` with the attention weights
    """
    tiled_state = repeat(s)  # (30,128), one copy of the state per input position
    joined = concatenate([a, tiled_state])  # (30,256)
    energies = dense_layer(joined)  # (30,1), unbounded attention weights
    weights = softmax(energies)  # (30,1), attention weights after scaling
    # (1,128), the input context vector for the post-attention lstm
    return dot_product([weights, a])

In [8]:
def model(input_len=30,output_len=10,in_vocab_size=37,out_vocab_size=11,pre_att_n=64,post_att_n=128):
    """ Returns the RNN-with-attention model object
    args:
        input_len: length of input
        output_len: length of output
        in_vocab_size: size of vocabulary for input
        out_vocab_size: size of vocabulary for output
        pre_att_n: pre-attention lstm number of hidden units
        post_att_n: post-attention lstm number of hidden units
    returns:
        a Model taking [X, S, C] (input sequence, starting hidden state,
        starting cell state) and producing a list of `output_len` outputs
    raises:
        ValueError: if input_len, out_vocab_size or post_att_n disagree with
            the sizes of the shared global layers (repeat, output_layer,
            post_activation_lstm_cell), which are created with fixed sizes
    """
    # The attention/decoder layers are shared globals with fixed sizes. Without
    # this check a mismatching argument is either silently ignored or only
    # surfaces as a shape error deep inside Keras.
    fixed_sizes = (
        ("input_len", input_len, repeat.n),
        ("out_vocab_size", out_vocab_size, output_layer.units),
        ("post_att_n", post_att_n, post_activation_lstm_cell.units),
    )
    for arg_name, requested, layer_size in fixed_sizes:
        if requested != layer_size:
            raise ValueError(
                "{}={} does not match the size {} of the shared layer".format(
                    arg_name, requested, layer_size
                )
            )

    X = Input(shape=(input_len,in_vocab_size),name="X")
    So = Input(shape=(post_att_n,),name="S") # starting hidden state (zeros)
    Co = Input(shape=(post_att_n,),name="C") # starting cell state (zeros)
    s = So
    c = Co

    # pre-attention encoder; forward and backward outputs are concatenated
    a = Bidirectional(LSTM(pre_att_n,activation="tanh",return_sequences=True),merge_mode="concat")(X)
    outputs = []

    for _ in range(output_len):
        context = one_step_attention(a,s)
        # one step with the post-attention lstm; return_state=True yields three
        # values, of which the first and third are carried to the next step
        s,_unused,c = post_activation_lstm_cell(context,initial_state=[s,c])
        out = output_layer(s) # linear layer, followed by softmax activation
        outputs.append(out)

    return Model(inputs=[X,So,Co],outputs=outputs)

In [10]:
# build the attention model using the default sizes from model()'s signature
rnn_model = model()

In [12]:
# `learning_rate` is the supported argument name; the older `lr` alias is deprecated
out = rnn_model.compile(optimizer=Adam(learning_rate=0.005),metrics=['accuracy'],loss='categorical_crossentropy')

In [13]:
# One zero-initialized hidden state (s0) and cell state (c0) per training
# example; the count is taken from the data rather than hard-coded so it
# stays correct if the dataset size changes.
n_examples = Xoh.shape[0]
s0 = np.zeros((n_examples, 128))
c0 = np.zeros((n_examples, 128))
# swap the first two axes of Yoh and split into a list, one target array per model output
outputs = list(Yoh.swapaxes(0,1))

In [None]:
# train on the one-hot inputs plus the zero initial states
rnn_model.fit(
    x=[Xoh, s0, c0],
    y=outputs,
    batch_size=100,
    epochs=20,
)

In [15]:
example_i = 2000 # index of the example used for the spot check
print(dataset[example_i]) # show its (human_input, machine_output) pair

('feb 12 1970', '1970-02-12')


In [22]:
# inputs for a single example: zero initial states and the chosen one-hot
# input with a new leading axis added
s0, c0 = np.zeros((1,128)), np.zeros((1,128))
example_x = np.expand_dims(Xoh[example_i], axis=0)

In [23]:
# run the model on the example input together with the initial states
prediction = rnn_model.predict([example_x,s0,c0])

In [25]:
prediction = np.argmax(prediction,axis=-1) # index with the largest probability, taken over the last axis

In [26]:
# Flatten first so each index is a scalar: calling int() on a 1-element array
# is deprecated in recent NumPy releases.
output = [inv_machine_vocab[int(i)] for i in np.ravel(prediction)] # turning prediction back into text

In [27]:
''.join(output) # slight error in the prediction: the day comes out as 22 instead of 12

'1970-02-22'

### Baseline Encoder-Decoder

In [4]:
# These layers are created once, as globals, so that their parameters are not
# re-initialized inside the model's inner loop.
softmax = Softmax()
output_layer = Dense(units=11, activation='softmax')
decoder_lstm_cell = LSTM(units=138, activation="tanh", return_state=True)

In [5]:
def model(input_len=30,output_len=10,in_vocab_size=37,out_vocab_size=11):
    """ Returns model object, w/ comparable number of trainable parameters as the rnn w/ attention model
    args:
        input_len: length of input
        output_len: length of output
        in_vocab_size: size of vocabulary for input
        out_vocab_size: size of vocabulary for output
    returns:
        a Model taking [X, in_0] (input sequence, first decoder input) and
        producing a list of `output_len` outputs
    raises:
        ValueError: if out_vocab_size disagrees with the size of the shared
            global output_layer, which is created with a fixed size
    """
    # output_layer is a shared global with a fixed width; each step's output is
    # reshaped and fed back to the decoder, so the widths must agree
    if out_vocab_size != output_layer.units:
        raise ValueError(
            "out_vocab_size={} does not match the size {} of the shared output layer".format(
                out_vocab_size, output_layer.units
            )
        )
    # the encoder states initialize the decoder, so both must have the same size
    state_size = decoder_lstm_cell.units

    X = Input(shape=(input_len,in_vocab_size),name="X")
    in_0 = Input(shape=(1,out_vocab_size),name="in_0") # first input vector to decoder network (zeros)
    out = in_0

    # encoding the input into a single vector of size state_size:
    s,_unused,c = LSTM(state_size,activation="tanh",return_sequences=False,return_state=True)(X) # encoder
    outputs = []

    for _ in range(output_len):
        s,_unused,c = decoder_lstm_cell(out,initial_state=[s,c])
        out = output_layer(s) # linear layer, followed by softmax activation
        outputs.append(out)
        out = Reshape(target_shape=(1,out_vocab_size))(out) # serves as the next input to the decoder

    return Model(inputs=[X,in_0],outputs=outputs)

In [6]:
# build the baseline encoder-decoder using the default sizes from model()'s signature
rnn_model = model()

In [8]:
# `learning_rate` is the supported argument name; the older `lr` alias is deprecated
out = rnn_model.compile(optimizer=Adam(learning_rate=0.005),metrics=['accuracy'],loss='categorical_crossentropy')

In [9]:
# One all-zero first decoder input per training example; the count is taken
# from the data rather than hard-coded so it stays correct if the dataset
# size changes.
in0 = np.zeros((Xoh.shape[0],1,11))
# swap the first two axes of Yoh and split into a list, one target array per model output
outputs = list(Yoh.swapaxes(0,1))

In [None]:
# train on the one-hot inputs plus the zero first decoder input
rnn_model.fit(
    x=[Xoh, in0],
    y=outputs,
    batch_size=100,
    epochs=20,
)

In [11]:
example_i = 2000 # index of the example used for the spot check
print(dataset[example_i]) # show its (human_input, machine_output) pair

('feb 12 1970', '1970-02-12')


In [14]:
# inputs for a single example: an all-zero first decoder input and the chosen
# one-hot input with a new leading axis added
in0 = np.zeros((1,1,11))
example_x = np.expand_dims(Xoh[example_i], axis=0)

In [15]:
# run the model on the example input together with the first decoder input
prediction = rnn_model.predict([example_x,in0])

In [16]:
prediction = np.argmax(prediction,axis=-1) # index with the largest probability, taken over the last axis

In [17]:
# Flatten first so each index is a scalar: calling int() on a 1-element array
# is deprecated in recent NumPy releases.
output = [inv_machine_vocab[int(i)] for i in np.ravel(prediction)] # turning prediction back into text

In [18]:
''.join(output) # the decoded characters joined into a single string

'1970-02-12'