We implement Bahdanau et al. (2015)'s attention architecture with an encoder-decoder to translate French and English phrases from the Tatoeba Project.

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import tensorflow as tf
import tensorflow.keras as keras
import unicodedata
import gc
import collections
import os
import time
import re
import pickle
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout, Activation, Bidirectional, Embedding, Input

<h2> STEP 1: Data Preprocessing

In [2]:
eng, frn = collections.deque(), collections.deque()
with open("./data/fra.txt",encoding='utf-8') as f:
    for line in f.readlines():
        text = line.strip().split("\t")
        eng.append(text[0])
        frn.append(text[1])
eng, ind = np.unique(eng, return_index=True)
frn = np.array(frn)[ind]

In [3]:
def preprocess(string):
    regexp = re.compile(r'\s+', re.UNICODE)
    ns = regexp.sub(' ', string)
    ns = re.sub("[^a-zA-Z0-9«».,?!\"\']"," ",ns)
    ns = re.sub(r'([«».,?!\"\'])', r' \1 ', ns)
    return ns.lower()
eng = np.vectorize(preprocess)(eng)
eng = [elem.split() for elem in eng]
frn = np.vectorize(preprocess)(frn)
frn = [elem.split() for elem in frn]

In [4]:
#Enforce a 14-word restriction on the set
mask = np.array([len(elem)<=14 for elem in eng])
mask = mask & np.array([len(elem)<=14 for elem in frn])
eng = [eng[i] for i in range(len(eng)) if mask[i]]
frn = [frn[i] for i in range(len(frn)) if mask[i]]

In [5]:
#Tokenize
def tokenize(sents):
    data = np.zeros((len(sents),16), dtype=np.int64)
    word_to_index, index_to_word = {"<begin>":1,"<end>":0}, {1:"<begin>", 0:"<end>"}
    curindex = 2
    for i in range(len(sents)):
        data[i,0] = 1
        for j in range(len(sents[i])):
            if word_to_index.get(sents[i][j], None) is None:
                word_to_index[sents[i][j]] = curindex
                index_to_word[curindex] = sents[i][j]
                curindex+=1
            data[i,j+1] = word_to_index[sents[i][j]]
    return data, word_to_index, index_to_word

In [6]:
#Complete Tokenization and Create train-test sets
engdata, engword_to_index, engindex_to_word = tokenize(eng)
frndata, frnword_to_index, frnindex_to_word = tokenize(frn)
train_eng, test_eng = engdata[:90000], engdata[90000:]
train_frn, test_frn = frndata[:90000], frndata[90000:]
train_eng.shape, test_eng.shape, train_frn.shape, test_frn.shape

((90000, 16), (26084, 16), (90000, 16), (26084, 16))

<h2> STEP 2: Designing Encoder, Decoder, and Attention Systems

In [7]:
def build_encoder():
    tf.keras.backend.clear_session()
    inp = Input((16,))
    embed = Embedding(len(engword_to_index), 256, embeddings_initializer="uniform")
    rep = embed(inp)
    recur = Bidirectional(LSTM(256, return_sequences=True, return_state=True), merge_mode="concat")
    encoding, fwd_hidden_1, fwd_hidden_2, _, _ = recur(rep)
    return Model(inputs=inp, outputs=[encoding, fwd_hidden_1, fwd_hidden_2], name="Encoder")
build_encoder().summary()

Model: "Encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 16)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 16, 256)           3397888   
_________________________________________________________________
bidirectional (Bidirectional [(None, 16, 512), (None,  1050624   
Total params: 4,448,512
Trainable params: 4,448,512
Non-trainable params: 0
_________________________________________________________________


In [8]:
def build_attention():
    tf.keras.backend.clear_session()
    #Take in inputs from encoder
    enc_output = Input((16,512))
    hidden_1, hidden_2 = Input((256,)), Input((256,))
    #Concatenate hidden states and expand dims to broadcast to the output shape
    hidden = tf.expand_dims(tf.concat([hidden_1, hidden_2], axis=1), axis=1)
    #Define the attention layer's sub-layers
    dense1 = Dense(units=512, activation=None)
    dense2 = Dense(units=512, activation=None)
    mid = Activation(activation="tanh")
    final = Dense(units=1, activation=None)
    #Calculate score and attention matrix
    score = final(mid(dense1(enc_output)+dense2(hidden)))
    attmatrix = tf.nn.softmax(score, axis=1)
    vector = tf.reduce_sum(attmatrix * enc_output, axis=1)
    return Model(inputs=[enc_output, hidden_1, hidden_2], outputs=[vector, attmatrix], name="Bahdanau-Attention")
build_attention().summary()

Model: "Bahdanau-Attention"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
tf_op_layer_concat (TensorFlowO [(None, 512)]        0           input_2[0][0]                    
                                                                 input_3[0][0]                    
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 16, 512)]    0                           

In [9]:
def build_decoder(attlayer):
    tf.keras.backend.clear_session()
    #Read in Encoder and previous-prediction Decoder input
    enc_output = Input((16,512))
    hidden_1, hidden_2 = Input((256,)), Input((256,))
    prevpred = Input((1,))
    #Run Bahdanau Attention
    vector, attmatrix = attlayer([enc_output, hidden_1, hidden_2])
    #Extract the French Embedding
    embed = Embedding(len(frnword_to_index), 256, embeddings_initializer="uniform")
    rep = embed(prevpred)
    rep = tf.concat([tf.expand_dims(vector, axis=1), rep], axis=2)
    #Run a Forward LSTM
    recur = LSTM(256, return_state=True)
    pred, newhidden_1, newhidden_2 = recur(rep, initial_state=[hidden_1, hidden_2])
    #Predict Next Word
    pred = Dense(len(frnword_to_index), activation="sigmoid")(pred)
    return Model(inputs=[enc_output, hidden_1, hidden_2, prevpred], 
                 outputs=[pred, newhidden_1, newhidden_2, attmatrix], name="Decoder")
build_decoder(build_attention()).summary()

Model: "Decoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 16, 512)]    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
Bahdanau-Attention (Model)      [(None, 512), (None, 525825      input_1[0][0]                    
                                                                 input_2[0][0]              

<h2> STEP 3: Build Training Infrastructure

In [10]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
def loss(true, pred):
    ls = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
    ls = ls(true, pred)
    temptrue = tf.cast(true, tf.float32)
    ls = tf.where(tf.math.equal(temptrue, 0.0), 0.0, ls)
    return tf.reduce_mean(ls)

In [11]:
#Build all model graphs
encoder = build_encoder()
attnlayer = build_attention()
decoder = build_decoder(attnlayer)

In [12]:
@tf.function
def batch_trainer(engsent, frnsent):
    batchloss = 0
    with tf.GradientTape() as tape:
        encoutput, h1, h2 = encoder(engsent)
        prevpred = tf.expand_dims(frnsent[:,0], axis=1)
        for i in range(1,frnsent.shape[1]):
            prevpred, h1, h2, _ = decoder([encoutput, h1, h2, prevpred])
            batchloss+=loss(frnsent[:,i], prevpred)
            prevpred = tf.expand_dims(tf.argmax(prevpred, axis=-1), axis=1)
        grads = tape.gradient(batchloss, encoder.trainable_variables+decoder.trainable_variables)
        optimizer.apply_gradients(zip(grads, encoder.trainable_variables+decoder.trainable_variables))
    return batchloss.numpy()

In [13]:
numepochs=2
batchsize = 256
edges = np.arange(0, train_eng.shape[0]+batchsize, batchsize)
lossq = collections.deque()
for epoch in range(numepochs):
    eploss = 0
    for i in range(len(edges)-1):
        eploss+=batch_trainer(train_eng[edges[i]:edges[i+1]], train_frn[edges[i]:edges[i+1]])
    lossq.append(eploss/(len(edges)-1))

In [None]:
pickle.dump(lossq, open("./lossvec.pkl","wb"))
encoder.save("./encoder.h5")
decoder.save("./decoder.h5")

<h2>STEP 4: Visualize Performance

In [14]:
lossq

deque([<tf.Tensor: shape=(), dtype=float32, numpy=77.526215>,
       <tf.Tensor: shape=(), dtype=float32, numpy=77.38449>])

In [16]:
lossq[0].numpy()

77.526215