We implement Bahdanau et al. (2015)'s attention architecture with an encoder-decoder to translate French and English phrases from the Tatoeba Project.

In [5]:
import pandas as pd
import numpy as np
import scipy as sp
import tensorflow as tf
import tensorflow.keras as keras
import unicodedata
import gc
import collections
import os
import time
import re
import pickle
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from copy import deepcopy
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout, Activation, Bidirectional, Embedding, Input

<h2> STEP 1: Data Preprocessing

In [6]:
eng, frn = collections.deque(), collections.deque()
with open("./data/fra.txt",encoding='utf-8') as f:
    for line in f.readlines():
        text = line.strip().split("\t")
        eng.append(text[0])
        frn.append(text[1])
eng, ind = np.unique(eng, return_index=True)
frn = np.array(frn)[ind]

In [7]:
def preprocess(string):
    regexp = re.compile(r'\s+', re.UNICODE)
    ns = regexp.sub(' ', string)
    ns = re.sub("[^a-zA-Z0-9«».,?!\"\']"," ",ns)
    ns = re.sub(r'([«».,?!\"\'])', r' \1 ', ns)
    return ns.lower()
eng = np.vectorize(preprocess)(eng)
eng = [elem.split() for elem in eng]
frn = np.vectorize(preprocess)(frn)
frn = [elem.split() for elem in frn]

In [8]:
#Enforce a 14-word restriction on the set
mask = np.array([len(elem)<=14 for elem in eng])
mask = mask & np.array([len(elem)<=14 for elem in frn])
eng = [eng[i] for i in range(len(eng)) if mask[i]]
frn = [frn[i] for i in range(len(frn)) if mask[i]]

In [9]:
#Tokenize
def tokenize(sents):
    data = np.zeros((len(sents),16), dtype=np.int64)
    word_to_index, index_to_word = {"<begin>":1,"<end>":2}, {1:"<begin>", 2:"<end>"}
    curindex = 3
    for i in range(len(sents)):
        data[i,0] = 1
        for j in range(len(sents[i])):
            if word_to_index.get(sents[i][j], None) is None:
                word_to_index[sents[i][j]] = curindex
                index_to_word[curindex] = sents[i][j]
                curindex+=1
            data[i,j+1] = word_to_index[sents[i][j]]
        data[i,len(sents[i])+1] = word_to_index["<end>"]
    return data, word_to_index, index_to_word

In [10]:
#Complete Tokenization and Create train-test sets
engdata, engword_to_index, engindex_to_word = tokenize(eng)
frndata, frnword_to_index, frnindex_to_word = tokenize(frn)
train_eng, test_eng = engdata[:90000], engdata[90000:]
train_frn, test_frn = frndata[:90000], frndata[90000:]
train_eng.shape, test_eng.shape, train_frn.shape, test_frn.shape

((90000, 16), (26084, 16), (90000, 16), (26084, 16))

<h2> STEP 2: Designing Encoder, Decoder, and Attention Systems

In [11]:
def build_encoder():
    tf.keras.backend.clear_session()
    inp = Input((16,))
    embed = Embedding(len(engword_to_index)+1, 256, embeddings_initializer="uniform")
    rep = embed(inp)
    encoding, hidden_h, hidden_c = LSTM(512, return_sequences=True, return_state=True)(rep)
    return Model(inputs=inp, outputs=[encoding, hidden_h, hidden_c], name="Encoder")
build_encoder().summary()

Model: "Encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 16)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 16, 256)           3398144   
_________________________________________________________________
lstm (LSTM)                  [(None, 16, 512), (None,  1574912   
Total params: 4,973,056
Trainable params: 4,973,056
Non-trainable params: 0
_________________________________________________________________


In [12]:
def build_attention():
    tf.keras.backend.clear_session()
    #Take in inputs from encoder
    enc_output = Input((16,512))
    hidden_h = Input((512,))
    #expand dims to broadcast to the output shape
    hidden = tf.expand_dims(hidden_h, axis=1)
    #Define the attention layer's sub-layers
    dense1 = Dense(units=512, activation=None)
    dense2 = Dense(units=512, activation=None)
    mid = Activation(activation="tanh")
    final = Dense(units=1, activation=None)
    #Calculate score and attention matrix
    score = final(mid(dense1(enc_output)+dense2(hidden)))
    attmatrix = tf.nn.softmax(score, axis=1)
    vector = tf.reduce_sum(attmatrix * enc_output, axis=1)
    return Model(inputs=[enc_output, hidden_h], outputs=[vector, attmatrix], name="Bahdanau-Attention")
build_attention().summary()

Model: "Bahdanau-Attention"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 16, 512)]    0                                            
__________________________________________________________________________________________________
tf_op_layer_ExpandDims (TensorF [(None, 1, 512)]     0           input_2[0][0]                    
__________________________________________________________________________________________________
dense (Dense)                   (None, 16, 512)      262656      input_1[0][0]                    
_________________________________________________________________________________

In [13]:
def build_decoder(attlayer):
    tf.keras.backend.clear_session()
    #Read in Encoder and previous-prediction Decoder input
    enc_output = Input((16,512))
    hidden_h, hidden_c = Input((512,)), Input((512,))
    prevpred = Input((1,))
    #Run Bahdanau Attention
    vector, attmatrix = attlayer([enc_output, hidden_h])
    #Extract the French Embedding
    embed = Embedding(len(frnword_to_index)+1, 256, embeddings_initializer="uniform")
    rep = embed(prevpred)
    rep = tf.concat([tf.expand_dims(vector, axis=1), rep], axis=2)
    #Run a Forward LSTM
    recur = LSTM(512, return_sequences=True, return_state=True)
    pred, newhidden_h, newhidden_c = recur(rep, initial_state=[hidden_h, hidden_c])
    pred = tf.squeeze(pred, [1])
    #Predict Next Word
    pred = Dense(len(frnword_to_index)+1)(pred)
    return Model(inputs=[enc_output, hidden_h, hidden_c, prevpred], 
                 outputs=[pred, newhidden_h, newhidden_c, attmatrix], name="Decoder")
build_decoder(build_attention()).summary()

Model: "Decoder"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 16, 512)]    0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 512)]        0                                            
__________________________________________________________________________________________________
Bahdanau-Attention (Model)      [(None, 512), (None, 525825      input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                      

<h2> STEP 3: Build Training Infrastructure

In [29]:
optimizer = tf.keras.optimizers.Adam()
def loss(true, pred):
    ls = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
    ls = ls(true, pred)
    print(ls)
    temptrue = tf.cast(true, tf.float32)
    ls = tf.where(tf.math.equal(temptrue, 0.0), 0.0, ls)
    return tf.reduce_mean(ls)

In [15]:
#Build all model graphs
encoder = build_encoder()
attnlayer = build_attention()
decoder = build_decoder(attnlayer)

In [16]:
@tf.function
def batch_trainer(engsent, frnsent):
    batchloss = 0
    with tf.GradientTape() as tape:
        encoutput, hh, hc = encoder(engsent)
        prevpred = tf.expand_dims(frnsent[:,0], axis=1)
        for i in range(1,frnsent.shape[1]):
            pred, hh, hc, _ = decoder([encoutput, hh, hc, prevpred])
            batchloss+=loss(frnsent[:,i], pred)
            prevpred = tf.expand_dims(frnsent[:,i], axis=1)
        batchloss /= frnsent.shape[1]
        grads = tape.gradient(batchloss, encoder.trainable_variables+decoder.trainable_variables)
        optimizer.apply_gradients(zip(grads, encoder.trainable_variables+decoder.trainable_variables))
    return batchloss

In [None]:
numepochs=100
batchsize = 128
trainedges = np.arange(0, train_eng.shape[0]+batchsize, batchsize)
trainloss = collections.deque()
for epoch in range(numepochs):
    eptrain = 0
    for i in range(len(trainedges)-1):
        eptrain+=batch_trainer(train_eng[trainedges[i]:trainedges[i+1]], 
                               train_frn[trainedges[i]:trainedges[i+1]])
    trainloss.append(eptrain/(len(trainedges)-1))

In [None]:
pickle.dump(trainloss, open("./data/trainloss.pkl","wb"))
encoder.save("./data/encoder.h5")
decoder.save("./data/decoder.h5")

<h2>STEP 4: Visualize Performance

In [18]:
encoder.load_weights("./data/encoder.h5")
decoder.load_weights("./data/decoder.h5")

In [44]:
def evaluator(engsent, frnsent):
    #Set up the sentence prediction matrix
    predfrnsent = np.zeros(frnsent.shape, dtype=np.int64)
    predfrnsent[:,0] = frnsent[:,0]
    #Set up the attention matrix
    frn_attn_matrix = np.zeros((frnsent.shape[0], engsent.shape[1], frnsent.shape[1]))
    encoutput, hc, hh = encoder.predict(engsent)
    prevpred = deepcopy(frnsent[:,0]).reshape(-1,1)
    for i in range(1,frnsent.shape[1]):
        pred, hh, hc, attmatrix = decoder.predict([encoutput, hh, hc, prevpred])
        predfrnsent[:,i] = np.argmax(pred, axis=1)
        prevpred = predfrnsent[:,i].reshape(-1,1)
        frn_attn_matrix[:,:,i] = attmatrix.reshape(-1,16)
    return predfrnsent, frn_attn_matrix

In [57]:
def get_sentences(sent, index_to_word):
    ret = collections.deque()
    for i in range(sent.shape[0]):
        phrase = ""
        for j in range(sent.shape[1]):
            phrase+=index_to_word[sent[i,j]]+" "
            if index_to_word[sent[i,j]]=="<end>":
                break
        ret.append(phrase)
    return ret

In [58]:
pred = np.zeros(test_frn.shape, dtype=np.int64)
batchsize = 256
edges = np.arange(0,  pred.shape[0]+batchsize, batchsize)
for i in range(len(edges)-1):
    pred[edges[i]:edges[i+1]] = evaluator(test_eng[edges[i]:edges[i+1]],
                                         test_frn[edges[i]:edges[i+1]])[0]

In [61]:
predictedsent = get_sentences(pred, frnindex_to_word)
truesent = get_sentences(test_frn, frnindex_to_word)