## Transformer Implementation

Based on the following paper: <i>Attention Is All You Need</i>. The following are good guides: [here](http://www.peterbloem.nl/blog/transformers) and [here](http://jalammar.github.io/illustrated-transformer/). Task is date normalization/translation.

In [1]:
import math
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Embedding,Input,Dense,Dot,LayerNormalization,Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.losses import CategoricalCrossentropy
import numpy as np

tf.keras.backend.set_floatx('float64')
import warnings
warnings.filterwarnings('ignore')
import sys
sys.path.insert(1,'../helpers/')
from nmt_utils import *
tf.compat.v1.enable_eager_execution()

### Data Cleaning

In [2]:
# data for the date normalization task
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m=10000)

In [3]:
# adjusting the data so that the vocab is shared between input and output
## this means that the X and Y data (input to encoder/decoder) use the human_vocab dictionary,
## but the output of the decoder will still use the machine_vocab dictionary
human_vocab['-'] = 37
human_vocab['<s>'] = 38
X,Y,_,_ = preprocess_data(dataset, human_vocab=human_vocab, machine_vocab=human_vocab, Tx=30, Ty=10) # used as input to the encoder/decoder
_,_,Xoh,Yoh = preprocess_data(dataset, human_vocab=human_vocab, machine_vocab=machine_vocab, Tx=30, Ty=10) # used as labels for output of the decoder
print(X.shape,Y.shape) # used for the keras embedding layer

(10000, 30) (10000, 10)


In [4]:
dataset[:2] # (human_input, machine_output)

[('9 may 1998', '1998-05-09'), ('10.11.19', '2019-11-10')]

In [5]:
Y[:2]

array([[ 4, 12, 12, 11, 37,  3,  8, 37,  3, 12],
       [ 5,  3,  4, 12, 37,  4,  4, 37,  4,  3]])

In [6]:
# in order to prevent future information contaminating the model's predictions, the start of the input to the decoder must have a new symbol
# the new symbol will essentially represent zero-padding/no-information
# additionally, the right-most element will be deleted, given that the model will never know the final prediction (last element)
decoder_start = np.ones((10000,1))*38 # the symbol '<s>'
decoder_input = np.hstack([decoder_start,Y[:,:9]])
decoder_input[:2]

array([[38.,  4., 12., 12., 11., 37.,  3.,  8., 37.,  3.],
       [38.,  5.,  3.,  4., 12., 37.,  4.,  4., 37.,  4.]])

In [7]:
# ensuring the datatypes are correct
decoder_input = decoder_input.astype('float64') # input to decoder
X = X.astype('float64') # input to encoder
Yoh = Yoh.astype('float64') # labels for output of decoder

In [8]:
def get_position_vectors(num_positions):
    """ returns position vectors of shape num_positionsx256
    args:
        num_positions: length of the input
    """
    position_embeddings = []
    positions = [i for i in range(num_positions)]
    d=256 # the vector size
    
    for pos in positions: # creating an embedding for each item in sequence
        emb = []
        for i in range(0,128):
            emb.append(math.sin(pos/(10000**(2*i/d))))
            emb.append(math.cos(pos/(10000**(2*i/d))))
        emb = np.array(emb)
        position_embeddings.append(emb)
    
    position_embeddings = np.array(position_embeddings)
    position_embeddings = position_embeddings.astype('float64')
    return position_embeddings

In [9]:
position_embeddings = get_position_vectors(10)
position_embeddings.shape

(10, 256)

In [10]:
def get_offset_mask(seq_len=10):
    """ Returns the mask for the decoder attention mechanism; mask intended to be added pre-softmax
    args:
        seq_len: length of the input to the decoder
    """
    mask = np.zeros((seq_len,seq_len))
    mask_locations = np.triu_indices(seq_len,k=1)
    mask[mask_locations] = float("-inf")
    return mask

In [11]:
mask = get_offset_mask(seq_len=3)
mask

array([[  0., -inf, -inf],
       [  0.,   0., -inf],
       [  0.,   0.,   0.]])

### Transformer Model

In [12]:
def attention_layer(Q_x,KV_x,mask=None):
    """ Individual attention block for the encoder/decoder (dim=64)
    args:
        Q_x: input to caclulate the Q matrix (differs from KV_x in encoder-decoder attention block)
        KV_x: input to calculate the K and V matrices
        mask: masking for the decoder attention block
    """
    # Dense layers w/ no bias&activation are equivalent to linear transformations:
    Q = Dense(64,use_bias=False,activation=None)(Q_x) # queries
    K = Dense(64,use_bias=False,activation=None)(KV_x) # keys
    V = Dense(64,use_bias=False,activation=None)(KV_x) # values
    
    unscaled_att_weights = Dot(axes=-1)([Q,K])/tf.cast(tf.sqrt(64.0),tf.float64)
    if mask is not None: # only for the decoder layer
        unscaled_att_weights += mask
    
    att_weights = tf.nn.softmax(unscaled_att_weights,axis=-1)
    att_output = tf.matmul(att_weights,V)
    return att_output

In [13]:
def encoder_block(x,h=4):
    """ Encoder block; num_attention_heads=4
    args:
        h: number of attention heads
    """
    # multi-head attention:
    attention_heads=[]
    for _ in range(h):
        att_output = attention_layer(x,x,mask=None)
        attention_heads.append(att_output)
    
    multi_head_att_output = Concatenate()(attention_heads)
    multi_head_att_output = Dense(256,use_bias=False,activation=None)(multi_head_att_output)
    attention_output = LayerNormalization()(multi_head_att_output+x) # residual block 1
    
    # feed-forward:
    ffn = Dense(512,activation='relu')(attention_output)
    ffn = Dense(256,activation=None)(ffn)
    encoder_output = LayerNormalization()(attention_output+ffn) # residual block 2
    return encoder_output

In [14]:
def decoder_block(x,encoder_output,mask,h=4):
    """ Decoder block; num_attention_heads=4
    args:
        encoder_output: output sequence from encoder
    """
    # decoder multi-head attention:
    attention_heads=[]
    for _ in range(h):
        att_output = attention_layer(x,x,mask=mask)
        attention_heads.append(att_output)
        
    multi_head_att_output = Concatenate()(attention_heads)
    multi_head_att_output = Dense(256,use_bias=False,activation=None)(multi_head_att_output)
    attention_output_1 = LayerNormalization()(multi_head_att_output+x) # residual block 1
    
    # encoder-decoder multi-head attention:
    attention_heads=[]
    for _ in range(h):
        att_output = attention_layer(attention_output_1,encoder_output,mask=None)
        attention_heads.append(att_output)
        
    multi_head_att_output = Concatenate()(attention_heads)
    multi_head_att_output = Dense(256,use_bias=False,activation=None)(multi_head_att_output)
    attention_output_2 = LayerNormalization()(multi_head_att_output+attention_output_1) # residual block 2
    
    # feed-forward:
    ffn = Dense(512,activation='relu')(attention_output_2)
    ffn = Dense(256,activation=None)(ffn)
    decoder_output = LayerNormalization()(attention_output_2+ffn) # residual block 3
    return decoder_output

In [15]:
def transformer(embedder):
    """ Transformer implementation: d_model=256; n_encoder_layers=2; n_decoder_layers=2
    """
    x = Input(shape=(30)) # input to the encoder
    delayed_y = Input(shape=(10)) # input to decoder
    mask = Input(shape=(10,10)) # for the decoder attention calculation
    position_emb = Input(shape=(30,256))
    
    # encoder block
    input_emb = embedder(x)
    embedding = input_emb+position_emb
    e1 = encoder_block(embedding)
    e2 = encoder_block(e1)
    e3 = encoder_block(e2)
    e4 = encoder_block(e3)
    
    # decoder block
    delayed_emb = embedder(delayed_y)
    embedding = delayed_emb+position_emb[:,:10,:]
    d1 = decoder_block(embedding,e4,mask)
    d2 = decoder_block(d1,e4,mask)
    d3 = decoder_block(d2,e4,mask)
    d4 = decoder_block(d3,e4,mask)
    
    # model predictions
    out = Dense(11,activation=None)(d4) # to output vocab_size=11
    
    model = Model(inputs=[x,delayed_y,mask,position_emb],outputs=out)
    return model

In [16]:
embedder = Embedding(39,256)
model = transformer(embedder)

In [17]:
def cost_function(labels,logits): # reduce mean over batches
    return tf.math.reduce_mean(tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels,logits=logits),axis=-1))

In [18]:
optimizer=Adam(lr=0.0001)

In [19]:
# training the model
position_emb = get_position_vectors(30)
position_embedding = np.array([position_emb for i in range(100)])
mask = get_offset_mask(seq_len=10)
mask = np.array([mask for i in range(100)])

for _ in range(20):
    losses = []
    for i in range(0,len(X)-100,100): # batch size of 100
        x_subset = X[i:i+100] # input to encoder
        y_subset = Yoh[i:i+100] # one-hot labels
        y_delayed = decoder_input[i:i+100] # input to decoder
        
        with tf.GradientTape() as tape:
            prediction_logits = model([x_subset,y_delayed,mask,position_embedding])
            loss = cost_function(y_subset,prediction_logits)
            
        losses.append(float(loss))
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    print(sum(losses)/len(losses))

20.533532973148834
12.040308459114357
10.143167755837124
10.05425456858201
9.9752173483436
9.816161877660827
9.21421149737957
8.461537647061093
7.6649314410351925
6.934595526335201
6.097987069469845
5.327165545203649
4.391387596383244
3.4292546149248784
2.485484851235204
1.8928204090485177
1.3170541442834323
0.9330483456528719
1.0586879242693292
0.6299525997656727


In [20]:
i = 11
x_subset = np.expand_dims(X[i],axis=0)
y_delayed = np.expand_dims(decoder_input[i],axis=0)
position_emb = np.expand_dims(get_position_vectors(30),axis=0)
mask = np.expand_dims(get_offset_mask(seq_len=10),axis=0)

In [21]:
pred = K.softmax(model([x_subset,y_delayed,mask,position_emb])).numpy()[0]
"".join([inv_machine_vocab[num] for num in list(np.argmax(pred,axis=-1))])

'1978-06-16'

In [22]:
dataset[i]

('friday june 16 1978', '1978-06-16')