<img src="https://github.com/martin-fabbri/colab-notebooks/raw/master/bert/images/attention-zoom-in.png" width=1000px alt="Big Picture"/>

In [108]:
import io
import os
import re
import time
import unicodedata

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import tensorflow as tf

from tensorflow.keras.layers import Embedding, GRU, Layer, Dense
from sklearn.model_selection import train_test_split

## 1. Inputs

In [2]:
x = [
    [1.0, 0.0, 1.0, 0.0],  # input 1
    [0.0, 2.0, 0.0, 2.0],  # input 2
    [1.0, 1.0, 1.0, 1.0],  # input 3
]

x = tf.constant(x)

## 2. Initialize Queries, Keys, and Values Weights

In [3]:
w_key = [
  [0.0, 0.0, 1.0],
  [1.0, 1.0, 0.0],
  [0.0, 1.0, 0.0],
  [1.0, 1.0, 0.0]
]
w_query = [
  [1.0, 0.0, 1.0],
  [1.0, 0.0, 0.0],
  [0.0, 0.0, 1.0],
  [0.0, 1.0, 1.0]
]
w_value = [
  [0.0, 2.0, 0.0],
  [0.0, 3.0, 0.0],
  [1.0, 0.0, 3.0],
  [1.0, 1.0, 0.0]
]
w_key = tf.constant(w_key)
w_query = tf.constant(w_query)
w_value = tf.constant(w_value)

In [4]:
keys = tf.linalg.matmul(x, w_key)
keys

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[0., 1., 1.],
       [4., 4., 0.],
       [2., 3., 1.]], dtype=float32)>

In [5]:
queries = tf.linalg.matmul(x, w_query)
queries

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[1., 0., 2.],
       [2., 2., 2.],
       [2., 1., 3.]], dtype=float32)>

In [6]:
values = tf.linalg.matmul(x, w_value)
values

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[1., 2., 3.],
       [2., 8., 0.],
       [2., 6., 3.]], dtype=float32)>

<img src="https://github.com/martin-fabbri/colab-notebooks/raw/master/bert/images/attention-nn.png" alt="self-attention block" width=800px>

## 2. Calculate attention scores

<img src="https://github.com/martin-fabbri/colab-notebooks/raw/master/bert/images/multi-head-attention.png" alt="multihead-attention" width="700px">

In [15]:
attention_scores = tf.matmul(queries, keys, transpose_b=True)
attention_scores

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[ 2.,  4.,  4.],
       [ 4., 16., 12.],
       [ 4., 12., 10.]], dtype=float32)>

### 3. Softmax

In [16]:
attention_scores_softmax = tf.nn.softmax(attention_scores)
attention_scores_softmax

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[6.3378938e-02, 4.6831051e-01, 4.6831051e-01],
       [6.0336647e-06, 9.8200780e-01, 1.7986100e-02],
       [2.9538720e-04, 8.8053685e-01, 1.1916770e-01]], dtype=float32)>

## 4. Multiply scores with values

In [17]:
weighted_values = values[:, None] * tf.transpose(attention_scores_softmax)[:,:,None]
weighted_values

<tf.Tensor: shape=(3, 3, 3), dtype=float32, numpy=
array([[[6.3378938e-02, 1.2675788e-01, 1.9013682e-01],
        [6.0336647e-06, 1.2067329e-05, 1.8100995e-05],
        [2.9538720e-04, 5.9077441e-04, 8.8616158e-04]],

       [[9.3662101e-01, 3.7464840e+00, 0.0000000e+00],
        [1.9640156e+00, 7.8560624e+00, 0.0000000e+00],
        [1.7610737e+00, 7.0442948e+00, 0.0000000e+00]],

       [[9.3662101e-01, 2.8098631e+00, 1.4049315e+00],
        [3.5972200e-02, 1.0791660e-01, 5.3958301e-02],
        [2.3833540e-01, 7.1500623e-01, 3.5750312e-01]]], dtype=float32)>

In [19]:
outputs = tf.reduce_sum(weighted_values, axis=0)
outputs

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[1.936621 , 6.683105 , 1.5950683],
       [1.9999939, 7.963991 , 0.0539764],
       [1.9997045, 7.759892 , 0.3583893]], dtype=float32)>

## Seq2Seq Attention

In [7]:
dataset_url = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"

path_to_zip = tf.keras.utils.get_file(
    'spa-end.zip', origin=dataset_url, extract=True
)

path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng/spa.txt")
path_to_file

'/root/.keras/datasets/spa-eng/spa.txt'

In [24]:
# Uncase sentence and removes accents from sentence
def unicode_to_ascii(s):
    return "".join(
        c
        for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != "Mn"
    )

In [15]:
unicode_to_ascii("¿Dónde está la farmacia?")

'¿Donde esta la farmacia?'

In [19]:
def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    w = re.sub(r"([?.!,¿])", r" \1", w)
    w = re.sub('[" "]+1', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.strip()

    # adding a start and an end token to the sentence
    w = f"<start>{w}<end>"
    return w

In [22]:
en_sentence = u"Where is the drug store?"
sp_sentence = u"¿Dónde está la farmacia?"
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence))

<start>where is the drug store ?<end>
<start>¿donde esta la farmacia ?<end>


In [25]:
en_sentence = u"I am going home."
sp_sentence = u"Me voy a la casa."
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence))

<start>i am going home .<end>
<start>me voy a la casa .<end>


In [74]:
# cleans the sentences and returns word pairs in the format [en, sp]
NUM_EXAMPLES = 3000

lines = io.open(path_to_file, encoding="UTF-8").read().strip().split("\n")
word_pairs = [
    [preprocess_sentence(w) for w in l.split("\t")]
    for l in lines[:NUM_EXAMPLES]
]
en, sp = zip(*word_pairs)
print(en[0])
print(sp[0])

<start>go .<end>
<start>ve .<end>


In [81]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters="")
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(
        tensor, padding="post"
    )
    return tensor, lang_tokenizer

In [82]:
input_tensor, sp_lang_tokenizer = tokenize(sp)
target_tensor, en_lang_tokenizer = tokenize(en)
max_length_target, max_lenght_input = (
    target_tensor.shape[1],
    input_tensor.shape[1],
)

In [84]:
(
    input_tensor_train,
    input_tensor_val,
    target_tensor_train,
    target_tensor_val,
) = train_test_split(input_tensor, target_tensor, test_size=0.2)
# Show length
print(f"Input train length       {len(input_tensor_train):,}")
print(f"Target train length      {len(target_tensor_train):,}")
print(f"Input validation length  {len(input_tensor_val)}")
print(f"Target validation length {len(target_tensor_val)}")

Input train length       2,400
Target train length      2,400
Input validation length  600
Target validation length 600


In [85]:
def convert(lang, tensor):
  for t in tensor:
    if t != 0:
      print(f"{t} ---->{lang.index_word[t]}")

In [86]:
print ("Input Language; index to word mapping")
convert(sp_lang_tokenizer, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(en_lang_tokenizer, target_tensor_train[0])

Input Language; index to word mapping
61 ----><start>ven
1949 ---->enseguida
1 ---->.<end>

Target Language; index to word mapping
21 ----><start>come
102 ---->at
932 ---->once
1 ---->.<end>


### Create a tf.data Dataset

In [87]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
STEPS_PER_EPOCH = BUFFER_SIZE // BATCH_SIZE
EMBEDDING_DIM = 256
UNITS = 1024
VOCAB_INPUT_SIZE = len(sp_lang_tokenizer.word_counts) + 1
VOCAB_TARGET_SIZE = len(en_lang_tokenizer.word_counts) + 1

In [90]:
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset.cardinality()

<tf.Tensor: shape=(), dtype=int64, numpy=37>

In [91]:
input_batch, target_batch = next(iter(dataset))
input_batch.shape, target_batch.shape

(TensorShape([64, 8]), TensorShape([64, 6]))

In [102]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units # number of units in output space
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.gru = GRU(self.enc_units, return_sequences=True, return_state=True, recurrent_initializer="glorot_uniform")

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [105]:
encoder = Encoder(VOCAB_INPUT_SIZE, EMBEDDING_DIM, UNITS, BATCH_SIZE)

sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(input_batch, sample_hidden)
print("VOCAB_INPUT_SIZE ", VOCAB_INPUT_SIZE)
print("EMBEDDING_DIM    ", EMBEDDING_DIM)
print("UNIT             ", UNITS)
print("BATCH SIZR       ", BATCH_SIZE)
print("Encoder output shape: (batch size, seq len, units)", sample_output.shape)
print("Encoder Hidden state shape: (batch size, units)", sample_hidden.shape)

VOCAB_INPUT_SIZE  2177
EMBEDDING_DIM     256
UNIT              1024
BATCH SIZR        64
Encoder output shape: (batch size, seq len, units) (64, 8, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


## Bahdanau attention

We will implement Bahdanau attention where:

* FC = Fully connected (dense) layer
* EO = Encoder output
* H = hidden state
* X = input to the decoder

And the pseudo-code:

* `score = FC(tanh(FC(EO) + FC(H)))`
* `attention weights = softmax(score, axis = 1)`. Softmax by default is applied on the last axis but here we want to apply it on the *1st axis*, since the shape of score is *(batch_size, max_length, hidden_size)*. `Max_length` is the length of our input. Since we are trying to assign a weight to each input, softmax should be applied on that axis.
* `context vector = sum(attention weights * EO, axis = 1)`. Same reason as above for choosing axis as 1.
* `embedding output` = The input to the decoder X is passed through an embedding layer.
* `merged vector = concat(embedding output, context vector)`
* This merged vector is then given to the GRU

The shapes of all the vectors at each step have been specified in the comments in the code:

In [109]:
#@markdown <img src="https://www.tensorflow.org/images/seq2seq/attention_equation_0.jpg" alt="attention equation 0" width="800">
#@markdown <img src="https://www.tensorflow.org/images/seq2seq/attention_equation_1.jpg" alt="attention equation 1" width="800">
class BahdanauAttention(Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to 
        # calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights


In [110]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 8, 1)


In [113]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    # used for attention
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    # enc_output shape == (batch_size, max_length, hidden_size)
    context_vector, attention_weights = self.attention(hidden, enc_output)

    # x shape after passing through embedding == (batch_size, 1, embedding_dim)
    x = self.embedding(x)

    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

    # passing the concatenated vector to the GRU
    output, state = self.gru(x)

    # output shape == (batch_size * 1, hidden_size)
    output = tf.reshape(output, (-1, output.shape[2]))

    # output shape == (batch_size, vocab)
    x = self.fc(output)

    return x, state, attention_weights

In [115]:
decoder = Decoder(VOCAB_TARGET_SIZE, EMBEDDING_DIM, UNITS, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 1036)


In [116]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [117]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [123]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([en_lang_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

      loss += loss_function(targ[:, t], predictions)

      # using teacher forcing
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [124]:
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()

  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset.take(STEPS_PER_EPOCH)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

KeyError: ignored

In [125]:
en_lang_tokenizer.word_index

{'.<end>': 1,
 '<start>i': 2,
 '?<end>': 3,
 's': 4,
 'm': 5,
 '!<end>': 6,
 '<start>tom': 7,
 'it': 8,
 'you': 9,
 'tom': 10,
 '<start>we': 11,
 '<start>he': 12,
 '<start>it': 13,
 'is': 14,
 'me': 15,
 'a': 16,
 't': 17,
 '<start>you': 18,
 're': 19,
 'this': 20,
 '<start>come': 21,
 'll': 22,
 'go': 23,
 'in': 24,
 'up': 25,
 '<start>go': 26,
 '<start>let': 27,
 '<start>is': 28,
 '<start>don': 29,
 'on': 30,
 'him': 31,
 '<start>who': 32,
 'here': 33,
 'am': 34,
 '<start>are': 35,
 'can': 36,
 '<start>that': 37,
 '<start>be': 38,
 '<start>they': 39,
 'that': 40,
 'like': 41,
 '<start>get': 42,
 'was': 43,
 'love': 44,
 'out': 45,
 '<start>keep': 46,
 '<start>look': 47,
 'us': 48,
 '<start>do': 49,
 '<start>stop': 50,
 'home': 51,
 'i': 52,
 '<start>how': 53,
 '<start>what': 54,
 'know': 55,
 'saw': 56,
 'need': 57,
 '<start>take': 58,
 'not': 59,
 '<start>she': 60,
 'help': 61,
 'away': 62,
 '<start>have': 63,
 'see': 64,
 '<start>can': 65,
 'want': 66,
 '<start>no': 67,
 'to': 68,
