<img src="https://github.com/martin-fabbri/colab-notebooks/raw/master/bert/images/attention-zoom-in.png" width=1000px alt="Big Picture"/>

In [2]:
import io
import os
import re
import time
import unicodedata

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

## 1. Inputs

In [2]:
x = [
    [1.0, 0.0, 1.0, 0.0],  # input 1
    [0.0, 2.0, 0.0, 2.0],  # input 2
    [1.0, 1.0, 1.0, 1.0],  # input 3
]

x = tf.constant(x)

## 2. Initialize Queries, Keys, and Values Weights

In [3]:
w_key = [
  [0.0, 0.0, 1.0],
  [1.0, 1.0, 0.0],
  [0.0, 1.0, 0.0],
  [1.0, 1.0, 0.0]
]
w_query = [
  [1.0, 0.0, 1.0],
  [1.0, 0.0, 0.0],
  [0.0, 0.0, 1.0],
  [0.0, 1.0, 1.0]
]
w_value = [
  [0.0, 2.0, 0.0],
  [0.0, 3.0, 0.0],
  [1.0, 0.0, 3.0],
  [1.0, 1.0, 0.0]
]
w_key = tf.constant(w_key)
w_query = tf.constant(w_query)
w_value = tf.constant(w_value)

In [4]:
keys = tf.linalg.matmul(x, w_key)
keys

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[0., 1., 1.],
       [4., 4., 0.],
       [2., 3., 1.]], dtype=float32)>

In [5]:
queries = tf.linalg.matmul(x, w_query)
queries

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[1., 0., 2.],
       [2., 2., 2.],
       [2., 1., 3.]], dtype=float32)>

In [6]:
values = tf.linalg.matmul(x, w_value)
values

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[1., 2., 3.],
       [2., 8., 0.],
       [2., 6., 3.]], dtype=float32)>

<img src="https://github.com/martin-fabbri/colab-notebooks/raw/master/bert/images/attention-nn.png" alt="self-attention block" width=800px>

## 2. Calculate attention scores

<img src="https://github.com/martin-fabbri/colab-notebooks/raw/master/bert/images/multi-head-attention.png" alt="multihead-attention" width="700px">

In [15]:
attention_scores = tf.matmul(queries, keys, transpose_b=True)
attention_scores

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[ 2.,  4.,  4.],
       [ 4., 16., 12.],
       [ 4., 12., 10.]], dtype=float32)>

### 3. Softmax

In [16]:
attention_scores_softmax = tf.nn.softmax(attention_scores)
attention_scores_softmax

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[6.3378938e-02, 4.6831051e-01, 4.6831051e-01],
       [6.0336647e-06, 9.8200780e-01, 1.7986100e-02],
       [2.9538720e-04, 8.8053685e-01, 1.1916770e-01]], dtype=float32)>

## 4. Multiply scores with values

In [17]:
weighted_values = values[:, None] * tf.transpose(attention_scores_softmax)[:,:,None]
weighted_values

<tf.Tensor: shape=(3, 3, 3), dtype=float32, numpy=
array([[[6.3378938e-02, 1.2675788e-01, 1.9013682e-01],
        [6.0336647e-06, 1.2067329e-05, 1.8100995e-05],
        [2.9538720e-04, 5.9077441e-04, 8.8616158e-04]],

       [[9.3662101e-01, 3.7464840e+00, 0.0000000e+00],
        [1.9640156e+00, 7.8560624e+00, 0.0000000e+00],
        [1.7610737e+00, 7.0442948e+00, 0.0000000e+00]],

       [[9.3662101e-01, 2.8098631e+00, 1.4049315e+00],
        [3.5972200e-02, 1.0791660e-01, 5.3958301e-02],
        [2.3833540e-01, 7.1500623e-01, 3.5750312e-01]]], dtype=float32)>

In [19]:
outputs = tf.reduce_sum(weighted_values, axis=0)
outputs

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[1.936621 , 6.683105 , 1.5950683],
       [1.9999939, 7.963991 , 0.0539764],
       [1.9997045, 7.759892 , 0.3583893]], dtype=float32)>

## Seq2Seq Attention

In [7]:
dataset_url = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"

path_to_zip = tf.keras.utils.get_file(
    'spa-end.zip', origin=dataset_url, extract=True
)

path_to_file = os.path.join(os.path.dirname(path_to_zip), "spa-eng/spa.txt")
path_to_file

'/root/.keras/datasets/spa-eng/spa.txt'

In [24]:
# Uncase sentence and removes accents from sentence
def unicode_to_ascii(s):
    return "".join(
        c
        for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != "Mn"
    )

In [15]:
unicode_to_ascii("¿Dónde está la farmacia?")

'¿Donde esta la farmacia?'

In [19]:
def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    w = re.sub(r"([?.!,¿])", r" \1", w)
    w = re.sub('[" "]+1', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.strip()

    # adding a start and an end token to the sentence
    w = f"<start>{w}<end>"
    return w

In [22]:
en_sentence = u"Where is the drug store?"
sp_sentence = u"¿Dónde está la farmacia?"
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence))

<start>where is the drug store ?<end>
<start>¿donde esta la farmacia ?<end>


In [25]:
en_sentence = u"I am going home."
sp_sentence = u"Me voy a la casa."
print(preprocess_sentence(en_sentence))
print(preprocess_sentence(sp_sentence))

<start>i am going home .<end>
<start>me voy a la casa .<end>


In [43]:
# cleans the sentences and returns word pairs in the format [en, sp]
NUM_EXAMPLES = 3000

lines = io.open(path_to_file, encoding="UTF-8").read().strip().split("\n")
word_pairs = [
    [preprocess_sentence(w) for w in l.split("\t")]
    for l in lines[:NUM_EXAMPLES]
]
en, sp = zip(*word_pairs)
print(en[-1])
print(sp[-1])

<start>i m very hot .<end>
<start>estoy muy cachondo .<end>


In [46]:
def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters="")
    lang_tokenizer.fit_on_texts(en)
    tensor = lang_tokenizer.texts_to_sequences(en)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(
        tensor, padding="post"
    )
    return tensor, lang_tokenizer

In [50]:
input_tensor, sp_lang_tokenizer = tokenize(sp)
target_tensor, en_lang_tokenizer = tokenize(en)
max_length_target, max_lenght_input = (
    target_tensor.shape[1],
    input_tensor.shape[1],
)

In [56]:
(
    input_tensor_train,
    input_tensor_val,
    target_tensor_train,
    target_tensor_val,
) = train_test_split(input_tensor, target_tensor, test_size=0.2)
# Show length
print(f"Input train length       {len(input_tensor_train):,}")
print(f"Target train length      {len(target_tensor_train):,}")
print(f"Input validation length  {len(input_tensor_val)}")
print(f"Target validation length {len(target_tensor_val)}")

Input train length       2,400
Target train length      2,400
Input validation length  600
Target validation length 600
