# Stage 1: Importing dependencies

Implementation of a Transformer described in the ["Attention is all you need" paper](https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf)

In [1]:
import numpy as np
import math
import re
import time

In [2]:
#!pip install tensorflow==2.* tensorflow-datasets==4.5.2
#!conda install -y -c anaconda tensorflow==2.* tensorflow-datasets==4.5.2

In [3]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

# Stage 2: Data preprocessing

## Loading files

We import files previously downloaded from the [European Parliament Proceedings Parallel Corpus 1996-2011](https://www.statmt.org/europarl/) including the corpus and the nonbreaking prefixes for both languages

In [4]:
with open("data/europarl-v7.es-en.en",
          mode='r',
          encoding="utf-8") as f:
    europarl_en = f.read()
with open("data/europarl-v7.es-en.es",
          mode='r',
          encoding="utf-8") as f:
    europarl_es = f.read()
with open("data/nonbreaking_prefix.en",
          mode='r',
          encoding="utf-8") as f:
    non_breaking_prefix_en = f.read()
with open("data/nonbreaking_prefix.es",
          mode='r',
          encoding="utf-8") as f:
    non_breaking_prefix_es = f.read()

print(europarl_en[:25])
print(europarl_es[:25])

Resumption of the session
Reanudación del período d


## Cleaning data

Getting the non_breaking_prefixes as a clean list of words with a point at the end so it is easier to use.

In [5]:
non_breaking_prefix_en = non_breaking_prefix_en.split("\n")
non_breaking_prefix_en = [' ' + pref + '.' for pref in non_breaking_prefix_en if not pref.startswith('#')]
non_breaking_prefix_es = non_breaking_prefix_es.split("\n")
non_breaking_prefix_es = [' ' + pref + '.' for pref in non_breaking_prefix_es if not pref.startswith('#')]

print(non_breaking_prefix_en[:3])
print(non_breaking_prefix_es[:3])

[' .', ' A.', ' B.']
[' .', ' A.', ' B.']


We will need each word and other symbol that we want to keep to be in lower case and separated by spaces so we can "tokenize" them.

In [6]:
corpus_en = europarl_en
for prefix in non_breaking_prefix_en:
    corpus_en = corpus_en.replace(prefix, prefix + "###")           # prefix replaced -> prefix###
corpus_en = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])", ".###", corpus_en)   # Dot not followed by space -> .###
corpus_en = re.sub(r"\.###", '', corpus_en)                         # Remove ending sentence point
corpus_en = re.sub(r"  +", ' ', corpus_en)                          # Replace multiple spaces with one single space
corpus_en = corpus_en.split("\n")                                   # Split each line

corpus_es = europarl_es
for prefix in non_breaking_prefix_es:
    corpus_es = corpus_es.replace(prefix, prefix + "###")           # prefix replaced -> prefix###
corpus_es = re.sub(r"\.(?=[0-9]|[a-z]|[A-Z])", ".###", corpus_es)   # Dot not followed by space -> .###
corpus_es = re.sub(r"\.###", '', corpus_es)                         # Remove ending sentence point
corpus_es = re.sub(r"  +", ' ', corpus_es)                          # Replace multiple spaces with one single space
corpus_es = corpus_es.split("\n")                                   # Split each line

## Tokenizing text

Transform each work in a number. Using SubwordTextEncoder. Using an aproximate number of words in target_vocab_size 2**13. Using a lower number of words can improve the model.

In [7]:
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    corpus_en, target_vocab_size=2**13)
tokenizer_es = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    corpus_es, target_vocab_size=2**13)

print(tokenizer_en.decode([1,200,3000]))
print(tokenizer_es.decode([1,200,3000]))

print(tokenizer_en.encode("Hi, how are you?"))
print(tokenizer_en.encode("Hola, cómo estás?"))

the situation Perhaps 
de sistema juego 
[6181, 2, 172, 17, 362, 8005]
[2191, 796, 2, 8041, 3963, 1835, 7974, 1814, 3235, 8057, 8005]


In [8]:
VOCAB_SIZE_EN = tokenizer_en.vocab_size + 2
VOCAB_SIZE_ES = tokenizer_es.vocab_size + 2

print(VOCAB_SIZE_EN, VOCAB_SIZE_ES)

8200 8227


In [9]:
inputs = [[VOCAB_SIZE_EN-2] + tokenizer_en.encode(sentence) + [VOCAB_SIZE_EN-1]
          for sentence in corpus_en]
outputs = [[VOCAB_SIZE_ES-2] + tokenizer_es.encode(sentence) + [VOCAB_SIZE_ES-1]
          for sentence in corpus_es]

print(inputs[0])
print(outputs[0])

[8198, 4378, 999, 2622, 3, 1, 2509, 8199]
[8225, 5967, 1688, 265, 52, 12, 683, 1, 3427, 1377, 8226]


## Remove too long sentences

In [10]:
print("Length before removing long sentences", len(inputs), len(outputs))

MAX_LENGTH = 20
idx_to_remove = [count for count, sent in enumerate(inputs)
                 if len(sent) > MAX_LENGTH]
for idx in reversed(idx_to_remove):
    del inputs[idx]
    del outputs[idx]
idx_to_remove = [count for count, sent in enumerate(outputs)
                 if len(sent) > MAX_LENGTH]
for idx in reversed(idx_to_remove):
    del inputs[idx]
    del outputs[idx]

print("Length after removing long sentences", len(inputs), len(outputs))

Length before removing long sentences 1965735 1965735
Length after removing long sentences 411696 411696


## Inputs/outputs creation

As we train with batches, we need each input to have the same length. We pad with the appropriate token, and we will make sure this padding token doesn't interfere with our training later.

In [11]:
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                       value=0,
                                                       padding="post",
                                                       maxlen=MAX_LENGTH)
outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs,
                                                        value=0,
                                                        padding="post",
                                                        maxlen=MAX_LENGTH)

print(inputs[0])
print(outputs[0])

[8198 4378  999 2622    3    1 2509 8199    0    0    0    0    0    0
    0    0    0    0    0    0]
[8225 5967 1688  265   52   12  683    1 3427 1377 8226    0    0    0
    0    0    0    0    0    0]


In [12]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

dataset = tf.data.Dataset.from_tensor_slices((inputs, outputs))

dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

print(dataset)

<PrefetchDataset shapes: ((None, 20), (None, 20)), types: (tf.int32, tf.int32)>


2023-05-09 13:31:22.368490: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2023-05-09 13:31:23.680352: E tensorflow/stream_executor/cuda/cuda_driver.cc:313] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-05-09 13:31:23.680413: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (computetarget): /proc/driver/nvidia/version does not exist
2023-05-09 13:31:23.686942: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 AVX512F FMA
2023-05-09 13:31:23.766438: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2593905000 Hz
2023-05-09 13:31:23.768982: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fc588000b60 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
20

# Stage 3: Model building

## Positional Encoding

Positional encoding formulae:

$PE_{(pos, 2i)} = \sin(pos/n^{2i/dmodel})$

$PE_{(pos, 2i+1)} = \cos(pos/n^{2i/dmodel})$


**Here**:

pos: Position of an object in the input sequence, 

dmodel: Dimension of the output embedding space

P(k,j): Position function for mapping a position in the input sequence to index of the positional matrix

n: User-defined scalar, set to 10,000 by the authors of Attention Is All You Need.

i: Used for mapping to column indices, with a single value of maps to both sine and cosine functions

__In the above expression, you can see that even positions correspond to a sine function and odd positions correspond to cosine functions.__

More details in the following [article](https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/)

In [13]:
class PositionalEncoding(layers.Layer):

    def __init__(self):
        super(PositionalEncoding, self).__init__()
    
    def get_angles(self, pos, i, d_model, n=10000.): # pos: (seq_length, 1) i: (1, d_model)
        angles = 1 / np.power(n, (2*(i//2))/np.float32(d_model))
        return pos * angles # (seq_length, d_model)
    
    def call(self, inputs):
        seq_length = inputs.shape.as_list()[-2]
        d_model = inputs.shape.as_list()[-1]
        angles = self.get_angles(np.arange(seq_length)[:, np.newaxis],
                                 np.arange(d_model)[np.newaxis, :],
                                 d_model)
        angles[:, 0::2] = np.sin(angles[:, 0::2])
        angles[:, 1::2] = np.cos(angles[:, 1::2])
        pos_encoding = angles[np.newaxis, ...]

        return inputs + tf.cast(pos_encoding, tf.float32)

## Attention

### Attention computation

$Attention(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$

![Attention Image](imgs/Dot_Product_Attention.png)

In [14]:
def scaled_dot_product_attention(queries, keys, values, mask):
    product = tf.matmul(queries, keys, transpose_b=True)

    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)
    scaled_product = product / tf.math.sqrt(keys_dim)

    if mask is not None:
        scaled_product += (mask * -1e9)
    
    attention = tf.matmul(tf.nn.softmax(scaled_product, axis=-1), values)

    return attention

### Multi-head attention sublayer

![Multi Head Attention Image](imgs/Multi_Head_Attention.png)

In [15]:
class MultiHeadAttention(layers.Layer):

    def __init__(self, nb_proj):
        super(MultiHeadAttention, self).__init__()
        self.nb_proj = nb_proj
    
    def build(self, input_shape):
        self.d_model = input_shape[-1]
        assert self.d_model % self.nb_proj == 0

        self.d_proj = self.d_model // self.nb_proj

        self.query_lin = layers.Dense(units=self.d_model)
        self.key_lin = layers.Dense(units=self.d_model)
        self.value_lin = layers.Dense(units=self.d_model)

        self.final_lin = layers.Dense(units=self.d_model)
    
    def split_proj(self, inputs, batch_size): # inputs: (batch_size, seq_length, d_model)
        shape = (batch_size,
                 -1,
                 self.nb_proj,
                 self.d_proj)
        splited_inputs = tf.reshape(inputs, shape=shape) # (batch_size, seq_length, nb_proj, d_proj)
        
        return tf.transpose(splited_inputs, perm=[0, 2, 1, 3]) # (batch_size, nb_proj, seq_length, d_proj)
    
    def call(self, queries, keys, values, mask):
        batch_size = tf.shape(queries)[0]

        queries = self.query_lin(queries)
        keys = self.key_lin(keys)
        values = self.value_lin(values)

        queries = self.split_proj(queries, batch_size)
        keys = self.split_proj(keys, batch_size)
        values = self.split_proj(values, batch_size)

        attention = scaled_dot_product_attention(queries, keys, values, mask)

        attention = tf.transpose(attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(attention,
                                      shape=(batch_size, -1, self.d_model))
        
        outputs = self.final_lin(concat_attention)

        return outputs

## Encoder

![Encoder Image](imgs/Encoder.png)

In [16]:
class EncoderLayer(layers.Layer):

    def __init__(self, FFN_units, nb_proj, dropout):
        super(EncoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout = dropout
    
    def build(self, input_shape):
        self.d_model = input_shape[-1]

        self.multi_head_attention = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_2 = layers.Dropout(rate=self.dropout)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
    
    def call(self, inputs, mask, training):
        attention = self.multi_head_attention(inputs,
                                              inputs,
                                              inputs,
                                              mask)
        attention = self.dropout_1(attention, training=training)
        attention = self.norm_1(attention + inputs)

        outputs = self.dense_1(attention)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_2(outputs, training=training)
        outputs = self.norm_2(outputs + attention)

        return outputs

In [17]:
class Encoder(layers.Layer):

    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout,
                 vocab_size,
                 d_model,
                 name="encoder"):
        super(Encoder, self).__init__(name=name)
        self.nb_layers = nb_layers
        self.d_model = d_model

        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout)
        self.enc_layers = [EncoderLayer(FFN_units,
                                        nb_proj,
                                        dropout)
                           for _ in range(nb_layers)]
    
    def call(self, inputs, mask, training):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)

        for i in range(self.nb_layers):
            outputs = self.enc_layers[i](outputs, mask, training)
        
        return outputs

## Decoder

![Decoder Image](imgs/Decoder.jpg)

In [18]:
class DecoderLayer(layers.Layer):

    def __init__(self, FFN_units, nb_proj, dropout):
        super(DecoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.nb_proj = nb_proj
        self.dropout = dropout
    
    def build(self, input_shape):
        self.d_model = input_shape[-1]

        self.multi_head_attention_1 = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        self.multi_head_attention_2 = MultiHeadAttention(self.nb_proj)
        self.dropout_2 = layers.Dropout(rate=self.dropout)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

        self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_3 = layers.Dropout(rate=self.dropout)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)
    
    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        attention = self.multi_head_attention_1(inputs,
                                                inputs,
                                                inputs,
                                                mask_1)
        attention = self.dropout_1(attention, training=training)
        attention = self.norm_1(attention + inputs)

        attention_2 = self.multi_head_attention_2(attention,
                                                  enc_outputs,
                                                  enc_outputs,
                                                  mask_2)
        attention_2 = self.dropout_2(attention_2, training=training)
        attention_2 = self.norm_2(attention_2 + attention)

        outputs = self.dense_1(attention_2)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_3(outputs, training=training)
        outputs = self.norm_3(outputs + attention_2)

        return outputs

In [19]:
class Decoder(layers.Layer):

    def __init__(self,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout,
                 vocab_size,
                 d_model,
                 name="decoder"):
        super(Decoder, self).__init__(name=name)
        self.nb_layers = nb_layers
        self.d_model = d_model

        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout)
        self.dec_layers = [DecoderLayer(FFN_units,
                                        nb_proj,
                                        dropout)
                           for _ in range(nb_layers)]
    
    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)

        for i in range(self.nb_layers):
            outputs = self.dec_layers[i](outputs,
                                         enc_outputs,
                                         mask_1,
                                         mask_2,
                                         training)
        
        return outputs

## Transformer

![Transformer Image](imgs/Transformer.png)

In [20]:
class Transformer(tf.keras.Model):

    def __init__(self,
                 vocab_size_enc,
                 vocab_size_dec,
                 d_model,
                 nb_layers,
                 FFN_units,
                 nb_proj,
                 dropout,
                 name="transformer"):
        super(Transformer, self).__init__(name=name)

        self.encoder = Encoder(nb_layers,
                               FFN_units,
                               nb_proj,
                               dropout,
                               vocab_size_enc,
                               d_model)
        self.decoder = Decoder(nb_layers,
                               FFN_units,
                               nb_proj,
                               dropout,
                               vocab_size_dec,
                               d_model)
        self.last_linear = layers.Dense(units=vocab_size_dec)
    
    def create_padding_mask(self, seq): # seq: (batch_size, seq_length)
        mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
        return mask[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, seq):
        seq_len = tf.shape(seq)[1]
        look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return look_ahead_mask
    
    def call(self, enc_inputs, dec_inputs, training):
        enc_mask = self.create_padding_mask(enc_inputs)
        dec_mask_1 = tf.maximum(
            self.create_padding_mask(dec_inputs),
            self.create_look_ahead_mask(dec_inputs)
        )
        dec_mask_2 = self.create_padding_mask(enc_inputs)

        enc_outputs = self.encoder(enc_inputs, enc_mask, training)
        dec_outputs = self.decoder(dec_inputs,
                                   enc_outputs,
                                   dec_mask_1,
                                   dec_mask_2,
                                   training)
        
        outputs = self.last_linear(dec_outputs)

        return outputs

# Stage 4: Application

## Training

The recomended parameters in the paper are commented next to the ones used for this test.

In [21]:
tf.keras.backend.clear_session()

# Hyper-parameters
D_MODEL = 128 # 512
NB_LAYERS = 4 # 6
FFN_UNITS = 512 # 2048
NB_PROJ = 8 # 8
DROPOUT = 0.1 # 0.1

transformer = Transformer(vocab_size_enc=VOCAB_SIZE_EN,
                          vocab_size_dec=VOCAB_SIZE_ES,
                          d_model=D_MODEL,
                          nb_layers=NB_LAYERS,
                          FFN_units=FFN_UNITS,
                          nb_proj=NB_PROJ,
                          dropout=DROPOUT)

Set loss function, train loss and train accuracy

In [22]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction="none")
def loss_function(target, pred):
    mask = tf.math.logical_not(tf.math.equal(target, 0)) 
    loss_ = loss_object(target, pred)                       # Compute the standard loss without reduction

    mask = tf.cast(mask, dtype=loss_.dtype)                 # Make sures values in mask and loss are the same type
    loss_ *= mask                                           # Apply mask

    return tf.reduce_mean(loss_)                            # Apply reduction

train_loss = tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")

Adam optimizer with custom learning rate described in the paper

In [23]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps**-1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
    
learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(learning_rate,
                                     beta_1=0.9,
                                     beta_2=0.98,
                                     epsilon=1e-9)

In [26]:
checkpoint_path = "data/"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Latest checkpoint restored!")

Latest checkpoint restored!


In [27]:
EPOCHS = 10
for epoch in range(EPOCHS):
    print("Start of epoch {}".format(epoch+1))
    start = time.time()

    train_loss.reset_states()
    train_accuracy.reset_states()

    for (batch, (enc_inputs, targets)) in enumerate(dataset):
        dec_inputs = targets[:, :-1]
        dec_outputs_real = targets[:, 1:]
        with tf.GradientTape() as tape:
            predictions = transformer(enc_inputs, dec_inputs, True)
            loss = loss_function(dec_outputs_real, predictions)
        
        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

        train_loss(loss)
        train_accuracy(dec_outputs_real, predictions)

        if batch % 50 == 0:
            print("Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}".format(
                epoch+1, batch, train_loss.result(), train_accuracy.result()))
    
    ckpt_save_path = ckpt_manager.save()
    print("Saving checkpont for epoch {} at {}".format(epoch+1, ckpt_save_path))
    print("time taken for 1 epoch: {} secs\n".format(time.time() - start))

Start of epoch 1
Epoch 1 Batch 0 Loss 1.1361 Accuracy 0.4597
Epoch 1 Batch 50 Loss 1.0939 Accuracy 0.4521
Epoch 1 Batch 100 Loss 1.1022 Accuracy 0.4565
Epoch 1 Batch 150 Loss 1.1014 Accuracy 0.4571
Epoch 1 Batch 200 Loss 1.0998 Accuracy 0.4579
Epoch 1 Batch 250 Loss 1.0961 Accuracy 0.4577
Epoch 1 Batch 300 Loss 1.0969 Accuracy 0.4577
Epoch 1 Batch 350 Loss 1.0941 Accuracy 0.4580
Epoch 1 Batch 400 Loss 1.0907 Accuracy 0.4578
Epoch 1 Batch 450 Loss 1.0880 Accuracy 0.4575
Epoch 1 Batch 500 Loss 1.0859 Accuracy 0.4575
Epoch 1 Batch 550 Loss 1.0845 Accuracy 0.4575
Epoch 1 Batch 600 Loss 1.0853 Accuracy 0.4578
Epoch 1 Batch 650 Loss 1.0854 Accuracy 0.4572
Epoch 1 Batch 700 Loss 1.0849 Accuracy 0.4580
Epoch 1 Batch 750 Loss 1.0839 Accuracy 0.4587
Epoch 1 Batch 800 Loss 1.0795 Accuracy 0.4594
Epoch 1 Batch 850 Loss 1.0743 Accuracy 0.4604
Epoch 1 Batch 900 Loss 1.0687 Accuracy 0.4614
Epoch 1 Batch 950 Loss 1.0629 Accuracy 0.4625
Epoch 1 Batch 1000 Loss 1.0564 Accuracy 0.4638
Epoch 1 Batch 1050 

## Evaluation

In [28]:
def evaluate(inp_sentence):
    inp_sentence = \
        [VOCAB_SIZE_EN-2] + tokenizer_en.encode(inp_sentence) + [VOCAB_SIZE_EN-1]
    enc_input = tf.expand_dims(inp_sentence, axis=0)

    output = tf.expand_dims([VOCAB_SIZE_ES-2], axis=0)

    for _ in range(MAX_LENGTH):
        predictions = transformer(enc_input, output, False) # (1, seq_length, vocab_size_es)

        prediction = predictions[:, -1:, :]

        predicted_id = tf.cast(tf.argmax(prediction, axis=-1), tf.int32)

        if predicted_id == VOCAB_SIZE_ES-1:
            return tf.squeeze(output, axis=0)
        
        output = tf.concat([output, predicted_id], axis=-1)
    
    return tf.squeeze(output, axis=0)

In [29]:
def translate(sentence):
    output = evaluate(sentence).numpy()

    predicted_sentence = tokenizer_es.decode(
        [i for i in output if i < VOCAB_SIZE_ES-2]
    )

    print("Input: {}".format(sentence))
    print("Predicted translation: {}".format(predicted_sentence))

In [32]:
translate("This is great.")
translate("This is a really powerful tool!")
translate("We should solve this problem.")

Input: This is great.
Predicted translation: Esto es muy bueno.
Input: This is a really powerful tool!
Predicted translation: ¡Es una herramienta realmente poderosa!
Input: We should solve this problem.
Predicted translation: Debemos resolver este problema.


In [33]:
translate("We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.")
translate("Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring signiﬁcantly less time to train.")

Input: We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely.
Predicted translation: Proponemos una nueva cuestión de alta importancia, la reforma de las transiciones.
Input: Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring signiﬁcantly less time to train.
Predicted translation: Las experiencias de dos tareas de la producción de hoy en día son más fuertes y más fuertes.
