### Automatic Translation FR to EN using Transformers

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

2024-05-19 23:09:15.628367: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-19 23:09:15.707626: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import logging
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow as tf

import tensorflow_text

In [3]:
dataset_name = 'wmt14_translate/fr-en'
data_dir = 'nlp_lab_dataset/'
train_samples = 60000  



# Load the dataset with specified splits
ds_splits = tfds.load(dataset_name, split=['train', 'validation', 'test'], data_dir=data_dir)

# Take a subset of the training set
train_ds = ds_splits[0].take(train_samples)
val_ds = ds_splits[1]
test_ds = ds_splits[2]

In [4]:
# Function to count number of samples in a dataset
def count_samples(dataset):
    return sum(1 for _ in dataset)

# Count samples in each dataset
num_train_samples = count_samples(train_ds)
num_val_samples = count_samples(val_ds)
num_test_samples = count_samples(test_ds)

print(f"Number of training samples: {num_train_samples}")
print(f"Number of validation samples: {num_val_samples}")
print(f"Number of test samples: {num_test_samples}")

2024-05-19 23:09:31.364283: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-05-19 23:09:31.771480: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Number of training samples: 60000
Number of validation samples: 3000
Number of test samples: 3003


2024-05-19 23:09:32.476546: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [5]:
# Example of how to use the subsets
print("Training set samples:")
for batch in train_ds.batch(3).take(1):
    print('> Examples in English:')
    en_examples = batch["en"].numpy()
    for en in en_examples:
        print(en.decode("utf-8"))

    print()
    
    print('> Examples in French:')
    fr_examples = batch["fr"].numpy()
    for fr in fr_examples:
        print(fr.decode("utf-8"))

2024-05-19 23:09:32.672636: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Training set samples:
> Examples in English:
In his briefing on economic development, Al Horner will give you details of programs we fund to foster partnerships between the private sector and First Nations and Inuit communities, in areas like resource development projects, for example.
(b) Positive aspects
Crop insurance payments include only government crop insurance programs; private hail insurance payments are excluded.

> Examples in French:
Dans sa présentation sur le développement économique, M. Al Horner vous donnera des détails sur les programmes que nous finançons pour favoriser l'établissement de partenariats entre le secteur privé et les collectivités des Premières nations et inuites dans des domaines comme celui de l'exploitation des ressources naturelles.
b) Aspects positifs
Les indemnités d’assurance-récolte comprennent uniquement celles des programmes publics; les indemnités de l’assurance-grêle privée sont exclues.


#### Load tokenizer

In [6]:
tokenizer_name = 'fr_en_tokenizer'
tf.keras.utils.get_file(
    f'{tokenizer_name}.zip',
    f'https://storage.googleapis.com/download.tensorflow.org/models/{tokenizer_name}.zip',
    cache_dir='.', cache_subdir='', extract=True
)

'./fr_en_tokenizer.zip'

In [7]:
tokenizers = tf.saved_model.load(tokenizer_name)

In [8]:
[item for item in dir(tokenizers.en) if not item.startswith('_')]

['detokenize',
 'get_reserved_tokens',
 'get_vocab_path',
 'get_vocab_size',
 'lookup',
 'tokenize',
 'tokenizer',
 'vocab']

### Set up a data pipeline with tf.data

In [9]:
MAX_TOKENS=128
def prepare_batch(batch):
    fr = batch["fr"]
    en = batch["en"]
    fr = tokenizers.fr.tokenize(fr)      # Output is ragged.
    fr = fr[:, :MAX_TOKENS]    # Trim to MAX_TOKENS.
    fr = fr.to_tensor()  # Convert to 0-padded dense Tensor

    en = tokenizers.en.tokenize(en)
    en = en[:, :(MAX_TOKENS+1)]
    en_inputs = en[:, :-1].to_tensor()  # Drop the [END] tokens
    en_labels = en[:, 1:].to_tensor()   # Drop the [START] tokens

    return (fr, en_inputs), en_labels

In [10]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64

In [11]:
def make_batches(ds):
  return (
      ds
      .shuffle(BUFFER_SIZE)
      .batch(BATCH_SIZE)
      .map(prepare_batch, tf.data.AUTOTUNE)
      .prefetch(buffer_size=tf.data.AUTOTUNE))

In [12]:
# Create training and validation set batches.
train_batches = make_batches(train_ds)
val_batches = make_batches(val_ds)

In [13]:
for (fr, en), en_labels in train_batches.take(1):
  break

print(fr.shape)
print(en.shape)
print(en_labels.shape)

2024-05-19 23:09:35.903973: E tensorflow/core/util/util.cc:131] oneDNN supports DT_INT64 only on platforms with AVX-512. Falling back to the default Eigen-based implementation if present.


(64, 128)
(64, 128)
(64, 128)


In [14]:
print(en[0][:10])
print(en_labels[0][:10])

tf.Tensor([   2  295  434  292 1109   15  350  306 1097  346], shape=(10,), dtype=int64)
tf.Tensor([ 295  434  292 1109   15  350  306 1097  346   17], shape=(10,), dtype=int64)


### Define the components

In [15]:
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1) 

  return tf.cast(pos_encoding, dtype=tf.float32)

In [16]:
class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True) 
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positonal_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x

In [17]:
embed_fr = PositionalEmbedding(vocab_size=tokenizers.fr.get_vocab_size().numpy(), d_model=512)
embed_en = PositionalEmbedding(vocab_size=tokenizers.en.get_vocab_size().numpy(), d_model=512)

fr_emb = embed_fr(fr)
en_emb = embed_en(en)

In [18]:
en_emb._keras_mask

<tf.Tensor: shape=(64, 128), dtype=bool, numpy=
array([[ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       ...,
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False]])>

In [19]:
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()


In [20]:
d = {'color': 'blue', 'age': 22, 'type': 'pickup'}
result = d['color']

In [21]:
class CrossAttention(BaseAttention):
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)

    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x

In [22]:
sample_ca = CrossAttention(num_heads=2, key_dim=512)

print(fr_emb.shape)
print(en_emb.shape)
print(sample_ca(en_emb, fr_emb).shape)

(64, 128, 512)
(64, 128, 512)




(64, 128, 512)




In [23]:
class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [24]:
sample_gsa = GlobalSelfAttention(num_heads=2, key_dim=512)

print(fr_emb.shape)
print(sample_gsa(fr_emb).shape)

(64, 128, 512)
(64, 128, 512)




In [25]:
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [26]:
sample_csa = CausalSelfAttention(num_heads=2, key_dim=512)

print(en_emb.shape)
print(sample_csa(en_emb).shape)

(64, 128, 512)
(64, 128, 512)




In [27]:
out1 = sample_csa(embed_en(en[:, :3])) 
out2 = sample_csa(embed_en(en))[:, :3]

tf.reduce_max(abs(out1 - out2)).numpy()

4.7683716e-07

In [28]:
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x) 
    return x

In [29]:
sample_ffn = FeedForward(512, 2048)

print(en_emb.shape)
print(sample_ffn(en_emb).shape)

(64, 128, 512)
(64, 128, 512)




In [30]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

In [31]:
sample_encoder_layer = EncoderLayer(d_model=512, num_heads=8, dff=2048)

print(fr_emb.shape)
print(sample_encoder_layer(fr_emb).shape)

(64, 128, 512)




(64, 128, 512)




In [32]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

    # Add dropout.
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x  # Shape `(batch_size, seq_len, d_model)`.

In [33]:
# Instantiate the encoder.
sample_encoder = Encoder(num_layers=4,
                         d_model=512,
                         num_heads=8,
                         dff=2048,
                         vocab_size=8500)

sample_encoder_output = sample_encoder(fr, training=False)

# Print the shape.
print(fr.shape)
print(sample_encoder_output.shape)  # Shape `(batch_size, input_seq_len, d_model)`.



(64, 128)
(64, 128, 512)


In [34]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

In [35]:
sample_decoder_layer = DecoderLayer(d_model=512, num_heads=8, dff=2048)

sample_decoder_layer_output = sample_decoder_layer(
    x=en_emb, context=fr_emb)

print(en_emb.shape)
print(fr_emb.shape)
print(sample_decoder_layer_output.shape)  # `(batch_size, seq_len, d_model)`



(64, 128, 512)
(64, 128, 512)
(64, 128, 512)




In [36]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x, context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    # The shape of x is (batch_size, target_seq_len, d_model).
    return x

In [37]:
# Instantiate the decoder.
sample_decoder = Decoder(num_layers=4,
                         d_model=512,
                         num_heads=8,
                         dff=2048,
                         vocab_size=8000)

output = sample_decoder(
    x=en,
    context=fr_emb)

# Print the shapes.
print(en.shape)
print(fr_emb.shape)
print(output.shape)



(64, 128)
(64, 128, 512)
(64, 128, 512)


In [38]:
sample_decoder.last_attn_scores.shape  # (batch, heads, target_seq, input_seq)

TensorShape([64, 8, 128, 128])

### The Transformer 

In [39]:
class Transformer(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    # To use a Keras model with `.fit` you must pass all your inputs in the
    # first argument.
    context, x  = inputs

    context = self.encoder(context)  # (batch_size, context_len, d_model)

    x = self.decoder(x, context)  # (batch_size, target_len, d_model)

    # Final linear layer output.
    logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

    try:
      # Drop the keras mask, so it doesn't scale the losses/metrics.
      # b/250038731
      del logits._keras_mask
    except AttributeError:
      pass

    # Return the final output and the attention weights.
    return logits

#### Hyperparameters

In [40]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

In [41]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=tokenizers.fr.get_vocab_size().numpy(),
    target_vocab_size=tokenizers.en.get_vocab_size().numpy(),
    dropout_rate=dropout_rate)

In [42]:
output = transformer((fr, en))

print(en.shape)
print(fr.shape)
print(output.shape)



(64, 128)
(64, 128)
(64, 128, 7955)


In [43]:
attn_scores = transformer.decoder.dec_layers[-1].last_attn_scores
print(attn_scores.shape)  # (batch, heads, target_seq, input_seq)

(64, 8, 128, 128)


In [44]:
transformer.summary()

#### Training 

In [45]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [46]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

#### Set up the loss and metrics

In [47]:
def masked_loss(label, pred):
  mask = label != 0
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')
  loss = loss_object(label, pred)

  mask = tf.cast(mask, dtype=loss.dtype)
  loss *= mask

  loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
  return loss


def masked_accuracy(label, pred):
  pred = tf.argmax(pred, axis=2)
  label = tf.cast(label, pred.dtype)
  match = label == pred

  mask = label != 0

  match = match & mask

  match = tf.cast(match, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  return tf.reduce_sum(match)/tf.reduce_sum(mask)

### Train the model

In [48]:
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

In [49]:
transformer.fit(train_batches,
                epochs=20,
                validation_data=val_batches)

Epoch 1/20
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4166s[0m 4s/step - loss: 7.9205 - masked_accuracy: 0.0713 - val_loss: 5.8780 - val_masked_accuracy: 0.1724
Epoch 2/20
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4104s[0m 4s/step - loss: 5.4704 - masked_accuracy: 0.2091 - val_loss: 5.0699 - val_masked_accuracy: 0.2249
Epoch 3/20
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4041s[0m 4s/step - loss: 4.6238 - masked_accuracy: 0.2747 - val_loss: 4.5022 - val_masked_accuracy: 0.2751
Epoch 4/20
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4061s[0m 4s/step - loss: 4.0105 - masked_accuracy: 0.3336 - val_loss: 4.1594 - val_masked_accuracy: 0.3144
Epoch 5/20
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4061s[0m 4s/step - loss: 3.5639 - masked_accuracy: 0.3830 - val_loss: 3.8103 - val_masked_accuracy: 0.3547
Epoch 6/20
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3971s[0m 4s/step - loss: 3.1521 - maske

2024-05-20 18:14:19.293803: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:299: Filling up shuffle buffer (this may take a while): 1 of 20000
2024-05-20 18:14:20.076203: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.


[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9151s[0m 10s/step - loss: 1.6550 - masked_accuracy: 0.6501 - val_loss: 3.5856 - val_masked_accuracy: 0.4236
Epoch 18/20
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5407s[0m 6s/step - loss: 1.5895 - masked_accuracy: 0.6613 - val_loss: 3.6507 - val_masked_accuracy: 0.4261
Epoch 19/20
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5337s[0m 6s/step - loss: 1.5143 - masked_accuracy: 0.6738 - val_loss: 3.7104 - val_masked_accuracy: 0.4204
Epoch 20/20
[1m938/938[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5374s[0m 6s/step - loss: 1.4578 - masked_accuracy: 0.6830 - val_loss: 3.7793 - val_masked_accuracy: 0.4176


<keras.src.callbacks.history.History at 0x7f384212dc10>

In [73]:
class Translator(tf.Module):
  def __init__(self, tokenizers, transformer):
    self.tokenizers = tokenizers
    self.transformer = transformer

  def __call__(self, sentence, max_length=MAX_TOKENS):
    # The input sentence is Portuguese, hence adding the `[START]` and `[END]` tokens.
    assert isinstance(sentence, tf.Tensor)
    if len(sentence.shape) == 0:
      sentence = sentence[tf.newaxis]

    sentence = self.tokenizers.fr.tokenize(sentence).to_tensor()

    encoder_input = sentence

    # As the output language is English, initialize the output with the
    # English `[START]` token.
    start_end = self.tokenizers.en.tokenize([''])[0]
    start = start_end[0][tf.newaxis]
    end = start_end[1][tf.newaxis]

    # `tf.TensorArray` is required here (instead of a Python list), so that the
    # dynamic-loop can be traced by `tf.function`.
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(max_length):
      output = tf.transpose(output_array.stack())
      predictions = self.transformer([encoder_input, output], training=False)

      # Select the last token from the `seq_len` dimension.
      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)

      # Concatenate the `predicted_id` to the output which is given to the
      # decoder as its input.
      output_array = output_array.write(i+1, predicted_id[0])

      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack())
    # The output shape is `(1, tokens)`.
    text = tokenizers.en.detokenize(output)[0]  # Shape: `()`.

    tokens = tokenizers.en.lookup(output)[0]

    # `tf.function` prevents us from using the attention_weights that were
    # calculated on the last iteration of the loop.
    # So, recalculate them outside the loop.
    self.transformer([encoder_input, output[:,:-1]], training=False)
    attention_weights = self.transformer.decoder.last_attn_scores

    return text, tokens, attention_weights

In [74]:
translator = Translator(tokenizers, transformer)

In [75]:
def print_translation(sentence, tokens, ground_truth):
  print(f'{"Input:":15s}: {sentence}')
  print(f'{"Prediction":15s}: {tokens.numpy().decode("utf-8")}')
  print(f'{"Ground truth":15s}: {ground_truth}')

### The model predicted accurately these two exemples:

In [77]:
sentence = "Dans sa présentation sur le développement économique, M. Al Horner vous donnera des détails sur les programmes que nous finançons pour favoriser l'établissement de partenariats entre le secteur privé et les collectivités des Premières nations et inuites dans des domaines comme celui de l'exploitation des ressources naturelles."
ground_truth = ''

translated_text, translated_tokens, attention_weights = translator(
    tf.constant(sentence))
print_translation(sentence, translated_text, ground_truth)

Input:         : Dans sa présentation sur le développement économique, M. Al Horner vous donnera des détails sur les programmes que nous finançons pour favoriser l'établissement de partenariats entre le secteur privé et les collectivités des Premières nations et inuites dans des domaines comme celui de l'exploitation des ressources naturelles.
Prediction     : in presentation on economic development , al . m h . al horner will give you details of programmes which we funded with partnerships between the private sector and first nation communities and inuit in development in areas such as exploitation of natural resources .
Ground truth   : 


In [79]:


sentence = " Aspects positifs Les indemnités d’assurance-récolte comprennent uniquement celles des programmes publics; les indemnités de l’assurance-grêle privée sont exclues."
ground_truth = ''

translated_text, translated_tokens, attention_weights = translator(
    tf.constant(sentence))
print_translation(sentence, translated_text, ground_truth)

Input:         :  Aspects positifs Les indemnités d’assurance-récolte comprennent uniquement celles des programmes publics; les indemnités de l’assurance-grêle privée sont exclues.
Prediction     : positive aspects of - harvest compensation are only those in public programs ; crown - crown compensation are excluded .
Ground truth   : 


### The model failed in predicting these other examples

In [59]:
sentence = 'Salut, comment Ça va?'
ground_truth = ''

translated_text, translated_tokens, attention_weights = translator(
    tf.constant(sentence))
print_translation(sentence, translated_text, ground_truth)

Input:         : Salut, comment Ça va?
Prediction     : known how many , how or not , how orly , how , how or because of how dos not stays only ? but what of how to stay , how , in the point of how , how , how , how , how , how , something , in the night , how , to stay , for times , when that point to point of how to stays are going to stay that ? not ? not , to stay , to point of how , how , of how , one stay , one point of how ? only only only , do , not , not , not , not , only , not , only , not ,
Ground truth   : 


In [60]:
sentence = 'Bonjour.'
ground_truth = ''

translated_text, translated_tokens, attention_weights = translator(
    tf.constant(sentence))
print_translation(sentence, translated_text, ground_truth)

Input:         : Bonjour.
Prediction     : good faith in the media with media with natural background .
Ground truth   : 


### Export the model for later usage (testing)

In [61]:
class ExportTranslator(tf.Module):
  def __init__(self, translator):
    self.translator = translator

  @tf.function(input_signature=[tf.TensorSpec(shape=[], dtype=tf.string)])
  def __call__(self, sentence):
    (result,
     tokens,
     attention_weights) = self.translator(sentence, max_length=MAX_TOKENS)

    return result

In [62]:
translator = ExportTranslator(translator)

In [63]:
translator('test.').numpy()

b'the test is not applicable in the test .'

In [65]:
tf.saved_model.save(translator, export_dir='translator-fr-en')

INFO:tensorflow:Assets written to: translator-fr-en/assets


INFO:tensorflow:Assets written to: translator-fr-en/assets
