# Sequence to sequence models

In [1]:
import os
import re
import string
import random

from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn import preprocessing
from sklearn.utils import shuffle

## Import data

This time we'll use a language dataset provided by http://www.manythings.org/anki/, containing pairs of English and French sentences.

In [None]:
# Download the English-French archive and extract it; with these cache
# settings the files are stored under ./datasets.
tf.keras.utils.get_file(
    fname='fra-eng.zip',
    origin='http://www.manythings.org/anki/fra-eng.zip',
    cache_subdir='datasets',
    cache_dir='./',
    extract=True,
)

In [8]:
!curl -O http://www.manythings.org/anki/fra-eng.zip
!unzip fra-eng.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 6379k  100 6379k    0     0  33.8M      0 --:--:-- --:--:-- --:--:-- 33.8M
Archive:  fra-eng.zip
replace _about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: _about.txt              
replace fra.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: fra.txt                 


In [24]:
# Read "fra.txt": one sentence pair per line, with tab-separated fields
# (english first, french second).
text_file = "fra.txt"
with open(text_file, "r", encoding="utf-8") as text:
    lines = text.read().split("\n")

# Build (english, french) tuples, skipping lines that have no second field
# (such as blank lines). Each french sentence is wrapped in the "[start]" and
# "[end]" markers.
text_pairs = []
for line in lines:
    fields = line.split("\t")
    if len(fields) < 2:
        continue
    english, french = fields[0], fields[1]
    text_pairs.append((english, "[start] " + french + " [end]"))

In [25]:
len(text_pairs)

192341

In [28]:
text_pairs[0:10]

[('Go.', '[start] Va ! [end]'),
 ('Go.', '[start] Marche. [end]'),
 ('Go.', '[start] Bouge ! [end]'),
 ('Hi.', '[start] Salut ! [end]'),
 ('Hi.', '[start] Salut. [end]'),
 ('Run!', '[start] Cours\u202f! [end]'),
 ('Run!', '[start] Courez\u202f! [end]'),
 ('Run!', '[start] Prenez vos jambes à vos cous ! [end]'),
 ('Run!', '[start] File ! [end]'),
 ('Run!', '[start] Filez ! [end]')]

In [29]:
# Shuffle the text pairs before splitting them into training (70%),
# validation (20%) and test (the remaining ~10%) sets.
# A fixed random_state makes the split identical on every run, and shuffling
# into a new variable keeps this cell idempotent (re-running it does not
# reshuffle an already shuffled list). With a different split on every run,
# the "test" pairs may have been seen when the loaded checkpoint was trained.
shuffled_pairs = shuffle(text_pairs, random_state=42)

num_train = int(0.7 * len(shuffled_pairs))
num_valid = int(0.2 * len(shuffled_pairs))

train_pairs = shuffled_pairs[:num_train]
valid_pairs = shuffled_pairs[num_train:num_train + num_valid]
test_pairs = shuffled_pairs[num_train + num_valid:]

In [30]:
train_pairs[0:10]

[('He has dedicated his life to the preservation of nature.',
  "[start] Il a dédié sa vie à la préservation de l'environnement. [end]"),
 ("It's you who've acted inappropriately.",
  "[start] C'est vous qui avez agi de manière inappropriée. [end]"),
 ('How could you do that?', '[start] Comment as-tu pu faire ça ? [end]'),
 ('The more laws, the more offenders.',
  '[start] Plus il y a de lois, plus il y a de délinquants. [end]'),
 ("I'm going to Australia to visit my family for Christmas.",
  '[start] Je vais aller en Australie pour rendre visite à ma famille pour Noël. [end]'),
 ('I suggest that you not wait any longer.',
  "[start] Je suggère que tu n'attendes pas davantage. [end]"),
 ('She likes to arrange flowers.',
  '[start] Elle aime composer des arrangements floraux. [end]'),
 ('We were outnumbered.', '[start] Nous avons été surpassés en nombre. [end]'),
 ('We enjoyed watching the fireworks on a bridge last summer.',
  "[start] Nous avons apprécié regarder le feu d'artifice sur

## Preprocessing

### Standardization & vectorization

In [31]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [32]:
# Characters to strip: ASCII punctuation plus the narrow no-break space
# ("\u202f") found in the french text. Square brackets are kept because
# they delimit the "[start]" / "[end]" markers of the french sentences.
strip_chars = "".join(
    char for char in string.punctuation + "\u202f" if char not in "[]"
)

def custom_standardization(input_string):
    """Lowercase the text and delete every character listed in strip_chars.

    Args:
        input_string: The string(s) to standardize, as accepted by
            tf.strings.lower.

    Returns:
        A tf.string tensor holding the standardized text.
    """
    removal_pattern = "[" + re.escape(strip_chars) + "]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, removal_pattern, "")

# Cap the vocabulary size and the sequence length.
# Words that are not among the most frequent ones kept in the vocabulary are
# mapped to [UNK].
vocab_size = 15000
sequence_length = 20

# Settings shared by the english and the french vectorization layers.
vectorizer_options = dict(
    max_tokens=vocab_size,
    output_mode="int",
    standardize=custom_standardization,
)

# TextVectorization layer for english sentences.
source_vectorization = layers.TextVectorization(
    output_sequence_length=sequence_length, **vectorizer_options)

# TextVectorization layer for french sentences. It is one position longer:
# the vectorized sentence is later split into a decoder input and a target
# that are offset by one token (see format_dataset).
target_vectorization = layers.TextVectorization(
    output_sequence_length=sequence_length + 1, **vectorizer_options)

# Learn each vocabulary from the training sentences only.
train_english_texts = [english for english, _ in train_pairs]
train_french_texts = [french for _, french in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_french_texts)

In [33]:
train_english_texts[0:10]

['He has dedicated his life to the preservation of nature.',
 "It's you who've acted inappropriately.",
 'How could you do that?',
 'The more laws, the more offenders.',
 "I'm going to Australia to visit my family for Christmas.",
 'I suggest that you not wait any longer.',
 'She likes to arrange flowers.',
 'We were outnumbered.',
 'We enjoyed watching the fireworks on a bridge last summer.',
 'He read the poem with a loud voice.']

In [34]:
# Indices 0 and 1 are always reserved for '' and '[UNK]'.
source_vectorization.get_vocabulary()[0:10]

['', '[UNK]', 'i', 'you', 'to', 'the', 'a', 'is', 'tom', 'that']

In [35]:
source_vectorization.vocabulary_size()

14141

In [36]:
# The french sentences contain more than 15000 unique words.
target_vectorization.vocabulary_size()

15000

In [38]:
# Build datasets from our training and validation sets. The important object
# here is "tf.data.Dataset.from_tensor_slices" which is a fancier version of
# the generators we have encountered before.
batch_size = 64

# Note that the input is a dictionary with both the english and the french
# version of the sentence. The english sentence is the input of the encoder
# and the french sentence is the input of the decoder during training.

def format_dataset(eng, fre):
    """Vectorize a batch of sentence pairs into model inputs and targets.

    eng is passed through source_vectorization and fre through
    target_vectorization. The vectorized french batch is then split into two
    views that are offset by one token.

    Returns:
        A tuple (inputs, targets). inputs is a dict with the key "english"
        (the vectorized english batch) and the key "french" (the vectorized
        french batch without its last position). targets is the vectorized
        french batch without its first position.
    """
    source_tokens = source_vectorization(eng)
    target_tokens = target_vectorization(fre)
    shifted_inputs = target_tokens[:, :-1]
    next_tokens = target_tokens[:, 1:]
    return {"english": source_tokens, "french": shifted_inputs}, next_tokens

def make_dataset(pairs):
    """Build a batched, vectorized tf.data.Dataset from sentence pairs.

    Args:
        pairs: Non-empty sequence of (english, french) string tuples.

    Returns:
        A dataset yielding the (inputs, targets) tuples produced by
        format_dataset, in batches of batch_size sentences.
    """
    eng_texts, fre_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    fre_texts = list(fre_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, fre_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    # Cache the vectorized batches first, then shuffle, then prefetch.
    # Caching after the shuffle would store one fixed order and replay it
    # on every epoch, and prefetching before the cache brings no benefit.
    return dataset.cache().shuffle(2048).prefetch(16)

train_ds = make_dataset(train_pairs)
valid_ds = make_dataset(valid_pairs)

In [39]:
# Here is what the output of format_dataset looks like:
format_dataset([train_pairs[0][0]], [train_pairs[0][1]])

({'english': <tf.Tensor: shape=(1, 20), dtype=int64, numpy=
  array([[   12,    67,  3056,    41,   236,     4,     5, 10868,    11,
           1893,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0]])>,
  'french': <tf.Tensor: shape=(1, 20), dtype=int64, numpy=
  array([[   2,   14,   18, 8120,   88,  194,    9,   10,    1,    5, 6277,
             3,    0,    0,    0,    0,    0,    0,    0,    0]])>},
 <tf.Tensor: shape=(1, 20), dtype=int64, numpy=
 array([[  14,   18, 8120,   88,  194,    9,   10,    1,    5, 6277,    3,
            0,    0,    0,    0,    0,    0,    0,    0,    0]])>)

In [40]:
# The dataset generates tuples -- created via format_dataset -- of batches of 
# size batch_size (=64).
for x in train_ds:
    print(x)
    break

({'english': <tf.Tensor: shape=(64, 20), dtype=int64, numpy=
array([[   2,   31,   24, ...,    0,    0,    0],
       [   2,  256,   24, ...,    0,    0,    0],
       [1142,    3,  290, ...,    0,    0,    0],
       ...,
       [   2,   19,  377, ...,    0,    0,    0],
       [  57,   64,  174, ...,    0,    0,    0],
       [  53,    3,  231, ...,    0,    0,    0]])>, 'french': <tf.Tensor: shape=(64, 20), dtype=int64, numpy=
array([[   2,    4,  375, ...,    0,    0,    0],
       [   2,    4,   27, ...,    0,    0,    0],
       [   2,    5,  105, ...,    0,    0,    0],
       ...,
       [   2,   24,  201, ...,    0,    0,    0],
       [   2,   89,    8, ...,    0,    0,    0],
       [   2,   19, 3786, ...,    0,    0,    0]])>}, <tf.Tensor: shape=(64, 20), dtype=int64, numpy=
array([[   4,  375,   78, ...,    0,    0,    0],
       [   4,   27,  684, ...,    0,    0,    0],
       [   5,  105, 2421, ...,    0,    0,    0],
       ...,
       [  24,  201,  695, ...,    0,    

In [41]:
for inputs, targets in train_ds.take(1):
    print(f"inputs['english'].shape: {inputs['english'].shape}")
    print(f"inputs['french'].shape: {inputs['french'].shape}")
    print(f"targets.shape: {targets.shape}")

inputs['english'].shape: (64, 20)
inputs['french'].shape: (64, 20)
targets.shape: (64, 20)


## Sequence to sequence model

The seq2seq model is built by combining 2 RNNs: the encoder and the decoder. 

The purpose of the encoder is feature extraction: the output should be a sequence that encodes the meaning of the input sentence in a way that is (somewhat) independent of the language.

The purpose of the decoder is to do the reverse, that is to build a sentence in french from the feature representation of the english sentence constructed by the encoder.

### Encoder

In [42]:
# The encoder has a simple structure: it takes a vectorized english sentence as
# a sequence, embeds it in an embed_dim-dimensional vector space and passes the
# resulting sequence through a bidirectional GRU layer.
embed_dim = 256
latent_dim = 1024

# The input is named "english" to match the dictionary key produced by
# format_dataset.
encoder_input = keras.Input(shape=(None,), dtype="int64", name="english")
# mask_zero=True: index 0 (the padding entry of the vocabulary) is masked.
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(encoder_input)
# The forward and backward GRU outputs are summed (merge_mode="sum"); the
# result is used further down as the initial state of the decoder GRU.
encoder_output = layers.Bidirectional(layers.GRU(latent_dim), merge_mode="sum")(x)

### Decoder

In [43]:
# The decoder is treated differently during training and inference.
# During training it takes the output of the encoder as well as the vectorized
# french sentence as inputs. The french sentence is similarly passed through
# an embedding layer.
decoder_input = keras.Input(shape=(None,), dtype="int64", name="french")
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(decoder_input)

# The decoder recurrent layer takes the embedded french sentence as its input
# and the encoder output as its initial state.
# return_sequences=True keeps one output per decoder time step.
decoder_gru = layers.GRU(latent_dim, return_sequences=True)
x = decoder_gru(x, initial_state=encoder_output)
x = layers.Dropout(0.5)(x)

# At each time step, the decoder's output is used to predict the next
# word/token as a softmax over the vocab_size vocabulary entries.
next_target = layers.Dense(vocab_size, activation="softmax")(x)

# The input names "english" and "french" match the dictionary keys produced
# by format_dataset.
seq2seq_rnn = keras.Model(inputs = [encoder_input, decoder_input], 
                          outputs = next_target)

In [52]:
seq2seq_rnn.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 english (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 french (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    3840000     ['english[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    3840000     ['french[0][0]']                 
                                                                                              

In [44]:
sample_pred = seq2seq_rnn.predict(train_ds.take(1))[0]

In [45]:
# The model outputs a sequence of predictions where each prediction
# is a classification in our vocabulary of 15000 words.
sample_pred.shape

(20, 15000)

In [46]:
# Recall what the target looks like:
for x in train_ds:
    print(x[1][0])
    break

tf.Tensor(
[  30 2548   31  406    3    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0], shape=(20,), dtype=int64)


In [47]:
# The targets are integer token indices (not one-hot vectors), hence the
# sparse variant of the categorical cross-entropy loss.
seq2seq_rnn.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])

In [48]:
# Restore previously saved weights. 'seq2seq_rnn.h5' is not created by any of
# the cells shown here, so it must already exist in the working directory.
seq2seq_rnn.load_weights('seq2seq_rnn.h5')

In [51]:
# Freeze one layer before fine-tuning.
# NOTE(review): index 4 depends on the model's layer ordering -- confirm which
# layer this is with seq2seq_rnn.layers[4].name.
seq2seq_rnn.layers[4].trainable = False
# The model was compiled before the trainable flag was changed; compile it
# again (same settings) so that fit() is guaranteed to honour the freeze.
seq2seq_rnn.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])

In [53]:
# Train for one epoch on train_ds, using valid_ds as validation data.
seq2seq_rnn.fit(train_ds, epochs=1, validation_data=valid_ds)



<keras.callbacks.History at 0x7f83de448290>

### Inference

In [54]:
# Lookup table from token index back to the french word; it is needed to
# de-vectorize the output of our seq2seq model.
fre_vocab = target_vectorization.get_vocabulary()
fre_index_lookup = dict(enumerate(fre_vocab))
# Upper bound on the number of tokens generated for one translation.
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    """Greedily translate an english sentence with the seq2seq_rnn model.

    Starting from "[start]", the model is asked for the next french token
    given the english sentence and the translation produced so far. The most
    probable token is appended, until "[end]" is produced or
    max_decoded_sentence_length tokens have been generated.

    Args:
        input_sentence: The english sentence to translate, as a string.

    Returns:
        The decoded french sentence as a string. It starts with the "[start]"
        marker and ends with "[end]" when the model predicted it.
    """
    # Vectorize the input sentence once; it does not change between steps.
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        # Vectorize the partial output sentence and drop the last position,
        # so that the decoder input has the same length as during training
        # (format_dataset feeds the decoder fre[:, :-1]).
        tokenized_target_sentence = target_vectorization(
            [decoded_sentence])[:, :-1]
        # Call the model directly instead of model.predict(): for a single
        # small batch this avoids predict()'s per-call overhead and its
        # progress output at every decoding step.
        next_token_predictions = seq2seq_rnn(
            [tokenized_input_sentence, tokenized_target_sentence],
            training=False)
        # Find the index of the word with the highest confidence value at
        # the current position.
        sampled_token_index = int(np.argmax(next_token_predictions[0, i, :]))
        # Pick the corresponding word and add it to the output sentence.
        sampled_token = fre_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        # Stop as soon as the model predicts that the sentence should end.
        if sampled_token == "[end]":
            break
    return decoded_sentence

In [55]:
# Translate five random test sentences and print each prediction next to
# its reference translation for comparison.
for _ in range(5):
    input_sentence, reference = random.choice(test_pairs)
    print("-")
    print(input_sentence)
    print(f"prediction: {decode_sequence(input_sentence)}")
    print(f"target: {reference}")

-
He never touched wine.
prediction: [start] il ne jamais jamais vin [end]
target: [start] Il ne buvait jamais de vin. [end]
-
Do you want some cherry pie?
prediction: [start] voulezvous que je ne vous [UNK] [end]
target: [start] Veux-tu de la tarte aux cerises ? [end]
-
I will not tolerate this.
prediction: [start] je ne vais pas [UNK] [end]
target: [start] Je ne le tolérerai pas. [end]
-
Tom is my boss at work.
prediction: [start] tom est le meilleur au travail [end]
target: [start] Tom est mon supérieur hiérarchique. [end]
-
I can't put up with the way he spits.
prediction: [start] je ne peux pas supporter le chemin [end]
target: [start] Je ne peux pas supporter sa manière de cracher. [end]


## Transformers

Using RNNs to build a machine translation network has quite a few shortcomings, the most apparent being:
- Our text embeddings so far fail to capture context dependence of words/tokens.
- While RNNs do capture word ordering, they are also designed to respect the ordering. However the ordering of words is not so strict and can be completely different in different languages.

It would seem like a better idea then to focus instead on capturing the context. This is the idea behind self attention.

### Positional Encoding

Switching from RNNs to Transformers, we trade word order awareness for context awareness. But we can always add the ordering of the words as an extra input!

In [58]:
class PositionalEmbedding(layers.Layer):
    """Token embedding summed with a learned embedding of each position.

    Args:
        sequence_length: Number of entries in the position embedding table.
        input_dim: Number of entries in the token embedding table.
        output_dim: Dimension of both the token and the position embeddings.
    """
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        # The standard embedding layer embedding the vectorized tokens.
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        # Position embedding layer embedding the sequence index.
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        # Kept so that get_config can report the constructor arguments.
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        """Return the token embeddings plus the embeddings of their positions."""
        # Positions run from 0 to length - 1, where length is the size of the
        # last axis of inputs.
        # NOTE(review): inputs longer than sequence_length would index past
        # the position table -- confirm callers never exceed it.
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        """Return a boolean mask that is True where the token index is not 0."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        })
        return config

### Transformer Encoder

The transformer encoder combines a multi-head attention mechanism with dense projections.

In [59]:
class TransformerEncoder(layers.Layer):
    """Self-attention followed by a dense projection, each with a residual
    connection and layer normalization.

    Args:
        embed_dim: Dimension of the token vectors entering and leaving the layer.
        dense_dim: Width of the hidden layer of the dense projection block.
        num_heads: Number of attention heads.
    """
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Dimension of the vector space where tokens are represented as dense vectors.
        self.embed_dim = embed_dim
        # Width of the hidden layer of the dense projection block.
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        # The projection module maps the inputs to dense_dim and back to embed_dim.
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Apply self-attention and the dense projection to inputs.

        Args:
            inputs: The embedded sequence.
            mask: Optional mask for inputs; when given it is used as the
                attention mask.
        """
        if mask is not None:
            # The mask will be generated by the preceding embedding layer and
            # needs to be reshaped to the correct shape (a new axis is
            # inserted at position 1).
            mask = mask[:, tf.newaxis, :]
        # Self-attention: inputs is used as both query and value.
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask)
        # Add a residual connection after the attention layer.
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        # Add a residual connection after the dense projection module.
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

### Transformer Decoder

The crucial difference between the decoder and the encoder is that the decoder has 2 inputs: 
- the english sentence passed through the encoder
- and the (partial) french sentence

The partial french sentence should be treated the same way as the english one so it needs to pass through a PositionalEmbedding layer as well as a MultiHeadAttention layer.

In [60]:
class TransformerDecoder(layers.Layer):
    """Causal self-attention over the target, attention over the encoder
    outputs, then a dense projection; each step is followed by a residual
    connection and layer normalization.

    Args:
        embed_dim: Dimension of the token vectors entering and leaving the layer.
        dense_dim: Width of the hidden layer of the dense projection block.
        num_heads: Number of attention heads.
    """
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        # The attention layer for the partial target input.
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        # The attention layer that compares the two inputs.
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        # Dense projection following the attention layers.
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

    def get_causal_attention_mask(self, inputs):
        """Build a lower-triangular attention mask for inputs.

        Returns:
            An int32 tensor of shape (batch, length, length), where batch and
            length are the first two dimensions of inputs. Entry (i, j) is 1
            when i >= j and 0 otherwise, so position i cannot attend to the
            positions that come after it.
        """
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        # Repeat the single (1, length, length) mask along the batch axis.
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
        return tf.tile(mask, mult)

    def call(self, inputs, encoder_outputs, mask=None):
        """Run the decoder block.

        Args:
            inputs: The embedded (partial) target sequence.
            encoder_outputs: The output of the encoder for the source sequence.
            mask: Optional mask; when given it is combined with the causal
                mask and used in the second attention layer.

        Returns:
            The layer-normalized output of the dense projection block.
        """
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(
                mask[:, tf.newaxis, :], dtype="int32")
            # Combine the mask with the causal ("lower-triangular") mask of
            # the partial target input.
            padding_mask = tf.minimum(padding_mask, causal_mask)
        else:
            # Without a mask, padding_mask used to be left unassigned and
            # the attention_2 call below raised UnboundLocalError.
            padding_mask = None
        # Causal self-attention over the partial target sequence.
        attention_output_1 = self.attention_1(
            query=inputs,
            value=inputs,
            key=inputs,
            attention_mask=causal_mask)
        attention_output_1 = self.layernorm_1(inputs + attention_output_1)
        # Attention of the target sequence over the encoder outputs.
        # NOTE(review): padding_mask has the target length on both of its
        # last two axes, so it only fits this attention when the source and
        # target sequences have the same length -- confirm this is intended.
        attention_output_2 = self.attention_2(
            query=attention_output_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        attention_output_2 = self.layernorm_2(
            attention_output_1 + attention_output_2)
        proj_output = self.dense_proj(attention_output_2)
        return self.layernorm_3(attention_output_2 + proj_output)

### End-to-end Transformer

In [64]:
# Hyperparameters of the transformer: embedding size, width of the hidden
# layer in the dense projection blocks and number of attention heads.
embed_dim = 256
dense_dim = 128
num_heads = 8

# Encoder branch: embed the english tokens together with their positions,
# then apply one TransformerEncoder block.
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

# Decoder branch: embed the french tokens the same way, then apply one
# TransformerDecoder block that also receives the encoder outputs.
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="french")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)
# One softmax over the vocab_size vocabulary entries per target position.
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [65]:
transformer.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 english (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 french (InputLayer)            [(None, None)]       0           []                               
                                                                                                  
 positional_embedding_2 (Positi  (None, None, 256)   3845120     ['english[0][0]']                
 onalEmbedding)                                                                                   
                                                                                                  
 positional_embedding_3 (Positi  (None, None, 256)   3845120     ['french[0][0]']           

In [66]:
sample_pred = transformer.predict(train_ds.take(1))[0]

In [68]:
sample_pred.shape

(20, 15000)

In [70]:
# The targets are integer token indices, hence the sparse variant of the
# categorical cross-entropy loss.
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])
# Train for one epoch on train_ds, using valid_ds as validation data.
transformer.fit(train_ds, epochs=1, validation_data=valid_ds)



<keras.callbacks.History at 0x7f83da18aa90>