In [39]:
import numpy as np
import math
import re
import time
import tensorflow as tf
import tensorflow
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
MAX_VOCAB_SIZE = 2**13
MAX_SEQUENCE_LENGTH = 20
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# Stage 1: Importing dependencies

In [4]:
try:
    %tensorflow_version 2.x
except:
    pass
import tensorflow as tf

from tensorflow.keras import layers
import tensorflow_datasets as tfds

# Stage 2: Data preprocessing

## Loading files

We import files from our personal google drive.

In [5]:
def text_gen(file_path):
    with open(file_path, mode='r', encoding='utf-8') as f:
        return (row for row in f.readlines()) 

## Cleaning data

Getting the non_breaking_prefixes as a clean list of words with a point at the end so it is easier to use.

In [6]:
with open("./nonbreaking_prefix.en",
          mode='r',
          encoding='utf-8') as f:
    non_breaking_prefix_en = f.read()
with open("./nonbreaking_prefix.fr",
          mode='r',
          encoding='utf-8') as f:
    non_breaking_prefix_fr = f.read()

non_breaking_prefix_en = non_breaking_prefix_en.split("\n")
non_breaking_prefix_en = [' ' + pref + '.' for pref in non_breaking_prefix_en]
non_breaking_prefix_fr = non_breaking_prefix_fr.split("\n")
non_breaking_prefix_fr = [' ' + pref + '.' for pref in non_breaking_prefix_fr]

We will need each word and other symbol that we want to keep to be in lower case and separated by spaces so we can "tokenize" them.

In [7]:
# prepare generator for extremely large text file

def reset_generators():
    europarl_en = text_gen("./europarl-v7.fr-en.en")
    europarl_fr = text_gen("./europarl-v7.fr-en.fr")

    corpus_en = europarl_en

    for prefix in non_breaking_prefix_en:
        corpus_en = (row.replace(prefix, prefix + "###") for row in corpus_en)

    corpus_en = (re.sub(r"\.(?=[0-9a-zA-Z])", ".###", row) for row in corpus_en)    
    corpus_en = (re.sub(".###", "", row) for row in corpus_en)
    corpus_en = (re.sub(r"\s+", " ", row) for row in corpus_en)
    corpus_en = ("<sos> " + row + " <eos>" for row in corpus_en)

    corpus_fr = europarl_fr

    for prefix in non_breaking_prefix_en:
        corpus_fr = (row.replace(prefix, prefix + "###") for row in corpus_fr)

    corpus_fr = (re.sub(r"\.(?=[0-9a-zA-Z])", ".###", row) for row in corpus_fr)    
    corpus_fr = (re.sub(".###", "", row) for row in corpus_fr)
    corpus_fr = (re.sub(r"\s+", " ", row) for row in corpus_fr)
    corpus_fr = ("<sos> " + row + " <eos>"  for row in corpus_fr)
    
    return corpus_en, corpus_fr

In [8]:
corpus_en, corpus_fr = reset_generators()

In [10]:
def get_gen_length(gen):
    return len(list(gen))

In [11]:
print(get_gen_length(corpus_en), get_gen_length(corpus_fr))

2007723 2007723


## Tokenizing text

In [15]:
tokenizer_en = Tokenizer(num_words=MAX_VOCAB_SIZE, filters="")
tokenizer_fr = Tokenizer(num_words=MAX_VOCAB_SIZE, filters="")

In [16]:
corpus_en, corpus_fr = reset_generators()
tokenizer_en.fit_on_texts(corpus_en)
tokenizer_fr.fit_on_texts(corpus_fr)

In [51]:
VOCAB_SIZE_EN = min(MAX_VOCAB_SIZE, len(tokenizer_en.word_index))
VOCAB_SIZE_FR = min(MAX_VOCAB_SIZE, len(tokenizer_fr.word_index))
print(VOCAB_SIZE_EN, VOCAB_SIZE_FR)

8192 8192


In [17]:
corpus_en, corpus_fr = reset_generators()
input_sentences = tokenizer_en.texts_to_sequences(corpus_en)
output_sentences = tokenizer_fr.texts_to_sequences(corpus_fr)

In [55]:
indexes = set()

for input_seq in input_sentences:
    indexes.update(input_seq)

print(len(indexes))

8192


#### Remark
Even we have fed in the parameter ```num_words=MAX_VOCAB_SIZE``` in the constructor of ```Tokenizer```, both ```len(tokenizer_en.word_index)``` and ```len(tokenizer_fr.word_index)``` still exceed ```MAX_VOCAB_SIZE```. 

However, the number of output indexes in the text to seq transformation still agress with the number ```MAX_VOCAB_SIZE```.

## Remove too long sentences and do padding

In [444]:
input_sentences = pad_sequences(
    input_sentences, 
    maxlen=MAX_SEQUENCE_LENGTH, 
    padding="post"
)

output_sentences = pad_sequences(
    output_sentences, 
    maxlen=MAX_SEQUENCE_LENGTH, 
    padding="post"
)

In [445]:
print(input_sentences.shape, output_sentences.shape)

(2007723, 20) (2007723, 20)


## Inputs/outputs creation

As we train with batches, we need each input to have the same length. We pad with the appropriate token, and we will make sure this padding token doesn't interfere with our training later.

In [446]:
dataset = tensorflow.data.Dataset.from_tensor_slices((input_sentences, output_sentences))

In [447]:
dataset = dataset.cache()
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
dataset = dataset.prefetch(tensorflow.data.experimental.AUTOTUNE)

# Stage 3: Model building

## Embedding

Positional encoding formulae:

$PE_{(pos,2i)} =\sin(pos/10000^{2i/dmodel})$

$PE_{(pos,2i+1)} =\cos(pos/10000^{2i/dmodel})$

In [481]:
# nuermical example:
a = np.array([1,2,3])
b = np.array([2,2,2])
#shape: (3, 1), make 3 copies horizontally (so stack column by column)
a = a[:, np.newaxis]
#shape: (1, 3), make 3 copies vertically (so stack row by row)
b = b[np.newaxis, :]

print(a*b)

[[2 2 2]
 [4 4 4]
 [6 6 6]]


In [482]:
class PositionalEncoding(layers.Layer):
    def __init__(self):
        super(PositionalEncoding, self).__init__()
    
    def get_angles(self, pos, i, d_model):
        # pos of size (seg_length, 1) and 
        # i of size (1, d_model)
        # pos * angles of size (seq_length, d_model)
        angles = 1/np.power(10000., 2*(i//2)/np.float32(d_model))
        return pos * angles
    
    def call(self, inputs):
        seq_length = inputs.shape.as_list()[-2]
        d_model = inputs.shape.as_list()[-1]

        angles = self.get_angles(
            np.arange(seq_length)[:, np.newaxis],
            np.arange(d_model)[np.newaxis, :],
            d_model
        )

        angles[:, 0::2] = np.sin(angles[:, 0::2])
        angles[:, 1::2] = np.cos(angles[:, 1::2]) 
        pos_encoding = angles[np.newaxis, ...]


        return inputs + tf.cast(pos_encoding, tf.float32)

## Attention

### Attention computation

$\mathrm{attention}(Q, K, V ) = \left[\mathrm{softmax}\bigg(\dfrac{QK^T}{\sqrt{d_k}}\bigg)\right]V $

In [499]:
def scaled_dot_product_attention(queries, keys, values, mask):
    product = tf.matmul(queries, keys, transpose_b=True)
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)
    scaled_product = product / tf.math.sqrt(keys_dim)
    
    if mask is not None:
        scaled_product +=  mask * -1e9
        
    softmax = tf.nn.softmax(scaled_product, axis=-1)
    attention = tf.matmul(softmax, values)
    
    return attention

### Multi-head attention sublayer

In [500]:
class MultiHeadAttention(layers.Layer):
    def __init__(self, n_proj):
        self.n_proj = n_proj
        super(MultiHeadAttention, self).__init__()
    
    def build(self, input_shape):
        self.d_model = input_shape[-1]  
        assert self.d_model % self.n_proj == 0
        
        self.d_proj = self.d_model // self.n_proj
        
        self.query_lin = layers.Dense(units=self.d_model)
        self.key_lin = layers.Dense(units=self.d_model)
        self.value_lin = layers.Dense(units=self.d_model)
        self.final_lin = layers.Dense(units=self.d_model)
    
    def split_proj(self, inputs, batch_size):
        # inputs of shape: (batch_size, seq_length, d_model)
        # split across the second dimension
        shape = (batch_size,
                -1,
                 self.n_proj,
                 self.d_proj
                )
        splited_inputs = tf.reshape(inputs, shape=shape)
        
        # desired output shape: 
        # (batch_size, n_proj, seq_length, d_proj)
        return tf.transpose(
            splited_inputs, 
            perm = [0, 2, 1, 3]
        )
        
    
    def call(self, queries, keys, values, mask):
        batch_size = tf.shape(queries)[0]
        
        queries = self.query_lin(queries)
        keys = self.key_lin(keys)
        values = self.value_lin(values)
        
        queries = self.split_proj(queries, batch_size)
        keys = self.split_proj(keys, batch_size)
        values = self.split_proj(values, batch_size)
        
        attention = scaled_dot_product_attention(
            queries,
            keys,
            values,
            mask
        )
        
        # attention of shape:
        # (batch_size, n_proj, d_proj, 1)

        attention = tf.transpose(
            attention,
            perm=[0, 2, 1, 3]
        )
        
        concat_attention = tf.reshape(
            attention, 
            shape=(batch_size, -1, self.d_model)
        )
        
        outputs = self.final_lin(concat_attention)
        
        return outputs

## Encoder

In [501]:
class EncoderLayer(layers.Layer):
    def __init__(self, FFN_units, n_proj, dropout):
        super(EncoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.n_proj = n_proj
        self.dropout = dropout
    
    def build(self, input_shape):
        self.d_model = input_shape[-1]
        self.multi_head_attention = MultiHeadAttention(self.n_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout)
        self.norm_1 = layers.LayerNormalization(epsilon = 1e-6)
        
        self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_2 = layers.Dropout(rate=self.dropout)
        self.norm_2 = layers.LayerNormalization(epsilon = 1e-6)
        
    def call(self, inputs, mask, training):
        attention = self.multi_head_attention(
            inputs, 
            inputs, 
            inputs, 
            mask
        )
        
        attention = self.dropout_1(attention, training=training)
        attention = self.norm_1(attention + inputs)
        
        outputs = self.dense_1(attention)
        outouts = self.dense_2(outputs)
        outputs = self.dropout_2(outputs)
        outputs = self.norm_2(tf.concat([outputs, attention], axis=-1))
        
        return outputs

In [502]:
class Encoder(layers.Layer):
    def __init__(self,
                 n_layers,
                 FFN_units,
                 n_proj,
                 dropout,
                 vocab_size,
                 d_model,
                 name="encoder"):
        super(Encoder, self).__init__(name=name)
        self.n_layers = n_layers
        self.d_model = d_model
        
        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout)
        self.enc_layers = [
            EncoderLayer(FFN_units, n_proj, dropout)
            for _ in range(n_layers)        
        ]
    
    def call(self, inputs, mask, training):
        outputs = self.embedding(inputs)
        outputs = outputs * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)
        
        for i in range(self.n_layers):
            outputs = self.enc_layers[i](outputs, mask, training)
        
        return outputs

## Decoder

In [503]:
class DecoderLayer(layers.Layer):
    def __init__(self, FFN_units, n_proj, dropout):
        super(DecoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.n_proj = n_proj
        self.dropout = dropout
    
    def build(self, input_shape):
        self.d_model = input_shape[-1]
        
        self.multi_head_attention_1 = MultiHeadAttention(self.n_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
        
        self.multi_head_attention_2 = MultiHeadAttention(self.n_proj)
        self.dropout_2 = layers.Dropout(rate=self.dropout)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)
        
        self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_3 = layers.Dropout(rate=self.dropout)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)
    
    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        attention = self.multi_head_attention_1(
            inputs, 
            inputs, 
            inputs,
            mask_1
        )
        
        attention = self.dropout_1(attention, training)
        attention = self.norm_1(attention + inputs)
        
        attention_2 = self.multi_head_attention_2(
            attention,
            enc_outputs,
            enc_outputs,
            mask_2
        )
        
        attention_2 = self.dropout_2(attention_2, training)
        attention_2 = self.norm_2(attention_2 + attention)
        
        outputs = self.dense_1(attention_2)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_3(outputs, training)
        outputs = self.norm_3(tf.concat([outputs, attention_2], axis=-1))
        
        return outputs
        

In [504]:
class Decoder(layers.Layer):
    def __init__(self,
                 n_layers,
                 FFN_units,
                 n_proj, 
                 dropout,
                 vocab_size,
                 d_model,
                 name="decoder"
                ):
        super(Decoder, self).__init__(name=name)
        self.d_model = d_model
        self.n_layers = n_layers
        
        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout)
        
        self.dec_layers = [
            DecoderLayer(FFN_units, n_proj, dropout)
            for _ in range(n_layers)
        ]
    
    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        outputs = self.embedding(inputs)
        outputs = outputs * tf.math.sqrt(tf.cast(self.d_model, tf.float32))        
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)
        
        for i in range(self.n_layers):
            outputs = self.dec_layers[i](
                outputs,
                enc_outputs,
                mask_1,
                mask_2,
                training
            )
        
        return outputs

## Transformer

In [505]:
class Transformer(tf.keras.Model):
    def __init__(self,
                 vocab_size_enc,
                 vocab_size_dec,
                 d_model,
                 n_layers,
                 FFN_units,
                 n_proj,
                 dropout,
                 name="transformer"
                ):
        super(Transformer, self).__init__(name=name)
        
        self.encoder = Encoder(n_layers,
                               FFN_units,
                               n_proj, 
                               dropout, 
                               vocab_size_enc,
                               d_model)
        
        self.decoder = Decoder(n_layers,
                               FFN_units,
                               n_proj,
                               dropout,
                               vocab_size_dec,
                               d_model)
        
        self.last_linear = layers.Dense(units=vocab_size_dec)
        
    def create_padding_mask(self, seqs):
        mask = tf.cast(tf.math.equal(seqs, 0), tf.float32)
        return mask[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, seqs):
        seq_len = tf.shape(seqs)[1]
        look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return look_ahead_mask

    def call(self, enc_inputs, dec_inputs, training):
        enc_mask = self.create_padding_mask(enc_inputs)
        dec_mask_1 = tf.maximum(
            self.create_padding_mask(dec_inputs),
            self.create_look_ahead_mask(dec_inputs)
        )
        dec_mask_2 = self.create_padding_mask(enc_inputs)
        enc_outputs = self.encoder(enc_inputs, enc_mask, training)
        dec_outputs = self.decoder(dec_inputs, enc_outputs, dec_mask_1, dec_mask_2, training)
        outputs = self.last_linear(dec_outputs)

        return outputs
        
        
        

# Training

In [506]:
tf.keras.backend.clear_session()

D_MODEL = 128
N_LAYERS = 4
FFN_UNITS = 512
N_PROJ = 8
DROPOUT = 0.1

transformer = Transformer(vocab_size_enc=VOCAB_SIZE_EN,
                        vocab_size_dec=VOCAB_SIZE_FR,
                        d_model=D_MODEL,
                        n_layers=N_LAYERS,
                        FFN_units=FFN_UNITS,
                        n_proj=N_PROJ,
                        dropout=DROPOUT,
                        name="transformer")

In [507]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction="none"
)

In [508]:
def loss_function(target, pred):
    mask = tf.math.logical_not(tf.math.equal(target, 0))
    loss_ = loss_object(target, pred)  
    
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ = loss_ * mask
    
    return tf.reduce_mean(loss_)

In [509]:
train_loss=tf.keras.metrics.Mean(name="train_loss")
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name="train_accuracy")

In [514]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps
        
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
    
learning_rate = CustomSchedule(D_MODEL)

optimizer = tf.keras.optimizers.Adam(
    learning_rate, 
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9
)

In [515]:
checkpoint_dir_path = "./checkpoints/"

checkpoint = tf.train.Checkpoint(
    model=transformer,
    optimizer=optimizer
)

checkpoint_manager = tf.train.CheckpointManager(
    checkpoint,
    checkpoint_dir_path,
    max_to_keep=5
)

if checkpoint_manager.latest_checkpoint:
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    print("[INFO] Latest checkpoint restored.")


In [None]:
EPOCHS = 10
for epoch in range (EPOCHS):
    print(f"start of epoch {epoch+1}")
    start = time.time()
    
    train_loss.reset_states()
    train_accuracy.reset_states()
    for (batch, (enc_inputs, targets)) in enumerate(dataset):
        dec_inputs = targets[:,:-1]
        dec_outputs_real = targets[:, 1:]
        
        with tf.GradientTape() as tape:
            predictions = transformer(enc_inputs, dec_inputs, True)
            loss = loss_function(dec_outputs_real, predictions)
        
        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients((
            (grad, variable) 
            for (grad, variable) in zip(gradients, transformer.trainable_variables) if grad is not None))
        
        train_loss(loss)
        train_accuracy(dec_outputs_real, predictions)
        
        if batch % 50 ==0:
            print("Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}".format(
            epoch+1, batch, train_loss.result(), train_accuracy.result()))
        
    
    checkpoint_save_path = checkpoint_manager.save()
    print("Saving checkpoint for epoch {} at {}".format(
        epoch+1,
        checkpoint_save_path
    ))
    
    print("Time take for 1 epoch: {} secs \n".format(time.time()-start))               

start of epoch 1
Epoch 1 Batch 0 Loss 5.6067 Accuracy 0.0584
Epoch 1 Batch 50 Loss 5.5944 Accuracy 0.0555
Epoch 1 Batch 100 Loss 5.5152 Accuracy 0.0594
Epoch 1 Batch 150 Loss 5.4622 Accuracy 0.0690
Epoch 1 Batch 200 Loss 5.4328 Accuracy 0.0756
Epoch 1 Batch 250 Loss 5.4039 Accuracy 0.0797
Epoch 1 Batch 300 Loss 5.3753 Accuracy 0.0829
Epoch 1 Batch 350 Loss 5.3477 Accuracy 0.0855
Epoch 1 Batch 400 Loss 5.3156 Accuracy 0.0883
Epoch 1 Batch 450 Loss 5.2832 Accuracy 0.0912
Epoch 1 Batch 500 Loss 5.2353 Accuracy 0.0946
Epoch 1 Batch 550 Loss 5.1852 Accuracy 0.0982
Epoch 1 Batch 600 Loss 5.1296 Accuracy 0.1017
Epoch 1 Batch 650 Loss 5.0749 Accuracy 0.1054
Epoch 1 Batch 700 Loss 5.0218 Accuracy 0.1091
Epoch 1 Batch 750 Loss 4.9711 Accuracy 0.1128
Epoch 1 Batch 800 Loss 4.9215 Accuracy 0.1164
Epoch 1 Batch 850 Loss 4.8763 Accuracy 0.1198
Epoch 1 Batch 900 Loss 4.8323 Accuracy 0.1230
Epoch 1 Batch 950 Loss 4.7917 Accuracy 0.1261
Epoch 1 Batch 1000 Loss 4.7535 Accuracy 0.1291
Epoch 1 Batch 1050 

# Evaluate