# <p style="padding:15px; background-color:#3f384a; font-family:JetBrains Mono; font-weight:bold; color:#f2f2f0; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 10px 10px">NLP - Neural Machine Translation with Attention</p>

In [1]:
import os
import shutil
import subprocess
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import cpu_count
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import tensorflow as tf
from tensorflow import keras
from keras import layers
from colorama import Fore, Style
from IPython.core.display import HTML
from IPython.display import display_html

# Shorthand for the Keras backend module (used later for `K.clear_session()`).
K = keras.backend
# True when the `KAGGLE_KERNEL_RUN_TYPE` environment variable is set.
ON_KAGGLE = os.getenv("KAGGLE_KERNEL_RUN_TYPE") is not None
# Colours passed to the plotly figure layout.
FONT_COLOR = "#141B4D"
BACKGROUND_COLOR = "#F6F5F5"
# Colorama prefixes for console output; the default text colour depends on `ON_KAGGLE`.
CLR = (Style.BRIGHT + Fore.BLACK) if ON_KAGGLE else (Style.BRIGHT + Fore.WHITE)
RED = Style.BRIGHT + Fore.RED
BLUE = Style.BRIGHT + Fore.BLUE
CYAN = Style.BRIGHT + Fore.CYAN
RESET = Style.RESET_ALL
# Named hex colours for the notebook theme.
# NOTE(review): not referenced by any code cell visible in this notebook - confirm it is still needed.
NOTEBOOK_PALETTE = {
    "graphite_blue": "#26344E",
    "purple_blue1": "#2A357D",
    "purple_blue2": "#454D82",
    "dark_blue1": "#141B4D",
    "dark_blue2": "#0F173B",
    "light_white": "#F2F2F0",
    "light_beige1": "#F6F5F5",
    "light_beige2": "#FFFAF6",
    "orange": "#C73C1A",
    "graphite": "#3E3F4C",
}


def download_dataset_from_kaggle(user, dataset, directory):
    """Download and unpack a Kaggle dataset into `directory` (no-op if already there).

    Args:
        user: Kaggle user name owning the dataset.
        dataset: Dataset slug; the archive is expected to be `<dataset>.zip`.
        directory: Target folder (`str` or `Path`) for the archive and its contents.

    Raises:
        subprocess.CalledProcessError: If the `kaggle` CLI exits with a non-zero status.
    """
    directory = Path(directory)
    archive = directory / f"{dataset}.zip"

    # The archive is kept next to the extracted files and doubles as a "done" marker.
    if archive.is_file():
        return

    # Create the target folder *before* downloading so the CLI can write into it.
    directory.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        ["kaggle", "datasets", "download", "-d", f"{user}/{dataset}", "-p", str(directory)],
        check=True,  # Fail loudly instead of trying to unpack a missing archive.
    )
    shutil.unpack_archive(archive, directory)


# Display a <style> block that gives inline `code` elements a translucent
# background and rounded corners in the rendered notebook.
HTML(
    """
<style>
code {
    background: rgba(42, 53, 125, 0.10) !important;
    border-radius: 4px !important;
}
</style>
"""
)


<b><span style="font-size:20px; font-family:JetBrains Mono; margin-left: 5px;">
    Notebook Description 📜
</span></b>
<p style="font-size:16px;font-family: JetBrains Mono; margin-left: 25px; margin-right: 25px; margin-top: 15px; margin-bottom: 20px">
    This notebook aims to handle one of the natural language processing (NLP) challenges, i.e. <b>machine translation</b>. We will focus on employing the encoder-decoder architecture and a disruptive approach to NLP, i.e. transformers architecture. To do that, we will use two <b>English-French</b> datasets. In the first part, we will focus on an easy dataset (around 180000 sentences, 12 MB), whereas in the second part, we will use the second dataset (about 22.5 million sentences, 8 GB). In this notebook, we translate English sentences into French ones.
</p>
<b><span style="font-size:20px;font-family:JetBrains Mono; margin-left: 5px;">
    This Notebook Covers 📔
</span></b>
<ul style="font-size:16px; font-family: JetBrains Mono; margin-left: 10px; margin-right: 15px; margin-top: 15px; margin-bottom: 20px">
    <li>TO DO</li>
</ul>
<b><span style="font-size:20px;font-family:JetBrains Mono; margin-left: 5px;">
    See Datasets Here 📈
</span></b>
<p style="font-size:16px; font-family: JetBrains Mono; margin-left: 25px; margin-right: 25px; margin-top: 15px; margin-bottom: 20px">
    <a href="https://www.kaggle.com/datasets/devicharith/language-translation-englishfrench" style="color:#2A357D"><b>Easy English-French Dataset</b></a><br>
    <a href="https://www.kaggle.com/datasets/dhruvildave/en-fr-translation-dataset" style="color:#2A357D"><b>Hard English-French Dataset</b></a>
</p>

# <p style="padding:15px; background-color:#3f384a; font-family:JetBrains Mono; font-weight:bold; color:#f2f2f0; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 10px 10px">Tackling Easy Dataset</p>

<p style="font-size:20px; font-family:JetBrains Mono; border-bottom: 3px solid #e04c5f; margin-left: 5px; margin-right: 5px;"><b>Notes</b> 📜</p>
<ul style="font-size:16px; font-family:JetBrains Mono; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
    <li>In this section, we will focus on an easy English-French dataset.</li>
    <li>First, let's download and see what we are dealing with.</li>
</ul>

In [16]:
%%time

easy_dataset_user = "devicharith"
easy_dataset = "language-translation-englishfrench"
data_dir = Path("data")

if not ON_KAGGLE:
    download_dataset_from_kaggle(easy_dataset_user, easy_dataset, data_dir)
    easy_dataset_path = data_dir / "eng_-french.csv"
else:
    easy_dataset_path = Path(
        "/kaggle/input/language-translation-englishfrench/eng_-french.csv"
    )


CPU times: total: 0 ns
Wall time: 0 ns


In [18]:
%%time

easy_dataset = pd.read_csv(easy_dataset_path, encoding="utf-8", engine="pyarrow")
easy_dataset = easy_dataset.sample(len(easy_dataset), random_state=42)
easy_dataset.head()


CPU times: total: 141 ms
Wall time: 140 ms


Unnamed: 0,English words/sentences,French words/sentences
2785,Take a seat.,Prends place !
29880,I wish Tom was here.,J'aimerais que Tom soit là.
53776,How did the audition go?,Comment s'est passée l'audition ?
154386,I've no friend to talk to about my problems.,Je n'ai pas d'ami avec lequel je puisse m'entr...
149823,I really like this skirt. Can I try it on?,"J'aime beaucoup cette jupe, puis-je l'essayer ?"


In [19]:
# Print the column dtypes, non-null counts and memory usage of the shuffled frame.
easy_dataset.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 175621 entries, 2785 to 121958
Data columns (total 2 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   English words/sentences  175621 non-null  object
 1   French words/sentences   175621 non-null  object
dtypes: object(2)
memory usage: 4.0+ MB


In [20]:
# Map each text column to the name of its derived word-count column.
word_count_columns = {
    "English words/sentences": "English Words in Sentence",
    "French words/sentences": "French Words in Sentence",
}
for text_column, count_column in word_count_columns.items():
    # Whitespace tokenisation: number of tokens per sentence.
    easy_dataset[count_column] = easy_dataset[text_column].str.split().str.len()

# Grouped histogram (with marginal box plots) of both word-count distributions.
fig = px.histogram(
    easy_dataset,
    x=list(word_count_columns.values()),
    color_discrete_sequence=["#3f384a", "#e04c5f"],
    labels={"variable": "Variable", "value": "Words in Sentence"},
    marginal="box",
    barmode="group",
    height=560,
    width=840,
    title="Words in Sentence",
)
legend_layout = {
    "orientation": "h",
    "yanchor": "bottom",
    "xanchor": "right",
    "y": 1.02,
    "x": 1,
}
fig.update_layout(
    font_color=FONT_COLOR,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    bargap=0.2,
    bargroupgap=0.1,
    legend=legend_layout,
)
fig.show()


<p style="font-size:20px; font-family:JetBrains Mono; border-bottom: 3px solid #e04c5f; margin-left: 5px; margin-right: 5px;"><b>Notes</b> 📜</p>
<ul style="font-size:16px; font-family:JetBrains Mono; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
    <li>As you can see, sentences usually have several words, at most $15$.</li>
    <li>Additionally, sentences are arranged in ascending order of the number of words. Therefore, I took the liberty of shuffling this data.</li>
    <li>Let's prepare the training dataset and validation datasets.</li>
</ul>

In [25]:
# Hold out the last 10% of the (already shuffled) rows for validation.
validation_size = 0.1
valid_len = int(validation_size * len(easy_dataset))

sentences_en, sentences_fr = (
    easy_dataset[column].to_numpy()
    for column in ("English words/sentences", "French words/sentences")
)

sentences_en_train, sentences_en_valid = (
    sentences_en[:-valid_len],
    sentences_en[-valid_len:],
)
sentences_fr_train, sentences_fr_valid = (
    sentences_fr[:-valid_len],
    sentences_fr[-valid_len:],
)

print(f"{CLR}Train Size: {RED}{len(sentences_en_train)}")
print(f"{CLR}Valid Size: {RED}{len(sentences_en_valid)}")


[1m[37mTrain Size: [1m[31m158059
[1m[37mValid Size: [1m[31m17562


<p style="font-size:20px; font-family:JetBrains Mono; border-bottom: 3px solid #e04c5f; margin-left: 5px; margin-right: 5px;"><b>Notes</b> 📜</p>
<ul style="font-size:16px; font-family:JetBrains Mono; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
    <li>Since we have the data split, we can prepare it for the encoder-decoder architecture. In general, we need two inputs and a target. The first input, i.e. English sentences, is passed to the encoder. On the other hand, French ones are passed to the decoder. Nevertheless, the decoder should receive them shifted by one time step. Therefore, we need to add a unique token - the start of a sequence (SOS). It acts as an indicator or a trigger signal for the decoder to start generating the translated output. Also, the target should contain a unique token - the end of a sequence (EOS). It serves as a marker to indicate the completion of the translation. When the decoder generates the EOS token, it signals that the translation process is finished.</li>
    <li>Now, we will write two short utility functions that create <code>TensorFlow</code> datasets for the encoder-decoder.</li>
</ul>

In [27]:
def prepare_input_and_target(sentences_en, sentences_fr):
    """Build `((encoder_input, decoder_input), target)` from a sentence pair.

    The decoder input is the French text prefixed with `startofseq `, and the
    target is the French text suffixed with ` endofseq`.
    """
    decoder_input = b"startofseq " + sentences_fr
    target = sentences_fr + b" endofseq"
    return (sentences_en, decoder_input), target


def from_sentences_dataset(
    sentences_en,
    sentences_fr,
    batch_size=32,
    cache=True,
    shuffle=False,
    shuffle_buffer_size=10_000,
    seed=None,
):
    """Build a batched `tf.data` pipeline for the encoder-decoder models.

    Sentence pairs are sliced element-wise, converted with
    `prepare_input_and_target`, optionally cached, optionally shuffled
    (after caching), and finally batched.

    Args:
        sentences_en: English sentences.
        sentences_fr: French sentences, aligned with `sentences_en`.
        batch_size: Number of pairs per batch.
        cache: Whether to cache the mapped elements.
        shuffle: Whether to shuffle the elements before batching.
        shuffle_buffer_size: Buffer size handed to `Dataset.shuffle`.
        seed: Seed handed to `Dataset.shuffle`.

    Returns:
        A dataset yielding `((encoder_input, decoder_input), target)` batches.
    """
    pairs = tf.data.Dataset.from_tensor_slices((sentences_en, sentences_fr))
    pipeline = pairs.map(prepare_input_and_target, num_parallel_calls=tf.data.AUTOTUNE)
    pipeline = pipeline.cache() if cache else pipeline
    pipeline = pipeline.shuffle(shuffle_buffer_size, seed=seed) if shuffle else pipeline
    return pipeline.batch(batch_size)


In [29]:
# Peek at the first batch of a tiny (batch_size=2) pipeline to check the structure.
example_ds = from_sentences_dataset(sentences_en_train, sentences_fr_train, batch_size=2)
next(iter(example_ds))


((<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'Take a seat.', b'I wish Tom was here.'], dtype=object)>,
  <tf.Tensor: shape=(2,), dtype=string, numpy=
  array([b'startofseq Prends place !',
         b"startofseq J'aimerais que Tom soit l\xc3\xa0."], dtype=object)>),
 <tf.Tensor: shape=(2,), dtype=string, numpy=
 array([b'Prends place ! endofseq',
        b"J'aimerais que Tom soit l\xc3\xa0. endofseq"], dtype=object)>)

<p style="font-size:20px; font-family:JetBrains Mono; border-bottom: 3px solid #e04c5f; margin-left: 5px; margin-right: 5px;"><b>Notes</b> 📜</p>
<ul style="font-size:16px; font-family:JetBrains Mono; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
    <li>As you can see, everything should work. We got the output in the desired form, i.e. <code>((encoder_input, decoder_input), target)</code>.</li>
    <li>We need two more functions. The first, <code>adapt_compile_and_fit()</code>, is responsible for additional dataset preparation, adapting the model's text vectorization layers, and, finally, the training process. The second one, <code>translate()</code>, is responsible for sentence translation.</li>
    <li>Additionally, we will write a small callback, i.e. <code>ColoramaVerbose</code>, which slightly prettifies the training output.</li>
</ul>

In [30]:
class ColoramaVerbose(keras.callbacks.Callback):
    """Prints one colourised summary line at the end of every epoch.

    Only the metrics actually present in `logs` are printed, so the callback
    also works when no validation data is supplied or when `logs` is `None`.
    """

    # Metrics reported, in display order.
    _METRICS = ("loss", "accuracy", "val_loss", "val_accuracy")

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        fields = [f"{CLR}Epoch: {RED}{epoch + 1:02d}"]
        fields.extend(
            f"{CLR}{name}: {RED}{logs[name]:.5f}"
            for name in self._METRICS
            if name in logs
        )
        # Reset the style at the very end so colours do not leak into later output.
        print(f"{CLR} -{RESET} ".join(fields) + RESET)


In [31]:
def adapt_compile_and_fit(
    model,
    train_dataset,
    valid_dataset,
    n_epochs=5,
    n_patience=3,
    init_lr=0.01,
    decay_rate=0.1,
    verbose_level=0,
):
    """Adapt the model's vectorization layers, then compile and train the model.

    Steps:
      1. Adapt `model.vectorization_en` / `model.vectorization_fr` on the training data.
      2. Vectorize the targets of both datasets and add prefetching.
      3. Compile with RMSprop under an exponential learning-rate decay spanning
         the whole run, and fit with early stopping on `val_accuracy`.

    Args:
        model: Model exposing `vectorization_en` and `vectorization_fr` layers.
        train_dataset: Batched dataset of `((encoder_input, decoder_input), target)`.
        valid_dataset: Validation dataset with the same structure.
        n_epochs: Maximum number of training epochs.
        n_patience: Early-stopping patience (epochs).
        init_lr: Initial learning rate.
        decay_rate: Factor applied to the learning rate over `n_epochs` of steps.
        verbose_level: `verbose` argument forwarded to `model.fit`.

    Returns:
        The `History` object returned by `model.fit`.

    Raises:
        ValueError: If `train_dataset` is infinite (the decay horizon is undefined).
    """
    autotune = tf.data.AUTOTUNE

    model.vectorization_en.adapt(
        train_dataset.map(
            lambda sentences, target: sentences[0],  # English sentences.
            num_parallel_calls=autotune,
        )
    )
    # The decoder input already starts with "startofseq"; appending "endofseq"
    # makes sure both special tokens end up in the French vocabulary.
    model.vectorization_fr.adapt(
        train_dataset.map(
            lambda sentences, target: sentences[1] + b" endofseq",  # French sentences.
            num_parallel_calls=autotune,
        )
    )

    def vectorize_target(sentences, target):
        """Keep the raw text inputs; turn the target sentence into token ids."""
        return sentences, model.vectorization_fr(target)

    def prepare(dataset):
        return dataset.map(vectorize_target, num_parallel_calls=autotune).prefetch(
            autotune
        )

    train_dataset_prepared = prepare(train_dataset)
    valid_dataset_prepared = prepare(valid_dataset)

    # Number of batches per epoch. Prefer the statically known cardinality; when
    # it is unknown (e.g. text-line pipelines) count the batches while streaming
    # instead of materialising every batch in a list.
    n_batches = int(train_dataset_prepared.cardinality())
    if n_batches == tf.data.INFINITE_CARDINALITY:
        raise ValueError("`train_dataset` must be finite to schedule the learning rate.")
    if n_batches == tf.data.UNKNOWN_CARDINALITY:
        n_batches = sum(1 for _ in train_dataset_prepared)

    colorama_cb = ColoramaVerbose()
    early_stopping_cb = keras.callbacks.EarlyStopping(
        monitor="val_accuracy", patience=n_patience, restore_best_weights=True
    )
    scheduled_lr = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=init_lr,
        decay_steps=n_epochs * n_batches,
        decay_rate=decay_rate,
    )

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=keras.optimizers.RMSprop(learning_rate=scheduled_lr),
        metrics=["accuracy"],
    )

    return model.fit(
        train_dataset_prepared,
        epochs=n_epochs,
        validation_data=valid_dataset_prepared,
        callbacks=[early_stopping_cb, colorama_cb],
        verbose=verbose_level,
    )


In [32]:
def translate(model, sentence_en):
    """Greedily translate one English sentence with `model`.

    At each step the model is run on the English sentence plus the translation
    produced so far, and the most probable token at the current position is
    appended. Decoding stops at `endofseq` or after `model.max_sentence_len` steps.

    Args:
        model: Object exposing `max_sentence_len`, `predict` and
            `vectorization_fr.get_vocabulary()`.
        sentence_en: English sentence to translate.

    Returns:
        The translated sentence, with surrounding whitespace stripped.
    """
    # Both values are loop-invariant, so build them once instead of on every step.
    vocabulary = model.vectorization_fr.get_vocabulary()
    X_encoder = np.array([sentence_en])

    translation = ""
    for word_idx in range(model.max_sentence_len):
        X_decoder = np.array(["startofseq " + translation])
        # Probabilities of the token at the position currently being decoded.
        y_proba = model.predict((X_encoder, X_decoder), verbose=0)[0, word_idx]
        predicted_word = vocabulary[np.argmax(y_proba)]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()


# <p style="padding:15px; background-color:#3f384a; font-family:JetBrains Mono; font-weight:bold; color:#f2f2f0; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 10px 10px">Bidirectional Encoder-Decoder with Attention</p>

<p style="font-size:20px; font-family:JetBrains Mono; border-bottom: 3px solid #e04c5f; margin-left: 5px; margin-right: 5px;"><b>Notes</b> 📜</p>
<ul style="font-size:16px; font-family:JetBrains Mono; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
    <li>All utility functions and preprocessing steps are done, so we can get through to the implementation of an encoder-decoder RNN with an attention mechanism.</li>
    <li>In general, the encoder-decoder RNN with attention mechanism is an extension of the basic encoder-decoder architecture for sequence-to-sequence tasks, such as machine translation. It handles the limitation of the basic architecture, which often struggles with longer input sequences.</li>
    <li>In such an architecture, the encoder is responsible for encoding the input sequence into a fixed-length context vector, while the decoder generates the output sequence based on the encoded information. The attention mechanism enables the decoder to focus on specific parts of the input sequence, allowing for better alignment and handling of long sentences.</li>
    <li>Roughly speaking, the encoder-decoder consists of vectorization layers, embedding layers, usually LSTM or GRU cells (actually, these are the encoder and decoder), an attention layer and final output dense layer.</li>
    <li>The last thing is the "bidirectional" word. The point here is that the encoder is bidirectional, meaning the sequence is processed from left to right and from right to left. So when we have a bidirectional LSTM cell with, for example, $16$ units, we actually have $32$ units. Such a mechanism helps to capture the sentence context.</li>
</ul>

In [36]:
class BidirectionalEncoderDecoderWithAttention(keras.Model):
    """Seq2seq model: bidirectional LSTM encoder, LSTM decoder, attention on top.

    The model takes raw strings `(encoder_inputs, decoder_inputs)` and returns
    per-position softmax probabilities over the target vocabulary.
    """

    def __init__(
        self,
        vocabulary_size=2000,
        max_sentence_len=50,
        embedding_size=128,
        n_units_lstm=512,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.max_sentence_len = max_sentence_len

        # Text -> token ids, one vectorizer per language.
        self.vectorization_en = layers.TextVectorization(
            max_tokens=vocabulary_size, output_sequence_length=max_sentence_len
        )
        self.vectorization_fr = layers.TextVectorization(
            max_tokens=vocabulary_size, output_sequence_length=max_sentence_len
        )

        # Token ids -> dense vectors; id 0 is masked.
        self.encoder_embedding = layers.Embedding(
            input_dim=vocabulary_size, output_dim=embedding_size, mask_zero=True
        )
        self.decoder_embedding = layers.Embedding(
            input_dim=vocabulary_size, output_dim=embedding_size, mask_zero=True
        )

        # Each encoder direction gets half the units so the concatenated
        # states match the decoder's state size.
        encoder_cell = layers.LSTM(
            n_units_lstm // 2, return_sequences=True, return_state=True
        )
        self.encoder = layers.Bidirectional(encoder_cell)
        self.decoder = layers.LSTM(n_units_lstm, return_sequences=True)
        self.attention = layers.Attention()
        self.output_layer = layers.Dense(vocabulary_size, activation="softmax")

    def call(self, inputs):
        sentences_en, sentences_fr = inputs

        token_ids_en = self.vectorization_en(sentences_en)
        token_ids_fr = self.vectorization_fr(sentences_fr)

        embedded_en = self.encoder_embedding(token_ids_en)
        embedded_fr = self.decoder_embedding(token_ids_fr)

        # The encoder returns its output sequence followed by four states:
        # forward (short, long) and backward (short, long).
        encoder_output, forward_h, forward_c, backward_h, backward_c = self.encoder(
            embedded_en
        )
        # Concatenate matching states of both directions to initialise the decoder.
        decoder_initial_state = [
            tf.concat([forward_h, backward_h], axis=-1),  # Short-term state.
            tf.concat([forward_c, backward_c], axis=-1),  # Long-term state.
        ]
        decoder_output = self.decoder(embedded_fr, initial_state=decoder_initial_state)

        # Decoder outputs are the queries; encoder outputs are the values.
        attended = self.attention([decoder_output, encoder_output])
        return self.output_layer(attended)


<p style="font-size:20px; font-family:JetBrains Mono; border-bottom: 3px solid #e04c5f; margin-left: 5px; margin-right: 5px;"><b>Notes</b> 📜</p>
<ul style="font-size:16px; font-family:JetBrains Mono; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
    <li>We're ready to run all this stuff now. As we remember, we usually have sentences no longer than $15$ words. Therefore, it's better to use this value in the model.</li>
</ul>

In [37]:
# Clear the Keras session and fix the TensorFlow seed before building the model.
K.clear_session()
tf.random.set_seed(42)  # Ensure reproducibility on CPU.

# Training pipeline is shuffled with a fixed seed; validation keeps its order.
easy_train_ds = from_sentences_dataset(
    sentences_en_train, sentences_fr_train, shuffle=True, seed=42
)
easy_valid_ds = from_sentences_dataset(sentences_en_valid, sentences_fr_valid)

# Smaller vocabulary and 15-token sequences, trained for a single epoch.
bidirect_encoder_decoder = BidirectionalEncoderDecoderWithAttention(
    vocabulary_size=1000, max_sentence_len=15
)
history = adapt_compile_and_fit(
    bidirect_encoder_decoder, easy_train_ds, easy_valid_ds, n_epochs=1
)


[1m[37mEpoch: [1m[31m01[1m[37m -[0m [1m[37mloss: [1m[31m1.08056[1m[37m -[0m [1m[37maccuracy: [1m[31m0.53209[1m[37m -[0m [1m[37mval_loss: [1m[31m0.81512[1m[37m -[0m [1m[37mval_accuracy: [1m[31m0.61556


In [38]:
# English sentence -> reference French translation.
reference_translations = {
    "Take a seat": "Prends place !",
    "I wish Tom was here.": "J'aimerais que Tom soit là.",
    "How did the audition go?": "Comment s'est passée l'audition ?",
}
# Translate every example once with the trained model.
model_translations = {
    sentence_en: translate(bidirect_encoder_decoder, sentence_en)
    for sentence_en in reference_translations
}

# Print both tables with the same layout so they are easy to compare.
for title, translations in (
    ("Actual Translations:", reference_translations),
    ("Model Translations:", model_translations),
):
    print(title)
    for sentence_en, sentence_fr in translations.items():
        print(sentence_en.ljust(25), "-> ", sentence_fr)
    if translations is reference_translations:
        print()


Actual Translations:
Take a seat               ->  Prends place !
I wish Tom was here.      ->  J'aimerais que Tom soit là.
How did the audition go?  ->  Comment s'est passée l'audition ?

Model Translations:
Take a seat               ->  [UNK]
I wish Tom was here.      ->  jaimerais que tom [UNK] ici
How did the audition go?  ->  comment [UNK] [UNK] [UNK]


<p style="font-size:20px; font-family:JetBrains Mono; border-bottom: 3px solid #e04c5f; margin-left: 5px; margin-right: 5px;"><b>Notes</b> 📜</p>
<ul style="font-size:16px; font-family:JetBrains Mono; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
    <li>The model handles short sentences quite well but struggles with longer ones. Also, sometimes the translation is far from ideal. One possible way to get better translations is the so-called <b>Beam Search</b>, but I won't be implementing it here. If you are interested, you will easily find plenty of material on this concept.</li>
</ul>

# <p style="padding:15px; background-color:#3f384a; font-family:JetBrains Mono; font-weight:bold; color:#f2f2f0; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 10px 10px">Transformer Architecture</p>

<p style="font-size:20px; font-family:JetBrains Mono; border-bottom: 3px solid #e04c5f; margin-left: 5px; margin-right: 5px;"><b>Notes</b> 📜</p>
<ul style="font-size:16px; font-family:JetBrains Mono; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
    <li>The Transformer architecture has revolutionized language translation tasks. It composes an architecture specifically designed for sequence-to-sequence tasks like language translation. It replaces traditional recurrent neural networks (RNNs) and introduces self-attention mechanisms for capturing dependencies and positional information in an input sequence.</li>
    <li>The encoder component processes the input sequence by a stack of identical encoder layers. Each encoder layer comprises a multi-head self-attention mechanism and a position-wise feed-forward neural network.</li>
    <li>The decoder component also consists of a stack of identical layers but with additional masked self-attention and encoder-decoder attention mechanisms. The masked self-attention prevents the decoder from attending to future positions during training, ensuring the model generates outputs based only on the current and previously generated tokens. The encoder-decoder attention allows the decoder to attend to relevant parts of the encoded input sequence.</li>
    <li>There is another completely new component, i.e. positional embedding (PE). It provides positional information for the input embeddings to account for word order. It helps the model differentiate between words based on their relative positions. We implement this using sine and cosine functions of different frequencies and phases.</li>
    <li>You can find the groundbreaking paper on the transformer here: <a href="https://arxiv.org/abs/1706.03762" style="color:#2A357D"><b>Attention Is All You Need</b></a>. I really encourage you to get familiar with it, as it will make the code below much easier to understand.<br><img src="https://github.com/mateuszk098/kaggle_notebooks/blob/master/mt_with_tranformers/transformer_architecture.png?raw=true"></li>
    
</ul>

In [39]:
class PositionalEmbedding(layers.Layer):
    """Adds fixed sine/cosine positional encodings to its inputs.

    Even feature indices hold the sine terms and odd indices the cosine terms.
    The table is precomputed for `max_sentence_len` positions and sliced to the
    input's length (axis 1) at call time.

    Raises:
        ValueError: If `embedding_size` is odd.
    """

    def __init__(
        self, max_sentence_len=50, embedding_size=128, dtype=tf.float32, **kwargs
    ):
        super().__init__(dtype=dtype, **kwargs)
        if embedding_size % 2 != 0:
            raise ValueError("The `embedding_size` must be even.")

        # Broadcast positions (rows) against frequency indices (columns).
        positions = np.arange(max_sentence_len)[:, np.newaxis]
        frequency_ids = np.arange(embedding_size // 2)[np.newaxis, :]
        angles = positions / 10_000 ** (2 * frequency_ids / embedding_size)

        table = np.empty((1, max_sentence_len, embedding_size))
        table[0, :, 0::2] = np.sin(angles)
        table[0, :, 1::2] = np.cos(angles)
        self.positional_embedding = tf.constant(table.astype(self.dtype))
        self.supports_masking = True

    def call(self, inputs):
        n_steps = tf.shape(inputs)[1]
        return inputs + self.positional_embedding[:, :n_steps]


In [40]:
class Encoder(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-layer has a residual connection and its *own* layer normalization,
    so the two sub-layers do not share scale/offset weights.

    Args:
        embedding_size: Size of the input/output features and of each attention head.
        n_attention_heads: Number of attention heads.
        n_units_dense: Hidden units of the feed-forward network.
        dropout_rate: Dropout rate for attention and the feed-forward output.
    """

    def __init__(
        self,
        embedding_size=128,
        n_attention_heads=8,
        n_units_dense=128,
        dropout_rate=0.1,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.attention = layers.MultiHeadAttention(
            n_attention_heads, embedding_size, dropout=dropout_rate
        )
        self.add = layers.Add()  # Stateless, so safe to reuse.
        self.attention_normalization = layers.LayerNormalization()
        self.feed_forward_normalization = layers.LayerNormalization()
        self.dense1 = layers.Dense(
            n_units_dense, "relu", kernel_initializer="he_normal"
        )
        self.dense2 = layers.Dense(embedding_size)
        self.dropout = layers.Dropout(dropout_rate)

    def call(self, inputs, mask=None):
        # Sub-layer 1: self-attention + residual + normalization.
        Z = inputs
        skip_Z = Z
        Z = self.attention(Z, value=Z, attention_mask=mask)
        Z = self.attention_normalization(self.add([Z, skip_Z]))
        # Sub-layer 2: position-wise feed-forward + residual + normalization.
        skip_Z = Z
        Z = self.dense1(Z)
        Z = self.dense2(Z)
        Z = self.dropout(Z)
        return self.feed_forward_normalization(self.add([Z, skip_Z]))


class Decoder(layers.Layer):
    """Transformer decoder block.

    Three sub-layers, each with a residual connection and its *own* layer
    normalization: masked self-attention, attention over the encoder output,
    and a position-wise feed-forward network.

    Args:
        embedding_size: Size of the input/output features and of each attention head.
        n_attention_heads: Number of attention heads.
        n_units_dense: Hidden units of the feed-forward network.
        dropout_rate: Dropout rate for attention and the feed-forward output.
    """

    def __init__(
        self,
        embedding_size=128,
        n_attention_heads=8,
        n_units_dense=128,
        dropout_rate=0.1,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.masked_attention = layers.MultiHeadAttention(
            n_attention_heads, embedding_size, dropout=dropout_rate
        )
        self.attention = layers.MultiHeadAttention(
            n_attention_heads, embedding_size, dropout=dropout_rate
        )
        self.add = layers.Add()  # Stateless, so safe to reuse.
        self.masked_attention_normalization = layers.LayerNormalization()
        self.attention_normalization = layers.LayerNormalization()
        self.feed_forward_normalization = layers.LayerNormalization()
        self.dense1 = layers.Dense(
            n_units_dense, "relu", kernel_initializer="he_normal"
        )
        self.dense2 = layers.Dense(embedding_size)
        self.dropout = layers.Dropout(dropout_rate)

    def call(self, inputs, mask=None):
        # `mask` is expected to be a pair: (decoder mask, encoder mask).
        decoder_mask, encoder_mask = mask  # type: ignore
        Z, encoder_output = inputs
        # Sub-layer 1: masked self-attention over the decoder sequence.
        Z_skip = Z
        Z = self.masked_attention(Z, value=Z, attention_mask=decoder_mask)
        Z = self.masked_attention_normalization(self.add([Z, Z_skip]))
        # Sub-layer 2: attention over the encoder output.
        Z_skip = Z
        Z = self.attention(Z, value=encoder_output, attention_mask=encoder_mask)
        Z = self.attention_normalization(self.add([Z, Z_skip]))
        # Sub-layer 3: position-wise feed-forward network.
        Z_skip = Z
        Z = self.dense1(Z)
        Z = self.dense2(Z)
        Z = self.dropout(Z)
        return self.feed_forward_normalization(self.add([Z, Z_skip]))


In [41]:
class Transformer(keras.Model):
    """Encoder-decoder Transformer operating directly on raw sentence strings.

    Inputs are `(encoder_inputs, decoder_inputs)`; the output holds per-position
    softmax probabilities over the target vocabulary.
    """

    def __init__(
        self,
        vocabulary_size=2000,
        max_sentence_len=50,
        embedding_size=128,
        n_encoder_decoder_blocks=2,
        n_attention_heads=8,
        n_units_dense=128,
        dropout_rate=0.1,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.max_sentence_len = max_sentence_len

        # Text -> token ids, one vectorizer per language.
        self.vectorization_en = layers.TextVectorization(
            max_tokens=vocabulary_size, output_sequence_length=max_sentence_len
        )
        self.vectorization_fr = layers.TextVectorization(
            max_tokens=vocabulary_size, output_sequence_length=max_sentence_len
        )
        # Token ids -> dense vectors; id 0 is masked.
        self.encoder_embedding = layers.Embedding(
            input_dim=vocabulary_size, output_dim=embedding_size, mask_zero=True
        )
        self.decoder_embedding = layers.Embedding(
            input_dim=vocabulary_size, output_dim=embedding_size, mask_zero=True
        )
        # One positional table is shared by the encoder and decoder inputs.
        self.positional_embedding = PositionalEmbedding(max_sentence_len, embedding_size)

        block_args = (embedding_size, n_attention_heads, n_units_dense, dropout_rate)
        self.encoder_blocks = [
            Encoder(*block_args) for _ in range(n_encoder_decoder_blocks)
        ]
        self.decoder_blocks = [
            Decoder(*block_args) for _ in range(n_encoder_decoder_blocks)
        ]
        self.output_layer = layers.Dense(vocabulary_size, activation="softmax")

    def call(self, inputs):
        sentences_en, sentences_fr = inputs

        token_ids_en = self.vectorization_en(sentences_en)
        token_ids_fr = self.vectorization_fr(sentences_fr)

        embedded_en = self.encoder_embedding(token_ids_en)
        embedded_fr = self.decoder_embedding(token_ids_fr)

        encoder_state = self.positional_embedding(embedded_en)
        decoder_state = self.positional_embedding(embedded_fr)

        # Padding masks: True where the token id is non-zero, with an extra
        # axis inserted after the batch axis.
        encoder_pad_mask = tf.math.not_equal(token_ids_en, 0)[:, tf.newaxis]
        decoder_pad_mask = tf.math.not_equal(token_ids_fr, 0)[:, tf.newaxis]

        # Causal mask: lower-triangular matrix of True values.
        n_decoder_steps = tf.shape(embedded_fr)[1]
        all_true = tf.ones((n_decoder_steps, n_decoder_steps), tf.bool)
        causal_mask = tf.linalg.band_part(all_true, -1, 0)
        decoder_mask = tf.math.logical_and(causal_mask, decoder_pad_mask)

        for block in self.encoder_blocks:
            encoder_state = block(encoder_state, mask=encoder_pad_mask)

        for block in self.decoder_blocks:
            decoder_state = block(
                [decoder_state, encoder_state], mask=[decoder_mask, encoder_pad_mask]
            )

        return self.output_layer(decoder_state)


In [46]:
# Clear the Keras session and fix the TensorFlow seed before building the model.
K.clear_session()
tf.random.set_seed(42)  # Ensure reproducibility on CPU.

# Single encoder/decoder block, 1000-token vocabulary, 15-token sequences.
transformer = Transformer(
    vocabulary_size=1000,
    max_sentence_len=15,
    dropout_rate=0.2,
    n_encoder_decoder_blocks=1,
)
# Train for one epoch on the easy-dataset pipelines with an initial learning
# rate of 0.001 (the function default is 0.01).
history = adapt_compile_and_fit(
    transformer,
    easy_train_ds,
    easy_valid_ds,
    n_epochs=1,
    verbose_level=1,
    init_lr=0.001,
)




# <p style="padding:15px; background-color:#3f384a; font-family:JetBrains Mono; font-weight:bold; color:#f2f2f0; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 10px 10px">Tackling Hard Dataset</p>

<p style="font-size:20px; font-family:JetBrains Mono; border-bottom: 3px solid #e04c5f; margin-left: 5px; margin-right: 5px;"><b>Notes</b> 📜</p>
<ul style="font-size:16px; font-family:JetBrains Mono; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
    <li>So far, we haven't had to deal with a dataset too large to load at once. That changes now.</li>
    <li>The second English-French dataset comprises over 22.5 million sentences and weighs around 8 GB. There may be a problem with loading this as one file. One of the solutions is to split this one large file into several small files and use <code>TensorFlow</code> data API to handle loading and prefetching data from several files. This is what we will do in this section. In that way, you will be able to train a model on the whole dataset (if you have time and resources, obviously).</li>
    <li>Firstly, let's download the data as before.</li>
</ul>

In [173]:
%time

hard_dataset_user = "dhruvildave"
hard_dataset = "en-fr-translation-dataset"
data_dir = Path("data")

if not ON_KAGGLE:
    download_dataset_from_kaggle(hard_dataset_user, hard_dataset, data_dir)
    hard_dataset_path = data_dir / "en-fr.csv"
else:
    hard_dataset_path = Path("/kaggle/input/en-fr-translation-dataset/en-fr.csv")


CPU times: total: 0 ns
Wall time: 0 ns


<p style="font-size:20px; font-family:JetBrains Mono; border-bottom: 3px solid #e04c5f; margin-left: 5px; margin-right: 5px;"><b>Notes</b> 📜</p>
<ul style="font-size:16px; font-family:JetBrains Mono; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
    <li>We will use <code>pandas</code> to split the dataset into multiple small files.</li>
</ul>

In [174]:
%time

chunk_size = 100_000
chunks_dir = Path("data_chunks")

if not os.path.exists(chunks_dir):
    chunks_dir.mkdir(parents=True)
    chunks = pd.read_csv(hard_dataset_path, chunksize=chunk_size, encoding="utf-8")
    for i, chunk in enumerate(chunks):
        chunk_path = chunks_dir / f"en-fr-chunk-{i:03}.csv"
        chunk.to_csv(chunk_path, index=False, encoding="utf-8")


CPU times: total: 0 ns
Wall time: 0 ns


In [175]:
# Sort the chunk files explicitly: directory listings come back in arbitrary
# order, while the train/validation slices taken later depend on a stable one.
filepaths = [path.as_posix() for path in sorted(chunks_dir.glob("en-fr-chunk-*.csv"))]
filepaths[:10]


['data_chunks/en-fr-chunk-000.csv',
 'data_chunks/en-fr-chunk-001.csv',
 'data_chunks/en-fr-chunk-002.csv',
 'data_chunks/en-fr-chunk-003.csv',
 'data_chunks/en-fr-chunk-004.csv',
 'data_chunks/en-fr-chunk-005.csv',
 'data_chunks/en-fr-chunk-006.csv',
 'data_chunks/en-fr-chunk-007.csv',
 'data_chunks/en-fr-chunk-008.csv',
 'data_chunks/en-fr-chunk-009.csv']

In [176]:
# Print the first five lines of the first chunk, reading lazily instead of
# loading the whole file with `readlines()`.
with open(filepaths[0], encoding="utf8") as f:
    for _, line in zip(range(5), f):
        print(line, end="")


en,fr
Changing Lives | Changing Society | How It Works | Technology Drives Change Home | Concepts | Teachers | Search | Overview | Credits | HHCC Web | Reference | Feedback Virtual Museum of Canada Home Page,"Il a transformé notre vie | Il a transformé la société | Son fonctionnement | La technologie, moteur du changement Accueil | Concepts | Enseignants | Recherche | Aperçu | Collaborateurs | Web HHCC | Ressources | Commentaires Musée virtuel du Canada"
Site map,Plan du site
Feedback,Rétroaction
Credits,Crédits


In [177]:
# Print the first five lines of the last chunk, reading lazily instead of
# loading the whole file with `readlines()`.
with open(filepaths[-1], encoding="utf8") as f:
    for _, line in zip(range(5), f):
        print(line, end="")


en,fr
"Under the modified rights model, aboriginal rights are not extinguished, but are modified into the rights articulated and defined in the treaty.","Le modèle des droits modifiés n’abroge pas les droits ancestraux, mais il les adapte pour les intégrer aux droits énoncés et définis dans le traité."
"Under the non-assertion model, Aboriginal rights are not extinguished, and the Aboriginal group agrees to exercise only those rights articulated and defined in the treaty and to assert no other Aboriginal rights.","Le modèle de nonaffirmation n’abolit pas les droits ancestraux, mais vient confirmer que le groupe autochtone accepte d’exercer uniquement les droits précisés et définis dans le traité et qu’il ne revendiquera aucun autre droit ancestral."
Cash Component A Final Agreement specifies the total amount of the cash settlement to be provided by the federal government to the respective Aboriginal group (through the government that represents it) as part of the land claim settlement.

<p style="font-size:20px; font-family:JetBrains Mono; border-bottom: 3px solid #e04c5f; margin-left: 5px; margin-right: 5px;"><b>Notes</b> 📜</p>
<ul style="font-size:16px; font-family:JetBrains Mono; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
    <li>Now, we will write utility functions that load and do the preliminary preparation of the appropriate <code>TensorFlow</code> dataset. We proceed as before but with two significant changes: we add <code>csv</code> line parsing and file interleaving. Thanks to interleaving, the pipeline keeps loading subsequent sentences from the given files until they are exhausted.</li>
</ul>

In [178]:
def parse_csv_line(line):
    """Decode one two-column `csv` line into a `(sentence_en, sentence_fr)` pair.

    Missing fields decode to the empty string.
    """
    defaults = 2 * [tf.constant("", dtype=tf.string)]
    # `decode_csv` returns one tensor per column; the two fields are returned
    # as they are, since stacking a single field adds nothing.
    sentence_en, sentence_fr = tf.io.decode_csv(line, record_defaults=defaults)
    return sentence_en, sentence_fr


# NOTE(review): an identical `prepare_input_and_target` is already defined
# earlier in this notebook; this later definition shadows it.
def prepare_input_and_target(sentences_en, sentences_fr):
    """Build `((encoder_input, decoder_input), target)` from a sentence pair.

    The decoder input is the French text prefixed with `startofseq `, and the
    target is the French text suffixed with ` endofseq`.
    """
    decoder_input = b"startofseq " + sentences_fr
    target = sentences_fr + b" endofseq"
    return (sentences_en, decoder_input), target


def from_csv_files_dataset(
    filepaths,
    batch_size=32,
    cache=True,
    shuffle=False,
    shuffle_buffer_size=10_000,
    seed=None,
):
    """Build a batched `tf.data` pipeline that streams sentence pairs from `csv` files.

    Lines from the given files are interleaved (the header line of each file is
    skipped), parsed with `parse_csv_line`, converted with
    `prepare_input_and_target`, optionally cached, optionally shuffled (after
    caching), and finally batched.

    Args:
        filepaths: Paths (or patterns) handed to `tf.data.Dataset.list_files`.
        batch_size: Number of pairs per batch.
        cache: Whether to cache the mapped elements.
        shuffle: Whether to shuffle the elements before batching.
        shuffle_buffer_size: Buffer size handed to `Dataset.shuffle`.
        seed: Seed handed to both `list_files` and `Dataset.shuffle`.

    Returns:
        A dataset yielding `((encoder_input, decoder_input), target)` batches.
    """
    autotune = tf.data.AUTOTUNE

    def read_lines(filepath):
        return tf.data.TextLineDataset(filepath).skip(1)  # Skip header.

    files = tf.data.Dataset.list_files(filepaths, seed=seed)
    lines = files.interleave(
        read_lines, cycle_length=autotune, num_parallel_calls=autotune
    )
    pipeline = lines.map(parse_csv_line, num_parallel_calls=autotune)
    pipeline = pipeline.map(prepare_input_and_target, num_parallel_calls=autotune)
    pipeline = pipeline.cache() if cache else pipeline
    pipeline = pipeline.shuffle(shuffle_buffer_size, seed=seed) if shuffle else pipeline
    return pipeline.batch(batch_size)


In [181]:
# Clear the Keras session and fix the TensorFlow seed before building the model.
K.clear_session()
tf.random.set_seed(42)  # Ensure reproducibility on CPU.

# Only a small subset is used here: two chunk files for training, one for validation.
hard_train_ds = from_csv_files_dataset(filepaths[0:2], shuffle=True, seed=42)
hard_valid_ds = from_csv_files_dataset(filepaths[2:3])

# Default model size, trained for a single epoch.
bidirect_encoder_decoder = BidirectionalEncoderDecoderWithAttention()
history = adapt_compile_and_fit(
    bidirect_encoder_decoder,
    hard_train_ds,
    hard_valid_ds,
    n_epochs=1,
    verbose_level=1,
)


   6249/Unknown - 259s 40ms/step - loss: 1.4338 - accuracy: 0.3954[1m[37mEpoch: [1m[31m01[1m[37m -[0m [1m[37mloss: [1m[31m1.43382[1m[37m -[0m [1m[37maccuracy: [1m[31m0.39545[1m[37m -[0m [1m[37mval_loss: [1m[31m1.70756[1m[37m -[0m [1m[37mval_accuracy: [1m[31m0.33422
