# **NLP Final Project - Machine Translation**

##### Downloading Libraries

In [None]:
!pip install tensorflow-text

Collecting tensorflow-text
  Downloading tensorflow_text-2.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting tensorflow<2.19,>=2.18.0 (from tensorflow-text)
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow<2.19,>=2.18.0->tensorflow-text)
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tensorflow_text-2.18.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m95.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (615.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.3/615.3 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorboard-2.18.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Preprocessing

In [None]:
import pandas as pd
import numpy as np
import nltk
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as tf_text
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Input, Dropout, LayerNormalization
import matplotlib.pyplot as plt
import seaborn as sns

data = pd.read_csv('/content/drive/MyDrive/colab/data.csv')
print(data.head(5))

  english  spanish
0     Go.      Ve.
1     Go.    Vete.
2     Go.    Vaya.
3     Go.  Váyase.
4     Hi.    Hola.


In [None]:
# Shuffle dataset

data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,english,spanish
0,Everyone has strengths and weaknesses.,Todo el mundo tiene fortalezas y debilidades.
1,I'm glad that he passed the exam.,Me alegro de que haya pasado el examen.
2,I wish I had a house of my own.,Desearía tener una casa propia.
3,We are committed to our country's welfare.,Estamos comprometidos con el bienestar del país.
4,They say that he will never return.,Se dice que él ya nunca volverá.


In [None]:
en_text = data["english"]
spn_text = data["spanish"]
import string
en_text=en_text.to_numpy().tolist()
en_text = [str(item) for item in en_text]
spn_text=spn_text.to_numpy().tolist()
spn_text = [str(item) for item in spn_text]
print(type(en_text))  # Should be <class 'list'>
print(type(en_text[0]))  # Should be <class 'str'>, not a tensor

<class 'list'>
<class 'str'>


In [None]:
import unicodedata
import re
def pre_proc(text):
    if isinstance(text, bytes):
        text = text.decode('utf-8')  # Decode bytes to string using 'utf-8' encoding

    text = unicodedata.normalize('NFKD', text)  # removes accents in Spanish
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^ a-z.?!,¿]', '', text)  # removes any characters that are not letters/numbers/ .?!
    text = re.sub(r'[.?!,¿]', r' \0 ', text)  # adds space before and after punctuations
    text = re.sub(r'\bstart\b', '', text)  # Remove "start"
    text = re.sub(r'\bend\b', '', text)  # Remove "end"
    text = f'[START] {text} [END]'  # Add [START] and [END] tags

    return text

In [None]:
en_train=en_text[:10000]
spn_train=spn_text[:10000]
en_test=en_text[10000:12000]
spn_test=spn_text[10000:12000]

In [None]:
en_train = [pre_proc(text) for text in en_train]
spn_train = [pre_proc(text) for text in spn_train]
en_test = [pre_proc(text) for text in en_test]
spn_test = [pre_proc(text) for text in spn_test]
print(en_test[:7])
print(spn_test[:7])

['[START]   hes very intelligent     [END]', '[START]   the bus is capable of carrying thirty people     [END]', '[START]   when did you lose your keys     [END]', '[START]   im not interested in anything tom has to say     [END]', '[START]   she tried to hide her feelings     [END]', '[START]   tom knows who mary is     [END]', '[START]   the same thing happened monday     [END]']
['[START]   el es muy inteligente     [END]', '[START]   este autobus tiene capacidad para treinta personas     [END]', '[START]     cuando perdiste las llaves     [END]', '[START]   no me interesa nada de lo que tom tenga para decir     [END]', '[START]   intento ocultar sus sentimientos     [END]', '[START]   tom sabe quien es mary     [END]', '[START]   lo mismo paso el lunes     [END]']


In [None]:
num_words = 10000
tokenizer_en= Tokenizer(num_words=num_words, filters='#$%&()*+,-/:;<=>@«»""[\\]^_`{|}~\t\n')
tokenizer_spn= Tokenizer(num_words=num_words, filters='#$%&()*+,-/:;<=>@«»""[\\]^_`{|}~\t\n')
tokenizer_en.fit_on_texts(en_train)
tokenizer_spn.fit_on_texts(spn_train)
en_train = tokenizer_en.texts_to_sequences(en_train)
spn_train = tokenizer_spn.texts_to_sequences(spn_train)

word_idx_en = tokenizer_en.word_index
word_idx_spn = tokenizer_spn.word_index
print(f"The number of words in the English vocabulary: {len(word_idx_en)}")
print(f"The number of words in the Spanish vocabulary: {len(word_idx_spn)}")


The number of words in the English vocabulary: 4991
The number of words in the Spanish vocabulary: 7479


In [None]:
en_train = pad_sequences(en_train, maxlen = 30, padding='post', truncating='post')
spn_train = pad_sequences(spn_train, maxlen=30, padding='post', truncating='post')

In [None]:
en_train[:5]

array([[   2,  335,   46, 2675,   40, 2676,    1,    3,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0],
       [   2,   35,  492,   14,   11,  700,    4,  835,    1,    3,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0],
       [   2,    5,  336,    5,   50,    9,  119,   13,   17,  301,    1,
           3,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0],
       [   2,   31,   23, 1918,    6,  114, 2677, 2678,    1,    3,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0],
       [   2,   45,  133,   14,   11,   49,   97,  701,    1,    3,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0, 

In [None]:
# Apply tokenization and padding to the test data
en_test = tokenizer_en.texts_to_sequences(en_test)
spn_test = tokenizer_spn.texts_to_sequences(spn_test)
en_test = pad_sequences(en_test, maxlen = 30, padding='post', truncating='post')
spn_test = pad_sequences(spn_test, maxlen=30, padding='post', truncating='post')

# **LSTM-based Seq2Seq Model** (Bonus)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, RepeatVector
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
from keras import optimizers

In [None]:
vocab_en = len(tokenizer_en.word_index) + 1
vocab_spn = len(tokenizer_spn.word_index) + 1

In [None]:
en_train= en_train[:10000]
spn_train= spn_train[:10000]
en_val= en_train[10000:12000]
spn_val= spn_train[10000:12000]
print(en_train.shape)
print(spn_train.shape)

(10000, 100)
(10000, 100)


In [None]:
model = Sequential()
model.add(Embedding(input_dim=vocab_en, output_dim=128,input_shape=(en_len,), mask_zero=True))
model.add(LSTM(units=512)) # Encoder layer
model.add(RepeatVector(n=spn_len))
model.add(LSTM(units=512, return_sequences=True)) # Decoder layer
model.add(Dense(vocab_spn, activation='softmax')) # Generates the probabilities for each word in the target vocabulary (Spanish).

  super().__init__(**kwargs)


In [None]:
model.summary()

In [None]:
print(f"Vocabulary size for Spanish: {vocab_spn}")
print(f"Vocabulary size for English: {vocab_en}")


Vocabulary size for Spanish: 8243
Vocabulary size for English: 5443


## Training & Evaluation

In [None]:
from tensorflow.keras.optimizers import RMSprop
# Define optimizer
rms = RMSprop(learning_rate=0.001)
# Compile the model
model.compile(optimizer=rms, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train the model
# Limit the target values to be within the vocabulary size
#spn_train = np.clip(spn_train, 0, vocab_spn - 1)

history = model.fit(en_train, spn_train,validation_data=(en_val, spn_val), epochs=10, batch_size=32)

Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 71ms/step - accuracy: 0.8933 - loss: 1.3206 - val_accuracy: 0.9224 - val_loss: 0.4827
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 73ms/step - accuracy: 0.9230 - loss: 0.4822 - val_accuracy: 0.9224 - val_loss: 0.4744
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 76ms/step - accuracy: 0.9249 - loss: 0.4631 - val_accuracy: 0.9242 - val_loss: 0.4654
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 80ms/step - accuracy: 0.9260 - loss: 0.4543 - val_accuracy: 0.9230 - val_loss: 0.4706
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 86ms/step - accuracy: 0.9257 - loss: 0.4536 - val_accuracy: 0.9269 - val_loss: 0.4564
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 88ms/step - accuracy: 0.9264 - loss: 0.4498 - val_accuracy: 0.9256 - val_loss: 0.4599
Epoch 7/10
[1m3

In [None]:
# Evaluate the Model
test_accuracy = model.evaluate(en_test, spn_test)[1]
print(f"Model's Accuracy: {test_accuracy*100}%")

Model's Accuracy: 78.67%


# **Transformers Model**



## Position Encodings

In [None]:
# Function to compute the angles for positional encoding.
def get_angles(pos, i, emb_dim):
    """
    pos: The position of the token in the sequence (e.g., 0, 1, 2, ..., seq_len-1).
    i: The index of the embedding dimension (e.g., 0, 1, ..., embedding_dim-1).
    embedding_dim: The total dimensionality of the embedding space (e.g., 512, 256, etc.).
    """
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(emb_dim))
    return pos * angle_rates

In [None]:
def positional_encoding(position, embedding_dim):
    """
    Adds  positional encoding to the Embeddings to be fed to the Transformer model.

    Computes a sin and cos of the angles determined by the get_angles() function
    and adds the value computed to an axis of the embeddings.
    """
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                           np.arange(embedding_dim)[np.newaxis, :], embedding_dim)

    # apply sin to even indices in the array. ie 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array. ie 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)

In [None]:
# Generate positional encodings
pos_encodings = positional_encoding(100, 128)

## Masking

In [None]:
def create_padding_mask(seq):
    """
    Creates a padding mask for a given sequence.

    Args:
        seq (tensor): A tensor of shape (batch_size, seq_len) containing the sequence.

    Returns:
        A tensor of shape (batch_size, 1, 1, seq_len) containing a mask that is 1 where the sequence is padded, and 0 otherwise.
    """
    # Convert the sequence to a boolean tensor where True indicates a pad token (value 0).
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    # Add an extra dimension to the mask to add the padding to the attention logits (ensure mask can be applied in attention during self-attention).
    return seq[:, tf.newaxis, tf.newaxis, :]

###### This mask ensures that, during training, the model cannot "see" future tokens in the sequence, and it can only attend to the current or previous tokens. This is necessary because the model should predict each token in a sequence autoregressively, one token at a time.

In [None]:
def create_look_ahead_mask(size):
    """
    Creates a look-ahead mask used during training the decoder of a transformer.

    Args:
        size (int): The size of the mask.

    Returns:
        tf.Tensor: A lower triangular matrix of shape (size, size) with ones on the diagonal
            and zeros below the diagonal. (indicating "allowed" positions)
    """
    # create a matrix with ones on the diagonal and zeros below the diagonal
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)

    return mask

In [None]:
# Responsible for generating all the necessary masks used in the transformer model
def create_masks(inputs, targets):
    """
    Creates masks for the input sequence and target sequence.

    Args:
        inputs: Input sequence tensor.
        targets: Target sequence tensor.

    Returns:
        A tuple of three masks: the encoder padding mask, the combined mask used in the first attention block,
        and the decoder padding mask used in the second attention block.
    """

    # Create the encoder padding mask.
    enc_padding_mask = create_padding_mask(inputs)

    # Create the decoder padding mask.
    dec_padding_mask = create_padding_mask(inputs)

    # Create the look ahead mask for the first attention block.
    # It is used to pad and mask future tokens in the tokens received by the decoder.
    look_ahead_mask = create_look_ahead_mask(tf.shape(targets)[1])

    # Create the decoder target padding mask.
    dec_target_padding_mask = create_padding_mask(targets)

    # Combine the look ahead mask and decoder target padding mask for the first attention block.
    # Ensures that the decoder’s self-attention mechanism does not attend to padding tokens or future tokens.
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

## Model Architecture

### Self-Attention

In [None]:
def scaled_dot_product_attention(q, k, v, mask):
    """
    Computes the scaled dot product attention weight for the query (q), key (k), and value (v) vectors.
    The attention weight is a measure of how much focus should be given to each element in the sequence of values (v)
    based on the corresponding element in the sequence of queries (q) and keys (k).

    Args:
    q: query vectors; shape (..., seq_len_q, depth)
    k: key vectors; shape  (..., seq_len_k, depth)
    v: value vectors; shape  (..., seq_len_v, depth_v)
    mask: (optional) mask to be applied to the attention weights

    Returns:
    output: The output of the scaled dot product attention computation; shape   (..., seq_len_q, depth_v)
    attention_weights: The attention weights
    """
    # Compute dot product of query and key vectors
    matmul_qk = tf.matmul(q, k, transpose_b=True)

    # Compute the square root of the depth of the key vectors
    dk = tf.cast(tf.shape(k)[-1], dtype=tf.float32)
    scaled_dk = tf.math.sqrt(dk)

    # Compute scaled attention logits by dividing dot product by scaled dk
    # To prevent excessively large values in the attention logits
    scaled_attention_logits = matmul_qk / scaled_dk

    # Apply mask to the attention logits (if mask is available)
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  #By adding -1e9 where the mask is 1, attention weight for these positions becomes effectively 0 after applying softmax

    # Apply softmax to the scaled attention logits to get the attention weights
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

    # Compute the weighted sum of the value vectors using the attention weights
    output = tf.matmul(attention_weights, v)

    return output, attention_weights

### Multi-head Attention

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """
    MultiHeadAttention Layer that implements the attention mechanism for the Transformer.
    It splits the input into multiple heads, computes scaled dot-product attention for each head
    and then concatenates the output of the heads and passes it through a dense layer.
    """

    def __init__(self, key_dim, num_heads, dropout_rate=0.0):
        """
        Initializes the MultiHeadAttention layer.

        Args:
            key_dim (int): The dimensionality of the key space.
            num_heads (int): The number of attention heads.
            dropout (float): The dropout rate to apply after the dense layer.
        """
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.key_dim = key_dim
        # ensure  that the dimension of the embedding can be evenly split across attention heads
        assert key_dim % num_heads == 0
        self.depth = self.key_dim // self.num_heads  # dimension of each individual head

        # dense layers to project the input into queries, keys and values
        self.wq = Dense(key_dim)
        self.wk = Dense(key_dim)
        self.wv = Dense(key_dim)

        # dropout layer
        self.dropout = Dropout(dropout_rate)

        # dense layer to project the output of the attention heads
        self.dense = Dense(key_dim)

    def split_heads(self, x, batch_size):
        """
        Splits the last dimension of the tensor into (num_heads, depth).
        Transposes the result such that the shape is (batch_size, num_heads, seq_len, depth).

        Args:
            x (tensor): The tensor to be split.
            batch_size (int): The size of the batch.

        Returns:
            tensor: The tensor with the last dimension split into (num_heads, depth) and transposed.
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        """
        Applies the multi-head attention mechanism to the inputs.

        Args:
            v (tensor): The value tensor of shape (batch_size, seq_len_v, key_dim).
            k (tensor): The key tensor of shape (batch_size, seq_len_k, key_dim).
            q (tensor): The query tensor of shape (batch_size, seq_len_q, key_dim).
            mask (tensor, optional): The mask tensor of shape (batch_size, seq_len_q, seq_len_k).
                                     Defaults to None.

        Returns:
            tensor: The output tensor of shape (batch_size, seq_len_q, key_dim).
            tensor: The attention weights tensor of shape (batch_size, num_heads, seq_len_q, seq_len_k).
        """
        batch_size = tf.shape(q)[0]

        # input tensors are passed through dense layers to project them into correct key dim
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        # split the heads
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        # split the queries, keys and values into multiple heads (compute the attention output and the attention weights)
        scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        # reshape and add Dense layer
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.key_dim))
        output = self.dense(concat_attention)
        output = self.dropout(output)

        return output, attention_weights

### FC Layer

In [None]:
def FeedForward(embedding_dim, fully_connected_dim):
    """Create a fully connected feedforward neural network.

    Args:
        embedding_dim (int): Dimensionality of the embedding output from the transformer layer.
        fully_connected_dim (int): Number of neurons in the fully connected layers.

    Returns:
        tf.keras.Sequential: A fully connected feedforward neural network with the specified architecture.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(fully_connected_dim, activation='relu'),
        tf.keras.layers.Dense(embedding_dim)
    ])
    return model

### Encoder

######  Encoder attends to all positions of the input sequence to compute a weighted sum of the values at each position which  allows to capture dependencies between all positions. Each layer in Encoder has residual connections and layer normalization, which help to mitigate the vanishing gradient problem and improve training stability.

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1):
        """Initializes the encoder layer

        Args:
            embedding_dim: The dimensionality of the input and output of this layer
            num_heads: The number of attention heads to use in the multi-head attention layer
            fully_connected_dim: The dimensionality of the hidden layer in the feedforward network
            dropout_rate: The rate of dropout to apply to the output of this layer during training

        Returns:
            A new instance of the EncoderLayer class
        """
        super(EncoderLayer, self).__init__()

        # Multi-head self-attention mechanism
        self.multi = MultiHeadAttention(embedding_dim, num_heads, dropout_rate)

        # Layer normalization
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)

        # Dropout
        self.dropout = Dropout(dropout_rate)

        # Feedforward network
        self.fc = FeedForward(embedding_dim, fully_connected_dim)

    def call(self, x, mask, training):
        """Applies the encoder layer to the input tensor

        Args:
            x: The input tensor to the encoder layer
            training: A boolean indicating whether the model is in training mode
            mask: A tensor representing the mask to apply to the attention mechanism

        Returns:
            The output of the encoder layer after applying the multi-head attention and feedforward network
        """

        # Apply multi-head self-attention mechanism to input tensor
        attn_out, _ = self.multi(x, x, x, mask)

        # Apply first layer normalization and add residual connection
        out1 = self.layernorm1(attn_out + x)

        # Apply feedforward network to output of first layer normalization
        fc_out = self.fc(out1)
        fc_out = self.dropout(fc_out, training=training)

        # Apply second layer normalization and add residual connection
        out2 = self.layernorm2(fc_out + out1)

        return out2

In [None]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, emb_dim, num_heads, fc_dim,vocab_size, max_position, dropout_rate=0.1):
        """
        Initializes the Encoder layer of the Transformer model.

        Args:
            num_layers (int): Number of EncoderLayers to stack.
            emb_dim (int): Dimensionality of the token embedding space.
            num_heads (int): Number of attention heads to use in MultiHeadAttention layers.
            fc_dim (int): Dimensionality of the fully connected layer in the EncoderLayer.
            vocab_size (int): Size of the input vocabulary.
            max_position (int): Maximum length of input sequences for positional encoding.
            dropout_rate (float): Probability of dropping out units during training.

        """
        super(Encoder, self).__init__()

        self.num_layers = num_layers
        self.emb_dim = emb_dim

        # Embedding layer
        self.embedding = Embedding(vocab_size, emb_dim)

        # Positional encoding
        self.pos_encoding = positional_encoding(max_position, emb_dim)

        # Encoder layers
        self.enc_layers = [EncoderLayer(emb_dim, num_heads, fc_dim, dropout_rate) for _ in range(num_layers)]

        # Dropout layer
        self.dropout = Dropout(dropout_rate)

    def call(self, inputs, mask, training):
        """
        Call function for the Encoder layer.

        Args:
            inputs: tensor of shape (batch_size, sequence_length) representing input sequences
            training: boolean indicating if the model is in training mode
            mask: tensor of shape (batch_size, sequence_length) representing the mask to apply to the input sequence

        Returns:
            A tensor of shape (batch_size, sequence_length, embedding_dim) representing the encoded sequence
        """

        # Get the sequence length
        seq_len = tf.shape(inputs)[1]

        # Embed the input sequence
        inputs = self.embedding(inputs)

        # Scale the embeddings by sqrt(embedding_dim)
        inputs *= tf.math.sqrt(tf.cast(self.emb_dim, tf.float32))

        # Add positional encodings to the input sequence
        inputs += self.pos_encoding[:, :seq_len, :]

        # Apply dropout to the input sequence
        inputs = self.dropout(inputs, training=training)

        # Pass the input sequence through the encoder layers
        for i in range(self.num_layers):
            inputs = self.enc_layers[i](inputs, mask, training=training)

        # Return the encoded sequence
        return inputs

### Decoder

###### The decoder takes in the encoded input sequence along with previous generated output sequence. The output sequence is first passed through an embedding layer, which maps each token to a high-dimensional vector space. The embedding output is then added with a positional encoding, which allows the model to encode the sequential order of the input/output sequence.

###### Decoder applies a multi-head self-attention mechanism similar to encoder. However, decoder also uses an additional masked self-attention mechanism, which prevents it from attending to future tokens in output sequence during training.

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, emb_dim, num_heads, fc_dim, dropout_rate=0.1):
        """
        Initializes a single decoder layer of the transformer model.

        Args:
        emb_dim: The dimension of the embedding space.
        num_heads: The number of attention heads to use.
        fc_dim: The dimension of the feedforward network.
        rate: The dropout rate for regularization.
        """
        super(DecoderLayer, self).__init__()

        # Instantiate two instances of MultiHeadAttention.
        self.multi1 = MultiHeadAttention(emb_dim, num_heads, dropout_rate)
        self.multi2 = MultiHeadAttention(emb_dim, num_heads, dropout_rate)

        # Instantiate a fully connected feedforward network.
        self.fc = FeedForward(emb_dim, fc_dim)

        # Instantiate three layer normalization layers with epsilon=1e-6.
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)

        # Instantiate a dropout layer for regularization.
        self.dropout3 = Dropout(dropout_rate)

    def call(self, x, enc_output, look_ahead_mask, pad_mask, training):
        """
        Forward pass through the decoder layer.

        Args:
        x: The input to the decoder layer, a query vector.
        enc_output: The output from the top layer of the encoder, a set of attention vectors k and v.
        training: Whether to apply dropout regularization.
        look_ahead_mask: The mask to apply to the input sequence so that it can't look ahead to future positions.
        pad_mask: The mask to apply to the input sequence to ignore padding tokens.

        Returns:
        The output from the decoder layer, a tensor with the same shape as the input.
        The attention weights from the first multi-head attention layer.
        The attention weights from the second multi-head attention layer.
        """

        # Apply the first multi-head attention layer to the query vector x.
        # We pass x as all three inputs to the layer because this is a self-attention layer.
        attn1, attn_weights_block1 = self.multi1(x, x, x, look_ahead_mask)

        # Add the original input to the output of the attention layer and apply layer normalization.
        out1 = self.layernorm1(attn1 + x)

        # Apply the second multi-head attention layer to the output from the first layer and the encoder output.
        attn2, attn_weights_block2 = self.multi2(enc_output, enc_output, out1, pad_mask)

        # Add the output from the first layer to the output of the second layer and apply layer normalization.
        out2 = self.layernorm2(attn2 + out1)

        # Apply the feedforward network to the output of the second layer and apply dropout regularization.
        fc_out = self.fc(out2)
        fc_out = self.dropout3(fc_out, training=training)

        # Add the output from the second layer to the output of the feedforward network and apply layer normalization.
        out3 = self.layernorm3(fc_out + out2)

        return out3, attn_weights_block1, attn_weights_block2

In [None]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, emb_dim, num_heads, fc_dim, target_vocab_size, max_position, dropout_rate=0.1):
        """
        Initializes the Decoder object.

        Args:
            num_layers (int): The number of Decoder layers.
            embedding_dim (int): The size of the embedding dimension.
            num_heads (int): The number of heads in the MultiHeadAttention layer.
            fully_connected_dim (int): The number of units in the feedforward network.
            target_vocab_size (int): The number of words in the target vocabulary.
            maximum_position_encoding (int): The maximum length of a sequence.
            dropout_rate (float): The rate at which to apply dropout.
        """
        super(Decoder, self).__init__()

        self.num_layers = num_layers
        self.emb_dim = emb_dim

        # create layers
        self.embedding = Embedding(target_vocab_size, emb_dim)
        self.pos_encoding = positional_encoding(max_position, emb_dim)
        self.dec_layers = [DecoderLayer(emb_dim, num_heads, fc_dim, dropout_rate=0.1) for _ in range(num_layers)]
        self.dropout = Dropout(dropout_rate)

    def call(self, x, enc_output, look_ahead_mask, pad_mask, training):
        """
        Executes the Decoder.

        Args:
            x (tf.Tensor): The input to the Decoder.
            enc_output (tf.Tensor): The output from the Encoder.
            training (bool): Whether the Decoder is in training mode.
            look_ahead_mask (tf.Tensor): The mask for self-attention in the MultiHeadAttention layer.
            padding_mask (tf.Tensor): The mask for padding in the MultiHeadAttention layer.

        Returns:
            tf.Tensor: The output from the Decoder.
            dict: A dictionary of attention weights.
        """
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        # add embedding and positional encoding
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.emb_dim, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)

        # apply each layer of the decoder
        for i in range(self.num_layers):
            # pass through decoder layer i
            x, block1, block2 = self.dec_layers[i](x, enc_output, look_ahead_mask, pad_mask, training=training)

            # record attention weights for block1 and block2
            attention_weights[f"decoder_layer{i + 1}_block1"] = block1
            attention_weights[f"decoder_layer{i + 1}_block2"] = block2

        return x, attention_weights

## Transformer Class

###### Composed of two main components: the encoder and the decoder. The encoder takes an input sequence and produces a sequence of hidden representations, while the decoder takes this sequence of hidden representations and generates an output sequence

In [None]:
class Transformer(tf.keras.Model):
    """
    A Transformer model that takes in an input and target sequence and outputs a final prediction.

    Args:
        num_layers (int): Number of layers in the Encoder and Decoder.
        embedding_dim (int): Dimensionality of the embedding layer.
        num_heads (int): Number of attention heads used in the Transformer.
        fully_connected_dim (int): Dimensionality of the fully connected layer in the Encoder and Decoder.
        input_vocab_size (int): Size of the input vocabulary.
        target_vocab_size (int): Size of the target vocabulary.
        max_positional_encoding_input (int): Maximum length of the input sequence.
        max_positional_encoding_target (int): Maximum length of the target sequence.
        dropout_rate (float, optional): Dropout rate used in the Encoder and Decoder layers. Defaults to 0.1.
    """
    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size, target_vocab_size, max_positional_encoding_input, max_positional_encoding_target, dropout_rate=0.1):
        super(Transformer, self).__init__()

        # Initialize the Encoder and Decoder layers
        self.encoder = Encoder(num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size, max_positional_encoding_input, dropout_rate)
        self.decoder = Decoder(num_layers, embedding_dim, num_heads, fully_connected_dim, target_vocab_size, max_positional_encoding_target, dropout_rate)

        # Add a final dense layer to make the final prediction
        self.final_layer = tf.keras.layers.Dense(target_vocab_size, activation='softmax')

    def call(self, inp, tar, enc_padding_mask, look_ahead_mask, dec_padding_mask, training):
        """
        Perform a forward pass through the Transformer model.

        Args:
            inp (tf.Tensor): Input sequence tensor with shape (batch_size, input_seq_len).
            tar (tf.Tensor): Target sequence tensor with shape (batch_size, target_seq_len).
            training (bool): Whether the model is being trained or not.
            enc_padding_mask (tf.Tensor): Padding mask for the Encoder with shape (batch_size, 1, 1, input_seq_len).
            look_ahead_mask (tf.Tensor): Mask to prevent the Decoder from looking ahead in the target sequence with shape (batch_size, 1, target_seq_len, target_seq_len).
            dec_padding_mask (tf.Tensor): Padding mask for the Decoder with shape (batch_size, 1, 1, target_seq_len).

        Returns:
            tuple: A tuple containing the final output of the model and the attention weights of the Decoder.
        """
        # Pass the input sequence through the Encoder
        enc_output = self.encoder(inp, mask=enc_padding_mask, training=training)

        # Pass the target sequence and the output of the Encoder through the Decoder
        dec_output, attention_weights = self.decoder(tar, enc_output, look_ahead_mask=look_ahead_mask, pad_mask=dec_padding_mask, training=training)

        # Pass the output of the Decoder through the final dense layer to get the final prediction
        final_output = self.final_layer(dec_output)

        return final_output, attention_weights

## Training Loop

In [None]:
# Set hyperparameters for the Transformer model
emb_dim = 256  # dimensionality of the embeddings used for tokens in the input and target sequences
fully_connected_dim = 512  # dimensionality of the hidden layer of the feedforward neural network within the Transformer block
num_layers = 4  # number of Transformer blocks in the encoder and decoder stacks
num_heads = 8  # number of heads in the multi-head attention mechanism
dropout_rate = 0.1  # dropout rate for regularization

# Set vocabulary sizes for input and target sequences
input_vocab_size = len(tokenizer_en.word_index) + 2  # add 2 for the start and end tokens
target_vocab_size = len(tokenizer_spn.word_index) + 2  # add 2 for the start and end tokens

# Set maximum positional encoding values for input and target sequences
max_pos_encoding_input = input_vocab_size  # maximum positional encoding value for input sequence
max_pos_encoding_target = target_vocab_size  # maximum positional encoding value for target sequence

# Set the number of epochs and batch size for training
EPOCHS = 10
batch_size = 128

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """
    A custom learning rate schedule that uses a combination of
    a square root inverse decay and a warmup schedule.

    Args:
        embedding_dim (int): The dimension of the embedding.
        warmup_steps (int): The number of steps used for warmup.

    Returns:
        float: The learning rate value at a given step.
    """
    def __init__(self, emb_dim, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.emb_dim = tf.Variable(emb_dim, dtype=tf.float32)
        self.warmup_steps = tf.cast(warmup_steps, dtype=tf.float32)

    def __call__(self, step):
        """
        Compute the learning rate value for a given step using
        a combination of square root inverse decay and warmup.

        Args:
            step (int): The current step number.

        Returns:
            float: The learning rate value at the current step.
        """
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.emb_dim) * tf.math.minimum(arg1, arg2)

# Create an instance of the custom learning rate schedule
learning_rate = CustomSchedule(emb_dim)

In [None]:

# Create an instance of the Transformer model
transformer = Transformer(num_layers, emb_dim, num_heads,
                           fully_connected_dim, input_vocab_size, target_vocab_size,
                           max_pos_encoding_input, max_pos_encoding_target, dropout_rate)


# Define the optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2 = 0.98, epsilon = 1e-9)

# Define the loss object
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()


def loss_function(true_values, predictions):
    """
    Calculate the loss value for a given target sequence.

    Args:
        true_values (tf.Tensor): The true target sequence.
        predictions (tf.Tensor): The predicted target sequence.

    Returns:
        float: The loss value for the given target sequence.
    """
    # Create a mask to exclude the padding tokens
    mask = tf.math.logical_not(tf.math.equal(true_values, 0))

    # Compute the loss value using the loss object
    loss_ = loss_object(true_values, predictions)

    # Apply the mask to exclude the padding tokens
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # Calculate the mean loss value
    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)

def accuracy_function(true_values, predictions):
    """
    Calculate the accuracy for a given target sequence.

    Args:
        true_values (tf.Tensor): The true target sequence.
        predictions (tf.Tensor): The predicted target sequence.

    Returns:
        float: The accuracy value for the given target sequence.
    """
    # Compute the accuracies using the true and predicted target sequences
    accuracies = tf.equal(true_values, tf.argmax(predictions, axis=2))

    # Create a mask to exclude the padding tokens
    mask = tf.math.logical_not(tf.math.equal(true_values, 0))

    # Apply the mask to exclude the padding tokens from the accuracies
    accuracies = tf.math.logical_and(mask, accuracies)
    accuracies = tf.cast(accuracies, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)

    # Calculate the mean accuracy value
    return tf.reduce_sum(accuracies) / tf.reduce_sum(mask)

# Define the training metrics
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
print(type(train_loss))  # Verify the class of train_loss
print(type(train_accuracy))  #

<class 'keras.src.metrics.reduction_metrics.Mean'>
<class 'keras.src.metrics.accuracy_metrics.SparseCategoricalAccuracy'>


In [None]:
# Define the input signature for the train_step function
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),  # encoder_input
    tf.TensorSpec(shape=(None, None), dtype=tf.int64)   # target
]

@tf.function(input_signature=train_step_signature)
def train_step(encoder_input, target):
    """
    Function to perform a single training step.

    Args:
    encoder_input (tf.Tensor): The input tensor for the encoder.
    target (tf.Tensor): The target tensor for the decoder.

    Returns:
    None.
    """

    # Slice the target tensor to get the input for the decoder
    decoder_input = target[:, :-1]

    # Slice the target tensor to get the expected output of the decoder
    expected_output = target[:, 1:]

    # Create masks for the encoder input, decoder input and the padding
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, decoder_input)

    # Perform a forward pass through the model
    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp=encoder_input, tar=decoder_input,
                                     enc_padding_mask=enc_padding_mask,
                                     look_ahead_mask=combined_mask,
                                     dec_padding_mask=dec_padding_mask,
                                     training=True)

        # Calculate the loss between the predicted output and the expected output
        loss = loss_function(expected_output, predictions)

    # Calculate gradients and update the model parameters
    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    # Update the training loss and accuracy metrics
    train_loss(loss)
    train_accuracy(expected_output, predictions)

    return loss, train_accuracy.result()

In [None]:
for epoch in range(1, EPOCHS + 1):
    # Re-initialize metrics at the start of each epoch
    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

    current_batch_index = 0

    # Iterate through the dataset in batches of batch_size
    for i in range(int(len(spn_train) / batch_size)):
        # Get the input and target batch
        target_batch = tf.convert_to_tensor(np.array(spn_train[current_batch_index:current_batch_index + batch_size]), dtype=tf.int64)
        input_batch = tf.convert_to_tensor(np.array(en_train[current_batch_index:current_batch_index + batch_size]), dtype=tf.int64)

        current_batch_index = current_batch_index + batch_size

        # Call the train_step function to train the model using the current batch
        loss, accuracy = train_step(input_batch, target_batch)

        # Update the metrics with the batch results
        train_loss.update_state(loss)

        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(input_batch, target_batch[:, :-1])
        predictions, _ = transformer(inp=input_batch, tar=target_batch[:, :-1],
                                     enc_padding_mask=enc_padding_mask,
                                     look_ahead_mask=combined_mask,
                                     dec_padding_mask=dec_padding_mask,
                                     training=False) # Set training=False as we are evaluating this batch
        train_accuracy.update_state(target_batch[:,1:], predictions)  # Assuming target_batch are the labels for classification

    # Print the epoch loss and accuracy after iterating through the dataset
    print(f'Epoch {epoch} - Loss: {train_loss.result():.4f}, Accuracy: {train_accuracy.result():.4f}')


Epoch 1 - Loss: 1.1486, Accuracy: 0.8288
Epoch 2 - Loss: 1.0882, Accuracy: 0.8353
Epoch 3 - Loss: 1.0340, Accuracy: 0.8411
Epoch 4 - Loss: 0.9901, Accuracy: 0.8468
Epoch 5 - Loss: 0.9418, Accuracy: 0.8518
Epoch 6 - Loss: 0.8885, Accuracy: 0.8585
Epoch 7 - Loss: 0.8359, Accuracy: 0.8656
Epoch 8 - Loss: 0.7947, Accuracy: 0.8713
Epoch 9 - Loss: 0.7471, Accuracy: 0.8777
Epoch 10 - Loss: 0.6921, Accuracy: 0.8855


## Evaluation Loop

In [None]:
# Define the evaluation function
def evaluate(transformer, spn_test, en_test, batch_size=64):
    """
    Evaluates the Transformer model on the test set.

    Args:
        transformer (tf.keras.Model): The trained Transformer model.
        spn_test (list): The target (Spanish) test set.
        en_test (list): The input (English) test set.
        batch_size (int): The batch size for evaluation.

    Returns:
        float: The overall accuracy percentage on the test set.
    """
    # Initialize the metric to compute accuracy
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')

    current_batch_index = 0
    num_batches = len(spn_test) // batch_size

    # Iterate through the test dataset in batches
    for i in range(num_batches):
        # Get the input and target batch
        target_batch = tf.convert_to_tensor(np.array(spn_test[current_batch_index:current_batch_index + batch_size]), dtype=tf.int64)
        input_batch = tf.convert_to_tensor(np.array(en_test[current_batch_index:current_batch_index + batch_size]), dtype=tf.int64)

        current_batch_index += batch_size

        # Create masks for the encoder input, decoder input, and padding
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(input_batch, target_batch[:, :-1])

        # Perform inference (no gradient updates) using the trained model
        predictions, _ = transformer(inp=input_batch, tar=target_batch[:, :-1],
                                     enc_padding_mask=enc_padding_mask,
                                     look_ahead_mask=combined_mask,
                                     dec_padding_mask=dec_padding_mask,
                                     training=False)  # Set training=False during evaluation

        # Update the accuracy metric
        test_accuracy.update_state(target_batch[:, 1:], predictions)

    # Return the final accuracy percentage
    return test_accuracy.result().numpy() * 100  # Convert to percentage

# After training, evaluate the model
final_accuracy = evaluate(transformer, spn_test, en_test, batch_size)
print(f"Model's Accuracy : {final_accuracy:.2f}%")


Model's Accuracy : 83.04%


# **Fine Tuning/ Pretrained Model**

##### Downloading Libraries

In [None]:
!pip install datasets transformers[sentencepiece] sacrebleu -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import os
import sys
from datasets import Dataset
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

## OPUS-MT (Open Translation) project by Helsinki-NLP for EN-ES Translation

## Loading & Preprocessing Data

In [None]:
model_checkpoint = "Helsinki-NLP/opus-mt-en-es"

In [None]:
data = pd.read_csv('/kaggle/input/enesdf/data.csv')
print(data.head(5))

  english  spanish
0     Go.      Ve.
1     Go.    Vete.
2     Go.    Vaya.
3     Go.  Váyase.
4     Hi.    Hola.


In [None]:
train_df=data[:7000]
val_df=data[7000:8000]

In [None]:
train_df.head(5)

Unnamed: 0,english,spanish
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Go.,Váyase.
4,Hi.,Hola.


In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]



In [None]:
max_input_length = 128
max_target_length = 128

source_lang = "english"
target_lang = "spanish"


def preprocess_function(examples):
    # Tokenize the inputs (source language)
    inputs = examples[source_lang]
    targets = examples[target_lang]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Convert DataFrame to Hugging Face Dataset
train_set = Dataset.from_pandas(pd.DataFrame(train_set))
val_set = Dataset.from_pandas(pd.DataFrame(val_set))

#Apply the preprocessing function to the datasets
tokenized_train = train_set.map(preprocess_function, batched=True)
tokenized_val = val_set.map(preprocess_function, batched=True)


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

## Training Model

In [None]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

tf_model.h5:   0%|          | 0.00/313M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-en-es.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
# Data collator for padding
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
# Training settings
batch_size = 16
learning_rate = 2e-5
weight_decay = 0.01
num_epochs = 10

In [None]:
# Prepare TensorFlow dataset for training
train_dataset = model.prepare_tf_dataset(
    tokenized_train,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
validation_dataset = model.prepare_tf_dataset(
    tokenized_val,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

In [None]:
# Set up optimizer and compile model
optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer)

In [None]:
# Train the model
model.fit(train_dataset, validation_data=validation_dataset, epochs=num_epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x7d1cb4f55600>

In [None]:
model.save_pretrained("tf_model/")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


## Model Evaluation

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

# Function to translate the sentences
def translate(text):
    input_text  = text
    tokenized = tokenizer([input_text], return_tensors='np')
    out = model.generate(**tokenized, max_length=128)
    with tokenizer.as_target_tokenizer():
        print(tokenizer.decode(out[0], skip_special_tokens=True))

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at tf_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [None]:
sentence="He's very intelligent."
translate(sentence)

Él es muy inteligente.
