In [45]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
en_training_data_path = "/content/drive/MyDrive/Colab Notebooks/opus.en-fa-train.en"
en_validation_data_path = "/content/drive/MyDrive/Colab Notebooks/opus.en-fa-dev.en"
fa_training_data_path = "/content/drive/MyDrive/Colab Notebooks/opus.en-fa-train.fa"
fa_validation_data_path = "/content/drive/MyDrive/Colab Notebooks/opus.en-fa-dev.fa"

def read_files(path):
    with open(path, "r", encoding="utf-8") as f:
        en_train_dataset = f.read().split("\n")[:-1]
    return en_train_dataset

en_training_data = read_files(en_training_data_path)
en_validation_data = read_files(en_validation_data_path)
fa_training_data = read_files(fa_training_data_path)
fa_validation_data = read_files(fa_validation_data_path)

max_lenght = 500
train_dataset = [[fa_sentence, en_sentence] for fa_sentence, en_sentence in zip(fa_training_data, en_training_data) if len(fa_sentence) <= max_lenght and len(en_sentence) <= max_lenght]
val_dataset = [[fa_sentence, en_sentence] for fa_sentence, en_sentence in zip(fa_validation_data, en_validation_data) if len(fa_sentence) <= max_lenght and len(en_sentence) <= max_lenght]
fa_training_data, en_training_data = zip(*train_dataset)
fa_validation_data, en_validation_data = zip(*val_dataset)

print(len(fa_training_data))
print(len(fa_validation_data))
print(fa_training_data[:3])
print(en_training_data[:3])

996371
1990
('وسايلتو جمع کن باشه.', 'يادمه يه گاو اونجا بود', 'و رييس سازمان "ياماگاتو" هم بين کشته ها هستن')
('Pack your stuff.', 'I remember the cow that stayed over there.', '_')


In [47]:
import os
import json
import typing
from tqdm import tqdm

class CustomTokenizer:
    """ Custom Tokenizer class to tokenize and detokenize text data into sequences of integers

    Args:
        split (str, optional): Split token to use when tokenizing text. Defaults to " ".
        char_level (bool, optional): Whether to tokenize at character level. Defaults to False.
        lower (bool, optional): Whether to convert text to lowercase. Defaults to True.
        start_token (str, optional): Start token to use when tokenizing text. Defaults to "<start>".
        end_token (str, optional): End token to use when tokenizing text. Defaults to "<eos>".
        filters (list, optional): List of characters to filter out. Defaults to
            ['!', "'", '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>',
            '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n'].
        filter_nums (bool, optional): Whether to filter out numbers. Defaults to True.
        start (int, optional): Index to start tokenizing from. Defaults to 1.
    """
    def __init__(
            self,
            split: str=" ",
            char_level: bool=False,
            lower: bool=True,
            start_token: str="<start>",
            end_token: str="<eos>",
            filters: list = ['!', "'", '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '\t', '\n'],
            filter_nums: bool = True,
            start: int=1,
        ) -> None:
        self.split = split
        self.char_level = char_level
        self.lower = lower
        self.index_word = {}
        self.word_index = {}
        self.max_length = 0
        self.start_token = start_token
        self.end_token = end_token
        self.filters = filters
        self.filter_nums = filter_nums
        self.start = start

    @property
    def start_token_index(self):
        return self.word_index[self.start_token]

    @property
    def end_token_index(self):
        return self.word_index[self.end_token]

    def sort(self):
        """ Sorts the word_index and index_word dictionaries"""
        self.index_word = dict(enumerate(dict(sorted(self.word_index.items())), start=self.start))
        self.word_index = {v: k for k, v in self.index_word.items()}

    def split_line(self, line: str):
        """ Splits a line of text into tokens

        Args:
            line (str): Line of text to split

        Returns:
            list: List of string tokens
        """
        line = line.lower() if self.lower else line

        if self.char_level:
            return [char for char in line]

        # split line with split token and check for filters
        line_tokens = line.split(self.split)

        new_tokens = []
        for index, token in enumerate(line_tokens):
            filtered_tokens = ['']
            for c_index, char in enumerate(token):
                if char in self.filters or (self.filter_nums and char.isdigit()):
                    filtered_tokens += [char, ''] if c_index != len(token) -1 else [char]
                else:
                    filtered_tokens[-1] += char

            new_tokens += filtered_tokens
            if index != len(line_tokens) -1:
                new_tokens += [self.split]

        new_tokens = [token for token in new_tokens if token != '']

        return new_tokens

    def fit_on_texts(self, lines: typing.List[str]):
        """ Fits the tokenizer on a list of lines of text
        This function will update the word_index and index_word dictionaries and set the max_length attribute

        Args:
            lines (typing.List[str]): List of lines of text to fit the tokenizer on
        """
        self.word_index = {key: value for value, key in enumerate([self.start_token, self.end_token, self.split] + self.filters)}

        for line in tqdm(lines, desc="Fitting tokenizer"):
            line_tokens = self.split_line(line)
            self.max_length = max(self.max_length, len(line_tokens) +2) # +2 for start and end tokens

            for token in line_tokens:
                if token not in self.word_index:
                    self.word_index[token] = len(self.word_index)

        self.sort()

    def update(self, lines: typing.List[str]):
        """ Updates the tokenizer with new lines of text
        This function will update the word_index and index_word dictionaries and set the max_length attribute

        Args:
            lines (typing.List[str]): List of lines of text to update the tokenizer with
        """
        new_tokens = 0
        for line in tqdm(lines, desc="Updating tokenizer"):
            line_tokens = self.split_line(line)
            self.max_length = max(self.max_length, len(line_tokens) +2) # +2 for start and end tokens
            for token in line_tokens:
                if token not in self.word_index:
                    self.word_index[token] = len(self.word_index)
                    new_tokens += 1

        self.sort()
        print(f"Added {new_tokens} new tokens")

    def detokenize(self, sequences: typing.List[int], remove_start_end: bool=True):
        """ Converts a list of sequences of tokens back into text

        Args:
            sequences (typing.list[int]): List of sequences of tokens to convert back into text
            remove_start_end (bool, optional): Whether to remove the start and end tokens. Defaults to True.

        Returns:
            typing.List[str]: List of strings of the converted sequences
        """
        lines = []
        for sequence in sequences:
            line = ""
            for token in sequence:
                if token == 0:
                    break
                if remove_start_end and (token == self.start_token_index or token == self.end_token_index):
                    continue

                line += self.index_word[token]

            lines.append(line)

        return lines

    def texts_to_sequences(self, lines: typing.List[str], include_start_end: bool=True):
        """ Converts a list of lines of text into a list of sequences of tokens

        Args:
            lines (typing.list[str]): List of lines of text to convert into tokenized sequences
            include_start_end (bool, optional): Whether to include the start and end tokens. Defaults to True.

        Returns:
            typing.List[typing.List[int]]: List of sequences of tokens
        """
        sequences = []
        for line in lines:
            line_tokens = self.split_line(line)
            sequence = [self.word_index[word] for word in line_tokens if word in self.word_index]
            if include_start_end:
                sequence = [self.word_index[self.start_token]] + sequence + [self.word_index[self.end_token]]

            sequences.append(sequence)

        return sequences

    def save(self, path: str, type: str="json"):
        """ Saves the tokenizer to a file

        Args:
            path (str): Path to save the tokenizer to
            type (str, optional): Type of file to save the tokenizer to. Defaults to "json".
        """
        serialised_dict = self.dict()
        if type == "json":
            if os.path.dirname(path):
                os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, "w") as f:
                json.dump(serialised_dict, f)

    def dict(self):
        """ Returns a dictionary of the tokenizer

        Returns:
            dict: Dictionary of the tokenizer
        """
        return {
            "split": self.split,
            "lower": self.lower,
            "char_level": self.char_level,
            "index_word": self.index_word,
            "max_length": self.max_length,
            "start_token": self.start_token,
            "end_token": self.end_token,
            "filters": self.filters,
            "filter_nums": self.filter_nums,
            "start": self.start
        }

    @staticmethod
    def load(path: typing.Union[str, dict], type: str="json"):
        """ Loads a tokenizer from a file

        Args:
            path (typing.Union[str, dict]): Path to load the tokenizer from or a dictionary of the tokenizer
            type (str, optional): Type of file to load the tokenizer from. Defaults to "json".

        Returns:
            CustomTokenizer: Loaded tokenizer
        """
        if isinstance(path, str):
            if type == "json":
                with open(path, "r") as f:
                    load_dict = json.load(f)

        elif isinstance(path, dict):
            load_dict = path

        tokenizer = CustomTokenizer()
        tokenizer.split = load_dict["split"]
        tokenizer.lower = load_dict["lower"]
        tokenizer.char_level = load_dict["char_level"]
        tokenizer.index_word = {int(k): v for k, v in load_dict["index_word"].items()}
        tokenizer.max_length = load_dict["max_length"]
        tokenizer.start_token = load_dict["start_token"]
        tokenizer.end_token = load_dict["end_token"]
        tokenizer.filters = load_dict["filters"]
        tokenizer.filter_nums = bool(load_dict["filter_nums"])
        tokenizer.start = load_dict["start"]
        tokenizer.word_index = {v: int(k) for k, v in tokenizer.index_word.items()}

        return tokenizer

    @property
    def lenght(self):
        return len(self.index_word)

    def __len__(self):
        return len(self.index_word)

In [48]:
# prepare Spanish tokenizer, this is the input language
tokenizer = CustomTokenizer(char_level=True)
tokenizer.fit_on_texts(fa_training_data)
tokenizer.save("tokenizer.json")

# prepare English tokenizer, this is the output language
detokenizer = CustomTokenizer(char_level=True)
detokenizer.fit_on_texts(en_training_data)
detokenizer.save("detokenizer.json")

Fitting tokenizer: 100%|██████████| 996371/996371 [00:06<00:00, 147637.84it/s]
Fitting tokenizer: 100%|██████████| 996371/996371 [00:04<00:00, 201185.19it/s]


In [49]:
tokenized_sentence = detokenizer.texts_to_sequences(["Hello world, how are you?"])[0]
print(tokenized_sentence)

detokenized_sentence = detokenizer.detokenize([tokenized_sentence], remove_start_end=False)
print(detokenized_sentence)

detokenized_sentence = detokenizer.detokenize([tokenized_sentence])
print(detokenized_sentence)

[33, 51, 48, 55, 55, 58, 3, 66, 58, 61, 55, 47, 15, 3, 51, 58, 66, 3, 44, 61, 48, 3, 68, 58, 64, 36, 32]
['<start>hello world, how are you?<eos>']
['hello world, how are you?']


In [50]:
pip install mltu



In [51]:
from mltu.tensorflow.dataProvider import DataProvider
import numpy as np

def preprocess_inputs(data_batch, label_batch):
    encoder_input = np.zeros((len(data_batch), tokenizer.max_length)).astype(np.int64)
    decoder_input = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64)
    decoder_output = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64)

    data_batch_tokens = tokenizer.texts_to_sequences(data_batch)
    label_batch_tokens = detokenizer.texts_to_sequences(label_batch)

    for index, (data, label) in enumerate(zip(data_batch_tokens, label_batch_tokens)):
        encoder_input[index][:len(data)] = data
        decoder_input[index][:len(label)-1] = label[:-1] # Drop the [END] tokens
        decoder_output[index][:len(label)-1] = label[1:] # Drop the [START] tokens

    return (encoder_input, decoder_input), decoder_output

train_dataProvider = DataProvider(
    train_dataset,
    batch_size=4,
    batch_postprocessors=[preprocess_inputs],
    use_cache=True
    )

val_dataProvider = DataProvider(
    val_dataset,
    batch_size=4,
    batch_postprocessors=[preprocess_inputs],
    use_cache=True
    )

INFO:DataProvider:Skipping Dataset validation...
INFO:DataProvider:Skipping Dataset validation...


In [52]:
for data_batch in train_dataProvider:
    (encoder_inputs, decoder_inputs), decoder_outputs = data_batch

    encoder_inputs_str = tokenizer.detokenize(encoder_inputs)
    decoder_inputs_str = detokenizer.detokenize(decoder_inputs, remove_start_end=False)
    decoder_outputs_str = detokenizer.detokenize(decoder_outputs, remove_start_end=False)
    print(encoder_inputs_str)
    print(decoder_inputs_str)
    print(decoder_outputs_str)

    break

['وسايلتو جمع کن باشه.', 'يادمه يه گاو اونجا بود', 'و رييس سازمان "ياماگاتو" هم بين کشته ها هستن', 'اِی #@%$* توش، غارِ یخـی؟']
['<start>pack your stuff.', '<start>i remember the cow that stayed over there.', '<start>_', '<start>oh']
['pack your stuff.<eos>', 'i remember the cow that stayed over there.<eos>', '_<eos>', 'oh<eos>']


In [53]:
import numpy as np

import tensorflow as tf
try: [tf.config.experimental.set_memory_growth(gpu, True) for gpu in tf.config.experimental.list_physical_devices("GPU")]
except: pass

from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from mltu.tensorflow.callbacks import Model2onnx, WarmupCosineDecay

from mltu.tensorflow.dataProvider import DataProvider
from mltu.tokenizers import CustomTokenizer

from mltu.tensorflow.transformer.utils import MaskedAccuracy, MaskedLoss
from mltu.tensorflow.transformer.callbacks import EncDecSplitCallback

In [54]:
import tensorflow as tf

from mltu.tensorflow.transformer.layers import Encoder, Decoder

def Transformer(
    input_vocab_size: int,
    target_vocab_size: int,
    encoder_input_size: int = None,
    decoder_input_size: int = None,
    num_layers: int=6,
    d_model: int=512,
    num_heads: int=8,
    dff: int=2048,
    dropout_rate: float=0.1,
    ) -> tf.keras.Model:
    """
    A custom TensorFlow model that implements the Transformer architecture.

    Args:
        input_vocab_size (int): The size of the input vocabulary.
        target_vocab_size (int): The size of the target vocabulary.
        encoder_input_size (int): The size of the encoder input sequence.
        decoder_input_size (int): The size of the decoder input sequence.
        num_layers (int): The number of layers in the encoder and decoder.
        d_model (int): The dimensionality of the model.
        num_heads (int): The number of heads in the multi-head attention layer.
        dff (int): The dimensionality of the feed-forward layer.
        dropout_rate (float): The dropout rate.

    Returns:
        A TensorFlow Keras model.
    """
    inputs = [
        tf.keras.layers.Input(shape=(encoder_input_size,), dtype=tf.int64),
        tf.keras.layers.Input(shape=(decoder_input_size,), dtype=tf.int64)
        ]

    encoder_input, decoder_input = inputs

    encoder = Encoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, vocab_size=input_vocab_size, dropout_rate=dropout_rate)(encoder_input)
    decoder = Decoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, vocab_size=target_vocab_size, dropout_rate=dropout_rate)(decoder_input, encoder)

    output = tf.keras.layers.Dense(target_vocab_size)(decoder)

    return tf.keras.Model(inputs=inputs, outputs=output)

In [55]:
#configs.py
import os
from datetime import datetime

from mltu.configs import BaseModelConfigs


class ModelConfigs(BaseModelConfigs):
    def __init__(self):
        super().__init__()
        self.model_path = '/content/drive/MyDrive/Colab Notebooks/Models'
        self.num_layers = 1
        self.d_model = 128
        self.num_heads = 8
        self.dff = 512
        self.dropout_rate = 0.1
        self.batch_size = 64
        self.train_epochs = 1
        # CustomSchedule parameters
        self.init_lr = 0.00001
        self.lr_after_warmup = 0.0005
        self.final_lr = 0.0001
        self.warmup_epochs = 1
        self.decay_epochs = 9

        # self.num_layers = 4
        # self.d_model = 128
        # self.num_heads = 8
        # self.dff = 512
        # self.dropout_rate = 0.1
        # self.batch_size = 16
        # self.train_epochs = 50
        # # CustomSchedule parameters
        # self.init_lr = 0.00001
        # self.lr_after_warmup = 0.0005
        # self.final_lr = 0.0001
        # self.warmup_epochs = 2
        # self.decay_epochs = 18

In [56]:
#train.py

configs = ModelConfigs()

In [57]:
en_training_data_path = "/content/drive/MyDrive/Colab Notebooks/opus.en-fa-train.en"
en_validation_data_path = "/content/drive/MyDrive/Colab Notebooks/opus.en-fa-dev.en"
fa_training_data_path = "/content/drive/MyDrive/Colab Notebooks/opus.en-fa-train.fa"
fa_validation_data_path = "/content/drive/MyDrive/Colab Notebooks/opus.en-fa-dev.fa"

def read_files(path):
    with open(path, "r", encoding="utf-8") as f:
        en_train_dataset = f.read().split("\n")[:-1]
    return en_train_dataset

en_training_data = read_files(en_training_data_path)
en_validation_data = read_files(en_validation_data_path)
fa_training_data = read_files(fa_training_data_path)
fa_validation_data = read_files(fa_validation_data_path)

# Consider only sentences with length <= 500
max_lenght = 500
train_dataset = [[fa_sentence, en_sentence] for fa_sentence, en_sentence in zip(fa_training_data, en_training_data) if len(fa_sentence) <= max_lenght and len(en_sentence) <= max_lenght]
val_dataset = [[fa_sentence, en_sentence] for fa_sentence, en_sentence in zip(fa_validation_data, en_validation_data) if len(fa_sentence) <= max_lenght and len(en_sentence) <= max_lenght]
fa_training_data, en_training_data = zip(*train_dataset)
fa_validation_data, en_validation_data = zip(*val_dataset)

In [58]:
# prepare spanish tokenizer, this is the input language
tokenizer = CustomTokenizer(char_level=True)
tokenizer.fit_on_texts(fa_training_data)
tokenizer.save(configs.model_path + "/tokenizer.json")

# prepare english tokenizer, this is the output language
detokenizer = CustomTokenizer(char_level=True)
detokenizer.fit_on_texts(en_training_data)
detokenizer.save(configs.model_path + "/detokenizer.json")

Fitting tokenizer: 100%|██████████| 996371/996371 [00:06<00:00, 144890.01it/s]
Fitting tokenizer: 100%|██████████| 996371/996371 [00:04<00:00, 202888.79it/s]


In [59]:
def preprocess_inputs(data_batch, label_batch):
    encoder_input = np.zeros((len(data_batch), tokenizer.max_length)).astype(np.int64)
    decoder_input = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64)
    decoder_output = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64)

    data_batch_tokens = tokenizer.texts_to_sequences(data_batch)
    label_batch_tokens = detokenizer.texts_to_sequences(label_batch)

    for index, (data, label) in enumerate(zip(data_batch_tokens, label_batch_tokens)):
        encoder_input[index][:len(data)] = data
        decoder_input[index][:len(label)-1] = label[:-1] # Drop the [END] tokens
        decoder_output[index][:len(label)-1] = label[1:] # Drop the [START] tokens

    return (encoder_input, decoder_input), decoder_output

In [60]:
# Create Training Data Provider
train_dataProvider = DataProvider(
    train_dataset,
    batch_size=configs.batch_size,
    batch_postprocessors=[preprocess_inputs],
    use_cache=True,
    )

# Create Validation Data Provider
val_dataProvider = DataProvider(
    val_dataset,
    batch_size=configs.batch_size,
    batch_postprocessors=[preprocess_inputs],
    use_cache=True,
    )

INFO:DataProvider:Skipping Dataset validation...
INFO:DataProvider:Skipping Dataset validation...


In [61]:
# Create TensorFlow Transformer Model
transformer = Transformer(
    num_layers=configs.num_layers,
    d_model=configs.d_model,
    num_heads=configs.num_heads,
    dff=configs.dff,
    input_vocab_size=len(tokenizer)+1,
    target_vocab_size=len(detokenizer)+1,
    dropout_rate=configs.dropout_rate,
    encoder_input_size=tokenizer.max_length,
    decoder_input_size=detokenizer.max_length
    )

transformer.summary()



In [62]:
optimizer = tf.keras.optimizers.Adam(learning_rate=configs.init_lr, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# Compile the model
transformer.compile(
    loss=MaskedLoss(),
    optimizer=optimizer,
    metrics=[MaskedAccuracy()],
    run_eagerly=False
    )

In [63]:
pip install tf2onnx



In [64]:
# Define callbacks
warmupCosineDecay = WarmupCosineDecay(
    lr_after_warmup=configs.lr_after_warmup,
    final_lr=configs.final_lr,
    warmup_epochs=configs.warmup_epochs,
    decay_epochs=configs.decay_epochs,
    initial_lr=configs.init_lr,
    )
earlystopper = EarlyStopping(monitor="val_masked_accuracy", patience=5, verbose=1, mode="max")
checkpoint = ModelCheckpoint(f"{configs.model_path}/model.keras", monitor="val_masked_accuracy", verbose=1, save_best_only=True, mode="max", save_weights_only=False)
tb_callback = TensorBoard(f"{configs.model_path}/logs")
reduceLROnPlat = ReduceLROnPlateau(monitor="val_masked_accuracy", factor=0.9, min_delta=1e-10, patience=2, verbose=1, mode="max")
model2onnx = Model2onnx(f"{configs.model_path}/model.keras", metadata={"tokenizer": tokenizer.dict(), "detokenizer": detokenizer.dict()}, save_on_epoch_end=False)
encDecSplitCallback = EncDecSplitCallback(configs.model_path, encoder_metadata={"tokenizer": tokenizer.dict()}, decoder_metadata={"detokenizer": detokenizer.dict()})

In [65]:
import tensorflow as tf

# Define a patched version of on_epoch_begin
def patched_on_epoch_begin(self, epoch, logs=None):
    if epoch < self.warmup_epochs:
        # Apply warmup schedule
        lr = self.initial_lr * (epoch + 1) / self.warmup_epochs
    else:
        # Apply cosine decay
        progress = (epoch - self.warmup_epochs) / self.decay_epochs
        lr = self.final_lr + 0.5 * (self.lr_after_warmup - self.final_lr) * (1 + tf.cos(tf.constant(progress) * 3.14159))

    # Ensure the learning rate is set correctly
    if hasattr(self.model.optimizer, 'lr'):
        learning_rate_attr = self.model.optimizer.lr
    else:
        learning_rate_attr = self.model.optimizer.learning_rate

    # Update learning rate if it's a tf.Variable or equivalent
    if isinstance(learning_rate_attr, tf.Variable):
        tf.keras.backend.set_value(learning_rate_attr, lr)
    else:
        # Create a new tf.Variable for learning rate if necessary
        self.model.optimizer.learning_rate = tf.Variable(lr, dtype=tf.float32)

    if self.verbose:
        print(f"Epoch {epoch+1}: Learning rate is {lr:.7f}.")

# Apply the patch to the callback
WarmupCosineDecay.on_epoch_begin = patched_on_epoch_begin


In [66]:
def patched_on_epoch_end(self, epoch: int, logs: dict=None):
  logs = logs or {}

  if hasattr(self.model.optimizer, 'lr'):
    logs["lr"] = self.model.optimizer.lr
  else:
    logs["lr"] = self.model.optimizer.learning_rate
  # Log the learning rate value
  # logs["lr"] = self.model.optimizer.lr

  return logs

WarmupCosineDecay.on_epoch_end = patched_on_epoch_end


In [67]:
# Train the model
transformer.fit(
    train_dataProvider,
    validation_data=val_dataProvider,
    epochs=configs.train_epochs,
    callbacks=[
        warmupCosineDecay,
        checkpoint,
        tb_callback,
        reduceLROnPlat,
        model2onnx,
        encDecSplitCallback
        ]
    )

transformer.save(f"/content/drive/MyDrive/Colab Notebooks/Models/model.keras")

[1m15569/15569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 2.9191 - masked_accuracy: 0.2709
Epoch 1: val_masked_accuracy improved from -inf to 0.35777, saving model to /content/drive/MyDrive/Colab Notebooks/Models/model.keras
[1m15569/15569[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m840s[0m 52ms/step - loss: 2.9191 - masked_accuracy: 0.2709 - val_loss: 2.1719 - val_masked_accuracy: 0.3578 - lr: 1.0000e-05 - learning_rate: 1.0000e-05




'Functional' object has no attribute '_get_save_spec'
Error parsing message with type 'onnx.ModelProto'
[Errno 2] Unable to synchronously open file (unable to open file: name = '/content/drive/MyDrive/Colab Notebooks/Models/model.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)


In [3]:
!pip install tf2onnx




In [7]:
import tf2onnx
import tensorflow as tf

from tensorflow.keras.models import load_model
from tensorflow.keras.utils import custom_object_scope
from mltu.tensorflow.transformer.layers import Encoder  # Import the custom layer or object

with custom_object_scope({'Encoder': Encoder}):
    keras_model = load_model("/content/drive/MyDrive/Colab Notebooks/Models/model.h5")

# Convert the Keras model to ONNX format
spec = (tf.TensorSpec(keras_model.inputs[0].shape, keras_model.inputs[0].dtype),)
output_path = "/content/drive/MyDrive/Colab Notebooks/Models/model.onnx"
model_proto, _ = tf2onnx.convert.from_keras(keras_model, input_signature=spec, opset=13)

# Save the ONNX model
with open(output_path, "wb") as f:
    f.write(model_proto.SerializeToString())




TypeError: <class 'mltu.tensorflow.transformer.utils.MaskedLoss'> could not be deserialized properly. Please ensure that components that are Python object instances (layers, models, etc.) returned by `get_config()` are explicitly deserialized in the model's `from_config()` method.

config={'module': 'mltu.tensorflow.transformer.utils', 'class_name': 'MaskedLoss', 'config': {'name': 'masked_loss_2', 'reduction': 'none'}, 'registered_name': 'MaskedLoss'}.

Exception encountered: MaskedLoss.__init__() got an unexpected keyword argument 'name'

In [77]:
import numpy as np
import time

from mltu.tokenizers import CustomTokenizer
from mltu.inferenceModel import OnnxInferenceModel

class PtEnTranslator(OnnxInferenceModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.new_inputs = self.model.get_inputs()
        self.tokenizer = CustomTokenizer.load(self.metadata["tokenizer"])
        self.detokenizer = CustomTokenizer.load(self.metadata["detokenizer"])

    def predict(self, sentence):
        start = time.time()
        tokenized_sentence = self.tokenizer.texts_to_sequences([sentence])[0]
        encoder_input = np.pad(tokenized_sentence, (0, self.tokenizer.max_length - len(tokenized_sentence)), constant_values=0).astype(np.int64)

        tokenized_results = [self.detokenizer.start_token_index]
        for index in range(self.detokenizer.max_length - 1):
            decoder_input = np.pad(tokenized_results, (0, self.detokenizer.max_length - len(tokenized_results)), constant_values=0).astype(np.int64)
            input_dict = {
                self.model._inputs_meta[0].name: np.expand_dims(encoder_input, axis=0),
                self.model._inputs_meta[1].name: np.expand_dims(decoder_input, axis=0),
            }
            preds = self.model.run(None, input_dict)[0] # preds shape (1, 206, 29110)
            pred_results = np.argmax(preds, axis=2)
            tokenized_results.append(pred_results[0][index])

            if tokenized_results[-1] == self.detokenizer.end_token_index:
                break

        results = self.detokenizer.detokenize([tokenized_results])
        return results[0], time.time() - start

def read_files(path):
    with open(path, "r", encoding="utf-8") as f:
        en_train_dataset = f.read().split("\n")[:-1]
    return en_train_dataset

# Path to dataset
en_validation_data_path = "/content/drive/MyDrive/Colab Notebooks/opus.en-fa-dev.en"
fa_validation_data_path = "/content/drive/MyDrive/Colab Notebooks/opus.en-fa-dev.fa"

en_validation_data = read_files(en_validation_data_path)
fa_validation_data = read_files(fa_validation_data_path)

# Consider only sentences with length <= 500
max_lenght = 500
val_examples = [[fa_sentence, en_sentence] for fa_sentence, en_sentence in zip(fa_validation_data, en_validation_data) if len(fa_sentence) <= max_lenght and len(en_sentence) <= max_lenght]

translator = PtEnTranslator("/content/drive/MyDrive/Colab Notebooks/Models/model.h5")

val_dataset = []
for fa, en in val_examples:
    results, duration = translator.predict(fa)
    print("Farsi:     ", fa.lower())
    print("English:     ", en.lower())
    print("English pred:", results)
    print(duration)
    print()

InvalidProtobuf: [ONNXRuntimeError] : 7 : INVALID_PROTOBUF : Load model from /content/drive/MyDrive/Colab Notebooks/Models/model.h5 failed:Protobuf parsing failed.