In [1]:
import re
import numpy as np
import pandas as pd
from neuralnetlib.models import Transformer
from neuralnetlib.preprocessing import Tokenizer, pad_sequences
from neuralnetlib.utils import train_test_split
from neuralnetlib.losses import CrossEntropyWithLabelSmoothing
from neuralnetlib.optimizers import Adam
from neuralnetlib.callbacks import EarlyStopping, Callback, LearningRateScheduler

In [2]:
def translate(text, transformer, fr_tokenizer, en_tokenizer, temperature: float = 1.0):
    sequence = fr_tokenizer.texts_to_sequences([text], add_special_tokens=True)[0]

    encoder_input = pad_sequences([sequence], max_length=transformer.max_sequence_length, padding='post', pad_value=transformer.PAD_IDX)
    
    output_sequence = transformer.predict(encoder_input, max_length=transformer.max_sequence_length, temperature=temperature)

    output_tokens = output_sequence.tolist()
    translated_text = en_tokenizer.sequences_to_texts(output_tokens)[0]
    
    return translated_text


test_sentences = [
    "tu aimes les chats ou les chiens",
    "je suis étonné",
    "quelles langues apprenez-vous"
]


class DebugCallback(Callback):
    def __init__(self, model, fr_tokenizer, en_tokenizer):
        self.model = model
        self.fr_tokenizer = fr_tokenizer
        self.en_tokenizer = en_tokenizer
    def on_epoch_end(self, epoch, logs=None):
        print(f"\nEpoch {epoch} details:")
        print(f"Loss: {logs['loss']:.4f}")
        
        for test in test_sentences:
            translation = translate(test, self.model, self.fr_tokenizer, self.en_tokenizer)
            print(f"Test translation: {test} -> {translation}")

In [3]:
df = pd.read_csv("dataset.tsv", sep="\t")
df.iloc[:, 1] = df.iloc[:, 1].apply(lambda x: re.sub(r'\\x[a-fA-F0-9]{2}|\\u[a-fA-F0-9]{4}|\xa0|\u202f', ' ', x))  # remove unicode characters

LIMIT = 1000
fr_sentences = df.iloc[:, 1].values.tolist()[0:LIMIT]
en_sentences = df.iloc[:, 3].values.tolist()[0:LIMIT]

In [4]:
fr_tokenizer = Tokenizer(filters="")  # else the tokenizer would remove the special characters including ponctuation
en_tokenizer = Tokenizer(filters="")  # else the tokenizer would remove the special characters including ponctuation

fr_tokenizer.fit_on_texts(fr_sentences, preprocess_ponctuation=True)
en_tokenizer.fit_on_texts(en_sentences, preprocess_ponctuation=True)

In [5]:
X = fr_tokenizer.texts_to_sequences(fr_sentences, preprocess_ponctuation=True, add_special_tokens=True)
y = en_tokenizer.texts_to_sequences(en_sentences, preprocess_ponctuation=True, add_special_tokens=True)

max_len_x = max(len(seq) for seq in X)
max_len_y = max(len(seq) for seq in y)
max_seq_len = max(max_len_x, max_len_y)

vocab_size_fr = len(fr_tokenizer.word_index)
vocab_size_en = len(en_tokenizer.word_index)
max_vocab_size = max(vocab_size_fr, vocab_size_en)

In [6]:
# Verify all data
print(f"vocab_size_en: {vocab_size_en}, vocab_size_fr: {vocab_size_fr}")
print(f"max_len_x: {max_len_x}, max_len_y: {max_len_y}, max_vocab_size: {max_vocab_size}, max_seq_len: {max_seq_len}")
print("French sentences:")
print(fr_sentences)
print("English sentences:")
print(en_sentences)
print("French tokenizer:")
print(fr_tokenizer.word_index)
print("English tokenizer:")
print(en_tokenizer.word_index)

vocab_size_en: 1968, vocab_size_fr: 2251
max_len_x: 95, max_len_y: 84, max_vocab_size: 2251, max_seq_len: 95
French sentences:
["Lorsqu'il a demandé qui avait cassé la fenêtre, tous les garçons ont pris un air innocent.", 'Je ne supporte pas ce type.', 'Je ne supporte pas ce type.', 'Je ne supporte pas ce type.', 'Pour une fois dans ma vie je fais un bon geste... Et ça ne sert à rien.', "Ne tenez aucun compte de ce qu'il dit.", 'Essayons quelque chose !', "Qu'est-ce que tu fais ?", "Qu'est-ce que c'est ?", "Qu'est-ce que c'est ?", "Qu'est-ce que c'est ?", "Aujourd'hui nous sommes le 18 juin et c'est l'anniversaire de Muiriel !", 'Joyeux anniversaire Muiriel !', 'Muiriel a 20 ans maintenant.', 'Muiriel a 20 ans maintenant.', 'Le mot de passe est « Muiriel ».', 'Je serai bientôt de retour.', 'Je ne sais pas.', 'Je ne sais pas.', "J'en perds mes mots.", 'Ça ne va jamais finir.', 'Je ne sais simplement pas quoi dire...', 'Je ne sais simplement pas quoi dire...', 'C’était un méchant lapin.'

In [7]:
X = pad_sequences(X, max_length=max_seq_len, padding='post', pad_value=fr_tokenizer.PAD_IDX)
y = pad_sequences(y, max_length=max_seq_len, padding='post', pad_value=en_tokenizer.PAD_IDX)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [8]:
model = Transformer(
    src_vocab_size=vocab_size_fr,
    tgt_vocab_size=vocab_size_en,
    d_model=128,
    n_heads=4,
    n_encoder_layers=2,
    n_decoder_layers=2,
    d_ff=512,
    dropout_rate=0.1,
    max_sequence_length=max_seq_len,
    random_state=42,
)


model.compile(
    loss_function="cels",
    optimizer=Adam(
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.98,
        epsilon=1e-9,
        clip_norm=1.0,
    ),
    verbose=True
)

Transformer(
  src_vocab_size=2251,
  tgt_vocab_size=1968,
  d_model=128,
  n_heads=4,
  n_encoder_layers=2,
  n_decoder_layers=2,
  d_ff=512,
  dropout_rate=0.1,
  max_sequence_length=95
)


In [None]:
history = model.fit(
    x_train, y_train,
    epochs=50,
    batch_size=32,
    verbose=True,
    callbacks=[
        EarlyStopping(monitor='loss', patience=20),
        LearningRateScheduler(schedule="warmup_cosine", initial_learning_rate=0.0001, verbose=True),
        DebugCallback(model, fr_tokenizer, en_tokenizer)
    ],
    validation_data=(x_test, y_test),
    metrics=['bleu_score']
)

Initial learning rate: 0.000100

In [None]:
print("Vocabulary sizes:")
print(f"French vocab size: {len(fr_tokenizer.word_index)}")
print(f"English vocab size: {len(en_tokenizer.word_index)}")

for sent in test_sentences:
    print("\n" + "="*50)
    print(f"Testing: {sent}")
    translation = translate(sent, model, fr_tokenizer, en_tokenizer, temperature=1.2)
    print(f"Translation: {translation}")
    

Vocabulary sizes:
French vocab size: 1946
English vocab size: 1671

Testing: je vais bien
Translation: <SOS> bin

Testing: comment allez-vous ?
Translation: <SOS> bin

Testing: bonjour
Translation: <SOS> bin
