In [12]:
import re
import numpy as np
import pandas as pd
from neuralnetlib.models import Transformer
from neuralnetlib.preprocessing import Tokenizer, pad_sequences
from neuralnetlib.losses import SequenceCrossEntropy
from neuralnetlib.optimizers import Adam
from neuralnetlib.callbacks import EarlyStopping, Callback, LearningRateScheduler

In [13]:
def translate(text, transformer, fr_tokenizer, en_tokenizer, temperature: float = 1.0):
    sequence = fr_tokenizer.texts_to_sequences([text], add_special_tokens=True)[0]

    encoder_input = pad_sequences([sequence], max_length=transformer.max_sequence_length, padding='post', pad_value=transformer.PAD_IDX)
    
    output_sequence = transformer.predict(encoder_input, max_length=transformer.max_sequence_length, temperature=temperature)

    translated_text = en_tokenizer.sequences_to_texts([output_sequence[0].tolist()[1:]])[0]  # remove the start token with [1:]
    
    return translated_text

class DebugCallback(Callback):
    def __init__(self, model, fr_tokenizer, en_tokenizer):
        self.model = model
        self.fr_tokenizer = fr_tokenizer
        self.en_tokenizer = en_tokenizer
    def on_epoch_end(self, epoch, logs=None):
        print(f"\nEpoch {epoch} details:")
        print(f"Loss: {logs['loss']:.4f}")
        
        test_sent = "bonjour"
        translation = translate(test_sent, self.model, self.fr_tokenizer, self.en_tokenizer)
        print(f"Test translation: {test_sent} -> {translation}")

test_sentences = [
    "je vais bien",
    "comment allez-vous ?",
    "bonjour"
]

In [14]:
df = pd.read_csv("dataset.tsv", sep="\t")
df.iloc[:, 1] = df.iloc[:, 1].apply(lambda x: re.sub(r'\\x[a-fA-F0-9]{2}|\\u[a-fA-F0-9]{4}', ' ', x))  # remove unicode characters

# Shuffle
rng = np.random.default_rng(42)
df = df.sample(frac=1, random_state=rng)

LIMIT = 1000
fr_sentences = df.iloc[:, 1].values.tolist()[0:LIMIT]
en_sentences = df.iloc[:, 3].values.tolist()[0:LIMIT]

In [15]:
fr_tokenizer = Tokenizer(filters="")  # else the tokenizer would remove the special characters including ponctuation
en_tokenizer = Tokenizer(filters="")  # else the tokenizer would remove the special characters including ponctuation

fr_tokenizer.fit_on_texts(fr_sentences, preprocess_ponctuation=True)
en_tokenizer.fit_on_texts(en_sentences, preprocess_ponctuation=True)

In [16]:
x_train = fr_tokenizer.texts_to_sequences(fr_sentences, preprocess_ponctuation=True, add_special_tokens=True)
y_train = en_tokenizer.texts_to_sequences(en_sentences, preprocess_ponctuation=True, add_special_tokens=True)

max_len_x = max(len(seq) for seq in x_train)
max_len_y = max(len(seq) for seq in y_train)
max_seq_len = max(max_len_x, max_len_y)

vocab_size_fr = len(fr_tokenizer.word_index)
vocab_size_en = len(en_tokenizer.word_index)
max_vocab_size = max(vocab_size_fr, vocab_size_en)

In [17]:
# Verify all data
print(f"vocab_size_en: {vocab_size_en}, vocab_size_fr: {vocab_size_fr}")
print(f"max_len_x: {max_len_x}, max_len_y: {max_len_y}, max_vocab_size: {max_vocab_size}, max_seq_len: {max_seq_len}")
print("French sentences:")
print(fr_sentences)
print("English sentences:")
print(en_sentences)
print("French tokenizer:")
print(fr_tokenizer.word_index)
print("English tokenizer:")
print(en_tokenizer.word_index)

vocab_size_en: 1671, vocab_size_fr: 1946
max_len_x: 84, max_len_y: 67, max_vocab_size: 1946, max_seq_len: 84
French sentences:
['De plus en plus de salariés se lassent du baratin des entreprises et font fuiter des documents internes.', "Je n'aime pas apprendre aux enfants à nager quand leurs parents sont à côté.", 'Marie demanda un prêt à sa famille.', "J'étais en train de penser à me marier.", "Je ne me rappelle rien d'autre.", 'Elle travaille comme effeuilleuse.', 'Les pâtes réchauffées ne sont jamais bonnes.', 'Mange si tu as faim\xa0!', 'Il sait aussi parler un peu de grec.', 'Il y a un grand parc près de mon école.', "Ne te fais pas de souci à ce sujet, d'accord ?", 'Tu ne fais que fuir les problèmes de la vie.', 'Je serais ravi si tu pouvais te joindre à nous pour déjeuner.', "Le couteau n'est pas aiguisé.", 'Avez-vous une carte de fidélité\u202f?', "Je ne t'ai jamais vu ainsi.", 'Ziri fait de la randonnée.', 'Je ne connais personne de ce nom.', "Je m'en suis sorti.", 'Les États-

In [7]:
model = Transformer(
    src_vocab_size=vocab_size_fr,
    tgt_vocab_size=vocab_size_en,
    d_model=128,
    n_heads=4,
    n_encoder_layers=2,
    n_decoder_layers=2,
    d_ff=512,
    dropout_rate=0.1,
    max_sequence_length=max_seq_len,
    random_state=42,
)


model.compile(
    loss_function=SequenceCrossEntropy(
        label_smoothing=0.1,
    ),
    optimizer=Adam(
        learning_rate=0.0001,
        beta_1=0.9,
        beta_2=0.98,
        epsilon=1e-9,
        clip_norm=1.0,
    ),
    verbose=True
)

Transformer(
  src_vocab_size=5099,
  tgt_vocab_size=4278,
  d_model=128,
  n_heads=4,
  n_encoder_layers=2,
  n_decoder_layers=2,
  d_ff=512,
  dropout_rate=0.1,
  max_sequence_length=95
)


In [8]:
history = model.fit(
    x_train, y_train,
    epochs=50,
    batch_size=12,
    verbose=True,
    callbacks=[
        EarlyStopping(monitor='loss', patience=20),
        LearningRateScheduler(schedule="warmup_cosine", initial_learning_rate=0.0001, verbose=True),
        DebugCallback(model, fr_tokenizer, en_tokenizer)
    ]
)

Initial learning rate: 0.000100
Epoch 0 details:
Loss: 1.3165
Test translation: bonjour -> poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets poets

Epoch 1 details:
Loss: 0.0110
Test translation: bonjour -> identity prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime prime 

KeyboardInterrupt: 

In [10]:
print("Vocabulary sizes:")
print(f"French vocab size: {len(fr_tokenizer.word_index)}")
print(f"English vocab size: {len(en_tokenizer.word_index)}")

for sent in test_sentences:
    print("\n" + "="*50)
    print(f"Testing: {sent}")
    translation = translate(sent, model, fr_tokenizer, en_tokenizer, temperature=1.2)
    print(f"Translation: {translation}")
    

Vocabulary sizes:
French vocab size: 5099
English vocab size: 4278

Testing: je vais bien
Translation: . <SOS> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

Testing: comment allez-vous ?
Translation: . <SOS> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

Testing: bonjour
Translation: . <SOS> . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
