In [1]:
import numpy as np
from neuralnetlib.models import Transformer
from neuralnetlib.preprocessing import Tokenizer, pad_sequences
from neuralnetlib.losses import SequenceCrossEntropy
from neuralnetlib.optimizers import Adam
from neuralnetlib.callbacks import EarlyStopping, Callback, LearningRateScheduler

In [2]:
def translate(text, transformer, fr_tokenizer, en_tokenizer, temperature: float = 1.0):
    """Translate a single source sentence with a trained transformer.

    Args:
        text: Raw source sentence to translate.
        transformer: Model exposing ``predict``, ``max_sequence_length`` and ``PAD_IDX``.
        fr_tokenizer: Tokenizer used to encode ``text`` into token ids.
        en_tokenizer: Tokenizer used to decode the predicted token ids back to text.
        temperature: Forwarded unchanged to ``transformer.predict``.

    Returns:
        The decoded text of the first predicted sequence, with its leading
        (start) token dropped.
    """
    # Encode with the same options the training data is encoded with
    # (preprocess_ponctuation=True, add_special_tokens=True). Without the
    # punctuation preprocessing, an input such as "bonjour." may be split into
    # tokens that do not match the fitted vocabulary.
    source_ids = fr_tokenizer.texts_to_sequences(
        [text], preprocess_ponctuation=True, add_special_tokens=True
    )[0]

    # Pad the single-sentence batch on the right up to the model's maximum length.
    encoder_input = pad_sequences(
        [source_ids],
        max_length=transformer.max_sequence_length,
        padding='post',
        pad_value=transformer.PAD_IDX,
    )

    predicted = transformer.predict(
        encoder_input,
        max_length=transformer.max_sequence_length,
        temperature=temperature,
    )

    # Only one sentence was submitted, so keep row 0; [1:] removes the start token.
    target_ids = predicted[0].tolist()[1:]
    return en_tokenizer.sequences_to_texts([target_ids])[0]

class DebugCallback(Callback):
    """Training callback that prints the epoch loss and a sample translation.

    Args:
        model: Transformer being trained; passed to ``translate`` at each epoch end.
        fr_tokenizer: Tokenizer used to encode the probe sentence.
        en_tokenizer: Tokenizer used to decode the model output.
        test_sentence: Source sentence translated after every epoch
            (defaults to "bonjour").
    """

    def __init__(self, model, fr_tokenizer, en_tokenizer, test_sentence: str = "bonjour"):
        self.model = model
        self.fr_tokenizer = fr_tokenizer
        self.en_tokenizer = en_tokenizer
        self.test_sentence = test_sentence

    def on_epoch_end(self, epoch, logs=None):
        """Print the loss found in ``logs`` (if any) and translate the probe sentence."""
        # `logs` defaults to None and may lack 'loss'; do not crash the training
        # loop just because the debug output has nothing to show.
        loss = (logs or {}).get('loss')

        print(f"\nEpoch {epoch} details:")
        if loss is not None:
            print(f"Loss: {loss:.4f}")
        else:
            print("Loss: n/a")

        test_sent = self.test_sentence
        translation = translate(test_sent, self.model, self.fr_tokenizer, self.en_tokenizer)
        print(f"Test translation: {test_sent} -> {translation}")

# French probe sentences translated after training to eyeball the model's output.
test_sentences = ["je vais bien", "comment allez-vous ?", "bonjour"]

In [3]:
# Parallel French -> English corpus, kept as (french, english) pairs so the two
# sides cannot drift out of alignment when a sentence is added or removed.
_translation_pairs = [
    ("bonjour.", "hello."),
    ("au revoir.", "goodbye."),
    ("merci beaucoup.", "thank you very much."),
    ("s'il vous plaît.", "please."),
    ("comment allez-vous ?", "how are you?"),
    ("je vais bien.", "i am fine."),
    ("je suis fatigué.", "i am tired."),
    ("je suis content.", "i am happy."),
    ("quel est votre nom ?", "what is your name?"),
    ("mon nom est Jean.", "my name is John."),
    ("enchanté de vous rencontrer.", "nice to meet you."),
    ("bonne journée.", "have a nice day."),
    ("bonne soirée.", "have a good evening."),
    ("à demain.", "see you tomorrow."),
    ("j'aime le café.", "i like coffee."),
    ("je n'aime pas le thé.", "i don't like tea."),
    ("quelle heure est-il ?", "what time is it?"),
    ("il est trois heures.", "it is three o'clock."),
    ("où est la gare ?", "where is the train station?"),
    ("la gare est près d'ici.", "the station is nearby."),
    ("combien ça coûte ?", "how much is it?"),
    ("c'est trop cher.", "it's too expensive."),
    ("parlez-vous anglais ?", "do you speak english?"),
    ("un peu.", "a little."),
    ("je ne comprends pas.", "i don't understand."),
    ("pouvez-vous répéter ?", "can you repeat?"),
    ("je suis désolé.", "i am sorry."),
    ("pas de problème.", "no problem."),
    ("bon appétit.", "enjoy your meal."),
    ("à votre santé.", "cheers."),
    ("j'ai faim.", "i am hungry."),
    ("j'ai soif.", "i am thirsty."),
    ("il fait beau aujourd'hui.", "the weather is nice today."),
    ("il pleut.", "it's raining."),
    ("il fait froid.", "it's cold."),
    ("il fait chaud.", "it's hot."),
    ("je travaille ici.", "i work here."),
    ("où habitez-vous ?", "where do you live?"),
    ("j'habite à Paris.", "i live in Paris."),
    ("quel âge avez-vous ?", "how old are you?"),
    ("j'ai vingt-cinq ans.", "i am twenty-five years old."),
    ("avez-vous des frères et soeurs ?", "do you have brothers and sisters?"),
    ("j'ai une soeur.", "i have a sister."),
    ("j'ai un chat.", "i have a cat."),
    ("j'aime voyager.", "i like to travel."),
    ("je suis étudiant.", "i am a student."),
    ("je suis professeur.", "i am a teacher."),
    ("au secours !", "help!"),
    ("joyeux anniversaire !", "happy birthday!"),
    ("félicitations !", "congratulations!"),
]

# The tokenizers and the training call consume each side as its own plain list;
# index i of one list is the translation of index i of the other.
fr_sentences = [french for french, _english in _translation_pairs]
en_sentences = [english for _french, english in _translation_pairs]

In [4]:
# filters="" on both tokenizers: otherwise the tokenizer would remove the
# special characters, including punctuation.

# French side: build the tokenizer, then fit it on the French corpus.
fr_tokenizer = Tokenizer(filters="")
fr_tokenizer.fit_on_texts(fr_sentences, preprocess_ponctuation=True)

# English side: same configuration, fitted on the English corpus.
en_tokenizer = Tokenizer(filters="")
en_tokenizer.fit_on_texts(en_sentences, preprocess_ponctuation=True)

In [5]:
# Encode both sides of the corpus with punctuation preprocessing and special
# tokens enabled.
x_train = fr_tokenizer.texts_to_sequences(
    fr_sentences, preprocess_ponctuation=True, add_special_tokens=True
)
y_train = en_tokenizer.texts_to_sequences(
    en_sentences, preprocess_ponctuation=True, add_special_tokens=True
)

# Longest encoded sentence on each side; the larger of the two is the sequence
# length handed to the model.
max_len_x = max(map(len, x_train))
max_len_y = max(map(len, y_train))
max_seq_len = max_len_x if max_len_x >= max_len_y else max_len_y

# Vocabulary sizes taken from each tokenizer's word index.
vocab_size_fr, vocab_size_en = len(fr_tokenizer.word_index), len(en_tokenizer.word_index)
max_vocab_size = vocab_size_fr if vocab_size_fr >= vocab_size_en else vocab_size_en

In [6]:
# Sanity check: show the computed sizes, then each corpus and each word index.
print(f"vocab_size_en: {vocab_size_en}, vocab_size_fr: {vocab_size_fr}")
print(f"max_len_x: {max_len_x}, max_len_y: {max_len_y}, max_vocab_size: {max_vocab_size}, max_seq_len: {max_seq_len}")

# Each entry is printed as a heading line followed by the object itself.
for heading, payload in (
    ("French sentences:", fr_sentences),
    ("English sentences:", en_sentences),
    ("French tokenizer:", fr_tokenizer.word_index),
    ("English tokenizer:", en_tokenizer.word_index),
):
    print(heading)
    print(payload)

vocab_size_en: 95, vocab_size_fr: 107
max_len_x: 10, max_len_y: 10, max_vocab_size: 107, max_seq_len: 10
French sentences:
['bonjour.', 'au revoir.', 'merci beaucoup.', "s'il vous plaît.", 'comment allez-vous ?', 'je vais bien.', 'je suis fatigué.', 'je suis content.', 'quel est votre nom ?', 'mon nom est Jean.', 'enchanté de vous rencontrer.', 'bonne journée.', 'bonne soirée.', 'à demain.', "j'aime le café.", "je n'aime pas le thé.", 'quelle heure est-il ?', 'il est trois heures.', 'où est la gare ?', "la gare est près d'ici.", 'combien ça coûte ?', "c'est trop cher.", 'parlez-vous anglais ?', 'un peu.', 'je ne comprends pas.', 'pouvez-vous répéter ?', 'je suis désolé.', 'pas de problème.', 'bon appétit.', 'à votre santé.', "j'ai faim.", "j'ai soif.", "il fait beau aujourd'hui.", 'il pleut.', 'il fait froid.', 'il fait chaud.', 'je travaille ici.', 'où habitez-vous ?', "j'habite à Paris.", 'quel âge avez-vous ?', "j'ai vingt-cinq ans.", 'avez-vous des frères et soeurs ?', "j'ai une so

In [7]:
# Build the French -> English transformer; vocabulary sizes and the maximum
# sequence length come from the tokenized corpus.
model = Transformer(
    src_vocab_size=vocab_size_fr,
    tgt_vocab_size=vocab_size_en,
    d_model=128,
    n_heads=4,
    n_encoder_layers=2,
    n_decoder_layers=2,
    d_ff=512,
    dropout_rate=0.1,
    max_sequence_length=max_seq_len,
    random_state=42,
)

# Training objective: sequence cross-entropy with label smoothing.
loss_fn = SequenceCrossEntropy(label_smoothing=0.1)

# Adam optimizer with norm clipping.
optimizer = Adam(
    learning_rate=0.0001,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
    clip_norm=1.0,
)

model.compile(loss_function=loss_fn, optimizer=optimizer, verbose=True)

Transformer(
  src_vocab_size=107,
  tgt_vocab_size=95,
  d_model=128,
  n_heads=4,
  n_encoder_layers=2,
  n_decoder_layers=2,
  d_ff=512,
  dropout_rate=0.1,
  max_sequence_length=10
)


In [8]:
# Callbacks, in the order they are handed to fit():
#   - EarlyStopping watches 'loss' with a patience of 20,
#   - LearningRateScheduler applies the "warmup_cosine" schedule,
#   - DebugCallback prints the loss and a sample translation after each epoch.
training_callbacks = [
    EarlyStopping(monitor='loss', patience=20),
    LearningRateScheduler(schedule="warmup_cosine", initial_learning_rate=0.0001, verbose=True),
    DebugCallback(model, fr_tokenizer, en_tokenizer),
]

history = model.fit(
    x_train,
    y_train,
    epochs=50,
    batch_size=12,
    verbose=True,
    callbacks=training_callbacks,
)

Initial learning rate: 0.000100
Epoch 0 details:
Loss: 10.1493
Test translation: bonjour -> a ! <UNK> train please goodbye don't ! travel

Epoch 1 details:
Loss: 8.8985
Test translation: bonjour -> a ! understand train please goodbye don't ! travel

Epoch 2 details:
Loss: 7.3460
Test translation: bonjour -> a ! understand train please goodbye don't ! work

Epoch 3 details:
Loss: 5.7090
Test translation: bonjour -> a ! <UNK> train please goodbye don't ! work

Epoch 4 details:
Loss: 4.0713
Test translation: bonjour -> hungry don't <UNK> understand please a don't don't work

Epoch 5 details:
Loss: 3.6026
Test translation: bonjour -> a don't <UNK> understand please a don't don't work

Epoch 6 details:
Loss: 4.1277
Test translation: bonjour -> a don't <UNK> work please a don't don't work

Epoch 7 details:
Loss: 3.2000
Test translation: bonjour -> a don't <UNK> work please a don't i work

Epoch 8 details:
Loss: 3.5087
Test translation: bonjour -> a don't <UNK> a . a don't don't work

Epoch 9

In [9]:
# Report the size of each tokenizer's word index.
fr_vocab_count = len(fr_tokenizer.word_index)
en_vocab_count = len(en_tokenizer.word_index)
print("Vocabulary sizes:")
print(f"French vocab size: {fr_vocab_count}")
print(f"English vocab size: {en_vocab_count}")

# Translate every probe sentence with a temperature of 0.8, one banner per sentence.
banner = "\n" + "=" * 50
for sent in test_sentences:
    print(banner)
    print(f"Testing: {sent}")
    translation = translate(sent, model, fr_tokenizer, en_tokenizer, temperature=0.8)
    print(f"Translation: {translation}")
    

Vocabulary sizes:
French vocab size: 107
English vocab size: 95

Testing: je vais bien
Translation: . . . . . . . . .

Testing: comment allez-vous ?
Translation: . . . . . . . . .

Testing: bonjour
Translation: . . . . . . . . .
