In [1]:
from neuralnetlib.models import Transformer
from neuralnetlib.preprocessing import Tokenizer, pad_sequences
from neuralnetlib.losses import SequenceCrossEntropy
from neuralnetlib.optimizers import Adam
from neuralnetlib.callbacks import EarlyStopping, Callback, LearningRateScheduler

In [2]:
def translate(sentence: str, model, fr_tokenizer, en_tokenizer, 
             temperature=0.8, beam_size=10, min_length=2) -> str:
    print(f"\nProcessing: {sentence}")
    
    # Tokenization
    tokens = fr_tokenizer.texts_to_sequences([sentence], preprocess_ponctuation=True)[0]
    print(f"Tokens: {tokens}")
    print(f"Original words: {[fr_tokenizer.index_word.get(t, '<UNK>') for t in tokens]}")
    
    # Add special tokens
    tokens = fr_tokenizer.encode_special_tokens([tokens])[0]
    print(f"With special tokens: {tokens}")
    
    # Padding
    padded = pad_sequences([tokens], max_length=model.max_sequence_length, padding='post', 
                          pad_value=fr_tokenizer.word_index[fr_tokenizer.pad_token])
    print(f"Padded sequence: {padded}")
    
    pred = model.predict(padded, 
                        max_length=model.max_sequence_length,
                        beam_size=beam_size,      
                        alpha=0.6,         
                        min_length=min_length,
                        temperature=temperature)[0]
    
    print(f"Raw prediction: {pred}")
    
    words = []
    for idx in pred[1:]:  # Skip SOS
        if idx == en_tokenizer.word_index[en_tokenizer.eos_token]:
            print(f"Found EOS token: {idx}")
            break
            
        if idx == en_tokenizer.word_index[en_tokenizer.pad_token]:
            print(f"Skipping PAD token: {idx}")
            continue
            
        word = en_tokenizer.index_word.get(idx, en_tokenizer.unk_token)
        print(f"Token {idx} -> word: {word}")
        words.append(word)
    
    if not words:
        print("Warning: Empty translation, using default handling...")
        for idx in pred[1:]:  # Skip SOS
            if idx not in [en_tokenizer.word_index[token] for token in 
                         [en_tokenizer.pad_token, en_tokenizer.eos_token]]:
                word = en_tokenizer.index_word.get(idx, en_tokenizer.unk_token)
                words.append(word)
    
    translation = " ".join(words) if words else "[Translation failed]"
    print(f"Final translation: {translation}")
    return translation

class TrainingMonitor(Callback):
    def __init__(self, model, fr_tokenizer, en_tokenizer, test_sentences):
        self.model = model
        self.fr_tokenizer = fr_tokenizer
        self.en_tokenizer = en_tokenizer
        self.test_sentences = test_sentences
        
    def on_epoch_end(self, epoch, logs=None):
        if epoch % 5 == 0:  # Check every 5 epochs
            print(f"\nEpoch {epoch} validation:")
            for sent in self.test_sentences:
                translation = translate(
                    sent, self.model, self.fr_tokenizer, self.en_tokenizer,
                    temperature=0.8, beam_size=5, min_length=2
                )
                print(f"Input: {sent}")
                print(f"Output: {translation}\n")


test_sentences = [
    "je vais bien",
    "comment allez-vous ?",
    "bonjour"
]

In [3]:
fr_sentences = [
    "bonjour.",
    "au revoir.",
    "merci beaucoup.",
    "s'il vous plaît.",
    "comment allez-vous ?",
    "je vais bien.",
    "je suis fatigué.",
    "je suis content.",
    "quel est votre nom ?",
    "mon nom est Jean.",
    "enchanté de vous rencontrer.",
    "bonne journée.",
    "bonne soirée.",
    "à demain.",
    "j'aime le café.",
    "je n'aime pas le thé.",
    "quelle heure est-il ?",
    "il est trois heures.",
    "où est la gare ?",
    "la gare est près d'ici.",
    "combien ça coûte ?",
    "c'est trop cher.",
    "parlez-vous anglais ?",
    "un peu.",
    "je ne comprends pas.",
    "pouvez-vous répéter ?",
    "je suis désolé.",
    "pas de problème.",
    "bon appétit.",
    "à votre santé.",
    "j'ai faim.",
    "j'ai soif.",
    "il fait beau aujourd'hui.",
    "il pleut.",
    "il fait froid.",
    "il fait chaud.",
    "je travaille ici.",
    "où habitez-vous ?",
    "j'habite à Paris.",
    "quel âge avez-vous ?",
    "j'ai vingt-cinq ans.",
    "avez-vous des frères et sœurs ?",
    "j'ai une sœur.",
    "j'ai un chat.",
    "j'aime voyager.",
    "je suis étudiant.",
    "je suis professeur.",
    "au secours !",
    "joyeux anniversaire !",
    "félicitations !"
]

en_sentences = [
    "hello.",
    "goodbye.",
    "thank you very much.",
    "please.",
    "how are you?",
    "i am fine.",
    "i am tired.",
    "i am happy.",
    "what is your name?",
    "my name is John.",
    "nice to meet you.",
    "have a nice day.",
    "have a good evening.",
    "see you tomorrow.",
    "i like coffee.",
    "i don't like tea.",
    "what time is it?",
    "it is three o'clock.",
    "where is the train station?",
    "the station is nearby.",
    "how much is it?",
    "it's too expensive.",
    "do you speak english?",
    "a little.",
    "i don't understand.",
    "can you repeat?",
    "i am sorry.",
    "no problem.",
    "enjoy your meal.",
    "cheers.",
    "i am hungry.",
    "i am thirsty.",
    "the weather is nice today.",
    "it's raining.",
    "it's cold.",
    "it's hot.",
    "i work here.",
    "where do you live?",
    "i live in Paris.",
    "how old are you?",
    "i am twenty-five years old.",
    "do you have brothers and sisters?",
    "i have a sister.",
    "i have a cat.",
    "i like to travel.",
    "i am a student.",
    "i am a teacher.",
    "help!",
    "happy birthday!",
    "congratulations!"
]

In [4]:
fr_tokenizer = Tokenizer(filters="")  # else the tokenizer would remove the special characters including ponctuation
en_tokenizer = Tokenizer(filters="")  # else the tokenizer would remove the special characters including ponctuation

fr_tokenizer.fit_on_texts(fr_sentences, preprocess_ponctuation=True)
en_tokenizer.fit_on_texts(en_sentences, preprocess_ponctuation=True)

In [5]:
x_train = fr_tokenizer.texts_to_sequences(fr_sentences, preprocess_ponctuation=True)
y_train = en_tokenizer.texts_to_sequences(en_sentences, preprocess_ponctuation=True)

max_len_x = max(len(seq) for seq in x_train)
max_len_y = max(len(seq) for seq in y_train)
max_seq_len = max(max_len_x, max_len_y)

vocab_size_fr = len(fr_tokenizer.word_index)
vocab_size_en = len(en_tokenizer.word_index)
max_vocab_size = max(vocab_size_fr, vocab_size_en)

In [6]:
# Verify all data
print(f"vocab_size_en: {vocab_size_en}, vocab_size_fr: {vocab_size_fr}")
print(f"max_len_x: {max_len_x}, max_len_y: {max_len_y}, max_vocab_size: {max_vocab_size}, max_seq_len: {max_seq_len}")
print("French sentences:")
print(fr_sentences)
print("English sentences:")
print(en_sentences)
print("French tokenizer:")
print(fr_tokenizer.word_index)
print("English tokenizer:")
print(en_tokenizer.word_index)

vocab_size_en: 95, vocab_size_fr: 107
max_len_x: 8, max_len_y: 8, max_vocab_size: 107, max_seq_len: 8
French sentences:
['bonjour.', 'au revoir.', 'merci beaucoup.', "s'il vous plaît.", 'comment allez-vous ?', 'je vais bien.', 'je suis fatigué.', 'je suis content.', 'quel est votre nom ?', 'mon nom est Jean.', 'enchanté de vous rencontrer.', 'bonne journée.', 'bonne soirée.', 'à demain.', "j'aime le café.", "je n'aime pas le thé.", 'quelle heure est-il ?', 'il est trois heures.', 'où est la gare ?', "la gare est près d'ici.", 'combien ça coûte ?', "c'est trop cher.", 'parlez-vous anglais ?', 'un peu.', 'je ne comprends pas.', 'pouvez-vous répéter ?', 'je suis désolé.', 'pas de problème.', 'bon appétit.', 'à votre santé.', "j'ai faim.", "j'ai soif.", "il fait beau aujourd'hui.", 'il pleut.', 'il fait froid.', 'il fait chaud.', 'je travaille ici.', 'où habitez-vous ?', "j'habite à Paris.", 'quel âge avez-vous ?', "j'ai vingt-cinq ans.", 'avez-vous des frères et sœurs ?', "j'ai une sœur."

In [7]:
model = Transformer(
    src_vocab_size=vocab_size_fr,
    tgt_vocab_size=vocab_size_en,
    d_model=256,        
    n_heads=8,         
    n_encoder_layers=4,
    n_decoder_layers=4,
    d_ff=1024,           
    dropout_rate=0.3,
    max_sequence_length=max_seq_len,
    random_state=42,
)


model.compile(
    loss_function=SequenceCrossEntropy(
        label_smoothing=0.1,
    ),
    optimizer=Adam(
        learning_rate=0.0001,
        beta_1=0.9,
        beta_2=0.98,
        epsilon=1e-9
    ),
    verbose=True
)

Transformer(
  src_vocab_size=107,
  tgt_vocab_size=95,
  d_model=256,
  n_heads=8,
  n_encoder_layers=4,
  n_decoder_layers=4,
  d_ff=1024,
  dropout_rate=0.3,
  max_sequence_length=8
)


In [8]:
x_train_padded, y_train_padded = model.prepare_data(x_train, y_train)

In [9]:
def inspect_training_data(x_train_padded, y_train_padded, fr_tokenizer, en_tokenizer):
    print("\nInspecting training data:")
    for i in range(min(3, len(x_train_padded))):
        print(f"\nExample {i+1}:")
        print("Input sequence:")
        print(f"Raw: {x_train_padded[i]}")
        print("Tokens:", [fr_tokenizer.index_word.get(idx, '<UNK>') for idx in x_train_padded[i]])
        
        print("\nOutput sequence:")
        print(f"Raw: {y_train_padded[i]}")
        print("Tokens:", [en_tokenizer.index_word.get(idx, '<UNK>') for idx in y_train_padded[i]])

inspect_training_data(x_train_padded, y_train_padded, fr_tokenizer, en_tokenizer)


Inspecting training data:

Example 1:
Input sequence:
Raw: [ 2 32  4  3  0  0  0  0]
Tokens: ['<SOS>', 'bonjour', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']

Output sequence:
Raw: [ 2 32  4  3  0  0  0  0]
Tokens: ['<SOS>', 'hello', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']

Example 2:
Input sequence:
Raw: [ 2 19 33  4  3  0  0  0]
Tokens: ['<SOS>', 'au', 'revoir', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>']

Output sequence:
Raw: [ 2 33  4  3  0  0  0  0]
Tokens: ['<SOS>', 'goodbye', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']

Example 3:
Input sequence:
Raw: [ 2 34 35  4  3  0  0  0]
Tokens: ['<SOS>', 'merci', 'beaucoup', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>']

Output sequence:
Raw: [ 2 34  7 35 20  4  3  0]
Tokens: ['<SOS>', 'thank', 'you', 'very', 'much', '.', '<EOS>', '<PAD>']


In [10]:
monitor = TrainingMonitor(model, fr_tokenizer, en_tokenizer, test_sentences)
lr = LearningRateScheduler(
    schedule="warmup_cosine",
    initial_learning_rate=0.0001,
)
history = model.fit(
    x_train_padded, y_train_padded,
    epochs=20,
    batch_size=12,
    verbose=True,
    callbacks=[
        EarlyStopping(monitor='loss', patience=20),
        monitor,
        lr
    ]
)

Epoch 0 validation:

Processing: je vais bien
Tokens: [6, 40, 41]
Original words: ['je', 'vais', 'bien']
With special tokens: [2, 6, 40, 41, 3]
Padded sequence: [[ 2  6 40 41  3  0  0  0]]
Raw prediction: [ 2 74 73 76 69 73 73 76  3]
Token 74 -> word: cold
Token 73 -> word: raining
Token 76 -> word: work
Token 69 -> word: hungry
Token 73 -> word: raining
Token 73 -> word: raining
Token 76 -> word: work
Found EOS token: 3
Final translation: cold raining work hungry raining raining work
Input: je vais bien
Output: cold raining work hungry raining raining work


Processing: comment allez-vous ?
Tokens: [38, 39, 8, 7, 5]
Original words: ['comment', 'allez', '-', 'vous', '?']
With special tokens: [2, 38, 39, 8, 7, 5, 3]
Padded sequence: [[ 2 38 39  8  7  5  3  0]]
Raw prediction: [ 2 78 75 76 69 73 73 76  3]
Token 78 -> word: in
Token 75 -> word: hot
Token 76 -> word: work
Token 69 -> word: hungry
Token 73 -> word: raining
Token 73 -> word: raining
Token 76 -> word: work
Found EOS token: 3


In [11]:
print("Vocabulary sizes:")
print(f"French vocab size: {len(fr_tokenizer.word_index)}")
print(f"English vocab size: {len(en_tokenizer.word_index)}")

for sent in test_sentences:
    print("\n" + "="*50)
    print(f"Testing: {sent}")
    translation = translate(sent, model, fr_tokenizer, en_tokenizer, 
                      temperature=0.8,
                      beam_size=10,
                      min_length=2) 
    print(f"Translation: {translation}")
    

Vocabulary sizes:
French vocab size: 107
English vocab size: 95

Testing: je vais bien

Processing: je vais bien
Tokens: [6, 40, 41]
Original words: ['je', 'vais', 'bien']
With special tokens: [2, 6, 40, 41, 3]
Padded sequence: [[ 2  6 40 41  3  0  0  0]]
Raw prediction: [ 2 73 73 73 73 73 73 73  3]
Token 73 -> word: raining
Token 73 -> word: raining
Token 73 -> word: raining
Token 73 -> word: raining
Token 73 -> word: raining
Token 73 -> word: raining
Token 73 -> word: raining
Found EOS token: 3
Final translation: raining raining raining raining raining raining raining
Translation: raining raining raining raining raining raining raining

Testing: comment allez-vous ?

Processing: comment allez-vous ?
Tokens: [38, 39, 8, 7, 5]
Original words: ['comment', 'allez', '-', 'vous', '?']
With special tokens: [2, 38, 39, 8, 7, 5, 3]
Padded sequence: [[ 2 38 39  8  7  5  3  0]]
Raw prediction: [ 2 73 73 73 73 73 73 73  3]
Token 73 -> word: raining
Token 73 -> word: raining
Token 73 -> word: rai

In [12]:
def analyze_attention_weights(model, input_sentence, fr_tokenizer):
    tokens = fr_tokenizer.texts_to_sequences([input_sentence])[0]
    tokens = fr_tokenizer.encode_special_tokens([tokens])[0]
    padded = pad_sequences([tokens], max_length=model.max_sequence_length, padding='post', 
                          pad_value=fr_tokenizer.word_index[fr_tokenizer.pad_token])
    
    _ = model.predict(padded)
    
    attention_weights = model.decoder_layers[-1].cross_attention.attention_weights
    
    print("\nAttention Analysis:")
    print(f"Attention shape: {attention_weights.shape}")
    print("First attention head values:")
    print(attention_weights[0, 0])

for sent in test_sentences[:2]:
    print(f"\nAnalyzing: {sent}")
    analyze_attention_weights(model, sent, fr_tokenizer)


Analyzing: je vais bien

Attention Analysis:
Attention shape: (1, 8, 7, 8)
First attention head values:
[[0.20029563 0.19942448 0.20725221 0.19974921 0.19327846 0.
  0.         0.        ]
 [0.20029325 0.19943252 0.20725276 0.19975518 0.19326629 0.
  0.         0.        ]
 [0.20028801 0.1994326  0.20725065 0.19975687 0.19327187 0.
  0.         0.        ]
 [0.20028121 0.19943791 0.20725839 0.19975882 0.19326367 0.
  0.         0.        ]
 [0.20027662 0.19944207 0.20725907 0.19975834 0.19326389 0.
  0.         0.        ]
 [0.20027687 0.19944068 0.20725896 0.19975863 0.19326486 0.
  0.         0.        ]
 [0.20028094 0.19943636 0.20725508 0.19976103 0.19326659 0.
  0.         0.        ]]

Analyzing: comment allez-vous ?

Attention Analysis:
Attention shape: (1, 8, 7, 8)
First attention head values:
[[0.20057462 0.19977678 0.20656328 0.19895658 0.19412874 0.
  0.         0.        ]
 [0.20057219 0.19978558 0.20656474 0.19896115 0.19411634 0.
  0.         0.        ]
 [0.2005668  0.1