In [1]:
from neuralnetlib.models import Transformer
from neuralnetlib.preprocessing import Tokenizer, pad_sequences
from neuralnetlib.losses import SequenceCrossEntropy
from neuralnetlib.optimizers import Adam
from neuralnetlib.callbacks import EarlyStopping, Callback

In [2]:
def translate(sentence: str, model, fr_tokenizer, en_tokenizer, 
             temperature=0.8, beam_size=10, min_length=2) -> str:
    print(f"\nProcessing: {sentence}")
    
    # Tokenization
    tokens = fr_tokenizer.texts_to_sequences([sentence], preprocess_ponctuation=True)[0]
    print(f"Tokens: {tokens}")
    print(f"Original words: {[fr_tokenizer.index_word.get(t, '<UNK>') for t in tokens]}")
    
    # Add special tokens
    tokens = fr_tokenizer.encode_special_tokens([tokens])[0]
    print(f"With special tokens: {tokens}")
    
    # Padding
    padded = pad_sequences([tokens], max_length=model.max_sequence_length, padding='post', 
                          pad_value=fr_tokenizer.word_index[fr_tokenizer.pad_token])
    print(f"Padded sequence: {padded}")
    
    pred = model.predict(padded, 
                        max_length=model.max_sequence_length,
                        beam_size=beam_size,      
                        alpha=0.6,         
                        min_length=min_length,
                        temperature=temperature)[0]
    
    print(f"Raw prediction: {pred}")
    
    words = []
    for idx in pred[1:]:  # Skip SOS
        if idx == en_tokenizer.word_index[en_tokenizer.eos_token]:
            print(f"Found EOS token: {idx}")
            break
            
        if idx == en_tokenizer.word_index[en_tokenizer.pad_token]:
            print(f"Skipping PAD token: {idx}")
            continue
            
        word = en_tokenizer.index_word.get(idx, en_tokenizer.unk_token)
        print(f"Token {idx} -> word: {word}")
        words.append(word)
    
    if not words:
        print("Warning: Empty translation, using default handling...")
        for idx in pred[1:]:  # Skip SOS
            if idx not in [en_tokenizer.word_index[token] for token in 
                         [en_tokenizer.pad_token, en_tokenizer.eos_token]]:
                word = en_tokenizer.index_word.get(idx, en_tokenizer.unk_token)
                words.append(word)
    
    translation = " ".join(words) if words else "[Translation failed]"
    print(f"Final translation: {translation}")
    return translation

class TrainingMonitor(Callback):
    def __init__(self, model, fr_tokenizer, en_tokenizer, test_sentences):
        self.model = model
        self.fr_tokenizer = fr_tokenizer
        self.en_tokenizer = en_tokenizer
        self.test_sentences = test_sentences
        
    def on_epoch_end(self, epoch, logs=None):
        if epoch % 5 == 0:  # Check every 5 epochs
            print(f"\nEpoch {epoch} validation:")
            for sent in self.test_sentences:
                translation = translate(
                    sent, self.model, self.fr_tokenizer, self.en_tokenizer,
                    temperature=0.8, beam_size=5, min_length=2
                )
                print(f"Input: {sent}")
                print(f"Output: {translation}\n")


test_sentences = [
    "je vais bien",
    "comment allez-vous ?",
    "bonjour"
]

In [3]:
fr_sentences = [
    "bonjour.",
    "au revoir.",
    "merci beaucoup.",
    "s'il vous plaît.",
    "comment allez-vous ?",
    "je vais bien.",
    "je suis fatigué.",
    "je suis content.",
    "quel est votre nom ?",
    "mon nom est Jean.",
    "enchanté de vous rencontrer.",
    "bonne journée.",
    "bonne soirée.",
    "à demain.",
    "j'aime le café.",
    "je n'aime pas le thé.",
    "quelle heure est-il ?",
    "il est trois heures.",
    "où est la gare ?",
    "la gare est près d'ici.",
    "combien ça coûte ?",
    "c'est trop cher.",
    "parlez-vous anglais ?",
    "un peu.",
    "je ne comprends pas.",
    "pouvez-vous répéter ?",
    "je suis désolé.",
    "pas de problème.",
    "bon appétit.",
    "à votre santé.",
    "j'ai faim.",
    "j'ai soif.",
    "il fait beau aujourd'hui.",
    "il pleut.",
    "il fait froid.",
    "il fait chaud.",
    "je travaille ici.",
    "où habitez-vous ?",
    "j'habite à Paris.",
    "quel âge avez-vous ?",
    "j'ai vingt-cinq ans.",
    "avez-vous des frères et sœurs ?",
    "j'ai une sœur.",
    "j'ai un chat.",
    "j'aime voyager.",
    "je suis étudiant.",
    "je suis professeur.",
    "au secours !",
    "joyeux anniversaire !",
    "félicitations !"
]

en_sentences = [
    "hello.",
    "goodbye.",
    "thank you very much.",
    "please.",
    "how are you?",
    "i am fine.",
    "i am tired.",
    "i am happy.",
    "what is your name?",
    "my name is John.",
    "nice to meet you.",
    "have a nice day.",
    "have a good evening.",
    "see you tomorrow.",
    "i like coffee.",
    "i don't like tea.",
    "what time is it?",
    "it is three o'clock.",
    "where is the train station?",
    "the station is nearby.",
    "how much is it?",
    "it's too expensive.",
    "do you speak english?",
    "a little.",
    "i don't understand.",
    "can you repeat?",
    "i am sorry.",
    "no problem.",
    "enjoy your meal.",
    "cheers.",
    "i am hungry.",
    "i am thirsty.",
    "the weather is nice today.",
    "it's raining.",
    "it's cold.",
    "it's hot.",
    "i work here.",
    "where do you live?",
    "i live in Paris.",
    "how old are you?",
    "i am twenty-five years old.",
    "do you have brothers and sisters?",
    "i have a sister.",
    "i have a cat.",
    "i like to travel.",
    "i am a student.",
    "i am a teacher.",
    "help!",
    "happy birthday!",
    "congratulations!"
]

In [4]:
fr_tokenizer = Tokenizer(filters="")  # else the tokenizer would remove the special characters including ponctuation
en_tokenizer = Tokenizer(filters="")  # else the tokenizer would remove the special characters including ponctuation

fr_tokenizer.fit_on_texts(fr_sentences, preprocess_ponctuation=True)
en_tokenizer.fit_on_texts(en_sentences, preprocess_ponctuation=True)

In [5]:
x_train = fr_tokenizer.texts_to_sequences(fr_sentences, preprocess_ponctuation=True)
y_train = en_tokenizer.texts_to_sequences(en_sentences, preprocess_ponctuation=True)

max_len_x = max(len(seq) for seq in x_train)
max_len_y = max(len(seq) for seq in y_train)
max_seq_len = max(max_len_x, max_len_y)

vocab_size_fr = len(fr_tokenizer.word_index)
vocab_size_en = len(en_tokenizer.word_index)
max_vocab_size = max(vocab_size_fr, vocab_size_en)

In [6]:
# Verify all data
print(f"vocab_size_en: {vocab_size_en}, vocab_size_fr: {vocab_size_fr}")
print(f"max_len_x: {max_len_x}, max_len_y: {max_len_y}, max_vocab_size: {max_vocab_size}, max_seq_len: {max_seq_len}")
print("French sentences:")
print(fr_sentences)
print("English sentences:")
print(en_sentences)
print("French tokenizer:")
print(fr_tokenizer.word_index)
print("English tokenizer:")
print(en_tokenizer.word_index)

vocab_size_en: 95, vocab_size_fr: 107
max_len_x: 8, max_len_y: 8, max_vocab_size: 107, max_seq_len: 8
French sentences:
['bonjour.', 'au revoir.', 'merci beaucoup.', "s'il vous plaît.", 'comment allez-vous ?', 'je vais bien.', 'je suis fatigué.', 'je suis content.', 'quel est votre nom ?', 'mon nom est Jean.', 'enchanté de vous rencontrer.', 'bonne journée.', 'bonne soirée.', 'à demain.', "j'aime le café.", "je n'aime pas le thé.", 'quelle heure est-il ?', 'il est trois heures.', 'où est la gare ?', "la gare est près d'ici.", 'combien ça coûte ?', "c'est trop cher.", 'parlez-vous anglais ?', 'un peu.', 'je ne comprends pas.', 'pouvez-vous répéter ?', 'je suis désolé.', 'pas de problème.', 'bon appétit.', 'à votre santé.', "j'ai faim.", "j'ai soif.", "il fait beau aujourd'hui.", 'il pleut.', 'il fait froid.', 'il fait chaud.', 'je travaille ici.', 'où habitez-vous ?', "j'habite à Paris.", 'quel âge avez-vous ?', "j'ai vingt-cinq ans.", 'avez-vous des frères et sœurs ?', "j'ai une sœur."

In [7]:
model = Transformer(
    src_vocab_size=vocab_size_fr,
    tgt_vocab_size=vocab_size_en,
    d_model=32,        
    n_heads=2,         
    n_encoder_layers=2,
    n_decoder_layers=2,
    d_ff=64,           
    dropout_rate=0.4,
    max_sequence_length=max_seq_len,
    random_state=42,
    src_unk_idx=fr_tokenizer.word_index[fr_tokenizer.unk_token],
    src_sos_idx=fr_tokenizer.word_index[fr_tokenizer.sos_token],
    src_eos_idx=fr_tokenizer.word_index[fr_tokenizer.eos_token],
    tgt_unk_idx=en_tokenizer.word_index[en_tokenizer.unk_token],
    tgt_sos_idx=en_tokenizer.word_index[en_tokenizer.sos_token],
    tgt_eos_idx=en_tokenizer.word_index[en_tokenizer.eos_token],
)


model.compile(
    loss_function=SequenceCrossEntropy(
        label_smoothing=0.1,
    ),
    optimizer=Adam(
        learning_rate=0.0001,
        beta_1=0.9,
        beta_2=0.98,
        epsilon=1e-9
    ),
    verbose=True
)

Transformer(
  src_vocab_size=107,
  tgt_vocab_size=95,
  d_model=32,
  n_heads=2,
  n_encoder_layers=2,
  n_decoder_layers=2,
  d_ff=64,
  dropout_rate=0.4,
  max_sequence_length=8
)


In [8]:
x_train_padded, y_train_padded = model.prepare_data(x_train, y_train)

In [9]:
def inspect_training_data(x_train_padded, y_train_padded, fr_tokenizer, en_tokenizer):
    print("\nInspecting training data:")
    for i in range(min(3, len(x_train_padded))):
        print(f"\nExample {i+1}:")
        print("Input sequence:")
        print(f"Raw: {x_train_padded[i]}")
        print("Tokens:", [fr_tokenizer.index_word.get(idx, '<UNK>') for idx in x_train_padded[i]])
        
        print("\nOutput sequence:")
        print(f"Raw: {y_train_padded[i]}")
        print("Tokens:", [en_tokenizer.index_word.get(idx, '<UNK>') for idx in y_train_padded[i]])

inspect_training_data(x_train_padded, y_train_padded, fr_tokenizer, en_tokenizer)


Inspecting training data:

Example 1:
Input sequence:
Raw: [106  29   1 107   0   0   0   0]
Tokens: ['<SOS>', 'bonjour', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']

Output sequence:
Raw: [94 29  1 95  0  0  0  0]
Tokens: ['<SOS>', 'hello', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']

Example 2:
Input sequence:
Raw: [106  16  30   1 107   0   0   0]
Tokens: ['<SOS>', 'au', 'revoir', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>']

Output sequence:
Raw: [94 30  1 95  0  0  0  0]
Tokens: ['<SOS>', 'goodbye', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']

Example 3:
Input sequence:
Raw: [106  31  32   1 107   0   0   0]
Tokens: ['<SOS>', 'merci', 'beaucoup', '.', '<EOS>', '<PAD>', '<PAD>', '<PAD>']

Output sequence:
Raw: [94 31  4 32 17  1 95  0]
Tokens: ['<SOS>', 'thank', 'you', 'very', 'much', '.', '<EOS>', '<PAD>']


In [10]:
monitor = TrainingMonitor(model, fr_tokenizer, en_tokenizer, test_sentences)

history = model.fit(
    x_train_padded, y_train_padded,
    epochs=300,
    batch_size=2,
    verbose=True,
    callbacks=[
        EarlyStopping(monitor='loss', patience=20),
        monitor
    ]
)

[==----------------------------] 8% Epoch 1/300 - loss: 11.1380 -  - 0.01s

Epoch 0 validation:

Processing: je vais bien
Tokens: [3, 37, 38]
Original words: ['je', 'vais', 'bien']
With special tokens: [106, 3, 37, 38, 107]
Padded sequence: [[106   3  37  38 107   0   0   0]]
Raw prediction: [94 77 74 76 78 71 74 74 95]
Token 77 -> word: twenty
Token 74 -> word: here
Token 76 -> word: paris
Token 78 -> word: -
Token 71 -> word: cold
Token 74 -> word: here
Token 74 -> word: here
Found EOS token: 95
Final translation: twenty here paris - cold here here
Input: je vais bien
Output: twenty here paris - cold here here


Processing: comment allez-vous ?
Tokens: [35, 36, 5, 4, 2]
Original words: ['comment', 'allez', '-', 'vous', '?']
With special tokens: [106, 35, 36, 5, 4, 2, 107]
Padded sequence: [[106  35  36   5   4   2 107   0]]
Raw prediction: [94 76 68 79 76 71 71 75 95]
Token 76 -> word: paris
Token 68 -> word: weather
Token 79 -> word: five
Token 76 -> word: paris
Token 71 -> word: cold
Token 71 -> word: cold
Token 75 -> word: in
Found EOS token: 95
Final tra

In [11]:
print("Vocabulary sizes:")
print(f"French vocab size: {len(fr_tokenizer.word_index)}")
print(f"English vocab size: {len(en_tokenizer.word_index)}")

for sent in test_sentences:
    print("\n" + "="*50)
    print(f"Testing: {sent}")
    translation = translate(sent, model, fr_tokenizer, en_tokenizer, 
                      temperature=0.8,
                      beam_size=10,
                      min_length=2) 
    print(f"Translation: {translation}")
    

Vocabulary sizes:
French vocab size: 107
English vocab size: 95

Testing: je vais bien

Processing: je vais bien
Tokens: [3, 37, 38]
Original words: ['je', 'vais', 'bien']
With special tokens: [106, 3, 37, 38, 107]
Padded sequence: [[106   3  37  38 107   0   0   0]]
Raw prediction: [94 92 92 92 92 92 92 92 95]
Token 92 -> word: <UNK>
Token 92 -> word: <UNK>
Token 92 -> word: <UNK>
Token 92 -> word: <UNK>
Token 92 -> word: <UNK>
Token 92 -> word: <UNK>
Token 92 -> word: <UNK>
Found EOS token: 95
Final translation: <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>
Translation: <UNK> <UNK> <UNK> <UNK> <UNK> <UNK> <UNK>

Testing: comment allez-vous ?

Processing: comment allez-vous ?
Tokens: [35, 36, 5, 4, 2]
Original words: ['comment', 'allez', '-', 'vous', '?']
With special tokens: [106, 35, 36, 5, 4, 2, 107]
Padded sequence: [[106  35  36   5   4   2 107   0]]
Raw prediction: [94 92 92 92 92 92 92 92 95]
Token 92 -> word: <UNK>
Token 92 -> word: <UNK>
Token 92 -> word: <UNK>
Token 92 -> word: 

In [12]:
def analyze_attention_weights(model, input_sentence, fr_tokenizer):
    tokens = fr_tokenizer.texts_to_sequences([input_sentence])[0]
    tokens = fr_tokenizer.encode_special_tokens([tokens])[0]
    padded = pad_sequences([tokens], max_length=model.max_sequence_length, padding='post', 
                          pad_value=fr_tokenizer.word_index[fr_tokenizer.pad_token])
    
    _ = model.predict(padded)
    
    attention_weights = model.decoder_layers[-1].cross_attention.attention_weights
    
    print("\nAttention Analysis:")
    print(f"Attention shape: {attention_weights.shape}")
    print("First attention head values:")
    print(attention_weights[0, 0])

for sent in test_sentences[:2]:
    print(f"\nAnalyzing: {sent}")
    analyze_attention_weights(model, sent, fr_tokenizer)


Analyzing: je vais bien

Attention Analysis:
Attention shape: (1, 2, 7, 8)
First attention head values:
[[0.20216132 0.19910566 0.19761014 0.19804745 0.20307543 0.
  0.         0.        ]
 [0.2021579  0.19910481 0.19761539 0.19804832 0.20307358 0.
  0.         0.        ]
 [0.20215832 0.19910442 0.19761517 0.19804813 0.20307396 0.
  0.         0.        ]
 [0.20215844 0.19910475 0.1976139  0.19804914 0.20307377 0.
  0.         0.        ]
 [0.20215822 0.19910514 0.1976144  0.19804846 0.20307379 0.
  0.         0.        ]
 [0.20215825 0.1991049  0.19761434 0.19804884 0.20307367 0.
  0.         0.        ]
 [0.20215869 0.19910526 0.19761368 0.19804887 0.2030735  0.
  0.         0.        ]]

Analyzing: comment allez-vous ?

Attention Analysis:
Attention shape: (1, 2, 7, 8)
First attention head values:
[[0.20198853 0.19804079 0.19821552 0.19933267 0.20242249 0.
  0.         0.        ]
 [0.2019854  0.19804162 0.19821951 0.1993323  0.20242117 0.
  0.         0.        ]
 [0.20198576 0.1