In [1]:
import numpy as np
from neuralnetlib.models import Transformer
from neuralnetlib.preprocessing import Tokenizer, pad_sequences

In [2]:
# Tiny parallel corpus: each tuple is (French sentence, English translation).
# Keeping the pairs side by side makes it impossible for the two lists to
# drift out of alignment.
sentence_pairs = [
    ("je suis heureux.", "i am happy."),
    ("j'aime les chats.", "i like cats."),
    ("bonjour le monde.", "hello world."),
    ("au revoir.", "goodbye."),
    ("comment allez-vous ?", "how are you?"),
]

fr_sentences = [french for french, _ in sentence_pairs]
en_sentences = [english for _, english in sentence_pairs]

In [3]:
# One tokenizer per language; each is fitted only on its own side of the corpus.
# Words not seen during fitting map to the "<OOV>" entry.

# French (source side)
fr_tokenizer = Tokenizer(oov_token="<OOV>")
fr_tokenizer.fit_on_texts(fr_sentences, preprocess_ponctuation=True)

# English (target side)
en_tokenizer = Tokenizer(oov_token="<OOV>")
en_tokenizer.fit_on_texts(en_sentences, preprocess_ponctuation=True)

In [4]:
# Encode both sides of the corpus as token-id sequences.
x_train = fr_tokenizer.texts_to_sequences(fr_sentences, preprocess_ponctuation=True)
y_train = en_tokenizer.texts_to_sequences(en_sentences, preprocess_ponctuation=True)

# Longest sequence per language, and the longest overall.
max_len_x = max(map(len, x_train))
max_len_y = max(map(len, y_train))
max_seq_len = max(max_len_x, max_len_y)

# Vocabulary sizes as reported by each tokenizer's word_index.
vocab_size_fr, vocab_size_en = (
    len(tok.word_index) for tok in (fr_tokenizer, en_tokenizer)
)
# The +4 presumably leaves room for the model's special tokens
# (PAD / UNK / SOS / EOS), matching the "+ 4" id shift used at inference
# time -- TODO confirm against the Transformer implementation.
max_vocab_size = 4 + max(vocab_size_fr, vocab_size_en)

In [5]:
# Sanity check: dump the derived sizes, the raw corpus and both vocabularies.
print(f"vocab_size_en: {vocab_size_en}, vocab_size_fr: {vocab_size_fr}")
print(f"max_len_x: {max_len_x}, max_len_y: {max_len_y}, max_vocab_size: {max_vocab_size}, max_seq_len: {max_seq_len}")

# Each call prints the label and the value on separate lines (sep="\n").
print("French sentences:", fr_sentences, sep="\n")
print("English sentences:", en_sentences, sep="\n")
print("French tokenizer:", fr_tokenizer.word_index, sep="\n")
print("English tokenizer:", en_tokenizer.word_index, sep="\n")

vocab_size_en: 12, vocab_size_fr: 16
max_len_x: 5, max_len_y: 4, max_vocab_size: 20, max_seq_len: 5
French sentences:
['je suis heureux.', "j'aime les chats.", 'bonjour le monde.', 'au revoir.', 'comment allez-vous ?']
English sentences:
['i am happy.', 'i like cats.', 'hello world.', 'goodbye.', 'how are you?']
French tokenizer:
{'je': 1, 'suis': 2, 'heureux': 3, "j'": 4, 'aime': 5, 'les': 6, 'chats': 7, 'bonjour': 8, 'le': 9, 'monde': 10, 'au': 11, 'revoir': 12, 'comment': 13, 'allez': 14, 'vous': 15, '<OOV>': 16}
English tokenizer:
{'i': 1, 'am': 2, 'happy': 3, 'like': 4, 'cats': 5, 'hello': 6, 'world': 7, 'goodbye': 8, 'how': 9, 'are': 10, 'you': 11, '<OOV>': 12}


In [6]:
# Hyperparameters for the Transformer, gathered in one place so they are easy
# to read and tweak. vocab_size and max_sequence_length come from the corpus
# statistics computed earlier in the notebook.
transformer_config = dict(
    vocab_size=max_vocab_size,
    d_model=128,
    n_heads=4,
    n_encoder_layers=2,
    n_decoder_layers=2,
    d_ff=256,
    dropout_rate=0.2,
    max_sequence_length=max_seq_len,
    temperature=0.7,
    random_state=42,  # fixed seed for reproducible runs
)
model = Transformer(**transformer_config)

# Attach the loss and optimizer; verbose=True asks the library for extra output.
model.compile(loss_function='sequencecrossentropy', optimizer='adam', verbose=True)

Transformer(
  vocab_size=24,
  d_model=128,
  n_heads=4,
  n_encoder_layers=2,
  n_decoder_layers=2,
  d_ff=256,
  dropout_rate=0.2,
  max_sequence_length=5
)


In [7]:
# Let the model convert the raw token-id sequences into its training format.
# NOTE(review): presumably this adds SOS/EOS, shifts ids past the special
# tokens and pads to a common length -- confirm against Transformer.prepare_data.
x_train_padded, y_train_padded = model.prepare_data(x_train, y_train)

In [8]:
# Train for 50 epochs. batch_size=5 matches the number of sentence pairs in
# the corpus, so each epoch is presumably a single batch.
history = model.fit(x_train_padded, y_train_padded, epochs=50, batch_size=5, verbose=True)




In [9]:
def translate(sentence: str, model, fr_tokenizer, en_tokenizer,
              max_input_len=None, max_output_len=None) -> str:
    """Translate one French sentence to English with a trained model.

    The sentence is tokenized with ``fr_tokenizer``, wrapped in the model's
    SOS/EOS markers, padded, and passed to ``model.predict``. The predicted
    ids are mapped back to words with ``en_tokenizer``.

    Args:
        sentence: French input text.
        model: Trained model exposing ``predict`` and the ``PAD_IDX``,
            ``UNK_IDX``, ``SOS_IDX`` and ``EOS_IDX`` attributes.
        fr_tokenizer: Tokenizer fitted on the French corpus.
        en_tokenizer: Tokenizer fitted on the English corpus; its
            ``index_word`` mapping is used for decoding.
        max_input_len: Length the encoded input is padded to. Defaults to the
            notebook-level ``max_len_x``.
        max_output_len: ``max_length`` passed to ``model.predict``. Defaults
            to the notebook-level ``max_seq_len``.

    Returns:
        The decoded words joined by single spaces. Ids with no entry in
        ``en_tokenizer.index_word`` are rendered as ``"<OOV>"``.
    """
    # Fall back to the notebook globals only when the caller gives no explicit
    # lengths, so the function can also be used outside this notebook's state.
    if max_input_len is None:
        max_input_len = max_len_x
    if max_output_len is None:
        max_output_len = max_seq_len

    tokens = fr_tokenizer.texts_to_sequences([sentence], preprocess_ponctuation=True)[0]
    # Shift tokenizer ids by 4 so they do not collide with the model's special
    # indices (presumably PAD/UNK/SOS/EOS occupy 0-3 -- TODO confirm).
    tokens = [model.SOS_IDX] + [t + 4 for t in tokens] + [model.EOS_IDX]
    # NOTE(review): SOS/EOS add two tokens, so a sentence that is already
    # max_input_len tokens long exceeds max_length here. How pad_sequences
    # treats over-long input is not visible in this notebook -- confirm that
    # it does not drop the EOS marker.
    padded = pad_sequences([tokens], max_length=max_input_len, padding='post', pad_value=model.PAD_IDX)

    pred = model.predict(padded, max_length=max_output_len)[0]

    skipped = (model.PAD_IDX, model.UNK_IDX, model.SOS_IDX)
    words = []
    # The first predicted position is skipped (pred[1:]); decoding stops at
    # the first EOS.
    for idx in pred[1:]:
        if idx == model.EOS_IDX:
            break
        if idx in skipped:
            continue
        # Undo the +4 shift before looking the id up in the English vocabulary.
        words.append(en_tokenizer.index_word.get(idx - 4, "<OOV>"))

    return " ".join(words)

# Smoke test on sentences taken from the training corpus: print each French
# input followed by the model's English output.
test_sentences = ["je suis heureux.", "comment allez-vous ?", "bonjour le monde."]

for sent in test_sentences:
    # The source line is printed before translating, so it appears ahead of
    # any output the model may emit during prediction.
    print(f"FR: {sent}")
    translation = translate(sent, model, fr_tokenizer, en_tokenizer)
    print(f"EN: {translation}\n")


FR: je suis heureux.
EN: goodbye goodbye goodbye goodbye

FR: comment allez-vous ?
EN: goodbye goodbye goodbye goodbye

FR: bonjour le monde.
EN: goodbye goodbye goodbye goodbye

