In [None]:
import numpy as np
from neuralnetlib.models import Transformer
from neuralnetlib.preprocessing import Tokenizer, pad_sequences

In [2]:
fr_sentences = [
    "je suis heureux.",
    "j'aime les chats.",
    "bonjour le monde.",
    "au revoir.",
    "comment allez-vous ?",
]

en_sentences = [
    "i am happy.",
    "i like cats.",
    "hello world.",
    "goodbye.",
    "how are you?",
]

In [3]:
fr_tokenizer = Tokenizer(oov_token="<OOV>")
en_tokenizer = Tokenizer(oov_token="<OOV>")

fr_tokenizer.fit_on_texts(fr_sentences, preprocess_ponctuation=True)
en_tokenizer.fit_on_texts(en_sentences, preprocess_ponctuation=True)

In [4]:
x_train = fr_tokenizer.texts_to_sequences(fr_sentences, preprocess_ponctuation=True)
y_train = en_tokenizer.texts_to_sequences(en_sentences, preprocess_ponctuation=True)

max_len_x = max(len(seq) for seq in x_train)
max_len_y = max(len(seq) for seq in y_train)

x_train_padded = pad_sequences(x_train, max_length=max_len_x, padding='post')
y_train_padded = pad_sequences(y_train, max_length=max_len_y, padding='post')

vocab_size_fr = len(fr_tokenizer.word_index) + 1
vocab_size_en = len(en_tokenizer.word_index) + 1
max_seq_len = max(max_len_x, max_len_y) + 2

In [5]:
# Verify all data
print("French sentences:")
print(fr_sentences)
print("English sentences:")
print(en_sentences)
print("French tokenizer:")
print(fr_tokenizer.word_index)
print("English tokenizer:")
print(en_tokenizer.word_index)
print("Padded French sequences:")
print(x_train_padded)
print("Padded English sequences:")
print(y_train_padded)

French sentences:
['je suis heureux.', "j'aime les chats.", 'bonjour le monde.', 'au revoir.', 'comment allez-vous ?']
English sentences:
['i am happy.', 'i like cats.', 'hello world.', 'goodbye.', 'how are you?']
French tokenizer:
{'je': 1, 'suis': 2, 'heureux': 3, "j'": 4, 'aime': 5, 'les': 6, 'chats': 7, 'bonjour': 8, 'le': 9, 'monde': 10, 'au': 11, 'revoir': 12, 'comment': 13, 'allez': 14, 'vous': 15, '<OOV>': 16}
English tokenizer:
{'i': 1, 'am': 2, 'happy': 3, 'like': 4, 'cats': 5, 'hello': 6, 'world': 7, 'goodbye': 8, 'how': 9, 'are': 10, 'you': 11, '<OOV>': 12}
Padded French sequences:
[[ 1  2  3 16  0]
 [ 4  5  6  7 16]
 [ 8  9 10 16  0]
 [11 12 16  0  0]
 [13 14 16 15 16]]
Padded English sequences:
[[ 1  2  3 12]
 [ 1  4  5 12]
 [ 6  7 12  0]
 [ 8 12  0  0]
 [ 9 10 11 12]]


In [None]:
model = Transformer(
    vocab_size=max(vocab_size_fr, vocab_size_en),
    d_model=128,
    n_heads=4,
    n_encoder_layers=3,
    n_decoder_layers=3,
    d_ff=512,
    dropout_rate=0.1,
    max_sequence_length=max_seq_len,
    random_state=42
)

# Add start and end tokens to the target sequences
y_train_with_tokens = np.zeros((y_train_padded.shape[0], max_len_y + 2), dtype=int)
y_train_with_tokens[:, 0] = model.SOS_IDX 
y_train_with_tokens[:, -1] = model.EOS_IDX
y_train_with_tokens[:, 1:-1] = y_train_padded

model.compile(
    loss_function='sequencecrossentropy',
    optimizer='adam',
    verbose=True
)

Transformer(
  vocab_size=20,
  d_model=128,
  n_heads=4,
  n_encoder_layers=3,
  n_decoder_layers=3,
  d_ff=512,
  dropout_rate=0.1,
  max_sequence_length=7
)


In [8]:
history = model.fit(
    x_train_padded, y_train_with_tokens,
    epochs=100,
    batch_size=1,
    verbose=True
)




In [9]:
test_sentences = [
    "je suis heureux.",
    "comment allez-vous ?",
    "bonjour le monde."
]

for fr_sent in test_sentences:
    # Convert input sentence to sequence
    input_seq = [fr_tokenizer.word_index.get(word, model.PAD_IDX) for word in fr_sent.split()]
    
    # Pad the input sequence to max_seq_len
    input_seq = np.pad(
        [input_seq], 
        ((0, 0), (0, max_seq_len - len(input_seq))),
        constant_values=model.PAD_IDX
    )
    
    # Predict the output sequence
    output_seq = model.predict(input_seq, max_length=max_seq_len)[0]
    
    # Convert output indices to words
    output_words = []
    for idx in output_seq[1:]:  # Skip <SOS> token
        if idx == model.EOS_IDX:  # Stop at <EOS>
            break
        word = en_tokenizer.index_word.get(idx, "<OOV>")
        output_words.append(word)
    
    print(f"FR: {fr_sent}")
    print(f"EN: {' '.join(output_words)}\n")


FR: je suis heureux.
EN: are are are are are are

FR: comment allez-vous ?
EN: are are are are are are

FR: bonjour le monde.
EN: are are are are are are

