In [125]:
import numpy as np
import os

In [126]:
input_path = "dataset/raw/eng_fra.txt"

In [127]:
lines = []

with open(input_path, "r", encoding="utf-8") as infile:
    for line in infile:
        lines.append(line)

In [128]:
print("Total Examples:", len(lines))

Total Examples: 237838


In [129]:
lines[:2]

['Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)\n',
 'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)\n']

In [130]:
lines[-1]

'"I went drinking with one of my boyfriend\'s friends, and now he\'s furious at me." "Was this friend a guy or a girl?" "A guy, obviously. Why would I go drinking with his female friends?" "Yeah, you\'re right." "His name is Tom. He\'s really hot, and I really want to go drinking with him again."\t«\xa0Je suis allée boire avec un ami de mon compagnon, et voilà qu\'il est furieux contre moi.\xa0» «\xa0Était-ce un gars ou une fille\xa0?\xa0» «\xa0Un gars, bien évidemment. Pourquoi irais-je boire avec ses amies\xa0?\xa0» «\xa0Ouais, ça se comprend.\xa0» «\xa0Il s\'appelle Tom. Il est trop canon, et j\'ai tellement envie d\'aller prendre un verre avec lui à nouveau.\xa0»\tCC-BY 2.0 (France) Attribution: tatoeba.org #9821215 (DJ_Saidez) & #11726136 (Micsmithel)\n'

In [131]:
print('First 10 lines:')
for line in lines[:10]:
    print(line)

First 10 lines:
Go.	Va !	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)

Go.	Marche.	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)

Go.	En route !	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)

Go.	Bouge !	CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)

Hi.	Salut !	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)

Hi.	Salut.	CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)

Run!	Cours !	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)

Run!	Courez !	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)

Run!	Prenez vos jambes à vos cous !	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #2077449 (sacredceltic)

Run!	File !	CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #2077454 (sacredceltic)



In [132]:
print('Last 2 lines:')
for line in lines[-2:]:
    print(line)

Last 2 lines:
It may be impossible to get a completely error-free corpus due to the nature of this kind of collaborative effort. However, if we encourage members to contribute sentences in their own languages rather than experiment in languages they are learning, we might be able to minimize errors.	Il est peut-être impossible d'obtenir un Corpus complètement dénué de fautes, étant donnée la nature de ce type d'entreprise collaborative. Cependant, si nous encourageons les membres à produire des phrases dans leurs propres langues plutôt que d'expérimenter dans les langues qu'ils apprennent, nous pourrions être en mesure de réduire les erreurs.	CC-BY 2.0 (France) Attribution: tatoeba.org #2024159 (CK) & #2024564 (sacredceltic)

"I went drinking with one of my boyfriend's friends, and now he's furious at me." "Was this friend a guy or a girl?" "A guy, obviously. Why would I go drinking with his female friends?" "Yeah, you're right." "His name is Tom. He's really hot, and I really want to 

In [133]:
eng_sentences = []
fra_sentences = []

with open(input_path, "r", encoding="utf-8") as infile:
    for line in infile:
        parts = line.strip().split('\t')
        eng_sentences.append(parts[0])
        fra_sentences.append(parts[1])

eng_lens = [len(s.split()) for s in eng_sentences]
fra_lens = [len(s.split()) for s in fra_sentences]

print("English max:", np.max(eng_lens))
print("French max:", np.max(fra_lens))
print("English 98th percentile:", np.percentile(eng_lens, 98))
print("French 98th percentile:", np.percentile(fra_lens, 98))

English max: 55
French max: 68
English 98th percentile: 12.0
French 98th percentile: 14.0


In [134]:
Tx = 13  # Max input (English) length
Ty = 17  # Max output (French) length, including <sos> and <eos>

In [135]:
def filter_and_count_sentences(input_path, max_words=50):
    """
    Reads a parallel corpus file, filters sentence pairs where either the English
    or French sentence exceeds max_words, and returns statistics.

    Args:
        input_path (str): Path to the dataset file (tab-separated English ↔ French).
        max_words (int): Maximum allowed words in both English and French sentences.

    Returns:
        tuple:
            - list: Filtered lines (kept sentence pairs as raw lines).
            - int: Total number of sentence pairs read.
            - int: Number of pairs exceeding max_words in either sentence.
            - int: Number of sentences exceeding max_words in English.
            - int: Number of sentences exceeding max_words in French.
    """
    total_sentences = 0
    filtered_sentences = []
    exceeding_pairs = 0
    english_exceeding = 0
    french_exceeding = 0

    with open(input_path, "r", encoding="utf-8") as infile:
        for line in infile:
            total_sentences += 1
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue  # Skip lines that don't have both English and French

            eng = parts[0].strip()
            fra = parts[1].strip()

            eng_words = eng.split()
            fra_words = fra.split()

            eng_len = len(eng_words)
            fra_len = len(fra_words)

            eng_too_long = eng_len > max_words
            fra_too_long = fra_len > max_words

            if eng_too_long:
                english_exceeding += 1
            if fra_too_long:
                french_exceeding += 1

            if eng_too_long or fra_too_long:
                exceeding_pairs += 1
            else:
                filtered_sentences.append(line.strip())

    return filtered_sentences, total_sentences, exceeding_pairs, english_exceeding, french_exceeding

In [136]:
filtered_sentences, total, too_long, eng_long, fra_long = filter_and_count_sentences(
    "dataset/raw/eng_fra.txt", max_words=Tx)

print(f"Total sentence pairs: {total}")
print(f"Pairs exceeding limit: {too_long}")
print(f"English sentences exceeding limit: {eng_long}")
print(f"French sentences exceeding limit: {fra_long}")
print(f"Filtered and kept: {len(filtered_sentences)}")

Total sentence pairs: 237838
Pairs exceeding limit: 6140
English sentences exceeding limit: 2866
French sentences exceeding limit: 5596
Filtered and kept: 231698


In [137]:
filtered_sentences[0]

'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'

In [138]:
def remove_attributes_from_filtered_sentences(filtered_sentences_with_attributes):
    """
    Takes a list of raw lines from the filtered dataset and removes the
    attribution information, returning only the English and French sentences.

    Args:
        filtered_sentences_with_attributes (list): A list of strings,
                                                   each containing "Eng\tFra\tAttr".

    Returns:
        list: A list of strings, each containing "Eng\tFra".
    """
    clean_sentences = []
    for line in filtered_sentences_with_attributes:
        # Split by the tab character. The first two parts are English and French.
        parts = line.strip().split('\t')
        if len(parts) >= 2:
            # Reconstruct the line with only English and French, joining with a tab
            clean_sentences.append(f"{parts[0]}\t{parts[1]}")
        else:
            # Handle malformed lines if they exist, though they shouldn't if
            # the original data is consistent.
            print(f"Warning: Skipping malformed line: {line.strip()}")
    return clean_sentences

In [139]:
input_path = "dataset/raw/eng_fra.txt"

# Filter sentences based on English word count ---
print(f"Processing dataset from: {input_path}")
max_words = Tx
initial_filtered_sentences, total, too_long, eng_long, fra_long = filter_and_count_sentences(input_path, max_words=max_words)

print(f"\n--- Filtering Results ---")
print(f"Total sentences read: {total}")
print(f"Sentences exceeding {max_words}  words: {too_long}")
print(f"Sentences to keep (<= {max_words} words): {len(initial_filtered_sentences)}")

# Remove attribution from the filtered sentences ---
print(f"\n--- Removing Attributes ---")
final_clean_sentences = remove_attributes_from_filtered_sentences(initial_filtered_sentences)

print(f"First 5 cleaned sentences:")
for i, line in enumerate(final_clean_sentences[:5]):
    print(f"  {i+1}: {line}")

print(f"\nTotal clean sentences for training: {len(final_clean_sentences)}")

output_clean_path = "dataset/processed/eng_fra_clean_filtered.txt"
try:
    with open(output_clean_path, "w", encoding="utf-8") as outfile:
        for sentence_pair in final_clean_sentences:
            outfile.write(sentence_pair + '\n') # Add newline back
    print(f"\nCleaned and filtered sentences saved to: {output_clean_path}")
except IOError as e:
    print(f"Error saving file: {e}")

Processing dataset from: dataset/raw/eng_fra.txt

--- Filtering Results ---
Total sentences read: 237838
Sentences exceeding 13  words: 6140
Sentences to keep (<= 13 words): 231698

--- Removing Attributes ---
First 5 cleaned sentences:
  1: Go.	Va !
  2: Go.	Marche.
  3: Go.	En route !
  4: Go.	Bouge !
  5: Hi.	Salut !

Total clean sentences for training: 231698

Cleaned and filtered sentences saved to: dataset/processed/eng_fra_clean_filtered.txt


In [140]:
final_clean_sentences[100]

'Go now.\tAllez-y maintenant.'

In [141]:
# Load sentence pairs
def load_cleaned_pairs(filepath):
    with open(filepath, encoding="utf-8") as f:
        lines = f.read().strip().split('\n')
    pairs = [line.split('\t') for line in lines]
    return pairs

In [142]:
pairs = load_cleaned_pairs("dataset/processed/eng_fra_clean_filtered.txt")

In [143]:
for i in range(5):
    print(f"{i+1}: {pairs[i][0]}  ->  {pairs[i][1]}")

1: Go.  ->  Va !
2: Go.  ->  Marche.
3: Go.  ->  En route !
4: Go.  ->  Bouge !
5: Hi.  ->  Salut !


In [144]:
# Tokenize each sentence (split into list of words)
def tokenize(sentences):
    return [sentence.lower().strip().split() for sentence in sentences]

# Separate English and French
eng_sentences = [pair[0] for pair in pairs]
fra_sentences = [pair[1] for pair in pairs]

# Tokenize
tokenized_eng = tokenize(eng_sentences)
tokenized_fra = tokenize(fra_sentences)

In [145]:
tokenized_fra[:5]

[['va', '!'],
 ['marche.'],
 ['en', 'route', '!'],
 ['bouge', '!'],
 ['salut', '!']]

In [146]:
# Build vocabulary dictionary
def build_vocab(tokenized_sentences):
    vocab = {"<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3}
    index = 4
    for sentence in tokenized_sentences:
        for word in sentence:
            if word not in vocab:
                vocab[word] = index
                index += 1
    return vocab

eng_vocab = build_vocab(tokenized_eng)
fra_vocab = build_vocab(tokenized_fra)

print("English vocab size:", len(eng_vocab))
print("French vocab size:", len(fra_vocab))

English vocab size: 29267
French vocab size: 47464


In [147]:
eng_vocab['hello']

1018

In [148]:
print("\nSample data pairs:")
for i in range(5):
    eng, fra = pairs[i][0], pairs[i][1]
    print(f"  {i+1}. English: {eng}  ->  French: {fra}")

print("\nInput vocabulary (eng_vocan):")
print(f"  Size: {len(eng_vocab)}")
print(f"  Example mapping: {list(eng_vocab.items())[:7]}")

print("\nOutput vocabulary (fra_vocab):")
print(f"  Size: {len(fra_vocab)}")
print(f"  Example mapping: {list(fra_vocab.items())[:7]}")


Sample data pairs:
  1. English: Go.  ->  French: Va !
  2. English: Go.  ->  French: Marche.
  3. English: Go.  ->  French: En route !
  4. English: Go.  ->  French: Bouge !
  5. English: Hi.  ->  French: Salut !

Input vocabulary (eng_vocan):
  Size: 29267
  Example mapping: [('<pad>', 0), ('<sos>', 1), ('<eos>', 2), ('<unk>', 3), ('go.', 4), ('hi.', 5), ('run!', 6)]

Output vocabulary (fra_vocab):
  Size: 47464
  Example mapping: [('<pad>', 0), ('<sos>', 1), ('<eos>', 2), ('<unk>', 3), ('va', 4), ('!', 5), ('marche.', 6)]


In [149]:
# Convert a tokenized sentence to indices
def sentence_to_indices(sentence_tokens, vocab):
    return [vocab.get(word, vocab["<unk>"]) for word in sentence_tokens]

eng_example = tokenized_eng[100]
fra_example = tokenized_fra[100]

print("English:", eng_example)
print("To indices:", sentence_to_indices(eng_example, eng_vocab))

English: ['go', 'now.']
To indices: [20, 43]


In [159]:
def preprocess_data(pairs, input_vocab, target_vocab, Tx, Ty):
    """
    Prepares input/output sequences for seq2seq word-level training.

    Arguments:
    pairs        -- List of (input_sentence, target_sentence) tuples (strings)
    input_vocab  -- Dict mapping input words to integer indices (e.g. eng_vocab)
    target_vocab -- Dict mapping target words to integer indices (e.g. fra_vocab)
    Tx           -- Fixed input sequence length (input, e.g. English)
    Ty           -- Fixed output sequence length (output, e.g. French)

    Returns:
    X                -- np.array of shape (m, Tx), input sequences (as word indices)
    Y                -- np.array of shape (m, Ty), output sequences (as word indices)
    inv_input_vocab  -- Dict mapping input indices back to words
    inv_target_vocab -- Dict mapping output indices back to words
    """
    m = len(pairs)
    X = np.zeros((m, Tx), dtype=np.int32)
    Y = np.zeros((m, Ty), dtype=np.int32)

    for i, (input_sentence, target_sentence) in enumerate(pairs):
        # Tokenize sentences
        input_tokens = tokenize([input_sentence])[0]
        target_tokens = tokenize([target_sentence])[0]

        # Add <sos> and <eos> to target
        target_tokens = ['<sos>'] + target_tokens + ['<eos>']

        # Convert tokens to indices
        input_ids = sentence_to_indices(input_tokens, input_vocab)
        target_ids = sentence_to_indices(target_tokens, target_vocab)

        # Pad or truncate
        input_ids = input_ids[:Tx] + [input_vocab["<pad>"]] * max(0, Tx - len(input_ids))
        target_ids = target_ids[:Ty] + [target_vocab["<pad>"]] * max(0, Ty - len(target_ids))

        X[i] = input_ids
        Y[i] = target_ids

    # Build inverse vocabularies
    inv_input_vocab = {i: w for w, i in input_vocab.items()}
    inv_target_vocab = {i: w for w, i in target_vocab.items()}

    return X, Y, inv_input_vocab, inv_target_vocab

In [160]:
Tx = 13   # max English length (based on 98th percentile)
Ty = 17   # max French length
input_vocab_size = len(eng_vocab)
target_vocab_size = len(fra_vocab)
embedding_dim = 256

In [162]:
pairs = load_cleaned_pairs("dataset/processed/eng_fra_clean_filtered.txt")

X, Y, inv_eng_vocab, inv_fra_vocab = preprocess_data(pairs, eng_vocab, fra_vocab, Tx, Ty)

print("X shape:", X.shape)
print("Y shape:", Y.shape)

X shape: (231698, 13)
Y shape: (231698, 17)


In [207]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Concatenate, Attention
from tensorflow.keras.models import Model

def create_nmt_model(Tx, Ty, input_vocab_size, target_vocab_size, embedding_dim=256, lstm_units=512):
    # ----- Encoder -----
    encoder_inputs = Input(shape=(Tx,), name="encoder_inputs")   # (m, Tx)
    enc_embedding = Embedding(input_vocab_size, embedding_dim, mask_zero=True, name="encoder_embedding")(encoder_inputs) # (m, tx, e_dim)
    encoder_lstm = Bidirectional(LSTM(lstm_units, return_sequences=True, return_state=True), name="bi_encoder_lstm")
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(enc_embedding)
    encoder_outputs = Dense(lstm_units, activation="tanh")(encoder_outputs)  # (None, Tx, 256)

    # Concatenate forward and backward states
    state_h = Concatenate()([forward_h, backward_h])  # (batch_size, lstm_units*2)
    state_c = Concatenate()([forward_c, backward_c])

    # ----- Decoder -----
    decoder_inputs = Input(shape=(None,), name="decoder_inputs")  # shifted target input
    dec_embedding = Embedding(target_vocab_size, embedding_dim, mask_zero=True, name="decoder_embedding")(decoder_inputs)

    # Project encoder_outputs if needed for attention
    attention = Attention(name="attention_layer")  # Bahdanau uses score mechanism; you can switch to AdditiveAttention
    context_vector = attention([dec_embedding, encoder_outputs])  # Shape: (batch, Ty-1, enc_seq_len)

    # Concatenate context vector with decoder embedding
    decoder_combined_input = Concatenate(axis=-1)([dec_embedding, context_vector])

    # Decoder LSTM (unidirectional)
    decoder_lstm = LSTM(lstm_units * 2, return_sequences=True, name="decoder_lstm")
    decoder_outputs = decoder_lstm(decoder_combined_input, initial_state=[state_h, state_c])

    # Output layer
    dense = Dense(target_vocab_size, activation='softmax', name="output_dense")
    decoder_outputs = dense(decoder_outputs)

    # Build Model
    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
    return model

In [208]:
model = create_nmt_model(
    Tx=13,
    Ty=17,
    input_vocab_size=len(eng_vocab),
    target_vocab_size=len(fra_vocab),
    embedding_dim=256,
    lstm_units=256
)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [209]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, 13)]         0           []                               
                                                                                                  
 encoder_embedding (Embedding)  (None, 13, 256)      7492352     ['encoder_inputs[0][0]']         
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 bi_encoder_lstm (Bidirectional  [(None, 13, 512),   1050624     ['encoder_embedding[0][0]']      
 )                               (None, 256),                                               

In [210]:
import tensorflow as tf
from tqdm import tqdm

# Loss and optimizer
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')
optimizer = tf.keras.optimizers.Adam()

# Custom loss
def loss_function(real, pred):
    mask = tf.math.not_equal(real, 0)
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask  # Apply mask to ignore padding
    return tf.reduce_mean(loss_)

# Metrics
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')


In [211]:
@tf.function
def train_step(inp, targ, model):
    with tf.GradientTape() as tape:
        predictions = model([inp, targ[:, :-1]], training=True)  # Exclude last token of target
        loss = loss_function(targ[:, 1:], predictions)  # Exclude first token (start token)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(targ[:, 1:], predictions)


In [212]:
def train_model(model, dataset, epochs):
    for epoch in range(epochs):
        train_loss.reset_state()
        train_accuracy.reset_state()

        print(f'\nEpoch {epoch + 1}/{epochs}')
        for (batch, (inp, targ)) in enumerate(tqdm(dataset)):
            train_step(inp, targ, model)

        print(f'Epoch {epoch + 1} Loss: {train_loss.result():.4f}, Accuracy: {train_accuracy.result():.4f}')


In [213]:
BUFFER_SIZE = len(X)
BATCH_SIZE = 64

train_dataset = tf.data.Dataset.from_tensor_slices((X, Y))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)


In [214]:
num_samples = 5000
X_small = X[:num_samples]
Y_small = Y[:num_samples]

BATCH_SIZE = 32

train_dataset = tf.data.Dataset.from_tensor_slices((X_small, Y_small))
train_dataset = train_dataset.shuffle(buffer_size=num_samples).batch(BATCH_SIZE, drop_remainder=True)


In [219]:
train_model(model, train_dataset, epochs=10)  

  3%|▎         | 4/156 [00:00<00:04, 33.32it/s]


Epoch 1/10


100%|██████████| 156/156 [00:04<00:00, 36.64it/s]
  3%|▎         | 4/156 [00:00<00:04, 32.58it/s]

Epoch 1 Loss: 0.9152, Accuracy: 0.1049

Epoch 2/10


100%|██████████| 156/156 [00:04<00:00, 36.71it/s]
  3%|▎         | 4/156 [00:00<00:04, 34.04it/s]

Epoch 2 Loss: 0.7926, Accuracy: 0.1176

Epoch 3/10


100%|██████████| 156/156 [00:04<00:00, 36.64it/s]
  3%|▎         | 4/156 [00:00<00:04, 33.75it/s]

Epoch 3 Loss: 0.6929, Accuracy: 0.1275

Epoch 4/10


100%|██████████| 156/156 [00:04<00:00, 36.57it/s]
  3%|▎         | 4/156 [00:00<00:04, 33.43it/s]

Epoch 4 Loss: 0.6101, Accuracy: 0.1342

Epoch 5/10


100%|██████████| 156/156 [00:04<00:00, 36.72it/s]
  3%|▎         | 4/156 [00:00<00:04, 33.70it/s]

Epoch 5 Loss: 0.5390, Accuracy: 0.1415

Epoch 6/10


100%|██████████| 156/156 [00:04<00:00, 36.71it/s]
  3%|▎         | 4/156 [00:00<00:04, 32.84it/s]

Epoch 6 Loss: 0.4754, Accuracy: 0.1473

Epoch 7/10


100%|██████████| 156/156 [00:04<00:00, 36.48it/s]
  3%|▎         | 4/156 [00:00<00:04, 33.78it/s]

Epoch 7 Loss: 0.4177, Accuracy: 0.1549

Epoch 8/10


100%|██████████| 156/156 [00:04<00:00, 36.30it/s]
  3%|▎         | 4/156 [00:00<00:04, 33.25it/s]

Epoch 8 Loss: 0.3680, Accuracy: 0.1613

Epoch 9/10


100%|██████████| 156/156 [00:04<00:00, 36.38it/s]
  3%|▎         | 4/156 [00:00<00:04, 33.66it/s]

Epoch 9 Loss: 0.3250, Accuracy: 0.1671

Epoch 10/10


100%|██████████| 156/156 [00:04<00:00, 36.51it/s]

Epoch 10 Loss: 0.2868, Accuracy: 0.1734





In [220]:
def encode_sentence(sentence, word2idx, Tx):
    tokens = sentence.lower().strip().split()
    indices = [word2idx.get(word, word2idx['<unk>']) for word in tokens]
    padded = indices + [word2idx['<pad>']] * (Tx - len(indices))
    return padded[:Tx]

def decode_sequence(sequence, idx2word):
    words = [idx2word.get(idx, '') for idx in sequence]
    return ' '.join([word for word in words if word not in ['<pad>', '<end>']])


In [221]:
def predict_translation(model, sentence, Tx, Ty, eng_word2idx, fra_word2idx, fra_idx2word):
    input_seq = encode_sentence(sentence, eng_word2idx, Tx)
    input_tensor = tf.convert_to_tensor([input_seq])  # shape: (1, Tx)

    # Initialize decoder input with <sos> followed by zeros (padding)
    start_token = fra_word2idx['<sos>']
    end_token = fra_word2idx['<eos>']
    decoder_input = tf.convert_to_tensor([[start_token] + [0] * (Ty - 1)])  # shape: (1, Ty)

    translated_ids = []

    for t in range(Ty - 1):
        # Predict output
        predictions = model([input_tensor, decoder_input])  # shape: (1, Ty, vocab_size)

        # Get token at timestep t
        predicted_id = tf.argmax(predictions[0, t]).numpy()
        translated_ids.append(predicted_id)

        if predicted_id == end_token:
            break

        # Update decoder input for next timestep
        decoder_input = decoder_input.numpy()
        decoder_input[0, t + 1] = predicted_id
        decoder_input = tf.convert_to_tensor(decoder_input)

    return decode_sequence(translated_ids, fra_idx2word)


In [222]:
english_sentence = "how are you"
translation = predict_translation(model, english_sentence, Tx, Ty, eng_vocab, fra_vocab, inv_fra_vocab)
print(f"English: {english_sentence}")
print(f"French: {translation}")

English: how are you
French: êtes-vous ? <eos>


In [173]:
train_model(model, train_dataset, epochs=3)

  0%|          | 0/3620 [00:00<?, ?it/s]


Epoch 1/3


100%|██████████| 3620/3620 [02:47<00:00, 21.62it/s]
  0%|          | 0/3620 [00:00<?, ?it/s]

Epoch 1 Loss: 1.8793, Accuracy: 0.1800

Epoch 2/3


  6%|▋         | 235/3620 [00:10<02:35, 21.73it/s]


KeyboardInterrupt: 