In [1]:
!pip install tensorflow numpy pandas




In [4]:
!wget http://www.manythings.org/anki/fra-eng.zip
!unzip fra-eng.zip

# Read the dataset
lines = open('fra.txt', encoding='utf-8').read().strip().split('\n')
pairs = [line.split('\t') for line in lines]


--2025-06-24 08:14:01--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8143096 (7.8M) [application/zip]
Saving to: ‘fra-eng.zip’


2025-06-24 08:14:01 (15.6 MB/s) - ‘fra-eng.zip’ saved [8143096/8143096]

Archive:  fra-eng.zip
  inflating: _about.txt              
  inflating: fra.txt                 


In [5]:
# Download and extract English–French sentence pairs
!wget http://www.manythings.org/anki/fra-eng.zip
!unzip fra-eng.zip

--2025-06-24 08:14:07--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8143096 (7.8M) [application/zip]
Saving to: ‘fra-eng.zip.1’


2025-06-24 08:14:08 (13.7 MB/s) - ‘fra-eng.zip.1’ saved [8143096/8143096]

Archive:  fra-eng.zip
replace _about.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: _about.txt              
  inflating: fra.txt                 


In [6]:
# Load and preview the dataset
lines = open('fra.txt', encoding='utf-8').read().strip().split('\n')
pairs = [line.split('\t') for line in lines]
print(f"Total pairs: {len(pairs)}")
print("Example:", pairs[0])

Total pairs: 237838
Example: ['Go.', 'Va !', 'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)']


In [7]:
import tensorflow as tf
import numpy as np
import re

# Function to clean each sentence
def preprocess_sentence(s):
    s = s.lower().strip()
    s = re.sub(r"([?.!,¿])", r" \1 ", s)
    s = re.sub(r'[" "]+', " ", s)
    s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)
    s = s.strip()
    s = '<start> ' + s + ' <end>'
    return s

# Limit to 30,000 examples
num_examples = 30000
input_texts = []
target_texts = []

# Safe unpacking of sentence pairs
for pair in pairs[:num_examples]:
    if len(pair) < 2:
        continue  # skip malformed lines
    eng, fr = pair[0], pair[1]
    input_texts.append(preprocess_sentence(eng))
    target_texts.append(preprocess_sentence(fr))

# Preview preprocessed sentences
print("Total cleaned pairs:", len(input_texts))
print("Sample input sentence:", input_texts[0])
print("Sample target sentence:", target_texts[0])


Total cleaned pairs: 30000
Sample input sentence: <start> go . <end>
Sample target sentence: <start> va ! <end>


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize input (English)
inp_tokenizer = Tokenizer(filters='')
inp_tokenizer.fit_on_texts(input_texts)
input_tensor = inp_tokenizer.texts_to_sequences(input_texts)
input_tensor = pad_sequences(input_tensor, padding='post')

# Tokenize target (French)
tgt_tokenizer = Tokenizer(filters='')
tgt_tokenizer.fit_on_texts(target_texts)
target_tensor = tgt_tokenizer.texts_to_sequences(target_texts)
target_tensor = pad_sequences(target_tensor, padding='post')

# Vocabulary sizes
inp_vocab_size = len(inp_tokenizer.word_index) + 1
tgt_vocab_size = len(tgt_tokenizer.word_index) + 1

# Print sample tokenized sentence
print("Input tensor example:", input_tensor[0])
print("Target tensor example:", target_tensor[0])
print("Input vocab size:", inp_vocab_size)
print("Target vocab size:", tgt_vocab_size)


Input tensor example: [ 1 28  3  2  0  0  0  0  0]
Target tensor example: [ 1 67  8  2  0  0  0  0  0  0  0  0  0  0  0  0  0]
Input vocab size: 4293
Target vocab size: 6908


In [9]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Set embedding and units
embedding_dim = 256
units = 512

# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(input_dim=inp_vocab_size, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(units, return_state=True)(enc_emb)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(input_dim=tgt_vocab_size, output_dim=embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

decoder_dense = Dense(tgt_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()


In [10]:
# Convert target tensor to numpy array
import numpy as np

decoder_input_data = target_tensor[:, :-1]  # all except last token
decoder_target_data = target_tensor[:, 1:]  # all except first token

# Add an extra dimension (required for sparse_categorical_crossentropy)
decoder_target_data = np.expand_dims(decoder_target_data, -1)

# Print shapes
print("Encoder input shape:", input_tensor.shape)
print("Decoder input shape:", decoder_input_data.shape)
print("Decoder target shape:", decoder_target_data.shape)


Encoder input shape: (30000, 9)
Decoder input shape: (30000, 16)
Decoder target shape: (30000, 16, 1)


In [11]:
model.fit(
    [input_tensor, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=10,
    validation_split=0.2
)

Epoch 1/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m753s[0m 2s/step - loss: 2.4184 - val_loss: 1.7981
Epoch 2/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m751s[0m 2s/step - loss: 1.3868 - val_loss: 1.5478
Epoch 3/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m801s[0m 2s/step - loss: 1.1530 - val_loss: 1.3696
Epoch 4/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m755s[0m 2s/step - loss: 0.9446 - val_loss: 1.2550
Epoch 5/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m803s[0m 2s/step - loss: 0.8022 - val_loss: 1.1903
Epoch 6/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m798s[0m 2s/step - loss: 0.6916 - val_loss: 1.1475
Epoch 7/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m753s[0m 2s/step - loss: 0.5971 - val_loss: 1.1126
Epoch 8/10
[1m375/375[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m791s[0m 2s/step - loss: 0.5166 - val_loss: 1.0805
Epoch 9/10
[1m375/375[0m [32m

<keras.src.callbacks.history.History at 0x782a503f7ed0>

In [12]:
# Define encoder model for inference
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
decoder_state_input_h = Input(shape=(units,))
decoder_state_input_c = Input(shape=(units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2
)

In [13]:
def translate(sentence):
    # Preprocess and tokenize the input sentence
    sentence = preprocess_sentence(sentence)
    sequence = inp_tokenizer.texts_to_sequences([sentence])
    sequence = pad_sequences(sequence, maxlen=input_tensor.shape[1], padding='post')

    # Encode the input sentence
    states_value = encoder_model.predict(sequence)

    # Create empty target sequence with only the <start> token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tgt_tokenizer.word_index['<start>']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Get the word with highest probability
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = tgt_tokenizer.index_word.get(sampled_token_index, '')

        if (sampled_word == '<end>' or len(decoded_sentence.split()) > 20):
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_word

        # Update target sequence and states
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence.strip()

In [14]:
print("English: I love you.")
print("French :", translate("I love you."))

English: I love you.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 202ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
French : je t aime .


In [15]:
print("English: How Are You.")
print("French :", translate("How Are You."))

English: How Are You.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
French : comme vous tes grande !
