In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer , tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import json

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("devicharith/language-translation-englishfrench")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/devicharith/language-translation-englishfrench?dataset_version_number=2...


100%|██████████| 3.51M/3.51M [00:00<00:00, 165MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/devicharith/language-translation-englishfrench/versions/2





In [3]:
data = pd.read_csv("/root/.cache/kagglehub/datasets/devicharith/language-translation-englishfrench/versions/2/eng_-french.csv",names=["English","French"])
english_sentences = data["English"].tolist()
french_sentences = data["French"].tolist()
print(data.head())

                   English                  French
0  English words/sentences  French words/sentences
1                      Hi.                  Salut!
2                     Run!                 Cours !
3                     Run!                Courez !
4                     Who?                   Qui ?


In [4]:
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(english_sentences)
eng_seq = tokenizer_eng.texts_to_sequences(english_sentences)

tokenizer_fr = Tokenizer()
tokenizer_fr.fit_on_texts(french_sentences)
fr_seq = tokenizer_fr.texts_to_sequences(french_sentences)

In [5]:
print(eng_seq[0:5])
print(fr_seq[0:5])

[[291, 634, 3272], [2818], [429], [429], [79]]
[[18607, 18608, 18609], [4241], [6947], [18610], [32]]


In [6]:
vocab_size_eng = len(tokenizer_eng.word_index) + 1
vocab_size_fr = len(tokenizer_fr.word_index) + 1
print(vocab_size_eng)
print(vocab_size_fr)

14532
30664


In [7]:
max_length = max(len(seq) for seq in eng_seq + fr_seq)
eng_seq_padded = pad_sequences(eng_seq, maxlen=max_length, padding='post')
fr_seq_padded = pad_sequences(fr_seq, maxlen=max_length, padding='post')

In [8]:
embedding_dim = 256 # Dimension of the embedding space
units = 512 # Number of units in the LSTM layers for both the encoder and decoder

In [9]:
encoder_inputs = Input(shape=(max_length,))
enc_emb = Embedding(input_dim=vocab_size_eng, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c] # Stores the LSTM's final hidden and cell states,
                                   #which will be used to initialize the decoder for generating the output sequence.

In [10]:
decoder_inputs = Input(shape=(max_length,))
dec_emb_layer = Embedding(input_dim=vocab_size_fr, output_dim=embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True) # return_sequences: Whether to return the last output in the output sequence,
                                                                    #or the full sequence
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_fr, activation='softmax')
output = decoder_dense(decoder_outputs)

In [11]:
model = Model([encoder_inputs, decoder_inputs], output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [12]:
X_train, x_temp, y_train, y_temp = train_test_split(eng_seq_padded, fr_seq_padded, test_size=0.2)
X_val, X_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5)

In [13]:
model.fit([X_train, X_train], y_train, validation_data=([X_val, X_val], y_val), epochs=5, batch_size=128)

Epoch 1/5
[1m1098/1098[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m600s[0m 541ms/step - accuracy: 0.8785 - loss: 1.2244 - val_accuracy: 0.8991 - val_loss: 0.7080
Epoch 2/5
[1m1098/1098[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m624s[0m 545ms/step - accuracy: 0.9019 - loss: 0.6682 - val_accuracy: 0.9067 - val_loss: 0.6020
Epoch 3/5
[1m1098/1098[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m623s[0m 546ms/step - accuracy: 0.9094 - loss: 0.5565 - val_accuracy: 0.9124 - val_loss: 0.5261
Epoch 4/5
[1m1098/1098[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m636s[0m 559ms/step - accuracy: 0.9153 - loss: 0.4736 - val_accuracy: 0.9166 - val_loss: 0.4797
Epoch 5/5
[1m1098/1098[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m609s[0m 547ms/step - accuracy: 0.9205 - loss: 0.4112 - val_accuracy: 0.9195 - val_loss: 0.4500


<keras.src.callbacks.history.History at 0x79b3613bd210>

In [14]:
model.evaluate([X_test, X_test], y_test)

[1m549/549[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 72ms/step - accuracy: 0.9209 - loss: 0.4433


[0.4443463981151581, 0.9204893112182617]

In [15]:
def translate_sentence(sentence):
    seq = tokenizer_eng.texts_to_sequences([sentence])
    padded = pad_sequences(seq, maxlen=max_length, padding='post')
    translated = np.argmax(model.predict([padded, padded]), axis=-1)

    translated_sentence = []
    for i in translated[0]:
        if i in tokenizer_fr.index_word:
            translated_sentence.append(tokenizer_fr.index_word[i])
        else:
            translated_sentence.append(' ')

    return ' '.join(translated_sentence)

In [16]:
input_sentence = "I am french."
translated_sentence = translate_sentence(input_sentence)
print(f"Input: {input_sentence}")
print(f"Translated: {translated_sentence}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 221ms/step
Input: I am french.
Translated: je suis français français                                                                                                      


In [17]:
input_sentence = "I love you."
translated_sentence = translate_sentence(input_sentence)
print(f"Input: {input_sentence}")
print(f"Translated: {translated_sentence}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Input: I love you.
Translated: je t'aime                                                                                                          


In [18]:
input_sentence = "we can go study."
translated_sentence = translate_sentence(input_sentence)
print(f"Input: {input_sentence}")
print(f"Translated: {translated_sentence}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
Input: we can go study.
Translated: nous pouvons y                                                                                                        


In [19]:
# Create directories for model artifacts
os.makedirs("artifacts/seq2seq", exist_ok=True)

In [20]:
# Save the full model
model.save("artifacts/seq2seq/model.keras")
print("Full model saved to artifacts/seq2seq/model.keras")

Full model saved to artifacts/seq2seq/model.keras


In [22]:
# Save tokenizers
with open("artifacts/seq2seq/tokenizer_eng.json", "w") as f:
    f.write(tokenizer_eng.to_json())
print("English tokenizer saved to artifacts/seq2seq/tokenizer_eng.json")

with open("artifacts/seq2seq/tokenizer_fr.json", "w") as f:
    f.write(tokenizer_fr.to_json())
print("French tokenizer saved to artifacts/seq2seq/tokenizer_fr.json")

# Also save some metadata for later reference
metadata = {
    "vocab_size_eng": vocab_size_eng,
    "vocab_size_fr": vocab_size_fr,
    "embedding_dim": embedding_dim,
    "units": units,
    "max_length": max_length
}

with open("artifacts/seq2seq/metadata.json", "w") as f:
    json.dump(metadata, f)
print("Model metadata saved to artifacts/seq2seq/metadata.json")

print("All Seq2Seq model artifacts saved successfully!")

# Test loading the saved models
print("\nTesting model loading...")
try:
    from tensorflow.keras.models import load_model

    # Load encoder
    loaded_encoder = load_model("artifacts/seq2seq/encoder_model.keras")
    print("✓ Encoder model loaded successfully")

    # Load decoder
    loaded_decoder = load_model("artifacts/seq2seq/decoder_model.keras")
    print("✓ Decoder model loaded successfully")

    # Load tokenizers
    with open("artifacts/seq2seq/tokenizer_eng.json", "r") as f:
        loaded_tokenizer_eng = tokenizer_from_json(f.read())
    print("✓ English tokenizer loaded successfully")

    with open("artifacts/seq2seq/tokenizer_fr.json", "r") as f:
        loaded_tokenizer_fr = tokenizer_from_json(f.read())
    print("✓ French tokenizer loaded successfully")

    print("\nModel artifacts can be used for inference!")
except Exception as e:
    print(f"Error during loading test: {str(e)}")

English tokenizer saved to artifacts/seq2seq/tokenizer_eng.json
French tokenizer saved to artifacts/seq2seq/tokenizer_fr.json
Model metadata saved to artifacts/seq2seq/metadata.json
All Seq2Seq model artifacts saved successfully!

Testing model loading...
Error during loading test: File not found: filepath=artifacts/seq2seq/encoder_model.keras. Please ensure the file is an accessible `.keras` zip file.
