# User Trained Language Translator

This model is a multilingual text translation system using a sequence-to-sequence architecture with LSTM layers. It preprocesses text data by tokenizing and padding sequences, then trains an encoder-decoder model to translate between languages. The model predicts translations word by word and uses a Tkinter-based GUI for user interaction. Users can input a sentence, select languages, and view the translation along with training metrics (loss and accuracy).

## Importing Necessary Libraries

In [4]:
import tkinter as tk
from tkinter import ttk, messagebox
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, RepeatVector, TimeDistributed
from sklearn.model_selection import train_test_split

## Loading and Preprocessing the Dataset

In [6]:
# Load the dataset
data = pd.read_csv("lt dataset updated.csv")

# List of available languages
languages = data.columns.tolist()

## Creating and Fitting Tokenizers

In [8]:
# Preprocess the text data and create tokenizers for each language
tokenizers = {}
sequences = {}
max_lengths = {}

for language in languages:
    tokenizer = Tokenizer()
    tokenizers[language] = tokenizer
    texts = data[language].values
    tokenizer.fit_on_texts(texts)
    sequences[language] = tokenizer.texts_to_sequences(texts)
    max_lengths[language] = max(len(seq) for seq in sequences[language])

## Padding Sequences to Ensure Equal Length

In [10]:
# Add padding to make all sequences the same length
padded_sequences = {}
for language in languages:
    padded_sequences[language] = pad_sequences(sequences[language], maxlen=max_lengths[language], padding='post')

## Preparing Target Sequences with Start and End Tokens

In [12]:
# Function to create target sequences with start and end tokens
def prepare_target_sequences(target_texts, tokenizer, max_len):
    target_texts = ['<start> ' + text + ' <end>' for text in target_texts]
    target_sequences = tokenizer.texts_to_sequences(target_texts)
    return pad_sequences(target_sequences, maxlen=max_len, padding='post')

## Defining Source and Target Languages

In [14]:
# Define source and target languages
source_language = "English"  # Replace with a dynamic choice if needed
target_language = "French"   # Replace with a dynamic choice if needed

## Preparing Data for Training

In [16]:
# Prepare training data
source_seq = padded_sequences[source_language]
target_texts = data[target_language].values
target_seq = prepare_target_sequences(target_texts, tokenizers[target_language], max_lengths[target_language])

# Prepare the target data for training (shifted sequences)
target_seq_input = np.zeros_like(target_seq)
target_seq_input[:, 1:] = target_seq[:, :-1]

## Defining the Model Architecture

In [18]:
# Define vocabulary sizes
source_vocab_size = len(tokenizers[source_language].word_index) + 1
target_vocab_size = len(tokenizers[target_language].word_index) + 1

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(source_seq, target_seq_input, test_size=0.2)

# Define the model
model = Sequential()

# Encoder
model.add(Embedding(input_dim=source_vocab_size, output_dim=256, input_length=max_lengths[source_language]))
model.add(LSTM(256))

# Decoder
model.add(RepeatVector(max_lengths[target_language]))
model.add(LSTM(256, return_sequences=True))
model.add(TimeDistributed(Dense(target_vocab_size, activation='softmax')))



## Compiling and Training the Model

In [20]:
from tensorflow.keras.callbacks import EarlyStopping

# Define early stopping
early_stopping = EarlyStopping(
    monitor='val_loss',  # Can be 'val_accuracy' if you prefer
    patience=3,          # Number of epochs to wait for improvement
    restore_best_weights=True  # Restore the best model weights when stopping
)

# Compile the model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Train the model with early stopping
print("Training the model, please wait...")
history = model.fit(
    X_train,
    np.expand_dims(y_train, -1),
    epochs=10,  # Adjust epochs as needed for your dataset
    batch_size=32,
    validation_data=(X_test, np.expand_dims(y_test, -1)),
    callbacks=[early_stopping]  # Add early stopping callback here
)
print("Model training complete!")


Training the model, please wait...
Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 255ms/step - accuracy: 0.6123 - loss: 3.3578 - val_accuracy: 0.6155 - val_loss: 2.8386
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 254ms/step - accuracy: 0.6217 - loss: 2.5604 - val_accuracy: 0.6584 - val_loss: 2.3339
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 254ms/step - accuracy: 0.6587 - loss: 2.2874 - val_accuracy: 0.6662 - val_loss: 2.2729
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 254ms/step - accuracy: 0.6702 - loss: 2.2043 - val_accuracy: 0.6740 - val_loss: 2.1914
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m318s[0m 254ms/step - accuracy: 0.6817 - loss: 2.0967 - val_accuracy: 0.6871 - val_loss: 2.0651
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m317s[0m 253ms/step - accuracy: 0.6993 - loss: 1.9451

In [21]:
# # Compile the model
# model.compile(
#     loss='sparse_categorical_crossentropy',
#     optimizer='adam',
#     metrics=['accuracy']
# )

# # Train the model
# print("Training the model, please wait...")
# history = model.fit(
#     X_train,
#     np.expand_dims(y_train, -1),
#     epochs=20,  # Adjust epochs as needed for your dataset
#     batch_size=32,
#     validation_data=(X_test, np.expand_dims(y_test, -1))
# )
# print("Model training complete!")

## Evaluating the Model's Performance

In [23]:
# Calculate total loss and accuracy
final_train_loss = history.history['loss'][-1]
final_train_accuracy = history.history['accuracy'][-1]
final_val_loss = history.history['val_loss'][-1]
final_val_accuracy = history.history['val_accuracy'][-1]

## Creating the Translation Function

In [25]:
# Function to translate a sentence
def translate_sentence(sentence, src_lang, tgt_lang):
    source_tokenizer = tokenizers[src_lang]
    target_tokenizer = tokenizers[tgt_lang]
    
    sequence = source_tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_lengths[src_lang], padding='post')
    predicted_seq = model.predict(padded_sequence)
    
    translated_words = []
    for word_probs in predicted_seq[0]:
        word_index = np.argmax(word_probs)
        
        if word_index == 0:  # Padding token
            continue
        if word_index == target_tokenizer.word_index.get('<end>', -1):
            break
        if word_index == target_tokenizer.word_index.get('<start>', -1):
            continue
        
        word = target_tokenizer.index_word.get(word_index, None)
        if word is None:  # Handle unknown token
            break
        
        translated_words.append(word)
    
    return ' '.join(translated_words)

## Building the User Interface (GUI)

In [27]:
# Create the GUI
def create_interface():
    def translate():
        sentence = input_sentence.get()
        src_lang = source_language_combobox.get()
        tgt_lang = target_language_combobox.get()
        
        if not sentence or not src_lang or not tgt_lang:
            messagebox.showerror("Error", "Please fill all fields!")
            return
        
        if src_lang == tgt_lang:
            messagebox.showerror("Error", "Source and Target languages cannot be the same!")
            return
        
        try:
            translated_sentence = translate_sentence(sentence, src_lang, tgt_lang)
            result_label.config(text=f"Translated Sentence: {translated_sentence}")
        except Exception as e:
            messagebox.showerror("Error", f"Translation failed: {str(e)}")

    root = tk.Tk()
    root.title("Multilingual Translator")
    root.geometry("700x500")
    
    # Title
    tk.Label(root, text="Multilingual Translator", font=("Arial", 16, "bold")).pack(pady=10)
    
    # Source Language
    tk.Label(root, text="Source Language:", font=("Arial", 12)).pack(pady=5)
    source_language_combobox = ttk.Combobox(root, values=languages, state="readonly")
    source_language_combobox.pack(pady=5)
    source_language_combobox.set(source_language)
    
    # Target Language
    tk.Label(root, text="Target Language:", font=("Arial", 12)).pack(pady=5)
    target_language_combobox = ttk.Combobox(root, values=languages, state="readonly")
    target_language_combobox.pack(pady=5)
    target_language_combobox.set(target_language)
    
    # Input Sentence
    tk.Label(root, text="Enter Sentence:", font=("Arial", 12)).pack(pady=5)
    input_sentence = tk.Entry(root, width=50, font=("Arial", 12))
    input_sentence.pack(pady=5)
    
    # Translate Button
    tk.Button(root, text="Translate", font=("Arial", 12), command=translate).pack(pady=10)
    
    # Translation Result
    result_label = tk.Label(root, text="Translated Sentence: ", font=("Arial", 12), wraplength=600, justify="left")
    result_label.pack(pady=10)
    
    # Metrics
    metrics_label = tk.Label(
        root,
        text=f"Training Loss: {final_train_loss:.4f}, Training Accuracy: {final_train_accuracy:.4f}\n"
             f"Validation Loss: {final_val_loss:.4f}, Validation Accuracy: {final_val_accuracy:.4f}",
        font=("Arial", 12),
        justify="left",
        wraplength=600
    )
    metrics_label.pack(pady=10)
    
    root.mainloop()

## Launching the Application

In [None]:
# Launch the GUI
create_interface()