<a href="https://colab.research.google.com/github/maruthidaggu/BATCH30/blob/main/nlpLab7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample dataset: parallel English / French phrases, paired by position.
_ENGLISH_PHRASES = (
    "hello",
    "how are you",
    "I am fine",
    "what is your name",
    "my name is",
    "thank you",
    "goodbye",
)
_FRENCH_PHRASES = (
    "bonjour",
    "comment ça va",
    "je vais bien",
    "comment tu t'appelles",
    "je m'appelle",
    "merci",
    "au revoir",
)

# List of (english, french) tuples consumed by preprocess_data below.
data = list(zip(_ENGLISH_PHRASES, _FRENCH_PHRASES))

# (a) Data Preprocessing
def preprocess_data(data):
    """Tokenize and pad parallel English/French sentence pairs.

    Every French sentence is wrapped in ``<start>`` / ``<end>`` markers before
    tokenization so the decoder has explicit begin/end-of-sentence tokens.

    Args:
        data: Re-iterable sequence of ``(english, french)`` string pairs.

    Returns:
        A 6-tuple ``(eng_padded, fra_padded, eng_tokenizer, fra_tokenizer,
        max_eng_len, max_fra_len)``: the two post-padded integer arrays, the
        fitted tokenizers, and the longest sequence length on each side
        (the French length counts the two marker tokens).
    """
    # Split English and French sentences
    eng_texts = [pair[0] for pair in data]
    fra_texts = ['<start> ' + pair[1] + ' <end>' for pair in data]

    # The default Tokenizer filter set contains '<' and '>', which would turn
    # the markers into the plain words "start" / "end". Use the default
    # punctuation filters minus the angle brackets so the markers survive as
    # distinct vocabulary entries.
    fra_filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'

    # Create tokenizers
    eng_tokenizer = Tokenizer()
    fra_tokenizer = Tokenizer(filters=fra_filters)

    # Fit tokenizers
    eng_tokenizer.fit_on_texts(eng_texts)
    fra_tokenizer.fit_on_texts(fra_texts)

    # Convert texts to sequences
    eng_sequences = eng_tokenizer.texts_to_sequences(eng_texts)
    fra_sequences = fra_tokenizer.texts_to_sequences(fra_texts)

    # Find maximum lengths
    max_eng_len = max(len(seq) for seq in eng_sequences)
    max_fra_len = max(len(seq) for seq in fra_sequences)

    # Pad sequences at the end so real tokens always start at position 0
    eng_padded = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
    fra_padded = pad_sequences(fra_sequences, maxlen=max_fra_len, padding='post')

    return eng_padded, fra_padded, eng_tokenizer, fra_tokenizer, max_eng_len, max_fra_len

# Run preprocessing once and keep every artifact at module level
(
    eng_data,
    fra_data,
    eng_tokenizer,
    fra_tokenizer,
    max_eng_len,
    max_fra_len,
) = preprocess_data(data)

# Vocabulary sizes: one slot per known word plus one extra for index 0
eng_vocab_size = 1 + len(eng_tokenizer.word_index)
fra_vocab_size = 1 + len(fra_tokenizer.word_index)

# Show the padded array shapes as a sanity check
print("Input shapes:")
print(f"English data shape: {eng_data.shape}")
print(f"French data shape: {fra_data.shape}")

# (b) Build Seq2Seq Model
def build_model(input_vocab, output_vocab, input_length, output_length,
                embedding_dim=50, lstm_units=100):
    """Build an LSTM encoder-decoder (seq2seq) model for teacher-forced training.

    Args:
        input_vocab: Size of the source vocabulary (embedding input dimension).
        output_vocab: Size of the target vocabulary; also the width of the
            softmax output layer.
        input_length: Number of time steps in the encoder input.
        output_length: Number of time steps in the decoder input/output.
        embedding_dim: Size of both embedding layers. Defaults to 50.
        lstm_units: Hidden size of both LSTM layers. The encoder and decoder
            must share it because the encoder's final state initializes the
            decoder. Defaults to 100.

    Returns:
        An uncompiled ``Model`` mapping ``[encoder_inputs, decoder_inputs]`` to
        a per-time-step softmax distribution over the target vocabulary.
    """
    # Encoder: only the final hidden/cell states are kept; the per-step
    # output is discarded.
    encoder_inputs = Input(shape=(input_length,))
    encoder_embedding = Embedding(input_vocab, embedding_dim)(encoder_inputs)
    encoder = LSTM(lstm_units, return_state=True)
    _, state_h, state_c = encoder(encoder_embedding)
    encoder_states = [state_h, state_c]

    # Decoder: starts from the encoder states and emits one distribution per
    # target position.
    decoder_inputs = Input(shape=(output_length,))
    decoder_embedding = Embedding(output_vocab, embedding_dim)(decoder_inputs)
    decoder_lstm = LSTM(lstm_units, return_sequences=True)
    decoder_outputs = decoder_lstm(decoder_embedding, initial_state=encoder_states)
    decoder_dense = Dense(output_vocab, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Create model
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    return model

# (c) Prepare Data for Training
# Teacher forcing: the decoder reads each French sequence without its last
# position and is trained to emit the same sequence shifted one step left.
decoder_input_data = fra_data[:, :-1]
decoder_target_data = fra_data[:, 1:]

# Show the shifted array shapes as a sanity check
print("\nTraining data shapes:")
print(f"Decoder input shape: {decoder_input_data.shape}")
print(f"Decoder target shape: {decoder_target_data.shape}")

# (d) Train the Model
# The decoder length is one less than the padded French length because of
# the one-position shift above.
model = build_model(
    input_vocab=eng_vocab_size,
    output_vocab=fra_vocab_size,
    input_length=max_eng_len,
    output_length=max_fra_len - 1,
)

# Targets are integer word ids, hence the sparse cross-entropy loss
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit on the encoder/decoder input pair against the shifted targets
history = model.fit(
    x=[eng_data, decoder_input_data],
    y=decoder_target_data,
    epochs=100,
    batch_size=2,
    validation_split=0.2,
)

# (e) Inference Setup
class Translator:
    """Translate English text to French with a trained seq2seq model.

    The model is expected to take ``[encoder_input, decoder_input]`` and return
    one probability distribution per decoder position, with a decoder input
    length of ``max_fra_len - 1``. Decoding is greedy: the start marker is fed
    first and each predicted word id is fed back as the next decoder input.
    """

    def __init__(self, model, eng_tokenizer, fra_tokenizer, max_eng_len, max_fra_len):
        """Store the model and tokenizers and resolve the marker token ids.

        Args:
            model: Trained model exposing ``predict([encoder_in, decoder_in])``.
            eng_tokenizer: Fitted source-language tokenizer.
            fra_tokenizer: Fitted target-language tokenizer; its ``word_index``
                must contain the start and end markers.
            max_eng_len: Padded length of the encoder input.
            max_fra_len: Padded length of the full target sequences (the
                decoder input is one position shorter).

        Raises:
            ValueError: If the start or end marker is missing from the target
                vocabulary.
        """
        self.model = model
        self.eng_tokenizer = eng_tokenizer
        self.fra_tokenizer = fra_tokenizer
        self.max_eng_len = max_eng_len
        self.max_fra_len = max_fra_len

        word_index = fra_tokenizer.word_index
        self.fra_index_word = {index: word for word, index in word_index.items()}
        self.start_idx = self._marker_index(word_index, 'start')
        self.end_idx = self._marker_index(word_index, 'end')

    @staticmethod
    def _marker_index(word_index, name):
        """Return the id of the ``<name>`` marker, accepting its stripped form.

        A tokenizer whose filters remove ``<`` and ``>`` stores the marker as
        the bare word, so both spellings are tried.
        """
        for candidate in ('<' + name + '>', name):
            if candidate in word_index:
                return word_index[candidate]
        raise ValueError(
            "target tokenizer vocabulary has no '<%s>' marker" % name
        )

    def translate(self, text):
        """Return the greedy French translation of the English string ``text``."""
        # Tokenize input text
        sequence = self.eng_tokenizer.texts_to_sequences([text])
        padded = pad_sequences(sequence, maxlen=self.max_eng_len, padding='post')
        return self._greedy_decode(padded)

    def _greedy_decode(self, padded):
        """Decode one padded encoder input of shape ``(1, max_eng_len)``.

        The whole decoder input is re-run at every step and only position
        ``t`` of the output is read. This assumes the decoder is causal
        (output ``t`` depends only on inputs ``0..t``), so the not-yet-filled
        zero positions to the right do not influence the step being read.
        """
        steps = self.max_fra_len - 1
        if steps < 1:
            return ''

        # Seed the decoder with the start marker; the rest stays padding (0)
        target_seq = np.zeros((1, steps), dtype='int32')
        target_seq[0, 0] = self.start_idx

        translated_text = []
        for t in range(steps):
            prediction = self.model.predict([padded, target_seq], verbose=0)
            idx = int(np.argmax(prediction[0, t]))

            if idx == self.end_idx:
                break

            # Padding and a stray start marker are not real words
            if idx != 0 and idx != self.start_idx:
                word = self.fra_index_word.get(idx, '')
                if word:
                    translated_text.append(word)

            # Feed the prediction back as the next decoder input
            if t + 1 < steps:
                target_seq[0, t + 1] = idx

        return ' '.join(translated_text)

# (f) Translate New Sentences
# Wrap the trained model and both tokenizers in a reusable translator
translator = Translator(
    model,
    eng_tokenizer,
    fra_tokenizer,
    max_eng_len,
    max_fra_len,
)

# English inputs used to try the translator out
test_sentences = ["hello", "thank you", "your name", "how are you"]

# Print each input followed by its translation
print("\nTranslations:")
for sentence in test_sentences:
    translation = translator.translate(sentence)
    print(f"English: {sentence}")
    print(f"French: {translation}\n")