<a href="https://colab.research.google.com/github/madanjha/Machine-Learning/blob/main/Encoder%26Decoder_Eng2French_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample dataset
english_sentences = ["I love NLP", "How are you"]
french_sentences = ["J'adore le NLP", "Comment allez-vous"]

# Start/end markers wrapped around every target sentence. They are plain
# lowercase words on purpose: the default Tokenizer lowercases text and strips
# punctuation such as '<' and '>', so a marker like "<EOS>" would never survive
# tokenization and could never be matched at decoding time.
START_TOKEN = "startseq"
END_TOKEN = "endseq"

# Tokenize English
eng_tokenizer = Tokenizer()
eng_tokenizer.fit_on_texts(english_sentences)
eng_sequences = eng_tokenizer.texts_to_sequences(english_sentences)
# +1 because word indices start at 1; index 0 is used for padding.
eng_vocab_size = len(eng_tokenizer.word_index) + 1

# Tokenize French (with the start/end markers included in the vocabulary)
tagged_french_sentences = [
    f"{START_TOKEN} {sentence} {END_TOKEN}" for sentence in french_sentences
]
fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(tagged_french_sentences)
fr_sequences = fr_tokenizer.texts_to_sequences(tagged_french_sentences)
fr_vocab_size = len(fr_tokenizer.word_index) + 1

# Teacher forcing: at every step the decoder reads token t and must predict
# token t+1, so the target is the input shifted left by one position.
#   decoder input : START w1 w2 ... wn
#   decoder target: w1 w2 ... wn END
decoder_input_sequences = [seq[:-1] for seq in fr_sequences]
decoder_target_sequences = [seq[1:] for seq in fr_sequences]

# Pad sequences
max_eng_len = max(len(seq) for seq in eng_sequences)
# Number of decoder time steps (= longest target including its END marker).
max_fr_len = max(len(seq) for seq in decoder_input_sequences)
eng_sequences = pad_sequences(eng_sequences, maxlen=max_eng_len, padding='post')
# Full tagged sequences (START ... END), padded; kept for inspection.
fr_sequences = pad_sequences(fr_sequences, maxlen=max_fr_len + 1, padding='post')
decoder_input_data = pad_sequences(
    decoder_input_sequences, maxlen=max_fr_len, padding='post'
)

# Model parameters
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(max_eng_len,))
# 256-dimensional embedding vectors; index 0 (padding) is masked out.
enc_emb = Embedding(eng_vocab_size, latent_dim, mask_zero=True)(encoder_inputs)
# LSTM-based encoder; only its final states are needed, not the per-step outputs.
encoder_lstm = LSTM(latent_dim, return_state=True)
# Run the embedded source sentence through the encoder.
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# The final hidden/cell states are what the decoder is initialised with.
encoder_states = [state_h, state_c]

# Decoder
# The time dimension is left unspecified so the same input can carry a full
# teacher-forcing sequence during training and a single token per step during
# inference.
decoder_inputs = Input(shape=(None,))
# Keep a handle on the embedding LAYER so the inference decoder reuses the
# trained weights instead of creating a fresh, untrained embedding.
decoder_embedding = Embedding(fr_vocab_size, latent_dim, mask_zero=True)
dec_emb = decoder_embedding(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(fr_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Seq2Seq Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Prepare target sequences (shifted by one step relative to the decoder input)
decoder_target_data = np.expand_dims(
    pad_sequences(decoder_target_sequences, maxlen=max_fr_len, padding='post'),
    -1,
)

# Train model
model.fit(
    [eng_sequences, decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=50,
)

# Inference Models
# Encoder: source sentence -> initial decoder states.
encoder_model = Model(encoder_inputs, encoder_states)
# Decoder: (previous token, previous states) -> (next-token distribution, new states).
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Reuse the trained embedding, LSTM and Dense layers.
decoder_emb2 = decoder_embedding(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(decoder_emb2, initial_state=decoder_states_inputs)
decoder_states = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states)


# Translation function
def translate_sentence(input_sentence):
    """Greedily translate an English sentence into French.

    The sentence is encoded with ``encoder_model``; the decoder is then run one
    token at a time, starting from ``START_TOKEN`` and feeding each predicted
    token back in, until ``END_TOKEN`` is produced or ``max_fr_len`` tokens
    have been generated.

    Args:
        input_sentence: English text. Words that ``eng_tokenizer`` has not
            seen are dropped by ``texts_to_sequences``.

    Returns:
        The predicted French words joined by single spaces, or an empty string
        when the input contains no known English word.
    """
    input_seq = eng_tokenizer.texts_to_sequences([input_sentence])
    if not input_seq[0]:
        # Nothing the encoder knows about: there is nothing to translate.
        return ""
    input_seq = pad_sequences(input_seq, maxlen=max_eng_len, padding='post')
    states_value = list(encoder_model.predict(input_seq))

    # Decoding always begins from the start marker, not from a real word.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = fr_tokenizer.word_index[START_TOKEN]
    output_words = []

    for _ in range(max_fr_len):
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = fr_tokenizer.index_word.get(sampled_token_index, "")
        # Stop on the end marker; index 0 is padding and maps to no word, so
        # it is treated as end-of-sentence as well.
        if sampled_word == END_TOKEN or sampled_word == "":
            break
        output_words.append(sampled_word)
        # Feed the prediction and the updated states into the next step.
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return " ".join(output_words)


print(translate_sentence("I love NLP"))


Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - loss: 1.9465
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - loss: 1.9117
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - loss: 1.8757
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - loss: 1.8368
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - loss: 1.7932
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 1.7431
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 1.6846
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - loss: 1.6159
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - loss: 1.5355
Epoch 10/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - loss: 1.4427
Epoch 11/50
[1m1/1[0m

In [None]:
# Print the built-in help for the fitted English tokenizer to inspect the
# Tokenizer API (constructor defaults such as `filters`/`lower`, and methods).
help(eng_tokenizer)

Help on Tokenizer in module keras.src.legacy.preprocessing.text object:

class Tokenizer(builtins.object)
 |  Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, analyzer=None, **kwargs)
 |  
 |  DEPRECATED.
 |  
 |  Methods defined here:
 |  
 |  __init__(self, num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, analyzer=None, **kwargs)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  fit_on_sequences(self, sequences)
 |  
 |  fit_on_texts(self, texts)
 |  
 |  get_config(self)
 |  
 |  sequences_to_matrix(self, sequences, mode='binary')
 |  
 |  sequences_to_texts(self, sequences)
 |  
 |  sequences_to_texts_generator(self, sequences)
 |  
 |  texts_to_matrix(self, texts, mode='binary')
 |  
 |  texts_to_sequences(self, texts)
 |  
 |  texts_to_sequences_generator(self, texts)
 |  
 |  to_json(self, **kwargs