# English to Bengali Translator


# In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding

# Sample dataset
english_sentences = ["Hello", "How are you?", "What is your name?"]
bengali_sentences = ["হ্যালো", "আপনি কেমন আছেন?", "আপনার নাম কি?"]

# Wrap every target sentence in explicit boundary markers. The decoder is
# primed with '<start>' at inference time and decoding stops on '<end>', so
# both tokens must be part of the Bengali vocabulary and of the training data.
bengali_marked = [f"<start> {sentence} <end>" for sentence in bengali_sentences]

# Tokenization
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(english_sentences)
eng_vocab_size = len(tokenizer_eng.word_index) + 1  # +1: index 0 is reserved for padding

# The default Tokenizer filter list strips '<' and '>', which would turn the
# markers into the plain words 'start'/'end'. Use the default list minus those
# two characters so the keys in word_index are exactly '<start>' and '<end>'.
tokenizer_ben = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
tokenizer_ben.fit_on_texts(bengali_marked)
ben_vocab_size = len(tokenizer_ben.word_index) + 1  # +1: index 0 is reserved for padding

# Convert text to sequences
eng_sequences = tokenizer_eng.texts_to_sequences(english_sentences)
ben_sequences = tokenizer_ben.texts_to_sequences(bengali_marked)

# Pad sequences
# The encoder input keeps the default left-padding, so the real words are the
# last thing read before the final state is taken.
eng_padded = pad_sequences(eng_sequences)
# The decoder sequences are right-padded so that '<start>' is always at
# position 0, matching how decoding begins at inference time.
ben_padded = pad_sequences(ben_sequences, padding='post')

# Model architecture
embedding_dim = 256
units = 512

# Encoder: embed the English token ids, run them through an LSTM and keep only
# the final hidden and cell states, which summarise the source sentence.
encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(
    input_dim=eng_vocab_size,
    output_dim=embedding_dim,
)(encoder_inputs)
encoder_lstm = LSTM(units=units, return_state=True)
encoder_outputs, *encoder_states = encoder_lstm(encoder_embedding)
state_h, state_c = encoder_states

# Decoder: an LSTM initialised with the encoder states that emits, for every
# time step, a softmax distribution over the Bengali vocabulary.
decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(input_dim=ben_vocab_size, output_dim=embedding_dim)
decoder_lstm = LSTM(units=units, return_sequences=True, return_state=True)
decoder_embedded = decoder_embedding(decoder_inputs)
decoder_outputs, _, _ = decoder_lstm(decoder_embedded, initial_state=encoder_states)
decoder_dense = Dense(units=ben_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Model: maps (source ids, shifted target ids) to per-step token probabilities.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

# Compile the model
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model with teacher forcing: the decoder is fed the target sequence
# without its last token and must predict the same sequence shifted one step
# to the left. A trailing axis is added to the integer targets.
decoder_input_data = ben_padded[:, :-1]
decoder_target_data = ben_padded[:, 1:][..., np.newaxis]
model.fit(x=[eng_padded, decoder_input_data], y=decoder_target_data, epochs=50)

# Function to translate English to Bengali

# Cache for the inference-time models; built on first use by
# _get_inference_models() and reset whenever this code is re-executed.
_inference_models = None


def _get_inference_models():
    """Return ``(encoder_model, decoder_model)`` built from the trained layers.

    Keras layers have no ``predict`` method, only ``Model`` objects do, so the
    trained layers are wired into two small models that share their weights
    with the training model:

    * the encoder model maps source token ids to the final LSTM states;
    * the decoder model maps (token ids, state_h, state_c) to
      (token probabilities, new state_h, new state_c).
    """
    global _inference_models
    if _inference_models is None:
        encoder_model = Model(encoder_inputs, encoder_states)

        # Explicit state inputs let the caller feed the states back in at
        # every decoding step instead of always starting from the encoder's.
        state_h_in = Input(shape=(decoder_lstm.units,))
        state_c_in = Input(shape=(decoder_lstm.units,))
        step_outputs, step_h, step_c = decoder_lstm(
            decoder_embedding(decoder_inputs),
            initial_state=[state_h_in, state_c_in],
        )
        decoder_model = Model(
            [decoder_inputs, state_h_in, state_c_in],
            [decoder_dense(step_outputs), step_h, step_c],
        )
        _inference_models = (encoder_model, decoder_model)
    return _inference_models


def translate(input_text, max_length=20):
    """Greedily decode a Bengali translation of ``input_text``.

    Args:
        input_text: English sentence to translate.
        max_length: Upper bound on the number of decoding steps, so decoding
            terminates even if the model never predicts the end marker.

    Returns:
        The predicted Bengali words joined by single spaces. An empty string
        is returned when no word of ``input_text`` is in the English
        vocabulary.

    Raises:
        KeyError: If the Bengali tokenizer's vocabulary does not contain the
            '<start>' and '<end>' markers.
    """
    input_seq = tokenizer_eng.texts_to_sequences([input_text])
    if not input_seq[0]:
        # Nothing the encoder knows about: an empty sequence cannot be fed
        # through the LSTM, so there is nothing to translate.
        return ''
    input_padded = pad_sequences(input_seq)

    start_index = tokenizer_ben.word_index['<start>']
    end_index = tokenizer_ben.word_index['<end>']

    encoder_model, decoder_model = _get_inference_models()
    states = list(encoder_model.predict(input_padded, verbose=0))

    # Decoding starts from the start marker and feeds each prediction back in.
    target_seq = np.array([[start_index]])
    words = []

    for _ in range(max_length):
        output_tokens, h, c = decoder_model.predict([target_seq] + states, verbose=0)
        predicted_token = int(np.argmax(output_tokens[0, -1, :]))

        # Index 0 is the padding id and has no entry in index_word; like the
        # end marker, it means there is nothing more to emit.
        if predicted_token == end_index or predicted_token == 0:
            break

        words.append(tokenizer_ben.index_word[predicted_token])

        target_seq = np.array([[predicted_token]])
        states = [h, c]

    return ' '.join(words)

# Example usage: translate one sample sentence and print it next to its source.
input_text = "Hello"
output_text = translate(input_text)
print("{} (English) -> {} (Bengali)".format(input_text, output_text))
