In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd

# Load the English-Bengali word/sentence pairs from the CSV file.
# The first column of the file is a saved row index (left alone it is read
# back as a data column named "Unnamed: 0"), so use it as the DataFrame index.
df = pd.read_csv('thedataset.csv', index_col=0)

In [3]:
# Peek at the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Eng,Beng
0,the,দ্য
1,and,এবং
2,have,আছে
3,that,যে
4,for,জন্য


In [4]:
# Extract the two parallel text columns as plain Python lists
# (row i of one list corresponds to row i of the other).
english_sentences, bengali_sentences = (
    df[column].tolist() for column in ("Eng", "Beng")
)

In [5]:
# Build a separate word-level vocabulary for each language and encode every
# sentence as a list of integer word ids.
tokenizer_eng = Tokenizer()
tokenizer_eng.fit_on_texts(english_sentences)
eng_seq = tokenizer_eng.texts_to_sequences(english_sentences)

tokenizer_beng = Tokenizer()
tokenizer_beng.fit_on_texts(bengali_sentences)
beng_seq = tokenizer_beng.texts_to_sequences(bengali_sentences)

# Vocabulary sizes are one larger than the number of known words
# (presumably so that id 0 stays free for padding -- confirm against the
# Keras Tokenizer documentation).
vocab_size_eng, vocab_size_beng = (
    len(tok.word_index) + 1 for tok in (tokenizer_eng, tokenizer_beng)
)

# Pad both languages at the end ('post') up to the length of the longest
# sequence found on either side, so encoder and decoder share one length.
max_length = max(map(len, eng_seq + beng_seq))
eng_seq_padded, beng_seq_padded = (
    pad_sequences(ids, maxlen=max_length, padding='post')
    for ids in (eng_seq, beng_seq)
)

In [6]:
# Hyperparameters: width of the word-embedding vectors and number of LSTM units
# (both are shared by the encoder and the decoder below).
embedding_dim = 256
units = 512

# Encoder: embeds the padded English ids and runs them through an LSTM.
# return_state=True makes the layer also return its final hidden (h) and
# cell (c) states; only those states are handed on to the decoder.
# NOTE(review): the Embedding layers do not set mask_zero, so padded positions
# appear to be treated like real tokens (and counted in loss/accuracy) --
# confirm this is intended.
encoder_inputs = Input(shape=(max_length,))
enc_emb = Embedding(input_dim=vocab_size_eng, output_dim=embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(units, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

# Decoder: an LSTM initialised with the encoder's final states.
# return_sequences=True yields one output per time step, and the Dense softmax
# turns each step into a distribution over the Bengali vocabulary.
# NOTE(review): this embedding is sized for Bengali ids (vocab_size_beng), but
# the training cell feeds the English sequence as decoder input. If the English
# vocabulary is larger, those ids fall outside this embedding's range -- verify.
decoder_inputs = Input(shape=(max_length,))
dec_emb_layer = Embedding(input_dim=vocab_size_beng, output_dim=embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_beng, activation='softmax')
output = decoder_dense(decoder_outputs)

# Training model: takes [encoder input, decoder input] and predicts a Bengali
# word distribution for every output position.
model = Model([encoder_inputs, decoder_inputs], output)

# The sparse loss is used because the targets are integer word ids rather than
# one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [7]:
# Hold out 20% of the pairs for validation. The fixed random_state makes the
# split (and therefore the reported validation metrics) reproducible under
# Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(
    eng_seq_padded, beng_seq_padded, test_size=0.2, random_state=42
)

# NOTE(review): the padded English ids are passed as BOTH the encoder input and
# the decoder input. A teacher-forced seq2seq decoder normally receives the
# target (Bengali) sequence shifted right by one step. Changing that here also
# requires translate_sentence() to decode step by step, so it is only flagged.
# TODO: switch both together.
# Keep the History object so the loss/accuracy curves can be inspected later
# instead of leaving its repr as the cell output.
history = model.fit(
    [X_train, X_train],
    y_train,
    validation_data=([X_val, X_val], y_val),
    epochs=10,
    batch_size=64,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2583a909890>

In [8]:
def translate_sentence(sentence):
    """Translate an English sentence to Bengali with the trained model.

    The sentence is tokenized with ``tokenizer_eng``, padded at the end to
    ``max_length`` and passed to ``model`` as both encoder and decoder input
    (the same way the model is fed during training in this notebook). The most
    probable Bengali id at every output position is then mapped back to a word
    through ``tokenizer_beng.index_word``.

    Args:
        sentence: English text to translate.

    Returns:
        The predicted Bengali words joined by single spaces. Positions whose
        predicted id has no entry in the Bengali vocabulary (such as the
        padding id 0) are skipped, so the result carries no runs of padding
        whitespace. An empty string is returned when no known word is
        predicted.
    """
    seq = tokenizer_eng.texts_to_sequences([sentence])
    padded = pad_sequences(seq, maxlen=max_length, padding='post')

    # predict() returns one probability vector per output position; keep the
    # arg-max id of each position for the single sentence in the batch.
    token_ids = np.argmax(model.predict([padded, padded]), axis=-1)[0]

    # Drop ids that are not real words instead of emitting a ' ' placeholder
    # for each of them, which used to leave long trailing/double spaces.
    index_word = tokenizer_beng.index_word
    words = [index_word[int(i)] for i in token_ids if int(i) in index_word]
    return ' '.join(words)

In [14]:
# Interactive demo: read one English sentence, translate it, and echo both
# the original and the model's output.
input_sentence = input('Enter your sentence:')
translated_sentence = translate_sentence(input_sentence)
print(
    f"Input: {input_sentence}",
    f"Translated: {translated_sentence}",
    sep="\n",
)

Enter your sentence:i hate you
Input: i hate you
Translated: আমি আমার আপনি করি।                                
