In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import torch

In [None]:
!pip install -q kaggle

In [None]:
# Download the Hindi-English truncated corpus archive from Kaggle into the
# current working directory.
# NOTE(review): presumably requires Kaggle API credentials to be configured
# beforehand -- confirm for fresh environments.
!kaggle datasets download -d abdeltawabali/hindi-english-truncated-corpus-csv

Dataset URL: https://www.kaggle.com/datasets/abdeltawabali/hindi-english-truncated-corpus-csv
License(s): unknown
Downloading hindi-english-truncated-corpus-csv.zip to /content
 57% 8.00M/13.9M [00:00<00:00, 81.3MB/s]
100% 13.9M/13.9M [00:00<00:00, 95.5MB/s]


In [None]:
!unzip hindi-english-truncated-corpus-csv.zip

Archive:  hindi-english-truncated-corpus-csv.zip
  inflating: Hindi_English_Truncated_Corpus.csv  


In [None]:
# Load the extracted parallel corpus and preview its first rows.
corpus_path = "Hindi_English_Truncated_Corpus.csv"
data = pd.read_csv(corpus_path)
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [None]:
# Count missing values per column to decide whether any rows must be dropped.
data.isna().sum()

Unnamed: 0,0
source,0
english_sentence,2
hindi_sentence,0


In [None]:
# Drop every row that has a missing value in any column. The result is rebound to
# `data` instead of using inplace=True, which keeps the step chainable and avoids
# mutating a frame that an earlier cell has already displayed.
data = data.dropna()

In [None]:
import random

# Single seed shared by Python's RNG and the pandas sampling below.
SEED = 42
# Fraction of the cleaned corpus kept for training.
SAMPLE_FRACTION = 0.1

random.seed(SEED)

# Draw a reproducible subset of the rows and preview it.
sample_size = int(len(data) * SAMPLE_FRACTION)
sampled_data = data.sample(n=sample_size, random_state=SEED)
sampled_data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
3556,tides,He declares the result and reports it to the E...,वही परिणाम की घोषणा करता है और निर्वाचन आयोग क...
25899,ted,was a little uncomfortable for them.,थोडा कठिन था।
90924,tides,"A multi-purpose auditorium , a branch of the S...","बहुउद्देशीय सभागार , भारतीय स्टेट बैंक की शाखा..."
78213,tides,No fees is to be paid for filing the appeal to...,अधिकरण में अपील करने के लिए कोई फीस नहीं देनी ...
96955,indic2012,headind kaun banega crorepati,शीर्षक कौन बनेगा करोड़पति (Kaun Banega Crorepa...


In [None]:
# Work on an independent copy so the cleaning below does not silently modify
# `sampled_data` (plain assignment would only create a second name for the same frame).
data = sampled_data.copy()

# English: lowercase, then keep only ASCII letters and whitespace.
data['english_sentence'] = (
    data['english_sentence']
    .str.lower()
    .str.replace(r'[^a-zA-Z\s]', '', regex=True)
)

# Hindi: keep Devanagari letters AND their combining marks (vowel signs, virama,
# nukta, anusvara, ...), plus whitespace. A letters-only range such as [अ-ह] strips
# every combining mark and turns e.g. "परिणाम" into "परणम", destroying the words.
# U+0964-U+0970 (danda, double danda, Devanagari digits, abbreviation sign) are
# deliberately left out so punctuation and digits are removed, mirroring the
# English cleaning above.
data['hindi_sentence'] = data['hindi_sentence'].str.replace(
    r'[^\u0900-\u0963\u0971-\u097F\s]', '', regex=True
)

In [None]:
# Preview the sampled frame after text cleaning.
data.head()

Unnamed: 0,source,english_sentence,hindi_sentence
3556,tides,he declares the result and reports it to the e...,वह परणम क घषण करत ह और नरवचन आयग क और सबदध सदन...
25899,ted,was a little uncomfortable for them,थड कठन थ
90924,tides,a multipurpose auditorium a branch of the sta...,बहउददशय सभगर भरतय सटट बक क शख एक वशल बकट हल ...
78213,tides,no fees is to be paid for filing the appeal to...,अधकरण म अपल करन क लए कई फस नह दन पडत
96955,indic2012,headind kaun banega crorepati,शरषक कन बनग करडपत


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# One Keras Tokenizer (default settings) per language: fit it on that language's
# column, then convert the same column into lists of integer token ids.
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(data['english_sentence'])
english_sequences = english_tokenizer.texts_to_sequences(data['english_sentence'])

hindi_tokenizer = Tokenizer()
hindi_tokenizer.fit_on_texts(data['hindi_sentence'])
hindi_sequences = hindi_tokenizer.texts_to_sequences(data['hindi_sentence'])

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The longest tokenized sentence in each language determines the padded width.
max_length_eng = max(map(len, english_sequences))
max_length_hin = max(map(len, hindi_sequences))

# Right-pad ('post') every sequence with zeros up to that width.
english_padded = pad_sequences(
    english_sequences,
    maxlen=max_length_eng,
    padding='post',
)
hindi_padded = pad_sequences(
    hindi_sequences,
    maxlen=max_length_hin,
    padding='post',
)

In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, Embedding, Dense

# Model hyperparameters: embedding width and LSTM state size.
embedding_dim = 256
latent_dim = 256

# Vocabulary sizes: one slot per known word plus one extra for index 0,
# which the padded sequences use as filler.
vocab_size_eng = 1 + len(english_tokenizer.word_index)
vocab_size_hin = 1 + len(hindi_tokenizer.word_index)

In [None]:
# Display the padded Hindi sequence length; the decoder Input is sized from it.
max_length_hin

243

In [None]:
# Encoder: embed the padded English token ids and summarise each sentence into the
# final LSTM hidden/cell state, which later initialises the decoder.
encoder_inputs = Input(shape=(max_length_eng,))
# mask_zero=True marks index 0 (the padding value) as "no data" so the LSTM skips
# those timesteps. Without it the final state is computed after running through
# all trailing padding zeros, which washes out the actual sentence content.
encoder_embedding = Embedding(vocab_size_eng, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
# Only the states are used downstream; the per-sentence output is kept for inspection.
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

In [None]:
# Decoder: reads the Hindi sequence without its last token (hence length
# max_length_hin - 1) and predicts a distribution over the Hindi vocabulary
# at every timestep, starting from the encoder's final states.
decoder_inputs = Input(shape=(max_length_hin-1,))
# mask_zero=True propagates a padding mask through the LSTM and Dense layers so
# padded positions are excluded from the loss instead of dominating it (most of
# each padded row is zeros, which otherwise inflates the reported accuracy).
decoder_embedding = Embedding(vocab_size_hin, embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# The decoder's own final states are not needed during training.
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size_hin, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# Training model: (English ids, shifted Hindi ids) -> per-timestep Hindi word probabilities.
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Targets are integer word ids (not one-hot vectors), hence the sparse loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Targets shifted left by one token, with a trailing singleton axis added.
# np.expand_dims requires an explicit `axis`; omitting it raises TypeError.
hindi_padded_reshaped = np.expand_dims(hindi_padded[:, 1:], axis=-1)

In [None]:
# Rebuild the padded Hindi matrix from the token sequences.
max_length_hin = max(len(seq) for seq in hindi_sequences)  # longest tokenized Hindi sentence
hindi_padded = pad_sequences(hindi_sequences, maxlen=max_length_hin, padding='post')

# Expected shape: (num_samples, max_length_hin)
print("Padded Hindi shape:", hindi_padded.shape)

# Teacher-forcing targets: the same sequences shifted left by one position,
# i.e. the first token of every row is dropped.
hindi_padded_reshaped = hindi_padded[:, 1:]
# Expected shape: (num_samples, max_length_hin - 1)
print("Reshaped Hindi shape:", hindi_padded_reshaped.shape)

# Check the shapes instead of describing them with hard-coded numbers in a comment:
# the target width must be exactly one less than the padded width, which is also
# the length the decoder Input was declared with (max_length_hin - 1).
assert hindi_padded.shape == (len(hindi_sequences), max_length_hin)
assert hindi_padded_reshaped.shape == (len(hindi_sequences), max_length_hin - 1)

Padded Hindi shape: (12760, 243)
Reshaped Hindi shape: (12760, 242)


In [None]:
# Display the shape of the shifted target matrix.
hindi_padded_reshaped.shape

(12760, 242)

In [None]:
# Teacher forcing: the decoder is fed each Hindi row without its last token and is
# trained to output the same row without its first token (the next-token targets).
decoder_input_data = hindi_padded[:, :-1]   # drop the last token
decoder_output_data = hindi_padded[:, 1:]   # drop the first token

# The integer targets are given a trailing singleton axis before being passed to fit.
model.fit(
    x=[english_padded, decoder_input_data],
    y=decoder_output_data[..., np.newaxis],
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Epoch 1/10
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3602s[0m 11s/step - accuracy: 0.9183 - loss: 1.7538 - val_accuracy: 0.9422 - val_loss: 0.4202
Epoch 2/10
[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3675s[0m 11s/step - accuracy: 0.9403 - loss: 0.4292 - val_accuracy: 0.9439 - val_loss: 0.4085
Epoch 3/10
[1m165/319[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m26:15[0m 10s/step - accuracy: 0.9418 - loss: 0.4183