In [1]:
# Make imports
!pip install datasets
import numpy as np
import re
import pickle
import os
import seaborn as sns
import string
import tensorflow as tf
from datasets import load_dataset

# TPU settings
%tensorflow_version 2.x



def preprocess(text):
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    text = text.lower()
    text = re.sub(r'\d', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text

# Specify the root directory
dataset_root = "/content/sample_data"

# Load the IITB English-Hindi dataset from the datasets library
dataset = load_dataset("cfilt/iitb-english-hindi")

# Get translation pairs
translation_pairs = dataset["train"]["translation"]

# Extract English and Hindi sentences from translation pairs
english_sentences = [preprocess(pair["en"]) for pair in translation_pairs]
hindi_sentences = ['<START> ' + re.sub('[a-zA-Z]', '', preprocess(pair["hi"])) + ' <END>' for pair in translation_pairs]

# Remove duplicate sentences
english_unique = set()
english_sentences_temp = []
hindi_sentences_temp = []
l = len(english_sentences)
for i in range(l):
    if english_sentences[i] not in english_unique:
        english_unique.add(english_sentences[i])
        english_sentences_temp.append(english_sentences[i])
        hindi_sentences_temp.append(hindi_sentences[i])

english_sentences = english_sentences_temp
hindi_sentences = hindi_sentences_temp

os.makedirs(os.path.join(dataset_root, "parallel"), exist_ok=True)

# Save preprocessed data using pickle
with open(os.path.join(dataset_root, "parallel", "preprocessed_data.pickle"), 'wb') as f:
    pickle.dump((english_sentences, hindi_sentences), f)



Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

Downloading readme:   0%|          | 0.00/3.11k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading metadata:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/190M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/85.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/500k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/520 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2507 [00:00<?, ? examples/s]

English Vocab Size: 8020
Hindi Vocab Size: 9395
(23750, 10) (23750, 10) (23750, 10)




Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 256)    2053120     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 256)    2405120     ['input_2[0][0]']                
                                                                                              

<keras.callbacks.History at 0x7bd09d0e8e50>

In [7]:
# Some parameters
vocab_size = 10000
total_sentences = 25000
maxlen = 10
epochs = 50
validation_split = 0.05

en_data = []
hi_data = []
cnt = 0

for (en, hi) in zip(english_sentences, hindi_sentences):
    l = min(len(en.split()), len(hi.split()))
    if l <= maxlen:
        en_data.append(en)
        hi_data.append(hi)
        cnt += 1
    if cnt == total_sentences:
        break

# Tokenize the texts and convert to sequences
en_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
en_tokenizer.fit_on_texts(en_data)
en_sequences = en_tokenizer.texts_to_sequences(en_data)

hi_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>', lower=False)
hi_tokenizer.fit_on_texts(hi_data)
hi_sequences = hi_tokenizer.texts_to_sequences(hi_data)

english_vocab_size = len(en_tokenizer.word_index) + 1
hindi_vocab_size = len(hi_tokenizer.word_index) + 1

print("English Vocab Size:", english_vocab_size)
print("Hindi Vocab Size:", hindi_vocab_size)

# Prepare encoder data
encoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(en_sequences, maxlen=maxlen, padding='post')

# Prepare decoder data
decoder_inputs = []
decoder_outputs = []

for hi in hi_sequences:
    decoder_inputs.append(hi[:-1])
    decoder_outputs.append(hi[1:])

decoder_inputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_inputs, maxlen=maxlen, padding='post')
decoder_outputs = tf.keras.preprocessing.sequence.pad_sequences(decoder_outputs, maxlen=maxlen, padding='post')

# Training and Testing split (95%, 5%)
split = int(0.95 * total_sentences)

X_train = [encoder_inputs[:split], decoder_inputs[:split]]
y_train = decoder_outputs[:split]

X_test = en_data[:split]
y_test = hi_data[:split]

print(X_train[0].shape, X_train[1].shape, y_train.shape)

# Define LSTM model
d_model = 256

# Encoder
inputs = tf.keras.layers.Input(shape=(None,))
x = tf.keras.layers.Embedding(english_vocab_size, d_model, mask_zero=True)(inputs)
_, state_h, state_c = tf.keras.layers.LSTM(d_model, activation='relu', return_state=True)(x)

# Decoder
targets = tf.keras.layers.Input(shape=(None,))
embedding_layer = tf.keras.layers.Embedding(hindi_vocab_size, d_model, mask_zero=True)
x = embedding_layer(targets)
decoder_lstm = tf.keras.layers.LSTM(d_model, activation='relu', return_sequences=True, return_state=True)
x, _, _ = decoder_lstm(x, initial_state=[state_h, state_c])
dense1 = tf.keras.layers.Dense(hindi_vocab_size, activation='softmax')
x = dense1(x)

model = tf.keras.models.Model(inputs=[inputs, targets], outputs=x)
model.summary()

loss = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer='rmsprop', loss=loss, metrics=['accuracy'])

# Define the ModelCheckpoint callback
save_model_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='/content/sample_data/hinglush_model/en-hi.h5',
    monitor='val_accuracy',
    mode='max'
)

# Train the model
model.fit(X_train, y_train, epochs=epochs, validation_split=validation_split, callbacks=[save_model_callback, tf.keras.callbacks.TerminateOnNaN()])


English Vocab Size: 8020
Hindi Vocab Size: 9395




(23750, 10) (23750, 10) (23750, 10)
Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_8 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, None, 256)    2053120     ['input_7[0][0]']                
                                                                                                  
 embedding_5 (Embedding)        (None, None, 256)    2405120     ['input_8[0][0]']                
                                                        

<keras.callbacks.History at 0x7bd09d30c670>

In [10]:
# Print the summary of the loaded model
saved_model.summary()


Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_7 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_8 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, None, 256)    2053120     ['input_7[0][0]']                
                                                                                                  
 embedding_5 (Embedding)        (None, None, 256)    2405120     ['input_8[0][0]']                
                                                                                            

In [11]:
# Retrieve specific layers from the loaded model
inputs = saved_model.input[0]  # input_7
_, state_h, state_c = saved_model.get_layer('lstm_4').output  # lstm_4
targets = saved_model.input[1]  # input_8
embedding_layer = saved_model.get_layer('embedding_4')  # embedding_4
decoder_lstm = saved_model.get_layer('lstm_5')  # lstm_5
dense1 = saved_model.get_layer('dense_2')  # dense_2


In [12]:
#Inference Model

#Encoder
encoder = tf.keras.models.Model(inputs, [state_h, state_c])

#Decoder
decoder_input_h = tf.keras.layers.Input(shape=(d_model,))
decoder_input_c = tf.keras.layers.Input(shape=(d_model,))
x = embedding_layer(targets)
decoder_output, decoder_output_h, decoder_output_c = decoder_lstm(x, initial_state=[decoder_input_h, decoder_input_c])
x = dense1(decoder_output)
decoder = tf.keras.models.Model([targets, decoder_input_h, decoder_input_c],
                                [x, decoder_output_h, decoder_output_c])

def predict_sentence(en_input):
    input_seq = en_tokenizer.texts_to_sequences([en_input])

    next_h, next_c = encoder.predict(input_seq)

    curr_token = np.zeros((1, 1))
    curr_token[0, 0] = hi_tokenizer.word_index['<START>']

    pred_sentence = ''

    for i in range(maxlen):
        output, next_h, next_c = decoder.predict([curr_token, next_h, next_c])
        next_token = np.argmax(output[0, 0, :])
        next_word = hi_tokenizer.index_word[next_token]
        if next_word == '<END>':
            break
        else:
            pred_sentence += ' ' + next_word
            curr_token[0, 0] = next_token

    return pred_sentence

# Example of calling predict_sentence function
english_input = ["Definitely share your feedback in the comment section.",
"So even if it's a big video, I will clearly mention all the products.",
"I was waiting for my bag"]
for i in english_input:
  predicted_translation = predict_sentence(i)
  print("English Input:", i)
  print("Predicted Hindi Translation:", predicted_translation)


English Input: Definitely share your feedback in the comment section.
Predicted Hindi Translation:  करने को का का का का को को लिये चाहिए
English Input: So even if it's a big video, I will clearly mention all the products.
Predicted Hindi Translation:  कर और से को से को से को को के
English Input: I was waiting for my bag
Predicted Hindi Translation:  करने के ड्राइव के के के के लिये लिये लिये
