### Import Libraries

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
import string
from string import digits
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input , LSTM, Embedding, Dense
from keras.models import Model

### Load the dataset

In [2]:
lines = pd.read_csv(r"C:\Users\SALOME\Downloads\Requirments (11)\Hindi_English_Truncated_Corpus.csv", encoding ='utf-8')
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


## Explanatory Data Analysis

In [3]:
lines.shape

(127607, 3)

In [4]:
lines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127607 entries, 0 to 127606
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   source            127607 non-null  object
 1   english_sentence  127605 non-null  object
 2   hindi_sentence    127607 non-null  object
dtypes: object(3)
memory usage: 2.9+ MB


### Check for missing values

In [5]:
lines.isnull().sum()

source              0
english_sentence    2
hindi_sentence      0
dtype: int64

### Check for duplicates 

In [6]:
lines.duplicated().sum()

2778

### Handle missing values and duplicates

In [None]:
lines = lines[lines['source'] == 'ted']
lines = lines[~pd.isnull(lines['english_sentence'])]
lines.drop_duplicates(inplace=True)

In [8]:
# Pick any 25000 rows from the dataset
lines = lines.sample(n=25000, random_state=42)
lines.shape

(25000, 3)

### Text Preprocessing

In [9]:
# Lower case all characters in the dataset for simplicity
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: x.lower())
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: x.lower())

In [10]:
# Remove quotes from the data
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: re.sub("'", '', x))
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub("'", '', x))

In [11]:
# Remove special characters
exclude = set(string.punctuation) #set of all special characters

lines['english_sentence'] = lines['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [12]:
# Remove numbers and extra spaces
remove_digits = str.maketrans('', '', digits)
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: x.translate(remove_digits))
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: x.translate(remove_digits))

lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub("२३०८१५७९४६", "", x))

# Remove extra spaces
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: x.strip())
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: x.strip())
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))

lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: 'START_ ' + x + '_END')

In [13]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence
82040,ted,we still dont know who her parents are who she is,START_ हम अभी तक नहीं जानते हैं कि उसके मातापि...
85038,ted,no keyboard,START_ कोई कुंजीपटल नहीं_END
58018,ted,but as far as being a performer,START_ लेकिन एक कलाकार होने के साथ_END
74470,ted,and this particular balloon,START_ और यह खास गुब्बारा_END
122330,ted,and its not as hard as you think integrate cli...,START_ और जितना आपको लगता है यह उतना कठिन नहीं...


### Create Vocabularies of unique English and Hindi words in sentences

In [14]:
# Initialize sets to store unique words and populate them
all_eng_words = set()
for eng in lines['english_sentence']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_hindi_words = set()
for hin in lines['hindi_sentence']:
    for word in hin.split():
        if word not in all_hindi_words:
            all_hindi_words.add(word)
            
# Calculate length of each English and Hindi sentence
lines['length_eng_sentence'] = lines['english_sentence'].apply(lambda x:len(x.split(" ")))
lines['length_hindi_sentence'] = lines['hindi_sentence'].apply(lambda x:len(x.split(" ")))

#### We filter out sentences that are longer than 20 words to simplify the training process and ensure that the model can handle the input within reasonable computational limits

In [15]:
# Filter out Sentences that are too long
lines = lines[lines['length_eng_sentence'] <=20]
lines = lines[lines['length_hindi_sentence'] <=20]

# Check maximum length of remaining sentences
max_length_src = max(lines['length_hindi_sentence'])
max_length_tar = max(lines['length_eng_sentence'])

In [16]:
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hindi_sentence
82040,ted,we still dont know who her parents are who she is,START_ हम अभी तक नहीं जानते हैं कि उसके मातापि...,11,15
85038,ted,no keyboard,START_ कोई कुंजीपटल नहीं_END,2,4
58018,ted,but as far as being a performer,START_ लेकिन एक कलाकार होने के साथ_END,7,7
74470,ted,and this particular balloon,START_ और यह खास गुब्बारा_END,4,5
122330,ted,and its not as hard as you think integrate cli...,START_ और जितना आपको लगता है यह उतना कठिन नहीं...,16,19


## Concept of Encoding and Decoding in Machine Learning
### Encoding
Purpose: Convert human-readable text into a numerical format that a machine learning model can process.
##### Process:
##### Tokenization: Break down sentences into individual words or tokens.
##### Indexing: Assign each unique word a unique number (index). 
##### Mapping: Create dictionaries to map words to their corresponding indices.
##### Padding: Ensure all sequences (sentences) are the same length by adding special tokens (e.g., zero-padding) where necessary.

### Decoding
Purpose: Convert the numerical output of a machine learning model back into human-readable text.
##### Process:
##### Reverse Mapping: Use the dictionaries created during encoding to map indices back to their corresponding words.
##### Sequence Generation: Form sentences by combining the words obtained from the reverse mapping.

#### In this context:
##### Encoder: The part of the model that processes the input sentence (e.g., English sentence) and converts it into a numerical format (encoded representation).
##### Decoder: The part of the model that takes the encoded representation and generates the output sentence (e.g., Hindi sentence) in numerical format, which is then converted back to text.


In [None]:
# Sort list of all unique English and Hindi words
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))

# Calculate the number of unique tokens (words) in each language
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_hindi_words)

# Ensure number of encoder tokens matches that of decoder tokens
num_encoder_tokens = num_decoder_tokens
num_decoder_tokens += 1 #for zero padding

# Create a dictionary that maps each English word to a unique index
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])

# Create a dictionary that maps each Hindi word to a unique index
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

# Create a reverse dictionary that maps each index back to the corresponding English word
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())

# Create a reverse dictionary that maps each index back to the corresponding Hindi word
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

# Shuffle the dataset to ensure randomness
lines = shuffle(lines)

## Training model to translate English to Hindi

In [26]:
# Split the dataset to training and testing sets
X, y = lines['english_sentence'], lines['hindi_sentence']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

# Save Data to Pickle files
X_train.to_pickle('X_train.pk1')
X_test.to_pickle('X_test.pk1')

This function prepares batches of data for training by encoding input and output sequences into numerical formats (encoder_input_data, decoder_input_data, decoder_target_data). It handles tokenization, indexing, and one-hot encoding necessary for training a sequence-to-sequence model, ensuring each batch is ready for consumption by the neural network.

The use of a generator allows for memory-efficient processing, especially useful for handling large datasets in machine learning tasks.

In [29]:
def generate_batch (X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''''
    while True:
        for j in range (0, len(x), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src), dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_tar), dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens), dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] #decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

# Define the dimensionality of the latent space
latent_dim = 300

# Define encoder input layer
encoder_inputs = Input(shape=(None,))

# Embedding layer for tokenizing and converting input sequences into dense vectors
enc_emb = Embedding(num_encoder_tokens, latent_dim, mask_zero=True)(encoder_inputs)

# LSTM layer in the encoder to process input sequences and return states
encoder_lstm = LSTM(latent_dim, return_state=True)

# Get encoder outputs (sequence), final hidden state, and final cell state
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Keep only the states (final hidden and cell states) for later use in decoding
encoder_states = [state_h, state_c]

# Define decoder input layer
decoder_inputs = Input(shape=(None,))

# Embedding layer for tokenizing and converting decoder input sequences
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)

# LSTM layer in the decoder to generate output sequences and return states
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)

# Get decoder outputs (sequences), hidden state, and cell state using encoder states as initial states
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Dense layer to predict probabilities over the target vocabulary
decoder_dense = Dense(num_decoder_tokens, activation='softmax')

decoder_outputs = decoder_dense(decoder_outputs)

# Define the entire model that inputs encoder and decoder inputs and outputs decoder outputs
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model with RMSprop optimizer and categorical crossentropy loss function
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# Print model summary to show its architecture and parameters
model.summary()

# Calculate number of samples in training and validation sets
train_samples = len(X_train)
val_samples = len(X_test)

# Define batch size and number of epochs for training
batch_size = 128
epochs = 100     

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 input_6 (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding_4 (Embedding)     (None, None, 300)            5935800   ['input_5[0][0]']             
                                                                                                  
 embedding_5 (Embedding)     (None, None, 300)            5936100   ['input_6[0][0]']             
                                                                                            

In [None]:
# Train the model using a generator function for batches of data
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size, # Number of batches per epoch
                    epochs=epochs, # Number of training epochs
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)

# Save the weights of the trained model to a file
model.save_weights('nmt_weights.h5')

  model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
 13/155 [=>............................] - ETA: 54:23 - loss: nan

### Final Decoder Model Setup

In [None]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)
    
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 50):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence

train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1

## Use Case
Demonstrates evaluation of a sequence-to-sequence model trained for translating English sentences into Hindi

In [None]:
k+=1
(input_seq, actual_output), _ = next(train_gen)
decoded_sentence = decode_sequence(input_seq)
print('Input English sentence:', X_train[k:k+1].values[0])
print('Actual Hindi Translation:', y_train[k:k+1].values[0][6:-4])
print('Predicted Hindi Translation:', decoded_sentence[:-4])