# **Machine Translation Using Attention:**

# English to Hindi Translation: 

## **Data Preparation:**

## **Import all libraries:**

In [76]:
import numpy as np 

import pandas as pd
from sklearn.model_selection import train_test_split

import os
import tensorflow as tf

import time

In [77]:
data=pd.read_csv('../input/english-to-hindi-parallel-dataset/newdata.csv')

In [78]:
data.head()

Unnamed: 0.1,Unnamed: 0,english_sentence,hindi_sentence
0,0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [79]:
data=data.drop('Unnamed: 0',axis=1)

In [80]:
data.head()

Unnamed: 0,english_sentence,hindi_sentence
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [81]:
data.describe()

Unnamed: 0,english_sentence,hindi_sentence
count,177604,177606
unique,174076,147327
top,(Laughter),(हँसी)
freq,555,212


In [82]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177606 entries, 0 to 177605
Data columns (total 2 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   english_sentence  177604 non-null  object
 1   hindi_sentence    177606 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB


In [83]:
data=data.dropna()
print(data.shape)

(177604, 2)


In [84]:
n=int(input())

en=data['english_sentence'].values[n]

hi=data['hindi_sentence'].values[n]

print(en)

print(hi)

123
and that is for their children to grow up successful,
और वह है कि उनके बच्चे कामयाब निकलें,


# Cleaning The Data:

In [85]:
import string

sc = list(set(string.punctuation))

In [86]:
hi

'और वह है कि उनके बच्चे कामयाब निकलें,'

In [87]:
data['english_sentence']=data['english_sentence'].apply(lambda x: x.lower())

In [88]:
data.columns

Index(['english_sentence', 'hindi_sentence'], dtype='object')

In [89]:
data['english_sentence']=data['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in sc))
data['hindi_sentence']=data['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in sc))

In [90]:
data['english_sentence']=data['english_sentence'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))
data['hindi_sentence']=data['hindi_sentence'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))

In [91]:
data['english_sentence']=data['english_sentence'].apply(lambda x: '<start> '+x+' <end>')
data['hindi_sentence']=data['hindi_sentence'].apply(lambda x: '<start> '+x+' <end>')

In [92]:
data['length_eng_sentence']=data['english_sentence'].apply(lambda x:len(x.split(" ")))
data['length_hin_sentence']=data['hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [93]:
data.head()

Unnamed: 0,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
0,<start> politicians do not have permission to ...,<start> राजनीतिज्ञों के पास जो कार्य करना चाहि...,14,16
1,<start> id like to tell you about one such chi...,<start> मई आपको ऐसे ही एक बच्चे के बारे में बत...,11,13
2,<start> this percentage is even greater than t...,<start> यह प्रतिशत भारत में हिन्दुओं प्रतिशत स...,12,11
3,<start> what we really mean is that theyre bad...,<start> हम ये नहीं कहना चाहते कि वो ध्यान नहीं...,14,13
4,<start> the ending portion of these vedas is c...,<start> इन्हीं वेदों का अंतिम भाग उपनिषद कहलात...,11,10


# Fliter the values based upon length of sentences:

In [94]:
data=data[data['length_eng_sentence']<=20]
data=data[data['length_hin_sentence']<=20]

In [95]:
data.shape

(109012, 4)

In [96]:
n=int(input())

en=data['english_sentence'].values[n]

hi=data['hindi_sentence'].values[n]

print(en)

print(hi)

143
<start> compilation of bachans great poetry  <end>
<start> बच्चन के श्रेष्ठ कविताओं का संकलन <end>


# Preprocessing the Data

* combine all words
* sort the words based upon frequency 
* assign the ranks of the words based upon frequency
* convert the text sentence into list of tokens
* padding the token's list

In [97]:
from collections import Counter 
def tokenize(lang):
    words=[]
    for i in lang:
        words.extend(i.split())
    s=Counter(words)
    a=list(s.keys())
    b=list(s.values())
    ind=np.argsort(np.array(b))
    word_to_ind={}
    for i in range(len(ind)):
        word_to_ind[a[ind[-(i+1)]]]=i+1
    sequences=[]
    for i in lang:
        sen=[]
        for j in i.split():
            sen.append(word_to_ind[j])
        sequences.append(sen)
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences,padding='post')
    
    return word_to_ind,pad_sequences
    

In [98]:
en_word_to_ind,en_sequences=tokenize(data['english_sentence'].values)
hin_word_to_ind,hin_sequences=tokenize(data['hindi_sentence'].values)

In [100]:
len(en_word_to_ind),len(hin_word_to_ind)

(47181, 53372)

In [101]:
en_sequences.shape,hin_sequences.shape

((109012, 20), (109012, 20))

In [102]:
en_sequences[0].shape

(20,)

# Split The data into train and validation:

In [103]:
x_train, x_val, y_train, y_val = train_test_split(en_sequences,hin_sequences, test_size=0.2)


print(len(x_train), len(y_train), len(x_val), len(y_val))

87209 87209 21803 21803


# shuffle data and use Data Generators:

In [104]:
BUFFER_SIZE = len(x_train)
BATCH_SIZE = 128
steps_per_epoch = len(x_train)//BATCH_SIZE
embedding_dim = 256
units = 512
vocab_inp_size = len(en_word_to_ind)+1
vocab_tar_size = len(hin_word_to_ind)+1

dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

# Attention MOdel:

# Enoder of Attention Model:

In [105]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

In [106]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)


# Attention Layer

In [107]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):

        query_with_time_axis = tf.expand_dims(query, 1)

        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis) + self.W2(values)))


        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

# Decoder of Attention Model:

In [108]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)

        x = self.embedding(x)

        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        output, state = self.gru(x)

        
        output = tf.reshape(output, (-1, output.shape[2]))

        x = self.fc(output)

        return x, state, attention_weights

In [109]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)


# Define Optimizer and Loss Function:

In [110]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

# Define checkpoint to store the Model:

In [111]:
checkpoint_dir = '/kaggle/working/training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

# Training the Model:

In [112]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        dec_input = tf.expand_dims([hin_word_to_ind['<start>']] * BATCH_SIZE, 1)

        for t in range(1, targ.shape[1]):
           
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [127]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,batch,batch_loss.numpy()))

    if (epoch + 1) % 10 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 1.0041
Epoch 1 Batch 100 Loss 1.0714
Epoch 1 Batch 200 Loss 1.0046
Epoch 1 Batch 300 Loss 1.0268
Epoch 1 Batch 400 Loss 1.0905
Epoch 1 Batch 500 Loss 1.1144
Epoch 1 Batch 600 Loss 1.1082
Epoch 1 Loss 1.0768
Time taken for 1 epoch 334.2789969444275 sec

Epoch 2 Batch 0 Loss 0.8810
Epoch 2 Batch 100 Loss 0.8535
Epoch 2 Batch 200 Loss 0.9281
Epoch 2 Batch 300 Loss 0.9610
Epoch 2 Batch 400 Loss 1.0462
Epoch 2 Batch 500 Loss 0.8626
Epoch 2 Batch 600 Loss 0.9808
Epoch 2 Loss 0.9638
Time taken for 1 epoch 333.14415216445923 sec

Epoch 3 Batch 0 Loss 0.8228
Epoch 3 Batch 100 Loss 0.9114
Epoch 3 Batch 200 Loss 0.9229
Epoch 3 Batch 300 Loss 0.9736
Epoch 3 Batch 400 Loss 0.7572
Epoch 3 Batch 500 Loss 0.8735
Epoch 3 Batch 600 Loss 0.9038
Epoch 3 Loss 0.8660
Time taken for 1 epoch 331.37976360321045 sec

Epoch 4 Batch 0 Loss 0.7407
Epoch 4 Batch 100 Loss 0.7864
Epoch 4 Batch 200 Loss 0.7146
Epoch 4 Batch 300 Loss 0.7955
Epoch 4 Batch 400 Loss 0.7588
Epoch 4 Batch 500 Loss 0.816

# Prediction Of the Model:

In [149]:
hin_ind_to_word={}

for i in hin_word_to_ind:
    hin_ind_to_word[hin_word_to_ind[i]]=i
    
en_ind_to_word={}

for i in en_word_to_ind:
    en_ind_to_word[en_word_to_ind[i]]=i

In [71]:
def preprocess_sentence(sentence):
    x=sentence.lower()
    x=''.join(ch for ch in x if ch not in sc)
    x=''.join([i for i in x if not i.isdigit()])
    x='<start> '+x+' <end>'
    return x

In [153]:
def evaluate(sentence):
    attention_plot = np.zeros((20, 20))

    sentence = preprocess_sentence(sentence)
   
    inputs = [en_word_to_ind[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=20,padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = ''

    hidden = [tf.zeros((1, 512))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([hin_word_to_ind['<start>']], 0)

    for t in range(20):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                         dec_hidden,
                                                         enc_out)
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        

        if hin_ind_to_word[predicted_id] == '<end>':
            return result, sentence, attention_plot
        result += hin_ind_to_word[predicted_id] + ' '

        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [160]:
for i in range(5):
    k=int(input())
    sentence=''
    for j in range(1,len(x_val[k])-1):
        if  x_val[k][j+1]==0:
            continue
        sentence+=en_ind_to_word[x_val[k][j]]+' '
    
    pred,x,atten_plot=evaluate(sentence.strip())
    actual=''
    for j in range(1,len(y_val[k])-1):
        if  x_val[k][j+1]==0:
            continue
        
        actual+=' '+hin_ind_to_word[y_val[k][j]]
    x=' '.join([j for j in x.split()[1:-1]])       
    print("english sentence---> "+x)
    print('\n')
    print('predicted sentence--->'+pred)
    print('\n')
    print('actual sentence-->'+actual)
    print('\n')
    print('--------------------------------------')

7
english sentence---> peach is great for and dries up very well


predicted sentence--->आडू चेहरे के लिए श्रेष्‍ठ भलीभाँति सूखता है । 


actual sentence--> जरदालू चेहरे के लिए श्रेष्‍ठ तथा भलीभाँति सूखता है


--------------------------------------
18
english sentence---> on being hit by heatstroke feeding onion will get benefits


predicted sentence--->जले व्यक्‍ति को नष्‍ट करने से रोगी को लाभ पहुँचाती है । 


actual sentence--> लू लगने पर प्याज का सेवन करायें तो लाभ मिलेगा


--------------------------------------
278
english sentence---> penis is an external genital organ of male of some of the vertebrate and invertebrate both


predicted sentence--->शिश्न Penis कशेरुकी और उसके पीछे दोनों से संबंधित कऋ से संबंधित कऋ से संबंधित कऋ से संबंधित कुछ खास 


actual sentence--> शिश्न Penis कशेरुकी और अकशेरुकी दोनो प्रकार के कुछ नर जीवों का एक बाह्य यौन अंग


--------------------------------------
897
english sentence---> hitler had overrun almost the whole of western europe and italy had joi