In [62]:
import os
import string
import numpy as np
import pandas as pd
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re
import logging
import tensorflow as tf
# tf.enable_eager_execution()
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
logging.getLogger('tensorflow').setLevel(logging.FATAL)
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import unicodedata
import io
import time
import warnings
import sys
        
PATH = "../data_text/Hindi_English_Truncated_Corpus.csv"


In [63]:
# df = pd.read_csv(PATH,sep='\t', encoding='utf-8', names = ["english_sentence", "hindi_sentence"] )

# df.to_csv("../data_text/pmindia.v1.hi-en.csv", index=False)

In [64]:
df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,An advance is placed with the Medical Superint...,अग्रिम धन राशि इन अस्पतालों को चिकित्सा निरीक्...
1,Since the DoHFW provides funds to the hospital...,चूंकि स्वास्थ्य एवं परिवार कल्याण विभाग अस्पता...
2,"RAN functions can, therefore, be vested in DoHFW.",इस तरह आरएएन का कामकाज स्वास्थ्य एवं परिवार कल...
3,Managing Committee of RAN Society will meet to...,"आरएएन, सोसायटी की प्रबंध समिति सोसायटी पंजीकरण..."
4,"In addition to this, Health Minister’s Cancer ...",इसके अलावा स्वास्थ्य मंत्री के कैंसर रोगी निधि...


In [65]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.rstrip().strip()
    return w


def hindi_preprocess_sentence(w):
    w = w.rstrip().strip()
    return w
def create_dataset(path=PATH):
    lines=pd.read_csv(path, encoding='utf-8')
    lines=lines.dropna()
    lines = lines[lines['source']=='ted']
    en = []
    hd = []
    for i, j in zip(lines['english_sentence'], lines['hindi_sentence']):
        en_1 = [preprocess_sentence(w) for w in i.split(' ')]
        en_1.append('<end>')
        en_1.insert(0, '<start>')
        hd_1 = [hindi_preprocess_sentence(w) for w in j.split(' ')]
        hd_1.append('<end>')
        hd_1.insert(0, '<start>')
        en.append(en_1)
        hd.append(hd_1)
    return hd, en

def max_length(tensor):
    return max(len(t) for t in tensor)

def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor,padding='post')
    return tensor, lang_tokenizer

def load_dataset(path=PATH):
    targ_lang, inp_lang = create_dataset(path)
    input_tensor, inp_lang_tokenizer = tokenize(inp_lang)
    target_tensor, targ_lang_tokenizer = tokenize(targ_lang)
    return input_tensor, target_tensor, inp_lang_tokenizer, targ_lang_tokenizer


In [66]:
input_tensor, target_tensor, inp_lang, targ_lang = load_dataset(PATH)

In [67]:
input_tensor.shape, target_tensor.shape

((39881, 23), (39881, 33))

In [68]:
max_length_targ, max_length_inp

(33, 23)

In [69]:
max_length_targ, max_length_inp = max_length(target_tensor), max_length(input_tensor)

input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

31904 31904 7977 7977


In [70]:
def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print ("%d ----> %s" % (t, lang.index_word[t]))
    
print ("Input Language; index to word mapping")
convert(inp_lang, input_tensor_train[0])
print ()
print ("Target Language; index to word mapping")
convert(targ_lang, target_tensor_train[0])


Input Language; index to word mapping
1 ----> <start>
9 ----> that
76 ----> really
2311 ----> prefer
3 ----> the
264 ----> end
6 ----> of
11363 ----> texts ,
2 ----> <end>

Target Language; index to word mapping
1 ----> <start>
20 ----> जो
1337 ----> वाक्य
51 ----> या
2695 ----> पृष्ठ
4 ----> के
354 ----> अंत
5 ----> में
409 ----> आते
32 ----> हैं,
2 ----> <end>


In [71]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 128
units = 256
vocab_inp_size = len(inp_lang.word_index)+1
vocab_tar_size = len(targ_lang.word_index)+1

dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)


class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)



class BahdanauAttention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    hidden_with_time_axis = tf.expand_dims(query, 1)
    score = self.V(tf.nn.tanh(
        self.W1(values) + self.W2(hidden_with_time_axis)))
    attention_weights = tf.nn.softmax(score, axis=1)
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)
    return context_vector, attention_weights


class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)
    self.attention = BahdanauAttention(self.dec_units)

  def call(self, x, hidden, enc_output):
    context_vector, attention_weights = self.attention(hidden, enc_output)
    x = self.embedding(x)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    output, state = self.gru(x)
    output = tf.reshape(output, (-1, output.shape[2]))
    x = self.fc(output)
    return x, state, attention_weights

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)
  mask = tf.cast(mask, dtype=loss_.dtype)
#   print(type(mask))
  loss_ *= mask
  return tf.reduce_mean(loss_)
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)


In [72]:
@tf.function
def train_step(inp, targ, enc_hidden):
  loss = 0
  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']] * BATCH_SIZE, 1)
    # Teacher forcing
    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))      
  return batch_loss

In [73]:
EPOCHS = 50

for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0
  for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss
    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                     batch,
                                                     batch_loss.numpy()))
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 2.7485
Epoch 1 Batch 100 Loss 2.0161
Epoch 1 Batch 200 Loss 1.8809
Epoch 1 Batch 300 Loss 1.8865
Epoch 1 Loss 1.9453
Time taken for 1 epoch 401.7105362415314 sec

Epoch 2 Batch 0 Loss 1.8744
Epoch 2 Batch 100 Loss 1.6609
Epoch 2 Batch 200 Loss 1.7167
Epoch 2 Batch 300 Loss 1.7101
Epoch 2 Batch 400 Loss 1.6359
Epoch 2 Loss 1.7437
Time taken for 1 epoch 367.8652675151825 sec

Epoch 3 Batch 0 Loss 1.6424
Epoch 3 Batch 100 Loss 1.8386
Epoch 3 Batch 200 Loss 1.6650
Epoch 3 Batch 300 Loss 1.6430
Epoch 3 Batch 400 Loss 1.4835
Epoch 3 Loss 1.6458
Time taken for 1 epoch 366.37940526008606 sec

Epoch 4 Batch 0 Loss 1.5123
Epoch 4 Batch 100 Loss 1.6303
Epoch 4 Batch 200 Loss 1.6166
Epoch 4 Batch 300 Loss 1.5113
Epoch 4 Batch 400 Loss 1.5433
Epoch 4 Loss 1.5537
Time taken for 1 epoch 366.91216921806335 sec

Epoch 5 Batch 0 Loss 1.5321
Epoch 5 Batch 100 Loss 1.4973
Epoch 5 Batch 200 Loss 1.4603
Epoch 5 Batch 300 Loss 1.4860
Epoch 5 Batch 400 Loss 1.4598
Epoch 5 Loss 1.4613
Time

Epoch 39 Batch 0 Loss 0.1261
Epoch 39 Batch 100 Loss 0.1371
Epoch 39 Batch 200 Loss 0.1188
Epoch 39 Batch 300 Loss 0.1520
Epoch 39 Batch 400 Loss 0.1213
Epoch 39 Loss 0.1326
Time taken for 1 epoch 339.09317088127136 sec

Epoch 40 Batch 0 Loss 0.1310
Epoch 40 Batch 100 Loss 0.1129
Epoch 40 Batch 200 Loss 0.1310
Epoch 40 Batch 300 Loss 0.1199
Epoch 40 Batch 400 Loss 0.1264
Epoch 40 Loss 0.1228
Time taken for 1 epoch 339.04984307289124 sec

Epoch 41 Batch 0 Loss 0.1144
Epoch 41 Batch 100 Loss 0.0783
Epoch 41 Batch 200 Loss 0.1092
Epoch 41 Batch 300 Loss 0.1276
Epoch 41 Batch 400 Loss 0.1229
Epoch 41 Loss 0.1120
Time taken for 1 epoch 338.9287736415863 sec

Epoch 42 Batch 0 Loss 0.0970
Epoch 42 Batch 100 Loss 0.1093
Epoch 42 Batch 200 Loss 0.0742
Epoch 42 Batch 300 Loss 0.1278
Epoch 42 Batch 400 Loss 0.1246
Epoch 42 Loss 0.1050
Time taken for 1 epoch 339.2525441646576 sec

Epoch 43 Batch 0 Loss 0.1126
Epoch 43 Batch 100 Loss 0.0954
Epoch 43 Batch 200 Loss 0.0862
Epoch 43 Batch 300 Loss 0.1

In [74]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    sentence = preprocess_sentence(sentence)
    inputs = [inp_lang.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                           maxlen=max_length_inp,
                                                           padding='post')
    inputs = tf.convert_to_tensor(inputs)
    result = ''
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word_index['<start>']], 0)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        result += targ_lang.index_word[predicted_id] + ' '
        if targ_lang.index_word[predicted_id] == '<end>':
            return result, sentence
        dec_input = tf.expand_dims([predicted_id], 0)
    return result, sentence
def translate(sentence):
    result, sentence = evaluate(sentence)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))
# restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))


Input: politicians do not have permission to do what needs to be done .
Predicted translation: यह तो बेहतर नहीं होगा अब हमें खाना निकल रहा है कैसे तो नहीं होना चाहिए जा रहा है कैसे तो सही होना नहीं है कैसे दिखते क्या कर सकते। <end> 


In [79]:
df['english_sentence'][5]

'The timeline required for this is one year.'

In [81]:
translate(u'The time required for this is one year.')


Input: the time required for this is one year .
Predicted translation: इस साल की प्रक्रिया से थोडा सा सैन्य उत्सव एक लय पर काम कर रहा है. <end> 
