# **Importing necessary libraries**

In [None]:
import nltk
nltk.download('book')

In [None]:
# Standard library
import operator
import re
import string

# Third-party
import gensim
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.tokenize import word_tokenize

# Keras is imported through the public `tensorflow.keras` namespace only.
# Private module paths (`keras.layers.recurrent`, `keras.layers.embeddings`,
# `keras.utils.vis_utils`, `keras.utils.data_utils`) are not part of the public
# API and raise ModuleNotFoundError on Keras releases that no longer ship them.
# Every name the original import cell bound is still bound here.
from tensorflow.keras.layers import (
    LSTM,
    Activation,
    Bidirectional,
    Concatenate,
    Dense,
    Dropout,
    Embedding,
    Input,
    TimeDistributed,
)
from tensorflow.keras.models import Model, Sequential, load_model, model_from_json
from tensorflow.keras.utils import get_file, plot_model

ModuleNotFoundError: No module named 'keras.layers.recurrent'

# **ATTENTION LAYER**

In [None]:
import tensorflow as tf
import os
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras import backend as K


class AttentionLayer(Layer):
    """
    Additive (Bahdanau-style) attention layer (https://arxiv.org/pdf/1409.0473.pdf).

    The layer is called with ``[encoder_output_sequence, decoder_output_sequence]``
    and returns two tensors: the context vectors (attention-weighted sums of the
    encoder outputs) and the attention weights themselves.

    Three trainable weights are created in ``build``:
    ``W_a`` is applied to the encoder outputs, ``U_a`` to the decoder output of
    the current step, and ``V_a`` projects the tanh-activated sum to one energy
    value per encoder position.
    """

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to the base ``Layer``."""
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the attention weights.

        ``input_shape`` must be a list ``[encoder_shape, decoder_shape]``;
        index 2 of each shape is used as that sequence's hidden size.
        """
        assert isinstance(input_shape, list)
        # Create a trainable weight variable for this layer.

        # W_a: (encoder_hidden, encoder_hidden)
        self.W_a = self.add_weight(name='W_a',
                                   shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # U_a: (decoder_hidden, encoder_hidden)
        self.U_a = self.add_weight(name='U_a',
                                   shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])),
                                   initializer='uniform',
                                   trainable=True)
        # V_a: (encoder_hidden, 1)
        self.V_a = self.add_weight(name='V_a',
                                   shape=tf.TensorShape((input_shape[0][2], 1)),
                                   initializer='uniform',
                                   trainable=True)

        super(AttentionLayer, self).build(input_shape)  # Be sure to call this at the end

    def call(self, inputs, verbose=False):
        """
        Compute context vectors and attention weights.

        inputs: [encoder_output_sequence, decoder_output_sequence]
        verbose: when True, print the shape of each intermediate tensor.

        Returns the pair ``(c_outputs, e_outputs)``: context vectors and
        attention weights, both produced by ``K.rnn`` over the decoder sequence.
        """
        # NOTE(review): this is an exact type check, so a tuple of tensors is
        # rejected; isinstance(inputs, (list, tuple)) would be the usual form.
        assert type(inputs) == list
        encoder_out_seq, decoder_out_seq = inputs
        if verbose:
            print('encoder_out_seq>', encoder_out_seq.shape)
            print('decoder_out_seq>', decoder_out_seq.shape)

        def energy_step(inputs, states):
            """Step function: attention weights over all encoder positions for one decoder step.

            ``inputs`` is what K.rnn hands in for the current step (presumably
            the decoder output of shape (batch_size, decoder_hidden) - confirm).
            ``states`` is only type-checked; its contents are not used.
            """

            assert_msg = "States must be a list. However states {} is of type {}".format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            """ Some parameters required for shaping tensors"""
            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
            # NOTE(review): de_hidden is assigned but never read below.
            de_hidden = inputs.shape[-1]

            """ Computing S.Wa where S=[s0, s1, ..., si]"""
            # <= batch_size*en_seq_len, latent_dim
            reshaped_enc_outputs = K.reshape(encoder_out_seq, (-1, en_hidden))
            # <= batch_size, en_seq_len, latent_dim (reshaped back to 3-D after the dot product)
            W_a_dot_s = K.reshape(K.dot(reshaped_enc_outputs, self.W_a), (-1, en_seq_len, en_hidden))
            if verbose:
                print('wa.s>',W_a_dot_s.shape)

            """ Computing hj.Ua """
            # The inserted axis 1 lets the sum below broadcast over encoder positions.
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)  # <= batch_size, 1, latent_dim
            if verbose:
                print('Ua.h>',U_a_dot_h.shape)

            """ tanh(S.Wa + hj.Ua) """
            # <= batch_size*en_seq_len, latent_dim
            reshaped_Ws_plus_Uh = K.tanh(K.reshape(W_a_dot_s + U_a_dot_h, (-1, en_hidden)))
            if verbose:
                print('Ws+Uh>', reshaped_Ws_plus_Uh.shape)

            """ softmax(va.tanh(S.Wa + hj.Ua)) """
            # <= batch_size, en_seq_len
            e_i = K.reshape(K.dot(reshaped_Ws_plus_Uh, self.V_a), (-1, en_seq_len))
            # <= batch_size, en_seq_len
            e_i = K.softmax(e_i)

            if verbose:
                print('ei>', e_i.shape)

            # The weights are returned both as the step output and as the new state.
            return e_i, [e_i]

        def context_step(inputs, states):
            """Step function: weight the encoder outputs by ``inputs`` and sum over encoder positions."""
            # <= batch_size, hidden_size (hidden size of the encoder outputs)
            c_i = K.sum(encoder_out_seq * K.expand_dims(inputs, -1), axis=1)
            if verbose:
                print('ci>', c_i.shape)
            return c_i, [c_i]

        def create_inital_state(inputs, hidden_size):
            """Build an all-zero (batch_size, hidden_size) tensor derived from ``inputs``."""
            # We are not using initial states, but need to pass something to the K.rnn function
            fake_state = K.zeros_like(inputs)  # <= (batch_size, enc_seq_len, latent_dim)
            fake_state = K.sum(fake_state, axis=[1, 2])  # <= (batch_size)
            fake_state = K.expand_dims(fake_state)  # <= (batch_size, 1)
            fake_state = K.tile(fake_state, [1, hidden_size])  # <= (batch_size, hidden_size)
            return fake_state

        fake_state_c = create_inital_state(encoder_out_seq, encoder_out_seq.shape[-1])  # <= (batch_size, latent_dim)
        fake_state_e = create_inital_state(encoder_out_seq, encoder_out_seq.shape[1])  # <= (batch_size, enc_seq_len)

        """ Computing energy outputs """
        # e_outputs => (batch_size, de_seq_len, en_seq_len)
        last_out, e_outputs, _ = K.rnn(
            energy_step, decoder_out_seq, [fake_state_e],
        )

        """ Computing context vectors """
        # The attention weights of each decoder step are fed to context_step in turn.
        last_out, c_outputs, _ = K.rnn(
            context_step, e_outputs, [fake_state_c],
        )

        return c_outputs, e_outputs

    def compute_output_shape(self, input_shape):
        """Return the shapes reported for the two outputs (context vectors, attention weights).

        NOTE(review): the context shape uses the decoder hidden size
        (``input_shape[1][2]``), while ``context_step`` sums encoder outputs, so
        the two agree only when encoder and decoder hidden sizes are equal - confirm.
        """
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])),
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1]))
        ]

# **Preparing Dataset**

In [None]:
# English side of the te-en parallel corpus, fetched through Keras' get_file.
encoder_url = 'https://raw.githubusercontent.com/joshua-decoder/indian-parallel-corpora/master/te-en/training.te-en.en'
encoder_path = get_file('encoder_data.txt', origin=encoder_url)

# Decode explicitly so the result does not depend on the platform's default
# encoding (the corpus is assumed to be UTF-8 - confirm against the source repo).
with open(encoder_path, encoding='utf-8') as encoder_file_:
    total_encoder_docs = encoder_file_.readlines()
# Spot-check one line of the downloaded corpus.
print(total_encoder_docs[1000])

In [None]:
# Telugu side of the te-en parallel corpus, fetched through Keras' get_file.
decoder_url = 'https://raw.githubusercontent.com/joshua-decoder/indian-parallel-corpora/master/te-en/training.te-en.te'
decoder_path = get_file('decoder_data.txt', origin=decoder_url)

# Telugu text is non-ASCII, so decode explicitly instead of relying on the
# platform's default encoding (the corpus is assumed to be UTF-8 - confirm).
with open(decoder_path, encoding='utf-8') as decoder_file_:
    total_decoder_docs = decoder_file_.readlines()
# Spot-check one line of the downloaded corpus.
print(total_decoder_docs[1000])

In [None]:
# Working copies of the parallel corpus. Every sentence pair is kept; the
# pairing is positional, driven by the length of the English side.
encoder_docs = []
decoder_docs = []

for line_no, english_line in enumerate(total_encoder_docs):
    encoder_docs.append(english_line)
    decoder_docs.append(total_decoder_docs[line_no])

NameError: name 'total_encoder_docs' is not defined

In [None]:
# Ordered (pattern, replacement) rewrites applied by clean_text.
# Order matters: the specific forms ("won't", "can't") must run before the
# generic "n't" rule, and "n't" before the "n'" -> "ng" rule.
_CONTRACTION_REWRITES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"it's", "it is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"how's", "how is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"n't", " not"),
    (r"n'", "ng"),
    (r"'bout", "about"),
    (r"'til", "until"),
)

# Punctuation characters that are deleted outright (apostrophes are not in the set).
_PUNCTUATION_PATTERN = r"[-()\"#/@;:<>{}`+=~|.!?,]"


def clean_text(text):
    """Normalise a sentence before tokenisation.

    The text is lower-cased, common English contractions are expanded
    (e.g. "i'm" -> "i am", "won't" -> "will not", "goin'" -> "going"),
    and the punctuation characters in ``_PUNCTUATION_PATTERN`` are removed.
    Whitespace is left untouched.

    Args:
        text: the raw sentence.

    Returns:
        The normalised sentence as a new string.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_REWRITES:
        text = re.sub(pattern, replacement, text)
    return re.sub(_PUNCTUATION_PATTERN, "", text)

In [None]:
# Tokenised corpus: encoder input, decoder input (prefixed with the start
# marker) and decoder target (suffixed with the end marker).
cleaned_encoder_docs = []
cleaned_decoder_input_docs = []
cleaned_decoder_output_docs = []
start_of_sentence = "<s>"
end_of_sentence = "<e>"
unknown_word = "<unk>"

for pair_no, raw_source in enumerate(encoder_docs):
    source_tokens = clean_text(raw_source).split()
    target_text = clean_text(decoder_docs[pair_no])

    decoder_input_tokens = " ".join((start_of_sentence, target_text)).split()
    decoder_output_tokens = " ".join((target_text, end_of_sentence)).split()

    # Drop the pair when any of its three token sequences exceeds 50 tokens.
    longest = max(len(source_tokens), len(decoder_input_tokens), len(decoder_output_tokens))
    if longest > 50:
        continue

    cleaned_encoder_docs.append(source_tokens)
    cleaned_decoder_input_docs.append(decoder_input_tokens)
    cleaned_decoder_output_docs.append(decoder_output_tokens)

In [None]:
# Number of sentence pairs kept after cleaning, one count per line.
print(
    len(cleaned_encoder_docs),
    len(cleaned_decoder_input_docs),
    len(cleaned_decoder_output_docs),
    sep="\n",
)

In [None]:
# Append one artificial sentence per list so that the unknown-word marker
# occurs in the encoder docs and in both decoder doc lists.
cleaned_encoder_docs.append([unknown_word])
cleaned_decoder_input_docs.append([start_of_sentence, unknown_word])
cleaned_decoder_output_docs.append([unknown_word, end_of_sentence])

In [None]:
# Sentence counts again, now including the artificial unknown-word sentence.
print(
    len(cleaned_encoder_docs),
    len(cleaned_decoder_input_docs),
    len(cleaned_decoder_output_docs),
    sep="\n",
)

In [None]:
# Word2Vec embeddings for the source language, trained on the cleaned token lists.
# NOTE: `size=` / `iter=` are the gensim 3.x keyword names; gensim 4 renamed
# them to `vector_size=` / `epochs=`.
encoder_word_model = gensim.models.Word2Vec(cleaned_encoder_docs, size=100, min_count=1, window=2, iter=100)
# `wv.vectors` is the embedding matrix; `wv.syn0` was only a deprecated alias
# for it and no longer exists in gensim 4.
encoder_pretrained_weights = encoder_word_model.wv.vectors
encoder_vocab_size, encoder_features_size = encoder_pretrained_weights.shape

# Target-language embeddings: trained on decoder inputs and outputs together so
# that both the start and the end marker are part of one shared vocabulary.
decoder_word_model = gensim.models.Word2Vec(cleaned_decoder_input_docs + cleaned_decoder_output_docs, size=100, min_count=1, window=2, iter=100)
decoder_pretrained_weights = decoder_word_model.wv.vectors
decoder_vocab_size, decoder_features_size = decoder_pretrained_weights.shape

In [None]:
# Index of the unknown-word marker in each vocabulary, then both vocabulary sizes.
for word_model in (encoder_word_model, decoder_word_model):
    print(word_model.wv.vocab[unknown_word].index)
for word_model in (encoder_word_model, decoder_word_model):
    print(len(word_model.wv.vocab))

In [None]:
# One zero-initialised row of 50 int32 word indices per sentence
# (50 = maximum sentence length for both source and target language).
encoder_data_x = np.zeros((len(cleaned_encoder_docs), 50), dtype=np.int32)
decoder_input_data_x = np.zeros((len(cleaned_decoder_input_docs), 50), dtype=np.int32)
decoder_output_data_x = np.zeros((len(cleaned_decoder_output_docs), 50), dtype=np.int32)

# Fill each matrix with vocabulary indices; positions past the end of a
# sentence keep their initial value 0.
for index_matrix, docs, word_model in (
    (encoder_data_x, cleaned_encoder_docs, encoder_word_model),
    (decoder_input_data_x, cleaned_decoder_input_docs, decoder_word_model),
    (decoder_output_data_x, cleaned_decoder_output_docs, decoder_word_model),
):
    for row, sentence in enumerate(docs):
        for col, word in enumerate(sentence):
            index_matrix[row, col] = word_model.wv.vocab[word].index

In [None]:
# Final training arrays.
encoder_input_data, decoder_input_data, decoder_output_data = (
    np.asarray(matrix)
    for matrix in (encoder_data_x, decoder_input_data_x, decoder_output_data_x)
)

# Show the first sentence of each list next to its encoded row.
for tokens, encoded_row in (
    (cleaned_encoder_docs[0], encoder_input_data[0]),
    (cleaned_decoder_input_docs[0], decoder_input_data[0]),
    (cleaned_decoder_output_docs[0], decoder_output_data[0]),
):
    print(tokens)
    print(encoded_row)

In [None]:
# LSTM units per direction in the bidirectional encoder; the decoder LSTM is
# built with twice this many units.
hidden_features=256

In [None]:
# Training graph of the encoder-decoder model. The AttentionLayer class is not
# used in this graph.
# NOTE(review): later cells fetch layers by their auto-generated names
# ('embedding', 'bidirectional', 'embedding_1', 'lstm_1', 'dense'); those names
# presumably depend on the creation order below - do not reorder without checking.

# Encoder: 50 token ids -> embeddings initialised from the encoder Word2Vec
# weights -> bidirectional LSTM.
encoder_inputs=Input(shape=(50, ))
encoder_embedding_layer = Embedding(input_dim = encoder_vocab_size,output_dim = encoder_features_size,weights=[encoder_pretrained_weights])
encoder_embeddings=encoder_embedding_layer(encoder_inputs)
encoder_LSTM = Bidirectional(LSTM(units=hidden_features, return_state=True))
# The call yields the output plus the h/c states of the forward and backward pass.
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_LSTM(encoder_embeddings)
# Forward and backward states are concatenated and used as the decoder's initial state.
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])


# Decoder: 50 token ids -> embeddings initialised from the decoder Word2Vec
# weights -> LSTM with hidden_features*2 units, started from the encoder state.
decoder_inputs = Input(shape=(50, ))
decoder_embedding_layer = Embedding(input_dim = decoder_vocab_size,output_dim = decoder_features_size,weights=[decoder_pretrained_weights])
decoder_embeddings=decoder_embedding_layer(decoder_inputs)
decoder_LSTM = LSTM(units=hidden_features*2, return_state=True, return_sequences=True)
decoder_outputs, _h, _c = decoder_LSTM(decoder_embeddings, initial_state=[state_h, state_c])


# Softmax over the decoder vocabulary, applied to the decoder output sequence.
outputs = Dense(decoder_vocab_size, activation='softmax')(decoder_outputs)


model = Model([encoder_inputs, decoder_inputs], outputs)

In [None]:
# Replace `model` with an architecture and weights saved by an earlier run.
# The context manager closes the file even if reading fails.
# NOTE(review): the JSON file is suffixed "(4)" and the weights file "(6)" -
# confirm that the two files belong to the same model.
with open('/content/model_4_ex (4).json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
# load weights into new model
model.load_weights("/content/model_4_ex (6).h5")
print("Loaded model from disk")

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

# Targets are integer word indices, hence the sparse categorical loss.
model.compile(optimizer='rmsprop', loss ='sparse_categorical_crossentropy', metrics = ['accuracy'])

# Write a diagram of the model graph to model_plot.png.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
10

In [None]:
# Train on (encoder input, decoder input) -> decoder target.
history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_output_data,
    batch_size=64,
    epochs=5,
)

# Persist the architecture as JSON and the weights as HDF5.
model_json = model.to_json()
with open("model_4_ex_bi.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("model_4_ex_bi.h5")

In [None]:
# Inference-time models, assembled from layers of `model` that are looked up by
# name, so the weights are shared with `model`.

# Encoder model: padded source sentence (50 ids) -> [state_h, state_c].
encoder_inputs=Input(shape=(50, ))
encoder_embedding_layer = model.get_layer('embedding')
encoder_embeddings=encoder_embedding_layer(encoder_inputs)
encoder_LSTM=model.get_layer('bidirectional')
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_LSTM(encoder_embeddings)
# Same forward/backward concatenation as in the training graph.
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

encoder_model = Model(encoder_inputs, encoder_states)


# Decoder state inputs are hidden_features*2 wide, i.e. the width of the
# concatenated encoder states.
decoder_state_input_h = Input(shape=(hidden_features*2,))
decoder_state_input_c = Input(shape=(hidden_features*2,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# One-step decoder: a single token id plus the previous [h, c] goes in; the
# dense layer's output for that step and the updated [h, c] come out.
single_input=Input(shape=(1,))
decoder_embedding_layer=model.get_layer('embedding_1')
embeddings=decoder_embedding_layer(single_input)
decoder_LSTM=model.get_layer('lstm_1')
decoder_outputs, _h, _c = decoder_LSTM(embeddings, initial_state=decoder_states_inputs)
decoder_states = [_h, _c]
decoder_dense = model.get_layer('dense')
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([single_input] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [None]:
# Rebuild the inference models from the layer objects currently bound to
# encoder_inputs / state_h / state_c / decoder_embedding_layer / decoder_LSTM.
encoder_states = [state_h, state_c]
encoder_model = Model(encoder_inputs, encoder_states)


# Size the state inputs from the decoder LSTM itself. The previous hard-coded
# `hidden_features` did not match a decoder built with hidden_features*2 units
# (the width of the concatenated bidirectional encoder states).
decoder_state_size = decoder_LSTM.units
decoder_state_input_h = Input(shape=(decoder_state_size,))
decoder_state_input_c = Input(shape=(decoder_state_size,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# One-step decoder: a single token id plus the previous [h, c] goes in; the
# dense layer's output for that step and the updated [h, c] come out.
single_input = Input(shape=(1,))
embeddings = decoder_embedding_layer(single_input)
decoder_outputs, _h, _c = decoder_LSTM(embeddings, initial_state=decoder_states_inputs)
decoder_states = [_h, _c]
decoder_dense = model.get_layer('dense')
decoder_outputs = decoder_dense(decoder_outputs)

decoder_model = Model([single_input] + decoder_states_inputs, [decoder_outputs] + decoder_states)

In [None]:
# summary() prints the table itself and returns None; print() would add a stray "None".
decoder_model.summary()

In [None]:
# summary() prints the table itself and returns None; print() would add a stray "None".
encoder_model.summary()

In [None]:
def translate(english_input):
    """Translate an English sentence to Telugu with greedy decoding.

    The sentence is cleaned with ``clean_text``, split on whitespace, mapped to
    encoder vocabulary indices (out-of-vocabulary words map to the unknown-word
    marker) and encoded with ``encoder_model``. ``decoder_model`` is then run
    one token at a time, always taking the most probable word, until the
    end-of-sentence marker is predicted or 50 steps have been made.

    Args:
        english_input: the raw English sentence. Tokens beyond the 50th are
            ignored because the encoder input holds 50 positions.

    Returns:
        The predicted words, each preceded by a single space (so a non-empty
        result starts with a space); an empty string if nothing was predicted.
    """
    encoder_vocab = encoder_word_model.wv.vocab
    decoder_vocab = decoder_word_model.wv.vocab

    # Look the special indices up instead of hard-coding them, so the function
    # keeps working when the vocabulary (and therefore its ordering) changes.
    unknown_index = encoder_vocab[unknown_word].index
    start_index = decoder_vocab[start_of_sentence].index
    end_index = decoder_vocab[end_of_sentence].index

    x_input = np.zeros((1, 50), dtype=np.int32)
    max_len = x_input.shape[1]

    # Truncate instead of raising IndexError on sentences longer than the input.
    tokens = clean_text(english_input).split()[:max_len]
    for position, word in enumerate(tokens):
        if word in encoder_vocab:
            x_input[0, position] = encoder_vocab[word].index
        else:
            x_input[0, position] = unknown_index

    states = encoder_model.predict(x_input)

    # Decoding starts from the start-of-sentence marker.
    telugu_seq = np.zeros((1, 1), dtype=np.int32)
    telugu_seq[0, 0] = start_index
    predicted_words = []

    for _ in range(max_len):
        output_tokens, h, c = decoder_model.predict([telugu_seq] + states)
        indx = int(np.argmax(output_tokens[0, 0, :]))

        if indx == end_index:
            break

        # Index 0 is the fill value of the training matrices, so it is not emitted.
        if indx > 0:
            predicted_words.append(decoder_word_model.wv.index2word[indx])

        # Feed the prediction and the updated state back in for the next step.
        telugu_seq = np.zeros((1, 1), dtype=np.int32)
        telugu_seq[0, 0] = indx
        states = [h, c]

    # Same format as before (leading space per word), built without repeated
    # string concatenation.
    return "".join(" " + word for word in predicted_words)


In [None]:
# Translate up to 11 corpus sentences, starting at index 200 and taking every
# fourth one.
for sample_no in range(min(len(encoder_docs), 11)):
    j = 200 + 4 * sample_no
    source_sentence = encoder_docs[j]
    print("ENGLISH -------------", source_sentence)
    print("TELUGU---------------", translate(source_sentence))
    print("")

# One hand-written sentence.
text = "After freedom, the Indian government is taken Police action on Nizam samsthanam after this action Hyderabad is arranged as state of these places."
print("ENGLISH -------------", text)
print("TELUGU---------------", translate(text))


In [None]:
# Translate one hand-written English sentence and print it next to the model output.
text="thousands of villages in krishna,guntur sunk in the flow of krishna river."
print("ENGLISH -------------",text)
print("TELUGU---------------",translate(text))


In [None]:
# Translate the first few training sentences. Sentences longer than 50 tokens
# are skipped entirely; the loop stops after printing the first sentence whose
# index is above 10.
for line_no, english_sentence in enumerate(total_training_en):
    token_count = len(clean_text(english_sentence).split())
    if token_count <= 50:
        print("ENGLISH -------------", english_sentence)
        print("TELUGU---------------", translate(english_sentence))
        print("")
        if line_no > 10:
            break

In [None]:
from nltk.translate.bleu_score import sentence_bleu

# Average sentence-level BLEU of the model's translations.
# NOTE(review): total_training_en / total_training_te_docs are not defined in
# the cells visible here - confirm where they come from.
predicted_docs = []
actual_docs = []

for i, english_sentence in enumerate(total_training_en):
    # Stop once there is no reference left, so that predicted_docs[k] and
    # actual_docs[k] always belong to the same sentence pair.
    if i >= len(total_training_te_docs):
        break
    # Same length limit as used when building the training data.
    if len(clean_text(english_sentence).split()) > 50:
        continue

    predicted_docs.append(translate(english_sentence).split())
    actual_docs.append(clean_text(total_training_te_docs[i]).split())

# Score every hypothesis against its own reference only. sentence_bleu expects
# a list of references, so the single reference is wrapped in a list; passing
# all of actual_docs would treat every corpus sentence as a valid reference.
score = 0
for reference, hypothesis in zip(actual_docs, predicted_docs):
    score = score + sentence_bleu([reference], hypothesis)

# Guard against an empty evaluation set.
average_bleu = score / len(predicted_docs) if predicted_docs else 0.0
print(average_bleu)

In [None]:
# Report the accumulated BLEU score divided by the number of translated
# sentences; the label marks this as the model without the attention layer.
print("BLEU SCORE FOR LSTM MODEL WITHOUT ATTENTION   ------>  ",score/len(predicted_docs))