# Sequence to Sequence Implementation #

In [2]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf

In [4]:
# import zipfile
# with zipfile.ZipFile('/content/ita-eng.zip', 'r') as zip_ref:
#     zip_ref.extractall()

In [5]:
!wget https://www.dropbox.com/s/ddkmtqz01jc024u/glove.6B.100d.txt

--2022-07-02 15:24:24--  https://www.dropbox.com/s/ddkmtqz01jc024u/glove.6B.100d.txt
Resolving www.dropbox.com (www.dropbox.com)... 162.125.9.18, 2620:100:601f:18::a27d:912
Connecting to www.dropbox.com (www.dropbox.com)|162.125.9.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/ddkmtqz01jc024u/glove.6B.100d.txt [following]
--2022-07-02 15:24:24--  https://www.dropbox.com/s/raw/ddkmtqz01jc024u/glove.6B.100d.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc2a2497631126f46f6ed895f5bb.dl.dropboxusercontent.com/cd/0/inline/BoUfTR94AvutO1RuGuX0tHGQ_v1v8M0c0lgDJDXjyRHgKf-ifk414UA52DZnUbQ67C_q2nZDK6QFrwW7DeKG9zWyvaEHSmj4jYgmybs3jYBq4ge7rtHfM0B3xTHxv1e6841tdDfksiwV1dQV5OPctrajZZSQElTsnpjtg6b9JDx7bg/file# [following]
--2022-07-02 15:24:25--  https://uc2a2497631126f46f6ed895f5bb.dl.dropboxusercontent.com/cd/0/inline/BoUfTR94AvutO1RuGuX0tHGQ_v1v8M0c0lgDJDXjyRHgKf-ifk414

## Preprocessing Data ##

In [6]:
with open('../input/seq2seq/ita.txt', 'r', encoding="utf8") as f:
    eng=[]
    ita=[]
    for i in f.readlines():
        eng.append(i.split("\t")[0])
        ita.append(i.split("\t")[1])
data = pd.DataFrame(data=list(zip(eng, ita)), columns=['english','italian'])
print(data.shape)
data.head()

(354238, 2)


Unnamed: 0,english,italian
0,Hi.,Ciao!
1,Hi.,Ciao.
2,Run!,Corri!
3,Run!,Corra!
4,Run!,Correte!


In [7]:
def decontractions(phrase):
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"won\’t", "will not", phrase)
    phrase = re.sub(r"can\’t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)

    phrase = re.sub(r"n\’t", " not", phrase)
    phrase = re.sub(r"\’re", " are", phrase)
    phrase = re.sub(r"\’s", " is", phrase)
    phrase = re.sub(r"\’d", " would", phrase)
    phrase = re.sub(r"\’ll", " will", phrase)
    phrase = re.sub(r"\’t", " not", phrase)
    phrase = re.sub(r"\’ve", " have", phrase)
    phrase = re.sub(r"\’m", " am", phrase)

    return phrase

def preprocess(text):
    text = text.lower()
    text = decontractions(text)
    text = re.sub('[^A-Za-z0-9 ]+', '', text)
    return text

def preprocess_ita(text):
    text = text.lower()
    text = decontractions(text)
    text = re.sub('[$)\?"’.°!;\'€%:,(/]', '', text)
    text = re.sub('\u200b', ' ', text)
    text = re.sub('\xa0', ' ', text)
    text = re.sub('-', ' ', text)
    return text


data['english'] = data['english'].apply(preprocess)
data['italian'] = data['italian'].apply(preprocess_ita)
data.head()

Unnamed: 0,english,italian
0,hi,ciao
1,hi,ciao
2,run,corri
3,run,corra
4,run,correte


In [8]:
import warnings
warnings.filterwarnings('ignore')

data['italian_len'] = data['italian'].str.split().apply(len)
data = data[data['italian_len'] < 20]

data['english_len'] = data['english'].str.split().apply(len)
data = data[data['english_len'] < 20]

data['italian'] = '<start> ' + data['italian'] +' <end>'
data['english_inp'] = '<start> ' + data['english'].astype(str)
data['english_out'] = data['english'].astype(str) + ' <end>'

data = data.drop(['english','italian_len','english_len'], axis=1)
data.head()

Unnamed: 0,italian,english_inp,english_out
0,<start> ciao <end>,<start> hi,hi <end>
1,<start> ciao <end>,<start> hi,hi <end>
2,<start> corri <end>,<start> run,run <end>
3,<start> corra <end>,<start> run,run <end>
4,<start> correte <end>,<start> run,run <end>


### Data Pipeline ###

In [9]:
from sklearn.model_selection import train_test_split
train, validation = train_test_split(data, test_size=0.2)

In [10]:
# Only for the first sentance, adding a token <end> so that we will have <end> in tokenizer

train.iloc[0]['english_inp']= str(train.iloc[0]['english_inp']) + ' <end>'
train.iloc[0]['english_out']= str(train.iloc[0]['english_out']) + ' <end>'

In [11]:
import pickle

with open('../input/tknizer/tknizer_eng.pickle', 'rb') as handle:
    tknizer_eng = pickle.load(handle)
with open('../input/tknizer/tknizer_ita.pickle', 'rb') as handle:
    tknizer_ita = pickle.load(handle)

In [12]:
# tknizer_ita = Tokenizer()
# tknizer_ita.fit_on_texts(train['italian'].values)

# tknizer_eng = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
# tknizer_eng.fit_on_texts(train['english_inp'].values)

In [13]:
# import pickle
# with open('tknizer_eng.pickle', 'wb') as handle:
#     pickle.dump(tknizer_eng, handle, protocol=pickle.HIGHEST_PROTOCOL)
# with open('tknizer_ita.pickle', 'wb') as handle:
#     pickle.dump(tknizer_ita, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
vocab_size_eng=len(tknizer_eng.word_index.keys())
print(vocab_size_eng)
vocab_size_ita=len(tknizer_ita.word_index.keys())
print(vocab_size_ita)

13043
26625


In [15]:
tknizer_eng.word_index['<start>'], tknizer_eng.word_index['<end>']

(1, 10347)

In [16]:
embeddings_index = dict()
f = open('glove.6B.100d.txt', encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.zeros((vocab_size_eng+1, 100))
for word, i in tknizer_eng.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [17]:
class Dataset:
    def __init__(self, data, tknizer_ita, tknizer_eng, max_len):
        self.encoder_inps = data['italian'].values
        self.decoder_inps = data['english_inp'].values
        self.decoder_outs = data['english_out'].values
        self.tknizer_eng = tknizer_eng
        self.tknizer_ita = tknizer_ita
        self.max_len = max_len

    def __getitem__(self, i):
        self.encoder_seq = self.tknizer_ita.texts_to_sequences([self.encoder_inps[i]]) # need to pass list of values
        self.decoder_inp_seq = self.tknizer_eng.texts_to_sequences([self.decoder_inps[i]])
        self.decoder_out_seq = self.tknizer_eng.texts_to_sequences([self.decoder_outs[i]])

        self.encoder_seq = pad_sequences(self.encoder_seq, maxlen=self.max_len, dtype='int32', padding='post')
        self.decoder_inp_seq = pad_sequences(self.decoder_inp_seq, maxlen=self.max_len, dtype='int32', padding='post')
        self.decoder_out_seq = pad_sequences(self.decoder_out_seq, maxlen=self.max_len, dtype='int32', padding='post')
        

        return self.encoder_seq, self.decoder_inp_seq, self.decoder_out_seq

    def __len__(self):
        return len(self.encoder_inps)

    
class Dataloader(tf.keras.utils.Sequence):    
    def __init__(self, dataset, batch_size=1):
        self.dataset = dataset
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.dataset.encoder_inps))


    def __getitem__(self, i):
        start = i * self.batch_size
        stop = (i + 1) * self.batch_size
        data = []
        for j in range(start, stop):
            data.append(self.dataset[j])

        batch = [np.squeeze(np.stack(samples, axis=1), axis=0) for samples in zip(*data)]
        return tuple([[batch[0],batch[1]],batch[2]]) #([italian, english_inp], english_out)

    def __len__(self):
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        self.indexes = np.random.permutation(self.indexes)

In [18]:
train_dataset = Dataset(train, tknizer_ita, tknizer_eng, 20)
test_dataset  = Dataset(validation, tknizer_ita, tknizer_eng, 20)

train_dataloader = Dataloader(train_dataset, batch_size=1024)
test_dataloader = Dataloader(test_dataset, batch_size=1024)

# print(train_dataloader[0][0][0].shape, train_dataloader[0][0][1].shape, train_dataloader[0][1].shape)

## Simple Encoder and Decoder

In [5]:
class Encoder(tf.keras.Model):
    def __init__(self,inp_vocab_size,embedding_size,lstm_size,input_length):
        super().__init__()
        self.inp_vocab_size = inp_vocab_size
        self.enc_embedding_size = embedding_size
        self.enc_lstm_size = lstm_size
        self.enc_input_length = input_length
        
        self.encoder_embedding = Embedding(input_dim=self.inp_vocab_size, output_dim=self.enc_embedding_size, 
                                           input_length=self.enc_input_length, mask_zero=True, name="Embedding_Layer_Encoder")
        self.encoder_lstm = LSTM(self.enc_lstm_size, return_state=True, return_sequences=True, name="Encoder_LSTM")

    def call(self,input_sequence,states):
        
        self.input_embedd = self.encoder_embedding(input_sequence)
        self.encoder_output, self.lstm_state_h,self.lstm_state_c = self.encoder_lstm(self.input_embedd, initial_state = states)
        return self.encoder_output, self.lstm_state_h,self.lstm_state_c #encoder_output, last time step's hidden and cell state
    
    def initialize_states(self,batch_size):
        self.batch_size = batch_size
        self.initial_hidden_state = tf.random.normal(shape=(self.batch_size,self.enc_lstm_size))
        self.initial_cell_state = tf.random.normal(shape=(self.batch_size,self.enc_lstm_size))
        return [self.initial_hidden_state,self.initial_cell_state]


In [7]:
class Decoder(tf.keras.Model):

    def __init__(self,out_vocab_size,embedding_size,lstm_size,input_length, flag=0):

        super().__init__()
        self.out_vocab_size = out_vocab_size
        self.dec_embedding_size = embedding_size
        self.dec_lstm_size = lstm_size
        self.dec_input_length = input_length
        self.flag = flag
        
        if self.flag == 0:
            self.decoder_embedding = Embedding(input_dim=self.out_vocab_size, output_dim=self.dec_embedding_size, 
                                           input_length=self.dec_input_length, mask_zero=True, name="Embedding_Layer_Decoder")
        else:
            self.decoder_embedding = Embedding(input_dim=self.out_vocab_size, output_dim=self.dec_embedding_size, 
                                           input_length=self.dec_input_length, mask_zero=True, weights=[embedding_matrix], trainable=False, name="Embedding_Layer_Decoder")
            
            
        self.decoder_lstm = LSTM(self.dec_lstm_size, return_state=True, return_sequences=True, name="Decoder_LSTM")
        


    def call(self,input_sequence,initial_state):
        
        self.input_embedd = self.decoder_embedding(input_sequence)
        decoder_output, decoder_state_h, decoder_state_c = self.decoder_lstm(self.input_embedd, initial_state=initial_state)
        return decoder_output, decoder_state_h, decoder_state_c
    

In [None]:
class Encoder_Decoder(tf.keras.Model):
    def __init__(self, encoder_inputs_length,decoder_inputs_length, output_vocab_size,batch_size):
        super().__init__()
        self.batch_size = batch_size
        self.encoder = Encoder(inp_vocab_size=vocab_size_ita+1, embedding_size=50, lstm_size=64, input_length=encoder_inputs_length)
        self.decoder = Decoder(out_vocab_size=vocab_size_eng+1, embedding_size=100, lstm_size=64, input_length=decoder_inputs_length, flag = 1)
        self.dense   = Dense(output_vocab_size, activation='softmax')
        
        
    def call(self, data):
        input_seq,output_seq = data[0], data[1] 
        
        # Encoder
        initial_state=self.encoder.initialize_states(self.batch_size)
        encoder_output, encoder_final_h, encoder_final_c = self.encoder(input_seq,initial_state)
        dec_state = [encoder_final_h,encoder_final_c]

        #Decoder
        dec_input = output_seq[:,0] # First word <start>
        dec_input = dec_input[:,np.newaxis]
        decoder_output,dec_h,dec_c = self.decoder(dec_input, dec_state)
        dec_state = [dec_h,dec_c] # Decoder state that'll be passed on next timestep

        for timestep in range(1,20):
     
          dec_input = output_seq[:,timestep] # decoder input at timestep
          dec_input = dec_input[:,np.newaxis]
          temp,dec_h,dec_c = self.decoder(dec_input, dec_state) # hidden and cell state from previous timestep
          dec_state = [dec_h,dec_c] # passing on current hidden and cell state to next timestep

          decoder_output = tf.keras.layers.Concatenate(axis=1)([decoder_output, temp])
          # print(timestep, decoder_output.shape)

        dropout = tf.keras.layers.Dropout(0.15)(decoder_output)
        output = self.dense(decoder_output)

        return output


In [27]:
model  = Encoder_Decoder(encoder_inputs_length=20,decoder_inputs_length=1,output_vocab_size=vocab_size_eng, batch_size = 1024)
opt = tf.keras.optimizers.Adam(0.001)
model.compile(optimizer=opt,loss=tf.keras.losses.SparseCategoricalCrossentropy())

train_steps=train.shape[0]//1024
valid_steps=validation.shape[0]//1024

model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=25, validation_data=test_dataloader, validation_steps=valid_steps)
model.summary()

2022-06-19 18:17:22.956353: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005
2022-06-19 18:17:23.383984: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Model: "encoder__decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_2 (Encoder)          multiple                  1364240   
_________________________________________________________________
decoder_1 (Decoder)          multiple                  1350640   
_________________________________________________________________
dense (Dense)                multiple                  850395    
Total params: 3,565,275
Trainable params: 2,256,875
Non-trainable params: 1,308,400
_________________________________________________________________


In [28]:
model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=25, validation_data=test_dataloader, validation_steps=valid_steps)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f38340b4610>

In [None]:
model.save_weights('model_50.h5')

In [119]:
def predict(input_data):
    
  initial_state = [tf.random.normal((1,64)),tf.random.normal((1,64))] # Initial state of Encoder

  enc_output, state_h, state_c = model.layers[0](input_data,initial_state) #Encoder

  dec_input = np.array(tknizer_eng.texts_to_sequences(['<start>'])) # First word

  translated_seq = []

  final_word = ''

  i=0
  while ((i<20) or (final_word == '<end>')):
    dec_output,state_h,state_c = model.layers[1](dec_input,[state_h,state_c]) # Decoder
    pred_output = model.layers[2](dec_output) # Dense
    
    max_prob_word_index = tf.math.argmax(pred_output[0][0], axis=0)

    final_word = list(tknizer_eng.word_index.keys())[max_prob_word_index]
    translated_seq.append(final_word)

    dec_input = np.array(tknizer_eng.texts_to_sequences([final_word])) # Predicted word passed on to next timestep
    i += 1

  translated_seq = ' '.join(translated_seq)
  return translated_seq

In [133]:
import random
from nltk.translate.bleu_score import corpus_bleu

val_index = [index for index in validation.index.values]
random_seq_index = random.sample(val_index,1000)

sample_ita = data.loc[random_seq_index,'italian']
actual_sample_eng = data.loc[random_seq_index,'english_out']

In [134]:
test_seq = tknizer_ita.texts_to_sequences(sample_ita)
padded_test_seq = pad_sequences(test_seq, 20, dtype='int32', padding='post')

In [137]:
import nltk
pred_eng = []
for seq,actual in zip(padded_test_seq,actual_sample_eng):
    pred = predict(seq[np.newaxis,:])
    pred_eng.append(pred)
pred_eng = pd.Series(pred_eng)
print('Corpus Bleu:',nltk.translate.bleu_score.corpus_bleu(pred_eng,actual_sample_eng))

Corpus Bleu: 0.7634903560832931


## Encoder Decoder with Attention Layer

### Encoder ###

In [9]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self,inp_vocab_size,embedding_size,lstm_size,input_length):
        super().__init__()
        self.inp_vocab_size = inp_vocab_size
        self.enc_embedding_size = embedding_size
        self.enc_lstm_size = lstm_size
        self.enc_input_length = input_length
        
        self.encoder_embedding = Embedding(input_dim=self.inp_vocab_size, output_dim=self.enc_embedding_size, 
                                           input_length=self.enc_input_length, mask_zero=True, name="Embedding_Layer_Encoder")
        self.encoder_lstm = LSTM(self.enc_lstm_size, return_state=True, return_sequences=True, name="Encoder_LSTM")

    def call(self,input_sequence,states):
        
        self.input_embedd = self.encoder_embedding(input_sequence)
        self.encoder_output, self.lstm_state_h,self.lstm_state_c = self.encoder_lstm(self.input_embedd, initial_state = states)
        return self.encoder_output, self.lstm_state_h,self.lstm_state_c #encoder_output, last time step's hidden and cell state
    
    def initialize_states(self,batch_size):
        self.batch_size = batch_size
        self.initial_hidden_state = tf.random.normal((self.batch_size,self.enc_lstm_size))
        self.initial_cell_state = tf.random.normal((self.batch_size,self.enc_lstm_size))
        return [self.initial_hidden_state,self.initial_cell_state]

### Attention ###

In [11]:
from tensorflow.python.eager import context
class Attention(tf.keras.layers.Layer):
    def __init__(self,scoring_fun, att_units, k = 4):
        super().__init__()
        self.scoring_fun = scoring_fun
        self.att_units = att_units

        if self.scoring_fun=='dot':
            pass
        elif self.scoring_fun == 'general':
            pass
        elif self.scoring_fun == 'concat':
            self.k = k
        
    def call(self,decoder_hidden_state_batch,encoder_outputs_batch):

        if self.scoring_fun == 'dot':
            scores = tf.keras.layers.Dot(axes=(2,1))([encoder_outputs_batch,decoder_hidden_state_batch[:,:,np.newaxis]])
            softmax_scores = np.exp(scores) / np.sum(np.exp(scores), axis=0)
            softmax_scores = np.squeeze(softmax_scores, axis=2)

                
        elif self.scoring_fun == 'general':
            
            weights = np.random.rand(encoder_outputs_batch.shape[2], decoder_hidden_state_batch.shape[1])
            temp1 = np.tensordot(encoder_outputs_batch,weights, axes=((2),(0))) # 1024x20x64
            temp2 = np.tensordot(decoder_hidden_state_batch,weights, axes=((1),(1))) # 1024x64
            scores = tf.keras.layers.Dot(axes=(2,1))([temp1,temp2[:,:,np.newaxis]]) # 1024x20x1
            
            scores = np.squeeze(scores,axis=2)
            temp_scores = scores/tf.norm(scores)
            softmax_scores = np.exp(temp_scores) / np.sum(np.exp(temp_scores), axis=0) 
            
        elif self.scoring_fun == 'concat':
            
            weights_enc = np.random.rand(encoder_outputs_batch.shape[2],self.k)
            weights_dec = np.random.rand(decoder_hidden_state_batch.shape[1],self.k)
            v = np.random.rand(self.k,1)
            
            temp_enc = np.tensordot(encoder_outputs_batch,weights_enc, axes=((2),(0))) # 1024x20xk
            
            temp_dec = np.tensordot(decoder_hidden_state_batch,weights_dec, axes=((1),(0))) # 1024xk
            temp_dec = temp_dec[:,np.newaxis,:] #1024x1xk
            temp_dec = np.repeat(temp_dec,encoder_outputs_batch.shape[1],axis=1) #1024x20xk
            
            temp = np.add(temp_enc,temp_dec) #1024x20xk
            temp = np.tanh(temp) #1024x20xk
            
            scores = np.tensordot(temp,v, axes=((2),(0))) #1024x20x1
            softmax_scores = np.exp(scores) / np.sum(np.exp(scores), axis=0) #1024x20x1
            softmax_scores = np.squeeze(softmax_scores, axis=2) #1024x20

        context_vector = tf.keras.layers.Dot(axes=(1,1))([encoder_outputs_batch,softmax_scores])
        
        return context_vector,softmax_scores[:,:,np.newaxis] #BatchSizexAtt_Units, BatchSizexTimestepsx1

### Decoder ###

#### OneStepDecoder ####

In [13]:
class OneStepDecoder(tf.keras.layers.Layer):
  def __init__(self,tar_vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units,flag=0):
    
    # Initializing decoder embedding layer, LSTM and other objects.
    
    super().__init__()
    self.tar_vocab_size = tar_vocab_size
    self.embedding_dim = embedding_dim
    self.input_length = input_length
    self.dec_units = dec_units
    self.score_fun = score_fun
    self.att_units = att_units
    self.flag = flag
    
    if self.flag == 0:
        self.decoder_embedding = Embedding(input_dim=self.tar_vocab_size, output_dim=self.embedding_dim, 
                                        input_length=self.input_length, mask_zero=True, name="Decoder_Embedding")
    else:
        self.decoder_embedding = Embedding(input_dim=self.tar_vocab_size, output_dim=self.embedding_dim, 
                                        input_length=self.input_length, weights = [embedding_matrix], trainable = False, mask_zero=True, name="Decoder_Embedding")
        
    self.decoder_lstm = LSTM(self.att_units, return_state=True, return_sequences=True, name="Decoder_LSTM")
    self.attention = Attention(self.score_fun,self.att_units)
    self.dense = Dense(self.tar_vocab_size, activation = 'softmax')

  def call(self,input_to_decoder, encoder_output, state_h,state_c):
    
    dec_input_embedd = self.decoder_embedding(input_to_decoder) # BatchSizex1xEmbeddingUnits

#     print('Dec:', dec_input_embedd.shape)
    
    context_vector, scores = self.attention(state_h,encoder_output) # BatchSizexEmbeddingUnits

#     print('Con:', context_vector.shape)
    
    final_input = np.concatenate((dec_input_embedd,context_vector[:,np.newaxis,:]), axis=-1)
    
    decoder_output, decoder_state_h, decoder_state_c = self.decoder_lstm(final_input, initial_state = [state_h,state_c])
    
    final_output = self.dense(decoder_output) #BatchSizex1xVocabSize
    
    final_output = final_output[:,0,:] # BatchSizexVocabSize
    
    return final_output,decoder_state_h,decoder_state_c,scores,context_vector

In [15]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self,out_vocab_size, embedding_dim, input_length, dec_units ,score_fun ,att_units, flag = 0):
      #Intializing necessary variables and creating an object from the class onestepdecoder
        
        super().__init__()
        self.out_vocab_size = out_vocab_size
        self.dec_embedding_dim = embedding_dim
        self.dec_input_length = input_length
        self.dec_lstm_size = dec_units
        self.score_fun = score_fun
        self.att_units = att_units
        self.flag = flag
        
        self.onestepdecoder=OneStepDecoder(self.out_vocab_size, self.dec_embedding_dim, self.dec_input_length, self.dec_lstm_size ,self.score_fun ,self.att_units, self.flag)

        
    def call(self, input_to_decoder,encoder_output,decoder_hidden_state,decoder_cell_state, flag=0):
        
        all_outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
        
        if flag == 0:
            tmp = input_to_decoder.shape[1]
        else:
            tmp = 20
            
        for timestep in range(tmp): # for each timestep in decoder input
            dec_input = input_to_decoder[:,timestep]
            dec_input = dec_input[:,np.newaxis]
            output,decoder_hidden_state,decoder_cell_state,attention_weights,context_vector=self.onestepdecoder(dec_input,encoder_output,decoder_hidden_state,decoder_cell_state)
            all_outputs = all_outputs.write(timestep,output)

        all_outputs = tf.transpose(all_outputs.stack(),[1,0,2])
        return all_outputs

### Encoder Decoder Model ###

In [30]:
class encoder_decoder(tf.keras.Model):
  def __init__(self,batch_size,enc_input_length, dec_input_length,score_fun ,att_units,flag):
    #Intializing objects from encoder decoder
               
    super().__init__()
    self.batch_size = batch_size
    self.enc_input_length = enc_input_length
    self.dec_input_length = dec_input_length
    self.score_fun = score_fun
    self.att_units = att_units
    self.flag = flag

    self.encoder = Encoder(vocab_size_ita+1, 50, 64, self.enc_input_length)
    self.decoder = Decoder(vocab_size_eng+1, 100, self.dec_input_length, 64, self.score_fun,self.att_units,self.flag)

  
  def call(self,data):

    input_seq,output_seq = data[0], data[1]

    initial_state=self.encoder.initialize_states(self.batch_size)

    encoder_output, encoder_final_h, encoder_final_c = self.encoder(input_seq,initial_state)
    decoder_output = self.decoder(output_seq,encoder_output, encoder_final_h, encoder_final_c,1)

    return decoder_output

#### Dot ####

In [33]:
model  = encoder_decoder(batch_size = 1024,enc_input_length=20,dec_input_length=1,score_fun='dot',att_units=64,flag=1)
opt = tf.keras.optimizers.Adam(0.001)
model.compile(optimizer=opt,loss=tf.keras.losses.SparseCategoricalCrossentropy(), run_eagerly=True)

train_steps=train.shape[0]//1024
valid_steps=validation.shape[0]//1024

model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=25, validation_data=test_dataloader, validation_steps=valid_steps)
model.summary()

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Model: "encoder_decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_1 (Encoder)          multiple                  1360740   
_________________________________________________________________
decoder_3 (Decoder)          multiple                  2210884   
Total params: 3,571,624
Trainable params: 2,267,224
Non-trainable params: 1,304,400
_________________________________________________________________


In [None]:
model.save_weights('W_Attention_25_dot.h5')

In [28]:
# Loading Weights

model  = encoder_decoder(batch_size = 1024,enc_input_length=20,dec_input_length=1,score_fun='dot',att_units=64,flag=1)
opt = tf.keras.optimizers.Adam(0.001)
model.compile(optimizer=opt,loss=tf.keras.losses.SparseCategoricalCrossentropy(), run_eagerly=True)

train_steps=train.shape[0]//1024
valid_steps=validation.shape[0]//1024

model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=1, validation_data=test_dataloader, validation_steps=valid_steps, verbose=0)
model.load_weights('../input/tknizer/W_Attention_25_dot.h5')
model.summary()

Model: "encoder_decoder_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_2 (Encoder)          multiple                  1360740   
_________________________________________________________________
decoder_4 (Decoder)          multiple                  2210884   
Total params: 3,571,624
Trainable params: 2,267,224
Non-trainable params: 1,304,400
_________________________________________________________________


In [30]:
def predict(input_data):
    
    all_outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    
    initial_state = model.layers[0].initialize_states(input_data.shape[0]) 

    enc_output, state_h, state_c = model.layers[0](padded_test_seq,initial_state)

    dec_input = np.array(tknizer_eng.texts_to_sequences(['<start>'])) # First word 
    dec_input = np.repeat(dec_input,1000) 
    dec_input = dec_input[:,np.newaxis] # 1000x1

    osdec=OneStepDecoder(vocab_size_eng+1,100,1,64,'dot',64,1)
    
    output,decoder_hidden_state,decoder_cell_state,_,_ = osdec(dec_input,enc_output,state_h,state_c)
    
    all_outputs = all_outputs.write(0,output)
    
    for timestep in range(1,20): # for each timestep in decoder input
        
        max_prob_word_index = tf.math.argmax(output, axis=1)

        tmp = [list(tknizer_eng.word_index.keys())[index] for index in max_prob_word_index]
        
        dec_input = np.array(tknizer_eng.texts_to_sequences(tmp))
        
        output,decoder_hidden_state,decoder_cell_state,_,_ = osdec(dec_input,enc_output,decoder_hidden_state,decoder_cell_state)
    
        all_outputs = all_outputs.write(timestep,output)
        
    all_outputs = all_outputs.write(timestep,output)

    all_outputs = tf.transpose(all_outputs.stack(),[1,0,2])
    
    return all_outputs

In [31]:
import nltk
import random
from nltk.translate.bleu_score import corpus_bleu

val_index = [index for index in validation.index.values]
random_seq_index = random.sample(val_index,1000)

sample_ita = data.loc[random_seq_index,'italian']
actual_sample_eng = data.loc[random_seq_index,'english_out']

test_seq = tknizer_ita.texts_to_sequences(sample_ita)
padded_test_seq = pad_sequences(test_seq, 20, dtype='int32', padding='post')

In [32]:
preds = predict(padded_test_seq)

In [33]:
preds = tf.math.argmax(preds, axis=-1)

In [39]:
pred_eng = tknizer_eng.sequences_to_texts(preds.numpy())

In [42]:
pred_eng = pd.Series(pred_eng)
print('Corpus Bleu:',nltk.translate.bleu_score.corpus_bleu(pred_eng,actual_sample_eng))

Corpus Bleu: 0.794098876160373


#### General ####

In [31]:
model  = encoder_decoder(batch_size = 1024,enc_input_length=20,dec_input_length=1,score_fun='general',att_units=64,flag=1)
opt = tf.keras.optimizers.Adam(0.001)
model.compile(optimizer=opt,loss=tf.keras.losses.SparseCategoricalCrossentropy(), run_eagerly=True)

train_steps=train.shape[0]//1024
valid_steps=validation.shape[0]//1024

model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=25, validation_data=test_dataloader, validation_steps=valid_steps)
model.summary()

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Model: "encoder_decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_2 (Encoder)          multiple                  1360740   
_________________________________________________________________
decoder_3 (Decoder)          multiple                  2210884   
Total params: 3,571,624
Trainable params: 2,267,224
Non-trainable params: 1,304,400
_________________________________________________________________


In [32]:
model.save_weights('W_Attention_25_general.h5')

In [36]:
def predict(input_data):
    
    all_outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    
    initial_state = model.layers[0].initialize_states(input_data.shape[0]) 

    enc_output, state_h, state_c = model.layers[0](padded_test_seq,initial_state)

    dec_input = np.array(tknizer_eng.texts_to_sequences(['<start>'])) # First word 
    dec_input = np.repeat(dec_input,1000) 
    dec_input = dec_input[:,np.newaxis] # 1000x1

    osdec=OneStepDecoder(vocab_size_eng+1,100,1,64,'dot',64,1)
    
    output,decoder_hidden_state,decoder_cell_state,_,_ = osdec(dec_input,enc_output,state_h,state_c)
    
    all_outputs = all_outputs.write(0,output)
    
    for timestep in range(1,20): # for each timestep in decoder input
        
        max_prob_word_index = tf.math.argmax(output, axis=1)

        tmp = [list(tknizer_eng.word_index.keys())[index] for index in max_prob_word_index]
        
        dec_input = np.array(tknizer_eng.texts_to_sequences(tmp))
        
        output,decoder_hidden_state,decoder_cell_state,_,_ = osdec(dec_input,enc_output,decoder_hidden_state,decoder_cell_state)
    
        all_outputs = all_outputs.write(timestep,output)
        
    all_outputs = all_outputs.write(timestep,output)

    all_outputs = tf.transpose(all_outputs.stack(),[1,0,2])
    
    return all_outputs

In [37]:
import nltk
import random
from nltk.translate.bleu_score import corpus_bleu

val_index = [index for index in validation.index.values]
random_seq_index = random.sample(val_index,1000)

sample_ita = data.loc[random_seq_index,'italian']
actual_sample_eng = data.loc[random_seq_index,'english_out']

test_seq = tknizer_ita.texts_to_sequences(sample_ita)
padded_test_seq = pad_sequences(test_seq, 20, dtype='int32', padding='post')

In [38]:
preds = predict(padded_test_seq)
preds = tf.math.argmax(preds, axis=-1)

pred_eng = tknizer_eng.sequences_to_texts(preds.numpy())
pred_eng = pd.Series(pred_eng)

print('Corpus Bleu:',nltk.translate.bleu_score.corpus_bleu(pred_eng,actual_sample_eng))

Corpus Bleu: 0.7977816329796129


#### Concat ####

In [27]:
model  = encoder_decoder(batch_size = 1024,enc_input_length=20,dec_input_length=1,score_fun='concat',att_units=64,flag=1)
opt = tf.keras.optimizers.Adam(0.001)
model.compile(optimizer=opt,loss=tf.keras.losses.SparseCategoricalCrossentropy(), run_eagerly=True)

train_steps=train.shape[0]//1024
valid_steps=validation.shape[0]//1024

model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=25, validation_data=test_dataloader, validation_steps=valid_steps)
model.summary()

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Model: "encoder_decoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_1 (Encoder)          multiple                  1360740   
_________________________________________________________________
decoder_3 (Decoder)          multiple                  2210884   
Total params: 3,571,624
Trainable params: 2,267,224
Non-trainable params: 1,304,400
_________________________________________________________________


In [28]:
model.save_weights('W_Attention_25_concat.h5')

In [31]:
def predict(input_data):
    
    all_outputs = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    
    initial_state = model.layers[0].initialize_states(input_data.shape[0]) 

    enc_output, state_h, state_c = model.layers[0](padded_test_seq,initial_state)

    dec_input = np.array(tknizer_eng.texts_to_sequences(['<start>'])) # First word 
    dec_input = np.repeat(dec_input,1000) 
    dec_input = dec_input[:,np.newaxis] # 1000x1

    osdec=OneStepDecoder(vocab_size_eng+1,100,1,64,'dot',64,1)
    
    output,decoder_hidden_state,decoder_cell_state,_,_ = osdec(dec_input,enc_output,state_h,state_c)
    
    all_outputs = all_outputs.write(0,output)
    
    for timestep in range(1,20): # for each timestep in decoder input
        
        max_prob_word_index = tf.math.argmax(output, axis=1)

        tmp = [list(tknizer_eng.word_index.keys())[index] for index in max_prob_word_index]
        
        dec_input = np.array(tknizer_eng.texts_to_sequences(tmp))
        
        output,decoder_hidden_state,decoder_cell_state,_,_ = osdec(dec_input,enc_output,decoder_hidden_state,decoder_cell_state)
    
        all_outputs = all_outputs.write(timestep,output)
        
    all_outputs = all_outputs.write(timestep,output)

    all_outputs = tf.transpose(all_outputs.stack(),[1,0,2])
    
    return all_outputs

In [32]:
import nltk
import random
from nltk.translate.bleu_score import corpus_bleu

val_index = [index for index in validation.index.values]
random_seq_index = random.sample(val_index,1000)

sample_ita = data.loc[random_seq_index,'italian']
actual_sample_eng = data.loc[random_seq_index,'english_out']

test_seq = tknizer_ita.texts_to_sequences(sample_ita)
padded_test_seq = pad_sequences(test_seq, 20, dtype='int32', padding='post')

In [33]:
preds = predict(padded_test_seq)
preds = tf.math.argmax(preds, axis=-1)

pred_eng = tknizer_eng.sequences_to_texts(preds.numpy())
pred_eng = pd.Series(pred_eng)

print('Corpus Bleu:',nltk.translate.bleu_score.corpus_bleu(pred_eng,actual_sample_eng))

Corpus Bleu: 0.7964345941820906


In [1]:
from prettytable import PrettyTable
myTable = PrettyTable(["Description", "Function", "Bleu Score"])

myTable.add_row(["Simple Encoder Decoder without Attention", "N/A", "0.7634"])
myTable.add_row(["Encoder Decoder with Attention", "Dot", "0.7940"])
myTable.add_row(["Encoder Decoder with Attention", "General", "0.7977"])
myTable.add_row(["Encoder Decoder with Attention", "Concat", "0.7964"])
  
print(myTable)

+------------------------------------------+----------+------------+
|               Description                | Function | Bleu Score |
+------------------------------------------+----------+------------+
| Simple Encoder Decoder without Attention |   N/A    |   0.7634   |
|      Encoder Decoder with Attention      |   Dot    |   0.7940   |
|      Encoder Decoder with Attention      | General  |   0.7977   |
|      Encoder Decoder with Attention      |  Concat  |   0.7964   |
+------------------------------------------+----------+------------+
