In [None]:
import numpy as np
import pandas as pd
import re

import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# downloading data
!wget http://www.manythings.org/anki/ita-eng.zip
!unzip ita-eng.zip

**Processing Data**

In [None]:
# reading data
with open('ita.txt', 'r', encoding="utf8") as f:
    print(f.readlines(500))

['Hi.\tCiao!\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #607364 (Cero)\n', 'Hi.\tCiao.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4522287 (Guybrush88)\n', 'Run!\tCorri!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906347 (Guybrush88)\n', 'Run!\tCorra!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906348 (Guybrush88)\n', 'Run!\tCorrete!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906350 (Guybrush88)\n', 'Who?\tChi?\tCC-BY 2.0 (France) Attribution: tatoeba.org #2083030 (CK) & #2126402 (Guybrush88)\n']


In [None]:
# creating dataframe with english , italian sentences
with open('ita.txt', 'r', encoding="utf8") as f:
    eng=[]
    ita=[]
    for i in f.readlines():
      a = i.split("\t")
      eng.append(a[0])
      ita.append(a[1])
data = pd.DataFrame()
data['english'] = eng
data['italian'] = ita
data

Unnamed: 0,english,italian
0,Hi.,Ciao!
1,Hi.,Ciao.
2,Run!,Corri!
3,Run!,Corra!
4,Run!,Correte!
...,...,...
362856,I know that adding sentences only in your nati...,So che aggiungere frasi soltanto nella sua lin...
362857,I know that adding sentences only in your nati...,So che aggiungere frasi solamente nella sua li...
362858,I know that adding sentences only in your nati...,So che aggiungere frasi solamente nella sua li...
362859,Doubtless there exists in this world precisely...,Senza dubbio esiste in questo mondo proprio la...


In [None]:
def decontractions(phrase): #https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python/47091490#47091490
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [None]:
def preprocess_eng(text):
    text = decontractions(text)
    text = re.sub('[^A-Za-z0-9 ]+', '', text)
    text = text.lower()
    return text

In [None]:
def preprocess_ita(text):
    text = decontractions(text)
    text = re.sub('[$)\?"'.°!;\'€%:,(/]', '', text)
    text = re.sub('\u200b', ' ', text)
    text = re.sub('\xa0', ' ', text)
    text = re.sub('-', ' ', text)
    text = text.lower()
    return text

In [None]:
data['english'] = data['english'].apply(preprocess_eng)
data['italian'] = data['italian'].apply(preprocess_ita)
data.head()

Unnamed: 0,english,italian
0,hi,ciao
1,hi,ciao
2,run,corri
3,run,corra
4,run,correte


In [None]:
# getting length of sentences to select max_length
def length_text(data):
  len_data = []
  for i in data:
    len_data.append(len(i.split()))
  return len_data

In [None]:
ita_lengths = length_text(data['italian'].values)
eng_lengths = length_text(data['english'].values)

In [None]:
np.quantile(ita_lengths , [0,0.25,0.5,0.75,0.9,0.95,0.99])

array([ 1.,  4.,  5.,  7.,  8.,  9., 12.])

In [None]:
np.quantile(eng_lengths , [0,0.25,0.5,0.75,0.9,0.95,0.99])

array([ 1.,  4.,  6.,  7.,  8.,  9., 12.])

In [None]:
np.quantile(ita_lengths , np.arange(.99,1,0.001)) # getting percentile value between 0.99 to 1.0

array([12., 12., 12., 13., 13., 13., 14., 15., 16., 22., 92.])

In [None]:
np.quantile(eng_lengths , np.arange(.99,1,0.001))

array([ 12.,  12.,  13.,  13.,  13.,  14.,  14.,  15.,  16.,  25., 101.])

In [None]:
ita_lengths = data['italian'].str.split().apply(len)
eng_lengths = data['english'].str.split().apply(len)

In [None]:
# selecting maximum input length of 20 words

data['italian_len'] = data['italian'].str.split().apply(len)
data = data[data['italian_len'] < 20]

data['english_len'] = data['english'].str.split().apply(len)
data = data[data['english_len'] < 20]

data['english_inp'] = '<start> ' + data['english'].astype(str) # decoder input starts with <start> token
data['english_out'] = data['english'].astype(str) + ' <end>'   # decoder output ends with <end> token

data = data.drop(['english','italian_len','english_len'], axis=1)

In [None]:
print(data.english_inp.values[15000])
print(data.english_out.values[15000])

<start> let tom drive
let tom drive <end>


In [None]:
data.sample(5)

Unnamed: 0,italian,english_inp,english_out
99788,tom sembra devastato,<start> tom looks devastated,tom looks devastated <end>
59360,aggiungete una spiegazione,<start> add an explanation,add an explanation <end>
250904,avevo tutto sotto controllo,<start> i had everything under control,i had everything under control <end>
12678,fu lapidato,<start> he was stoned,he was stoned <end>
123290,io non ve lo posso dare,<start> i can not give it to you,i can not give it to you <end>


In [None]:
# train test split

from sklearn.model_selection import train_test_split
trainval, test = train_test_split(data, test_size=0.003) # for test set getting appox. 1000 sentences
train, val = train_test_split(trainval, test_size=0.15)

In [None]:
print(train.shape, val.shape, test.shape)
train.iloc[0]['english_inp']= str(train.iloc[0]['english_inp'])+' <end>'

(307077, 3) (54191, 3) (1088, 3)


In [None]:
# tokenizing

tknizer_ita = Tokenizer()
tknizer_ita.fit_on_texts(train['italian'].values)
tknizer_eng = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n') # filter same as default except < , >
tknizer_eng.fit_on_texts(train['english_inp'].values)

In [None]:
vocab_size_eng=len(tknizer_eng.word_index.keys())
print(vocab_size_eng)
vocab_size_ita=len(tknizer_ita.word_index.keys())
print(vocab_size_ita)

13335
27402


In [None]:
tknizer_eng.word_index['<start>'], tknizer_eng.word_index['<end>']

(1, 10668)

In [None]:
# getting glove vectors
!wget https://www.dropbox.com/s/ddkmtqz01jc024u/glove.6B.100d.txt

In [None]:
# embedding matrix using glove 100d embeddings

embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.zeros((vocab_size_eng+1, 100))
for word, i in tknizer_eng.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
embedding_matrix.shape

(13336, 100)

**Dataset Loader:**

In [None]:
class Dataset:
    def __init__(self, data, tknizer_ita, tknizer_eng, max_len):
        self.encoder_inps = data['italian'].values
        self.decoder_inps = data['english_inp'].values
        self.decoder_outs = data['english_out'].values
        self.tknizer_eng = tknizer_eng
        self.tknizer_ita = tknizer_ita
        self.max_len = max_len

    def __getitem__(self, i):
        self.encoder_seq = self.tknizer_ita.texts_to_sequences([self.encoder_inps[i]])  # tokenizing input italian sentence
        self.decoder_inp_seq = self.tknizer_eng.texts_to_sequences([self.decoder_inps[i]]) # tokenizing decoder input 
        self.decoder_out_seq = self.tknizer_eng.texts_to_sequences([self.decoder_outs[i]]) # tokenizing decoder output

        self.encoder_seq = pad_sequences(self.encoder_seq, maxlen=self.max_len, dtype='int32', padding='post')
        self.decoder_inp_seq = pad_sequences(self.decoder_inp_seq, maxlen=self.max_len, dtype='int32', padding='post')
        self.decoder_out_seq = pad_sequences(self.decoder_out_seq, maxlen=self.max_len, dtype='int32', padding='post')
        return self.encoder_seq, self.decoder_inp_seq, self.decoder_out_seq

    def __len__(self): 
        return len(self.encoder_inps)

    
class Dataloder(tf.keras.utils.Sequence):    
    def __init__(self, dataset, batch_size=1):
        self.dataset = dataset
        self.batch_size = batch_size
        self.indexes = np.arange(len(self.dataset.encoder_inps))


    def __getitem__(self, i):
        start = i * self.batch_size
        stop = (i + 1) * self.batch_size
        data = []
        for j in range(start, stop):
            data.append(self.dataset[j])

        batch = [np.squeeze(np.stack(samples, axis=1), axis=0) for samples in zip(*data)]
        
        return tuple([[batch[0],batch[1]],batch[2]]) #input to encoder:italian sent, input to decoder:eng sent with <start>, decoder output:eng sent with <end>

    def __len__(self): 
        return len(self.indexes) // self.batch_size

    def on_epoch_end(self):
        self.indexes = np.random.permutation(self.indexes)

**Building Model:**

seq2seq_attention_mechanism_new.svg

ref: https://guillaumegenthial.github.io/sequence-to-sequence.html

**Encoder layer:**

In [None]:
class Encoder(tf.keras.layers.Layer):
    '''class that enocodes input sentence using lstm and returns encoded vector of shape [batch_size,input_length,lstm_units]'''
    def __init__(self, vocab_size, embedding_dim_enc, input_length, enc_units, dropout=0.0, recurrent_dropout=0.0):
      super().__init__()
      self.vocab_size = vocab_size
      self.embedding_dim_enc = embedding_dim_enc
      self.input_length = input_length
      self.enc_units= enc_units
      self.dropout = dropout 
      self.recurrent_dropout = recurrent_dropout 

    def get_config(self):
      config = super().get_config()
      config.update({'vocab_size': self.vocab_size, 'embedding_dim_enc': self.embedding_dim_enc,\
                     'input_length': self.input_length, 'enc_units': self.enc_units})
      return config

    def build(self, input_shape):
      self.embedding = Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim_enc, input_length=self.input_length,
                          mask_zero=True, name="embedding_layer_encoder") 
      self.lstm = LSTM(self.enc_units, return_state=True, return_sequences=True,dropout=self.dropout,\
                        recurrent_dropout=self.recurrent_dropout ,name="Encoder_LSTM")
      
    def call(self, input_sentances, training=True):
      '''input sentence is embeded and then passed to lstm'''
      input_embedd = self.embedding(input_sentances) # [b, max_len, embed-size]
      self.lstm_output, _, _ = self.lstm(input_embedd) # [b, max_len, lstm-units], 
      return self.lstm_output
    
    def initialize_states(self,batch_size):
      '''Given a batch size it will return intial hidden state and intial cell state'''
      return tf.zeros([batch_size, self.enc_units ]), tf.zeros([batch_size, self.enc_units ])

    def get_states(self):
      return self.lstm_state_h, self.lstm_state_c

In [None]:
# checking code

vocab_size = 150
embedding_dim_enc = 50
input_length = 10
enc_units= 32
dropout = 0.3
recurrent_dropout = 0.3
batch_size = 16

input_sentances = tf.random.uniform(shape=[batch_size,input_length])

e = Encoder(vocab_size, embedding_dim_enc, input_length, enc_units, dropout, recurrent_dropout)
e(input_sentances).shape

TensorShape([16, 10, 32])

**Attention Mechanism Layer:**

In [None]:
class AttentionMechanism(tf.keras.layers.Layer):
  '''Class the calculates attention weights and corresponding weighted vector using simple dot product operation.'''
  def __init__(self, initializer = tf.keras.initializers.GlorotUniform() ):
    super().__init__()
    self.initializer = initializer

  def get_config(self):
    config = super().get_config()
    config.update({'initializer': self.initializer })
    return config

  def call(self,decoder_hidden_state,encoder_output):
    '''decoder input is transformed to match encoder output dimension and attention weights are calculated
    based on similarity using dot products and weighted sum of encoder hidden state vector is returned as context vector
    to be used by decoder'''
    initializer = self.initializer
    # initializing decoder transformation matrix
    values = initializer(shape=(decoder_hidden_state.shape[2],encoder_output.shape[2])) # [dec_embed_dim, encoder_lstm_units]
    # tranforming decoder input
    similarity1 = tf.matmul(decoder_hidden_state,values) # [b,1,dec_embed_dim] X [dec_embed_dim, encoder_lstm_units] = [b,1,encoder_lstm_units]
    # finding similarity score
    similarity = tf.matmul(similarity1,encoder_output, transpose_b=True) # [b,1,encoder_lstm_units] X [b,encoder_lstm_units,max_len] = [b,1,max_len]
    # normalizing scores using softmax
    attn_weights = tf.nn.softmax( similarity,axis=-1 ) # [b,1,max_len]
    # calculating weighted sum
    context_vector = tf.matmul(attn_weights,encoder_output) # [b,1,max_len] X -[b,max_len,encoder_lstm_units] = [b,1,encoder_lstm_units]
    context_vector = tf.squeeze(context_vector,axis=1) # [b,encoder_lstm_units]
    attn_weights = tf.transpose(attn_weights, perm=[0, 2, 1]) # [b,max_len,1]
    return context_vector, attn_weights

In [None]:
# checking code

input_length=10
batch_size=16
encoder_lstm_units=32
dec_embed_dim=50

decoder_hidden_state=tf.random.uniform(shape=[batch_size,1,dec_embed_dim])
encoder_output=tf.random.uniform(shape=[batch_size,input_length,encoder_lstm_units])

attn=AttentionMechanism()
c,wt = attn(decoder_hidden_state,encoder_output)
c.shape , wt.shape

(TensorShape([16, 32]), TensorShape([16, 10, 1]))

**Decoder Encoder Cross Attention:**

In [None]:
class DecoderEncoderCrossAttention(tf.keras.Model):
  '''class that performs cross attention on decoder input, pass the attention updated input to
  lstm and then to final dense layer having units equal to output vocab size. the decoder input is passed
  one word at a time over batch'''
  def __init__(self,tar_vocab_size, embedding_dim_dec, input_length, dec_units, lstm_dropout, recurrent_dropout, trainable=True):
    super().__init__()
    self.tar_vocab_size = tar_vocab_size
    self.embedding_dim_dec = embedding_dim_dec
    self.input_length = input_length
    self.dec_units = dec_units
    self.lstm_dropout = lstm_dropout
    self.recurrent_dropout = recurrent_dropout 
    self.trainable = True

  def get_config(self):
      config = super().get_config()
      config.update({
          'tar_vocab_size': self.tar_vocab_size, 'embedding_dim_dec': self.embedding_dim_dec,
          'input_length': self.input_length, 'dec_units': self.dec_units, 'trainable': self.trainable})
      return config

  def build(self, input_shape):
    if self.trainable:
      self.embedding = Embedding(input_dim=self.tar_vocab_size, output_dim=self.embedding_dim_dec, input_length=self.input_length,
                        mask_zero=True, name="embedding_layer_decoder", trainable=True)
    else:
      self.embedding = Embedding(input_dim=self.tar_vocab_size, output_dim=self.embedding_dim_dec, input_length=self.input_length,
                        mask_zero=True, name="embedding_layer_decoder", weights=[embedding_matrix], trainable=False)

    self.lstm = LSTM(self.dec_units, return_sequences=True, return_state=True,dropout=self.lstm_dropout,recurrent_dropout=self.recurrent_dropout, name="Encoder_LSTM")
    self.attention = AttentionMechanism()
    self.dense_layer = Dense(self.tar_vocab_size,activation=None)

  def call(self,input_to_decoder, encoder_output): 
    '''takes decoder single input over batch, encoder outputs, performs cross attention and returns returns logits'''
    # embedding decoder single input
    target_embedd = self.embedding(input_to_decoder) # [b,1,embedding_dim_dec]
    # getting attention updated embedding vector
    context_vector,_ = self.attention(target_embedd,encoder_output) #[b,encoder_lstm_units]
    # concataneting embeded input and attention updated input
    concat_input = tf.concat([target_embedd, tf.expand_dims(context_vector, 1)], -1)  # [b,1,(embedding_dim_dec+encoder_lstm_units)] 
    # passing to lstm
    self.lstm_output, self.lstm_state_h, self.lstm_state_c = self.lstm(concat_input) # [b,1,dec_lstm_units] , [b,1,dec_lstm_units] , [b,1,dec_lstm_units]
    # getting logits
    output = self.dense_layer(self.lstm_output)  # [b,1,tar_vocab_size]
    output = tf.squeeze(output,axis=1) # [b,tar_vocab_size]
    return output, self.lstm_state_h,self.lstm_state_c, context_vector                                                   

In [None]:
# checking code

tar_vocab_size=150
embedding_dim_dec=50
input_length=10
dec_units=48
batch_size=16
lstm_dropout = 0.5
recurrent_dropout=0.5

input_to_decoder=tf.random.uniform(shape=(batch_size,1),maxval=10,minval=0,dtype=tf.int32)
encoder_output=tf.random.uniform(shape=[batch_size,input_length,32])

d = DecoderEncoderCrossAttention(tar_vocab_size,embedding_dim_dec,input_length,dec_units,lstm_dropout,recurrent_dropout)
o,_,_,_ = d(input_to_decoder,encoder_output)
o.shape

TensorShape([16, 150])

**Decoder Layer:**

In [None]:
class DecoderBlock(tf.keras.Model):
  '''class which gives logits values for full decoder input length '''
  def __init__(self,out_vocab_size, embedding_dim_dec, input_length, dec_units,lstm_dropout,recurrent_dropout):
    super().__init__()
    self.out_vocab_size = out_vocab_size
    self.embedding_dim_dec = embedding_dim_dec
    self.input_length = input_length
    self.dec_units = dec_units
    self.lstm_dropout = lstm_dropout
    self.recurrent_dropout = recurrent_dropout

  def get_config(self):
    config = super().get_config()
    config.update({
        'out_vocab_size': self.out_vocab_size, 'embedding_dim_dec': self.embedding_dim_dec,
        'input_length': self.input_length, 'dec_units': self.dec_units})
    return config

  def build(self,input_shapes):
    self.crossattention = DecoderEncoderCrossAttention(self.out_vocab_size, self.embedding_dim_dec, self.input_length,\
                                                        self.dec_units ,self.lstm_dropout,self.recurrent_dropout,False)

  def call(self, input_to_decoder,encoder_output):
    # creating a empty array of length equal to input length to fill logits value
    all_outputs = tf.TensorArray(tf.float32,size=self.input_length)
    # iterating over individual input word
    for timestep in range(self.input_length):
      # getting logits value for current input word
      output, decoder_hidden_state,decoder_cell_state,_ = self.crossattention(input_to_decoder[:,timestep:timestep+1], encoder_output)
      all_outputs = all_outputs.write(timestep, output) #[max_len,b,tar_vocab_size]
    all_outputs = tf.transpose(all_outputs.stack(), [1,0,2]) #[b,max_len,tar_vocab_size]
    return all_outputs 

In [None]:
# checking code

tar_vocab_size=150 
embedding_dim_dec=50
input_length=10
dec_units=48
enc_units = 32
batch_size=16
lstm_dropout=0.4
recurrent_dropout=0.4

input_to_decoder=tf.random.uniform(shape=(batch_size,input_length),maxval=10,minval=0,dtype=tf.int32)
encoder_output=tf.random.uniform(shape=[batch_size,input_length,enc_units])

d =  DecoderBlock(tar_vocab_size,embedding_dim_dec,input_length,dec_units,lstm_dropout,recurrent_dropout)
d(input_to_decoder,encoder_output).shape

TensorShape([16, 10, 150])

**Model:**

In [None]:
class TranslationModel(tf.keras.Model):

  def __init__(self, encoder_inputs_length,decoder_inputs_length, vocab_size_ita,vocab_size_eng,\
                embedding_dim_enc,embedding_dim_dec,enc_units,dec_units,lstm_dropout,recurrent_dropout):
    super().__init__()
    self.encoder_inputs_length = encoder_inputs_length
    self.decoder_inputs_length = decoder_inputs_length
    self.vocab_size_ita = vocab_size_ita
    self.vocab_size_eng = vocab_size_eng
    self.embedding_dim_enc = embedding_dim_enc
    self.embedding_dim_dec = embedding_dim_dec
    self.enc_units = enc_units
    self.dec_units = dec_units
    self.lstm_dropout = lstm_dropout
    self.recurrent_dropout = recurrent_dropout

  def get_config(self):
    config = super().get_config()
    config.update({'encoder_inputs_length': self.encoder_inputs_length, 'decoder_inputs_length': self.decoder_inputs_length,
        'vocab_size_ita': self.vocab_size_ita, 'vocab_size_eng': self.vocab_size_eng, 'embedding_dim_enc': self.embedding_dim_enc,
        'embedding_dim_dec': self.embedding_dim_dec , 'enc_units': self.enc_units, 'dec_units': self.dec_units, 'att_units': self.att_units})
    return config

  def build(self,input_shapes):
      self.encoder = Encoder(self.vocab_size_ita+1, self.embedding_dim_enc, self.encoder_inputs_length,\
                             self.enc_units,self.lstm_dropout,self.recurrent_dropout)
      self.decoder = DecoderBlock(self.vocab_size_eng+1, self.embedding_dim_dec, decoder_inputs_length,\
                                  self.lstm_dropout,self.recurrent_dropout)
      
  def call(self, data):
      input,output = data[0], data[1]
      #passing input to encoder
      encoder_output = self.encoder(input) #[b,max_len,encoder_lstm_units]
      #passing output to decoder
      decoder_output = self.decoder(output, encoder_output) #[b,max_len,tar_vocab_size] #logits
      return decoder_output

**Loss Function:**

In [None]:
# https://www.tensorflow.org/tutorials/text/image_captioning#model

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    """ Custom loss function that will not consider the loss for padded zero."""
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

def masked_acc(labels, preds):
  mask = tf.cast(labels!=0, tf.float32)
  preds = tf.argmax(preds, axis=-1)
  labels = tf.cast(labels, tf.int64)
  match = tf.cast(preds == labels, mask.dtype)
  acc = tf.reduce_sum(match*mask)/tf.reduce_sum(mask)
  return acc

**Callbacks:**

In [None]:
from itertools import combinations
import os
import datetime
log_dir = os.path.join('/content/drive/MyDrive/cross_attention/',"logs",'model','fits', datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir,histogram_freq=1,write_graph=True)

modelsave_wt = tf.keras.callbacks.ModelCheckpoint(
                     filepath='/content/drive/MyDrive/cross_attention/model.{epoch:02d}-{masked_acc:.4f}.h5', 
                     save_freq='epoch', verbose=1, monitor='masked_acc', 
                     save_weights_only=True, save_best_only=True
                 ) 

**Selected Hyperparameters:**

In [None]:
#vocab_size_ita=27260
#vocab_size_eng=13356

In [None]:
# after trying different values, following were selected

encoder_inputs_length = 20
decoder_inputs_length = 20
vocab_size_ita = vocab_size_ita
vocab_size_eng = vocab_size_eng
embedding_dim_enc = 100
embedding_dim_dec = 100
enc_units = 128
dec_units = 128
lstm_dropout = 0.2
recurrent_dropout = 0.2
optimizer = tf.keras.optimizers.Adam()

**Data Loader:**

In [None]:
batch_size=1024

train_dataset = Dataset(train, tknizer_ita, tknizer_eng, 20)
val_dataset  = Dataset(val, tknizer_ita, tknizer_eng, 20)
test_dataset  = Dataset(test, tknizer_ita, tknizer_eng, 20)

train_dataloader = Dataloder(train_dataset, batch_size=batch_size)
val_dataloader = Dataloder(val_dataset, batch_size=batch_size)
test_dataloader = Dataloder(test_dataset, batch_size=batch_size)

train_steps=train.shape[0]//batch_size
valid_steps=val.shape[0]//batch_size

**Training:**

In [None]:
model  = TranslationModel(encoder_inputs_length,decoder_inputs_length, vocab_size_ita,vocab_size_eng,embedding_dim_enc,\
                          embedding_dim_dec,enc_units,dec_units, lstm_dropout, recurrent_dropout)

model.compile(optimizer=optimizer, loss=loss_function, metrics=[masked_acc])

In [None]:
model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=10, validation_data=val_dataloader, validation_steps=valid_steps,\
          callbacks=[modelsave_wt,tensorboard_callback])



Epoch 1/10
Epoch 1: masked_acc improved from 0.14619 to 0.15041, saving model to /content/drive/MyDrive/cross_attention/model.01-0.1504.h5
Epoch 2/10
Epoch 2: masked_acc improved from 0.15041 to 0.23276, saving model to /content/drive/MyDrive/cross_attention/model.02-0.2328.h5
Epoch 3/10
Epoch 3: masked_acc improved from 0.23276 to 0.30925, saving model to /content/drive/MyDrive/cross_attention/model.03-0.3093.h5
Epoch 4/10
Epoch 4: masked_acc improved from 0.30925 to 0.34244, saving model to /content/drive/MyDrive/cross_attention/model.04-0.3424.h5
Epoch 5/10
Epoch 5: masked_acc improved from 0.34244 to 0.36345, saving model to /content/drive/MyDrive/cross_attention/model.05-0.3634.h5
Epoch 6/10
Epoch 6: masked_acc improved from 0.36345 to 0.38222, saving model to /content/drive/MyDrive/cross_attention/model.06-0.3822.h5
Epoch 7/10
Epoch 7: masked_acc improved from 0.38222 to 0.40001, saving model to /content/drive/MyDrive/cross_attention/model.07-0.4000.h5
Epoch 8/10
Epoch 8: masked_

<keras.callbacks.History at 0x7f0db44b4fd0>

In [None]:
model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=20, validation_data=val_dataloader, validation_steps=valid_steps,\
          callbacks=[modelsave_wt,tensorboard_callback],initial_epoch=10)

Epoch 11/20
Epoch 11: masked_acc improved from 0.45006 to 0.46839, saving model to /content/drive/MyDrive/cross_attention/model.11-0.4684.h5
Epoch 12/20
Epoch 12: masked_acc improved from 0.46839 to 0.48731, saving model to /content/drive/MyDrive/cross_attention/model.12-0.4873.h5
Epoch 13/20
Epoch 13: masked_acc improved from 0.48731 to 0.50789, saving model to /content/drive/MyDrive/cross_attention/model.13-0.5079.h5
Epoch 14/20
Epoch 14: masked_acc improved from 0.50789 to 0.53154, saving model to /content/drive/MyDrive/cross_attention/model.14-0.5315.h5
Epoch 15/20
Epoch 15: masked_acc improved from 0.53154 to 0.55796, saving model to /content/drive/MyDrive/cross_attention/model.15-0.5580.h5
Epoch 16/20
Epoch 16: masked_acc improved from 0.55796 to 0.58560, saving model to /content/drive/MyDrive/cross_attention/model.16-0.5856.h5
Epoch 17/20
Epoch 17: masked_acc improved from 0.58560 to 0.61246, saving model to /content/drive/MyDrive/cross_attention/model.17-0.6125.h5
Epoch 18/20
E

<keras.callbacks.History at 0x7f0c913a68b0>

In [None]:
model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=30, validation_data=val_dataloader, validation_steps=valid_steps,\
          callbacks=[modelsave_wt,tensorboard_callback],initial_epoch=20)

Epoch 21/30
Epoch 21: masked_acc improved from 0.67557 to 0.69156, saving model to /content/drive/MyDrive/cross_attention/model.21-0.6916.h5
Epoch 22/30
Epoch 22: masked_acc improved from 0.69156 to 0.70654, saving model to /content/drive/MyDrive/cross_attention/model.22-0.7065.h5
Epoch 23/30
Epoch 23: masked_acc improved from 0.70654 to 0.71911, saving model to /content/drive/MyDrive/cross_attention/model.23-0.7191.h5
Epoch 24/30
Epoch 24: masked_acc improved from 0.71911 to 0.73017, saving model to /content/drive/MyDrive/cross_attention/model.24-0.7302.h5
Epoch 25/30
Epoch 25: masked_acc improved from 0.73017 to 0.73983, saving model to /content/drive/MyDrive/cross_attention/model.25-0.7398.h5
Epoch 26/30
Epoch 26: masked_acc improved from 0.73983 to 0.74851, saving model to /content/drive/MyDrive/cross_attention/model.26-0.7485.h5
Epoch 27/30
Epoch 27: masked_acc improved from 0.74851 to 0.75629, saving model to /content/drive/MyDrive/cross_attention/model.27-0.7563.h5
Epoch 28/30
E

<keras.callbacks.History at 0x7f0f6291c790>

In [None]:
model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=40, validation_data=val_dataloader, validation_steps=valid_steps,\
          callbacks=[modelsave_wt,tensorboard_callback],initial_epoch=30)

Epoch 31/40
Epoch 31: masked_acc improved from 0.77534 to 0.78026, saving model to /content/drive/MyDrive/cross_attention/model.31-0.7803.h5
Epoch 32/40
Epoch 32: masked_acc improved from 0.78026 to 0.78462, saving model to /content/drive/MyDrive/cross_attention/model.32-0.7846.h5
Epoch 33/40
Epoch 33: masked_acc improved from 0.78462 to 0.78893, saving model to /content/drive/MyDrive/cross_attention/model.33-0.7889.h5
Epoch 34/40
Epoch 34: masked_acc improved from 0.78893 to 0.79167, saving model to /content/drive/MyDrive/cross_attention/model.34-0.7917.h5
Epoch 35/40
Epoch 35: masked_acc improved from 0.79167 to 0.79662, saving model to /content/drive/MyDrive/cross_attention/model.35-0.7966.h5
Epoch 36/40
Epoch 36: masked_acc improved from 0.79662 to 0.79986, saving model to /content/drive/MyDrive/cross_attention/model.36-0.7999.h5
Epoch 37/40
Epoch 37: masked_acc improved from 0.79986 to 0.80323, saving model to /content/drive/MyDrive/cross_attention/model.37-0.8032.h5
Epoch 38/40
E

<keras.callbacks.History at 0x7f0c922c7ca0>

In [None]:
model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=50, validation_data=val_dataloader, validation_steps=valid_steps,\
          callbacks=[modelsave_wt,tensorboard_callback],initial_epoch=40)

Epoch 41/50
Epoch 41: masked_acc improved from 0.81110 to 0.81379, saving model to /content/drive/MyDrive/cross_attention/model.41-0.8138.h5
Epoch 42/50
Epoch 42: masked_acc improved from 0.81379 to 0.81554, saving model to /content/drive/MyDrive/cross_attention/model.42-0.8155.h5
Epoch 43/50
Epoch 43: masked_acc improved from 0.81554 to 0.81800, saving model to /content/drive/MyDrive/cross_attention/model.43-0.8180.h5
Epoch 44/50
Epoch 44: masked_acc improved from 0.81800 to 0.82016, saving model to /content/drive/MyDrive/cross_attention/model.44-0.8202.h5
Epoch 45/50
Epoch 45: masked_acc improved from 0.82016 to 0.82238, saving model to /content/drive/MyDrive/cross_attention/model.45-0.8224.h5
Epoch 46/50
Epoch 46: masked_acc improved from 0.82238 to 0.82418, saving model to /content/drive/MyDrive/cross_attention/model.46-0.8242.h5
Epoch 47/50
Epoch 47: masked_acc improved from 0.82418 to 0.82559, saving model to /content/drive/MyDrive/cross_attention/model.47-0.8256.h5
Epoch 48/50
E

<keras.callbacks.History at 0x7f0caa8cf100>

In [None]:
model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=60, validation_data=val_dataloader, validation_steps=valid_steps,\
          callbacks=[modelsave_wt,tensorboard_callback],initial_epoch=50)

Epoch 51/60
Epoch 51: masked_acc improved from 0.83072 to 0.83216, saving model to /content/drive/MyDrive/cross_attention/model.51-0.8322.h5
Epoch 52/60
Epoch 52: masked_acc improved from 0.83216 to 0.83391, saving model to /content/drive/MyDrive/cross_attention/model.52-0.8339.h5
Epoch 53/60
Epoch 53: masked_acc improved from 0.83391 to 0.83507, saving model to /content/drive/MyDrive/cross_attention/model.53-0.8351.h5
Epoch 54/60
Epoch 54: masked_acc improved from 0.83507 to 0.83648, saving model to /content/drive/MyDrive/cross_attention/model.54-0.8365.h5
Epoch 55/60
Epoch 55: masked_acc improved from 0.83648 to 0.83771, saving model to /content/drive/MyDrive/cross_attention/model.55-0.8377.h5
Epoch 56/60
Epoch 56: masked_acc improved from 0.83771 to 0.83903, saving model to /content/drive/MyDrive/cross_attention/model.56-0.8390.h5
Epoch 57/60
Epoch 57: masked_acc improved from 0.83903 to 0.83997, saving model to /content/drive/MyDrive/cross_attention/model.57-0.8400.h5
Epoch 58/60
E

In [None]:
model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=65, validation_data=val_dataloader, validation_steps=valid_steps,\
          callbacks=[modelsave_wt,tensorboard_callback],initial_epoch=59)

Epoch 60/65
Epoch 60: masked_acc improved from 0.84217 to 0.84383, saving model to /content/drive/MyDrive/cross_attention/model.60-0.8438.h5
Epoch 61/65
Epoch 61: masked_acc improved from 0.84383 to 0.84530, saving model to /content/drive/MyDrive/cross_attention/model.61-0.8453.h5
Epoch 62/65
Epoch 62: masked_acc improved from 0.84530 to 0.84659, saving model to /content/drive/MyDrive/cross_attention/model.62-0.8466.h5
Epoch 63/65
Epoch 63: masked_acc improved from 0.84659 to 0.84775, saving model to /content/drive/MyDrive/cross_attention/model.63-0.8477.h5
Epoch 64/65
Epoch 64: masked_acc improved from 0.84775 to 0.84884, saving model to /content/drive/MyDrive/cross_attention/model.64-0.8488.h5
Epoch 65/65
Epoch 65: masked_acc improved from 0.84884 to 0.85021, saving model to /content/drive/MyDrive/cross_attention/model.65-0.8502.h5


<keras.callbacks.History at 0x7f7e4eccbb20>

In [None]:
model.fit(train_dataloader, steps_per_epoch=train_steps, epochs=70, validation_data=val_dataloader, validation_steps=valid_steps,\
          callbacks=[modelsave_wt,tensorboard_callback],initial_epoch=65)

Epoch 66/70
Epoch 66: masked_acc improved from 0.85021 to 0.85099, saving model to /content/drive/MyDrive/cross_attention/model.66-0.8510.h5
Epoch 67/70
Epoch 67: masked_acc improved from 0.85099 to 0.85240, saving model to /content/drive/MyDrive/cross_attention/model.67-0.8524.h5
Epoch 68/70
Epoch 68: masked_acc improved from 0.85240 to 0.85333, saving model to /content/drive/MyDrive/cross_attention/model.68-0.8533.h5
Epoch 69/70
Epoch 69: masked_acc improved from 0.85333 to 0.85469, saving model to /content/drive/MyDrive/cross_attention/model.69-0.8547.h5
Epoch 70/70
Epoch 70: masked_acc improved from 0.85469 to 0.85520, saving model to /content/drive/MyDrive/cross_attention/model.70-0.8552.h5


<keras.callbacks.History at 0x7f7e4ece4100>

**Prediction:**

In [None]:
def predict(input_sentence):
  '''takes italian input sentence and outputs translated english sentence'''

  encoder_seq = tknizer_ita.texts_to_sequences([input_sentence]) # tokenizing
  encoder_seq = pad_sequences(encoder_seq, maxlen=20, dtype='int32', padding='post') #padding to len 20
  encoder_output, encoder_h, encoder_c = model.encoder(encoder_seq) # encoding

  decoder_input = tknizer_eng.texts_to_sequences(['<start>']) # feeding <start> token as 1st decoder input
  decoder_state_h = encoder_h
  decoder_state_c = encoder_c
  prediction=[]
  att_wt = []
  for i in range(20): # iterating over full max_length
    if i>0 and prediction[-1]==['<end>']: # end translation when <end> token predicted
      break
    else:
      input=tf.expand_dims(decoder_input[0][-1],0)  
      if i==0:
       input=tf.expand_dims(input,0)
      decoder_output, state_h,state_c,attention_weights, context_vector= model.layers[1].cross_attention(input, encoder_output,decoder_state_h,decoder_state_c)
      att_wt.append(attention_weights)
      index = tf.argmax(decoder_output,axis=-1).numpy()
      prediction.append(tknizer_eng.sequences_to_texts([index]))
      decoder_input[0].append(index)
      decoder_state_h = state_h
      decoder_state_c = state_c
    
  predicted_sent= prediction[0][0]
  for word in prediction:
    predicted_sent = predicted_sent + ' ' + word[0]
  return predicted_sent,att_wt

In [None]:
# predicting on random unseen 5 italian sentences

for doc in test.sample(5).values:
  sent=doc[0]
  eng_sent = doc[-1]
  print('italian: ',sent)
  print('English True: ',eng_sent)
  trans,_ = predict(sent)
  print('Model Translation: ',trans,'\n')

italian:  vedo cosavete fatto lì
English True:  i see what you did there <end>
Model Translation:  i i see what you have done there <end> 

italian:  tom non è un fisico
English True:  tom is not a physician <end>
Model Translation:  tom tom is not a physician <end> 

italian:  cè un costo di consegna
English True:  is there a delivery charge <end>
Model Translation:  there there is a charge of the delivery <end> 

italian:  è un tizio strano
English True:  he is a strange guy <end>
Model Translation:  it it is a strange guy <end> 

italian:  tutti qua sanno che non mangiamo la carne di maiale
English True:  everyone here knows that we do not eat pork <end>
Model Translation:  everyone everyone here knows we do not eat pork <end> 



**Bleu_score on Test Dataset:**

In [None]:
import nltk.translate.bleu_score as bleu

In [None]:
def bleu_score(input,weights):
  bleu_score=[]
  for doc in input:
    input_sentence = doc[0]
    eng_sent = doc[-1]
    predicted_sent,_ = predict(input_sentence)
    bleu_score.append(bleu.sentence_bleu(eng_sent, predicted_sent,weights=weights))
  return np.mean(bleu_score)

In [None]:
# bleu score by only matching uni-grams

bleu_score_test = bleu_score( input = test.values, weights = (1,0,0,0) )
print('average test data bleu score: ',bleu_score_test)

average test data bleu score:  0.4451662890214658


In [None]:
# bleu score by matching 1 to 4-grams

bleu_score_test = bleu_score( input = test.values, weights = (0.25,0.25,0.25,0.25) )
print('average test data cumulative 4-gram bleu score: ',bleu_score_test)

average test data cumulative 4-gram bleu score:  1.479362713798278e-231
