<a href="https://colab.research.google.com/github/ninonarido/PRESANA/blob/main/NMT_ATTN_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import tensorflow as tf

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import time
import string


In [None]:
# Colab-only step: prompts for a file upload and keeps the result in `uploaded`.
# NOTE(review): presumably this is how the corpus file named in `file_path`
# reaches the runtime -- confirm; outside Colab this import is unavailable.
from google.colab import files

uploaded =  files.upload()

# Load the Tagalog–Bikol parallel corpus

In [None]:

# Location of the parallel corpus that the next cell reads as UTF-8 text.
file_path = 'sasa.txt' # please set the path according to your system

In [None]:
# Read the whole corpus as UTF-8 and split it into one string per line.
# The `with` block guarantees the file handle is closed once the text is read
# (the bare `open(...).read()` form left it open until garbage collection).
with open(file_path, encoding='UTF-8') as corpus_file:
    lines = corpus_file.read().strip().split('\n')

# Preview only the first rows instead of dumping the entire corpus as output.
lines[:5]

['TGL\tBKL',
 'Takbo.\tDaralagan.',
 'Sino?\tSi esay?',
 'Yuko!\tduko!',
 'Para!\tPundo!',
 'Kamusta!\tKamusta!',
 'Dali!\tDali!',
 'Sinusubukan ko.\ttenistingan ko.',
 'Ngiti.\tngirit.',
 'Ngumiti ka.\tmagngirit ka.',
 'Sugod!\tsugod',
 'Kainin mo.\tkakanon mo.',
 'Tumakbo siya.\tnagdalagan sya.',
 "Yakapin mo ako.\tkuguson mo' ko.",
 'Nahulog ako.\tnauslog ako.',
 'Alam ko.\taram ko.',
 'Nagtatrabaho ako.\tNagtatrabaho ako.',
 'Talaga?\tay iyo?',
 'Subukan ito.\ttestingan ini.',
 'Nanalo kami.\tnaggana kami.',
 'Bakit ako?\ttanu ako?',
 'Alis.\thali.',
 'Tawagan mo ako.\tapudan mo ako.',
 'Tawagan mo kami.\tapudan mo kami.',
 'Tawagan niyo kami.\tapudan nindo kami.',
 'Labas!\tluwas!',
 'Lumabas ka!\tlumuwas ka!',
 'Alis.\thali.',
 'Umalis ka!\tlumuwas ka!',
 'Umalis ka.\tmaghali ka!',
 'Umuwi ka.\tmag-uli ka.',
 'Umalis siya.\tnaghali sya.',
 'Tulungan mo ako!\ttabangan mo ako!',
 'Tulungan niyo kami.\ttabangan nindo kami.',
 'Yakapin mo si Tom.\tKuguson mo si Tom.',
 'Sinubukan ko.

In [None]:
# Number of raw lines read from the corpus file (this count still includes
# the 'TGL\tBKL' header row visible in the preview above).
len(lines)

145

In [None]:

# Shared clean-up helpers used by both sentence preprocessors below.
exclude = set(string.punctuation) # Set of all ASCII punctuation characters to drop
remove_digits = str.maketrans('', '', string.digits) # Translation table (not a set) that deletes ASCII digits

In [None]:
def preprocess_tgl_sentence(sent):
    '''Clean one Tagalog (TGL) sentence for tokenisation.

    The text is lower-cased, apostrophes, punctuation (module-level `exclude`)
    and digits (module-level `remove_digits`) are removed, surrounding
    whitespace is trimmed and runs of spaces are collapsed to one. The result
    is wrapped in a single leading and trailing space.

    NOTE(review): the original comment said the wrapping adds "tokens", but
    the literals are plain spaces -- confirm whether explicit start/end
    markers were intended.
    '''
    lowered = sent.lower().replace("'", '')
    kept_chars = [ch for ch in lowered if ch not in exclude]
    cleaned = ''.join(kept_chars).translate(remove_digits).strip()
    # Only runs of literal spaces are collapsed (tabs etc. are left alone).
    cleaned = re.sub(" +", " ", cleaned)
    return ' ' + cleaned + ' '

In [None]:
def preprocess_bkl_sentence(sent):
    '''Clean one Bikol (BKL) sentence for tokenisation.

    Applies the same steps as the Tagalog preprocessor: lower-case, remove
    apostrophes, punctuation (module-level `exclude`) and digits (module-level
    `remove_digits`), trim, collapse repeated spaces, then wrap the result in
    a single leading and trailing space.

    Fix: the lower-cased text used to be assigned to a misspelled variable
    (`ssent`), so Bikol sentences kept their original casing and words such as
    "Kamusta" / "kamusta" became different vocabulary entries.

    NOTE(review): the original comment said the wrapping adds "tokens", but
    the literals are plain spaces -- confirm whether explicit start/end
    markers were intended.
    '''
    sent = sent.lower() # lower casing (now actually applied to `sent`)
    sent = re.sub("'", '', sent) # remove the quotation marks if any
    sent = ''.join(ch for ch in sent if ch not in exclude)
    sent = sent.translate(remove_digits) # remove the digits
    sent = sent.strip()
    sent = re.sub(" +", " ", sent) # remove extra spaces
    sent = ' ' + sent + ' ' # pad with one leading and one trailing space
    return sent

In [None]:
# Generate pairs of cleaned Tagalog (TGL) and Bikol (BKL) sentences.
# Each entry of `sent_pairs` is a two-item list: [tagalog, bikol].
header_fields = ('tgl', 'bkl')  # column titles found on the first corpus line
sent_pairs = []
for line in lines:
    # Every corpus line must hold exactly one tab: "<tagalog>\t<bikol>";
    # anything else raises ValueError here, exactly as before.
    tgl, bkl = line.split('\t')
    if (tgl.strip().lower(), bkl.strip().lower()) == header_fields:
        # The 'TGL\tBKL' header is metadata, not a translation pair; keeping it
        # would add a bogus training example and two bogus vocabulary words.
        continue
    sent_pair = [preprocess_tgl_sentence(tgl), preprocess_bkl_sentence(bkl)]
    sent_pairs.append(sent_pair)

# Preview only the first pairs instead of dumping the whole list.
sent_pairs[:5]

[[' tgl ', ' BKL '],
 [' takbo ', ' Daralagan '],
 [' sino ', ' Si esay '],
 [' yuko ', ' duko '],
 [' para ', ' Pundo '],
 [' kamusta ', ' Kamusta '],
 [' dali ', ' Dali '],
 [' sinusubukan ko ', ' tenistingan ko '],
 [' ngiti ', ' ngirit '],
 [' ngumiti ka ', ' magngirit ka '],
 [' sugod ', ' sugod '],
 [' kainin mo ', ' kakanon mo '],
 [' tumakbo siya ', ' nagdalagan sya '],
 [' yakapin mo ako ', ' kuguson mo ko '],
 [' nahulog ako ', ' nauslog ako '],
 [' alam ko ', ' aram ko '],
 [' nagtatrabaho ako ', ' Nagtatrabaho ako '],
 [' talaga ', ' ay iyo '],
 [' subukan ito ', ' testingan ini '],
 [' nanalo kami ', ' naggana kami '],
 [' bakit ako ', ' tanu ako '],
 [' alis ', ' hali '],
 [' tawagan mo ako ', ' apudan mo ako '],
 [' tawagan mo kami ', ' apudan mo kami '],
 [' tawagan niyo kami ', ' apudan nindo kami '],
 [' labas ', ' luwas '],
 [' lumabas ka ', ' lumuwas ka '],
 [' alis ', ' hali '],
 [' umalis ka ', ' lumuwas ka '],
 [' umalis ka ', ' maghali ka '],
 [' umuwi ka ', ' m

In [None]:

# Bidirectional vocabulary for one language: word -> integer id
# (e.g. "dad" -> 5) and integer id -> word (e.g. 5 -> "dad").
class LanguageIndex():
    """Vocabulary built from an iterable of space-separated phrases.

    Attributes:
        lang: the iterable handed to the constructor (consumed once if it is
            a generator).
        vocab: a set while collecting, replaced by a sorted list of the
            distinct tokens.
        word2idx: token -> id. '' is first registered as id 0, then every
            vocabulary token gets its 1-based position in `vocab`.
        idx2word: the inverse of `word2idx`.

    Note: phrases are split on single spaces, so a phrase with leading,
    trailing or doubled spaces contributes the empty token ''. In that case
    '' sorts first, its id is overwritten from 0 to 1, and id 0 ends up with
    no entry in `idx2word`.
    """

    def __init__(self, lang):
        self.lang = lang
        self.vocab = set()
        self.word2idx = {}
        self.idx2word = {}

        self.create_index()

    def create_index(self):
        """Collect the tokens of every phrase and fill both lookup tables."""
        for sentence in self.lang:
            for token in sentence.split(' '):
                self.vocab.add(token)

        self.vocab = sorted(self.vocab)

        # '' is registered as id 0 first; real tokens are numbered from 1.
        self.word2idx[''] = 0
        self.word2idx.update(
            (token, position) for position, token in enumerate(self.vocab, start=1))

        # Invert the mapping for id -> token lookups.
        self.idx2word.update(
            (position, token) for token, position in self.word2idx.items())

In [None]:
def max_length(tensor):
    """Return the length of the longest sequence in `tensor`.

    `tensor` is any iterable of sized items; an empty iterable raises
    ValueError (from `max`).
    """
    return max(map(len, tensor))

In [None]:
def load_dataset(pairs, num_examples):
    """Turn cleaned sentence pairs into padded integer arrays plus vocabularies.

    Args:
        pairs: sequence of two-item entries; the FIRST item of each entry is
            the input sentence and the SECOND is the target sentence. (The
            notebook builds each pair as [tagalog, bikol], so Tagalog is the
            input side and Bikol the target side.)
        num_examples: accepted for interface compatibility but not used by
            this function.

    Returns:
        (input_tensor, target_tensor, inp_lang, targ_lang,
         max_length_inp, max_length_tar) where the two tensors are
        post-padded id arrays and the two `*_lang` objects are the
        LanguageIndex vocabularies for each side.
    """
    # One vocabulary per side of the corpus.
    inp_lang = LanguageIndex(source_sent for source_sent, _ in pairs)
    targ_lang = LanguageIndex(target_sent for _, target_sent in pairs)

    def to_ids(sentence, lang_index):
        # Split on single spaces, mirroring how LanguageIndex tokenises.
        return [lang_index.word2idx[token] for token in sentence.split(' ')]

    # Vectorise both sides as lists of word ids.
    input_tensor = [to_ids(source_sent, inp_lang) for source_sent, _ in pairs]
    target_tensor = [to_ids(target_sent, targ_lang) for _, target_sent in pairs]

    # Pad every sequence up to the longest sentence on its own side.
    max_length_inp = max_length(input_tensor)
    max_length_tar = max_length(target_tensor)

    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    input_tensor = pad_sequences(input_tensor, maxlen=max_length_inp, padding='post')
    target_tensor = pad_sequences(target_tensor, maxlen=max_length_tar, padding='post')

    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar

In [None]:
# Vectorise the whole corpus. `len(lines)` is passed as `num_examples`, which
# load_dataset accepts but does not use.
input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(sent_pairs, len(lines))

In [None]:
# Creating training and validation sets using a 90-10 split (test_size=0.1);
# random_state is fixed so the split is reproducible.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.1, random_state = 101)

# Show length
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(130, 130, 15, 15)

In [None]:
# Training hyperparameters and the batched tf.data pipeline.
BUFFER_SIZE = len(input_tensor_train)  # shuffle buffer covers the whole training set
BATCH_SIZE = 8
N_BATCH = BUFFER_SIZE//BATCH_SIZE  # number of full batches per epoch
embedding_dim = 200
units = 1024  # GRU width passed to both Encoder and Decoder
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

# drop_remainder=True discards the final partial batch, so every batch has
# exactly BATCH_SIZE rows (the training loop builds tensors of that size).
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
def gru(units):
    """Build the GRU layer used by both the encoder and the decoder.

    Args:
        units: dimensionality of the GRU hidden state.

    Returns:
        A `tf.keras.layers.GRU` that returns the full output sequence and the
        final state.

    The previous implementation switched to `tf.keras.layers.CuDNNGRU` when a
    GPU was detected. That class is not part of the TensorFlow 2 Keras API, so
    the GPU branch raised AttributeError, and `tf.test.is_gpu_available` is
    deprecated. In TF2 the standard GRU layer selects the cuDNN kernel on its
    own when running on a GPU with a compatible configuration (tanh
    activation, sigmoid recurrent activation, no recurrent dropout), which is
    what is requested here.
    """
    return tf.keras.layers.GRU(units,
                               return_sequences=True,
                               return_state=True,
                               recurrent_activation='sigmoid',
                               recurrent_initializer='glorot_uniform')

In [None]:
class Encoder(tf.keras.Model):
    """Embedding + GRU encoder for the input language.

    Args:
        vocab_size: size passed by the caller; the embedding table is created
            with `vocab_size + 1` rows.
        embedding_dim: width of the word embeddings.
        enc_units: width of the GRU hidden state.
        batch_sz: batch size used when building the zero initial state.
    """

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        # One extra embedding row beyond `vocab_size`.
        self.embedding = tf.keras.layers.Embedding(vocab_size + 1, embedding_dim)
        self.gru = gru(self.enc_units)

    def call(self, x, hidden):
        """Embed the token ids `x` and run the GRU starting from `hidden`.

        Returns the per-step GRU outputs and the final GRU state.
        """
        embedded = self.embedding(x)
        sequence_output, final_state = self.gru(embedded, initial_state=hidden)
        return sequence_output, final_state

    def initialize_hidden_state(self):
        """Return an all-zero state of shape (batch_sz, enc_units)."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)

In [None]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder with additive attention over encoder outputs.

    Args:
        vocab_size: size passed by the caller; the embedding table and the
            output projection are both created with `vocab_size + 1` entries.
        embedding_dim: width of the word embeddings.
        dec_units: width of the GRU state and of the attention projections.
        batch_sz: batch size used when building the zero initial state.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size + 1, embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size + 1)

        # Attention layers: W1 projects the encoder outputs, W2 projects the
        # decoder state, V reduces their tanh-combined sum to one score.
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, hidden, enc_output):
        """Decode one step.

        Args:
            x: ids of the previous target token (one per batch row).
            hidden: decoder state used to score the encoder outputs.
            enc_output: encoder outputs; expected (batch_size, max_length,
                hidden_size) per the original shape notes.

        Returns:
            (logits, state, attention_weights): vocabulary logits for this
            step, the new GRU state, and the attention weights over the
            encoder time axis.

        NOTE(review): `hidden` only feeds the attention score; the GRU itself
        is called without an `initial_state` -- confirm this is intended.
        """
        # Add a time axis to the state so it broadcasts against enc_output.
        state_with_time = tf.expand_dims(hidden, 1)

        # Additive score V(tanh(W1(enc_output) + W2(state))); last axis is 1.
        projected_enc = self.W1(enc_output)
        projected_state = self.W2(state_with_time)
        score = self.V(tf.nn.tanh(projected_enc + projected_state))

        # Normalise over the encoder time axis.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of encoder outputs over the time axis.
        context_vector = tf.reduce_sum(attention_weights * enc_output, axis=1)

        # Embed the previous token and prepend the context on the feature axis.
        embedded = self.embedding(x)
        gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)

        output, state = self.gru(gru_input)

        # Collapse the time axis so the projection sees one row per example.
        flat_output = tf.reshape(output, (-1, output.shape[2]))
        logits = self.fc(flat_output)

        return logits, state, attention_weights

    def initialize_hidden_state(self):
        """Return an all-zero state of shape (batch_sz, dec_units)."""
        state_shape = (self.batch_sz, self.dec_units)
        return tf.zeros(state_shape)

In [None]:
# Instantiate both models with the same embedding width, GRU width and batch
# size; only the vocabulary size differs between the two sides.
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


In [None]:
# Adam with default settings; the training loop applies it to the encoder's
# and decoder's variables together.
optimizer = tf.optimizers.Adam()


def loss_function(real, pred):
    """Masked sparse cross-entropy for one decoding step.

    Args:
        real: integer target ids for this step, one per batch row; id 0 marks
            padding.
        pred: unnormalised logits over the target vocabulary, one row per
            batch row.

    Returns:
        Scalar tensor: the mean over the batch of the per-row cross-entropy,
        with padded rows contributing zero (they still count in the mean's
        denominator, as before).

    The padding mask is built with TensorFlow ops instead of NumPy so the
    function also works on symbolic tensors (e.g. inside `tf.function`); in
    eager mode the result is numerically the same as the NumPy version.
    """
    per_row_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred)
    # 1.0 for real tokens, 0.0 for padding, in the same dtype as the loss.
    mask = tf.cast(tf.math.not_equal(real, 0), per_row_loss.dtype)
    return tf.reduce_mean(per_row_loss * mask)

In [None]:
# Checkpoint object tracking the optimizer and both models; files are written
# under ./training_checkpoints with the "ckpt" prefix.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [None]:
# Training loop: teacher-forced seq2seq training, run eagerly, one checkpoint
# per epoch.
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()
    
    # The encoder starts every batch of this epoch from the same zero state.
    hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset):
        loss = 0
        
        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, hidden)
            
            # The decoder's first state is the encoder's final state.
            dec_hidden = enc_hidden
            
            # First decoder input: the id stored under the '' key, repeated
            # for every row of the batch.
            dec_input = tf.expand_dims([targ_lang.word2idx['']] * BATCH_SIZE, 1)       
            
            # Teacher forcing - feeding the target as the next input.
            # Starts at t=1, so the token at position 0 of `targ` is never a
            # prediction target.
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
                
                loss += loss_function(targ[:, t], predictions)
                
                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        
        # Reported loss is the summed step loss divided by the target length.
        batch_loss = (loss / int(targ.shape[1]))
        
        total_loss += batch_loss
        
        variables = encoder.variables + decoder.variables
        
        # Gradients are taken of the summed (un-normalised) loss.
        gradients = tape.gradient(loss, variables)
        
        optimizer.apply_gradients(zip(gradients, variables))
        
        # Progress line every 100 batches (batch 0 always prints).
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every epoch
    checkpoint.save(file_prefix = checkpoint_prefix)
    
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / N_BATCH))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 3.0344
Epoch 1 Loss 2.4312
Time taken for 1 epoch 21.587337970733643 sec

Epoch 2 Batch 0 Loss 2.1635
Epoch 2 Loss 1.9968
Time taken for 1 epoch 12.890891551971436 sec

Epoch 3 Batch 0 Loss 2.1062
Epoch 3 Loss 1.8075
Time taken for 1 epoch 13.570344686508179 sec

Epoch 4 Batch 0 Loss 1.5257
Epoch 4 Loss 1.6278
Time taken for 1 epoch 21.183542490005493 sec

Epoch 5 Batch 0 Loss 1.4524
Epoch 5 Loss 1.4702
Time taken for 1 epoch 12.337953329086304 sec

Epoch 6 Batch 0 Loss 1.2412
Epoch 6 Loss 1.3318
Time taken for 1 epoch 20.94216775894165 sec

Epoch 7 Batch 0 Loss 1.2984
Epoch 7 Loss 1.2167
Time taken for 1 epoch 11.454327583312988 sec

Epoch 8 Batch 0 Loss 1.0068
Epoch 8 Loss 1.1458
Time taken for 1 epoch 20.985050678253174 sec

Epoch 9 Batch 0 Loss 0.8403
Epoch 9 Loss 1.0546
Time taken for 1 epoch 11.22144365310669 sec

Epoch 10 Batch 0 Loss 0.9733
Epoch 10 Loss 0.9553
Time taken for 1 epoch 13.154844999313354 sec



In [None]:
def evaluate(inputs, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ):
    """Greedy-decode a single padded input sentence.

    Args:
        inputs: array holding one padded sequence of input-word ids (a batch
            of size 1).
        encoder, decoder: the trained models.
        inp_lang, targ_lang: LanguageIndex vocabularies for each side.
        max_length_inp: padded input length (width of the attention plot).
        max_length_targ: maximum number of decoding steps.

    Returns:
        (result, sentence, attention_plot): the predicted words joined by
        spaces (each followed by a space), the input sentence rebuilt from its
        ids, and a (max_length_targ, max_length_inp) array with one row of
        attention weights per decoding step (unused rows stay zero).

    Changes from the previous version:
      * A predicted id with no entry in `targ_lang.idx2word` (the padding id 0
        can be missing from that table) is treated like the '' token and ends
        decoding, instead of raising KeyError.
      * The zero encoder state is sized from `encoder.enc_units` when
        available rather than from the notebook-global `units`.
    """
    attention_plot = np.zeros((max_length_targ, max_length_inp))

    # Rebuild the readable input sentence, stopping at the first padding id.
    sentence = ''
    for i in inputs[0]:
        if i == 0:
            break
        sentence = sentence + inp_lang.idx2word[i] + ' '
    sentence = sentence[:-1]

    inputs = tf.convert_to_tensor(inputs)

    result = ''

    # Batch of one: zero initial state matching the encoder's GRU width.
    enc_units = getattr(encoder, 'enc_units', None)
    if enc_units is None:
        enc_units = units  # fall back to the notebook-level setting
    hidden = [tf.zeros((1, enc_units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_lang.word2idx['']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        predicted_id = tf.argmax(predictions[0]).numpy()

        # Ids without a vocabulary entry are mapped to '' (the stop token).
        predicted_word = targ_lang.idx2word.get(predicted_id, '')
        result += predicted_word + ' '

        if predicted_word == '':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot

In [None]:
!pip install chart-studio
from chart_studio import plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go


def predict_random_val_sentence():
    """Translate one random validation sentence and plot its attention map.

    Picks a random row of the validation set, prints the input, the predicted
    translation and the reference translation, then shows a heatmap of the
    attention weights (columns: input tokens, rows: predicted tokens).
    Relies on the notebook-level models, vocabularies and validation arrays.
    """
    k = np.random.randint(len(input_tensor_val))
    random_input = np.expand_dims(input_tensor_val[k], 0)
    random_output = target_tensor_val[k]

    result, sentence, attention_plot = evaluate(random_input, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)
    print('Input: {}'.format(sentence))
    print('Predicted translation: {}'.format(result))

    # Rebuild the reference sentence, stopping at the first padding id.
    reference_words = []
    for token_id in random_output:
        if token_id == 0:
            break
        reference_words.append(targ_lang.idx2word[token_id] + ' ')
    actual_sent = ''.join(reference_words)
    print('Actual translation: {}'.format(actual_sent))

    input_tokens = sentence.split(' ')
    output_tokens = result.split(' ')

    # Drop the first input column so the plot lines up with input_tokens[1:].
    attention_plot = attention_plot[:len(output_tokens), 1:len(input_tokens)]
    trace = go.Heatmap(z=attention_plot, x=input_tokens[1:], y=output_tokens, colorscale='Reds')
    iplot([trace])
    
 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting chart-studio
  Downloading chart_studio-1.1.0-py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 1.7 MB/s 
Collecting retrying>=1.3.3
  Downloading retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py) ... [?25l[?25hdone
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11448 sha256=8fc1929caaf24e8e79de4f6bcd2103289cac7f2298d1ee0971b30e1fe45460e1
  Stored in directory: /root/.cache/pip/wheels/f9/8d/8d/f6af3f7f9eea3553bc2fe6d53e4b287dad18b06a861ac56ddf
Successfully built retrying
Installing collected packages: retrying, chart-studio
Successfully installed chart-studio-1.1.0 retrying-1.3.3


In [None]:
# Spot-check the model on one randomly chosen validation sentence.
predict_random_val_sentence()

Input:  tulungan mo ako 
Predicted translation: sunodon mo ko  
Actual translation:  tabangan mo ako  
