* ### **Install library**                                  
 **Note:**  I use MosesTokenizer for the task of separating words in sentences 

In [1]:
!pip install mosestokenizer

Collecting mosestokenizer
  Downloading mosestokenizer-1.2.1.tar.gz (37 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting openfile
  Downloading openfile-0.0.7-py3-none-any.whl (2.4 kB)
Collecting uctools
  Downloading uctools-1.3.0.tar.gz (4.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting toolwrapper
  Downloading toolwrapper-2.1.0.tar.gz (3.2 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: mosestokenizer, toolwrapper, uctools
  Building wheel for mosestokenizer (setup.py) ... [?25ldone
[?25h  Created wheel for mosestokenizer: filename=mosestokenizer-1.2.1-py3-none-any.whl size=49189 sha256=4a27c10d36168e6be5932feaa135ebf5cb0c6b345d8f0f562b1050fd4ae71309
  Stored in directory: /root/.cache/pip/wheels/b0/35/f7/af1258779a0b890abc3c79481460c597cb1f3659d0603cfb9d
  Building wheel for toolwrapper (setup.py) ... [?25ldone
[?25h  Created wheel for toolwrapper: filename=toolwrapper-2.1.0-py3-none-any.whl si

* ### **Display GPU configuaration**

In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
else:
    print(gpu_info)

Thu Feb 16 11:57:58 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   29C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

* ### **Import library**

In [3]:
from mosestokenizer import MosesTokenizer
from tqdm import tqdm
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import LSTM, Embedding, Dense, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
from collections import Counter
import time

* ### **Setting parameters, hyper-parameters for the model**

In [4]:
class Config:
    def __init__(self,
               min_len = 0,
               max_len = 50,
               batch_size = 256,
               embedding_size = 512,
               hidden_units = 512,
               epochs = 20,
               learning_rate = 0.0008,
               dropout = 0.2,
               start_token = '<sos>',
               end_token = '<eos>',
               pad_token = '<pad>',
               unk_token = '<unk>',
               lang_1 = 'en',
               lang_2 = 'vi'):
        
        self.min_len = min_len
        self.max_len = max_len
        self.batch_size = batch_size
        self.embedding_size = embedding_size
        self.hidden_units = hidden_units
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.dropout = dropout
        self.start_token = start_token
        self.end_token = end_token
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.LANG_1 = lang_1
        self.LANG_2 = lang_2

* ### **Data processing and Dataloader**                                     
    **Note:**  The classes from **Vocabulary** to **Dataloader** are used for input data processing and dataloader creation

In [5]:
class Vocabulary:    
    def __init__(self,               
                 config,
                 data_path,
                 frequency,
                 lang,
                 add_sos,
                 add_eos,
                 add_pad,
                 add_unk):
        self.config = config
        self.data_path = data_path
        self.tokenizer = MosesTokenizer(lang)
        self.counter = Counter()
        self.frequency = frequency
        self.add_sos = add_sos
        self.add_eos = add_eos
        self.add_pad = add_pad
        self.add_unk = add_unk

    def word2idx(self):
        word2idx = {}
        with open(self.data_path, encoding = 'utf-8', mode = 'r') as f:          
            for line in f:
                self.counter.update(self.tokenizer(line))

        if self.add_pad:
            word2idx[self.config.pad_token] = len(word2idx)
        if self.add_unk:
            word2idx[self.config.unk_token] = len(word2idx)
        if self.add_sos:
            word2idx[self.config.start_token] = len(word2idx)
        if self.add_eos:
            word2idx[self.config.end_token] = len(word2idx)
    
        for word, freq in self.counter.items():
            if freq >= self.frequency:
                word2idx[word] = len(word2idx)

        return word2idx

In [6]:
class Language:
    def __init__(self,
                 config,
                 lang,
                 word2idx,
                 idx2word):
        self.config = config
        self.tokenizer = MosesTokenizer(lang)
        self.word2idx = word2idx
        self.idx2word = idx2word
  
    def encode(self, sentence):
        sequence = []
        for word in self.tokenizer(sentence):
            if word in self.word2idx:
                sequence.append(self.word2idx[word])
            else:
                sequence.append(self.word2idx[self.config.unk_token])
        return sequence

    def add_border(self, sequence):
        return [self.word2idx[self.config.start_token]] + sequence + [self.word2idx[self.config.end_token]]

    def decode(self, sequence):
        sentence = [self.idx2word[idx] for idx in sequence]
        sentence = " ".join(sentence)
        return sentence

In [7]:
en_train_file_path = '/kaggle/input/mt-en-vi/src-train.txt'
vi_train_file_path = '/kaggle/input/mt-en-vi/tgt-train.txt'
en_val_file_path = '/kaggle/input/mt-en-vi/src-val.txt'
vi_val_file_path = '/kaggle/input/mt-en-vi/tgt-val.txt'
en_test_file_path = '/kaggle/input/mt-en-vi/src-test.txt'
vi_test_file_path = '/kaggle/input/mt-en-vi/tgt-test.txt'

source_add = {'add_sos': True,
              'add_eos': True,
              'add_pad': True,
              'add_unk': True}
target_add = {'add_sos': True,
              'add_eos': True,
              'add_pad': True,
              'add_unk': True}

frequency = 5
config = Config()

source_vocab = Vocabulary(config,
                          en_train_file_path,
                          frequency,
                          config.LANG_1,
                          source_add['add_sos'],
                          source_add['add_eos'],
                          source_add['add_pad'],
                          source_add['add_unk'])
target_vocab = Vocabulary(config,
                          vi_train_file_path,
                          frequency,
                          config.LANG_2,
                          target_add['add_sos'],
                          target_add['add_eos'],
                          target_add['add_pad'],
                          target_add['add_unk'])

source_word2idx = source_vocab.word2idx()
target_word2idx = target_vocab.word2idx()
source_idx2word = {value: key for key, value in source_word2idx.items()}
target_idx2word = {value: key for key, value in target_word2idx.items()}

source_lang = Language(config,
                       config.LANG_1,
                       source_word2idx,
                       source_idx2word)
target_lang = Language(config,
                       config.LANG_2,
                       target_word2idx,
                       target_idx2word)

In [8]:
class Lang_utils:
    def __init__(self,
               word2idx,
               idx2word,
               lang,
               train_path,
               val_path,
               test_path):
        self.word_index = word2idx
        self.index_word = idx2word
        self.encode = lang.encode
        self.add_border = lang.add_border
        self.decode = lang.decode
        self.train_path = train_path
        self.val_path = val_path 
        self.test_path = test_path

en_utils = Lang_utils(source_word2idx,
                      source_idx2word,
                      source_lang,
                      en_train_file_path,
                      en_val_file_path,
                      en_test_file_path)

vi_utils = Lang_utils(target_word2idx,
                      target_idx2word,
                      target_lang,
                      vi_train_file_path,
                      vi_val_file_path,
                      vi_test_file_path)

In [9]:
class Dataloader:
    def __init__(self,
               en_utils,
               vi_utils,
               config):
        self.en_utils = en_utils
        self.vi_utils = vi_utils
        self.config = config
        self.train_process_en = []
        self.train_process_vi = []
        self.val_process_en = []
        self.val_process_vi = []
        self.test_process_en = []
        self.test_process_vi = []

    def read_data(self, path):
        with open(path, encoding = 'utf-8', mode = 'r') as f:
            file_opened = f.read().strip().split('\n')
        return file_opened
    
    def process_long_sequence(self, seq_1, seq_2):
        list_seq_1 = [seq_1[i: i + self.config.max_len] for i in range(0, len(seq_1), self.config.max_len)]
        list_seq_2 = [seq_2[i: i + self.config.max_len] for i in range(0, len(seq_2), self.config.max_len)]
        if len(list_seq_1) > len(list_seq_2):
            list_seq_1 = list_seq_1[:len(list_seq_2)]
        else:
            list_seq_2 = list_seq_2[:len(list_seq_1)]

        assert len(list_seq_1) == len(list_seq_2), "Length difference between 2 list"
        return list_seq_1, list_seq_2
            
    def padding(self, seq_batch, lang_utils):
        max_len = max([len(seq) for seq in seq_batch])
        for i in range(len(seq_batch)):
            seq = seq_batch[i]
            seq += [lang_utils.word_index[self.config.pad_token]]*(max_len - len(seq))
            seq_batch[i] = seq
        return seq_batch

    def texts_to_sequences(self, list_texts, lang_utils):
        for i in range(len(list_texts)):
            list_texts[i] = lang_utils.encode(list_texts[i])
        return list_texts

    def dataset(self):
        src_train_lang = self.read_data(self.en_utils.train_path)
        src_val_lang = self.read_data(self.en_utils.val_path)
        src_test_lang = self.read_data(self.en_utils.test_path)
        tgt_train_lang = self.read_data(self.vi_utils.train_path)
        tgt_val_lang = self.read_data(self.vi_utils.val_path)
        tgt_test_lang = self.read_data(self.vi_utils.test_path)

        seq_src_train = self.texts_to_sequences(src_train_lang, self.en_utils)
        seq_src_val = self.texts_to_sequences(src_val_lang, self.en_utils)
        seq_src_test = self.texts_to_sequences(src_test_lang, self.en_utils)
        seq_tgt_train = self.texts_to_sequences(tgt_train_lang, self.vi_utils)
        seq_tgt_val = self.texts_to_sequences(tgt_val_lang, self.vi_utils)
        seq_tgt_test = self.texts_to_sequences(tgt_test_lang, self.vi_utils)

        for en_seq, vi_seq in zip(seq_src_train, seq_tgt_train):  
            if self.config.min_len < len(en_seq) < self.config.max_len and self.config.min_len < len(vi_seq) < self.config.max_len:    
                self.train_process_en.append(self.en_utils.add_border(en_seq))
                self.train_process_vi.append(self.vi_utils.add_border(vi_seq))
            else:
                cut_en_seq, cut_vi_seq = self.process_long_sequence(en_seq, vi_seq)
                for en_gram, vi_gram in zip(cut_en_seq, cut_vi_seq):
                    self.train_process_en.append(self.en_utils.add_border(en_gram))
                    self.train_process_vi.append(self.vi_utils.add_border(vi_gram))
  
        assert len(self.train_process_en) == len(self.train_process_vi), "The size of the 2 training sets is not equal"
    
        for en_seq, vi_seq in zip(seq_src_val, seq_tgt_val):
            if self.config.min_len < len(en_seq) < self.config.max_len and self.config.min_len < len(vi_seq) < self.config.max_len:
                self.val_process_en.append(self.en_utils.add_border(en_seq))
                self.val_process_vi.append(self.vi_utils.add_border(vi_seq))
            else:
                cut_en_seq, cut_vi_seq = self.process_long_sequence(en_seq, vi_seq)
                for en_gram, vi_gram in zip(cut_en_seq, cut_vi_seq):
                    self.val_process_en.append(self.en_utils.add_border(en_gram))
                    self.val_process_vi.append(self.vi_utils.add_border(vi_gram))

        assert len(self.val_process_en) == len(self.val_process_vi), "The size of the 2 validation sets is not equal"
        
        for en_seq, vi_seq in zip(seq_src_test, seq_tgt_test):
            if self.config.min_len < len(en_seq) < self.config.max_len and self.config.min_len < len(vi_seq) < self.config.max_len:
                self.test_process_en.append(self.en_utils.add_border(en_seq))
                self.test_process_vi.append(self.vi_utils.add_border(vi_seq))
            else:
                cut_en_seq, cut_vi_seq = self.process_long_sequence(en_seq, vi_seq)
                for en_gram, vi_gram in zip(cut_en_seq, cut_vi_seq):
                    self.test_process_en.append(self.en_utils.add_border(en_gram))
                    self.test_process_vi.append(self.vi_utils.add_border(vi_gram))
                    
        assert len(self.test_process_en) == len(self.test_process_vi), "The size of the 2 testing sets is not equal"

        X_train = tf.convert_to_tensor(self.padding(self.train_process_en, self.en_utils))
        Y_train = tf.convert_to_tensor(self.padding(self.train_process_vi, self.vi_utils))
        X_val = tf.convert_to_tensor(self.padding(self.val_process_en, self.en_utils))
        Y_val = tf.convert_to_tensor(self.padding(self.val_process_vi, self.vi_utils))
        X_test = tf.convert_to_tensor(self.padding(self.test_process_en, self.en_utils))
        Y_test = tf.convert_to_tensor(self.padding(self.test_process_vi, self.vi_utils))
    
        print('Shape of train source tensor: ', X_train.shape)
        print('Shape of train target tensor: ', Y_train.shape)
        print('Shape of val source tensor: ', X_val.shape)
        print('Shape of val target tensor: ', Y_val.shape)
        print('Shape of test source tensor: ', X_test.shape)
        print('Shape of test target tensor: ', Y_test.shape)

        train_ds = tf.data.Dataset.from_tensor_slices((X_train, Y_train)).batch(self.config.batch_size)
        val_ds = tf.data.Dataset.from_tensor_slices((X_val, Y_val))
        test_ds = tf.data.Dataset.from_tensor_slices((X_test, Y_test))
        
        print('Batch number of training set: ', len(train_ds))
        return train_ds, val_ds, test_ds

* ### **Encoder block**                                       

In [10]:
class Encoder(tf.keras.Model):
    def __init__(self, config, en_vocab_size):
        super(Encoder, self).__init__()
        self.Embedding = Embedding(en_vocab_size,
                                  config.embedding_size)
        self.Bi_LSTM = Bidirectional(LSTM(config.hidden_units,
                                            return_sequences = True,
                                            return_state = True,
                                            dropout = config.dropout),
                                       merge_mode = 'sum')
        
    def __call__(self, x):
        enc_embedding = self.Embedding(x)
        enc_outputs, fw_h, fw_c, bw_h, bw_c = self.Bi_LSTM(enc_embedding)
        state_h = fw_h + bw_h
        state_c = fw_c + bw_c
        enc_states = [[state_h, state_c], [state_h, state_c]]
        return enc_outputs, enc_states

* ### **Luong's attention layer**

In [11]:
class Luong_Attention(tf.keras.layers.Layer):
    def __init__(self, config):
        super(Luong_Attention, self).__init__()
        self.Wa = Dense(config.hidden_units)
        
    def __call__(self, enc_outputs, dec_outputs):
        score = tf.matmul(dec_outputs, self.Wa(enc_outputs), transpose_b = True)
        alignment = tf.nn.softmax(score, axis = 2)
        context = tf.matmul(alignment, enc_outputs)
        return context, score

* ### **Decoder block**

In [12]:
class Decoder(tf.keras.Model):
    def __init__(self, config, attention, vi_vocab_size):
        super(Decoder, self).__init__()
        self.Embedding = Embedding(vi_vocab_size,
                                  config.embedding_size)
        self.LSTM_1 = LSTM(config.hidden_units,
                          dropout = config.dropout,
                          return_sequences = True,
                          return_state = True)
        self.LSTM_2 = LSTM(config.hidden_units,
                          dropout = config.dropout,
                          return_sequences = True,
                          return_state = True)
        self.Fc = Dense(vi_vocab_size, activation = 'softmax')
        self.attention = attention
        
    def __call__(self, x, enc_outputs, states):
        x = tf.expand_dims(x, axis = 1)
        dec_embedding = self.Embedding(x)
        dec_outputs1 = self.LSTM_1(dec_embedding,
                                  initial_state = states[0])
        dec_outputs2 = self.LSTM_2(dec_outputs1[0],
                                   initial_state = states[1])
        
        dec_outputs = dec_outputs2[0]
        context, _ = self.attention(enc_outputs, dec_outputs)
        dec_concat = tf.concat([dec_outputs, context], axis = -1)
        final_concat = tf.reshape(dec_concat, (-1, dec_concat.shape[2]))
        
        final_outs = self.Fc(final_concat)
        dec_states = [dec_outputs1[1:], dec_outputs2[1:]]
        return final_outs, dec_states

* ### **Loss function**

In [13]:
class MaskedLoss(tf.keras.losses.Loss):
    def __init__(self):
        super(MaskedLoss, self).__init__()
        self.loss = SparseCategoricalCrossentropy(from_logits = True)

    def __call__(self, y_true, y_pred):
        mask = 1 - np.equal(y_true, 0)
        loss = self.loss(y_true, y_pred)*mask
        return tf.reduce_mean(loss)

* ### **Evaluates the model after each epoch**

In [14]:
class Evaluate:
    def __init__(
    self,
    config,
    vi_utils
    ):
        self.config = config
        self.vi_utils = vi_utils
        
    def BLEU(self, y_pred, y_true):
        smoothing_function = SmoothingFunction()
        bleu_score = 100*corpus_bleu(list_of_references = [[seq] for seq in y_true],
                                     hypotheses = y_pred,
                                     smoothing_function = smoothing_function.method0)
        return round(bleu_score, 2)
    
    def remove(self, sequence):
        sentence = self.vi_utils.decode(sequence)
        new_sequence = [word for word in sentence.split(" ") if word not in [self.config.pad_token,
                                                                             self.config.start_token,
                                                                             self.config.end_token]]
        return new_sequence
        
    def evaluation(self, encoder, decoder, dataset):
        y_true = []
        y_pred = []
        for x_test, Y_test in dataset.shuffle(buffer_size = 1, seed = 1).take(len(dataset)):
            X_test = tf.expand_dims(x_test, axis = 0)
            enc_outputs, last_states = encoder(X_test)
            dec_inputs = tf.constant([self.vi_utils.word_index[self.config.start_token]])
            sequence = []
            for _ in range(len(Y_test)):
                dec_outputs, last_states = decoder(dec_inputs, enc_outputs, last_states)
                pred_id = tf.argmax(dec_outputs, axis = 1).numpy()
                dec_inputs = pred_id
                sequence.append(pred_id[0])
            y_pred.append(self.remove(sequence))   
            y_true.append(self.remove(Y_test.numpy()))
            
        BLEU_score = self.BLEU(y_pred, y_true)
        return BLEU_score

* ### **Custom learning rate during training**

In [15]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, initial_learning_rate, decay_rate, warmup_epoch):
        super(CustomSchedule, self).__init__()
        self.initial_learning_rate = initial_learning_rate
        self.decay_rate = decay_rate
        self.warmup_epoch = warmup_epoch
        
    def __call__(self, epoch):
        if epoch <= self.warmup_epoch:
            return self.initial_learning_rate
        else:
            return self.initial_learning_rate * (self.decay_rate**(epoch - self.warmup_epoch))
            

* ### **Custom training loop**

In [16]:
class Trainer:
    def __init__(
    self,
    encoder,
    decoder,
    config,
    train_ds,
    val_ds,
    test_ds,
    vi_utils,
    evaluate,
    maskedloss,
    lr_schedule,
    decay_rate = 0.5,
    warmup_epoch = 9,
    max_norm = 1.0
    ):
        self.encoder = encoder
        self.decoder = decoder
        self.config = config
        self.train_ds = train_ds
        self.val_ds = val_ds
        self.test_ds = test_ds
        self.vi_utils = vi_utils
        self.evaluate = evaluate
        self.loss = maskedloss
        self.lr_schedule = lr_schedule(config.learning_rate, 
                                       decay_rate, 
                                       warmup_epoch)
        self.optimizer = Adam()
        self.max_norm = max_norm
        
    def training(self):

        for epoch in range(self.config.epochs):
            start_time = time.time()
            total_loss = 0
            self.optimizer.lr.assign(self.lr_schedule(epoch))
            print('Current learning rate: ', self.optimizer.learning_rate.numpy())

            for _, (x, y) in tqdm(enumerate(self.train_ds.take(len(self.train_ds)))):
                loss = 0
                with tf.GradientTape() as tape:
                    enc_outputs, last_states = self.encoder(x)
                    dec_inputs = tf.constant([self.vi_utils.word_index[self.config.start_token]]*len(x))
                    for i in range(1, y.shape[1]):
                        dec_outputs, last_states = self.decoder(dec_inputs, enc_outputs, last_states)
                        loss += self.loss(y[:, i], dec_outputs)
                        dec_inputs = y[:, i]
                    
                    train_vars = self.encoder.trainable_variables + self.decoder.trainable_variables
                    grads = tape.gradient(loss, train_vars)
                    clipped_grads, _ = tf.clip_by_global_norm(grads, self.max_norm)
                    self.optimizer.apply_gradients(zip(clipped_grads, train_vars))   
                    
                total_loss += loss

            print(f'Epoch: {epoch + 1} -- Loss: {total_loss}')
            print('Time taken: %.2fs' % (time.time() - start_time))
            print('----------------------------------------------------------------')
        
        BLEU_2012 = self.evaluate.evaluation(self.encoder, self.decoder, self.val_ds)
        BLEU_2013 = self.evaluate.evaluation(self.encoder, self.decoder, self.test_ds)
        print()
        print('***************************************** END OF TRAINING *****************************************')
        print()
        print(f'BLEU score is calculated on test set 2012: {BLEU_2012}')
        print(f'BLEU score is calculated on test set 2013: {BLEU_2013}')

* ### **Parameter passing and training**

In [17]:
train_ds, val_ds, test_ds = Dataloader(en_utils,
                                       vi_utils,
                                       config).dataset()

2023-02-16 11:59:23.302398: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-16 11:59:23.380153: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-16 11:59:23.381045: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-02-16 11:59:23.384498: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Shape of train source tensor:  (140274, 52)
Shape of train target tensor:  (140274, 52)
Shape of val source tensor:  (1591, 52)
Shape of val target tensor:  (1591, 52)
Shape of test source tensor:  (1338, 52)
Shape of test target tensor:  (1338, 52)
Batch number of training set:  548


In [18]:
en_vocab_size = len(en_utils.word_index)
vi_vocab_size = len(vi_utils.word_index)

encoder = Encoder(config, en_vocab_size)
attention = Luong_Attention(config)
decoder = Decoder(config, attention, vi_vocab_size)
maskedloss = MaskedLoss()
evaluate = Evaluate(config, vi_utils)

print('EN_VOCAB_SIZE: ', en_vocab_size)
print('VI_VOCAB_SIZE: ', vi_vocab_size)

EN_VOCAB_SIZE:  17136
VI_VOCAB_SIZE:  7724


In [19]:
history = Trainer(encoder,
                  decoder,
                  config,
                  train_ds,
                  val_ds,
                  test_ds,
                  vi_utils,
                  evaluate,
                  maskedloss,
                  CustomSchedule)

In [20]:
history.training()

Current learning rate:  0.0008


0it [00:00, ?it/s]2023-02-16 11:59:29.083994: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005
  '"`sparse_categorical_crossentropy` received `from_logits=True`, but '
548it [09:51,  1.08s/it]


Epoch: 1 -- Loss: 52109.9140625
Time taken: 591.42s
----------------------------------------------------------------
Current learning rate:  0.0008


548it [09:54,  1.08s/it]


Epoch: 2 -- Loss: 37052.95703125
Time taken: 594.43s
----------------------------------------------------------------
Current learning rate:  0.0008


548it [09:55,  1.09s/it]


Epoch: 3 -- Loss: 30261.12109375
Time taken: 621.97s
----------------------------------------------------------------
Current learning rate:  0.0008


548it [09:54,  1.08s/it]


Epoch: 4 -- Loss: 26543.806640625
Time taken: 621.96s
----------------------------------------------------------------
Current learning rate:  0.0008


548it [09:55,  1.09s/it]


Epoch: 5 -- Loss: 24086.595703125
Time taken: 595.93s
----------------------------------------------------------------
Current learning rate:  0.0008


548it [09:56,  1.09s/it]


Epoch: 6 -- Loss: 22326.26953125
Time taken: 621.97s
----------------------------------------------------------------
Current learning rate:  0.0008


548it [09:57,  1.09s/it]


Epoch: 7 -- Loss: 20985.15625
Time taken: 621.95s
----------------------------------------------------------------
Current learning rate:  0.0008


548it [09:57,  1.09s/it]


Epoch: 8 -- Loss: 19903.208984375
Time taken: 597.52s
----------------------------------------------------------------
Current learning rate:  0.0008


548it [10:00,  1.10s/it]


Epoch: 9 -- Loss: 18982.298828125
Time taken: 621.94s
----------------------------------------------------------------
Current learning rate:  0.0008


548it [09:57,  1.09s/it]


Epoch: 10 -- Loss: 18162.23046875
Time taken: 597.50s
----------------------------------------------------------------
Current learning rate:  0.0004


548it [10:00,  1.10s/it]


Epoch: 11 -- Loss: 16886.11328125
Time taken: 621.95s
----------------------------------------------------------------
Current learning rate:  0.0002


548it [10:08,  1.11s/it]


Epoch: 12 -- Loss: 15947.720703125
Time taken: 608.13s
----------------------------------------------------------------
Current learning rate:  1e-04


548it [10:20,  1.13s/it]


Epoch: 13 -- Loss: 15402.076171875
Time taken: 620.14s
----------------------------------------------------------------
Current learning rate:  5e-05


548it [10:09,  1.11s/it]


Epoch: 14 -- Loss: 15105.5634765625
Time taken: 621.95s
----------------------------------------------------------------
Current learning rate:  2.5e-05


548it [10:12,  1.12s/it]


Epoch: 15 -- Loss: 14954.203125
Time taken: 621.96s
----------------------------------------------------------------
Current learning rate:  1.25e-05


548it [10:14,  1.12s/it]


Epoch: 16 -- Loss: 14878.4814453125
Time taken: 614.97s
----------------------------------------------------------------
Current learning rate:  6.25e-06


548it [10:16,  1.12s/it]


Epoch: 17 -- Loss: 14840.34765625
Time taken: 621.96s
----------------------------------------------------------------
Current learning rate:  3.125e-06


548it [10:17,  1.13s/it]


Epoch: 18 -- Loss: 14817.6591796875
Time taken: 621.97s
----------------------------------------------------------------
Current learning rate:  1.5625e-06


548it [10:20,  1.13s/it]


Epoch: 19 -- Loss: 14804.1142578125
Time taken: 620.21s
----------------------------------------------------------------
Current learning rate:  7.8125e-07


548it [10:17,  1.13s/it]


Epoch: 20 -- Loss: 14796.556640625
Time taken: 617.71s
----------------------------------------------------------------

***************************************** END OF TRAINING *****************************************

BLEU score is calculated on test set 2012: 22.21
BLEU score is calculated on test set 2013: 24.15


* ### **Inference**

In [21]:
class Translation:
    def __init__(self,
                encoder,
                decoder,
                en_utils,
                vi_utils,
                config):
        self.encoder = encoder
        self.decoder = decoder
        self.en_utils = en_utils
        self.vi_utils = vi_utils
        self.config = config
        
    def predict(self, input_sentence, redundant_max_len = 10):
        sequence = []
        input_sequence = self.en_utils.add_border(self.en_utils.encode(input_sentence))
        max_len = len(input_sequence) + redundant_max_len
        
        X_test = tf.expand_dims(input_sequence, axis = 0)
        enc_outputs, last_states = self.encoder(X_test)
        dec_inputs = tf.constant([self.vi_utils.word_index[self.config.start_token]])
        for _ in range(max_len):
            start_time = time.time()
            dec_outputs, last_states = self.decoder(dec_inputs, enc_outputs, last_states)
            pred_id = tf.argmax(dec_outputs, axis = 1).numpy()
            dec_inputs = pred_id
            sequence.append(pred_id[0])
            if pred_id[0] == self.vi_utils.word_index[self.config.end_token]:
                break
        translated = self.vi_utils.decode(sequence)
        translated_sentence = " ".join([word for word in translated.split(" ") if word not in [self.config.pad_token,
                                                                                               self.config.start_token,
                                                                                               self.config.end_token]])
            
            
        print(f'Input sentence: {input_sentence}')
        print(f'Translated sentence: {translated_sentence}')
        print('Translation time: %.2fs' % (time.time() - start_time))

In [22]:
translation = Translation(encoder,
                          decoder,
                          en_utils,
                          vi_utils,
                          config)

In [23]:
input_sentence = 'What is your name?'
translation.predict(input_sentence)

Input sentence: What is your name?
Translated sentence: Tên anh là gì ?
Translation time: 0.01s


In [24]:
input_sentence = "They had been dancing for an hour when there was a knock on the door"
translation.predict(input_sentence)

Input sentence: They had been dancing for an hour when there was a knock on the door
Translated sentence: Họ đã nhảy múa trong một giờ khi đó có một con gián ở cửa hàng .
Translation time: 0.01s


In [25]:
input_sentence = "I hope this project of mine will be successful"
translation.predict(input_sentence)

Input sentence: I hope this project of mine will be successful
Translated sentence: Tôi hy vọng rằng dự án này sẽ thành công .
Translation time: 0.01s


In [26]:
input_sentence = 'mathematics is the foundation of machine learning'
translation.predict(input_sentence)

Input sentence: mathematics is the foundation of machine learning
Translated sentence: toán là nền tảng của việc học tập
Translation time: 0.01s


In [27]:
input_sentence = 'Be quiet for a moment.'
translation.predict(input_sentence)

Input sentence: Be quiet for a moment.
Translated sentence: Hãy yên lặng trong một thời điểm .
Translation time: 0.01s


In [28]:
input_sentence = "The police appealed to the crowd not to panic"
translation.predict(input_sentence)

Input sentence: The police appealed to the crowd not to panic
Translated sentence: Cảnh sát đã thu hút đám đông không phải là hoang mang .
Translation time: 0.01s


In [29]:
input_sentence = "You have made the very same mistake again"
translation.predict(input_sentence)

Input sentence: You have made the very same mistake again
Translated sentence: Bạn đã làm ra một sai lầm rất giống nhau .
Translation time: 0.01s


In [30]:
input_sentence = "This is the mildest winter that we have ever experienced"
translation.predict(input_sentence)

Input sentence: This is the mildest winter that we have ever experienced
Translated sentence: Đây là mùa đông <unk> mà chúng ta đã từng trải qua .
Translation time: 0.01s


In [31]:
input_sentence = "Would you mind staying home and taking care of the children?"
translation.predict(input_sentence)

Input sentence: Would you mind staying home and taking care of the children?
Translated sentence: Bạn có phiền ở nhà và chăm sóc trẻ em ?
Translation time: 0.01s


In [32]:
input_sentence = "Unless it's something fairly impressive, I won't remember it"
translation.predict(input_sentence)

Input sentence: Unless it's something fairly impressive, I won't remember it
Translated sentence: Trừ khi nó gợi ra một cái gì đó khá ấn tượng , tôi đã đoạt giải nhớ rằng nó là một phần nhỏ
Translation time: 0.01s


In [33]:
input_sentence = "You can get to her house in a variety of different ways"
translation.predict(input_sentence)

Input sentence: You can get to her house in a variety of different ways
Translated sentence: Bạn có thể đến nhà mình ở một loạt những cách khác nhau .
Translation time: 0.01s


In [34]:
input_sentence = "I always end up looking up the same words in the dictionary"
translation.predict(input_sentence)

Input sentence: I always end up looking up the same words in the dictionary
Translated sentence: Tôi luôn luôn luôn nhìn lại từ điển hình .
Translation time: 0.01s


In [35]:
input_sentence = "I want you to return it to me as soon as possible."
translation.predict(input_sentence)

Input sentence: I want you to return it to me as soon as possible.
Translated sentence: Tôi muốn các bạn quay trở lại với tôi ngay sau khi có thể .
Translation time: 0.01s


In [36]:
input_sentence = "The company asked the bank to loan them some money to buy new machinery"
translation.predict(input_sentence)

Input sentence: The company asked the bank to loan them some money to buy new machinery
Translated sentence: Công ty yêu cầu tiền cho họ một số tiền để mua máy móc mới .
Translation time: 0.01s


In [37]:
input_sentence = "Just have passion and really try, the whole universe will help you"
translation.predict(input_sentence)

Input sentence: Just have passion and really try, the whole universe will help you
Translated sentence: Chỉ có đam mê và thử , toàn bộ vũ trụ sẽ giúp bạn .
Translation time: 0.01s
