# **Neural Machine Translation**
### Deep Learning Project

> ## ***Model 4:***
> ### **Encoder** - Embedding layer + 2 LSTM layers
> ### **Decoder** - Embedding + 2 LSTM layers + **Attention Mechanism**

### **Preprocessing Pipeline:**
1. Load & examine the data.
2. Cleaning the data. 
  * Converting the data into an array for easy implementation.
  * Reducing the size of the dataset to save computation cost (only in this case).
  * Removing irrelevant text like attribution details
  * Splitting each sample/text into English-German pairs.
  * Removing punctuations.
  * Converting the text to lower case.
3. Tokenizing & vectorizing the text into numerical sequences.
4. Padding those sequences with 0’s to bring them to same length.

### **Importing required libraries**

In [None]:
import tensorflow as tf
import tensorflow.keras
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Input, dot, Activation, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import optimizers
from tensorflow.keras import initializers, regularizers, constraints

from sklearn.model_selection import train_test_split

!pip install mojimoji
!pip install sentencepiece
import mojimoji
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import io

import nltk
import unicodedata
import sentencepiece as spm

# Ask TensorFlow's C++ backend to hide INFO/WARNING/ERROR log lines.
# NOTE(review): TensorFlow is already imported above, so this may be set too
# late to silence start-up logs - consider moving it before the import.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
print(tf.__version__)
# True when at least one GPU is visible to TensorFlow.
# (tf.test.is_gpu_available() is deprecated in favour of this call.)
bool(tf.config.list_physical_devices('GPU'))

In [None]:
# Mount Google Drive (Colab only); the dataset CSV is read from
# /content/drive further down.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def normalize_english(english_text, german_text):
    """Wrap every sentence of both languages in ``start_`` / ``_end`` markers.

    The two iterables are walked in lockstep, so the result is as long as the
    shorter one.

    Args:
        english_text: iterable of English sentences (strings).
        german_text: iterable of German sentences (strings).

    Returns:
        ``(input_value, target_value)``: two lists holding the marked English
        and German sentences, in the original order.
    """
    pairs = list(zip(english_text, german_text))

    # Both sides receive the markers, source sentences included.
    input_value = ["start_ " + english + " _end" for english, _ in pairs]
    target_value = ["start_ " + german + " _end" for _, german in pairs]

    return input_value, target_value

**Importing the dataset**

In [None]:
# Load the English/German sentence pairs from Google Drive.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DLProject/deu_eng1.csv")
# df1 is a second name for the same DataFrame (no copy is made), so the
# column edits below are visible through df as well.
df1 = df
df1

Unnamed: 0.1,Unnamed: 0,English,German
0,0,go,geh
1,1,hi,hallo
2,2,hi,grüß gott
3,3,run,lauf
4,4,run,lauf
...,...,...,...
24995,24995,im not giving up,ich gebe nicht auf
24996,24996,im not going out,ich gehe nicht aus
24997,24997,im not going out,ich gehe nicht raus
24998,24998,im not in boston,ich bin nicht in boston


In [None]:
# Surround every English sentence with the start_/_end markers.
# NOTE: re-running this cell adds the markers a second time.
df1['English'] = "start_ " + df1['English'] + " _end"
df1

Unnamed: 0.1,Unnamed: 0,English,German
0,0,start_ go _end,geh
1,1,start_ hi _end,hallo
2,2,start_ hi _end,grüß gott
3,3,start_ run _end,lauf
4,4,start_ run _end,lauf
...,...,...,...
24995,24995,start_ im not giving up _end,ich gebe nicht auf
24996,24996,start_ im not going out _end,ich gehe nicht aus
24997,24997,start_ im not going out _end,ich gehe nicht raus
24998,24998,start_ im not in boston _end,ich bin nicht in boston


In [None]:
# Surround every German sentence with the start_/_end markers.
# NOTE: re-running this cell adds the markers a second time.
df1['German'] = "start_ " + df1['German'] + " _end"
df1

Unnamed: 0.1,Unnamed: 0,English,German
0,0,start_ go _end,start_ geh _end
1,1,start_ hi _end,start_ hallo _end
2,2,start_ hi _end,start_ grüß gott _end
3,3,start_ run _end,start_ lauf _end
4,4,start_ run _end,start_ lauf _end
...,...,...,...
24995,24995,start_ im not giving up _end,start_ ich gebe nicht auf _end
24996,24996,start_ im not going out _end,start_ ich gehe nicht aus _end
24997,24997,start_ im not going out _end,start_ ich gehe nicht raus _end
24998,24998,start_ im not in boston _end,start_ ich bin nicht in boston _end


# tokenize
Tokenize the sentences of each language by splitting on spaces.

In [None]:
def tokenize(lang):
    """Turn a collection of sentences into a padded matrix of word ids.

    Args:
        lang: the sentences of one language (one string per sample).

    Returns:
        ``(tensor, lang_tokenizer)``: the padded id sequences produced by
        ``pad_sequences`` and the fitted Keras ``Tokenizer`` that holds the
        vocabulary.
    """
    # Only the space character is filtered, so tokens such as "start_" and
    # "_end" keep their underscore.
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=' ')

    # Build the word -> id vocabulary from the corpus.
    lang_tokenizer.fit_on_texts(lang)

    # Replace every sentence by its list of word ids.
    id_sequences = lang_tokenizer.texts_to_sequences(lang)

    # Sentences differ in length; append padding after each one
    # (padding='post') so they can be stacked into a single 2-D array.
    padded = tf.keras.preprocessing.sequence.pad_sequences(id_sequences,
                                                           padding='post')
    return padded, lang_tokenizer

# create clean dataset

In [None]:
# create a clean dataset
def create_dataset(en, ja):
    """Tokenize the source and the target sentences independently.

    Args:
        en: source-language sentences.
        ja: target-language sentences.

    Returns:
        ``(input_tensor, target_tensor, input_lang_tokenize,
        target_lang_tokenize)``: the two padded id arrays followed by the two
        fitted tokenizers.
    """
    source = tokenize(en)
    target = tokenize(ja)

    # tokenize() yields (tensor, tokenizer); regroup as tensors, then tokenizers.
    return source[0], target[0], source[1], target[1]

In [None]:
# English is the source language, German the target.
input_tensor, target_tensor, input_lang_tokenize, target_lang_tokenize = create_dataset(df1['English'], df1['German'])

In [None]:
# Inspect the German word -> id mapping (ids start at 1).
target_lang_tokenize.word_index

{'start_': 1,
 '_end': 2,
 'ich': 3,
 'tom': 4,
 'ist': 5,
 'sie': 6,
 'es': 7,
 'das': 8,
 'nicht': 9,
 'du': 10,
 'bin': 11,
 'wir': 12,
 'hat': 13,
 'er': 14,
 'habe': 15,
 'ein': 16,
 'zu': 17,
 'war': 18,
 'mich': 19,
 'mir': 20,
 'sind': 21,
 'die': 22,
 'ihr': 23,
 'auf': 24,
 'kann': 25,
 'dich': 26,
 'hier': 27,
 'uns': 28,
 'haben': 29,
 'bist': 30,
 'eine': 31,
 'sich': 32,
 'an': 33,
 'werde': 34,
 'einen': 35,
 'was': 36,
 'wie': 37,
 'wer': 38,
 'jetzt': 39,
 'aus': 40,
 'gehen': 41,
 'der': 42,
 'gut': 43,
 'sehr': 44,
 'mag': 45,
 'in': 46,
 'mein': 47,
 'wird': 48,
 'ihn': 49,
 'dir': 50,
 'den': 51,
 'so': 52,
 'mit': 53,
 'sein': 54,
 'hast': 55,
 'noch': 56,
 'bitte': 57,
 'meine': 58,
 'euch': 59,
 'da': 60,
 'liebe': 61,
 'wurde': 62,
 'kein': 63,
 'können': 64,
 'lass': 65,
 'nach': 66,
 'geht': 67,
 'will': 68,
 'werden': 69,
 'alle': 70,
 'seid': 71,
 'hause': 72,
 'komm': 73,
 'mal': 74,
 'muss': 75,
 'mach': 76,
 'weiß': 77,
 'für': 78,
 'brauche': 79,
 'hatt

In [None]:
def max_length(input_tensor, target_tensor):
    """Print and return the longest sequence length on each side.

    Args:
        input_tensor: source sequences (anything whose items support ``len``).
        target_tensor: target sequences.

    Returns:
        ``(max_len_input, max_len_target)``.

    Raises:
        ValueError: if either collection is empty (raised by ``max``).
    """
    source_lengths = list(map(len, input_tensor))
    target_lengths = list(map(len, target_tensor))

    max_len_input = max(source_lengths)
    print("english length:", max_len_input)

    max_len_target = max(target_lengths)
    print("japanese length:", max_len_target)

    return max_len_input, max_len_target

In [None]:
# Longest (padded) sequence length of the input and of the target tensors
max_length_input, max_length_target = max_length(input_tensor, target_tensor)

english length: 8
japanese length: 12


In [None]:
# Hold out 20% of the pairs, then cut that hold-out in half:
# one half stays as the test set, the other becomes the validation set.
# NOTE: no random seed is fixed, so the split differs between runs.
(X_train, X_test,
 Y_train, Y_test) = train_test_split(input_tensor, target_tensor, test_size=0.2, shuffle=True)

(X_test, X_val,
 Y_test, Y_val) = train_test_split(X_test, Y_test, test_size=0.5, shuffle=True)


# sizes of train / test / validation inputs and targets
print(len(X_train), len(Y_train), len(X_test), len(Y_test), len(X_val), len(Y_val))

20000 20000 2500 2500 2500 2500


In [None]:
def convert(lang, tensor):
    """Print ``id----->word`` for every non-zero id in ``tensor``.

    Args:
        lang: object exposing an ``index_word`` mapping from id to word.
        tensor: iterable of token ids; zeros (padding) are skipped.
    """
    for token_id in tensor:
        if token_id == 0:
            # padding position, nothing to show
            continue
        word = lang.index_word[token_id]
        print("%d----->%s" % (token_id, word))

In [None]:
# Show the id -> word mapping of one training pair (row 10) as a sanity check.
print("input lang: index to word mapping")
convert(input_lang_tokenize, X_train[10])
print("output lang: index to word mapping")
convert(target_lang_tokenize, Y_train[10])

input lang: index to word mapping
1----->start_
4----->i
379----->wanted
2706----->sympathy
2----->_end
output lang: index to word mapping
1----->start_
3----->ich
395----->wollte
6451----->sympathie
2----->_end


# define parameter

In [None]:
# Shuffle buffer covers the whole training set; a smaller buffer would not
# shuffle the samples uniformly.
BUFFER_SIZE = len(X_train)
BATCH_SIZE = 32
dropout_rate = 0.3
# Number of full batches per epoch (a trailing partial batch is not counted)
train_steps_per_epoch = len(X_train) // BATCH_SIZE
val_steps_per_epoch = len(X_val) // BATCH_SIZE
print("train step %d" % train_steps_per_epoch)
embedding_dim = 300
units = 512
# +1 because word ids start at 1 and id 0 is used for padding
vocab_inp_size = len(input_lang_tokenize.word_index) + 1
print('Total unique words in the input: %s' % len(input_lang_tokenize.word_index))
print('Total unique words in the target: %s' % len(target_lang_tokenize.word_index))
vocab_tar_size = len(target_lang_tokenize.word_index) + 1

# Training pipeline: shuffle, then batch. drop_remainder=True keeps every
# batch at exactly BATCH_SIZE rows, which train_step relies on when it
# builds BATCH_SIZE start tokens.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, Y_train)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Validation pipeline: batched the same way, no shuffling
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, Y_val)).batch(BATCH_SIZE, drop_remainder=True)

train step 625
Total unique words in the input: 4117
Total unique words in the target: 6514


# Encoder Model

In [None]:
class Encoder(tf.keras.Model):
    """Source-side network: embedding -> dropout -> two stacked LSTMs."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size, dropout_rate):
        """Create the layers.

        Args:
            vocab_size: number of rows of the embedding table.
            embedding_dim: width of each embedding vector.
            enc_units: number of units of both LSTM layers.
            batch_size: batch size used by ``initialize_hidden_state``.
            dropout_rate: rate of the dropout applied to the embeddings.
        """
        super().__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.dropout = Dropout(dropout_rate)
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # Lower LSTM: emits the whole sequence for the upper LSTM to consume.
        self.first_lstm = tf.keras.layers.LSTM(
            enc_units,
            return_sequences=True,
            recurrent_initializer='glorot_uniform',
        )
        # Upper LSTM: emits the whole sequence plus its last h and c states.
        self.final_lstm = tf.keras.layers.LSTM(
            enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Encode a batch of token-id sequences.

        Args:
            x: batch of source token ids.
            hidden: initial state handed to the first LSTM.

        Returns:
            ``(output, state)``: the per-step outputs of the last LSTM and
            its final ``[state_h, state_c]``.
        """
        embedded = self.dropout(self.embedding(x))
        lower_sequence = self.first_lstm(embedded, initial_state=hidden)
        output, state_h, state_c = self.final_lstm(lower_sequence)
        return output, [state_h, state_c]

    def initialize_hidden_state(self):
        """Return two all-zero tensors of shape ``(batch_size, enc_units)``."""
        state_shape = (self.batch_size, self.enc_units)
        return tf.zeros(state_shape), tf.zeros(state_shape)

In [None]:
# Build the encoder over the source (English) vocabulary.
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE, dropout_rate)

# attention

In [None]:
class Attention(tf.keras.models.Model):
    """Dot-product attention with learned query/key/value/output projections."""

    def __init__(self, units: int, *args, **kwargs):
        """Create the four bias-free projection layers, each ``units`` wide."""
        super().__init__(*args, **kwargs)
        self.units = units

        self.q_dense_layer = Dense(units, use_bias=False, name='q_dense_layer')
        self.k_dense_layer = Dense(units, use_bias=False, name='k_dense_layer')
        self.v_dense_layer = Dense(units, use_bias=False, name='v_dense_layer')
        self.output_dense_layer = Dense(units, use_bias=False, name='output_dense_layer')

    def call(self, input, memory):
        """Attend from ``input`` (queries) over ``memory`` (keys and values).

        Returns:
            The attended values passed through the output projection.
        """
        queries = self.q_dense_layer(input)
        keys = self.k_dense_layer(memory)
        values = self.v_dense_layer(memory)

        # Scale the queries before the dot product.
        # NOTE(review): the factor uses units // 2 although the keys are
        # `units` wide; confirm whether sqrt(units) was intended.
        depth = self.units // 2
        queries = queries * depth ** -0.5

        # Similarity of every query with every key, normalised over the keys.
        scores = tf.matmul(queries, keys, transpose_b=True)
        weights = tf.nn.softmax(scores)

        attended = tf.matmul(weights, values)
        return self.output_dense_layer(attended)

# Decoder Model

In [None]:
class Decoder(tf.keras.Model):
    """Target-side network: embedding -> dropout -> two LSTMs -> attention -> Dense."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size, dropout_rate):
        """Create the layers.

        Args:
            vocab_size: size of the target vocabulary (width of the output layer).
            embedding_dim: width of each embedding vector.
            dec_units: number of units of both LSTM layers and of the attention.
            batch_size: stored on the instance; not used by ``call``.
            dropout_rate: rate of the dropout applied to the embeddings.
        """
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.dropout = Dropout(dropout_rate)
        self.first_lstm = tf.keras.layers.LSTM(self.dec_units,
                                               return_sequences=True)
        self.final_lstm = tf.keras.layers.LSTM(self.dec_units,
                                               return_sequences=True,
                                               return_state=True)

        # No activation: the layer emits raw logits.
        self.fc = tf.keras.layers.Dense(vocab_size)

        self.attention = Attention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run the decoder on ``x`` and score the target vocabulary.

        Args:
            x: batch of target token ids to feed in.
            hidden: ``[state_h, state_c]`` to resume from - the encoder's
                final state for the first step, afterwards the state this
                method returned for the previous step.
            enc_output: encoder outputs that the attention layer attends over.

        Returns:
            ``(output, state)``: logits flattened to two dimensions (last
            dimension = vocabulary size) and the final LSTM's
            ``[state_h, state_c]``.
        """
        x = self.embedding(x)
        x = self.dropout(x)

        x = self.first_lstm(x)
        # Resume the final LSTM from the supplied state. `hidden` used to be
        # ignored, so the decoder never saw the encoder state and restarted
        # from zeros at every decoding step.
        output, state_h, state_c = self.final_lstm(x, initial_state=hidden)
        state = [state_h, state_c]

        # Context computed by attending from the decoder outputs over the
        # encoder outputs, appended to the decoder outputs feature-wise.
        context = self.attention(output, enc_output)
        output = tf.concat([output, context], axis=-1)

        # Merge the batch and time axes so the Dense layer scores each step.
        output = tf.reshape(output, (-1, output.shape[2]))

        output = self.fc(output)

        return output, state

In [None]:
# Build the decoder over the target (German) vocabulary.
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE, dropout_rate)

# optimizer and the loss function

In [None]:
# Adam with explicitly chosen hyper-parameters.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.9, epsilon=1e-04, decay=1e-06)
# from_logits=True: the decoder's last Dense layer has no activation.
# reduction='none': keep one loss value per sample so loss_function can
# zero out the padding positions before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  """Cross-entropy between ``real`` ids and ``pred`` logits, ignoring padding.

  Positions whose target id is 0 contribute a loss of zero. The mean is
  still taken over all positions, padded ones included.
  """
  per_position = loss_object(real, pred)

  is_padding = tf.math.equal(real, 0)
  weights = tf.cast(tf.math.logical_not(is_padding), dtype=per_position.dtype)

  return tf.reduce_mean(per_position * weights)

# Checkpoints

In [None]:
# Checkpoints are written as ./train_checkpoints/ckpt-<n>
checkpoint_dir = './train_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer together with both models so training can be resumed.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

# train model

In [None]:
@tf.function
def train_step(inp, targ, enc_hidden):
  """Run one teacher-forced optimisation step on a single batch.

  Args:
    inp: batch of source token ids.
    targ: batch of target token ids.
    enc_hidden: initial state for the encoder.

  Returns:
    The summed per-step loss divided by the target sequence length.
  """
  start_id = target_lang_tokenize.word_index['start_']
  target_len = targ.shape[1]
  loss = 0

  with tf.GradientTape() as tape:
    # The encoder's final state seeds the decoder.
    enc_output, dec_hidden = encoder(inp, enc_hidden)

    # Every sequence of the batch starts decoding from the start marker.
    dec_input = tf.expand_dims([start_id] * BATCH_SIZE, 1)

    # Teacher forcing: the true token at step t is the decoder input for t+1.
    for t in range(1, target_len):
      predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      dec_input = tf.expand_dims(targ[:, t], 1)

  # Gradients are taken on the summed loss, over both models' weights.
  variables = encoder.trainable_variables + decoder.trainable_variables
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))

  return loss / int(target_len)

In [None]:
# Number of passes over the training data for this run.
# NOTE(review): an earlier comment here mentioned 45 epochs; the value is 5.
EPOCHS = 5


@tf.function
def val_step(inp, targ, enc_hidden):
  """Teacher-forced loss of one batch, computed WITHOUT updating any weights.

  Mirrors the forward pass of train_step but records no gradients and never
  calls the optimizer, so validation batches cannot leak into training.
  """
  loss = 0
  enc_output, dec_hidden = encoder(inp, enc_hidden)
  dec_input = tf.expand_dims([target_lang_tokenize.word_index['start_']] * BATCH_SIZE, 1)

  for t in range(1, targ.shape[1]):
    predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_output)
    loss += loss_function(targ[:, t], predictions)
    dec_input = tf.expand_dims(targ[:, t], 1)

  return loss / int(targ.shape[1])


for epoch in range(EPOCHS):

  enc_hidden = encoder.initialize_hidden_state()
  train_loss = 0
  val_loss = 0

  for inp, targ in train_dataset.take(train_steps_per_epoch):
    train_batch_loss = train_step(inp, targ, enc_hidden)
    train_loss += train_batch_loss

  # Validation only measures the loss. It previously went through
  # train_step, which applies gradients and therefore trained on this data.
  for val_inp, val_tar in val_dataset.take(val_steps_per_epoch):
    val_batch_loss = val_step(val_inp, val_tar, enc_hidden)
    val_loss += val_batch_loss

  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch {} Train Loss {:.4f}'.format(epoch + 1,
                                      train_loss / train_steps_per_epoch))
  print('Epoch {} Validation Loss {:.4f}'.format(epoch + 1,
                                      val_loss / val_steps_per_epoch))


Epoch 1 Train Loss 1.8013
Epoch 1 Validation Loss 1.5329
Epoch 2 Train Loss 1.4135
Epoch 2 Validation Loss 1.3033
Epoch 3 Train Loss 1.2274
Epoch 3 Validation Loss 1.1478
Epoch 4 Train Loss 1.0872
Epoch 4 Validation Loss 1.0166
Epoch 5 Train Loss 0.9833
Epoch 5 Validation Loss 0.9263


## **Evaluation:**

(Check the BLEU score.)

In [None]:
def predict(sentence):
    """Greedily decode a translation for one already-tokenized sentence.

    Args:
        sentence: sequence of source-vocabulary token ids (for example one
            row of ``X_test``).

    Returns:
        The predicted target words as one string, each word followed by a
        single space. The ``_end`` marker is included when the decoder emits
        it within ``max_length_target`` steps.
    """
    inputs = tf.expand_dims(tf.convert_to_tensor(sentence), axis=0)
    hidden = [tf.zeros((1, units)), tf.zeros((1, units))]
    enc_out, hidden_state = encoder(inputs, hidden)
    dec_input = tf.expand_dims([target_lang_tokenize.word_index['start_']], 0)

    words = []
    # The loop bound already caps the output at max_length_target tokens.
    # An earlier extra check compared the *character* length of the text
    # with that token limit and cut translations off after a few words.
    for _ in range(max_length_target):
        predictions, hidden_state = decoder(dec_input,
                                            hidden_state,
                                            enc_out)

        predicted_id = tf.argmax(predictions[0]).numpy()
        word = target_lang_tokenize.index_word[predicted_id]
        words.append(word)

        if word == '_end':
            break

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return ''.join(word + ' ' for word in words)

In [None]:
def create_reference(lang, tensor):
    """Convert rows of token ids back into lists of words.

    Args:
        lang: object exposing an ``index_word`` mapping from id to word.
        tensor: iterable of id sequences; zeros (padding) are dropped.

    Returns:
        One list of words per input row, in the same order.
    """
    return [
        [lang.index_word[token_id] for token_id in row if token_id != 0]
        for row in tensor
    ]

In [None]:
# Reference translations: the test targets converted back to word lists
# (the start_/_end markers are still included at this point).
reference = create_reference(target_lang_tokenize, Y_test.tolist())

In [None]:
from tqdm import tqdm
# Model translations for every test sentence, with a progress bar
hypothesis = [predict(i) for i in tqdm(X_test)]

100%|██████████| 2500/2500 [02:14<00:00, 18.63it/s]


### Evaluating the performance of the model using BLEU Score:

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

smoothie = SmoothingFunction().method2
sentence_scores = []
for reference_tokens, hypothesis_text in zip(reference, hypothesis):
    # Drop the `_end` marker by token. Slicing off the last five characters
    # also cut into real words whenever a hypothesis did not end in "_end ".
    hypothesis_tokens = [token for token in hypothesis_text.split() if token != '_end']
    # reference_tokens[1:-1] strips the start_/_end markers of the reference.
    sentence_scores.append(
        sentence_bleu([reference_tokens[1:-1]], hypothesis_tokens, smoothing_function=smoothie))

# Average of the sentence-level scores (0.0 when there is nothing to score).
score = sum(sentence_scores) / len(sentence_scores) if sentence_scores else 0.0
print("The bleu score is: "+str(score))

The bleu score is: 0.29664571692216046


# Translation example

In [None]:
def preprocess_sentence(en_text):
        """Lower-case ``en_text`` and wrap it in the start_/_end markers."""
        return "start_ " + en_text.lower() + " _end"
        
def evaluate(sentence):
    """Translate a raw English sentence with greedy decoding.

    Args:
        sentence: English text. It is lower-cased and wrapped in the
            start_/_end markers by ``preprocess_sentence``.

    Returns:
        ``(result, sentence)``: the predicted target words as one string
        (each word followed by a space, ``_end`` included when the decoder
        emits it) and the preprocessed input sentence.
    """
    sentence = preprocess_sentence(sentence)

    # Let the fitted tokenizer map words to ids, exactly as for the training
    # data. Words it has never seen are skipped instead of raising KeyError,
    # and repeated spaces no longer produce empty tokens.
    inputs = input_lang_tokenize.texts_to_sequences([sentence])

    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                           maxlen=max_length_input,
                                                           padding='post')

    inputs = tf.convert_to_tensor(inputs)
    hidden = [tf.zeros((1, units)), tf.zeros((1, units))]
    enc_out, hidden_state = encoder(inputs, hidden)
    dec_input = tf.expand_dims([target_lang_tokenize.word_index['start_']], 0)

    words = []
    # The loop bound already caps the output at max_length_target tokens.
    # An earlier extra check compared the *character* length of the text
    # with that token limit and cut translations off after a few words.
    for _ in range(max_length_target):
        predictions, hidden_state = decoder(dec_input,
                                            hidden_state,
                                            enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        word = target_lang_tokenize.index_word[predicted_id]
        words.append(word)

        if word == '_end':
            break

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    result = ''.join(word + ' ' for word in words)
    return result, sentence

In [None]:
from nltk.translate.bleu_score import sentence_bleu

def result(sentence):
    """Thin wrapper: return ``evaluate(sentence)`` unchanged."""
    return evaluate(sentence)

In [None]:
# Example translation. NOTE: this rebinds the name `result`, which hides the
# result() helper defined above.
result, sentence = evaluate("he has a dog")
print('Input: %s' % (sentence))
print('Predicted translation: {}'.format(result))

Input: start_ he has a dog _end
Predicted translation: er hat einen 


In [None]:
# Second example; evaluate() lower-cases the input before tokenizing it.
result, sentence = evaluate("I love her")
print('Input: %s' % (sentence))
print('Predicted translation: {}'.format(result))

Input: start_ i love her _end
Predicted translation: ich liebe sie 


In [None]:
# Interactive translation prompt; type "quit" to leave.
while True:
  sent = input("Enter the sentence: ")
  if sent == "quit":
    break
  try:
    result, sentence = evaluate(sent)
  except KeyError as err:
    # A failed vocabulary lookup should not end the whole session.
    print('Cannot translate, unknown vocabulary entry: {}\n'.format(err))
    continue
  print('Input: %s' % (sentence))
  print('Predicted translation: {}\n'.format(result))