<a href="https://colab.research.google.com/github/nuvita97/NLP-Machine-Translation/blob/main/NLP_Seq2Seq_%26_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/nuvita97/NLP-Machine-Translation.git

Cloning into 'NLP-Machine-Translation'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 18 (delta 5), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (18/18), done.


In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import random
import os
import io
import string
import re
from sklearn.model_selection import train_test_split

In [3]:
path = 'NLP-Machine-Translation'
os.chdir(path)
os.listdir()

['README.md',
 '.git',
 'train.vi.txt',
 'NLP_Seq2Seq_&_Attention.ipynb',
 'train.en.txt']

In [4]:
en_filename = 'train.en.txt'
vi_filename = 'train.vi.txt'

raw_en_lines = open(en_filename, encoding = "utf-8").read().strip().split("\n")
raw_vi_lines = open(vi_filename, encoding = "utf-8").read().strip().split("\n")

print(len(raw_en_lines))
print(len(raw_vi_lines))

133317
133317


In [5]:
display(raw_en_lines[:3])
display(raw_vi_lines[:3])

['Rachel Pike : The science behind a climate headline',
 'In 4 minutes , atmospheric chemist Rachel Pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule .',
 'I &apos;d like to talk to you today about the scale of the scientific effort that goes into making the headlines you see in the paper .']

['Khoa học đằng sau một tiêu đề về khí hậu',
 'Trong 4 phút , chuyên gia hoá học khí quyển Rachel Pike giới thiệu sơ lược về những nỗ lực khoa học miệt mài đằng sau những tiêu đề táo bạo về biến đổi khí hậu , cùng với đoàn nghiên cứu của mình -- hàng ngàn người đã cống hiến cho dự án này -- một chuyến bay mạo hiểm qua rừng già để tìm kiếm thông tin về một phân tử then chốt .',
 'Tôi muốn cho các bạn biết về sự to lớn của những nỗ lực khoa học đã góp phần làm nên các dòng tít bạn thường thấy trên báo .']

In [6]:
print("Punctuation : ", string.punctuation )
print("Digits : ", string.digits )

exclude = list(string.punctuation) + list(string.digits)
print("Exclude : ", exclude)

Punctuation :  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
Digits :  0123456789
Exclude :  ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


In [7]:
def preprocess(sentence):
  sent = sentence.lower()
  sent = sent.strip()
  sent = re.sub("'", " ", sent)
  sent = re.sub("\s+", " ", sent)
  sent = ''.join([char for char in sent if char not in exclude])
  sent = "<start> " + sent + " <end>"
  return sent

preprocess("I go to school")

'<start> i go to school <end>'

In [8]:
en_lines = []
vi_lines = []

min_len, max_len = 10, 50

for eline, vline in zip(raw_en_lines, raw_vi_lines):
  eline = preprocess(eline)
  vline = preprocess(vline)
  if(min_len < len(eline.split()) < max_len and min_len < len(vline.split()) < max_len):
    en_lines.append(eline)
    vi_lines.append(vline)

print(len(en_lines))
print(len(vi_lines))

90500
90500


In [9]:
class Language():
  def __init__(self, lines):
    self.lines = lines
    self.word2id = {}
    self.id2word = {}
    self.vocab = set()
    self.max_len = 0
    self.min_len = 0
    self.vocab_size = 0
    self.init_language_param()

  def init_language_param(self):
    for line in self.lines:
      self.vocab.update(line.split(" "))
      self.word2id["<pad>"] = 0
      
    for id, word in enumerate(self.vocab):
      self.word2id[word] = id + 1
    
    for word, id in self.word2id.items():
      self.id2word[id] = word
    
    self.max_len = max([len(line.split(" ")) for line in self.lines])
    self.min_len = min([len(line.split(" ")) for line in self.lines])
    self.vocab_size = len(self.vocab) + 1
  
  def sentence_to_vector(self, sent):
    result = np.array([self.word2id[word] for word in sent.split(" ")])
    return result
 
  def vector_to_sentence(self, vector):
    result = " ".join([self.id2word[id] for id in vector])
    return result

In [11]:
inp_lang = Language(en_lines)
tar_lang = Language(vi_lines)

inp_vector = [inp_lang.sentence_to_vector(line) for line in inp_lang.lines]
tar_vector = [tar_lang.sentence_to_vector(line) for line in tar_lang.lines]

print(inp_lang.max_len, inp_lang.min_len)
print(tar_lang.max_len, tar_lang.min_len)
print(inp_vector[0])

74 11
72 11
[24037 36388 26326 19714 32038  1695 32038 34487 16689 15588 34892  6971
 26736 34892 13241   141 27122 33030 12152  2310 34892  3389 34487 23129
 36783 34892 28853     1  2133]


In [12]:
inp_tensor = tf.keras.preprocessing.sequence.pad_sequences(inp_vector, inp_lang.max_len, padding = 'post')
tar_tensor = tf.keras.preprocessing.sequence.pad_sequences(tar_vector, inp_lang.max_len, padding = 'post')

print(inp_tensor.shape)
print(tar_tensor.shape)
print(inp_tensor[0])

(90500, 74)
(90500, 74)
[24037 36388 26326 19714 32038  1695 32038 34487 16689 15588 34892  6971
 26736 34892 13241   141 27122 33030 12152  2310 34892  3389 34487 23129
 36783 34892 28853     1  2133     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


In [13]:
x_train, x_val, y_train, y_val = train_test_split(inp_tensor, tar_tensor, test_size = 0.2)

print(x_train.shape)
print(x_val.shape)

BATCH_SIZE = 32
BUFFER_SIZE = x_train.shape[0]
N_BATCH = BUFFER_SIZE//BATCH_SIZE
hidden_unit = 1024
embedding_size = 256
print(BUFFER_SIZE)

dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
dataset = dataset.batch(BATCH_SIZE)

tmp_x, tmp_y = next(iter(dataset))
print(tmp_x.shape, tmp_y.shape)

(72400, 74)
(18100, 74)
72400
(32, 74) (32, 74)


In [15]:
class Encode(tf.keras.Model):
  def __init__(self, embedding_size, vocab_size, hidden_units):
    super(Encode, self).__init__()
    self.Embedding = tf.keras.layers.Embedding(vocab_size,embedding_size)
    self.GRU = tf.keras.layers.GRU(
        hidden_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')
    self.hidden_units = hidden_units
      
  def call(self, x, hidden_state):
    try:
        x = self.Embedding(x)
    except:
        print(x, print(inp_lang.vocab_size))          
    outputs, last_state = self.GRU(x, hidden_state)
    return outputs, last_state
  
  def init_hidden_state(self, batch_size):
    return tf.zeros([batch_size, self.hidden_units])

In [16]:
encoder = Encode(embedding_size, inp_lang.vocab_size, hidden_unit)
hidden_state = encoder.init_hidden_state(BATCH_SIZE)
tmp_outputs, last_state = encoder(tmp_x, hidden_state)
print(tmp_outputs.shape)
print(last_state.shape)

(32, 74, 1024)
(32, 1024)


In [17]:
class Attention(tf.keras.Model):
  def __init__(self, hidden_units):
    super(Attention, self).__init__()
    self.W_out_encode = tf.keras.layers.Dense(hidden_units)
    self.W_state = tf.keras.layers.Dense(hidden_units)
    self.V = tf.keras.layers.Dense(1)
  
  def call(self, encode_outs, pre_state):
    pre_state = tf.expand_dims(pre_state, axis = 1)
    pre_state = self.W_state(pre_state)
    encode_outs = self.W_out_encode(encode_outs)
    score = self.V(tf.nn.tanh(pre_state + encode_outs))
    context_vector = score * encode_outs
    context_vector = tf.reduce_sum(context_vector, axis = 1)
    return context_vector, score

In [18]:
attention = Attention(hidden_unit)
context_vector, attention_weight = attention(tmp_outputs, last_state)
print(context_vector.shape)
print(attention_weight.shape)

(32, 1024)
(32, 74, 1)


In [19]:
class Decode(tf.keras.Model):
  def __init__(self, vocab_size, embedding_size, hidden_units):
    super(Decode, self).__init__()
    self.hidden_units = hidden_units
    self.Embedding = tf.keras.layers.Embedding(vocab_size,embedding_size)
    self.Attention = Attention(hidden_units)
    self.GRU = tf.keras.layers.GRU(
        hidden_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')
    self.Fc = tf.keras.layers.Dense(vocab_size)
    
  def call(self, x, encode_outs, pre_state):
    x = tf.expand_dims(x, axis = 1)
    try:
      x = self.Embedding(x)
    except:
      print(x)
    
    context_vector, attention_weight = self.Attention(encode_outs, pre_state)
    context_vector = tf.expand_dims(context_vector, axis = 1)
    gru_inp = tf.concat([x, context_vector], axis = -1)
    out_gru, state = self.GRU(gru_inp)
    out_gru = tf.reshape(out_gru, (-1, out_gru.shape[2]))
    return self.Fc(out_gru), state

In [20]:
decode = Decode(tar_lang.vocab_size, embedding_size, hidden_unit)
decode_out, state = decode(tmp_y[:,0], tmp_outputs, last_state)

print(decode_out.shape)
print(state.shape)

(32, 16093)
(32, 1024)


In [21]:
def loss_function(real, pred):
  mask = 1 - np.equal(real, 0)
  loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels = real, logits = pred) * mask
  return tf.reduce_mean(loss_)

EPOCHS = 100
optimizer = tf.optimizers.Adam()
encoder = Encode(embedding_size = embedding_size, vocab_size = inp_lang.vocab_size, hidden_units = hidden_unit)
decoder = Decode(vocab_size = tar_lang.vocab_size, embedding_size = embedding_size, hidden_units = hidden_unit)

In [22]:
for epoch in range(EPOCHS):
  total_loss = 0
  for batch_id, (x, y) in enumerate(dataset.take(N_BATCH)):
    loss = 0
    with tf.GradientTape() as tape:
      first_state = encoder.init_hidden_state(batch_size= BATCH_SIZE)
      encode_outs, last_state = encoder(x, first_state)
      decoder_state = last_state
      decoder_input = [tar_lang.word2id["<start>"]]*BATCH_SIZE

      for i in range(1, y.shape[1]):
        decode_out, decode_state = decoder(decoder_input, encode_outs, decoder_state)
        loss += loss_function(y[:,i], decode_out)
        decode_input = y[:,i]

      train_vars = encoder.trainable_variables  + decoder.trainable_variables
      grads = tape.gradient(loss, train_vars)
      optimizer.apply_gradients(zip(grads, train_vars))
    total_loss += loss
    print(total_loss.numpy())

277.87543
509.47522
734.79004
945.0568
1180.0043
1378.4426
1563.8655
1769.3254
1964.481
2134.1228
2288.3542
2467.5786
2628.2097
2802.6226
2982.5234
3151.7625
3320.6353
3504.9578
3683.8303
3830.5964
4013.2722
4186.206
4360.291
4526.3276
4679.39
4852.975
5023.179
5186.1763
5345.3916


KeyboardInterrupt: ignored

In [23]:
def translate(inputs):
  print(inp_lang.vector_to_sentence(inputs[0].numpy()))
  result = ''

  hidden = encoder.init_hidden_state(batch_size=1)
  enc_out, enc_hidden = encoder(inputs, hidden)
  dec_hidden = enc_hidden
  print(enc_out.shape, dec_hidden.shape)
  
  dec_input = [tar_lang.word2id['<start>']]
  for t in range(tar_lang.max_len):
      predictions, dec_hidden = decoder(dec_input, enc_out, dec_hidden)
      predicted_id = tf.argmax(predictions[0]).numpy()
      result += tar_lang.id2word[predicted_id] + ' '
      dec_input = [predicted_id]
  return result

In [24]:
for inp, tar in dataset.take(N_BATCH):
  print(translate(inp[1:2,:]))
  break

<start> and you can get moral argument off the ground  then  because you aposre not treating moral principles as concrete entities  <end> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
(1, 74, 1024) (1, 1024)
                                                                        
