<a href="https://colab.research.google.com/github/mizzmir/NLP/blob/master/Transformer/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
collab = False
if collab:
    !pip install tensorflow-gpu --quiet
    !git clone https://github.com/mizzmir/NLP.git

In [2]:
import os
import sys
import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer

# Pick the repository layout: the clone made on Colab, or the parent directory
# for a local checkout. The utilities folder must be on sys.path before the
# `utils` / `model` imports below can be resolved.
sys.path.insert(0, "./NLP/utilities" if collab else "../utilities")
data_dir = "./NLP/data" if collab else "../data"

from utils import *
from model import *

In [3]:
# Smoke test: push a batch of random token ids through a small Encoder and
# print the shape of what comes out.
data = tf.random.uniform(shape=(64, 62), minval=0, maxval=200, dtype=tf.int64)
print("input shape ", data.shape)

padding_mask = makePaddingMask(data)
encoder = Encoder(
    vocab_size=3000,
    max_sentence_len=1000,
    embedding_size=10,
    heads_number=5,
    blocks_amount=3,
    dff=2048,
)
encoder_out = encoder(data, mask=padding_mask)
print(encoder_out.shape)

input shape  (64, 62)
(64, 62, 10)


In [4]:
# Smoke test: feed `encoder_out` (produced by the encoder cell above) together
# with a dummy target batch through a small Decoder and print the output shape.
input_data = np.ones((64, 26))
# NOTE(review): `mask` is built here but the decoder call below passes
# elements_mask=None, so it is never used - confirm whether that is intended.
mask = makeSequenceMask(input_data.shape[1])
print("Decoder input shape ", input_data.shape)
# NOTE(review): the four values below are not referenced by the Decoder
# constructor, which hard-codes its own settings (e.g. blocks_amount=3 vs. 2
# here). `en_vocab_size` / `fr_vocab_size` are re-assigned from the tokenizers
# later in the notebook.
blocks_amount = 2
heads = 5
en_vocab_size = 100
fr_vocab_size = 200
decoder = Decoder(embedding_size=10,
                  max_sentence_len=1000,
                  vocab_size=100,
                  blocks_amount=3,
                  heads_number=5, 
                  dff=2048)
# The bare string below is a no-op expression used as a block comment.
"""
decoder masks are :
- encoder_padding_mask - padding mask made on encoder input data
- decoder sequences mask - sequence mask made on decoder input data
"""
# Both masks are disabled (None) for this shape check.
decoder_out  = decoder(encoder_out, input_data, pad_mask=None, elements_mask=None)
print("decoder_out ", decoder_out.shape)

Decoder input shape  (64, 26)
decoder_out  (64, 26, 10)


In [5]:
# Smoke test for the full model: random source/target batches go in, and the
# printed output shape should end with the output vocabulary size.
transformer_model = Transformer(
    input_vocab_size=4980,
    output_vocab_size=7001,
    input_max_seq_length=2000,
    output_max_seq_length=1855,
    embedding_size=512,
    dff=2048,
    heads=8,
    encoder_blocks=4,
    decoder_blocks=2,
)

# Random token ids standing in for a source batch and a target batch.
input_data = tf.random.uniform(shape=(64, 52), minval=0, maxval=100, dtype=tf.int64)
output_data = tf.random.uniform(shape=(64, 29), minval=0, maxval=250, dtype=tf.int64)

# Padding mask comes from the source batch, sequence mask from the target length.
encoder_pad_mask = makePaddingMask(input_data)
elements_mask = makeSequenceMask(output_data.shape[1])
print("output_data ", output_data.shape)
print("elements_mask ", elements_mask.shape)

transformer_output = transformer_model(input_data, output_data, encoder_pad_mask, elements_mask)
print(transformer_output.shape)

output_data  (64, 29)
elements_mask  (29, 29)
(64, 29, 7001)


In [6]:
class customLearningRate(tf.keras.optimizers.schedules.LearningRateSchedule):
  """
  Learning-rate schedule from "Attention Is All You Need"
  (https://arxiv.org/pdf/1706.03762.pdf, section 5.3 "Optimizer"):

      lrate = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)

  The rate grows linearly during the first `warmup_steps` steps and afterwards
  decays proportionally to the inverse square root of the step number.

  Args:
    warmup_steps: number of warm-up steps (4000 in the paper).
    d_model: dimensionality of the model.
  """
  def __init__(self, warmup_steps, d_model):
    super(customLearningRate, self).__init__()
    # Keep the raw argument so that get_config() can return a plain value.
    self._d_model_value = d_model
    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    """Return the learning rate for optimizer step `step` as a float32 tensor."""
    # The optimizer may hand over an integer (or non-float32) step counter;
    # rsqrt only accepts floats and d_model is float32, so normalise the dtype.
    step = tf.cast(step, tf.float32)
    warmup_steps = tf.cast(self.warmup_steps, tf.float32)
    firstScheduler = tf.math.rsqrt(step)
    secondScheduler = step * (warmup_steps ** -1.5)
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(firstScheduler, secondScheduler)

  def get_config(self):
    """Return the constructor arguments so the schedule can be serialized."""
    return {"warmup_steps": self.warmup_steps, "d_model": self._d_model_value}

In [7]:
# --- Training hyperparameters ------------------------------------------------
# The trailing numbers on the model settings are presumably smaller
# alternatives that were tried earlier - TODO confirm.

# Model size
d_model = 512      # 128
dff = 2048         # 512
num_layers = 6     # 4
num_heads = 8

# Optimisation loop
EPOCHS = 1000
BATCH_SIZE = 64    # per replica; multiplied by the replica count further down

In [8]:
# reading data

# Alternative (smaller) corpus:
#en_lines, fr_lines = read_data_files(data_dir, ("small_vocab_en", "small_vocab_fr"))

# Fixed seed for the train/test split. Without it every run produces a different
# split, so a model restored from a checkpoint would be evaluated on sentences
# it was trained on in a previous run.
SPLIT_SEED = 42

data = read_data(os.path.join(data_dir, "fra-eng"), "fra.txt")
# `data` is unpacked into the English and the French side of each pair.
en_lines, fr_lines = list(zip(*data))

# To experiment on a subset, truncate both lists here, e.g.:
#   en_lines = en_lines[:30000]
#   fr_lines = fr_lines[:30000]

en_lines = [normalize(line) for line in en_lines]
fr_lines = [normalize(line) for line in fr_lines]

en_train, en_test, fr_train, fr_test = train_test_split(
    en_lines, fr_lines, shuffle=True, test_size=0.1, random_state=SPLIT_SEED)

# Decoder input starts with <start>; the expected decoder output is the same
# sentence terminated by <end>.
fr_train_in = ['<start> ' + line for line in fr_train]
fr_train_out = [line + ' <end>' for line in fr_train]

fr_test_in = ['<start> ' + line for line in fr_test]
fr_test_out = [line + ' <end>' for line in fr_test]

reading data from  ../data/fra-eng/fra.txt


In [9]:
# filters='' turns off the Tokenizer's default punctuation filtering, which
# would otherwise strip the '<' and '>' of the <start>/<end> markers.
fr_tokenizer = Tokenizer(filters='')
en_tokenizer = Tokenizer(filters='')

# NOTE(review): `tokenizeInput` comes from the star imports; it presumably fits
# the tokenizer on the given lists and returns them as id sequences in the same
# order - confirm in the utilities module.
# The text variables are rebound to their tokenized versions here, so this cell
# is not idempotent: re-run the data-loading cell before re-running this one.
input_data = [fr_train_in, fr_train_out, fr_test_in, fr_test_out, fr_test, fr_train]
fr_train_in, fr_train_out, fr_test_in, fr_test_out, fr_test, fr_train = tokenizeInput(input_data, fr_tokenizer)

input_data = [en_train, en_test]
en_train, en_test = tokenizeInput(input_data, en_tokenizer)

# +1 because Tokenizer word indices start at 1; index 0 is left for padding
# (the training loss masks out targets equal to 0).
en_vocab_size = len(en_tokenizer.word_index)+1
fr_vocab_size = len(fr_tokenizer.word_index)+1
print("en_vocab {}\nfr_vocab {}" .format(en_vocab_size, fr_vocab_size))

en_vocab 14086
fr_vocab 24855
end_tag 3


# Distributed training

In [10]:
# Synchronous data-parallel training across the devices visible to TensorFlow.
strategy = tf.distribute.MirroredStrategy()
replicas_num = strategy.num_replicas_in_sync
print("replicas number: ", replicas_num)

# Every replica processes BATCH_SIZE examples per step.
GLOBAL_BATCH_SIZE = replicas_num * BATCH_SIZE

replicas number:  4


In [11]:
def _shuffled_batches(tensors):
    """Slice `tensors` example-wise, reshuffle on every pass, and batch.

    The shuffle buffer covers the whole split and incomplete final batches are
    dropped, so every batch holds exactly GLOBAL_BATCH_SIZE examples.
    """
    num_examples = len(tensors[0])
    sliced = tf.data.Dataset.from_tensor_slices(tensors)
    shuffled = sliced.shuffle(num_examples, reshuffle_each_iteration=True)
    return shuffled.batch(GLOBAL_BATCH_SIZE, drop_remainder=True)


# Each element is (encoder input, decoder input, expected decoder output).
train_dataset = _shuffled_batches((en_train, fr_train_in, fr_train_out))
train_dataset_distr = strategy.experimental_distribute_dataset(train_dataset)

test_dataset = _shuffled_batches((en_test, fr_test_in, fr_test_out))
test_dataset_distr = strategy.experimental_distribute_dataset(test_dataset)

In [12]:
# distributed train

# Per-epoch history, consumed by the plotting cells at the end of the notebook.
test_losses = []
train_losses = []
train_accuracyVec = []
test_accuracyVec = []
# First epoch index of the training loop below.
starting_epoch = 0

with strategy.scope():
  # Metrics own variables, so they are created inside the strategy scope just
  # like the model and the optimizer.
  test_loss = tf.keras.metrics.Mean()
  test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
  train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

  custom_learning_rate = customLearningRate(warmup_steps=4000,
                                            d_model=d_model)

  optimizer = tf.keras.optimizers.Adam(learning_rate=custom_learning_rate,
                                      beta_1=0.9,
                                      beta_2=0.98,
                                      epsilon=1e-9)

  transformer_model = Transformer(embedding_size=d_model,
                                  dff=dff,
                                  input_max_seq_length=2000,
                                  output_max_seq_length=1855,
                                  input_vocab_size=en_vocab_size,
                                  output_vocab_size=fr_vocab_size,
                                  encoder_blocks=num_layers,
                                  decoder_blocks=num_layers,
                                  heads=num_heads)

  checkpoint_path = "./checkpoints/train"

  ckpt = tf.train.Checkpoint(transformer_model=transformer_model,
                           optimizer=optimizer)

  ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

  # if a checkpoint exists, restore the latest checkpoint.
  if ckpt_manager.latest_checkpoint:
      ckpt.restore(ckpt_manager.latest_checkpoint)
      print ('Latest checkpoint restored!!')

  # reduction="none": the reduction is done explicitly in loss_fn so that it
  # can be scaled by the global (all replicas) batch size.
  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction="none")

  # `Strategy.run` replaced `experimental_run_v2` in newer TensorFlow releases;
  # use whichever this installation provides.
  run_on_replicas = getattr(strategy, "run", None) or strategy.experimental_run_v2

  def loss_fn(logits, targets):
      """Cross-entropy over non-padding targets, scaled for distributed training.

      Args:
        logits: model output for the batch.
        targets: expected token ids; positions equal to 0 are treated as
          padding and get zero weight.

      Returns:
        Sum of the weighted losses divided by GLOBAL_BATCH_SIZE, so that summing
        the per-replica values gives the average over the global batch.
      """
      mask = tf.math.logical_not(tf.math.equal(targets, 0))
      mask = tf.cast(mask, tf.float32)
      per_example_loss = loss_object(targets, logits, sample_weight=mask)
      return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)

  def train_step(input_data, real_data_in, real_data_out):
      """Run one optimisation step on a single replica and return its loss."""
      encoder_pad_mask = makePaddingMask(input_data)
      elements_mask = makeSequenceMask(real_data_in.shape[1])
      with tf.GradientTape() as tape:
        predicted_data = transformer_model(
                                        input_data,
                                        real_data_in,
                                        encoder_pad_mask,
                                        elements_mask,
                                        training_enabled=True,
                                        training=True)
        loss = loss_fn(predicted_data, real_data_out)

      trainable_vars = transformer_model.trainable_variables
      grads = tape.gradient(loss, trainable_vars)
      optimizer.apply_gradients(zip(grads, trainable_vars))
      # NOTE(review): accuracy is computed over all positions, padding included.
      train_accuracy.update_state(real_data_out, predicted_data)
      return loss

  @tf.function
  def distributed_train_step(input_data, real_data_in, real_data_out):
      """Run train_step on every replica and sum the per-replica losses."""
      per_replica_losses = run_on_replicas(train_step,
                                           args=(input_data,
                                                 real_data_in,
                                                 real_data_out))
      return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

  def test_step(input_data, real_data_in, real_data_out):
      """Evaluate one batch on a single replica (no weight update)."""
      encoder_pad_mask = makePaddingMask(input_data)
      elements_mask = makeSequenceMask(real_data_in.shape[1])
      predicted_data = transformer_model(
                                          input_data,
                                          real_data_in,
                                          encoder_pad_mask,
                                          elements_mask,
                                          training_enabled=False,
                                          training=False)
      loss = loss_fn(predicted_data, real_data_out)

      test_accuracy.update_state(real_data_out, predicted_data)
      return loss

  @tf.function
  def distributed_test_step(input_data, real_data_in, real_data_out):
      """Run test_step on every replica and sum the per-replica losses."""
      per_replica_losses = run_on_replicas(test_step,
                                           args=(input_data,
                                                 real_data_in,
                                                 real_data_out))
      return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

  def predict(input_data, real_data_out):
      """Greedily translate one tokenized English sentence and print the result.

      Decoding starts from '<start>' and stops at '<end>' or after as many
      steps as the input sequence is long.

      Args:
        input_data: token ids of a single English sentence.
        real_data_out: token ids of the reference French sentence (only printed).
      """
      input_seq = en_tokenizer.sequences_to_texts([input_data])
      start_tag = fr_tokenizer.word_index['<start>']
      end_tag = fr_tokenizer.texts_to_sequences(['<end>'])[0][0]
      real_in = tf.expand_dims([start_tag], 0)
      input_data = tf.expand_dims(input_data, 0)
      # The source sentence is fixed, so its padding mask is built only once.
      encoder_pad_mask = makePaddingMask(input_data)
      output_seq = []
      for _ in range(input_data.shape[1]):
          elements_mask = makeSequenceMask(real_in.shape[1])
          # Inference: both training flags are off, as in test_step.
          predicted_data = transformer_model(input_data, real_in, encoder_pad_mask, elements_mask, training_enabled=False, training=False)
          # Keep only the prediction for the last position.
          predicted_data = tf.cast(tf.argmax(predicted_data[:, -1:, :], axis=-1), tf.int32)
          predicted_id = int(predicted_data.numpy()[0][0])
          if predicted_id == end_tag:
              break
          real_in = tf.concat([real_in, predicted_data], axis = -1)
          # Ids without a word (e.g. the padding id 0, which an untrained model
          # can emit) are shown as '<unk>' instead of raising KeyError.
          output_seq.append(fr_tokenizer.index_word.get(predicted_id, '<unk>'))
      print("           English   :", input_seq)
      print("           Predicted :", " ".join(output_seq))
      print("           Correct   :", fr_tokenizer.sequences_to_texts([real_data_out]))

  # Sample translation before any training in this run.
  idx = np.random.randint(low=0, high=len(en_test), size=1)[0]
  predict(en_test[idx], fr_test[idx])

  for epoch in range(starting_epoch, EPOCHS):
      total_loss = 0
      num_batches = 0
      test_loss.reset_states()
      test_accuracy.reset_states()
      train_accuracy.reset_states()

      # Loop variables are named *_data_* so they do not overwrite the
      # fr_train_out array the dataset was built from.
      for en_data, fr_data_in, fr_data_out in train_dataset_distr:
          loss = distributed_train_step(en_data, fr_data_in, fr_data_out)
          total_loss += loss
          num_batches += 1
      # Stored as plain floats so the history lists can be plotted directly.
      train_losses.append(float(total_loss/num_batches))

      total_loss = 0
      num_batches = 0
      for en_data, fr_data_in, fr_data_out in test_dataset_distr:
          loss = distributed_test_step(en_data, fr_data_in, fr_data_out)
          total_loss += loss
          num_batches += 1
      test_losses.append(float(total_loss/num_batches))

      train_accuracyVec.append(float(train_accuracy.result()))
      test_accuracyVec.append(float(test_accuracy.result()))

      print ('Epoch {} training Loss {:.4f} Accuracy {:.4f}  test Loss {:.4f} Accuracy {:.4f}' .format( \
                                                  epoch + 1,
                                                  train_losses[-1],
                                                  train_accuracyVec[-1],
                                                  test_losses[-1],
                                                  test_accuracyVec[-1]))

      # Sample translation after every epoch to follow progress qualitatively.
      idx = np.random.randint(low=0, high=len(en_test), size=1)[0]
      predict(en_test[idx], fr_test[idx])

      if (epoch + 1) % 10 == 0:
          ckpt_save_path = ckpt_manager.save()
          print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                         ckpt_save_path))

  ckpt_save_path = ckpt_manager.save()
  print ('Saving checkpoint for end at {}'.format(ckpt_save_path))

ValueError: `Checkpoint` was expecting a trackable object (an object derived from `TrackableBase`), got 0. If you believe this object should be trackable (i.e. it is part of the TensorFlow Python API and manages state), please open an issue.

In [None]:
import matplotlib.pyplot as plt

# Training vs. test loss per epoch; the figure is also written to losses_plot.png.
fig, fig_plot = plt.subplots()
fig_plot.plot(train_losses, label="train_loss")
fig_plot.plot(test_losses, label="test_loss")
fig_plot.set(xlabel="epoch", ylabel="loss")
fig_plot.grid(linestyle="--")
fig_plot.legend(loc="upper right")
fig.savefig("losses_plot.png")

In [None]:
# Training vs. test accuracy per epoch; the figure is also written to accuracy_plot.png.
fig, fig_plot = plt.subplots()
fig_plot.plot(train_accuracyVec, label="train_accuracy")
fig_plot.plot(test_accuracyVec, label="test_accuracy")
fig_plot.set(xlabel="epoch", ylabel="accuracy")
fig_plot.grid(linestyle="--")
fig_plot.legend(loc="lower right")
fig.savefig("accuracy_plot.png")