<a href="https://colab.research.google.com/github/mizzmir/NLP/blob/master/Transformer/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
collab = False
!pip install tensorflow-gpu --quiet
!git clone https://github.com/mizzmir/NLP.git
collab = True

fatal: destination path 'NLP' already exists and is not an empty directory.


In [0]:
import os
import sys
import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer

collab = True

if collab:
  sys.path.insert(0, r"./NLP/utilities")
  data_dir = "./NLP/data"
else:
  sys.path.insert(0, r"../utilities")
  data_dir = "../data"

from utils import *

In [0]:
class PositionalEncodingLayer(tf.keras.layers.Layer):
  def __init__(self, embedding_size, max_sentence_len, dtype=tf.float32, **kwargs):
    super(PositionalEncodingLayer, self).__init__(dtype, **kwargs)
    if embedding_size%2 !=0:
      embedding_size+=1
    # embeddiPong size -> depth of model
    # positional encoding should have size : [1, max_sentence_len, embedding_size]
    # 1 is here to make broadcasting possible in call method
    PE = np.zeros((1, max_sentence_len, embedding_size))
    # pos should have shape [1, max_sentence_len] with values <0, max_sentence_len)
    pos = np.arange(start=0, stop=max_sentence_len, step=1)
    pos = pos.reshape(max_sentence_len, 1)
    # i should have shappe [1, embedding_size//2] with values <0, embedding_size//2)
    # we need half of embedding size, because half is needed for each sin/cos 
    # then we put it together into PE and we have [1, max_sentence_len, embedding_size]
    i = np.arange(start=0, stop=embedding_size//2, step=1)
    i = i.reshape(embedding_size//2, 1).T
    PE_sin = np.sin(pos/10000**(2*i/embedding_size))
    PE_cos = np.cos(pos/10000**(2*i/embedding_size))
    # we put sin into even indexes ::2 
    # we put cos into odd indexes, thats why we`re starting from 1 here : 1::2
    PE[0, ::, ::2] = PE_sin
    PE[0, ::, 1::2] = PE_cos
    self.PE = tf.constant(PE, dtype=dtype)
  def getPE(self):
    """
    only for debuging purposes
    """
    return self.PE
  def call(self, inputs):
    """
    inputs shape should be same as self.PE shape
        
      input_shape = tf.shape(inputs)
      return inputs + self.PE[:, :input_shape[-2], :]

    It has to be that way becuase we need to be able to get positional encoding for different lenght 
    for encoder and decoder, when we don`t know max lenght. SO we have to do encoding with bigger buffer
    and take what we need only.

    max_sentence_len in should be bigger or equal as longest input we predict we can get
    """

    input_shape = tf.shape(inputs)
    return inputs + self.PE[:, :input_shape[-2], :input_shape[-1]]

In [0]:
class MultiHeadAttentionLayer(tf.keras.layers.Layer):
  def __init__(self, embedding_size, heads_number, dtype=tf.float32, **kwargs):
    super(MultiHeadAttentionLayer, self).__init__(dtype=tf.float32, **kwargs)
    """
    return shape : [batch_size, sequence_len, d_model]
    heads_number - tell how many heads will be processed at same time
    d_model - model size ; equal to embedding_size
    """
    self.heads_number = heads_number
    self.d_model = embedding_size
    self.w_q = tf.keras.layers.Dense(self.d_model)
    self.w_k = tf.keras.layers.Dense(self.d_model)
    self.w_v = tf.keras.layers.Dense(self.d_model)

    self.outputLayer = tf.keras.layers.Dense(self.d_model)

  # similar to dot attention but with scaling added
  def ScaledDotProductAttention(self, q, k, v, sequence_mask):
    """
    q shape [batch_size, num_heads, q_seq_len, depth_q]
    k shape [batch_size, num_heads, k_seq_len, depth_k]
    v shape [batch_size, num_heads, v_seq_len, depth_v]

    output contex shape [batch_size, num_heads, q_seq_len, depth_v]
    """
    # matmul(q,k,v)
    # resultion shape [batch_size, num_heads, q_seq_len, k_seq_len]
    qk_matmul = tf.matmul(q, k, transpose_b=True)
    # scaling tf.cast is needed here because tf.sqrt needs float32 type
    # score shape [batch_size, num_heads, q_seq_len, k_seq_len]
    score = qk_matmul/tf.math.sqrt(tf.cast(k.shape[-1], dtype=tf.float32))
    # optional mask
    # mask should be shape [batch_size, num_heads, q_seq_len, k_seq_len]
    # for example [
    #             [0, 1, 1]
    #             [0, 0, 1]
    #             ] shape == (2, 3)
    # we`re adding big negative number, because we only care about present/past words that are przedicted
    if sequence_mask is not None:
      #print(" mask is not none")
      #print("sequence_mask shape {}\nscore shape {}" .format(sequence_mask.shape, score.shape))
      score += sequence_mask*-1e9
    # softmax
    # attention_weights shape [batch_size, num_heads, q_seq_len, k_seq_len]
    attention_weights = tf.nn.softmax(score, axis=-1)
    # matmul(res, V)
    # contex shape [batch_size, num_heads, q_seq_len, depth_v]
    context = tf.matmul(attention_weights, v)
    return context

  def splitHeads(self, data):
    # new shape [batch_size, sequence_len, heads_number, d_model//heads_number]
    data = tf.reshape(data, (data.shape[0], data.shape[1], self.heads_number, data.shape[-1]//self.heads_number))
    # transpose dimentions to [batch_size, heads_number, sequence_len, d_model//heads_number]
    return tf.transpose(data, perm=[0,2,1,3])

  def call(self, q, k, v, sequence_mask):
    """
    q shape [batch_size, sequence_len, d_model]
    k shape [batch_size, sequence_len, d_model]
    v shape [batch_size, sequence_len, d_model]

    after first operations shapes are the same
    next we have to split d_model into heads_number of subbatches
    new shape after reshape only should be : [batch_size, sequence_len, heads_number, d_model//heads_number]
    next shape should be transposed to : [batch_size, heads_number, sequence_len, d_model//heads_number]
    where :
      new_d_model = d_model/heads_number
    
    next make scaled dot-product attention on resulting q,k,v

    next concat returning data to get shape : [batch_size, sequence_len, d_model]
    in order to do this we have to transpose context_vector to get [batch_size, sequence_len, heads_number, d_model//heads_number]

    next put it throug dense layer (d_model) in order to get output
    """
    #print("q shape {}\nk shape {}\n v shape {}" .format(q.shape, k.shape, v.shape))
    q = self.w_q(q)
    k = self.w_k(k)
    v = self.w_v(v)
    #print("AFTER Dense\n  q shape {}\n  k shape {}\n  v shape {}" .format(q.shape, k.shape, v.shape))

    q = self.splitHeads(q)
    k = self.splitHeads(k)
    v = self.splitHeads(v)
    #print("AFTER SPLIT\n  q shape {}\n  k shape {}\n  v shape {}" .format(q.shape, k.shape, v.shape))

    context_vector = self.ScaledDotProductAttention(q, k, v, sequence_mask)
    #print("context_vector shape :", context_vector.shape)

    context_vector = tf.transpose(context_vector, perm=[0,2,1,3])
    #print("context_vector  transposed shape :", context_vector.shape)
    context_vector = tf.reshape(context_vector, (context_vector.shape[0], context_vector.shape[1], self.d_model))
    #print("context_vector  reshapeed shape :", context_vector.shape)

    return self.outputLayer(context_vector)

embed_size = 10; max_steps = 3; vocab_size = 100

q = tf.random.uniform((1, max_steps, embed_size))  # shape [batch_size, sequence_len, embedding_size]
mhatt = MultiHeadAttentionLayer(embed_size, 5)
mhatt_output = mhatt(q, k=q, v=q, sequence_mask=None)

In [0]:
def feedForwardnetwork(dff, d_model):
  """
  according to paper dff=2048 and d_model =512
  but d_model should be same as embedding_size/d_model in MultiHeadAttention
  ffn(x) = max(0, xW_1 + b+1)W_2 + b_2
  where max(0, ...) -> relu activation
  """
  ffNetwork = tf.keras.Sequential()
  ffNetwork.add(tf.keras.layers.Dense(dff, activation="relu"))
  ffNetwork.add(tf.keras.layers.Dense(d_model))
  return ffNetwork

def makeSequenceMask(seq_len):
  """
  mask should be size [1, 1, seq_len, seq_len]
  first two sizes are batch_szie, num_heads to make this matrix broadcastable
  it should be in form 
  [
    [0, 1, 1, 1]
    [0, 0, 1, 1]
    [0, 0, 0, 1]
    [0, 0, 0, 0]
  ]
  
  mask_array = np.ones((seq_len, seq_len))
  mask_array = np.triu(mask_array, 1)
  return tf.constant(mask_array, dtype=tf.float32)
  """
  mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
  return mask  # (seq_len, seq_len)

def makePaddingMask(sequence):
  mask = tf.math.equal(sequence, 0)
  mask =  tf.cast(mask, tf.float32)
  return mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [0]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, embedding_size, heads_number, dff, dtype=tf.float32, **kwargs):
    super(EncoderLayer, self).__init__(dtype, **kwargs)

    self.d_model = embedding_size
    self.multiHeadAttention = MultiHeadAttentionLayer(embedding_size, heads_number)

    self.normalizationFirst = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.normalizationSecond = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropoutFirst = tf.keras.layers.Dropout(0.2)
    self.dropoutSecond = tf.keras.layers.Dropout(0.2)

    self.ffNetwork = feedForwardnetwork(dff, self.d_model)

  def call(self, encoder_input, mask, training_enabled):
    # shortcut_data shape [batch_size, max_sentence_len, embedding_size]
    shortcut_data = encoder_input

    # mhatt_output shape [batch_size, max_sentence_len, embedding_size]
    mhatt_output = self.multiHeadAttention(encoder_input, encoder_input, encoder_input, mask)
    mhatt_output = self.dropoutFirst(mhatt_output, training=training_enabled)
    mhatt_output += shortcut_data
    mhatt_output = self.normalizationFirst(mhatt_output)

    shortcut_data = mhatt_output

    ffNet_output = self.ffNetwork(mhatt_output)
    ffNet_output = self.dropoutSecond(ffNet_output, training=training_enabled)
    ffNet_output += shortcut_data
    ffNet_output = self.normalizationSecond(ffNet_output)

    return ffNet_output

class Encoder(tf.keras.Model):
  """
  Encoder flow :

  - Embedding 
  - Positional Encoding
  - Input = Embedding + Positional Encoding
  --------------------REPEAT N Times--------------------
  - Multi-head Attention layer
  - Input + Multi-Head Attention layer added together 
  - previous Normalized (1)
  - Feed Forward Network (2)
  - (1) added to (2) and Normmalized
  ------------------------------------------------------
  - Encoder output 
  """
  def __init__(self, embedding_size, max_sentence_len, vocab_size, blocks_amount, heads_number, dff):
    super(Encoder, self).__init__()

    assert (embedding_size//heads_number)%2==0
    self.blocks_amount = blocks_amount
    self.d_model = embedding_size
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
    self.positionalEncoding = PositionalEncodingLayer(embedding_size, max_sentence_len)

    self.encoderBlocks = [EncoderLayer(embedding_size, heads_number, dff) for _ in range(blocks_amount)]
  
  def call(self, encoder_input, mask, training_enabled=False):
    # sequence shape [batch_size, max_sentence_len]
    embedded_seq = self.embedding(encoder_input)
    # according to paper https://arxiv.org/pdf/1706.03762.pdf
    # embedding is multiplied by sqrt(d_model). Point 3.4
    embedded_seq*=tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    # embedded_seq shape [batch_szie, max_sentence_len, embedding_size]
    data = self.positionalEncoding(embedded_seq)
    #------------------------- loop though all blocks -------------------------
    for i in range(self.blocks_amount):
      #print("               BLOCK ", i+1)
      data = self.encoderBlocks[i](data, mask, training_enabled) 

    return data

In [7]:
data = tf.random.uniform((64, 62), dtype=tf.int64, minval=0, maxval=200)
print("input shape ", data.shape)
padding_mask = makePaddingMask(data)

encoder = Encoder(embedding_size=10,
                  max_sentence_len=1000,
                  vocab_size=3000,
                  blocks_amount=3,
                  heads_number=5, 
                  dff=2048)
encoder_out  = encoder(data, mask=padding_mask)
print(encoder_out.shape)

input shape  (64, 62)
(64, 62, 10)


In [0]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, embedding_size, heads_number, dff, dtype=tf.float32, **kwargs):
    super(DecoderLayer, self).__init__(dtype, **kwargs)

    self.d_model = embedding_size
    self.multiHeadAttentionFirst = MultiHeadAttentionLayer(embedding_size, heads_number)
    self.multiHeadAttentionSecond = MultiHeadAttentionLayer(embedding_size, heads_number)

    self.normalizationFirst = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.normalizationSecond = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.normalizationThird = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropoutFirst = tf.keras.layers.Dropout(0.2)
    self.dropoutSecond = tf.keras.layers.Dropout(0.2)
    self.dropoutThird = tf.keras.layers.Dropout(0.2)

    self.ffNetwork = feedForwardnetwork(dff, self.d_model)

  def call(self, decoder_input, encoder_output, pad_mask, elements_mask, training_enabled):
    # shortcut_data shape [batch_szie, max_sentence_len, embedding_size]
    shortcut_data = decoder_input
      
    # mhatt_output shape [batch_size, max_sentence_len, embedding_size]
    mhatt_output = self.multiHeadAttentionFirst(decoder_input, decoder_input, decoder_input, elements_mask)
    mhatt_output = self.dropoutFirst(mhatt_output, training=training_enabled)
    # add & Norm
    mhatt_output += shortcut_data
    mhatt_output = self.normalizationFirst(mhatt_output)

    shortcut_data = mhatt_output
    #print("decoder_input ", decoder_input.shape)
    #print("encoder_output ", encoder_output.shape)
    #print("mhatt_output ", mhatt_output.shape)
    mhatt_output2 = self.multiHeadAttentionSecond(mhatt_output, encoder_output, encoder_output, pad_mask)
    mhatt_output2 = self.dropoutSecond(mhatt_output2, training=training_enabled)
    mhatt_output2 += shortcut_data
    mhatt_output2 = self.normalizationSecond(mhatt_output2)

    shortcut_data = mhatt_output2
    ffn_output = self.ffNetwork(mhatt_output2)
    ffn_output = self.dropoutThird(ffn_output, training=training_enabled)
    ffn_output += shortcut_data
    ffNet_output = self.normalizationThird(ffn_output)

    return ffNet_output

class Decoder(tf.keras.models.Model):
  """
  Decoder flow :

  - Embedding 
  - Positional Encoding
  - Input = Embedding + Positional Encoding
  --------------------REPEAT N Times--------------------
  - Masked Multi-head Attention layer with elements_mask
  - Input + Masked Multi-Head Attention layer added together 
  - previous Normalized (1) 
  - Multi-head Attention layer v, k from Encoder output | q from previous point with padding mask
  - (1) + Multi-head Attention layer added together
  - previous normalized
  - Feed Forward Network (2)
  - (1) added to (2) and Normalized
  ------------------------------------------------------
  - Decoder output

  decoder masks are :
    - encoder_padding_mask - padding mask made on encoder input data
    - decoder sequences mask - sequence mask made on decoder input data
  """
  def __init__(self, embedding_size, max_sentence_len, vocab_size, blocks_amount, heads_number, dff):
    super(Decoder, self).__init__()

    assert (embedding_size//heads_number)%2==0
    self.blocks_amount = blocks_amount
    self.d_model = embedding_size
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
    self.positionalEncoding = PositionalEncodingLayer(embedding_size, max_sentence_len)
    self.dropout = tf.keras.layers.Dropout(0.1)

    self.decoderBlocks = [DecoderLayer(embedding_size, heads_number, dff) for _ in range(blocks_amount)]

  def call(self, encoder_output, decoder_input, pad_mask, elements_mask, training_enabled=False):

    # sequence shape [batch_size, max_sentence_len]
    embedded_seq = self.embedding(decoder_input)
    # according to paper https://arxiv.org/pdf/1706.03762.pdf
    # embedding is multiplied by sqrt(d_model). Point 3.4
    embedded_seq*=tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    # embedded_seq shape [batch_szie, max_sentence_len, embedding_size]
    data = self.positionalEncoding(embedded_seq)
    data = self.dropout(data)
    #------------------------- loop though all blocks -------------------------
    for i in range(self.blocks_amount):
      #print("               BLOCK ", i+1)
      data = self.decoderBlocks[i](data, encoder_output, pad_mask, elements_mask, training_enabled)

    return data

In [9]:
input_data = np.ones((64, 26))
mask = makeSequenceMask(input_data.shape[1])
print("Decoder input shape ", input_data.shape)
blocks_amount = 2
heads = 5
en_vocab_size = 100
fr_vocab_size = 200
decoder = Decoder(embedding_size=10,
                  max_sentence_len=1000,
                  vocab_size=100,
                  blocks_amount=3,
                  heads_number=5, 
                  dff=2048)
"""
decoder masks are :
- encoder_padding_mask - padding mask made on encoder input data
- decoder sequences mask - sequence mask made on decoder input data
"""
decoder_out  = decoder(encoder_out, input_data, pad_mask=None, elements_mask=None)
print("decoder_out ", decoder_out.shape)

Decoder input shape  (64, 26)
decoder_out  (64, 26, 10)


In [0]:
class Transformer(tf.keras.models.Model):
  """
  Transformer flow:

  - Encoder
  - Decoder
  - Dense

   transformer_out shape = [batch_size, output_seq_len, output_vocab_size]
   default trainng_enabled == False
  """
  def __init__(self,
               embedding_size,
               dff,
               input_max_seq_length,
               output_max_seq_length,
               input_vocab_size,
               output_vocab_size,
               encoder_blocks,
               decoder_blocks,
               heads):
    super(Transformer, self).__init__()

    self.encoder = Encoder(embedding_size, input_max_seq_length, input_vocab_size, encoder_blocks, heads, dff)
    self.decoder = Decoder(embedding_size, output_max_seq_length, output_vocab_size, decoder_blocks, heads, dff)

    self.dense = tf.keras.layers.Dense(output_vocab_size)

  def call(self, input_seq, output_seq, pad_mask, words_mask, training_enabled=False):
    
    encoder_out = self.encoder(input_seq, mask=pad_mask, training_enabled=training_enabled)
    decoder_out = self.decoder(encoder_out, output_seq, pad_mask=pad_mask, elements_mask=words_mask, training_enabled=training_enabled)

    transformer_out = self.dense(decoder_out)
    return transformer_out

In [11]:
transformer_model = Transformer(embedding_size=512,
                                dff=2048,
                                input_max_seq_length=2000,
                                output_max_seq_length=1855,
                                input_vocab_size=4980,
                                output_vocab_size=7001,
                                encoder_blocks=4,
                                decoder_blocks=2,
                                heads=8)

# input_data and output_data
input_data = tf.random.uniform((64, 52), dtype=tf.int64, minval=0, maxval=100)
output_data = tf.random.uniform((64, 29), dtype=tf.int64, minval=0, maxval=250)

encoder_pad_mask = makePaddingMask(input_data)
elements_mask = makeSequenceMask(output_data.shape[1])
print("output_data ", output_data.shape)
print("elements_mask ", elements_mask.shape)
transformer_output = transformer_model(input_data, output_data, encoder_pad_mask, elements_mask)
print(transformer_output.shape)

output_data  (64, 29)
elements_mask  (29, 29)
(64, 29, 7001)


In [0]:
class customLearningRate(tf.keras.optimizers.schedules.LearningRateSchedule):
  """
  according to Attention is all you need paper learning rate has custom scheduler:
  there are two parameters : 
  - d_model
  - warmup_steps ( in paper set to 4000)
  according to paper https://arxiv.org/pdf/1706.03762.pdf
  point 5.3 Optimizer
  """
  def __init__(self, warmup_steps, d_model):
    super(customLearningRate, self).__init__()
    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps
  
  def __call__(self, step):
    firstScheduler = tf.math.rsqrt(step)
    secondScheduler = step*(self.warmup_steps**-1.5)
    return tf.math.rsqrt(self.d_model)*tf.math.minimum(firstScheduler, secondScheduler)

In [0]:
BATCH_SIZE = 64
EPOCHS = 60
num_layers = 4 # 6
d_model = 128 # 512
dff = 512  # 2048
num_heads = 8 

In [14]:
# reading data

en_lines, fr_lines = read_data_files(data_dir, ("small_vocab_en", "small_vocab_fr"))
"""
data = read_data(os.path.join(data_dir, "fra-eng"), "fra.txt")
en_lines, fr_lines = list(zip(*data))

en_lines = en_lines[:30000]
fr_lines = fr_lines[:30000]
"""
en_lines = [normalize(line) for line in en_lines]
fr_lines = [normalize(line) for line in fr_lines]

en_train, en_test, fr_train, fr_test = train_test_split(en_lines, fr_lines, shuffle=True, test_size=0.1)

fr_train_in = ['<start> ' + line for line in fr_train]
fr_train_out = [line + ' <end>' for line in fr_train]

fr_test_in = ['<start> ' + line for line in fr_test]
fr_test_out = [line + ' <end>' for line in fr_test]

reading data from  ./NLP/data/small_vocab_en
reading data from  ./NLP/data/small_vocab_fr


In [15]:
fr_tokenizer = Tokenizer(filters='')
en_tokenizer = Tokenizer(filters='')

input_data = [fr_train_in, fr_train_out, fr_test_in, fr_test_out, fr_test, fr_train]
fr_train_in, fr_train_out, fr_test_in, fr_test_out, fr_test, fr_train = tokenizeInput(input_data, fr_tokenizer)

input_data = [en_train, en_test]
en_train, en_test = tokenizeInput(input_data, en_tokenizer)

en_vocab_size = len(en_tokenizer.word_index)+1
fr_vocab_size = len(fr_tokenizer.word_index)+1
print("en_vocab {}\nfr_vocab {}" .format(en_vocab_size, fr_vocab_size))
print("end_tag", fr_tokenizer.texts_to_sequences(['<end>'])[0][0])

en_vocab 203
fr_vocab 336
end_tag 10


# New Section

In [0]:
train_dataset = tf.data.Dataset.from_tensor_slices((en_train, fr_train_in, fr_train_out))
train_dataset = train_dataset.shuffle(len(en_train), reshuffle_each_iteration=True)\
                                 .batch(BATCH_SIZE, drop_remainder=True)

test_dataset = tf.data.Dataset.from_tensor_slices((en_test, fr_test_in, fr_test_out))
test_dataset = test_dataset.shuffle(len(en_test), reshuffle_each_iteration=True)\
                               .batch(BATCH_SIZE, drop_remainder=True)

In [17]:
custom_learning_rate = customLearningRate(warmup_steps=4000,
                                            d_model=d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate=custom_learning_rate,
                                    beta_1=0.9,
                                    beta_2=0.98,
                                    epsilon=1e-9)

transformer_model = Transformer(embedding_size=d_model,
                                dff=dff,
                                input_max_seq_length=2000,
                                output_max_seq_length=1855,
                                input_vocab_size=en_vocab_size,
                                output_vocab_size=fr_vocab_size,
                                encoder_blocks=num_layers,
                                decoder_blocks=num_layers,
                                heads=num_heads)
test_losses = []
train_losses = []
train_accuracyVec = []
test_accuracyVec =[]
test_loss = tf.keras.metrics.Mean()
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

training_loss = tf.keras.metrics.Mean()
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_fn(predicted, targets):
    mask = tf.math.logical_not(tf.math.equal(targets, 0))
    loss = loss_object(targets, predicted)
    mask = tf.cast(mask, dtype=loss.dtype)
    loss *=mask
    return tf.reduce_mean(loss)    

@tf.function
def train_step(input_data, real_data_in, real_data_out):
    encoder_pad_mask = makePaddingMask(input_data)
    elements_mask = makeSequenceMask(real_data_in.shape[1])

    with tf.GradientTape() as tape:
        predicted_data = transformer_model(input_data, real_data_in, encoder_pad_mask, elements_mask, training_enabled=True, training=True)
        
        loss = loss_fn(predicted_data, real_data_out)
  
    trainable_vars = transformer_model.trainable_variables
    grads = tape.gradient(loss, trainable_vars)
    optimizer.apply_gradients(zip(grads, trainable_vars))
    train_accuracy(real_data_out, predicted_data)
    training_loss(loss)

@tf.function
def test_step(input_data, real_data_in, real_data_out):
    loss=0
    encoder_pad_mask = makePaddingMask(input_data)
    elements_mask = makeSequenceMask(real_data_in.shape[1])
    predicted_data = transformer_model(input_data, real_data_in, encoder_pad_mask, elements_mask, training_enabled=False, training=False)
        
    loss = loss_fn(predicted_data, real_data_out)
  
    test_accuracy(real_data_out, predicted_data)
    test_loss(loss)

def predict(input_data, real_data_out):
    output_seq = []
    input_seq = en_tokenizer.sequences_to_texts([input_data])
    real_in = [fr_tokenizer.word_index['<start>']]
    real_in = tf.expand_dims(real_in, 0)
    end_tag = fr_tokenizer.texts_to_sequences(['<end>'])[0][0]
    input_data = tf.expand_dims(input_data, 0)
    for _ in range(input_data.shape[1]):
        encoder_pad_mask = makePaddingMask(input_data)
        elements_mask = makeSequenceMask(real_in.shape[1])
        predicted_data = transformer_model(input_data, real_in, encoder_pad_mask, elements_mask, training_enabled=False, training=True)
        predicted_data = tf.cast(tf.argmax(predicted_data[:, -1:, :], axis=-1), tf.int32)
        if predicted_data.numpy()[0][0] == end_tag:
            break
        real_in = tf.concat([real_in, predicted_data], axis = -1)
        output_seq.append(fr_tokenizer.index_word[predicted_data.numpy()[0][0]])  
    print("English   :", input_seq)
    print("Predicted :", " ".join(output_seq))
    print("Correct   :", fr_tokenizer.sequences_to_texts([real_data_out]))
    
for epoch in range(EPOCHS):
    training_loss.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()
    train_accuracy.reset_states()
  
    for batch, (en_data, fr_data_in, fr_train_out) in enumerate(train_dataset):
        train_step(en_data, fr_data_in, fr_train_out)
        #if batch != 0 and (batch%500 == 0):
        #  print("   Epoch {} batch {} loss {:.4f} accuracy {:.4f}" .format(epoch+1, batch, training_loss.result(), train_accuracy.result()))
    for _, (en_data, fr_data_in, fr_data_out) in enumerate(test_dataset):
        test_step(en_data, fr_data_in, fr_data_out)
        
    print ('Epoch {} training Loss {:.4f} Accuracy {:.4f}  test Loss {:.4f} Accuracy {:.4f}' .format( \
                                                epoch + 1, 
                                                training_loss.result(), 
                                                train_accuracy.result(),
                                                test_loss.result(),
                                                test_accuracy.result()))
    test_losses.append(test_loss.result())
    train_losses.append(training_loss.result())
    train_accuracyVec.append(train_accuracy.result())
    test_accuracyVec.append(test_accuracy.result())

    idx = np.random.randint(low=0, high=len(en_test), size=1)[0]
    predict(en_test[idx], fr_test[idx])

Epoch 1 training Loss 0.7626 Accuracy 0.4663  test Loss 0.0505 Accuracy 0.6728
English   : ['would he like to go the grocery store ?']
Predicted : pourquoi aller a l ecole ?
Correct   : ['voudrait - il aller a l epicerie ?']
Epoch 2 training Loss 0.0490 Accuracy 0.6136  test Loss 0.0349 Accuracy 0.6782
English   : ['india is usually beautiful during august but it is sometimes hot in february .']
Predicted : l inde est generalement beau au mois d aout mais il est parfois chaud en fevrier
Correct   : ['l inde est generalement beau au mois d aout mais il est parfois chaud en fevrier .']
Epoch 3 training Loss 0.0338 Accuracy 0.6183  test Loss 0.0243 Accuracy 0.6819
English   : ['california is usually busy during june but it is sometimes quiet in december .']
Predicted : californie est generalement occupe en juin mais il est parfois calme en decembre .
Correct   : ['californie est generalement occupe en juin mais il est parfois calme en decembre .']
Epoch 4 training Loss 0.0237 Accuracy 0.6

KeyboardInterrupt: ignored

In [0]:
fig = plt.figure()
fig_plot = fig.add_subplot()
fig_plot.plot(train_losses, label="train_loss")
fig_plot.plot(test_losses, label="test_loss")
fig_plot.legend(location="lower left")
fig_plot.set_xlabel("epoch")
fig_plot.set_ylabel("loss")
fig_plot.grid(linestyle="--")
fig.savefig("losses_plot.png")

In [0]:
fig = plt.figure()
fig_plot = fig.add_subplot()
fig_plot.plot(train_accuracyVec, label="train_accuracy")
fig_plot.plot(test_accuracyVec, label="test_accuracy")
fig_plot.legend(location="lower left")
fig_plot.set_xlabel("epoch")
fig_plot.set_ylabel("accuracy")
fig_plot.grid(linestyle="--")
fig.savefig("accuracy_plot.png")


In [0]:
strategy = tf.distribute.MirroredStrategy()

replicas_num = strategy.num_replicas_in_sync
GLOBAL_BATCH_SIZE = BATCH_SIZE*replicas_num
print("replicas number: ", replicas_num)

In [0]:
train_dataset = tf.data.Dataset.from_tensor_slices((en_train, fr_train_in, fr_train_out))
train_dataset = train_dataset.shuffle(len(en_train), reshuffle_each_iteration=True)\
                                 .batch(GLOBAL_BATCH_SIZE, drop_remainder=True)
train_dataset_distr = strategy.experimental_distribute_dataset(train_dataset)
                                 
test_dataset = tf.data.Dataset.from_tensor_slices((en_test, fr_test_in, fr_test_out))
test_dataset = test_dataset.shuffle(len(en_test), reshuffle_each_iteration=True)\
                               .batch(GLOBAL_BATCH_SIZE, drop_remainder=True)
test_dataset_distr = strategy.experimental_distribute_dataset(test_dataset)

In [0]:
# distributed train

test_losses = []
train_losses = []
train_accuracyVec = []
test_accuracyVec =[]
test_loss = tf.keras.metrics.Mean()
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

with strategy.scope():
  custom_learning_rate = customLearningRate(warmup_steps=4000,
                                            d_model=512)

  optimizer = tf.keras.optimizers.Adam(learning_rate=custom_learning_rate,
                                      beta_1=0.9,
                                      beta_2=0.98,
                                      epsilon=1e-9)

  transformer_model = Transformer(embedding_size=d_model,
                                  dff=dff,
                                  input_max_seq_length=2000,
                                  output_max_seq_length=1855,
                                  input_vocab_size=en_vocab_size,
                                  output_vocab_size=fr_vocab_size,
                                  encoder_blocks=num_layers,
                                  decoder_blocks=num_layers,
                                  heads=num_heads)

  loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True, reduction="none")

  def loss_fn(real, targets):
      mask = tf.math.logical_not(tf.math.equal(targets, 0))
      mask = tf.cast(mask, tf.int64)
      per_example_loss = loss_object(targets, real, sample_weight=mask)
      return tf.nn.compute_average_loss(per_example_loss, global_batch_size=GLOBAL_BATCH_SIZE)  

  def train_step(input_data, real_data_in, real_data_out):
      with tf.GradientTape() as tape:
          encoder_pad_mask = makePaddingMask(input_data)
          elements_mask = makeSequenceMask(real_data_in.shape[1])
          predicted_data = transformer_model(input_data, real_data_in, encoder_pad_mask, elements_mask, training_enabled=True)
          
          loss = loss_fn(predicted_data, real_data_out)
    
      trainable_vars = transformer_model.trainable_variables
      grads = tape.gradient(loss, trainable_vars)
      optimizer.apply_gradients(zip(grads, trainable_vars))
      train_accuracy.update_state(real_data_out, predicted_data)
      return loss

  @tf.function
  def distributed_train_step(input_data, real_data_in, real_data_out):
      per_replica_losses = strategy.experimental_run_v2(train_step,
                                                      args=(input_data,
                                                            real_data_in,
                                                            real_data_out))
      return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)


  def test_step(input_data, real_data_in, real_data_out):
      encoder_pad_mask = makePaddingMask(input_data)
      elements_mask = makeSequenceMask(real_data_in.shape[1])
      predicted_data = transformer_model(input_data, real_data_in, encoder_pad_mask, elements_mask, training_enabled=False)
      
      loss = loss_fn(predicted_data, real_data_out)
    
      test_accuracy.update_state(real_data_out, predicted_data)
      return loss

  @tf.function
  def distributed_test_step(input_data, real_data_in, real_data_out):
      per_replica_losses = strategy.experimental_run_v2(test_step, args=(input_data,
                                                  real_data_in,
                                                  real_data_out,))
      return strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

  def predict(input_data, real_data_out):
      output_seq = []
      input_seq = en_tokenizer.sequences_to_texts([input_data])
      real_in = tf.constant(fr_tokenizer.texts_to_sequences(['<start>']))
      end_tag = fr_tokenizer.texts_to_sequences(['<end>'])[0][0]
      input_data = tf.expand_dims(input_data, 0)
      for _ in range(input_data.shape[1]):
          encoder_pad_mask = makePaddingMask(input_data)
          elements_mask = makeSequenceMask(real_in.shape[1])
          predicted_data = transformer_model(input_data, real_in, encoder_pad_mask, elements_mask, training_enabled=False)
          predicted_data = tf.cast(tf.argmax(predicted_data[:, -1:, :], axis=-1), tf.int32)
          if predicted_data.numpy()[0][0] == end_tag:
              break
          real_in = tf.concat([real_in, predicted_data], axis = -1)
          output_seq.append(fr_tokenizer.index_word[predicted_data.numpy()[0][0]])  
      print("English   :", input_seq)
      print("Predicted :", " ".join(output_seq))
      print("Correct   :", fr_tokenizer.sequences_to_texts([real_data_out]))


  for epoch in range(EPOCHS):
      total_loss = 0
      num_batches = 0
      test_loss.reset_states()
      test_accuracy.reset_states()
      train_accuracy.reset_states()
    
      for batch, (en_data, fr_data_in, fr_train_out) in enumerate(train_dataset_distr):
          loss = distributed_train_step(en_data, fr_data_in, fr_train_out)
          total_loss += loss
          num_batches += 1
      train_losses.append(total_loss/num_batches)

      total_loss = 0
      num_batches = 0
      for _, (en_data, fr_data_in, fr_data_out) in enumerate(test_dataset_distr):
          loss = distributed_test_step(en_data, fr_data_in, fr_data_out)
          total_loss += loss
          num_batches += 1
      test_losses.append(total_loss/num_batches)
          
      print ('Epoch {} training Loss {:.4f} Accuracy {:.4f}  test Loss {:.4f} Accuracy {:.4f}' .format( \
                                                  epoch + 1, 
                                                  train_losses[-1], 
                                                  train_accuracy.result(),
                                                  test_losses[-1],
                                                  test_accuracy.result()))
      train_accuracyVec.append(train_accuracy.result())
      test_accuracyVec.append(test_accuracy.result())

      idx = np.random.randint(low=0, high=len(en_test), size=1)[0]
      predict(en_test[idx], fr_test[idx])