<a href="https://colab.research.google.com/github/mizzmir/NLP/blob/master/Transformer/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#!pip install tensorflow-gpu

In [2]:
import tensorflow as tf
import numpy as np

from sklearn.model_selection import train_test_split
from utils import *

embed_size = 10; max_steps = 3; vocab_size = 100

In [3]:
class PositionalEncodingSimpleLayer(tf.keras.layers.Layer):
  def __init__(self, embedding_dim, max_sentence_len, dtype=tf.float32, **kwargs):
    super(PositionalEncodingSimpleLayer, self).__init__(dtype=tf.float32, **kwargs)
    if embedding_dim %2 != 0:
      embedding_dim+=1
    PE = np.zeros((1, max_sentence_len, embedding_dim))
    for pos in range(max_sentence_len):
      for i in range(embedding_dim//2):
        PE[:, pos, 2*i] = np.sin(pos/10000**(2*i/embedding_dim))
        PE[:, pos, 2*i+1] = np.cos(pos/10000**(2*i/embedding_dim))
    tf.print(PE.shape)
    self.PE = PE
  def call(self, input):
    return self.PE

In [4]:
positional_encoding_layer = PositionalEncodingSimpleLayer(embed_size, max_sentence_len=max_steps)
res2 = positional_encoding_layer([1,2,3,4,5])
print(res2)

(1, 3, 10)
[[[ 0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
    0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
    0.00000000e+00  1.00000000e+00]
  [ 8.41470985e-01  5.40302306e-01  1.57826640e-01  9.87466836e-01
    2.51162229e-02  9.99684538e-01  3.98106119e-03  9.99992076e-01
    6.30957303e-04  9.99999801e-01]
  [ 9.09297427e-01 -4.16146837e-01  3.11697146e-01  9.50181503e-01
    5.02165994e-02  9.98738351e-01  7.96205928e-03  9.99968302e-01
    1.26191435e-03  9.99999204e-01]]]


In [5]:
class PositionalEncodingArangePos(tf.keras.layers.Layer):
  def __init__(self, embedding_size, max_sentence_len, dtype=tf.float32, **kwargs):
    super(PositionalEncodingArangePos, self).__init__(dtype, **kwargs)
    if embedding_size%2 !=0:
      embedding_size+=1
    PE = np.zeros((1, max_sentence_len, embedding_size))
    pos = np.arange(start=0, stop=max_sentence_len, step=1)
    for i in range(embedding_size//2):
      PE[0, ::, 2*i] = np.sin(pos/10000**(2*i/embedding_size))
      PE[0, ::, 2*i+1] = np.cos(pos/10000**(2*i/embedding_size))
    self.PE = PE
  def call(self, inputs):
    return self.PE

In [6]:
peLayer = PositionalEncodingArangePos(embed_size, max_sentence_len=max_steps)
res3 = peLayer([1,2,3,4,5])
print(res3)

[[[ 0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
    0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
    0.00000000e+00  1.00000000e+00]
  [ 8.41470985e-01  5.40302306e-01  1.57826640e-01  9.87466836e-01
    2.51162229e-02  9.99684538e-01  3.98106119e-03  9.99992076e-01
    6.30957303e-04  9.99999801e-01]
  [ 9.09297427e-01 -4.16146837e-01  3.11697146e-01  9.50181503e-01
    5.02165994e-02  9.98738351e-01  7.96205928e-03  9.99968302e-01
    1.26191435e-03  9.99999204e-01]]]


In [7]:
print("comparing arrays: ", np.allclose(res2, res3))

comparing arrays:  True


In [8]:
"""
Encoder flow :

- Embedding 
- Positional Encoding
- Input = Embedding + Positional Encoding
--------------------REPEAT N Times--------------------
- Multi-head Attention layer
- Input + Multi-Head Attention layer added together 
- previous Normalized (1)
- Feed Forward Network (2)
- (1) added to (2) and Normmalized
------------------------------------------------------
- Encoder output 
"""

'\nEncoder flow :\n\n- Embedding \n- Positional Encoding\n- Input = Embedding + Positional Encoding\n--------------------REPEAT N Times--------------------\n- Multi-head Attention layer\n- Input + Multi-Head Attention layer added together \n- previous Normalized (1)\n- Feed Forward Network (2)\n- (1) added to (2) and Normmalized\n------------------------------------------------------\n- Encoder output \n'

In [9]:
class PositionalEncodingLayer(tf.keras.layers.Layer):
  def __init__(self, embedding_size, max_sentence_len, dtype=tf.float32, **kwargs):
    super(PositionalEncodingLayer, self).__init__(dtype, **kwargs)
    if embedding_size%2 !=0:
      embedding_size+=1
    # embedding size -> depth of model
    # positional encoding should have size : [1, max_sentence_len, embedding_size]
    # 1 is here to make broadcasting possible in call method
    PE = np.zeros((1, max_sentence_len, embedding_size))
    # pos should have shape [1, max_sentence_len] with values <0, max_sentence_len)
    pos = np.arange(start=0, stop=max_sentence_len, step=1)
    pos = pos.reshape(max_sentence_len, 1)
    # i should have shappe [1, embedding_size//2] with values <0, embedding_size//2)
    # we need half of embedding size, because half is needed for each sin/cos 
    # then we put it together into PE and we have [1, max_sentence_len, embedding_size]
    i = np.arange(start=0, stop=embedding_size//2, step=1)
    i = i.reshape(embedding_size//2, 1).T
    PE_sin = np.sin(pos/10000**(2*i/embedding_size))
    PE_cos = np.cos(pos/10000**(2*i/embedding_size))
    # we put sin into even indexes ::2 
    # we put cos into odd indexes, thats why we`re starting from 1 here : 1::2
    PE[0, ::, ::2] = PE_sin
    PE[0, ::, 1::2] = PE_cos
    self.PE = tf.constant(PE, dtype=dtype)
  def getPE(self):
    """
    only for debuging purposes
    """
    return self.PE
  def call(self, inputs):
    """
    inputs shape should be same as self.PE shape
        
      input_shape = tf.shape(inputs)
      return inputs + self.PE[:, :input_shape[-2], :]

    It has to be that way becuase we need to be able to get positional encoding for different lenght 
    for encoder and decoder, when we don`t know max lenght. SO we have to do encoding with bigger buffer
    and take what we need only.

    max_sentence_len in should be bigger or equal as longest input we predict we can get
    """

    input_shape = tf.shape(inputs)
    return inputs + self.PE[:, :input_shape[-2], :]

peLayerAll = PositionalEncodingLayer(embedding_size=10,
                                     max_sentence_len=max_steps)
res4 = peLayerAll.getPE()
print(res4.shape)
print("comparing arrays: ", np.allclose(res2, res4))

(1, 3, 10)
comparing arrays:  True


In [10]:
class MultiHeadAttentionLayer(tf.keras.layers.Layer):
  def __init__(self, embedding_size, heads_number, dtype=tf.float32, **kwargs):
    super(MultiHeadAttentionLayer, self).__init__(dtype=tf.float32, **kwargs)
    """
    return shape : [batch_size, sequence_len, d_model]
    heads_number - tell how many heads will be processed at same time
    d_model - model size ; equal to embedding_size
    """
    self.heads_number = heads_number
    self.d_model = embedding_size
    self.w_q = tf.keras.layers.Dense(self.d_model)
    self.w_k = tf.keras.layers.Dense(self.d_model)
    self.w_v = tf.keras.layers.Dense(self.d_model)

    self.outputLayer = tf.keras.layers.Dense(self.d_model)

  # similar to dot attention but with scaling added
  def ScaledDotProductAttention(self, v, k, q, sequence_mask):
    """
    q shape [batch_size, num_heads, q_seq_len, depth_q]
    k shape [batch_size, num_heads, k_seq_len, depth_k]
    v shape [batch_size, num_heads, v_seq_len, depth_v]
    """
    # matmul(q,k,v)
    # resultion shape [batch_size, num_heads, q_seq_len, k_seq_len]
    qk_matmul = tf.matmul(q, k, transpose_b=True)
    # scaling tf.cast is needed here because tf.sqrt needs float32 type
    # score shape [batch_size, num_heads, q_seq_len, k_seq_len]
    score = qk_matmul*tf.math.sqrt(tf.cast(k.shape[-1], dtype=tf.float32))
    # optional mask
    # mask should be shape [batch_size, num_heads, q_seq_len, k_seq_len]
    # for example [
    #             [0, 1, 1]
    #             [0, 0, 1]
    #             ] shape == (2, 3)
    # we`re adding big negative number, because we only care about present/past words that are przedicted
    if sequence_mask is not None:
      #print(" mask is not none")
      #print("sequence_mask shape {}\nscore shape {}" .format(sequence_mask.shape, score.shape))
      score += sequence_mask*-1e-8
    # softmax
    # attention_weights shape [batch_size, num_heads, q_seq_len, k_seq_len]
    attention_weights = tf.nn.softmax(score, axis=-1)
    # matmul(res, V)
    # contex shape [batch_size, num_heads, q_seq_len, depth_v]
    context = tf.matmul(attention_weights, v)
    return context

  def splitHeads(self, data):
    # new shape [batch_size, sequence_len, heads_number, d_model//heads_number]
    data = tf.reshape(data, (data.shape[0], data.shape[1], self.heads_number, data.shape[-1]//self.heads_number))
    # transpose dimentions to [batch_size, heads_number, sequence_len, d_model//heads_number]
    return tf.transpose(data, perm=[0,2,1,3])

  def call(self, q, k, v, sequence_mask):
    """
    q shape [batch_size, sequence_len, d_model]
    k shape [batch_size, sequence_len, d_model]
    v shape [batch_size, sequence_len, d_model]

    after first operations shapes are the same
    next we have to split d_model into heads_number of subbatches
    new shape after reshape only should be : [batch_size, sequence_len, heads_number, d_model//heads_number]
    next shape should be transposed to : [batch_size, heads_number, sequence_len, d_model//heads_number]
    where :
      new_d_model = d_model/heads_number
    
    next make scaled dot-product attention on resulting q,k,v

    next concat returning data to get shape : [batch_size, sequence_len, d_model]
    in order to do this we have to transpose context_vector to get [batch_size, sequence_len, heads_number, d_model//heads_number]

    next put it throug dense layer (d_model) in order to get output
    """
    #print("q shape {}\nk shape {}\n v shape {}" .format(q.shape, k.shape, v.shape))
    q = self.w_q(q)
    k = self.w_k(k)
    v = self.w_v(v)
    #print("AFTER Dense\n  q shape {}\n  k shape {}\n  v shape {}" .format(q.shape, k.shape, v.shape))

    q = self.splitHeads(q)
    k = self.splitHeads(k)
    v = self.splitHeads(v)
    #print("AFTER SPLIT\n  q shape {}\n  k shape {}\n  v shape {}" .format(q.shape, k.shape, v.shape))

    context_vector = self.ScaledDotProductAttention(q, k, v, sequence_mask)
    #print("context_vector shape :", context_vector.shape)

    context_vector = tf.transpose(context_vector, perm=[0,2,1,3])
    #print("context_vector  transposed shape :", context_vector.shape)
    context_vector = tf.reshape(context_vector, (context_vector.shape[0], context_vector.shape[1], self.d_model))
    #print("context_vector  reshapeed shape :", context_vector.shape)

    return self.outputLayer(context_vector)

embed_size = 10; max_steps = 3; vocab_size = 100

q = tf.random.uniform((1, max_steps, embed_size))  # shape [batch_size, sequence_len, embedding_size]
mhatt = MultiHeadAttentionLayer(embed_size, 5)
mhatt_output = mhatt(q, k=q, v=q, sequence_mask=None)

In [11]:
def feedForwardnetwork(dff, d_model):
  """
  according to paper dff=2048 and d_model =512
  but d_model should be same as embedding_size/d_model in MultiHeadAttention
  ffn(x) = max(0, xW_1 + b+1)W_2 + b_2
  where max(0, ...) -> relu activation
  """
  ffNetwork = tf.keras.Sequential()
  ffNetwork.add(tf.keras.layers.Dense(dff, activation="relu"))
  ffNetwork.add(tf.keras.layers.Dense(d_model))
  return ffNetwork

def makeSequenceMask(seq_len):
  """
  mask should be size [1, 1, seq_len, seq_len]
  first two sizes are batch_szie, num_heads to make this matrix broadcastable
  it should be in form 
  [
    [0, 1, 1, 1]
    [0, 0, 1, 1]
    [0, 0, 0, 1]
    [0, 0, 0, 0]
  ]
  """
  mask_array = np.ones((seq_len, seq_len))
  mask_array = np.triu(mask_array, 1)
  return tf.constant(mask_array, dtype=tf.float32)

def makePaddingMask(sequence):
  mask = tf.math.equal(sequence, 0)
  mask =  tf.cast(mask, tf.float32)
  return mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [12]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, embedding_size, heads_number, dff, dtype=tf.float32, **kwargs):
    super(EncoderLayer, self).__init__(dtype, **kwargs)

    self.d_model = embedding_size
    self.multiHeadAttention = MultiHeadAttentionLayer(embedding_size, heads_number)

    self.normalizationFirst = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.normalizationSecond = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.ffNetwork = feedForwardnetwork(dff, self.d_model)

  def call(self, encoder_input, mask):
    # shortcut_data shape [batch_szie, max_sentence_len, embedding_size]
    shortcut_data = encoder_input

    # mhatt_output shape [batch_size, max_sentence_len, embedding_size]
    mhatt_output = self.multiHeadAttention(encoder_input, encoder_input, encoder_input, mask)
    mhatt_output += shortcut_data
    mhatt_output = self.normalizationFirst(mhatt_output)

    shortcut_data = mhatt_output

    ffNet_output = self.ffNetwork(mhatt_output)
    ffNet_output += shortcut_data
    ffNet_output = self.normalizationSecond(ffNet_output)

    return ffNet_output

class Encoder(tf.keras.Model):
  def __init__(self, embedding_size, max_sentence_len, vocab_size, blocks_amount, heads_number, dff):
    super(Encoder, self).__init__()

    assert (embedding_size//heads_number)%2==0
    self.blocks_amount = blocks_amount
    self.d_model = embedding_size
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
    self.positionalEncoding = PositionalEncodingLayer(embedding_size, max_sentence_len)

    self.encoderBlocks = [EncoderLayer(embedding_size, heads_number, dff) for _ in range(blocks_amount)]
  
  def call(self, encoder_input, mask):
    # sequence shape [batch_size, max_sentence_len]
    embedded_seq = self.embedding(encoder_input)
    # according to paper https://arxiv.org/pdf/1706.03762.pdf
    # embedding is multiplied by sqrt(d_model). Point 3.4
    embedded_seq*=tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    # embedded_seq shape [batch_szie, max_sentence_len, embedding_size]
    data = self.positionalEncoding(embedded_seq)
    #------------------------- loop though all blocks -------------------------
    for i in range(self.blocks_amount):
      #print("               BLOCK ", i+1)
      data = self.encoderBlocks[i](data, mask) 

    return data

In [13]:
data = np.ones((32, max_steps))
print("input shape ", data.shape)
padding_mask = makePaddingMask(data)

encoder = Encoder(embedding_size=10,
                  max_sentence_len=1000,
                  vocab_size=100,
                  blocks_amount=3,
                  heads_number=5, 
                  dff=2048)
encoder_out  = encoder(data, mask=padding_mask)
print(encoder_out.shape)

input shape  (32, 3)


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

(32, 3, 10)


In [14]:
"""
Decoder flow :

- Embedding 
- Positional Encoding
- Input = Embedding + Positional Encoding
--------------------REPEAT N Times--------------------
- Masked Multi-head Attention layer
- Input + Masked Multi-Head Attention layer added together 
- previous Normalized (1) 
- Multi-head Attention layer v, k from Encoder output | q from previous point
- (1) + Multi-head Attention layer added together
- previous normalized
- Feed Forward Network (2)
- (1) added to (2) and Normalized
------------------------------------------------------
- Decoder output
- Linear layer
- softmax
"""

'\nDecoder flow :\n\n- Embedding \n- Positional Encoding\n- Input = Embedding + Positional Encoding\n--------------------REPEAT N Times--------------------\n- Masked Multi-head Attention layer\n- Input + Masked Multi-Head Attention layer added together \n- previous Normalized (1) \n- Multi-head Attention layer v, k from Encoder output | q from previous point\n- (1) + Multi-head Attention layer added together\n- previous normalized\n- Feed Forward Network (2)\n- (1) added to (2) and Normalized\n------------------------------------------------------\n- Decoder output\n- Linear layer\n- softmax\n'

In [15]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, embedding_size, heads_number, dff, dtype=tf.float32, **kwargs):
    super(DecoderLayer, self).__init__(dtype, **kwargs)

    self.d_model = embedding_size
    self.multiHeadAttentionFirst = MultiHeadAttentionLayer(embedding_size, heads_number)
    self.multiHeadAttentionSecond = MultiHeadAttentionLayer(embedding_size, heads_number)

    self.normalizationFirst = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.normalizationSecond = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.normalizationThird = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.ffNetwork = feedForwardnetwork(dff, self.d_model)

  def call(self, decoder_input, encoder_output, pad_mask, elements_mask):
    # shortcut_data shape [batch_szie, max_sentence_len, embedding_size]
    shortcut_data = decoder_input
      
    # mhatt_output shape [batch_size, max_sentence_len, embedding_size]
    mhatt_output = self.multiHeadAttentionFirst(decoder_input, decoder_input, decoder_input, elements_mask)
    # add & Norm
    mhatt_output += shortcut_data
    mhatt_output = self.normalizationFirst(mhatt_output)

    shortcut_data = mhatt_output
    mhatt_output = self.multiHeadAttentionSecond(encoder_output, encoder_output, mhatt_output, pad_mask)
    mhatt_output += shortcut_data
    mhatt_output = self.normalizationSecond(mhatt_output)

    shortcut_data = mhatt_output
    ffn_output = self.ffNetwork(mhatt_output)
    ffn_output += shortcut_data
    ffNet_output = self.normalizationThird(ffn_output)

    return ffNet_output

class Decoder(tf.keras.models.Model):
  def __init__(self, embedding_size, max_sentence_len, vocab_size, blocks_amount, heads_number, dff):
    super(Decoder, self).__init__()

    assert (embedding_size//heads_number)%2==0
    self.blocks_amount = blocks_amount
    self.d_model = embedding_size
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
    self.positionalEncoding = PositionalEncodingLayer(embedding_size, max_sentence_len)

    self.decoderBlocks = [DecoderLayer(embedding_size, heads_number, dff) for _ in range(blocks_amount)]

  def call(self, encoder_output, decoder_input, pad_mask, elements_mask):

    # sequence shape [batch_size, max_sentence_len]
    embedded_seq = self.embedding(decoder_input)
    # according to paper https://arxiv.org/pdf/1706.03762.pdf
    # embedding is multiplied by sqrt(d_model). Point 3.4
    embedded_seq*=tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    # embedded_seq shape [batch_szie, max_sentence_len, embedding_size]
    data = self.positionalEncoding(embedded_seq)
    #------------------------- loop though all blocks -------------------------
    for i in range(self.blocks_amount):
      #print("               BLOCK ", i+1)
      data = self.decoderBlocks[i](data, encoder_output, pad_mask, elements_mask)

    return data

In [16]:
input_data = np.ones((32, max_steps))
output_data = tf.random.uniform((32, 15))
mask = makeSequenceMask(output_data.shape[1])
print("Decoder input shape ", data.shape)
blocks_amount = 2
heads = 5
en_vocab_size = 100
fr_vocab_size = 200
decoder = Decoder(embedding_size=10,
                  max_sentence_len=1000,
                  vocab_size=100,
                  blocks_amount=3,
                  heads_number=5, 
                  dff=2048)
"""
decoder masks are :
- encoder_padding_mask
- decoder sequences mask
"""
decoder_out  = decoder(encoder_out, output_data, pad_mask=padding_mask, elements_mask=mask)
print("decoder_out ", decoder_out.shape)

Decoder input shape  (32, 3)
decoder_out  (32, 15, 10)


In [17]:
class Transformer(tf.keras.models.Model):
  def __init__(self,
               embedding_size,
               dff,
               input_max_seq_length,
               output_max_seq_length,
               input_vocab_size,
               output_vocab_size,
               encoder_blocks,
               decoder_blocks,
               heads):
    super(Transformer, self).__init__()

    self.encoder = Encoder(embedding_size, input_max_seq_length, input_vocab_size, encoder_blocks, heads, dff)
    self.decoder = Decoder(embedding_size, output_max_seq_length, output_vocab_size, decoder_blocks, heads, dff)

    self.dense = tf.keras.layers.Dense(output_vocab_size)

  def call(self, input_seq, output_seq, pad_mask, words_mask):
    
    encoder_out = self.encoder(input_seq, mask=pad_mask)
    decoder_out = self.decoder(encoder_out, output_seq, pad_mask=pad_mask, elements_mask=words_mask)

    # transformer_out shape = [batch_size, ]
    transformer_out = self.dense(decoder_out)
    return transformer_out

In [18]:
transformer_model = Transformer(embedding_size=512,
                                dff=2048,
                                input_max_seq_length=2000,
                                output_max_seq_length=1855,
                                input_vocab_size=4980,
                                output_vocab_size=7001,
                                encoder_blocks=4,
                                decoder_blocks=2,
                                heads=8)

# input_data and output_data
input_data = tf.random.uniform((64, 52), dtype=tf.int64, minval=0, maxval=100)
output_data = tf.random.uniform((64, 29), dtype=tf.int64, minval=0, maxval=250)

encoder_pad_mask = makePaddingMask(input_data)
elements_mask = makeSequenceMask(output_data.shape[1])
print("output_data ", output_data.shape)
print("elements_mask ", elements_mask.shape)
transformer_output = transformer_model(input_data, output_data, encoder_pad_mask, elements_mask)
print(transformer_output.shape)

output_data  (64, 29)
elements_mask  (29, 29)
(64, 29, 7001)


In [19]:
"""
according to Attention is all you need paper learning rate has custom scheduler:
there are two parameters : 
- d_model
- warmup_steps ( in paper set to 4000)
"""
class customLearningRate(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, warmup_steps, d_model):
    super(customLearningRate, self).__init__()
    self.d_model = d_model
    self.warmup_steps = warmup_steps
  
  def __call__(self, step):
    firstScheduler = step**(-0.5)
    secondScheduler = step*self.warmup_steps**(-0.5)
    return self.d_model**(-0.5)*tf.minimum(firstScheduler, secondScheduler)

custom_learning_rate = customLearningRate(warmup_steps=4000,
                                          d_model=512)

optimizer = tf.keras.optimizers.Adam(learning_rate=custom_learning_rate,
                                    beta_1=0.9,
                                    beta_2=0.98,
                                    epsilon=1e-9)

In [20]:
# reading data

BATCH_SIZE = 64
EPOCHS = 10

en_lines, fr_lines = read_data_files("data", ("small_vocab_en", "small_vocab_fr"))

#data = read_data("data/fra-eng", "fra.txt")
#en_lines, fr_lines = list(zip(*data))

en_lines = [normalize(line) for line in en_lines]
fr_lines = [normalize(line) for line in fr_lines]

en_train, en_test, fr_train, fr_test = train_test_split(en_lines, fr_lines, shuffle=True, test_size=0.1)

fr_train_in = ['<start> ' + line for line in fr_train]
fr_train_out = [line + ' <end>' for line in fr_train]

fr_test_in = ['<start> ' + line for line in fr_test]
fr_test_out = [line + ' <end>' for line in fr_test]

reading data from  data/small_vocab_en
reading data from  data/small_vocab_fr


In [21]:
fr_tokenizer = Tokenizer(filters='')
en_tokenizer = Tokenizer(filters='')

input_data = [fr_train_in, fr_train_out, fr_test_in, fr_test_out, fr_test, fr_train]
fr_train_in, fr_train_out, fr_test_in, fr_test_out, fr_test, fr_train = tokenizeInput(input_data, fr_tokenizer)

input_data = [en_train, en_test]
en_train, en_test = tokenizeInput(input_data, en_tokenizer)

en_vocab_size = len(en_tokenizer.word_index)+1
fr_vocab_size = len(fr_tokenizer.word_index)+1
print("en_vocab {}\nfr_vocab {}" .format(en_vocab_size, fr_vocab_size))

en_vocab 203
fr_vocab 336


In [22]:
train_dataset = tf.data.Dataset.from_tensor_slices((en_train, fr_train_in, fr_train_out))
train_dataset = train_dataset.shuffle(len(en_train), reshuffle_each_iteration=True)\
                                 .batch(BATCH_SIZE, drop_remainder=True)

test_dataset = tf.data.Dataset.from_tensor_slices((en_test, fr_test_in, fr_test_out))
test_dataset = test_dataset.shuffle(len(en_test), reshuffle_each_iteration=True)\
                               .batch(BATCH_SIZE, drop_remainder=True)

In [23]:
print("Training batches per epoch :", len(en_train)//BATCH_SIZE)

Training batches per epoch : 1938


In [24]:
transformer_model = Transformer(embedding_size=512,
                                dff=1024,
                                input_max_seq_length=2000,
                                output_max_seq_length=1855,
                                input_vocab_size=en_vocab_size,
                                output_vocab_size=fr_vocab_size,
                                encoder_blocks=3,
                                decoder_blocks=3,
                                heads=8)
test_losses = []
train_losses = []
test_loss = tf.keras.metrics.Mean()
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

training_loss = tf.keras.metrics.Mean()
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True)

def loss_fn(real, targets):
    mask = tf.math.logical_not(tf.math.equal(targets, 0))
    mask = tf.cast(mask, tf.int64)
    loss = loss_object(targets, real, sample_weight=mask)
    return tf.reduce_mean(loss)    

@tf.function
def train_step(input_data, real_data_in, real_data_out):
    with tf.GradientTape() as tape:
        encoder_pad_mask = makePaddingMask(input_data)
        elements_mask = makeSequenceMask(real_data_in.shape[1])
        predicted_data = transformer_model(input_data, real_data_in, encoder_pad_mask, elements_mask)
        
        loss = loss_fn(predicted_data, real_data_out)
  
    trainable_vars = transformer_model.trainable_variables
    grads = tape.gradient(loss, trainable_vars)
    optimizer.apply_gradients(zip(grads, trainable_vars))
    train_accuracy(real_data_out, predicted_data)
    training_loss(loss)

@tf.function
def test_step(input_data, real_data_in, real_data_out):
    with tf.GradientTape() as tape:
        encoder_pad_mask = makePaddingMask(input_data)
        elements_mask = makeSequenceMask(real_data_in.shape[1])
        predicted_data = transformer_model(input_data, real_data_in, encoder_pad_mask, elements_mask, training=False)
        
        loss = loss_fn(predicted_data, real_data_out)
  
    test_accuracy(real_data_out, predicted_data)
    test_loss(loss)

for epoch in range(EPOCHS):
    training_loss.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()
    train_accuracy.reset_states()
  
    for batch, (en_data, fr_data_in, fr_train_out) in enumerate(train_dataset):
        train_step(en_data, fr_data_in, fr_train_out)
    for _, (en_data, fr_data_in, fr_data_out) in enumerate(test_dataset):
        test_step(en_data, fr_data_in, fr_data_out)
        
    print ('Epoch {} training Loss {:.4f} Accuracy {:.4f}  test Loss {:.4f} Accuracy {:.4f}' .format( \
                                                epoch + 1, 
                                                training_loss.result(), 
                                                train_accuracy.result(),
                                                test_loss.result(),
                                                test_accuracy.result()))

Epoch 1 training Loss 2.6365 Accuracy 0.0618  test Loss 2.8668 Accuracy 0.0677
Epoch 2 training Loss 2.6214 Accuracy 0.0621  test Loss 2.8657 Accuracy 0.0678
Epoch 3 training Loss 2.6207 Accuracy 0.0621  test Loss 2.8653 Accuracy 0.0678
Epoch 4 training Loss 2.6204 Accuracy 0.0621  test Loss 2.8651 Accuracy 0.0678
Epoch 5 training Loss 2.6203 Accuracy 0.0621  test Loss 2.8651 Accuracy 0.0678


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-24-2b8d4754ac5b>", line 61, in <module>
    train_step(en_data, fr_data_in, fr_train_out)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/def_function.py", line 457, in __call__
    result = self._call(*args, **kwds)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/def_function.py", line 487, in _call
    return self._stateless_fn(*args, **kwds)  # pylint: disable=not-callable
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py", line 1823, in __call__
    return graph_function._filtered_call(args, kwargs)  # pylint: disable=protected-access
  File "/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/eager/function.py", line 1141, in _filtered_call
    self.captured_inputs)

KeyboardInterrupt: 