In [0]:
!pip install tensorflow-gpu



In [0]:
import tensorflow as tf
import numpy as np
tf.__version__

'2.0.0'

[[0.90648814 0.9491688 ]
 [0.09278559 0.39020795]
 [0.66258825 0.47905481]]
[[0.90648814 0.09278559 0.66258825]
 [0.9491688  0.39020795 0.47905481]]


In [0]:
nx, ny = (8,4)
print(np.arange(nx))
print(np.arange(ny))
p, i = np.meshgrid(np.arange(nx), np.arange(ny))
print("p = ", p)
print("i = ", i)

[0 1 2 3 4 5 6 7]
[0 1 2 3]
p =  [[0 1 2 3 4 5 6 7]
 [0 1 2 3 4 5 6 7]
 [0 1 2 3 4 5 6 7]
 [0 1 2 3 4 5 6 7]]
i =  [[0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1]
 [2 2 2 2 2 2 2 2]
 [3 3 3 3 3 3 3 3]]


In [0]:
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        if max_dims % 2 == 1: max_dims += 1 # max_dims must be even
        p, i = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))
        tf.print("p shape ", p.shape, "  p = ", p)
        tf.print("i shape ", i.shape, "  i = ", i)
        pos_emb = np.empty((1, max_steps, max_dims))
        tf.print("pos_emb shape ", pos_emb.shape)
        sins = np.sin(p/10000**(2*i/max_dims))
        coss = np.cos(p/10000**(2*i/max_dims))
        tf.print("sins shape {}   Transposed shape {}" .format(sins.shape, sins.T.shape))
        tf.print("coss shape {}   Transposed shape {}" .format(coss.shape, coss.T.shape))
        pos_emb[0, :, ::2] = np.sin(p / 10000**(2 * i / max_dims)).T
        pos_emb[0, :, 1::2] = np.cos(p / 10000**(2 * i / max_dims)).T
        self.positional_embedding = tf.constant(pos_emb.astype(self.dtype))
    def call(self, inputs):
        return self.positional_embedding

In [0]:
embed_size = 10; max_steps = 3; vocab_size = 100
positional_encoding = PositionalEncoding(max_steps, max_dims=embed_size)
res = positional_encoding([1,2,3,4])
print(res)

p shape  (5, 3)   p =  array([[0, 1, 2],
       [0, 1, 2],
       [0, 1, 2],
       [0, 1, 2],
       [0, 1, 2]])
i shape  (5, 3)   i =  array([[0, 0, 0],
       [1, 1, 1],
       [2, 2, 2],
       [3, 3, 3],
       [4, 4, 4]])
pos_emb shape  (1, 3, 10)
sins shape (5, 3)   Transposed shape (3, 5)
coss shape (5, 3)   Transposed shape (3, 5)
tf.Tensor(
[[[ 0.0000000e+00  1.0000000e+00  0.0000000e+00  1.0000000e+00
    0.0000000e+00  1.0000000e+00  0.0000000e+00  1.0000000e+00
    0.0000000e+00  1.0000000e+00]
  [ 8.4147096e-01  5.4030228e-01  1.5782665e-01  9.8746681e-01
    2.5116222e-02  9.9968451e-01  3.9810613e-03  9.9999207e-01
    6.3095731e-04  9.9999982e-01]
  [ 9.0929741e-01 -4.1614684e-01  3.1169716e-01  9.5018148e-01
    5.0216600e-02  9.9873835e-01  7.9620592e-03  9.9996829e-01
    1.2619144e-03  9.9999923e-01]]], shape=(1, 3, 10), dtype=float32)


In [0]:
class PositionalEncodingSimpleLayer(tf.keras.layers.Layer):
  def __init__(self, embedding_dim, max_sentence_len, dtype=tf.float32, **kwargs):
    super(PositionalEncodingSimpleLayer, self).__init__(dtype=tf.float32, **kwargs)
    if embedding_dim %2 != 0:
      embedding_dim+=1
    PE = np.zeros((1, max_sentence_len, embedding_dim))
    for pos in range(max_sentence_len):
      for i in range(embedding_dim//2):
        PE[:, pos, 2*i] = np.sin(pos/10000**(2*i/embedding_dim))
        PE[:, pos, 2*i+1] = np.cos(pos/10000**(2*i/embedding_dim))
    tf.print(PE.shape)
    self.PE = PE
  def call(self, input):
    return self.PE

In [0]:
positional_encoding_layer = PositionalEncodingSimpleLayer(embed_size, max_sentence_len=max_steps)
res2 = positional_encoding_layer([1,2,3,4,5])
print(res2)

(1, 3, 10)
[[[ 0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
    0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
    0.00000000e+00  1.00000000e+00]
  [ 8.41470985e-01  5.40302306e-01  1.57826640e-01  9.87466836e-01
    2.51162229e-02  9.99684538e-01  3.98106119e-03  9.99992076e-01
    6.30957303e-04  9.99999801e-01]
  [ 9.09297427e-01 -4.16146837e-01  3.11697146e-01  9.50181503e-01
    5.02165994e-02  9.98738351e-01  7.96205928e-03  9.99968302e-01
    1.26191435e-03  9.99999204e-01]]]


In [0]:
print("comparing arrays: ", np.allclose(res, res2))

comparing arrays:  True


In [0]:
class PositionalEncodingArangePos(tf.keras.layers.Layer):
  def __init__(self, embedding_size, max_sentence_len, dtype=tf.float32, **kwargs):
    super(PositionalEncodingArangePos, self).__init__(dtype, **kwargs)
    if embedding_size%2 !=0:
      embedding_size+=1
    PE = np.zeros((1, max_sentence_len, embedding_size))
    print(PE.shape)
    pos = np.arange(start=0, stop=max_sentence_len, step=1)
    print(pos.shape)
    for i in range(embedding_size//2):
      PE[0, ::, 2*i] = np.sin(pos/10000**(2*i/embedding_size))
      PE[0, ::, 2*i+1] = np.cos(pos/10000**(2*i/embedding_size))
    self.PE = PE
  def call(self, inputs):
    return self.PE

In [0]:
peLayer = PositionalEncodingArangePos(embed_size, max_sentence_len=max_steps)
res3 = peLayer([1,2,3,4,5])
print(res3)

(1, 3, 10)
(3,)
[[[ 0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
    0.00000000e+00  1.00000000e+00  0.00000000e+00  1.00000000e+00
    0.00000000e+00  1.00000000e+00]
  [ 8.41470985e-01  5.40302306e-01  1.57826640e-01  9.87466836e-01
    2.51162229e-02  9.99684538e-01  3.98106119e-03  9.99992076e-01
    6.30957303e-04  9.99999801e-01]
  [ 9.09297427e-01 -4.16146837e-01  3.11697146e-01  9.50181503e-01
    5.02165994e-02  9.98738351e-01  7.96205928e-03  9.99968302e-01
    1.26191435e-03  9.99999204e-01]]]


In [0]:
print("comparing arrays: ", np.allclose(res, res3))

comparing arrays:  True


In [0]:
"""
Encoder flow :

- Embedding 
- Positional Encoding
- Input = Embedding + Positional Encoding
--------------------REPEAT N Times--------------------
- Multi-head Attention layer
- Input + Multi-Head Attention layer added together 
- previous Normalized (1)
- Feed Forward Network (2)
- (1) added to (2) and Normmalized
------------------------------------------------------
- Encoder output 
"""

'\nEncoder flow :\n\n- Embedding \n- Positional Encoding\n- Input = Embedding + Positional Encoding\n--------------------REPEAT N Times--------------------\n- Multi-head Attention layer\n- Input + Multi-Head Attention layer added together \n- previous Normalized (1)\n- Feed Forward Network (2)\n- (1) added to (2) and Normmalized\n------------------------------------------------------\n- Encoder output \n'

In [0]:
class PositionalEncodingLayer(tf.keras.layers.Layer):
  def __init__(self, embedding_size, sentence_len, dtype=tf.float32, **kwargs):
    super(PositionalEncodingLayer, self).__init__(dtype, **kwargs)
    if embedding_size%2 !=0:
      embedding_size+=1
    # embedding size -> depth of model
    # positional encoding should have size : [1, sentence_len, embedding_size]
    # 1 is here to make broadcasting possible in call method
    PE = np.zeros((1, sentence_len, embedding_size))
    # pos should have shape [1, sentence_len] with values <0, sentence_len)
    pos = np.arange(start=0, stop=sentence_len, step=1)
    pos = pos.reshape(sentence_len, 1)
    # i should have shappe [1, embedding_size//2] with values <0, embedding_size//2)
    # we need half of embedding size, because half is needed for each sin/cos 
    # then we put it together into PE and we have [1, sentence_len, embedding_size]
    i = np.arange(start=0, stop=embedding_size//2, step=1)
    i = i.reshape(embedding_size//2, 1).T
    PE_sin = np.sin(pos/10000**(2*i/embedding_size))
    PE_cos = np.cos(pos/10000**(2*i/embedding_size))
    # we put sin into even indexes ::2 
    # we put cos into odd indexes, thats why we`re starting from 1 here : 1::2
    PE[0, ::, ::2] = PE_sin
    PE[0, ::, 1::2] = PE_cos
    self.PE = tf.constant(PE, dtype=dtype)
  def getPE(self):
    """
    only for debuging purposes
    """
    return self.PE
  def call(self, inputs):
    """
    inputs shape should be same as self.PE shape
    In case that this is not assured please add:
    
      input_shape = tf.shape(inputs)
      return inputs + self.PE[:, input_shape[-2], input_shape[-1]]

    instead of below return line
    """
    return inputs + self.PE[:,:,:]

peLayerAll = PositionalEncodingLayer(embed_size, sentence_len=max_steps)
res4 = peLayerAll.getPE()
print(res4.shape)
print("comparing arrays: ", np.allclose(res, res4))

(1, 3, 10)
comparing arrays:  True


In [0]:
class MultiHeadAttentionLayer(tf.keras.layers.Layer):
  def __init__(self, embedding_size, heads_number, dtype=tf.float32, **kwargs):
    super(MultiHeadAttentionLayer, self).__init__(dtype=tf.float32, **kwargs)
    """
    return shape : [batch_size, sequence_len, d_model]
    heads_number - tell how many heads will be processed at same time
    d_model - model size ; equal to embedding_size
    """
    self.heads_number = heads_number
    self.d_model = embedding_size
    self.w_q = tf.keras.layers.Dense(self.d_model)
    self.w_k = tf.keras.layers.Dense(self.d_model)
    self.w_v = tf.keras.layers.Dense(self.d_model)

    self.outputLayer = tf.keras.layers.Dense(self.d_model)

  # similar to dot attention but with scaling added
  def ScaledDotProductAttention(self, v, k, q, sequence_mask):
    """
    q shape [batch_size, num_heads, q_seq_len, depth_q]
    k shape [batch_size, num_heads, k_seq_len, depth_k]
    v shape [batch_size, num_heads, v_seq_len, depth_v]
    """
    # matmul(q,k,v)
    # resultion shape [batch_size, num_heads, q_seq_len, k_seq_len]
    qk_matmul = tf.matmul(q, k, transpose_b=True)
    # scaling tf.cast is needed here because tf.sqrt needs float32 type
    # score shape [batch_size, num_heads, q_seq_len, k_seq_len]
    score = qk_matmul*tf.math.sqrt(tf.cast(k.shape[-1], dtype=tf.float32))
    # optional mask
    # mask should be shape [batch_size, num_heads, q_seq_len, k_seq_len]
    # for example [
    #             [0, 1, 1]
    #             [0, 0, 1]
    #             ] shape == (2, 3)
    # we`re adding big negative number, because we only care about present/past words that are przedicted
    print(sequence_mask)
    if sequence_mask != None:
      print(" mask is not none")
      score += sequence_mask*-1e-8
    # softmax
    # attention_weights shape [batch_size, num_heads, q_seq_len, k_seq_len]
    attention_weights = tf.nn.softmax(score, axis=-1)
    # matmul(res, V)
    # contex shape [batch_size, num_heads, q_seq_len, depth_v]
    context = tf.matmul(attention_weights, v)
    return context

  def splitHeads(self, data):
    # new shape [batch_size, sequence_len, heads_number, d_model//heads_number]
    data = tf.reshape(data, (data.shape[0], data.shape[1], self.heads_number, data.shape[-1]//self.heads_number))
    # transpose dimentions to [batch_size, heads_number, sequence_len, d_model//heads_number]
    return tf.transpose(data, perm=[0,2,1,3])

  def call(self, q, k, v, sequence_mask):
    """
    q shape [batch_size, sequence_len, d_model]
    k shape [batch_size, sequence_len, d_model]
    v shape [batch_size, sequence_len, d_model]

    after first operations shapes are the same
    next we have to split d_model into heads_number of subbatches
    new shape after reshape only should be : [batch_size, sequence_len, heads_number, d_model//heads_number]
    next shape should be transposed to : [batch_size, heads_number, sequence_len, d_model//heads_number]
    where :
      new_d_model = d_model/heads_number
    
    next make scaled dot-product attention on resulting q,k,v

    next concat returning data to get shape : [batch_size, sequence_len, d_model]
    in order to do this we have to transpose context_vector to get [batch_size, sequence_len, heads_number, d_model//heads_number]

    next put it throug dense layer (d_model) in order to get output
    """
    print("q shape {}\nk shape {}\n v shape {}" .format(q.shape, k.shape, v.shape))
    q = self.w_q(q)
    k = self.w_k(k)
    v = self.w_v(v)
    print("AFTER Dense\n  q shape {}\n  k shape {}\n  v shape {}" .format(q.shape, k.shape, v.shape))

    q = self.splitHeads(q)
    k = self.splitHeads(k)
    v = self.splitHeads(v)
    print("AFTER SPLIT\n  q shape {}\n  k shape {}\n  v shape {}" .format(q.shape, k.shape, v.shape))

    context_vector = self.ScaledDotProductAttention(q, k, v, sequence_mask)
    print("context_vector shape :", context_vector.shape)

    context_vector = tf.transpose(context_vector, perm=[0,2,1,3])
    print("context_vector  transposed shape :", context_vector.shape)
    context_vector = tf.reshape(context_vector, (context_vector.shape[0], context_vector.shape[1], self.d_model))
    print("context_vector  reshapeed shape :", context_vector.shape)

    return self.outputLayer(context_vector)

embed_size = 10; max_steps = 3; vocab_size = 100

q = tf.random.uniform((1, max_steps, embed_size))  # shape [batch_size, sequence_len, embedding_size]
mhatt = MultiHeadAttentionLayer(embed_size, 5)
mhatt_output = mhatt(q, k=q, v=q, sequence_mask=None)

q shape (1, 3, 10)
k shape (1, 3, 10)
 v shape (1, 3, 10)
AFTER Dense
  q shape (1, 3, 10)
  k shape (1, 3, 10)
  v shape (1, 3, 10)
AFTER SPLIT
  q shape (1, 5, 3, 2)
  k shape (1, 5, 3, 2)
  v shape (1, 5, 3, 2)
None
context_vector shape : (1, 5, 3, 2)
context_vector  transposed shape : (1, 3, 5, 2)
context_vector  reshapeed shape : (1, 3, 10)


In [0]:
def feedForwardnetwork(dff, d_model):
  """
  according to paper dff=2048 and d_model =512
  but d_model should be same as embedding_size/d_model in MultiHeadAttention
  ffn(x) = max(0, xW_1 + b+1)W_2 + b_2
  where max(0, ...) -> relu activation
  """
  ffNetwork = tf.keras.Sequential()
  ffNetwork.add(tf.keras.layers.Dense(dff, activation="relu"))
  ffNetwork.add(tf.keras.layers.Dense(d_model))
  return ffNetwork

def makeSequenceMask(seq_len):
  """
  mask should be size [1, 1, seq_len, seq_len]
  first two sizes are batch_szie, num_heads to make this matrix broadcastable
  it should be in form 
  [
    [0, 1, 1, 1]
    [0, 0, 1, 1]
    [0, 0, 0, 1]
    [0, 0, 0, 0]
  ]
  """
  mask_array = np.ones((seq_len, seq_len))
  mask_array = np.triu(mask_array, 1)
  return tf.reshape(mask_array, (1, 1, *mask_array.shape))


In [0]:
class Encoder(tf.keras.Model):
  def __init__(self, embedding_size, max_sentence_len, vocab_size, blocks_amount, heads_number):
    super(Encoder, self).__init__()

    assert (embedding_size//heads_number)%2==0
    self.d_model = embedding_size
    self.blocks_amount = blocks_amount
    self.seq_len = max_sentence_len
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
    self.positionalEncoding = PositionalEncodingLayer(embedding_size, max_sentence_len)
    self.multiHeadAttention = MultiHeadAttentionLayer(embedding_size, heads_number)
    self.ffNetwork = feedForwardnetwork(1024, self.d_model)

    self.w_q = tf.keras.layers.Dense(self.d_model)
    self.w_k = tf.keras.layers.Dense(self.d_model)
    self.w_v = tf.keras.layers.Dense(self.d_model)

    self.normalizationFirst = tf.keras.layers.LayerNormalization()
    self.normalizationSecond = tf.keras.layers.LayerNormalization()
  
  def call(self, sequence):
    # sequence shape [batch_size, max_sentence_len]
    embedded_seq = self.embedding(sequence)
    # embedded_seq shape [batch_szie, max_sentence_len, embedding_size]
    block_input = self.positionalEncoding(embedded_seq)
    #------------------------- loop though all blocks -------------------------
    for i in range(self.blocks_amount):
      shortcut_data = block_input
      # shortcut_data shape [batch_szie, max_sentence_len, embedding_size]

      q = self.w_q(block_input)
      k = self.w_k(block_input)
      v = self.w_v(block_input)

      mask = makeSequenceMask(self.seq_len)
      
      # mhatt_output shape [batch_size, max_sentence_len, embedding_size]
      mhatt_output = self.multiHeadAttention(v, k, q, mask)
      print("mhatt_output shape ", mhatt_output.shape)

      # add & Norm
      mhatt_output += shortcut_data
      mhatt_output = self.normalizationFirst(mhatt_output)

      shortcut_data = mhatt_output
      
      # put Feed forward ntwork here
      ffn_output = self.ffNetwork(mhatt_output)
      ffn_output += shortcut_data
      block_output = self.normalizationSecond(ffn_output)
      block_input = block_output

    return block_output

In [0]:
data = np.ones((32, max_steps))
print("input shape ", data.shape)
blocks_amount = 2
heads = 5
encoder = Encoder(embed_size, max_steps, vocab_size, blocks_amount, heads)
encoder_out  = encoder(data)
print("max_steps {}\nembedding_size/d_model {}\nvocab_size {}\nheads_number {}\nblocks_amount {}" .format(max_steps, embed_size, vocab_size, heads, blocks_amount))
print(encoder_out.shape)

input shape  (32, 3)


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

q shape (32, 3, 10)
k shape (32, 3, 10)
 v shape (32, 3, 10)
AFTER Dense
  q shape (32, 3, 10)
  k shape (32, 3, 10)
  v shape (32, 3, 10)
AFTER SPLIT
  q shape (32, 5, 3, 2)
  k shape (32, 5, 3, 2)
  v shape (32, 5, 3, 2)
tf.Tensor(
[[[[0. 1. 1.]
   [0. 0. 1.]
   [0. 0. 0.]]]], shape=(1, 1, 3, 3), dtype=float64)


ValueError: ignored

In [0]:
"""
Decoder flow :

- Embedding 
- Positional Encoding
- Input = Embedding + Positional Encoding
--------------------REPEAT N Times--------------------
- Masked Multi-head Attention layer
- Input + Masked Multi-Head Attention layer added together 
- previous Normalized (1) 
- Multi-head Attention layer v, k from Encoder output | q from previous point
- (1) + Multi-head Attention layer added together
- previous normalized
- Feed Forward Network (2)
- (1) added to (2) and Normalized
------------------------------------------------------
- Decoder output
- Linear layer
- softmax
"""

'\nDecoder flow :\n\n- Embedding \n- Positional Encoding\n- Input = Embedding + Positional Encoding\n--------------------REPEAT N Times--------------------\n- Masked Multi-head Attention layer\n- Input + Masked Multi-Head Attention layer added together \n- previous Normalized (1) \n- Multi-head Attention layer v, k from Encoder output | q from previous point\n- (1) + Multi-head Attention layer added together\n- previous normalized\n- Feed Forward Network (2)\n- (1) added to (2) and Normalized\n------------------------------------------------------\n- Decoder output\n- Linear layer\n- softmax\n'

<tf.Tensor: id=505, shape=(1, 1, 6, 6), dtype=float64, numpy=
array([[[[0., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1.],
         [0., 0., 0., 1., 1., 1.],
         [0., 0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0.]]]])>