In [81]:
import tensorflow as tf

In [165]:
def scaled_dot_product_attention(q, k, v, mask):
    """Calculate the attention weights.
    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type(padding or look ahead) 
    but it must be broadcastable for addition.
    
    Args:
      q: query shape == (..., seq_len_q, depth)
      k: key shape == (..., seq_len_k, depth)
      v: value shape == (..., seq_len_v, depth_v)
      mask: Float tensor with shape broadcastable 
            to (..., seq_len_q, seq_len_k). Defaults to None.
      
    Returns:
      output, attention_weights
    """
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    
    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  
 
    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)
 
    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
 
    return output, attention_weights

In [265]:
class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    
    assert d_model % self.num_heads == 0
    
    self.depth = d_model // self.num_heads
    
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    
    self.dense = tf.keras.layers.Dense(d_model)
        
  def split_heads(self, x, batch_size):
    """Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])
    
  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]
    
    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)
    
    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
    
    # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    scaled_attention, attention_weights = scaled_dot_product_attention(
        q, k, v, mask)
    
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

    concat_attention = tf.reshape(scaled_attention, 
                                  (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        
    return output, attention_weights

In [266]:

class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    
  def call(self, x, training, mask):

    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
    
    ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
    
    return out2

In [267]:
def point_wise_feed_forward_network(d_model, dff):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  ])

In [268]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)
 
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)
    print('dec,',x.shape, 'enc_out', enc_output.shape)
    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
    attn1 = self.dropout1(attn1, training=training)
    print('devc2,',attn1.shape, 'dec', x.shape)
    out1 = self.layernorm1(attn1 + x)
    
    attn2, attn_weights_block2 = self.mha2(
        enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
    
    ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
    
    return out3, attn_weights_block1, attn_weights_block2

In [269]:
#to add x + attn1, attn1 must have shape of input data

In [270]:
class Transformer(tf.keras.Model):
  def __init__(self,
                 #d_input: int,
                 d_model: int,
                 d_output: int,
                 num_heads: int,
                 num_layers: int,
                 #attention_size: int = None,
                 dropout: float = 0.3,
                 dff: int = 2048,
                 pe: str = None):
    super(Transformer, self).__init__()

    self.encoder = Encoder(d_model, num_heads, num_layers,dff,
                            dropout=dropout)#attention_size=attention_size,

    self.decoder = Decoder(d_model, num_heads, num_layers,dff,
                            dropout=dropout)#attention_size=attention_size,

    self.first_layer = tf.keras.layers.Dense(d_model)
    self.final_layer = tf.keras.layers.Dense(d_output)
    
  def call(self, inp, tar, training, enc_padding_mask, 
           look_ahead_mask, dec_padding_mask):
    
    print(inp.shape)
    linear_layer_out = self.first_layer(inp) #nn.Linear(d_input, d_model)
    print(linear_layer_out.shape)
    enc_output = self.encoder(linear_layer_out, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
    print(enc_output.shape)
    # dec_output.shape == (batch_size, tar_seq_len, d_model)
    dec_output, attention_weights = self.decoder(
        tar, enc_output, training, look_ahead_mask, dec_padding_mask)
    
    final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
    
    return final_output, attention_weights

In [271]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self,
                 d_model: int,
                 num_heads: int,
                 num_layers: int,
                 dff: int,
                 dropout: float = 0.3):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    #self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    #self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
    
    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, dropout) 
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout)
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):

    seq_len = tf.shape(x)[1]
    attention_weights = {}
    
    #x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    #x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    #x += self.pos_encoding[:, :seq_len, :]
    
    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                             look_ahead_mask, padding_mask)
      
      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    
    # x.shape == (batch_size, target_seq_len, d_model)
    return x, attention_weights

In [272]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self,
                 d_model: int,
                 num_heads: int,
                 num_layers: int,
                 dff: int,
                 dropout: float = 0.3):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    #self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    #self.pos_encoding = positional_encoding(maximum_position_encoding, 
    #                                        self.d_model)
    
    
    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, dropout) 
                       for _ in range(num_layers)]
  
    self.dropout = tf.keras.layers.Dropout(dropout)
        
  def call(self, x, training, mask):

    seq_len = tf.shape(x)[1]
    
    # adding embedding and position encoding.
    #x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    #x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    #x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)
    
    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)
    
    return x  # (batch_size, input_seq_len, d_model)

In [294]:
batch_size, timesteps, input_dim = None, 20, 10
out_time=1
out_dim = 10
def get_x_y(size=1000):
    import numpy as np
    pos_indices = np.random.choice(size, size=int(size // 2), replace=False)
    x_train = np.zeros(shape=(size, timesteps, input_dim))
    y_train = np.zeros(shape=(size, out_time, out_dim))
    x_train[pos_indices, 0] = 1.0
    y_train[pos_indices, 0] = 1.0
    return x_train, y_train
x, y = get_x_y()


In [295]:
#BATCH_SIZE = 8
#NUM_WORKERS = 0
LR = 2e-4
EPOCHS = 30

# Model parameters
d_model = 10 # Lattent dim
num_heads = 2 # Number of heads
num_layers = 4 # Number of encoder and decoder to stack
#attention_size = 12 # Attention window size
dropout = 0.2 # Dropout rate
pe = None # Positional encoding
chunk_mode = None

d_input = input_dim # From dataset
d_output = input_dim # From dataset#
net = Transformer( d_model, d_output, num_heads, num_layers, dropout=dropout, pe=pe)

In [296]:
x.shape

(1000, 20, 10)

In [297]:
y.shape


(1000, 1, 10)

In [298]:
net(x, y, training=False, 
                               enc_padding_mask=None, 
                               look_ahead_mask=None,
                               dec_padding_mask=None)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

(1000, 20, 10)
(1000, 20, 10)
(1000, 20, 10)


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

dec, (1000, 1, 10) enc_out (1000, 20, 10)
devc2, (1000, 1, 10) dec (1000, 1, 10)
dec, (1000, 1, 10) enc_out (1000, 20, 10)
devc2, (1000, 1, 10) dec (1000, 1, 10)
dec, (1000, 1, 10) enc_out (1000, 20, 10)
devc2, (1000, 1, 10) dec (1000, 1, 10)
dec, (1000, 1, 10) enc_out (1000, 20, 10)
devc2, (1000, 1, 10) dec (1000, 1, 10)


(<tf.Tensor: shape=(1000, 1, 10), dtype=float32, numpy=
 array([[[ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ]],
 
        [[ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ]],
 
        [[ 0.        ,  0.        ,  0.        , ...,  0.        ,
           0.        ,  0.        ]],
 
        ...,
 
        [[ 0.73731077, -0.33892933,  1.4367894 , ..., -1.034082  ,
           0.34154773,  0.71647257]],
 
        [[ 0.73731077, -0.33892933,  1.4367894 , ..., -1.034082  ,
           0.34154773,  0.71647257]],
 
        [[ 0.73731077, -0.33892933,  1.4367894 , ..., -1.034082  ,
           0.34154773,  0.71647257]]], dtype=float32)>,
 {'decoder_layer1_block1': <tf.Tensor: shape=(1000, 2, 1, 1), dtype=float32, numpy=
  array([[[[1.]],
  
          [[1.]]],
  
  
         [[[1.]],
  
          [[1.]]],
  
  
         [[[1.]],
  
          [[1.]]],
  
  
         ...,
  
  
         [[[1.]],
  
        

In [299]:
net.fit(x,y )

RuntimeError: You must compile your model before training/testing. Use `model.compile(optimizer, loss)`.