# Transformer Notebook
Goal: Using SMILES of known CB1 ligands from the ChEMBL database, train an unsupervised Transformer to learn embeddings for these tokenized SMILES sequences to be used in other deep learning models

## Data Preparation

In [141]:
cb1_smiles = open("./data/CB1_SMILES.txt", 'r')
cb1_smiles = cb1_smiles.read().splitlines()

len(cb1_smiles), cb1_smiles[0]

(3151, 'N=C(NS(=O)(=O)c1ccc(F)cc1)N1CC(c2ccccc2)C(c2ccc(Cl)cc2)=N1')

In [2]:
from tokenizer import SmilesTokenizer

tokenizer = SmilesTokenizer("./data/vocab.txt")
print( cb1_smiles[0], len(cb1_smiles[0]) )
print( tokenizer.encode(cb1_smiles[0]), len(tokenizer.encode(cb1_smiles[0])) )

N=C(NS(=O)(=O)c1ccc(F)cc1)N1CC(c2ccccc2)C(c2ccc(Cl)cc2)=N1 58
[12, 23, 22, 16, 17, 23, 34, 17, 22, 19, 18, 17, 22, 19, 18, 15, 20, 15, 15, 15, 17, 27, 18, 15, 15, 20, 18, 23, 20, 16, 16, 17, 15, 21, 15, 15, 15, 15, 15, 21, 18, 16, 17, 15, 21, 15, 15, 15, 17, 28, 18, 15, 15, 21, 18, 22, 23, 20, 13] 59


In [3]:
for ts in tokenizer.encode(cb1_smiles[0]):
  print ('{} ----> {}'.format(ts, tokenizer.decode([ts])))

12 ----> [CLS]
23 ----> N
22 ----> =
16 ----> C
17 ----> (
23 ----> N
34 ----> S
17 ----> (
22 ----> =
19 ----> O
18 ----> )
17 ----> (
22 ----> =
19 ----> O
18 ----> )
15 ----> c
20 ----> 1
15 ----> c
15 ----> c
15 ----> c
17 ----> (
27 ----> F
18 ----> )
15 ----> c
15 ----> c
20 ----> 1
18 ----> )
23 ----> N
20 ----> 1
16 ----> C
16 ----> C
17 ----> (
15 ----> c
21 ----> 2
15 ----> c
15 ----> c
15 ----> c
15 ----> c
15 ----> c
21 ----> 2
18 ----> )
16 ----> C
17 ----> (
15 ----> c
21 ----> 2
15 ----> c
15 ----> c
15 ----> c
17 ----> (
28 ----> Cl
18 ----> )
15 ----> c
15 ----> c
21 ----> 2
18 ----> )
22 ----> =
23 ----> N
20 ----> 1
13 ----> [SEP]


In [4]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences

tokenized_smiles = []

for smiles in cb1_smiles:
    tokenized_smiles.append( tokenizer.encode(smiles) )

len(tokenized_smiles), len(tokenized_smiles[0]), len(tokenized_smiles[1])

(3151, 59, 69)

In [5]:
padded_smiles = pad_sequences(tokenized_smiles, padding='post')
padded_smiles = tf.cast(padded_smiles, dtype="float32")
padded_smiles = np.stack(padded_smiles, axis=0)

padded_smiles.shape, padded_smiles[0].shape

((3151, 123), (123,))

## Model Building

#### Positional Encoding

In [6]:
def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  return pos * angle_rates

def positional_encoding(position, d_model):
  angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)
  
  # apply sin to even indices in the array; 2i
  angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
  
  # apply cos to odd indices in the array; 2i+1
  angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    
  pos_encoding = angle_rads[np.newaxis, ...]
    
  return tf.cast(pos_encoding, dtype=tf.float32)

#### Mask Creation

In [7]:
def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  
  # add extra dimensions to add the padding
  # to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

In [8]:
def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return mask  # (seq_len, seq_len)

#### Attention Layers

In [9]:
def scaled_dot_product_attention(q, k, v, mask):
  """Calculate the attention weights.
  q, k, v must have matching leading dimensions.
  k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
  The mask has different shapes depending on its type(padding or look ahead) 
  but it must be broadcastable for addition.
  
  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable 
          to (..., seq_len_q, seq_len_k). Defaults to None.
    
  Returns:
    output, attention_weights
  """

  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
  
  # scale matmul_qk
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # add the mask to the scaled tensor.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)  

  # softmax is normalized on the last axis (seq_len_k) so that the scores
  # add up to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    
    assert d_model % self.num_heads == 0
    
    self.depth = d_model // self.num_heads
    
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    
    self.dense = tf.keras.layers.Dense(d_model)
        
  def split_heads(self, x, batch_size):
    """Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])
    
  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]
    
    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)
    
    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
    
    # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    scaled_attention, attention_weights = scaled_dot_product_attention(
        q, k, v, mask)
    
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

    concat_attention = tf.reshape(scaled_attention, 
                                  (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        
    return output, attention_weights

#### Encoder + Decoder

In [10]:
def point_wise_feed_forward_network(d_model, dff):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  ])

In [12]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    
  def call(self, x, training, mask):

    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
    
    ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
    
    return out2

In [13]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)
 
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)

    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)
    
    attn2, attn_weights_block2 = self.mha2(
        enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
    
    ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
    
    return out3, attn_weights_block1, attn_weights_block2

In [14]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, 
                                            self.d_model)
    
    
    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
  
    self.dropout = tf.keras.layers.Dropout(rate)
        
  def call(self, x, training, mask):

    seq_len = tf.shape(x)[1]
    
    # adding embedding and position encoding.
    x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)
    
    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)
    
    return x  # (batch_size, input_seq_len, d_model)

In [15]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
    
    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):

    seq_len = tf.shape(x)[1]
    attention_weights = {}
    
    x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]
    
    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                             look_ahead_mask, padding_mask)
      
      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    
    # x.shape == (batch_size, target_seq_len, d_model)
    return x, attention_weights

#### Full Model

In [16]:
class Transformer(tf.keras.Model):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               target_vocab_size, pe_input, pe_target, rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                           input_vocab_size, pe_input, rate)

    self.decoder = Decoder(num_layers, d_model, num_heads, dff, 
                           target_vocab_size, pe_target, rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    
  def call(self, inp, tar, training, enc_padding_mask, 
           look_ahead_mask, dec_padding_mask):

    enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
    
    # dec_output.shape == (batch_size, tar_seq_len, d_model)
    dec_output, attention_weights = self.decoder(
        tar, enc_output, training, look_ahead_mask, dec_padding_mask)
    
    final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
    
    return final_output, attention_weights

## Training

In [17]:
num_layers = 4
d_model = 32
dff = 512
num_heads = 8

input_vocab_size = 510
target_vocab_size = 510
dropout_rate = 0.2

In [18]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    
    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps
    
  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [19]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.legacy.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [20]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  
  return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [21]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
    name='train_accuracy')

In [22]:
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, target_vocab_size, 
                          pe_input=input_vocab_size, 
                          pe_target=target_vocab_size,
                          rate=dropout_rate)

In [23]:
def create_masks(inp, tar):
  # Encoder padding mask
  enc_padding_mask = create_padding_mask(inp)
  
  # Used in the 2nd attention block in the decoder.
  # This padding mask is used to mask the encoder outputs.
  dec_padding_mask = create_padding_mask(inp)
  
  # Used in the 1st attention block in the decoder.
  # It is used to pad and mask future tokens in the input received by 
  # the decoder.
  look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
  dec_target_padding_mask = create_padding_mask(tar)
  combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
  
  return enc_padding_mask, combined_mask, dec_padding_mask

In [24]:
checkpoint_path = "./checkpoints/train"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

Latest checkpoint restored!!


In [24]:
EPOCHS = 100

# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]
  
  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  
  with tf.GradientTape() as tape:
    predictions, _ = transformer(inp, tar_inp, 
                                 True, 
                                 enc_padding_mask, 
                                 combined_mask, 
                                 dec_padding_mask)
    loss = loss_function(tar_real, predictions)

  gradients = tape.gradient(loss, transformer.trainable_variables)    
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
    
  train_loss(loss)
  train_accuracy(tar_real, predictions)

In [25]:
import time

batch_size = 64

for epoch in range(EPOCHS):
  start = time.time()
  
  train_loss.reset_states()
  train_accuracy.reset_states()

  for batch in range(batch_size, padded_smiles.shape[0], batch_size):
    inp = padded_smiles[batch - batch_size : batch]
    train_step(inp, inp)
    
    if batch % batch_size == 0:
      print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
          epoch + 1, batch, train_loss.result(), train_accuracy.result()))
      
  if (epoch + 1) % 5 == 0:
    ckpt_save_path = ckpt_manager.save()
    print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                         ckpt_save_path))
    
  print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))

  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))
  np.random.shuffle(padded_smiles)

Epoch 1 Batch 64 Loss 6.1865 Accuracy 0.0005
Epoch 1 Batch 128 Loss 6.1924 Accuracy 0.0003
Epoch 1 Batch 192 Loss 6.2112 Accuracy 0.0002
Epoch 1 Batch 256 Loss 6.2067 Accuracy 0.0003
Epoch 1 Batch 320 Loss 6.2115 Accuracy 0.0002
Epoch 1 Batch 384 Loss 6.2147 Accuracy 0.0002
Epoch 1 Batch 448 Loss 6.2116 Accuracy 0.0002
Epoch 1 Batch 512 Loss 6.2088 Accuracy 0.0002
Epoch 1 Batch 576 Loss 6.2089 Accuracy 0.0002
Epoch 1 Batch 640 Loss 6.2074 Accuracy 0.0002
Epoch 1 Batch 704 Loss 6.2074 Accuracy 0.0002
Epoch 1 Batch 768 Loss 6.2067 Accuracy 0.0002
Epoch 1 Batch 832 Loss 6.2068 Accuracy 0.0002
Epoch 1 Batch 896 Loss 6.2057 Accuracy 0.0002
Epoch 1 Batch 960 Loss 6.2032 Accuracy 0.0002
Epoch 1 Batch 1024 Loss 6.2010 Accuracy 0.0002
Epoch 1 Batch 1088 Loss 6.1999 Accuracy 0.0002
Epoch 1 Batch 1152 Loss 6.2005 Accuracy 0.0002
Epoch 1 Batch 1216 Loss 6.1989 Accuracy 0.0002
Epoch 1 Batch 1280 Loss 6.1973 Accuracy 0.0002
Epoch 1 Batch 1344 Loss 6.1948 Accuracy 0.0002
Epoch 1 Batch 1408 Loss 6.193

## Prediction

#### Generation

In [138]:
def generate_smiles(transformer):
    start_token = 12
    end_token = 13

    input_sequence = np.random.randint(0, 60, size=(1, 123))
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(input_sequence, input_sequence)
    enc_output = transformer.encoder(input_sequence, False, enc_padding_mask)

    generated_smiles = []
    for _ in range(123):
        input_sequence = np.random.randint(0, 510, size=(1, 123))
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(input_sequence, input_sequence)
        enc_output = transformer.encoder(input_sequence, False, enc_padding_mask)
    
        decoder_output, _ = transformer.decoder(input_sequence, enc_output, False, combined_mask, dec_padding_mask)
        final_output = transformer.final_layer(decoder_output)
        
        predicted_token = np.argmax(final_output[:, -1, :], axis=-1)
        # print(predicted_token)
        generated_smiles.append(predicted_token[0])

        if predicted_token[0] == end_token:
            break

        input_sequence = generated_smiles
        input_sequence = pad_sequences([input_sequence], maxlen=123, padding='post')
        input_sequence = tf.cast(input_sequence, dtype="float32")       

    return generated_smiles

input_sequence = np.random.randint(0, 510, size=(1, 123))
enc_padding_mask, combined_mask, dec_padding_mask = create_masks(input_sequence, input_sequence)
enc_output = transformer.encoder(input_sequence, False, enc_padding_mask)
decoder_output, _ = transformer.decoder(input_sequence, enc_output, False, combined_mask, dec_padding_mask)
final_output = transformer.final_layer(decoder_output)

smiles = []
for row in final_output[0]:
    smiles.append(np.argmax(row))

print(smiles)
print( tokenizer.decode(smiles).replace(' ', '') )

# for _ in range(50):
#     print( tokenizer.decode(generate_smiles(transformer)).replace(' ', '') )

[17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 34, 17, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 34, 17, 17, 17, 17, 17, 17, 17, 17, 17, 34, 34, 17, 34, 34, 34, 34, 17, 34, 34, 34, 17, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 28, 28, 28, 28, 34, 28, 28, 28, 28, 28, 34, 34, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 13, 13, 28, 28, 28, 28, 28, 13, 28, 28, 28, 28, 28, 28, 28, 28]
(((((((((((((((((((((((((((((((((((S()(((((((((((S(((((((((SS(SSSS(SSS(SSSSSSSSSSSSSClClClClSClClClClClSSClClClClClClClClClClCl[SEP][SEP]ClClClClCl[SEP]ClClClClClClClCl


#### Using Full Transformer

In [24]:
test_smiles = "COc1cccc2c3c(=O)n([C@@H]4C(C)(C)[C@@H]5CC[C@@]4(C)C5)ccc3n(CCN3CCOCC3)c12"

In [25]:
from tokenizer import SmilesTokenizer

tokenizer = SmilesTokenizer("./data/vocab.txt")
tokenized_test = tokenizer.encode(test_smiles)
tokenized_test_padded = pad_sequences([tokenized_test], maxlen=123, padding='post')
tokenized_test_tensor = tf.constant(tokenized_test_padded)

tokenized_test_tensor

<tf.Tensor: shape=(1, 123), dtype=int32, numpy=
array([[12, 16, 19, 15, 20, 15, 15, 15, 15, 21, 15, 26, 15, 17, 22, 19,
        18, 25, 17, 35, 32, 16, 17, 16, 18, 17, 16, 18, 35, 43, 16, 16,
        56, 32, 17, 16, 18, 16, 43, 18, 15, 15, 15, 26, 25, 17, 16, 16,
        23, 26, 16, 16, 19, 16, 16, 26, 18, 15, 20, 21, 13,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)>

In [26]:
enc_padding_mask, combined_mask, dec_padding_mask = create_masks(tokenized_test_tensor, tokenized_test_tensor)

predictions, _ = transformer(tokenized_test_tensor, tokenized_test_tensor, 
                                False, 
                                enc_padding_mask, 
                                combined_mask, 
                                dec_padding_mask)

In [27]:
preds = []

for token in range(123):
    preds.append( int( tf.argmax( predictions[:, token, :], axis=-1 )[0] ) )
pred_smiles = tokenizer.decode(preds).replace(' ', '')

pred_smiles[:pred_smiles.find('[SEP]')], test_smiles

('COc1ccc22c3c(=O)n([C@@H]4C(C)(C)[C@@H]4CC[C@@]4(C)C5)ccccn(CCN3CCOC33)c12',
 'COc1cccc2c3c(=O)n([C@@H]4C(C)(C)[C@@H]5CC[C@@]4(C)C5)ccc3n(CCN3CCOCC3)c12')

#### Encoder Only

In [28]:
from tokenizer import SmilesTokenizer

test_smiles = "COc1cccc2c3c(=O)n([C@@H]4C(C)(C)[C@@H]5CC[C@@]4(C)C5)ccc3n(CCN3CCOCC3)c12"
tokenizer = SmilesTokenizer("./data/vocab.txt")
tokenizer = SmilesTokenizer("./data/vocab.txt")
tokenized_test = tokenizer.encode(test_smiles)
tokenized_test_padded = pad_sequences([tokenized_test], maxlen=123, padding='post')
tokenized_test_tensor = tf.constant(tokenized_test_padded)

enc_padding_mask, combined_mask, dec_padding_mask = create_masks(tokenized_test_tensor, tokenized_test_tensor)

test_smiles, tokenized_test_tensor

('COc1cccc2c3c(=O)n([C@@H]4C(C)(C)[C@@H]5CC[C@@]4(C)C5)ccc3n(CCN3CCOCC3)c12',
 <tf.Tensor: shape=(1, 123), dtype=int32, numpy=
 array([[12, 16, 19, 15, 20, 15, 15, 15, 15, 21, 15, 26, 15, 17, 22, 19,
         18, 25, 17, 35, 32, 16, 17, 16, 18, 17, 16, 18, 35, 43, 16, 16,
         56, 32, 17, 16, 18, 16, 43, 18, 15, 15, 15, 26, 25, 17, 16, 16,
         23, 26, 16, 16, 19, 16, 16, 26, 18, 15, 20, 21, 13,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)>)

In [29]:
enc_output = transformer.encoder(tokenized_test_tensor, False, enc_padding_mask)

enc_output, enc_output.shape

(<tf.Tensor: shape=(1, 123, 32), dtype=float32, numpy=
 array([[[-7.5602907e-01,  9.8976588e-01,  3.8249230e-01, ...,
           8.8623172e-01, -7.7111667e-01,  4.2068535e-01],
         [-4.5703232e-01, -4.1441080e-01, -8.0486238e-02, ...,
           3.2696372e-01,  1.2151197e+00, -5.1542324e-01],
         [-2.9639983e-01, -4.8903784e-01,  1.1130489e+00, ...,
           8.6220801e-01,  2.9730543e-01, -1.8603158e-01],
         ...,
         [ 3.2808669e-03, -6.5462244e-01, -1.0051614e-01, ...,
           2.8973863e-01, -2.6029491e-01,  4.2959553e-01],
         [-7.1666911e-03, -7.1080154e-01, -5.6212269e-02, ...,
           2.3785900e-01, -2.8803337e-01,  4.1768509e-01],
         [-5.7289515e-02, -7.3378289e-01,  1.8717349e-04, ...,
           2.0797640e-01, -3.1194839e-01,  3.9206657e-01]]], dtype=float32)>,
 TensorShape([1, 123, 32]))

#### Decoder Only

In [60]:
from tokenizer import SmilesTokenizer

test_smiles = "COc1cccc2c3c(=O)n([C@@H]4C(C)(C)[C@@H]5CC[C@@]4(C)C5)ccc3n(CCN3CCOCC3)c12"
tokenizer = SmilesTokenizer("./data/vocab.txt")
tokenized_test = tokenizer.encode(test_smiles)
tokenized_test_padded = pad_sequences([tokenized_test], maxlen=123, padding='post')
tokenized_test_tensor = tf.constant(tokenized_test_padded)

enc_padding_mask, combined_mask, dec_padding_mask = create_masks(tokenized_test_tensor, tokenized_test_tensor)
enc_output = transformer.encoder(tokenized_test_tensor, False, enc_padding_mask)

enc_output.shape

TensorShape([1, 123, 32])

In [65]:
print(tokenized_test_tensor.shape, enc_output.shape, combined_mask.shape, dec_padding_mask.shape)

dec_output, attention_weights = transformer.decoder(tokenized_test_tensor, enc_output, False, combined_mask, dec_padding_mask)

print(tokenized_test_tensor.shape)

final_output = transformer.final_layer(dec_output)

final_output

(1, 123) (1, 123, 32) (1, 1, 123, 123) (1, 1, 1, 123)
(1, 123)


<tf.Tensor: shape=(1, 123, 510), dtype=float32, numpy=
array([[[-17.852684 , -18.149971 , -18.008026 , ..., -17.878656 ,
         -17.98216  , -17.889906 ],
        [-19.29191  , -19.641819 , -19.401522 , ..., -19.469503 ,
         -19.326366 , -19.449482 ],
        [-20.745901 , -20.88299  , -20.733458 , ..., -20.643974 ,
         -20.70992  , -20.718563 ],
        ...,
        [-14.977754 , -15.05572  , -14.617012 , ..., -14.735315 ,
         -14.626514 , -14.753969 ],
        [-15.126015 , -15.224129 , -14.801893 , ..., -14.895902 ,
         -14.794842 , -14.946787 ],
        [-15.134567 , -15.252741 , -14.823038 , ..., -14.926474 ,
         -14.8180065, -14.976839 ]]], dtype=float32)>

In [32]:
preds = []

for token in range(123):
    preds.append( int( tf.argmax( final_output[:, token, :], axis=-1 )[0] ) )
pred_smiles = tokenizer.decode(preds).replace(' ', '')

pred_smiles[:pred_smiles.find('[SEP]')], test_smiles

('COc1ccc22c3c(=O)n([C@@H]4C(C)(C)[C@@H]4CC[C@@]4(C)C5)ccccn(CCN3CCOC33)c12',
 'COc1cccc2c3c(=O)n([C@@H]4C(C)(C)[C@@H]5CC[C@@]4(C)C5)ccc3n(CCN3CCOCC3)c12')

## Dataset Creation

In [33]:
cb2_smiles = open('./data/X_SMILES.txt', 'r')
cb2_smiles = cb2_smiles.read().splitlines()

cb2_smiles[:5], len(cb2_smiles)

(['CCCCC/C=C\\C/C=C\\C/C=C\\C/C=C\\CCCC(=O)NCCc1ccoc1',
  'CCCCC/C=C\\C/C=C\\C/C=C\\C/C=C\\CCCC(=O)NCc1ccoc1',
  'CCCCC/C=C\\C/C=C\\C/C=C\\C/C=C\\CCCC(=O)NCc1cccn1C',
  'Cc1c(C(=O)c2cccc3ccccc23)c2cccc3c2n1[C@H](CN1CCOCC1)CO3',
  'COc1ccccc1CNC(=O)c1nn(CCN2CCOCC2)c2c(OC)cccc12'],
 2723)

In [142]:
d2_smiles = open('./data/D2_SMILES.txt', 'r')
d2_smiles = d2_smiles.read().splitlines()

d2_smiles[:5], len(d2_smiles)

(['NC(=O)[C@H]1CS[C@@H]2CC[C@]3(CCCN3C(=O)[C@@H]3CCCN3)C(=O)N12',
  'CC1Cc2cccc3c2N1C(=O)C(N1CCN(Cc2ccc(Cl)cc2)CC1)CC3',
  'CC1(C)Cc2cccc3c2N1C(=O)C(N1CCN(Cc2ccc(Cl)cc2)CC1)CC3',
  'Nc1cccc(-c2ccc(CCN3CCN(c4cccc5cccnc45)CC3)cc2)n1',
  'Cc1ccc(CN2CCN(C3CCc4cccc5c4N(CC5)C3=O)CC2)cc1'],
 7252)

In [34]:
from tokenizer import SmilesTokenizer
tokenizer = SmilesTokenizer("./data/vocab.txt")

TRAINING = False
LENGTH = 128
DFF = 32
preprocessed_cb2 = []

curr = 1
for smiles in cb2_smiles:
    tokenized = tokenizer.encode(smiles)
    padded = pad_sequences([tokenized], maxlen=LENGTH, padding='post')
    smiles_tensor = tf.constant(padded)
    
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(smiles_tensor, smiles_tensor)
    enc_output = transformer.encoder(smiles_tensor, TRAINING, enc_padding_mask)
    enc_output = enc_output.numpy()
    enc_output = enc_output.reshape((LENGTH, DFF))
    
    preprocessed_cb2.append(enc_output)

    if curr % 100 == 0:
        print(f"{curr} SMILES processed")

    curr += 1

print("Done!")

100 SMILES processed
200 SMILES processed
300 SMILES processed
400 SMILES processed
500 SMILES processed
600 SMILES processed
700 SMILES processed
800 SMILES processed
900 SMILES processed
1000 SMILES processed
1100 SMILES processed
1200 SMILES processed
1300 SMILES processed
1400 SMILES processed
1500 SMILES processed
1600 SMILES processed
1700 SMILES processed
1800 SMILES processed
1900 SMILES processed
2000 SMILES processed
2100 SMILES processed
2200 SMILES processed
2300 SMILES processed
2400 SMILES processed
2500 SMILES processed
2600 SMILES processed
2700 SMILES processed
Done!


In [35]:
preprocessed_cb2 = np.array(preprocessed_cb2)
preprocessed_cb2.shape, preprocessed_cb2[0].shape

((2723, 128, 32), (128, 32))

In [36]:
np.save('./data/processed_cb2', preprocessed_cb2)

In [145]:
from tokenizer import SmilesTokenizer
tokenizer = SmilesTokenizer("./data/vocab.txt")

TRAINING = False
LENGTH = 128
DFF = 32
preprocessed_d2 = []

curr = 1
for smiles in d2_smiles:
    tokenized = tokenizer.encode(smiles)
    padded = pad_sequences([tokenized], maxlen=LENGTH, padding='post')
    smiles_tensor = tf.constant(padded)
    
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(smiles_tensor, smiles_tensor)
    enc_output = transformer.encoder(smiles_tensor, TRAINING, enc_padding_mask)
    enc_output = enc_output.numpy()
    enc_output = enc_output.reshape((LENGTH, DFF))
    
    preprocessed_d2.append(enc_output)

    if curr % 100 == 0:
        print(f"{curr} SMILES processed")

    curr += 1

print("Done!")

100 SMILES processed
200 SMILES processed
300 SMILES processed
400 SMILES processed
500 SMILES processed
600 SMILES processed
700 SMILES processed
800 SMILES processed
900 SMILES processed
1000 SMILES processed
1100 SMILES processed
1200 SMILES processed
1300 SMILES processed
1400 SMILES processed
1500 SMILES processed
1600 SMILES processed
1700 SMILES processed
1800 SMILES processed
1900 SMILES processed
2000 SMILES processed
2100 SMILES processed
2200 SMILES processed
2300 SMILES processed
2400 SMILES processed
2500 SMILES processed
2600 SMILES processed
2700 SMILES processed
2800 SMILES processed
2900 SMILES processed
3000 SMILES processed
3100 SMILES processed
3200 SMILES processed
3300 SMILES processed
3400 SMILES processed
3500 SMILES processed
3600 SMILES processed
3700 SMILES processed
3800 SMILES processed
3900 SMILES processed
4000 SMILES processed
4100 SMILES processed
4200 SMILES processed
4300 SMILES processed
4400 SMILES processed
4500 SMILES processed
4600 SMILES process

In [146]:
preprocessed_d2 = np.array(preprocessed_d2)
preprocessed_d2.shape, preprocessed_d2[0].shape

((7252, 128, 32), (128, 32))

In [147]:
np.save('./data/processed_d2', preprocessed_d2)

## Previous Attempts

In [None]:
from tensorflow.keras.utils import pad_sequences

padded_smiles = pad_sequences(tokenized_smiles, padding='post')
padded_smiles = tf.cast(padded_smiles, dtype="float32")
padded_smiles = np.stack(padded_smiles, axis=0)

padded_smiles.shape

In [None]:
def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  return pos * angle_rates

def positional_encoding(position, d_model):
  angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)
  
  # apply sin to even indices in the array; 2i
  angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
  
  # apply cos to odd indices in the array; 2i+1
  angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    
  pos_encoding = angle_rads[np.newaxis, ...]
    
  return tf.cast(pos_encoding, dtype=tf.float32)

In [None]:
def create_padding_mask(seq):
  seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  
  # add extra dimensions to add the padding
  # to the attention logits.
  return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return mask  # (seq_len, seq_len)

In [None]:
def scaled_dot_product_attention(q, k, v, mask):
  """Calculate the attention weights.
  q, k, v must have matching leading dimensions.
  k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
  The mask has different shapes depending on its type(padding or look ahead) 
  but it must be broadcastable for addition.
  
  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable 
          to (..., seq_len_q, seq_len_k). Defaults to None.
    
  Returns:
    output, attention_weights
  """

  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
  
  # scale matmul_qk
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # add the mask to the scaled tensor.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)  

  # softmax is normalized on the last axis (seq_len_k) so that the scores
  # add up to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model
    
    assert d_model % self.num_heads == 0
    
    self.depth = d_model // self.num_heads
    
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    
    self.dense = tf.keras.layers.Dense(d_model)
        
  def split_heads(self, x, batch_size):
    """Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])
    
  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]
    
    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)
    
    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
    
    # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    scaled_attention, attention_weights = scaled_dot_product_attention(
        q, k, v, mask)
    
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

    concat_attention = tf.reshape(scaled_attention, 
                                  (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        
    return output, attention_weights

In [None]:
def point_wise_feed_forward_network(d_model, dff):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  ])

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    
  def call(self, x, training, mask):

    attn_output, _ = self.mha(x, x, x, mask)  # (batch_size, input_seq_len, d_model)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)  # (batch_size, input_seq_len, d_model)
    
    ffn_output = self.ffn(out1)  # (batch_size, input_seq_len, d_model)
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)  # (batch_size, input_seq_len, d_model)
    
    return out2

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(DecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)
 
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):
    # enc_output.shape == (batch_size, input_seq_len, d_model)

    attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)  # (batch_size, target_seq_len, d_model)
    attn1 = self.dropout1(attn1, training=training)
    out1 = self.layernorm1(attn1 + x)
    
    attn2, attn_weights_block2 = self.mha2(
        enc_output, enc_output, out1, padding_mask)  # (batch_size, target_seq_len, d_model)
    attn2 = self.dropout2(attn2, training=training)
    out2 = self.layernorm2(attn2 + out1)  # (batch_size, target_seq_len, d_model)
    
    ffn_output = self.ffn(out2)  # (batch_size, target_seq_len, d_model)
    ffn_output = self.dropout3(ffn_output, training=training)
    out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, d_model)
    
    return out3, attn_weights_block1, attn_weights_block2

In [None]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, 
                                            self.d_model)
    
    
    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
  
    self.dropout = tf.keras.layers.Dropout(rate)
        
  def call(self, x, training, mask):

    seq_len = tf.shape(x)[1]
    
    # adding embedding and position encoding.
    x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)
    
    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask)
    
    return x  # (batch_size, input_seq_len, d_model)

In [None]:
class Decoder(tf.keras.layers.Layer):
  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
               maximum_position_encoding, rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers
    
    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
    
    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)
    
  def call(self, x, enc_output, training, 
           look_ahead_mask, padding_mask):

    seq_len = tf.shape(x)[1]
    attention_weights = {}
    
    x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]
    
    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x, block1, block2 = self.dec_layers[i](x, enc_output, training,
                                             look_ahead_mask, padding_mask)
      
      attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    
    # x.shape == (batch_size, target_seq_len, d_model)
    return x, attention_weights

In [None]:
class Transformer(tf.keras.Model):
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               target_vocab_size, pe_input, pe_target, rate=0.1):
    super(Transformer, self).__init__()

    self.encoder = Encoder(num_layers, d_model, num_heads, dff, 
                           input_vocab_size, pe_input, rate)

    self.decoder = Decoder(num_layers, d_model, num_heads, dff, 
                           target_vocab_size, pe_target, rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    
  def call(self, inp, tar, training, enc_padding_mask, 
           look_ahead_mask, dec_padding_mask):

    enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
    
    # dec_output.shape == (batch_size, tar_seq_len, d_model)
    dec_output, attention_weights = self.decoder(
        tar, enc_output, training, look_ahead_mask, dec_padding_mask)
    
    final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
    
    return final_output, attention_weights

In [None]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8

input_vocab_size = 510
dropout_rate = 0.2

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.constant(d_model, dtype=tf.float32)
        self.warmup_steps = tf.constant(warmup_steps, dtype=tf.float32)

    def __call__(self, step):
        step = tf.cast(step, dtype=tf.float32)  # Convert step to float32
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)


In [None]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [None]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [None]:
def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask
  
  return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

In [None]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.MeanSquaredError(
    name='train_accuracy')

In [None]:
transformer = Transformer(num_layers, d_model, num_heads, dff,
                          input_vocab_size, input_vocab_size, 
                          pe_input=input_vocab_size, 
                          pe_target=input_vocab_size,
                          rate=dropout_rate)

In [None]:
def create_masks(inp, tar):
  # Encoder padding mask
  enc_padding_mask = create_padding_mask(inp)
  
  # Used in the 2nd attention block in the decoder.
  # This padding mask is used to mask the encoder outputs.
  dec_padding_mask = create_padding_mask(inp)
  
  # Used in the 1st attention block in the decoder.
  # It is used to pad and mask future tokens in the input received by 
  # the decoder.
  look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
  dec_target_padding_mask = create_padding_mask(tar)
  combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
  
  return enc_padding_mask, combined_mask, dec_padding_mask

In [None]:
checkpoint_path = "./checkpoints/train"

ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print ('Latest checkpoint restored!!')

In [None]:
EPOCHS = 20
MAX_LEN = 123

# The @tf.function trace-compiles train_step into a TF graph for faster
# execution. The function specializes to the precise shape of the argument
# tensors. To avoid re-tracing due to the variable sequence lengths or variable
# batch sizes (the last batch is smaller), use input_signature to specify
# more generic shapes.

train_step_signature = [
    tf.TensorSpec(shape=(None, MAX_LEN), dtype=tf.int64),
    tf.TensorSpec(shape=(None, MAX_LEN), dtype=tf.int64),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
  
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)
  
    with tf.GradientTape() as tape:
        predictions, _ = transformer(inp, tar_inp, 
                                     True, 
                                     enc_padding_mask, 
                                     combined_mask, 
                                     dec_padding_mask)
        # Create padding mask (True for non-padded elements, False for padded elements)
        padding_mask = tf.math.logical_not(tf.math.equal(tar_real, 0))
        # Apply the padding mask to the ground truth and predictions
        tar_real_masked = tf.boolean_mask(tar_real, padding_mask)
        predictions_masked = tf.boolean_mask(predictions, padding_mask)
        
        # Calculate the mean squared error with masking
        loss = tf.keras.losses.mean_squared_error(tar_real_masked, predictions_masked)

    gradients = tape.gradient(loss, transformer.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
  
    train_loss(loss)
    train_accuracy(tar_real, predictions)



In [None]:
import time
batch_size = 64

for epoch in range(EPOCHS):
  start = time.time()
  
  train_loss.reset_states()
  train_accuracy.reset_states()

  for i in range(0, len(padded_smiles), batch_size):
    inp_batch = padded_smiles[i : i + batch_size]
    train_step(inp_batch, inp_batch)

    if batch % 50 == 0:
      print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
          epoch + 1, batch, train_loss.result(), train_accuracy.result()))
      
  if (epoch + 1) % 5 == 0:
    ckpt_save_path = ckpt_manager.save()
    print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
                                                         ckpt_save_path))
    
  print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))

  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

# for epoch in range(EPOCHS):
#   start = time.time()
  
#   train_loss.reset_states()
#   train_accuracy.reset_states()
  
#   # inp -> portuguese, tar -> english
#   for (32, (inp)) in enumerate(padded_smiles):
#     print(inp.shape)
      
#     train_step(tf.cast(inp, dtype=tf.int64), tf.cast(inp, dtype=tf.int64))
    
#     if batch % 50 == 0:
#       print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(
#           epoch + 1, batch, train_loss.result(), train_accuracy.result()))
      
#   if (epoch + 1) % 5 == 0:
#     ckpt_save_path = ckpt_manager.save()
#     print ('Saving checkpoint for epoch {} at {}'.format(epoch+1,
#                                                          ckpt_save_path))
    
#   print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
#                                                 train_loss.result(), 
#                                                 train_accuracy.result()))

#   print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

class PositionalEncoding(layers.Layer):
    def __init__(self, d_model, max_len=512):
        super(PositionalEncoding, self).__init__()
        self.encoding = self.positional_encoding(max_len, d_model)

    def positional_encoding(self, max_len, d_model):
        pos = tf.range(max_len, dtype=tf.float32)
        i = tf.range(d_model, dtype=tf.float32)
        angle_rates = 1 / tf.pow(10000, (2 * i // 2) / d_model)
        angles = pos[:, tf.newaxis] * angle_rates[tf.newaxis, :]
        angle_rads = angles * tf.constant(np.pi / 180.0)
        sines = tf.math.sin(angle_rads[:, 0::2])
        cosines = tf.math.cos(angle_rads[:, 1::2])
        pos_encoding = tf.concat([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[tf.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)

    def call(self, x):
        return x + self.encoding[:, :tf.shape(x)[1], :]

In [None]:
class TransformerEncoder(layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout_rate=0.1):
        super(TransformerEncoder, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate

        self.mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.ffn = self.point_wise_feed_forward_network(d_model, d_ff)
        self.layer_norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def point_wise_feed_forward_network(self, d_model, d_ff):
        return tf.keras.Sequential([
            layers.Dense(d_ff, activation='relu'),
            layers.Dense(d_model)
        ])

    def call(self, x):
        attn_output = self.mha(x, x)
        attn_output = self.dropout1(attn_output)
        out1 = self.layer_norm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layer_norm2(out1 + ffn_output)

In [None]:
class TransformerDecoder(layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, target_vocab_size, max_len, dropout_rate=0.1):
        super(TransformerDecoder, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_ff = d_ff
        self.target_vocab_size = target_vocab_size
        self.max_len = max_len
        self.dropout_rate = dropout_rate

        self.mha1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.mha2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.ffn = self.point_wise_feed_forward_network(d_model, d_ff)
        self.layer_norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm3 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)
        self.dropout3 = layers.Dropout(dropout_rate)
        self.embedding = layers.Embedding(target_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)

    def point_wise_feed_forward_network(self, d_model, d_ff):
        return tf.keras.Sequential([
            layers.Dense(d_ff, activation='relu'),
            layers.Dense(d_model)
        ])

    def call(self, x, encoder_output, padding_mask, look_ahead_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = self.positional_encoding(x)

        x = self.dropout1(x)
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout2(attn1)
        out1 = self.layer_norm1(x + attn1)

        attn2, attn_weights_block2 = self.mha2(encoder_output, encoder_output, out1, padding_mask)
        attn2 = self.dropout3(attn2)
        out2 = self.layer_norm2(out1 + attn2)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout2(ffn_output)
        out3 = self.layer_norm3(out2 + ffn_output)

        return out3, attn_weights_block1, attn_weights_block2

In [None]:
class Transformer(tf.keras.Model):
    def __init__(self, d_model, num_heads, d_ff, input_vocab_size, target_vocab_size, max_len, dropout_rate=0.1):
        super(Transformer, self).__init__()
        self.encoder = TransformerEncoder(d_model, num_heads, d_ff, dropout_rate)
        self.decoder = TransformerDecoder(d_model, num_heads, d_ff, target_vocab_size, max_len, dropout_rate)
        self.final_layer = layers.Dense(target_vocab_size)

    def call(self, inputs):
        inp, padding_mask, look_ahead_mask = inputs['inp_data'], inputs['padding_mask'], inputs['look_ahead_mask']
        encoder_output = self.encoder(inp)
        decoder_output, _, _ = self.decoder(inp, encoder_output, padding_mask, look_ahead_mask)
        final_output = self.final_layer(decoder_output)
        return final_output

In [None]:
# Data preparation and masking functions
def prepare_data(inputs, max_len):
    inp_data = pad_sequences(inputs, padding='post')
    inp_data = tf.cast(inp_data, dtype="float32")
    padding_mask = create_padding_mask(inp_data)
    look_ahead_mask = create_look_ahead_masks(inp_data)
    return inp_data, padding_mask, look_ahead_mask

def create_padding_mask(seq):
    mask = tf.math.equal(seq, 0)
    mask = tf.cast(mask, tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask[tf.newaxis, tf.newaxis, :size, :size]

def create_look_ahead_masks(input_data):
    max_len = tf.shape(input_data)[1]
    mask = create_look_ahead_mask(max_len)
    return tf.tile(mask, [tf.shape(input_data)[0], 1, 1, 1])

In [None]:
input_vocab_size = 510
max_len = max(len(seq) for seq in tokenized_smiles)

max_len

In [None]:
d_model = 64  # You can adjust this based on your needs
num_heads = 4  # You can adjust this based on your needs
d_ff = 128  # You can adjust this based on your needs
dropout_rate = 0.2  # You can adjust this based on your needs

# Prepare data and train the model
inp_data, padding_mask, look_ahead_mask = prepare_data(tokenized_smiles, max_len)

In [None]:
# Create the Transformer model
transformer = Transformer(d_model, num_heads, d_ff, input_vocab_size, input_vocab_size, max_len, dropout_rate)

# Define loss function (unsupervised loss, e.g., mean squared error or binary cross-entropy)
loss_function = tf.keras.losses.MeanSquaredError()

# Define optimizer
optimizer = tf.keras.optimizers.Adam()

# Compile the model
transformer.compile(optimizer=optimizer, loss=loss_function)

input_data = {
    "inp_data": inp_data,
    "padding_mask": padding_mask,
    "look_ahead_mask": look_ahead_mask
}

print(inp_data.shape, padding_mask.shape, look_ahead_mask.shape)

# Train the model on the data (use the same inp_data as both input and target)
transformer.fit({'inp_data': inp_data, 'padding_mask': padding_mask, 'look_ahead_mask': look_ahead_mask}, inp_data, batch_size=3, epochs=10)

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences

class PositionalEncoding(layers.Layer):
    def __init__(self, d_model, max_len=512):
        super(PositionalEncoding, self).__init__()
        self.encoding = self.positional_encoding(max_len, d_model)

    def positional_encoding(self, max_len, d_model):
        pos = tf.range(max_len, dtype=tf.float32)
        i = tf.range(d_model, dtype=tf.float32)
        angle_rates = 1 / tf.pow(10000, (2 * i // 2) / d_model)
        angles = pos[:, tf.newaxis] * angle_rates[tf.newaxis, :]
        angle_rads = angles * tf.constant(np.pi / 180.0)
        sines = tf.math.sin(angle_rads[:, 0::2])
        cosines = tf.math.cos(angle_rads[:, 1::2])
        pos_encoding = tf.concat([sines, cosines], axis=-1)
        pos_encoding = pos_encoding[tf.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)

    def call(self, x):
        return x + self.encoding[:, :tf.shape(x)[1], :]

class TransformerEncoder(layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout_rate=0.1):
        super(TransformerEncoder, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_ff = d_ff
        self.dropout_rate = dropout_rate

        self.mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.ffn = self.point_wise_feed_forward_network(d_model, d_ff)
        self.layer_norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)

    def point_wise_feed_forward_network(self, d_model, d_ff):
        return tf.keras.Sequential([
            layers.Dense(d_ff, activation='relu'),
            layers.Dense(d_model)
        ])

    def call(self, x):
        attn_output = self.mha(x, x)
        attn_output = self.dropout1(attn_output)
        out1 = self.layer_norm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layer_norm2(out1 + ffn_output)

class TransformerDecoder(layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, target_vocab_size, max_len, dropout_rate=0.1):
        super(TransformerDecoder, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_ff = d_ff
        self.target_vocab_size = target_vocab_size
        self.max_len = max_len
        self.dropout_rate = dropout_rate

        self.mha1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.mha2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=d_model)
        self.ffn = self.point_wise_feed_forward_network(d_model, d_ff)
        self.layer_norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm3 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)
        self.dropout3 = layers.Dropout(dropout_rate)
        self.embedding = layers.Embedding(target_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)

    def point_wise_feed_forward_network(self, d_model, d_ff):
        return tf.keras.Sequential([
            layers.Dense(d_ff, activation='relu'),
            layers.Dense(d_model)
        ])

    def call(self, x, encoder_output, padding_mask, look_ahead_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = self.positional_encoding(x)

        x = self.dropout1(x)
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout2(attn1)
        out1 = self.layer_norm1(x + attn1)

        attn2, attn_weights_block2 = self.mha2(encoder_output, encoder_output, out1, padding_mask)
        attn2 = self.dropout3(attn2)
        out2 = self.layer_norm2(out1 + attn2)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout2(ffn_output)
        out3 = self.layer_norm3(out2 + ffn_output)

        return out3, attn_weights_block1, attn_weights_block2

class Transformer(tf.keras.Model):
    def __init__(self, d_model, num_heads, d_ff, input_vocab_size, target_vocab_size, max_len, dropout_rate=0.1):
        super(Transformer, self).__init__()
        self.encoder = TransformerEncoder(d_model, num_heads, d_ff, dropout_rate)
        self.decoder = TransformerDecoder(d_model, num_heads, d_ff, target_vocab_size, max_len, dropout_rate)
        self.final_layer = layers.Dense(target_vocab_size)

    def call(self, inputs):
        inp, padding_mask, look_ahead_mask = inputs
        encoder_output = self.encoder(inp)
        decoder_output, _, _ = self.decoder(inp, encoder_output, padding_mask, look_ahead_mask)
        final_output = self.final_layer(decoder_output)
        return final_output

# Data preparation and masking functions
def prepare_data(inputs, max_len):
    inp_data = pad_sequences(inputs, padding='post')
    padding_mask = create_padding_mask(inp_data)
    look_ahead_mask = create_look_ahead_masks(inp_data)
    return inp_data, padding_mask, look_ahead_mask

def create_padding_mask(seq):
    mask = tf.math.equal(seq, 0)
    mask = tf.cast(mask, tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask[tf.newaxis, tf.newaxis, :size, :size]

def create_look_ahead_masks(input_data):
    max_len = tf.shape(input_data)[1]
    mask = create_look_ahead_mask(max_len)
    return tf.tile(mask, [tf.shape(input_data)[0], 1, 1, 1])

input_vocab_size = 510
max_len = max(len(seq) for seq in tokenized_smiles)

max_len

d_model = 64  # You can adjust this based on your needs
num_heads = 4  # You can adjust this based on your needs
d_ff = 128  # You can adjust this based on your needs
dropout_rate = 0.2  # You can adjust this based on your needs

# Prepare data and train the model
inp_data, padding_mask, look_ahead_mask = prepare_data(tokenized_smiles, max_len)

# Create the Transformer model
transformer = Transformer(d_model, num_heads, d_ff, input_vocab_size, input_vocab_size, max_len, dropout_rate)

# Define loss function (unsupervised loss, e.g., mean squared error or binary cross-entropy)
loss_function = tf.keras.losses.MeanSquaredError()

# Define optimizer
optimizer = tf.keras.optimizers.Adam()

# Compile the model
transformer.compile(optimizer=optimizer, loss=loss_function)

input_data = {
    "inp_data": inp_data,
    "padding_mask": padding_mask,
    "look_ahead_mask": look_ahead_mask
}

print(inp_data.shape, padding_mask.shape, look_ahead_mask.shape)

# Train the model on the data (use the same inp_data as both input and target)
transformer.fit({'inp_data': inp_data, 'padding_mask': padding_mask, 'look_ahead_mask': look_ahead_mask}, inp_data, batch_size=3, epochs=10)

#