In [None]:
!nvidia-smi

Mon Sep  6 01:34:17 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    25W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# **Basic transformer Model**

In [None]:
import numpy as np
import tensorflow as tf


def get_angles(position, i, d_model):
    """Return the positional-encoding angles ``position / 10000^(2*(i//2)/d_model)``.

    Args:
      position: positions to encode; multiplied (with broadcasting) by the
        per-dimension rates.
      i: embedding-dimension indices; consecutive even/odd pairs share a rate
        because of the ``i // 2``.
      d_model: embedding width, cast to float32 before the division.

    Returns:
      The element-wise product of ``position`` and the per-dimension rates.
    """
    exponent = (2 * (i // 2)) / tf.cast(d_model, tf.float32)
    inverse_frequency = 1 / tf.pow(10000, exponent)
    return position * inverse_frequency

def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
      position: number of positions (rows) to generate.
      d_model: embedding width (columns).

    Returns:
      A float32 tensor of shape (1, position, d_model) holding sines in the
      even columns and cosines in the odd columns.
    """
    # tf.newaxis expands dimensions so positions (column vector) broadcast
    # against dimension indices (row vector).
    row_positions = tf.range(position, dtype=tf.float32)[:, tf.newaxis]
    column_indices = tf.range(d_model, dtype=tf.float32)[tf.newaxis, :]
    angle_rads = get_angles(position=row_positions,
                            i=column_indices,
                            d_model=d_model)

    # Interleave through a NumPy buffer because tensors do not support
    # slice assignment.
    table = np.zeros(angle_rads.shape)
    # slice '0::2' -> even column indices get the sine
    table[:, 0::2] = tf.math.sin(angle_rads[:, 0::2])
    # slice '1::2' -> odd column indices get the cosine
    table[:, 1::2] = tf.math.cos(angle_rads[:, 1::2])

    # Add a leading axis of size 1 so the table broadcasts over the batch.
    pos_encoding = tf.constant(table)[tf.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)

def create_padding_mask(seq):
  """Flag padding tokens (id 0) with 1.0 and every other token with 0.0.

  Two singleton axes are inserted after the first axis so the mask can be
  broadcast onto the attention logits.
  """
  is_padding = tf.math.equal(seq, 0)
  padding_mask = tf.cast(is_padding, tf.float32)

  return padding_mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)


def create_look_ahead_mask(size):
    """Return a (size, size) float mask that is 1 strictly above the diagonal.

    ``band_part(..., -1, 0)`` keeps the lower triangle (diagonal included);
    subtracting it from 1 leaves ones only where column index > row index.
    """
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle

def scaled_dot_product_attention(q, k, v, mask=None):
    """Calculate the attention output and the attention weights.

    q, k, v must have matching leading dimensions.
    k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
    The mask has different shapes depending on its type (padding or look ahead)
    but it must be broadcastable for addition.

    Args:
      q: query shape == (..., seq_len_q, depth)
      k: key shape == (..., seq_len_k, depth)
      v: value shape == (..., seq_len_v, depth_v)
      mask: Float tensor with shape broadcastable
            to (..., seq_len_q, seq_len_k). Defaults to None, in which case
            no position is masked.

    Returns:
      output, attention_weights
    """

    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

    # Scale the logits by sqrt(depth of the keys).
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    # Positions where the mask is 1 receive a large negative logit so that
    # softmax assigns them (almost) zero weight.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights


class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention: project q/k/v, attend per head, merge, project."""

  def __init__(self, d_model, num_heads):
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    # Each head receives an equal slice of the model width.
    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads

    # Input projections for query, key and value.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    # Output projection applied to the concatenated heads.
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape the last axis into (num_heads, depth) and move heads forward.

    The result has shape (batch_size, num_heads, seq_len, depth).
    """
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Return ``(output, attention_weights)`` for the given value/key/query."""
    batch_size = tf.shape(q)[0]

    # Project to (batch_size, seq_len, d_model), then split into
    # (batch_size, num_heads, seq_len, depth).
    query_heads = self.split_heads(self.wq(q), batch_size)
    key_heads = self.split_heads(self.wk(k), batch_size)
    value_heads = self.split_heads(self.wv(v), batch_size)

    # per_head_output.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    per_head_output, attention_weights = scaled_dot_product_attention(
        query_heads, key_heads, value_heads, mask)

    # (batch_size, seq_len_q, num_heads, depth)
    heads_last = tf.transpose(per_head_output, perm=[0, 2, 1, 3])

    # (batch_size, seq_len_q, d_model)
    merged_heads = tf.reshape(heads_last, (batch_size, -1, self.d_model))

    # (batch_size, seq_len_q, d_model)
    return self.dense(merged_heads), attention_weights

def point_wise_feed_forward_network(d_model, dff):
  """Return a two-layer feed-forward stack: Dense(dff, relu) -> Dense(d_model)."""
  expansion = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
  projection = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([expansion, projection])

class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention and feed-forward, each followed by
  dropout, a residual connection and layer normalization."""

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    """Apply the block to ``x``; all intermediates keep the shape
    (batch_size, input_seq_len, d_model)."""
    # Self-attention sub-layer (value, key and query are all ``x``).
    attended, _ = self.mha(x, x, x, mask)
    attended = self.dropout1(attended, training=training)
    after_attention = self.layernorm1(x + attended)

    # Feed-forward sub-layer.
    transformed = self.dropout2(self.ffn(after_attention), training=training)
    return self.layernorm2(after_attention + transformed)

class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: masked self-attention, attention over the encoder
  output, and a feed-forward network, each followed by dropout, a residual
  connection and layer normalization."""

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):
    """Return ``(output, self_attention_weights, encoder_attention_weights)``.

    enc_output.shape == (batch_size, input_seq_len, d_model); every
    intermediate below has shape (batch_size, target_seq_len, d_model).
    """
    # Block 1: self-attention over the target, restricted by look_ahead_mask.
    self_attended, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
    self_attended = self.dropout1(self_attended, training=training)
    after_block1 = self.layernorm1(self_attended + x)

    # Block 2: queries come from the decoder, keys/values from the encoder.
    cross_attended, attn_weights_block2 = self.mha2(
        enc_output, enc_output, after_block1, padding_mask)
    cross_attended = self.dropout2(cross_attended, training=training)
    after_block2 = self.layernorm2(cross_attended + after_block1)

    # Block 3: position-wise feed-forward network.
    transformed = self.dropout3(self.ffn(after_block2), training=training)
    after_block3 = self.layernorm3(transformed + after_block2)

    return after_block3, attn_weights_block1, attn_weights_block2

class Encoder(tf.keras.layers.Layer):
  """Token embedding + positional encoding followed by a stack of EncoderLayers."""

  def __init__(self, num_layers, d_model, num_heads, dff,
               input_vocab_size, maximum_position_encoding, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(input_vocab_size, self.d_model)
    # Precomputed table covering maximum_position_encoding positions.
    self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, dropout_rate)
                       for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, training, mask):
    """Encode token ids ``x`` into (batch_size, input_seq_len, d_model)."""
    seq_len = tf.shape(x)[1]

    # Embed, scale by sqrt(d_model), then add the positional encoding for
    # the first seq_len positions.
    embedded = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    embedded *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    embedded += self.pos_encoding[:, :seq_len, :]

    hidden = self.dropout(embedded, training=training)

    for layer in self.enc_layers:
      hidden = layer(hidden, training, mask)

    return hidden  # (batch_size, input_seq_len, d_model)


class Decoder(tf.keras.layers.Layer):
  """Token embedding + positional encoding followed by a stack of DecoderLayers."""

  def __init__(self, num_layers, d_model, num_heads, dff,
               target_vocab_size, maximum_position_encoding, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    # Precomputed table covering maximum_position_encoding positions.
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, dropout_rate)
                       for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):
    """Return ``(decoded, attention_weights)``.

    ``decoded`` has shape (batch_size, target_seq_len, d_model) and
    ``attention_weights`` maps 'decoder_layer{n}_block{1,2}' to the weights
    of the corresponding attention block.
    """
    seq_len = tf.shape(x)[1]
    attention_weights = {}

    # Embed, scale by sqrt(d_model), then add the positional encoding for
    # the first seq_len positions.
    embedded = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    embedded *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    embedded += self.pos_encoding[:, :seq_len, :]

    hidden = self.dropout(embedded, training=training)

    for i, layer in enumerate(self.dec_layers):
      hidden, block1, block2 = layer(hidden, enc_output, training,
                                     look_ahead_mask, padding_mask)

      attention_weights[f'decoder_layer{i+1}_block1'] = block1
      attention_weights[f'decoder_layer{i+1}_block2'] = block2

    # hidden.shape == (batch_size, target_seq_len, d_model)
    return hidden, attention_weights


class Transformer(tf.keras.Model):
  """Encoder-decoder Transformer with a final Dense projection onto the
  target vocabulary."""

  def __init__(self, num_layers, d_model, num_heads, dff,
               input_vocab_size,target_vocab_size,
               positional_encoding_input,positional_encoding_target,
               dropout_rate=0.1):
    super().__init__()

    self.encoder = Encoder(num_layers, d_model, num_heads, dff,
                           input_vocab_size, positional_encoding_input,
                           dropout_rate)

    self.decoder = Decoder(num_layers, d_model, num_heads, dff,
                           target_vocab_size, positional_encoding_target,
                           dropout_rate)

    # Produces one logit per target-vocabulary entry.
    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, input, target, training, enc_padding_mask,
           look_ahead_mask, dec_padding_mask):
    """Return ``(enc_output, dec_output, final_output, attention_weights)``.

    Shapes:
      enc_output:   (batch_size, input_seq_len, d_model)
      dec_output:   (batch_size, tar_seq_len, d_model)
      final_output: (batch_size, tar_seq_len, target_vocab_size)
    """
    enc_output = self.encoder(input, training, enc_padding_mask)

    decoded = self.decoder(
        target, enc_output, training, look_ahead_mask, dec_padding_mask)
    dec_output, attention_weights = decoded

    final_output = self.final_layer(dec_output)

    return enc_output, dec_output, final_output, attention_weights

## How to schedule learning rate

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warm-up learning-rate schedule.

  lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)

  Args:
    d_model: model width; stored as a float32 tensor.
    warmup_steps: value used in the ``step * warmup_steps^-1.5`` term.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    """Return the learning rate for optimizer step ``step``."""
    # The optimizer may pass its integer iteration counter; rsqrt and the
    # float32 arithmetic below need a float32 operand.
    step = tf.cast(step, tf.float32)

    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

# **Pre-training for theoretical data**

In [None]:
import time

# Schema passed to tf.io.parse_single_example in parse_function: both
# features are variable-length int64 lists.
feature_description = {
            'sequence': tf.io.VarLenFeature(tf.int64),
            'mz': tf.io.VarLenFeature(tf.int64),
            }

def parse_function(example_proto):
    """Decode one serialized example into its ``(mz, sequence)`` value tensors.

    The features are parsed with ``feature_description``; only the ``.values``
    of each parsed variable-length feature are returned.
    """
    features = tf.io.parse_single_example(example_proto, feature_description)
    return features['mz'].values, features['sequence'].values

# TFRecord files holding the pre-processed theoretical data.
path_train_data='/content/drive/MyDrive/translateMS/data/theoretical_preprocessed_train_data.tfrecords'
path_valid_data='/content/drive/MyDrive/translateMS/data/theoretical_preprocessed_valid_data.tfrecords'

# The training records stay serialized here and are parsed in the batching
# pipeline below; the validation records are parsed right away.
# (The training set was previously capped with `.take(size_train_dataset)`,
# size_train_dataset = 990000.)
train_dataset = tf.data.TFRecordDataset(path_train_data)
valid_dataset = tf.data.TFRecordDataset(path_valid_data).map(parse_function)

# Batch settings
theoretical_data_size = 7448762
BATCH_SIZE = 150
# Number of batches shown in the progress line; presumably 99% of the
# theoretical data forms the training split -- TODO confirm.
NUM_BATCHS = int(theoretical_data_size * 0.99) // BATCH_SIZE

# Parse -> pad every batch to its longest element -> prefetch.
train_batches = train_dataset.map(parse_function)
train_batches = train_batches.padded_batch(BATCH_SIZE)
train_batches = train_batches.prefetch(tf.data.AUTOTUNE)


def create_masks(input, target):
    """Build the three attention masks used by the Transformer.

    Returns:
      enc_padding_mask: padding mask of ``input`` for the encoder.
      combined_mask: element-wise maximum of the target padding mask and the
        look-ahead mask, for the decoder's first attention block.
      dec_padding_mask: padding mask of ``input`` for the decoder's second
        attention block, where it masks the encoder outputs.
    """
    enc_padding_mask = create_padding_mask(input)
    dec_padding_mask = create_padding_mask(input)

    # A target position is masked if it is padding OR lies in the future.
    target_length = tf.shape(target)[1]
    future_mask = create_look_ahead_mask(target_length)
    target_padding_mask = create_padding_mask(target)
    combined_mask = tf.maximum(target_padding_mask, future_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

def loss_function(real, pred):
  """Cross-entropy averaged over the non-padding target tokens.

  Args:
    real: target token ids; id 0 is treated as padding and ignored.
    pred: predictions passed to ``loss_object`` together with ``real``.

  Returns:
    Sum of the per-token losses at non-padding positions divided by the
    number of such positions; 0 when there is no non-padding position.
  """
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  # Zero out the loss at padding positions.
  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  # divide_no_nan returns 0 instead of NaN (0/0) when every position is
  # padding, so a degenerate batch cannot poison the gradients.
  return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

def accuracy_function(real, pred):
  """Fraction of non-padding target tokens whose arg-max prediction is correct.

  Args:
    real: target token ids; id 0 is treated as padding and ignored.
    pred: per-token scores; the arg-max over axis 2 is the predicted id.

  Returns:
    Number of correct non-padding predictions divided by the number of
    non-padding positions; 0 when there is no non-padding position.
  """
  accuracies = tf.equal(real, tf.argmax(pred, axis=2))

  # Only count matches at non-padding positions.
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  accuracies = tf.math.logical_and(mask, accuracies)

  accuracies = tf.cast(accuracies, dtype=tf.float32)
  mask = tf.cast(mask, dtype=tf.float32)
  # divide_no_nan returns 0 instead of NaN (0/0) when every position is padding.
  return tf.math.divide_no_nan(tf.reduce_sum(accuracies), tf.reduce_sum(mask))

# Running means of the per-batch loss and accuracy reported by train_step.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')


'''
d_model : input(embedding), ouput 차원
num_layers : 인코더, 디코더 층
num_heads : 멀티헤드 수
d_ff : feedforward 차원 
'''
# English rendering of the note above:
#   d_model    : input (embedding) and output dimension
#   num_layers : number of encoder / decoder layers
#   num_heads  : number of attention heads
#   d_ff       : feed-forward dimension
D_MODEL = 128
NUM_LAYERS = 2
NUM_HEADS = 8
DFF = 512
DROPOUT_RATE = 0.2

# Adam driven by the warm-up schedule defined in CustomSchedule.
learning_rate = CustomSchedule(D_MODEL)
optimizer = tf.keras.optimizers.Adam(learning_rate,
                                     beta_1=0.9,
                                     beta_2=0.98,
                                     epsilon=1e-9)

# reduction='none' keeps the per-token losses so that loss_function can mask
# out padding positions before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

# NOTE(review): the vocabulary sizes and maximum positions below are
# hard-coded; presumably they match the preprocessing of the TFRecord data --
# TODO confirm.
transformer = Transformer(
    num_layers=NUM_LAYERS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dff=DFF,
    input_vocab_size=600000,
    target_vocab_size=30,
    positional_encoding_input = 2000,
    positional_encoding_target = 50,
    dropout_rate=DROPOUT_RATE)

# Checkpointing: the directory name lists the values
# D_MODEL_NUM_LAYERS_NUM_HEADS_DFF_DROPOUT_RATE used above.
checkpoint_path = "/content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2"

# Model and optimizer are saved together.
ckpt = tf.train.Checkpoint(transformer=transformer,
                           optimizer=optimizer)

# Keep only the 10 most recent checkpoints.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=10)

# Disabled alternative: restore one specific checkpoint instead of the latest.
#ckpt_num = '/content/drive/MyDrive/translateMS/checkpoints/train/ckpt-12'
#ckpt.restore(ckpt_num)
#print('Checkpoint restored!!')
# if a checkpoint exists, restore the latest checkpoint.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')

# Input signature for train_step: two int64 tensors of rank 2 with
# unconstrained dimensions, so differently sized padded batches share one
# signature.
train_step_signature = [
  tf.TensorSpec(shape=(None, None), dtype=tf.int64),
  tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]

@tf.function(input_signature=train_step_signature)
def train_step(input, target):
    """Run one optimization step and update the running train metrics.

    The decoder is fed ``target`` without its last token and is trained to
    predict ``target`` without its first token (teacher forcing).
    """
    decoder_input = target[:, :-1]
    expected_output = target[:, 1:]
    masks = create_masks(input, decoder_input)
    enc_padding_mask, combined_mask, dec_padding_mask = masks

    with tf.GradientTape() as tape:
        # Third positional argument is `training`: dropout is active here.
        model_outputs = transformer(input, decoder_input,
                                    True,
                                    enc_padding_mask,
                                    combined_mask,
                                    dec_padding_mask)
        predictions = model_outputs[2]

        loss = loss_function(expected_output, predictions)

    trainable = transformer.trainable_variables
    gradients = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(gradients, trainable))

    train_loss(loss)
    train_accuracy(accuracy_function(expected_output, predictions))


def evaluate_aminoacid_level(dataset, batch_size=64):
    """Teacher-forced evaluation: mean per-batch loss and token accuracy.

    Args:
      dataset: un-batched dataset yielding ``(input, target)`` pairs.
      batch_size: number of examples per padded batch. Incomplete final
        batches are dropped (``drop_remainder=True``).

    Returns:
      ``(mean_loss, mean_accuracy)``, each the unweighted average of the
      per-batch values.

    Raises:
      ValueError: if ``dataset`` does not yield a single full batch.
    """
    dataset_batchs = dataset.padded_batch(batch_size = batch_size, drop_remainder=True)

    num_batchs = 0
    accuracy = 0
    loss = 0
    for mz_batch, sequence_batch in dataset_batchs:
        # Decoder input drops the last token; the expected output drops the first.
        target_input = sequence_batch[:, :-1]
        target_real = sequence_batch[:, 1:]
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(mz_batch, target_input)

        # training=False: dropout is disabled.
        _, _, predictions, _ = transformer(mz_batch, target_input,
                                   False,
                                   enc_padding_mask,
                                   combined_mask,
                                   dec_padding_mask)
        loss += loss_function(target_real, predictions)
        accuracy += accuracy_function(target_real, predictions)
        num_batchs += 1

    if num_batchs == 0:
        # Previously this surfaced as an opaque ZeroDivisionError.
        raise ValueError(
            f'dataset yields no full batch of size {batch_size}; nothing to evaluate')

    return loss/num_batchs, accuracy/num_batchs


Latest checkpoint restored!!


In [None]:
# Show the path of the most recent checkpoint known to the manager.
print(ckpt_manager.latest_checkpoint)

/content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-16


In [None]:
# Peek at a single parsed validation example ...
for mz, seq in valid_dataset.take(1):
  print(mz,seq)


# ... and at the first padded training batch.
for batch, (input, target) in enumerate(train_batches.take(1)):
  print(input, target)

tf.Tensor(
[     1  11105  11203  12906  15710  15809  17511  21012  21110  22813
  28615  28713  30416  32320  32419  34121  39923  40021  41724  43629
  43727  45430  52827  52926  54628  54937  55035  56738  62932  63030
  64733  68643  68741  70444  76638  76736  78439  78748  78846  80549
  87946  88045  89747  91652  91750  93453  99255  99353 101056 102960
 103059 104761 110563 110662 112364 115865 115963 117666 120470 120568
 122271 134976      2], shape=(63,), dtype=int64) tf.Tensor([ 1 15  5  8  8  8 21 16  7  8  7 18  2], shape=(13,), dtype=int64)
tf.Tensor(
[[    1  2051  2100 ...     0     0     0]
 [    1  1868  1901 ...     0     0     0]
 [    1  3269  3302 ...     0     0     0]
 ...
 [    1  4854  4903 ...     0     0     0]
 [    1  6553  6602 ...     0     0     0]
 [    1 13909 14008 ...     0     0     0]], shape=(150, 669), dtype=int64) tf.Tensor(
[[ 1 19 11 ...  0  0  0]
 [ 1 13  8 ...  0  0  0]
 [ 1  8  7 ...  0  0  0]
 ...
 [ 1  8 15 ...  0  0  0]
 [ 1 11 19 .

In [None]:
# Open-ended training loop: it has no exit condition and runs until the cell
# is interrupted manually. The epoch counter starts at 0 each time this cell
# is executed.
epoch = 0
#for epoch in range(EPOCHS):
while True:
    start = time.time()
    # Start every epoch with fresh running means.
    train_loss.reset_states()
    train_accuracy.reset_states()

    for batch, (input, target) in enumerate(train_batches):
        train_step(input, target)
        # '\r' with end='' rewrites the same console line with the running metrics.
        print('\r',f'Epoch {epoch + 1} | batch {batch+1}/{NUM_BATCHS} Loss {train_loss.result():.3f} Accuracy {train_accuracy.result():.4f}',end='')

    print('\r',f'Epoch {epoch + 1} : Time {time.time() - start:.2f}s')

    print(f'\tTrain | Loss {train_loss.result():.3f}, Accuracy {train_accuracy.result():.3f}')

    # Teacher-forced evaluation on the validation set once per epoch.
    valid_loss, valid_accuracy = evaluate_aminoacid_level(valid_dataset)
    print(f'\tValid | Loss {valid_loss:.3f}, Accuracy {valid_accuracy:.3f}')

    # A checkpoint is written only here, at the end of each epoch.
    ckpt_save_path = ckpt_manager.save()
    print(f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')
 
    epoch+=1



 Epoch 1 : Time 12028.05s
	Train | Loss 0.017, Accuracy 0.995
	Valid | Loss 0.004, Accuracy 0.999
Saving checkpoint for epoch 1 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-15
 Epoch 2 : Time 11972.85s
	Train | Loss 0.016, Accuracy 0.995
	Valid | Loss 0.003, Accuracy 0.999
Saving checkpoint for epoch 2 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-16
 Epoch 3 | batch 316/49161 Loss 0.016 Accuracy 0.9949

KeyboardInterrupt: ignored

In [None]:

def evaluate_aminoacid_level(dataset, batch_size=64):
    """Teacher-forced evaluation: mean per-batch loss and token accuracy.

    Args:
      dataset: un-batched dataset yielding ``(input, target)`` pairs.
      batch_size: number of examples per padded batch. Incomplete final
        batches are dropped (``drop_remainder=True``).

    Returns:
      ``(mean_loss, mean_accuracy)``, each the unweighted average of the
      per-batch values.

    Raises:
      ValueError: if ``dataset`` does not yield a single full batch.
    """
    dataset_batchs = dataset.padded_batch(batch_size = batch_size, drop_remainder=True)

    num_batchs = 0
    accuracy = 0
    loss = 0
    for mz_batch, sequence_batch in dataset_batchs:
        # Decoder input drops the last token; the expected output drops the first.
        target_input = sequence_batch[:, :-1]
        target_real = sequence_batch[:, 1:]
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(mz_batch, target_input)

        # training=False: dropout is disabled.
        _, _, predictions, _ = transformer(mz_batch, target_input,
                                   False,
                                   enc_padding_mask,
                                   combined_mask,
                                   dec_padding_mask)
        loss += loss_function(target_real, predictions)
        accuracy += accuracy_function(target_real, predictions)
        num_batchs += 1

    if num_batchs == 0:
        # Previously this surfaced as an opaque ZeroDivisionError.
        raise ValueError(
            f'dataset yields no full batch of size {batch_size}; nothing to evaluate')

    return loss/num_batchs, accuracy/num_batchs

In [None]:
  # Teacher-forced loss/accuracy of the current model on the validation set.
  valid_loss, valid_accuracy = evaluate_aminoacid_level(valid_dataset)
  print(f'\tValid | Loss {valid_loss:.3f}, Accuracy {valid_accuracy:.3f}')

	Valid | Loss 0.022, Accuracy 0.993


## Info. Evaluation

In [None]:
'''
D_MODEL = 128
NUM_LAYERS = 2
NUM_HEADS = 8
DFF = 512
DROPOUT_RATE = 0.2


 Epoch 1 : Time 2700.33s
	Train | Loss  0.432, Accuracy 0.843
	Valid | Loss 0.057, Accuracy 0.982
Saving checkpoint for epoch 1 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-1
 Epoch 2 : Time 2700.33s
	Train | Loss 0.243, Accuracy 0.951
	Valid | Loss 0.022, Accuracy 0.993
Saving checkpoint for epoch 1 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-2
 Epoch 1 : Time 24998.77s
	Train | Loss 0.056, Accuracy 0.982
	Valid | Loss 0.013, Accuracy 0.996
Saving checkpoint for epoch 1 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-3
 Epoch 2 : Time 24959.29s
	Train | Loss 0.043, Accuracy 0.986
	Valid | Loss 0.011, Accuracy 0.996
Saving checkpoint for epoch 2 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-4
 Epoch 1 : Time 24926.82s
	Train | Loss 0.036, Accuracy 0.989
	Valid | Loss 0.009, Accuracy 0.997
Saving checkpoint for epoch 1 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-5
 Epoch 2 : Time 24905.39s
	Train | Loss 0.031, Accuracy 0.990
	Valid | Loss 0.010, Accuracy 0.997
Saving checkpoint for epoch 2 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-6
 Epoch 1 : Time 12009.18s
	Train | Loss 0.028, Accuracy 0.991
	Valid | Loss 0.007, Accuracy 0.998
Saving checkpoint for epoch 1 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-7
 Epoch 2 : Time 11995.47s
	Train | Loss 0.025, Accuracy 0.992
	Valid | Loss 0.007, Accuracy 0.998
Saving checkpoint for epoch 2 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-8
 Epoch 3 : Time 11983.60s
	Train | Loss 0.023, Accuracy 0.993
	Valid | Loss 0.006, Accuracy 0.998
Saving checkpoint for epoch 3 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-9
 Epoch 4 : Time 11986.56s
	Train | Loss 0.022, Accuracy 0.993
	Valid | Loss 0.005, Accuracy 0.998
Saving checkpoint for epoch 4 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-10
 Epoch 5 : Time 11984.53s
	Train | Loss 0.021, Accuracy 0.993
	Valid | Loss 0.004, Accuracy 0.999
Saving checkpoint for epoch 5 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-11
 Epoch 6 : Time 11969.97s
	Train | Loss 0.019, Accuracy 0.994
	Valid | Loss 0.005, Accuracy 0.999
Saving checkpoint for epoch 6 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-12
 Epoch 1 : Time 11993.11s
	Train | Loss 0.018, Accuracy 0.994
	Valid | Loss 0.004, Accuracy 0.999
Saving checkpoint for epoch 1 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-13
 Epoch 2 : Time 11965.59s
	Train | Loss 0.018, Accuracy 0.994
	Valid | Loss 0.004, Accuracy 0.999
Saving checkpoint for epoch 2 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-14
Epoch 1 : Time 12028.05s
	Train | Loss 0.017, Accuracy 0.995
	Valid | Loss 0.004, Accuracy 0.999
Saving checkpoint for epoch 1 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-15
 Epoch 2 : Time 11972.85s
	Train | Loss 0.016, Accuracy 0.995
	Valid | Loss 0.003, Accuracy 0.999
Saving checkpoint for epoch 2 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_128_2_8_512_0.2/ckpt-16

'''


'''
D_MODEL = 64
NUM_LAYERS = 1
NUM_HEADS = 1
DFF = 128
DROPOUT_RATE = 0.2
 Epoch 1 : Time 2700.33s
	Train | Loss 1.763, Accuracy 0.457
	Valid | Loss 1.354, Accuracy 0.572
Saving checkpoint for epoch 1 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-1
 Epoch 2 : Time 2688.46s
	Train | Loss 1.496, Accuracy 0.532
	Valid | Loss 1.235, Accuracy 0.612
Saving checkpoint for epoch 2 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-2
 Epoch 3 : Time 2700.66s
	Train | Loss 1.434, Accuracy 0.551
	Valid | Loss 1.166, Accuracy 0.634
Saving checkpoint for epoch 3 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-3
 Epoch 4 : Time 2706.47s
	Train | Loss 1.398, Accuracy 0.562
	Valid | Loss 1.110, Accuracy 0.650
Saving checkpoint for epoch 4 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-4
 Epoch 5 : Time 2702.19s
	Train | Loss 1.371, Accuracy 0.571
	Valid | Loss 1.080, Accuracy 0.661
Saving checkpoint for epoch 5 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-5
 Epoch 6 : Time 2700.80s
	Train | Loss 1.349, Accuracy 0.578
	Valid | Loss 1.054, Accuracy 0.670
Saving checkpoint for epoch 6 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-6
 Epoch 7 : Time 2701.57s
	Train | Loss 1.328, Accuracy 0.585
	Valid | Loss 1.006, Accuracy 0.685
Saving checkpoint for epoch 7 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-7
 Epoch 8 : Time 2701.76s
	Train | Loss 1.307, Accuracy 0.592
	Valid | Loss 0.978, Accuracy 0.695
Saving checkpoint for epoch 8 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-8
 Epoch 9 : Time 2710.60s
	Train | Loss 1.284, Accuracy 0.600
	Valid | Loss 0.941, Accuracy 0.707
Saving checkpoint for epoch 9 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-9
 Epoch 10 : Time 2705.58s
	Train | Loss 1.259, Accuracy 0.608
	Valid | Loss 0.907, Accuracy 0.717
Saving checkpoint for epoch 10 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-10
 Epoch 11 : Time 2701.31s
	Train | Loss 1.234, Accuracy 0.616
	Valid | Loss 0.872, Accuracy 0.728
Saving checkpoint for epoch 11 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-11
 Epoch 12 : Time 2700.67s
	Train | Loss 1.210, Accuracy 0.624
	Valid | Loss 0.851, Accuracy 0.735
Saving checkpoint for epoch 12 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-12
 Epoch 13 : Time 2701.21s
	Train | Loss 1.187, Accuracy 0.632
	Valid | Loss 0.818, Accuracy 0.745
Saving checkpoint for epoch 13 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-13
 Epoch 14 : Time 2701.20s
	Train | Loss 1.166, Accuracy 0.639
	Valid | Loss 0.787, Accuracy 0.755
Saving checkpoint for epoch 14 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-14
 Epoch 15 : Time 2701.08s
	Train | Loss 1.146, Accuracy 0.645
	Valid | Loss 0.762, Accuracy 0.762
Saving checkpoint for epoch 15 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-15
 Epoch 16 : Time 2701.46s
	Train | Loss 1.129, Accuracy 0.651
	Valid | Loss 0.742, Accuracy 0.768
Saving checkpoint for epoch 16 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-16
 Epoch 17 : Time 2703.07s
	Train | Loss 1.112, Accuracy 0.656
	Valid | Loss 0.722, Accuracy 0.775
Saving checkpoint for epoch 17 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-17
 Epoch 18 : Time 2701.19s
	Train | Loss 1.098, Accuracy 0.661
	Valid | Loss 0.710, Accuracy 0.779
Saving checkpoint for epoch 18 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-18
 Epoch 19 : Time 2700.99s
	Train | Loss 1.084, Accuracy 0.665
	Valid | Loss 0.700, Accuracy 0.783
Saving checkpoint for epoch 19 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-19
 Epoch 20 : Time 2703.05s
	Train | Loss 1.072, Accuracy 0.669
	Valid | Loss 0.688, Accuracy 0.786
Saving checkpoint for epoch 20 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-20
 Epoch 21 : Time 2706.26s
	Train | Loss 1.061, Accuracy 0.673
	Valid | Loss 0.675, Accuracy 0.789
Saving checkpoint for epoch 21 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-21
 Epoch 22 : Time 2710.75s
	Train | Loss 1.051, Accuracy 0.676
	Valid | Loss 0.666, Accuracy 0.792
Saving checkpoint for epoch 22 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-22
 Epoch 23 : Time 2709.67s
	Train | Loss 1.042, Accuracy 0.679
	Valid | Loss 0.660, Accuracy 0.794
Saving checkpoint for epoch 23 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-23
 Epoch 24 : Time 2704.67s
	Train | Loss 1.034, Accuracy 0.682
	Valid | Loss 0.650, Accuracy 0.797
Saving checkpoint for epoch 24 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-24
 Epoch 25 : Time 2706.52s
	Train | Loss 1.027, Accuracy 0.684
	Valid | Loss 0.640, Accuracy 0.800
Saving checkpoint for epoch 25 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-25
 Epoch 26 : Time 2706.80s
	Train | Loss 1.020, Accuracy 0.686
	Valid | Loss 0.628, Accuracy 0.805
Saving checkpoint for epoch 26 at /content/drive/MyDrive/translateMS/checkpoints/pretraining_64_1_1_128_0.2/ckpt-26
'''


'''
D_MODEL = 64
NUM_LAYERS = 2
NUM_HEADS = 2
DFF = 128
DROPOUT_RATE = 0.5

  Epoch 1 : Time 11854.26s
	Train | Loss 1.910, Accuracy 0.418
	Valid | Loss 1.532, Accuracy 0.548
Saving checkpoint for epoch 1 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-1
 Epoch 2 : Time 11864.57s
	Train | Loss 1.576, Accuracy 0.515
	Valid | Loss 1.398, Accuracy 0.580
Saving checkpoint for epoch 2 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-2
 Epoch 3 : Time 11875.01s
	Train | Loss 1.516, Accuracy 0.534
	Valid | Loss 1.325, Accuracy 0.601
Saving checkpoint for epoch 3 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-3
 Epoch 4 : Time 11860.26s
	Train | Loss 1.478, Accuracy 0.546
	Valid | Loss 1.279, Accuracy 0.615
Saving checkpoint for epoch 4 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-4
 Epoch 5 : Time 11867.08s
	Train | Loss 1.450, Accuracy 0.555
	Valid | Loss 1.277, Accuracy 0.618
Saving checkpoint for epoch 5 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-5
 Epoch 6 : Time 11867.80s
	Train | Loss 1.427, Accuracy 0.562
	Valid | Loss 1.301, Accuracy 0.619
Saving checkpoint for epoch 6 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-6
 Epoch 7 | batch 829/14402 Loss 1.419 Accuracy 0.5650
  Epoch 1 : Time 7642.94s
	Train | Loss 1.409, Accuracy 0.568
	Valid | Loss 1.334, Accuracy 0.613
Saving checkpoint for epoch 1 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-7
 Epoch 2 : Time 7627.85s
	Train | Loss 1.394, Accuracy 0.573
	Valid | Loss 1.375, Accuracy 0.608
Saving checkpoint for epoch 2 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-8
 Epoch 3 : Time 7619.00s
	Train | Loss 1.381, Accuracy 0.578
	Valid | Loss 1.405, Accuracy 0.606
Saving checkpoint for epoch 3 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-9
 Epoch 4 : Time 7617.24s
	Train | Loss 1.369, Accuracy 0.581
	Valid | Loss 1.473, Accuracy 0.595
Saving checkpoint for epoch 4 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-10
 Epoch 5 : Time 7616.66s
	Train | Loss 1.360, Accuracy 0.585
	Valid | Loss 1.527, Accuracy 0.588
Saving checkpoint for epoch 5 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-11
 Epoch 6 : Time 7614.13s
	Train | Loss 1.351, Accuracy 0.588
	Valid | Loss 1.566, Accuracy 0.586
Saving checkpoint for epoch 6 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-12
 Epoch 7 : Time 7619.38s
	Train | Loss 1.342, Accuracy 0.591
	Valid | Loss 1.621, Accuracy 0.580
Saving checkpoint for epoch 7 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-13
 Epoch 8 : Time 7608.90s
	Train | Loss 1.335, Accuracy 0.593
	Valid | Loss 1.589, Accuracy 0.589
Saving checkpoint for epoch 8 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-14
 Epoch 9 : Time 7607.37s
	Train | Loss 1.327, Accuracy 0.596
	Valid | Loss 1.683, Accuracy 0.577
Saving checkpoint for epoch 9 at /content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-15
'''

NameError: ignored

## Evaluate test data 

In [None]:
def evaluate_peptide_level(dataset, max_length = 50):
    """Compute exact-match (whole-peptide) accuracy with greedy decoding.

    For every ``(mz, sequence)`` pair the decoder input is seeded with the
    start token (id 1) and extended one arg-max prediction at a time until
    the end token (id 2) is produced or ``max_length`` steps have run.
    A sample counts as correct only when the decoded token row has the same
    length as ``sequence`` and matches it element for element; samples that
    never emit the end token are counted as wrong.

    Relies on the notebook-level ``transformer`` model and ``create_masks``.

    Args:
        dataset: iterable of unbatched ``(mz, sequence)`` pairs.
            # assumes `sequence` already contains the start/end tokens,
            # since it is compared against the full decoded row - TODO confirm
        max_length: maximum number of decoding steps per sample.

    Returns:
        Fraction of samples decoded exactly, or ``0.0`` when ``dataset``
        yields no samples (previously this raised ZeroDivisionError).
    """
    start, end = 1,2
    cnt_total =0
    cnt_correct = 0
    for mz, sequence in dataset:
        cnt_total+=1
        if(cnt_total%10 == 0):
            # The current sample has not been decoded yet, so the running
            # accuracy is taken over the cnt_total-1 samples finished so far.
            print(cnt_total, cnt_correct/(cnt_total-1))

        # Add a leading batch axis of size 1 to the single spectrum.
        encoder_input = tf.convert_to_tensor([mz])
        output = tf.convert_to_tensor([start],dtype=tf.int64)
        output = tf.expand_dims(output, 0)

        for i in range(max_length):
            enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
                encoder_input, output)
            # predictions.shape == (batch_size, seq_len, vocab_size)
            _, _, predictions, _ = transformer(encoder_input, output,
                                   False,
                                   enc_padding_mask,
                                   combined_mask,
                                   dec_padding_mask)

            # select the last word from the seq_len dimension
            predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

            predicted_id = tf.argmax(predictions, axis=-1)

            # concatenate the predicted_id to the output which is given to the
            # decoder as its input.
            output = tf.concat([output, predicted_id], axis=-1)

            # stop decoding this sample once the end token is predicted
            if predicted_id == end:
                if output.shape[1]==sequence.shape[0] and tf.reduce_all(output[0] == sequence):
                    cnt_correct+=1
                break

    # Guard against an empty dataset instead of dividing by zero.
    if cnt_total == 0:
        return 0.0
    return cnt_correct/cnt_total
  

In [None]:
path_test_data='/content/drive/MyDrive/translateMS/data/theoretical_preprocessed_test_data.tfrecords'

# Theoretical (simulated) test split: parse each record, shuffle with a
# 400000-element buffer and prefetch so reading overlaps with evaluation.
test_dataset = (
    tf.data.TFRecordDataset(path_test_data)
    .map(parse_function)
    .shuffle(400000)
    .prefetch(tf.data.AUTOTUNE)
)

# Peek at a single (mz, seq) example as a sanity check.
for mz, seq in test_dataset:
  print(mz, seq)
  break

tf.Tensor(
[     1   8206   8304  10007  13908  14007  15709  15710  15809  17511
  26714  26812  28515  28516  28615  30317  39520  39618  39825  39923
  41321  41626  45222  45320  47023  55435  55533  56725  56823  57236
  58526  65428  65526  65539  65638  67229  67341  75446  75545  76832
  76931  77247  78633  84150  84248  85951  88237  88335  90038  95458
  95556  97259  99641  99739 101442 106766 106865 108567 110949 111048
 112750 118075 118173 119652 119751 119876 121453 125178 125277 126980
 129358 129456 131159 136487 136585 138288 144065 144163 145190 145288
 145866 146991 152768 152866 154569 159897 159995 161698 164076 164175
 165877 169602 169701 171180 171278 171403 172981 178305 178404 180106
 182488 182587 184289 189614 189712 191415 193797 193895 195598 201018
 201116 202819 205105 205203 206906 212422 212521 213808 213907 214223
 215609 223715 223814 223827 223925 225516 225628 232530 232628 233820
 233918 234331 235621 244033 244131 245834 249430 249528 249735 24

In [None]:
test_loss, test_accuracy = evaluate_aminoacid_level(test_dataset)
# Report the loss that was just computed on the test set; the previous version
# printed `valid_loss`, a leftover from the training loop.
print(f'\ttest | Loss {test_loss:.3f}, Accuracy {test_accuracy:.3f}')

	test | Loss 0.003, Accuracy 0.999


In [None]:
# Peptide-level (exact match) accuracy on the first 2000 samples drawn from the
# shuffled theoretical test set; evaluate_peptide_level decodes one sample at a
# time and prints a running accuracy every 10 samples.
accuracy_test_data = evaluate_peptide_level(test_dataset.take(2000))
print(f'Accuracy of test data for peptide level : {accuracy_test_data:.4f}')

10 1.0
20 1.0
30 1.0
40 1.0
50 0.9795918367346939
60 0.9661016949152542
70 0.9710144927536232
80 0.9746835443037974
90 0.9775280898876404
100 0.9797979797979798
110 0.981651376146789
120 0.9831932773109243
130 0.9844961240310077
140 0.9856115107913669
150 0.9865771812080537
160 0.9874213836477987
170 0.9881656804733728
180 0.9888268156424581
190 0.9894179894179894
200 0.9899497487437185
210 0.9904306220095693
220 0.9908675799086758
230 0.9912663755458515
240 0.9916317991631799
250 0.9919678714859438
260 0.9922779922779923
270 0.9925650557620818
280 0.992831541218638
290 0.9930795847750865
300 0.9933110367892977
310 0.9935275080906149
320 0.9937304075235109
330 0.993920972644377
340 0.9941002949852508
350 0.994269340974212
360 0.9944289693593314
370 0.994579945799458
380 0.9920844327176781
390 0.9897172236503856
400 0.9899749373433584
410 0.9902200488997555
420 0.9904534606205251
430 0.9906759906759907
440 0.9908883826879271
450 0.9910913140311804
460 0.9912854030501089
470 0.9914712153

## Evaluate real data

In [None]:
# TFRecord file holding the preprocessed real (measured) test spectra.
path_real_test_dataset='/content/drive/MyDrive/translateMS/data/real_preprocessed_test_data.tfrecords'

In [None]:
# Evaluate the pretrained model on real data: keep shard 0 of 10 (every tenth
# record) and shuffle it with a 40000-element buffer.
real_test_dataset = (
    tf.data.TFRecordDataset(path_real_test_dataset)
    .map(parse_function)
    .shard(num_shards=10, index=0)
    .shuffle(40000)
)

real_test_loss, real_test_accuracy = evaluate_aminoacid_level(real_test_dataset)

print(f'\tReal test data | Loss {real_test_loss:.3f}, Accuracy {real_test_accuracy:.3f}')

	Real test data | Loss 9.726, Accuracy 0.136


In [None]:
# Peptide-level (exact match) accuracy of the pretrained model on the real test
# shard. Note: this reuses the name `accuracy_test_data`, overwriting the value
# computed for the theoretical test set in the earlier cell.
accuracy_test_data = evaluate_peptide_level(real_test_dataset)
print(f'Accuracy of real test data for peptide level : {accuracy_test_data:.4f}')

10 0.0
20 0.0
30 0.0
40 0.0
50 0.0
60 0.0
70 0.0
80 0.0
90 0.0
100 0.0
110 0.0
120 0.0
130 0.0
140 0.0
150 0.0
160 0.0
170 0.0
180 0.0
190 0.0
200 0.0
210 0.0
220 0.0
230 0.0
240 0.0
250 0.0
260 0.0
270 0.0
280 0.0
290 0.0
300 0.0
310 0.0
320 0.0
330 0.0
340 0.0
350 0.0
360 0.0
370 0.0
380 0.0
390 0.0
400 0.0
410 0.0
420 0.0
430 0.0
440 0.0
450 0.0
460 0.0
470 0.0
480 0.0
490 0.0
500 0.0
510 0.0
520 0.0
530 0.0
540 0.0
550 0.0
560 0.0
570 0.0
580 0.0
590 0.0
600 0.0
610 0.0
620 0.0
630 0.0
640 0.0
650 0.0
660 0.0
670 0.0
680 0.0
690 0.0
700 0.0
710 0.0
720 0.0
730 0.0
740 0.0
750 0.0
760 0.0
770 0.0
780 0.0
790 0.0
800 0.0
810 0.0
820 0.0
830 0.0
840 0.0
850 0.0
860 0.0
870 0.0
880 0.0
890 0.0
900 0.0
910 0.0
920 0.0
930 0.0
940 0.0
950 0.0
960 0.0
970 0.0
980 0.0
990 0.0
1000 0.0
1010 0.0
1020 0.0
1030 0.0
1040 0.0
1050 0.0
1060 0.0
1070 0.0
1080 0.0
1090 0.0
1100 0.0
1110 0.0
1120 0.0
1130 0.0
1140 0.0
1150 0.0
1160 0.0
1170 0.0
1180 0.0
1190 0.0
1200 0.0
1210 0.0
1220 0.0
1230 0.0
1

KeyboardInterrupt: ignored

# **Fine-tuning for real data**

In [None]:
class Encoder2(tf.keras.layers.Layer):
  """Encoder stack that mixes peak-intensity information into its input.

  The incoming features are summed with a learned embedding of the intensity
  tokens, passed through dropout and then through `num_layers` `EncoderLayer`
  blocks. No positional encoding is added in this layer.

  Args:
    num_layers: number of stacked `EncoderLayer` blocks.
    d_model: width of the intensity embedding and of the layer outputs.
    num_heads: attention heads per `EncoderLayer`.
    dff: feed-forward width per `EncoderLayer`.
    intensity_vocab_size: number of distinct intensity token ids.
    dropout_rate: dropout probability.
  """

  def __init__(self, num_layers, d_model, num_heads, dff,
               intensity_vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    # Lookup table: intensity token id -> d_model-dimensional vector.
    self.embedding = tf.keras.layers.Embedding(intensity_vocab_size, d_model)

    self.enc_layers = [
        EncoderLayer(d_model, num_heads, dff, dropout_rate)
        for _ in range(num_layers)
    ]

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  @tf.function
  def call(self, x, intensity, training, mask):
    """Run the encoder stack.

    Args:
      x: input features; in this notebook the pretrained encoder's output.
        Must be shape-compatible with the intensity embedding for the sum.
      intensity: integer intensity token ids.
      training: whether dropout is active.
      mask: attention mask forwarded to every `EncoderLayer`.

    Returns:
      Output of the last encoder layer, (batch_size, input_seq_len, d_model).
    """
    # (batch_size, intensity_seq_len, d_model) embedding added onto the input.
    hidden = x + self.embedding(intensity)
    hidden = self.dropout(hidden, training=training)

    for encoder_layer in self.enc_layers:
      hidden = encoder_layer(hidden, training, mask)

    return hidden


class Decoder2(tf.keras.layers.Layer):
  """Decoder stack without its own token embedding.

  The input is used as-is (after dropout) and pushed through `num_layers`
  `DecoderLayer` blocks, collecting the attention weights of every block.

  Args:
    num_layers: number of stacked `DecoderLayer` blocks.
    d_model: model width handed to each `DecoderLayer`.
    num_heads: attention heads per `DecoderLayer`.
    dff: feed-forward width per `DecoderLayer`.
    dropout_rate: dropout probability.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.dec_layers = [
        DecoderLayer(d_model, num_heads, dff, dropout_rate)
        for _ in range(num_layers)
    ]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  @tf.function
  def call(self, x, enc_output, training,
           look_ahead_mask, padding_mask):
    """Run the decoder stack.

    Returns:
      A pair `(x, attention_weights)`: `x` is the last layer's output with
      shape (batch_size, target_seq_len, d_model); `attention_weights` maps
      'decoder_layer{k}_block1' / 'decoder_layer{k}_block2' (k counted from 1)
      to the two attention outputs returned by layer k.
    """
    attention_weights = {}
    hidden = self.dropout(x, training=training)

    for position, decoder_layer in enumerate(self.dec_layers):
      hidden, first_block, second_block = decoder_layer(
          hidden, enc_output, training, look_ahead_mask, padding_mask)

      layer_number = position + 1
      attention_weights[f'decoder_layer{layer_number}_block1'] = first_block
      attention_weights[f'decoder_layer{layer_number}_block2'] = second_block

    return hidden, attention_weights

class ModifiedTransformer(tf.keras.Model):
  """Fine-tuning model: `Encoder2` + `Decoder2` + a dense projection to logits.

  In this notebook it is stacked on top of the pretrained transformer: `input`
  receives the pretrained encoder output and `target` the pretrained decoder
  output.

  Args:
    num_layers: number of layers in both the encoder and the decoder.
    d_model: model width.
    num_heads: attention heads per layer.
    dff: feed-forward width per layer.
    intensity_vocab_size: number of distinct intensity token ids.
    target_vocab_size: size of the output vocabulary (width of the logits).
    dropout_rate: dropout probability.
  """

  def __init__(self, num_layers, d_model, num_heads, dff,
               intensity_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()

    self.encoder = Encoder2(num_layers, d_model, num_heads, dff,
                            intensity_vocab_size, dropout_rate)
    self.decoder = Decoder2(num_layers, d_model, num_heads, dff, dropout_rate)
    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, input, intensity, target, training, enc_padding_mask,
           look_ahead_mask, dec_padding_mask):
    """Return `(final_output, attention_weights)`.

    `final_output` has shape (batch_size, tar_seq_len, target_vocab_size);
    `attention_weights` is the dictionary produced by the decoder.
    """
    # (batch_size, input_seq_len, d_model)
    encoded = self.encoder(input, intensity, training, enc_padding_mask)

    # (batch_size, tar_seq_len, d_model)
    decoded, attention_weights = self.decoder(
        target, encoded, training, look_ahead_mask, dec_padding_mask)

    # (batch_size, tar_seq_len, target_vocab_size)
    return self.final_layer(decoded), attention_weights

In [None]:
def create_masks(input, target):
    """Build the attention masks for one (input, target) batch.

    Returns:
        A tuple ``(enc_padding_mask, combined_mask, dec_padding_mask)``:
        - enc_padding_mask: padding mask of ``input`` for the encoder.
        - combined_mask: element-wise maximum of the look-ahead mask and the
          padding mask of ``target``; used in the decoder's 1st attention
          block to hide future and padded target positions.
        - dec_padding_mask: padding mask of ``input`` again; used in the
          decoder's 2nd attention block to mask the encoder outputs.
    """
    target_length = tf.shape(target)[1]
    future_mask = create_look_ahead_mask(target_length)
    target_padding_mask = create_padding_mask(target)
    combined_mask = tf.maximum(target_padding_mask, future_mask)

    # Both masks derive from the encoder input; they are built separately
    # because they are consumed by different attention blocks.
    return create_padding_mask(input), combined_mask, create_padding_mask(input)

def loss_function(real, pred):
  """Average per-token loss over positions whose label is not 0.

  `loss_object` yields one loss value per position; positions where `real`
  equals 0 (treated as padding) are zeroed out and excluded from the
  denominator.
  """
  per_token_loss = loss_object(real, pred)
  keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_token_loss.dtype)

  return tf.reduce_sum(per_token_loss * keep)/tf.reduce_sum(keep)

def accuracy_function(real, pred):
  """Fraction of non-zero-label positions whose arg-max prediction is right.

  Positions where `real` equals 0 (treated as padding) count neither as hits
  nor towards the denominator.
  """
  is_token = tf.math.not_equal(real, 0)
  is_hit = tf.math.logical_and(
      is_token, tf.equal(real, tf.argmax(pred, axis=2)))

  hits = tf.reduce_sum(tf.cast(is_hit, dtype=tf.float32))
  tokens = tf.reduce_sum(tf.cast(is_token, dtype=tf.float32))
  return hits/tokens

# Running means of the per-step training loss / accuracy.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')


# Model hyper-parameters:
#   D_MODEL    : input (embedding) / output dimension
#   NUM_LAYERS : number of encoder and decoder layers
#   NUM_HEADS  : number of attention heads
#   DFF        : feed-forward dimension
D_MODEL = 64
NUM_LAYERS = 2
NUM_HEADS = 2
DFF = 128
DROPOUT_RATE = 0.2

# Adam driven by the custom learning-rate schedule defined earlier.
learning_rate = CustomSchedule(D_MODEL)
optimizer = tf.keras.optimizers.Adam(
    learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# Unreduced loss on logits; loss_function applies the mask and the reduction.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

In [None]:
def evaluate_aminoacid_level_finetuning(dataset):
    """Teacher-forced, amino-acid-level evaluation of the fine-tuned stack.

    The dataset is padded-batched into groups of 64 (an incomplete final batch
    is dropped). Each batch is passed through `pretrained_transformer` and then
    `modified_transformer`, both with training disabled, and the per-batch loss
    and accuracy are averaged over the batches.

    Args:
        dataset: unbatched dataset of (input, intensity, target) examples.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` over the evaluated batches.
    """
    batches = dataset.padded_batch(batch_size=64, drop_remainder=True)

    loss_sum = 0
    accuracy_sum = 0
    num_batchs = 0
    for mz_tokens, intensity, target in batches:
        num_batchs += 1

        # Decoder sees tokens [0, n-1) and has to predict tokens [1, n).
        decoder_tokens = target[:, :-1]
        expected_tokens = target[:, 1:]
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
            mz_tokens, decoder_tokens)

        encoder1_output, decoder1_output, _, _ = pretrained_transformer(
            mz_tokens, decoder_tokens, False,
            enc_padding_mask, combined_mask, dec_padding_mask)

        predictions, _ = modified_transformer(
            encoder1_output, intensity, decoder1_output, False,
            enc_padding_mask, combined_mask, dec_padding_mask)

        loss_sum += loss_function(expected_tokens, predictions)
        accuracy_sum += accuracy_function(expected_tokens, predictions)

    return loss_sum/num_batchs, accuracy_sum/num_batchs

In [None]:

# Base model whose weights are restored from the pretraining checkpoint in the
# next cell. It shares NUM_LAYERS / D_MODEL / NUM_HEADS / DFF with the
# fine-tuning model below.
# NOTE(review): these arguments presumably have to match the configuration the
# pretraining checkpoint was saved with - confirm.
pretrained_transformer = Transformer(
    num_layers=NUM_LAYERS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dff=DFF,
    input_vocab_size=600000,
    target_vocab_size=30,
    positional_encoding_input = 1000,
    positional_encoding_target = 50,
    dropout_rate=DROPOUT_RATE)

# Fine-tuning model stacked on the pretrained model's encoder/decoder outputs;
# it additionally embeds intensity tokens (vocabulary of 12000 ids).
modified_transformer = ModifiedTransformer(
    num_layers=NUM_LAYERS,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
    dff=DFF,
    intensity_vocab_size=12000,
    target_vocab_size=30,
    dropout_rate=DROPOUT_RATE)


In [None]:
# Load the pretraining checkpoint into `pretrained_transformer`.

ckpt_path_pretraining = '/content/drive/MyDrive/translateMS/checkpoints/pretraining/ckpt-opt'

# NOTE(review): the same `optimizer` object is also tracked by the fine-tuning
# checkpoint and used in train_step_finetuning, so this restore presumably
# carries pretraining optimizer state into fine-tuning - confirm this is
# intended.
ckpt_pretraining = tf.train.Checkpoint(transformer=pretrained_transformer,
                           optimizer=optimizer)

# The status object returned by restore() is discarded, so the message below is
# printed without verifying that any variable was actually matched.
ckpt_pretraining.restore(ckpt_path_pretraining)
print('Checkpoint restored!!')

# Disabled alternative: restore the latest checkpoint from a manager instead.
# if a checkpoint exists, restore the latest checkpoint.
#if ckpt_manager.latest_checkpoint:
#    ckpt.restore(ckpt_manager.latest_checkpoint)
#    print('Latest checkpoint restored!!')

Checkpoint restored!!


In [None]:
# Directory in which fine-tuning checkpoints are written.
checkpoint_path_finetuning = "/content/drive/MyDrive/translateMS/checkpoints/finetuning"

# Tracks the fine-tuning model together with the (shared) optimizer.
ckpt_finetuning = tf.train.Checkpoint(transformer=modified_transformer,
                           optimizer=optimizer)

# Keeps at most the 10 most recent checkpoints.
ckpt_manager_finetuning = tf.train.CheckpointManager(ckpt_finetuning, checkpoint_path_finetuning, max_to_keep=10)
# The resume logic below is disabled by being wrapped in a string literal, so
# fine-tuning always starts from freshly initialised `modified_transformer`
# weights. As the last expression of the cell, the string is echoed as output.
'''
if ckpt_manager_finetuning.latest_checkpoint:
    print(ckpt_manager_finetuning.latest_checkpoint)
    ckpt_finetuning.restore(ckpt_manager_finetuning.latest_checkpoint)
    print('Latest checkpoint restored!!')
'''

"\nif ckpt_manager_finetuning.latest_checkpoint:\n    print(ckpt_manager_finetuning.latest_checkpoint)\n    ckpt_finetuning.restore(ckpt_manager_finetuning.latest_checkpoint)\n    print('Latest checkpoint restored!!')\n"

In [None]:
# One int64 (batch, length) spec per argument of train_step_finetuning
# (input, intensity, target); the None dimensions let a single traced graph
# serve batches of any size and padded length.
train_step_signature_finetuning = [
  tf.TensorSpec(shape=(None, None), dtype=tf.int64)
  for _ in range(3)
]

@tf.function(input_signature=train_step_signature_finetuning)
def train_step_finetuning(input, intensity, target):
    """Run one teacher-forced optimisation step of the fine-tuning model.

    `pretrained_transformer` is executed with training disabled and only the
    variables of `modified_transformer` receive gradient updates. The step
    loss and accuracy are accumulated into `train_loss` / `train_accuracy`.

    Args:
        input: int64 encoder tokens, (batch, input_len).
        intensity: int64 intensity tokens, (batch, intensity_len).
        target: int64 target tokens, (batch, target_len).
    """
    # Decoder sees tokens [0, n-1) and has to predict tokens [1, n).
    decoder_tokens = target[:, :-1]
    expected_tokens = target[:, 1:]
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
        input, decoder_tokens)

    with tf.GradientTape() as tape:
        # pretrained_transformer returns:
        #   enc_output, dec_output, final_output, attention_weights
        encoder1_output, decoder1_output, _, _ = pretrained_transformer(
            input, decoder_tokens, False,
            enc_padding_mask, combined_mask, dec_padding_mask)

        predictions, _ = modified_transformer(
            encoder1_output, intensity, decoder1_output, True,
            enc_padding_mask, combined_mask, dec_padding_mask)

        loss = loss_function(expected_tokens, predictions)

    finetune_variables = modified_transformer.trainable_variables
    gradients = tape.gradient(loss, finetune_variables)
    optimizer.apply_gradients(zip(gradients, finetune_variables))

    train_loss(loss)
    train_accuracy(accuracy_function(expected_tokens, predictions))

In [None]:
# Every field of a record is a variable-length list of int64 values.
feature_description = {
    feature_name: tf.io.VarLenFeature(tf.int64)
    for feature_name in ('sequence', 'intensity', 'mz')
}

def parse_function(example_proto):
    """Decode one serialized example into ``(mz, intensity, sequence)``.

    Each field is parsed as a variable-length feature and only its dense
    ``.values`` tensor is returned.
    """
    features = tf.io.parse_single_example(example_proto, feature_description)
    return (features['mz'].values,
            features['intensity'].values,
            features['sequence'].values)

# Locations of the preprocessed real-data splits. The valid/test file names
# were previously swapped (valid -> *_test_data, test -> *_valid_data).
path_real_train_dataset='/content/drive/MyDrive/translateMS/data/real_preprocessed_train_data.tfrecords'
path_real_valid_dataset='/content/drive/MyDrive/translateMS/data/real_preprocessed_valid_data.tfrecords'
path_real_test_dataset='/content/drive/MyDrive/translateMS/data/real_preprocessed_test_data.tfrecords'


# Counter filled in by the dataset-size cell further down.
size_dataset = 0

# Each split reads its own file. Previously all three datasets were built from
# the TRAIN path, so validation and test metrics were computed on training data.
real_train_dataset = tf.data.TFRecordDataset(path_real_train_dataset).map(parse_function)
real_valid_dataset = tf.data.TFRecordDataset(path_real_valid_dataset).map(parse_function)
real_test_dataset = tf.data.TFRecordDataset(path_real_test_dataset).map(parse_function)

In [None]:
# Peek at three (mz, intensity, sequence) examples of the real validation
# split. The cell used `valid_dataset`, which no fine-tuning cell defines;
# `real_valid_dataset` is the 3-field dataset created just above.
for a,b,c in real_valid_dataset.take(3):
  print(a,b,c)


tf.Tensor(
[    1 10205 11192 11307 11308 11407 11505 12008 12906 12910 13005 13008
 13010 13108 13208 13607 14106 14206 14311 14707 14711 15508 15709 15713
 15813 15907 16008 16708 17088 17111 17309 17511 17610 18106 18509 18512
 18908 19808 19907 20108 20310 20807 20905 21107 21308 21609 22112 22508
 22608 22615 22706 22713 22715 23109 23210 23316 24211 24218 24311 24409
 24416 24509 24512 24516 24912 24916 25209 25510 25609 26012 26019 26112
 26214 26611 27010 27111 27151 27312 28112 28211 28412 28811 28816 28912
 29913 30015 30508 30911 31010 31110 31318 31516 32111 32309 32515 32613
 32712 32812 32968 33018 33119 33216 33220 33814 33868 33913 34013 34110
 34118 34315 34514 34614 34722 34911 35314 35414 35615 35714 35813 36216
 36316 36419 36614 36712 37116 37215 37416 37814 38415 38513 38670 38719
 38769 38917 39017 39223 39570 39615 39621 39824 40216 40316 40613 40719
 40819 40869 40919 41018 41023 41117 41124 41169 41215 41520 41717 42018
 42216 42315 42414 42819 42825 42917 429

In [None]:
# Count the examples by iterating the dataset once. The counter is reset here
# so that re-running this cell does not add onto a previous count.
# NOTE(review): `real_dataset` is not defined in the cells shown in this part
# of the notebook - confirm where it is created.
size_dataset = 0
for data in real_dataset:
  size_dataset+=1
print(f'Size of dataset : {size_dataset}')

Size of dataset : 4361858


In [None]:
BATCH_SIZE = 512
size_train_dataset = 3041294
# Number of complete batches per epoch (used only for progress display).
NUM_BATCHS = size_train_dataset // BATCH_SIZE

# Training input pipeline: shuffle -> pad each batch to its longest example
# -> prefetch.
real_train_batchs = real_train_dataset.shuffle(4000000)
real_train_batchs = real_train_batchs.padded_batch(BATCH_SIZE)
real_train_batchs = real_train_batchs.prefetch(tf.data.AUTOTUNE)

In [None]:
# Fine-tuning loop: trains shard by shard, checkpoints and validates after
# every epoch.
# NOTE(review): `num_shards`, `real_dataset` and `valid_dataset` are not defined
# in the cells shown in this part of the notebook - confirm they exist on a
# fresh kernel (Restart & Run All).
import time

epoch = 0
# Expected number of examples / batches per shard (used for progress display).
size_smallset = size_dataset / num_shards
NUM_BATCHS = int(size_smallset/BATCH_SIZE)

# There is no stopping condition: the loop runs until it is interrupted.
while True:
    start = time.time()
    train_loss.reset_states()
    train_accuracy.reset_states()

    # Shards 0 and 1 are never trained on (the shard index is offset by 2).
    # NOTE(review): presumably they are held out for validation/test - confirm.
    for index in range(num_shards-2):
      train_batchs = (real_dataset
                      .shard(num_shards=num_shards, index = index+2)
                      .padded_batch(BATCH_SIZE)
                      .prefetch(tf.data.AUTOTUNE))
      
      for batch, (input, intensity, target) in enumerate(train_batchs):
          train_step_finetuning(input, intensity, target)

          # '\r' with end='' keeps rewriting the same progress line.
          print('\r',f'Epoch {epoch + 1} | shard {index+1}/{num_shards-2} | batch {batch+1}/{NUM_BATCHS} Loss {train_loss.result():.3f} Accuracy {train_accuracy.result():.4f}',end='')
  
    print('\r',f'Epoch {epoch + 1} : Time {time.time() - start:.2f}s')
  
    # Save before validating so a failure during validation does not lose the
    # epoch's weights.
    ckpt_path_finetuning = ckpt_manager_finetuning.save()
    print('\r', f'Saving checkpoint for epoch {epoch + 1} at {ckpt_path_finetuning}')

    print(f'\tTrain | Loss {train_loss.result():.3f}, Accuracy {train_accuracy.result():.3f}')
    valid_loss, valid_accuracy = evaluate_aminoacid_level_finetuning(valid_dataset)
    print(f'\tValid | Loss {valid_loss:.3f}, Accuracy {valid_accuracy:.3f}')

    epoch+=1

 Epoch 1 : Time 18961.29s
 Saving checkpoint for epoch 1 at /content/drive/MyDrive/translateMS/checkpoints/finetuning/ckpt-1
	Train | Loss 2.175, Accuracy 0.321
	Valid | Loss 1.862, Accuracy 0.414
 Epoch 2 | shard 25/200 | batch 47/84 Loss 1.978 Accuracy 0.3796

KeyboardInterrupt: ignored

In [None]:
# Manual checkpoint save, e.g. after interrupting the training loop mid-epoch.
# The epoch number in the message is derived from the current value of `epoch`,
# so it may label an epoch that was not trained to completion.
ckpt_path_finetuning = ckpt_manager_finetuning.save()
print('\r', f'Saving checkpoint for epoch {epoch + 1} at {ckpt_path_finetuning}')

 Saving checkpoint for epoch 3 at /content/drive/MyDrive/translateMS/checkpoints/finetuning/ckpt-1


In [None]:
# Log copied from an earlier fine-tuning run, kept for reference only; this
# cell executes no training code.
'''
Epoch 1 : Time 26541.71s
	Train | Loss 2.183, Accuracy 0.319
	Valid | Loss 1.849, Accuracy 0.419
 Epoch 2 : Time 26385.37s
	Train | Loss 1.912, Accuracy 0.402
	Valid | Loss 1.653, Accuracy 0.483
'''

In [None]:
def evaluate_peptide_level_finetuning(dataset, max_length = 50):
    """Compute exact-match (whole-peptide) accuracy of the fine-tuned stack.

    Greedy decoding, one sample at a time: the decoder input is seeded with
    the start token (id 1) and extended by the arg-max prediction until the
    end token (id 2) is produced or ``max_length`` steps have run. Every step
    mirrors the training/evaluation forward pass: ``pretrained_transformer``
    produces the encoder and decoder features, which ``modified_transformer``
    consumes together with the intensity tokens.

    Fixes over the previous version:
      * masks and the encoder were fed the undefined/leaked name ``input``
        instead of the current sample's ``encoder_input``;
      * ``intensity`` was passed without a batch axis;
      * ``modified_transformer`` received raw token ids as its target input,
        although it is trained on the pretrained decoder's output;
      * an empty dataset raised ZeroDivisionError.

    Args:
        dataset: iterable of unbatched ``(mz, intensity, sequence)`` triples.
            # assumes `sequence` already contains the start/end tokens,
            # since it is compared against the full decoded row - TODO confirm
        max_length: maximum number of decoding steps per sample.

    Returns:
        Fraction of samples decoded exactly, or ``0.0`` for an empty dataset.
    """
    start, end = 1,2
    cnt_total =0
    cnt_correct = 0
    for mz, intensity, sequence in dataset:
        cnt_total+=1
        if(cnt_total%10 == 0):
            # The current sample has not been decoded yet, so the running
            # accuracy is taken over the cnt_total-1 samples finished so far.
            print(cnt_total, cnt_correct/(cnt_total-1))

        # Add a leading batch axis of size 1 to the single sample.
        encoder_input = tf.convert_to_tensor([mz])
        intensity_input = tf.convert_to_tensor([intensity])
        output = tf.convert_to_tensor([start],dtype=tf.int64)
        output = tf.expand_dims(output, 0)

        for i in range(max_length):
            enc_padding_mask, combined_mask, dec_padding_mask = create_masks(
                encoder_input, output)

            # Same two-stage forward pass as in train_step_finetuning, with
            # training disabled for both models.
            encoder1_output, decoder1_output, _, _ = \
                pretrained_transformer(encoder_input,
                                       output,
                                       False,
                                       enc_padding_mask,
                                       combined_mask,
                                       dec_padding_mask)

            predictions, _ = modified_transformer(encoder1_output,
                                                  intensity_input,
                                                  decoder1_output,
                                                  False,
                                                  enc_padding_mask,
                                                  combined_mask,
                                                  dec_padding_mask)

            # select the last word from the seq_len dimension
            predictions = predictions[:, -1:, :]  # (batch_size, 1, vocab_size)

            predicted_id = tf.argmax(predictions, axis=-1)

            # concatenate the predicted_id to the output which is given to the
            # decoder as its input.
            output = tf.concat([output, predicted_id], axis=-1)

            # stop decoding this sample once the end token is predicted
            if predicted_id == end:
                if output.shape[1]==sequence.shape[0] and tf.reduce_all(output[0] == sequence):
                    cnt_correct+=1
                break

    # Guard against an empty dataset instead of dividing by zero.
    if cnt_total == 0:
        return 0.0
    return cnt_correct/cnt_total
  

In [None]:
path_test_data='/content/drive/MyDrive/translateMS/data/real_preprocessed_test_data.tfrecords'

# Real test split for the fine-tuned model: parse, shuffle with a
# 10000-element buffer and prefetch.
test_dataset = (
    tf.data.TFRecordDataset(path_test_data)
    .map(parse_function)
    .shuffle(10000)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
# Peptide-level (exact match) accuracy of the fine-tuned stack on 200 samples
# drawn from the shuffled real test set.
accuracy_test_data = evaluate_peptide_level_finetuning(test_dataset.take(200))
print(f'Accuracy of test data for peptide level : {accuracy_test_data:.4f}')

InvalidArgumentError: ignored