In [1]:
import tensorflow as tf
import numpy as np

import sys
sys.path.append('../..')

from transformer_encoder import MLMTransformerEncoder
from mlm_dataset.batching_mlm_dataset_generator import MLMDatasetGenerator

In [12]:
# ---- Data-loading settings ----
# Number of sequences per batch handed to the dataset reader.
batch_size = 12
# Upper bound on the number of samples read (the reader's default is None).
sample_limit = 12

# ---- Encoder hyperparameters ----
# NOTE: 512 / 8 / 2048 are the base Transformer sizes, but only a single
# encoder layer is used here (the original paper stacks six).
num_layers = 1                    # encoder layers
d_model = 512                     # model / embedding width
num_heads = 8                     # attention heads
dff = 2048                        # feed-forward inner width
input_vocab_size = 22733          # also used as the one-hot label depth in tokenize()
maximum_position_encoding = 512   # also used as the padded sequence length in tokenize()

# Fail fast: d_model must split evenly across the attention heads.
if d_model % num_heads:
    raise ValueError(f'd_model has to be divisible by num_heads, d_model = {str(d_model)}, num_heads = {str(num_heads)}')

model = MLMTransformerEncoder(
    num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding
)

# Call the model once on two symbolic int64 inputs of length
# maximum_position_encoding (the same [tokens, attention_mask] pair layout that
# train_step passes) -- presumably so its variables exist before checkpointing.
model(
    [tf.keras.Input(shape=(maximum_position_encoding,), dtype=tf.int64) for _ in range(2)],
    training=False,
)

# Categorical cross-entropy over one-hot targets. With the default
# from_logits=False the predictions are interpreted as probabilities.
loss_function = tf.keras.losses.CategoricalCrossentropy()

# Adam with its default settings.
optimizer = tf.keras.optimizers.Adam()

# Track model and optimizer state together so both are saved per epoch.
checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)

word_embedding: 0.1036980152130127
sqrt: 0.0
pos_encoding: 0.0
mha: 0.055165767669677734
add & norm: 0.026170015335083008
ffn: 0.03734254837036133
add & norm: 0.012549161911010742
mlm_head: 0.020002365112304688


In [3]:
# MLM dataset for training.
# NOTE(review): the literal 512 equals maximum_position_encoding from the config
# cell; presumably it is the maximum sequence length -- confirm against
# MLMDatasetGenerator before replacing the literal with the constant.
mlm_dataset_generator = MLMDatasetGenerator(512)

# Disabled alternative entry point, kept for reference:
# training_data, validation_data, testing_data = mlm_dataset_generator.generateMLMDataset(batch_size, sample_limit=sample_limit)
# Read the three dataset splits from file, using the batch size and sample cap
# defined in the config cell.
training_data, validation_data, testing_data = mlm_dataset_generator.read_mlm_dataset_from_file(batch_size=batch_size, sample_limit=sample_limit)
# The return value is discarded; presumably this populates generator state that
# getVocubulary() below relies on -- TODO confirm.
mlm_dataset_generator.read_raw_training_data_from_file()

# Initialize a Tokenizer and fit it on the dataset vocabulary.
# filters='' means no characters are stripped, so bracketed special tokens are
# kept intact; tokenize() later looks up '[pad]' and '[mask]' in word_index.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token=0, filters='')
# Train the tokenizer on the dataset vocabulary.
tokenizer.fit_on_texts(mlm_dataset_generator.getVocubulary())

# Padding function used by tokenize().
from keras.preprocessing.sequence import pad_sequences

# Drop the reference to the generator so its memory can be reclaimed.
mlm_dataset_generator = None

In [4]:
def tokenize(batch: tuple, input_vocab_size) -> tuple:
    """Convert one raw MLM batch into model inputs, an attention mask and labels.

    Relies on the notebook-level ``tokenizer``, ``pad_sequences`` and
    ``maximum_position_encoding``.

    Args:
        batch: Flat sequence in which even positions hold the (masked) input
            entries and odd positions hold the matching label entries.
        input_vocab_size: Width of each one-hot label row.

    Returns:
        A 4-tuple ``(token_ids, attention_mask, labels, mask_token_indices)``:

        * ``token_ids`` -- float32 tensor of shape
          ``(num_sequences, maximum_position_encoding)``; padding positions
          carry the tokenizer id of ``'[pad]'``.
        * ``attention_mask`` -- float32 tensor of shape
          ``(num_sequences, maximum_position_encoding, 1)``; 1.0 for ordinary
          tokens, 0.0 for ``'[mask]'`` tokens and for padding.
        * ``labels`` -- float32 tensor of one-hot rows, shape
          ``(total_label_tokens, input_vocab_size)``, concatenated in batch order.
        * ``mask_token_indices`` -- ``(rows, cols)`` arrays from ``np.where``
          locating every ``'[mask]'`` token in the padded batch.

    Raises:
        KeyError: If ``'[pad]'`` or ``'[mask]'`` is missing from
            ``tokenizer.word_index``.
        IndexError: If a label id is not smaller than ``input_vocab_size``.
    """
    tokens_batch = batch[::2]
    labels_batch = batch[1::2]

    # String -> token-id conversion for inputs and labels.
    tokenized_tokens_batch = tokenizer.texts_to_sequences(tokens_batch)
    tokenized_labels_batch = tokenizer.texts_to_sequences(labels_batch)

    pad_id = tokenizer.word_index['[pad]']
    mask_id = tokenizer.word_index['[mask]']

    # Pad with -1 first: real token ids are positive, so -1 stays
    # distinguishable from every token while the attention mask is derived.
    # NOTE(review): pad_sequences also truncates sequences longer than maxlen
    # (from the front by default) while the labels are not truncated; if inputs
    # can exceed maximum_position_encoding, labels and mask positions may no
    # longer line up -- confirm the dataset guarantees the length limit.
    padded_tokenized_tokens_batch = pad_sequences(
        tokenized_tokens_batch, maxlen=maximum_position_encoding, padding='post', value=-1)

    # Copy before the -1 placeholders are overwritten with the real pad id.
    attention_mask = np.array(padded_tokenized_tokens_batch)
    padded_tokenized_tokens_batch[padded_tokenized_tokens_batch == -1] = pad_id

    # Locate the mask tokens, then build the 0/1 mask:
    # '[mask]' -> 0, every other real token -> 1, padding (-1) -> 0.
    mask_token_indices = np.where(attention_mask == mask_id)
    attention_mask[mask_token_indices[0], mask_token_indices[1]] = 0
    attention_mask[attention_mask > 0] = 1
    attention_mask[attention_mask == -1] = 0
    attention_mask = tf.expand_dims(attention_mask, axis=-1)

    # One-hot encode all label ids in one pass. Building the rows directly
    # avoids materialising an input_vocab_size x input_vocab_size identity
    # matrix for every sequence, which the previous np.eye(...) lookup did.
    label_ids = np.array(
        [token_id for sequence_labels in tokenized_labels_batch for token_id in sequence_labels],
        dtype=np.int64)
    labels = np.zeros((label_ids.shape[0], input_vocab_size), dtype=np.float32)
    labels[np.arange(label_ids.shape[0]), label_ids] = 1.0

    return (tf.constant(padded_tokenized_tokens_batch, tf.float32),
            tf.cast(attention_mask, tf.float32),
            tf.cast(labels, tf.float32),
            mask_token_indices)

In [5]:
import time

# Define a training loop
def train_step(batch, input_vocab_size):
    """Run one optimisation step on a single batch.

    Tokenizes the batch, runs the model in training mode, computes the loss on
    the masked positions only, and applies the gradients with the notebook-level
    ``optimizer``. Timing for the gradient computation and the optimizer update
    is printed as a side effect.

    Args:
        batch: Raw batch as yielded by the training dataset; forwarded to
            ``tokenize``.
        input_vocab_size: One-hot label width; forwarded to ``tokenize``.

    Returns:
        ``(loss, elapsed)`` where ``loss`` is the tensor returned by
        ``loss_function`` and ``elapsed`` is the wall-clock duration of the
        whole step in seconds, formatted as a string.
    """
    start_time = time.time()

    padded_tokenized_tokens_batch, attention_mask, labels, mask_token_indices = tokenize(batch, input_vocab_size)

    # (row, column) pairs of every '[mask]' position; independent of the model
    # variables, so it is built outside the tape.
    masked_positions = tf.stack([mask_token_indices[0], mask_token_indices[1]], axis=-1)

    with tf.GradientTape() as tape:
        # training=True: this is the optimisation pass, so layers with
        # training-specific behaviour (e.g. dropout) must be active.
        predictions = model([padded_tokenized_tokens_batch, attention_mask], training=True)

        # Only the predictions at masked positions are scored against the labels.
        loss = loss_function(labels, tf.gather_nd(predictions, indices=masked_positions))

    sub_start_time = time.time()

    # takes longest time
    gradients = tape.gradient(loss, model.trainable_variables)

    print('gradients_calc:', time.time() - sub_start_time)
    sub_start_time = time.time()

    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    print('optimization:', time.time() - sub_start_time)

    return loss, str(time.time() - start_time)

In [6]:
# Training driver: run every batch of training_data through train_step for a
# fixed number of epochs and save a checkpoint at the end of each epoch.

num_epochs = 20
for epoch in range(num_epochs):
    for batch_index, batch in enumerate(training_data):
        loss, elapsed_time = train_step(batch, input_vocab_size)
        # Progress line for monitoring; elapsed_time is already a string.
        print('Epoch ', epoch, ', Batch ', batch_index, ', Loss = ', loss.numpy(),
              ', Elapsed Time: ', elapsed_time, sep='')
    # Checkpoint.save appends '-<save counter>' to the prefix. Because the
    # prefix ends with '/', the files are written inside the epoch directory
    # with names starting '-<counter>', and the 'checkpoint' state file lives
    # in that same epoch directory rather than in model_training_checkpoints/.
    checkpoint.save(f"model_training_checkpoints/epoch_{str(epoch)}/")

word_embedding: 0.021105289459228516
sqrt: 0.009525060653686523
pos_encoding: 0.004992485046386719
mha: 0.5606846809387207
add & norm: 0.04333615303039551
ffn: 0.27907586097717285
add & norm: 0.029526948928833008
mlm_head: 1.780632495880127
gradients_calc: 6.063301086425781
optimization: 0.6320910453796387
Epoch 0, Batch 0, Loss = 10.199326, Elapsed Time: 10.82668161392212
word_embedding: 0.006000518798828125
sqrt: 0.0020051002502441406
pos_encoding: 0.005000591278076172
mha: 0.46110987663269043
add & norm: 0.04052257537841797
ffn: 0.2876865863800049
add & norm: 0.028002262115478516
mlm_head: 1.637077808380127
gradients_calc: 4.704793930053711
optimization: 0.5159957408905029
Epoch 1, Batch 0, Loss = 8.716139, Elapsed Time: 8.577103614807129
word_embedding: 0.0
sqrt: 0.008115530014038086
pos_encoding: 0.0020449161529541016
mha: 0.41971778869628906
add & norm: 0.03000640869140625
ffn: 0.24251890182495117
add & norm: 0.025007009506225586
mlm_head: 1.485877513885498


KeyboardInterrupt: 

In [None]:
# model loading

# loaded_checkpoint = tf.train.Checkpoint(model=MLMTransformerEncoder(num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding), 
#                                         optimizer=tf.keras.optimizers.Adam())
# loaded_checkpoint.restore(tf.train.latest_checkpoint("model_training_checkpoints/"))
# loaded_model = loaded_checkpoint.model

In [None]:
# print(model_trainable_variables[0])
# print(model_gradients[0])

In [None]:
# print(model_trainable_variables[8])