In [1]:
import tensorflow as tf
import numpy as np

import sys
sys.path.append('../..')

from transformer_encoder import MLMTransformerEncoder
from mlm_dataset.mlm_dataset_generator import MLMDatasetGenerator

In [2]:
# Encoder hyperparameters. d_model, num_heads and dff match the base
# configuration of the original Transformer paper; only a single layer is
# used here (the paper's base model stacks 6).
num_layers = 1
d_model = 512
num_heads = 8
dff = 2048
input_vocab_size = 40000
maximum_position_encoding = 10000

model = MLMTransformerEncoder(
    num_layers,
    d_model,
    num_heads,
    dff,
    input_vocab_size,
    maximum_position_encoding,
)
# Call the model once on three symbolic, variable-length inputs (the same
# three-element list that train_step passes), presumably so its weights are
# created before training starts.
# NOTE(review): tokenize() gives the third input (mlm_mask) a trailing axis of
# size 1, while the placeholder here is shape=(None,) -- confirm this matches.
model([tf.keras.Input(shape=(None,)) for _ in range(3)])

# Optimizer used by train_step to apply the gradients
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Categorical cross-entropy: train_step compares one-hot labels against the
# model's per-token predictions with this loss
loss_function = tf.keras.losses.CategoricalCrossentropy()

# np.set_printoptions(threshold=np.inf)

In [3]:
# MLM dataset source for training, constructed from the resume CSV
# (the path is relative to this notebook's directory)
mlm_dataset_generator = MLMDatasetGenerator('../../dataset/resume_dataset.csv')

# Token the tokenizer substitutes for out-of-vocabulary words
oov_token = '[oov]'

# Initialize a Tokenizer and fit it on the text data.
# filters='' switches off the Tokenizer's default punctuation filtering, so
# bracketed special tokens such as '[mask]' (looked up later in tokenize) and
# '[oov]' keep their brackets.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token=oov_token, filters='')
# Train the tokenizer on the dataset vocabulary
tokenizer.fit_on_texts(mlm_dataset_generator.getVocubulary())

# Padding function used by tokenize
from keras.preprocessing.sequence import pad_sequences

batch_size = 20
# Presumably caps how many samples generateMLMDataset uses -- TODO confirm
# (original author's note: the default is None)
sample_limit = 1000

mlm_dataset = mlm_dataset_generator.generateMLMDataset(batch_size, sample_limit=sample_limit)

# Drop this reference to the generator so its memory can be reclaimed
mlm_dataset_generator = None

In [4]:
# check how many words are in the dataset (currently: 37032)
# print(tokenizer.word_index['[mask]'])

In [5]:
# MLM dataset checker
# inputs, mask, labels = mlm_dataset[0]

# print(inputs)
# print(mask)

# for sequence_index, sequence in enumerate(labels):
#     for token_index, token in enumerate(sequence):
#         for value in token:
#             if (value > 0):
#                 print(sequence_index, token_index, value)

In [6]:
def tokenize(batch: tuple, input_vocab_size) -> tuple:
    """Turn one raw MLM batch into the four float32 tensors used for training.

    Relies on the notebook-level ``tokenizer`` (already fitted, and containing
    the ``'[mask]'`` token) and on ``pad_sequences``.

    Args:
        batch: ``(tokens_batch, labels_batch)``. ``tokens_batch`` holds the
            input texts containing ``[mask]`` placeholders; ``labels_batch``
            holds the original tokens for those placeholders, in the same
            left-to-right, sequence-by-sequence order.
        input_vocab_size: Size of the one-hot axis of the returned labels.

    Returns:
        A 4-tuple of ``tf.float32`` tensors:

        * padded token ids, shape ``(batch, seq)``, padded at the end with -1;
        * attention mask, shape ``(batch, seq)``: 1 for regular tokens, 0 for
          ``[mask]`` and padding positions;
        * MLM mask, shape ``(batch, seq, 1)``: 1 at ``[mask]`` positions, 0
          everywhere else;
        * labels, shape ``(batch, seq, input_vocab_size)``: a one-hot row at
          each ``[mask]`` position and an all-zero row everywhere else.

    Raises:
        KeyError: If ``'[mask]'`` is not in the tokenizer vocabulary.
        ValueError: If there are fewer label ids than ``[mask]`` positions.
        IndexError: If a label id is not smaller than ``input_vocab_size``.
    """
    tokens_batch, labels_batch = batch
    mask_token_id = tokenizer.word_index['[mask]']

    # tokenize the tokens_batch and labels_batch, string to token_id conversion
    tokenized_tokens_batch = tokenizer.texts_to_sequences(tokens_batch)
    # Flatten with a comprehension rather than np.array(...).flatten(): the
    # label sequences are ragged whenever sequences contain different numbers
    # of masked tokens, and a ragged list cannot be turned into a regular array.
    tokenized_labels_batch = [
        token_id
        for label_sequence in tokenizer.texts_to_sequences(labels_batch)
        for token_id in label_sequence
    ]

    # apply padding on tokens
    # -1 is used as the padding value so that padding stays distinguishable
    # from the 0 that marks [mask] positions while the masks are derived
    padded_tokenized_tokens_batch = pad_sequences(tokenized_tokens_batch, padding='post', value=-1)

    # create the attention mask: after these two steps the array holds
    # -1 for padding, 0 for [mask] and 1 for every other token
    attention_mask = np.array(padded_tokenized_tokens_batch)
    attention_mask[attention_mask == mask_token_id] = 0
    attention_mask[attention_mask > 0] = 1

    # the MLM mask is the reverse view: 1 exactly where the token is [mask]
    is_masked = attention_mask == 0
    mlm_mask = np.expand_dims(is_masked.astype(attention_mask.dtype), axis=-1)

    # create labels
    # np.nonzero walks the array in row-major order, i.e. sequence by sequence
    # and left to right, which is the order the flattened label ids are in.
    sequence_indices, token_indices = np.nonzero(is_masked)
    masked_count = len(sequence_indices)
    if len(tokenized_labels_batch) < masked_count:
        raise ValueError(
            'Batch has ' + str(masked_count) + ' [mask] positions but only '
            + str(len(tokenized_labels_batch)) + ' label ids'
        )
    # Surplus label ids are ignored, as before.
    label_ids = np.asarray(tokenized_labels_batch[:masked_count], dtype=np.int64)

    # Allocate the one-hot labels once as a float32 array and fill them by
    # index instead of building nested Python lists of input_vocab_size ints
    # per token, which needs several times more memory. The array itself is
    # still dense: batch * seq * input_vocab_size floats.
    labels = np.zeros(attention_mask.shape + (input_vocab_size,), dtype=np.float32)
    labels[sequence_indices, token_indices, label_ids] = 1.0

    # change padding tokens to 0
    attention_mask[attention_mask == -1] = 0

    return (tf.constant(padded_tokenized_tokens_batch, tf.float32),
            tf.constant(attention_mask, tf.float32),
            tf.constant(mlm_mask, tf.float32),
            tf.constant(labels, tf.float32))

In [7]:
# import time

# model_trainable_variables = []
# model_gradients = []

# Define a training loop
def train_step(batch, input_vocab_size):
    """Run one optimization step on a single MLM batch and return its loss.

    Uses the notebook-level ``model``, ``loss_function`` and ``optimizer``.

    Args:
        batch: ``(tokens_batch, labels_batch)`` pair, passed on to ``tokenize``.
        input_vocab_size: Size of the one-hot label axis, passed on to
            ``tokenize``.

    Returns:
        The scalar loss tensor computed for this batch (before the weight
        update is applied).
    """
    padded_tokenized_tokens_batch, attention_mask, mlm_mask, labels = tokenize(batch, input_vocab_size)

    with tf.GradientTape() as tape:
        # training=True: this forward pass feeds a weight update, so layers
        # that behave differently in training (e.g. dropout -- presumably
        # present in the encoder, verify) must run in training mode.
        predictions = model([padded_tokenized_tokens_batch, attention_mask, mlm_mask], training=True)

        loss = loss_function(labels, predictions)

    # Differentiate outside the tape context; the tape has already recorded
    # the forward pass.
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss

In [8]:
# Train for a fixed number of epochs, running one optimization step per batch
num_epochs = 100
for epoch in range(num_epochs):
    for batch_index, batch in enumerate(mlm_dataset):
        loss = train_step(batch, input_vocab_size)
        # Report the loss of every batch for monitoring
        print(f'Epoch {epoch!s}, Batch {batch_index!s}, Loss = {loss.numpy()!s}')

Epoch 0, Batch 0, Loss = 0.7173047
Epoch 0, Batch 1, Loss = 0.9461497
Epoch 0, Batch 2, Loss = 0.39980787
Epoch 0, Batch 3, Loss = 1.161488
Epoch 0, Batch 4, Loss = 0.92218745
Epoch 0, Batch 5, Loss = 0.4583708
Epoch 0, Batch 6, Loss = 0.56126654
Epoch 0, Batch 7, Loss = 0.53752095
Epoch 0, Batch 8, Loss = 0.6324646
Epoch 0, Batch 9, Loss = 0.8333324
Epoch 0, Batch 10, Loss = 0.31450394
Epoch 0, Batch 11, Loss = 0.41068906
Epoch 0, Batch 12, Loss = 1.2938486
Epoch 0, Batch 13, Loss = 0.50085026
Epoch 0, Batch 14, Loss = 0.7120204
Epoch 0, Batch 15, Loss = 0.9528971
Epoch 0, Batch 16, Loss = 0.18879342
Epoch 0, Batch 17, Loss = 1.0689213
Epoch 0, Batch 18, Loss = 1.126787
Epoch 0, Batch 19, Loss = 0.33892745
Epoch 0, Batch 20, Loss = 0.6003047
Epoch 0, Batch 21, Loss = 0.37335286
Epoch 0, Batch 22, Loss = 0.32314193
Epoch 0, Batch 23, Loss = 0.8925075
Epoch 0, Batch 24, Loss = 0.96448386
Epoch 0, Batch 25, Loss = 0.84367436
Epoch 0, Batch 26, Loss = 0.13906586
Epoch 0, Batch 27, Loss = 

ResourceExhaustedError: {{function_node __wrapped__Mul_device_/job:localhost/replica:0/task:0/device:CPU:0}} OOM when allocating tensor with shape[20,99,40000] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu [Op:Mul] name: 

In [None]:
# print(model_trainable_variables[0])
# print(model_gradients[0])

In [None]:
# print(model_trainable_variables[8])