In [None]:
import tensorflow as tf
import numpy as np

import sys
sys.path.append('../..')

from transformer_encoder import MLMTransformerEncoder
from mlm_dataset.mlm_dataset_generator import MLMDatasetGenerator

In [None]:
# Build the masked-language-model dataset and keep a single sample for training.
# NOTE(review): the meaning of the argument `1` to generateMLMDataset is not
# visible here (sample count / batch size?) — confirm against MLMDatasetGenerator.
mlm_dataset_generator = MLMDatasetGenerator('../../dataset/resume_dataset.csv')
batch_inputs, batch_labels = mlm_dataset_generator.generateMLMDataset(1)[0]
inputs, labels = batch_inputs[0], batch_labels[0]
print(inputs, labels)

# Word-level tokenizer fitted on the generator's vocabulary; words it has not
# seen are mapped to the '[OOV]' token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='[OOV]')
vocabulary = mlm_dataset_generator.getVocubulary()
tokenizer.fit_on_texts(vocabulary)

# To inspect the fitted vocabulary (37032 words when this was last checked):
# print(list(tokenizer.word_index.keys()))

In [None]:
# Encoder hyperparameters. d_model, num_heads, dff and rate follow the base
# configuration of the original Transformer; the stack is reduced to one layer.
num_layers = 1
d_model = 512
num_heads = 8
dff = 2048
input_vocab_size = 40000  # also the width of the one-hot targets built in train_step
maximum_position_encoding = 10000
rate = 0.1

model = MLMTransformerEncoder(
    num_layers,
    d_model,
    num_heads,
    dff,
    input_vocab_size,
    maximum_position_encoding,
    rate,
)
# Optional: build the model eagerly with symbolic inputs.
# dummy_input = [tf.keras.Input(shape=(None, None, 512)), tf.keras.Input(shape=(None, None, 512))]
# model(dummy_input)

# Optional debug buffers (filled by the commented-out lines in train_step).
# model_trainable_variables = []
# gradients_test = []
# model_trainable_variables.append(model.trainable_variables)

# Adam optimizer with the Keras default settings.
optimizer = tf.keras.optimizers.Adam()

# Categorical cross-entropy against one-hot targets.
# NOTE(review): from_logits is left at its default, so the model output is
# treated as probabilities — confirm MLMTransformerEncoder applies a softmax.
loss_function = tf.keras.losses.CategoricalCrossentropy()

In [None]:
def train_step(inputs, labels):
    """Run one optimisation step of the MLM encoder on a single token sequence.

    Uses the notebook-level ``tokenizer``, ``model``, ``loss_function``,
    ``optimizer`` and ``input_vocab_size``.

    Args:
        inputs: Sequence of token strings in which every position to predict
            holds the literal string ``'[MASK]'``.
        labels: Texts of the original tokens for the masked positions, indexed
            in the same order as the ``'[MASK]'`` entries appear in ``inputs``
            (presumably one label per mask — verify against the dataset
            generator).

    Returns:
        A tuple ``(loss, predicted_token, token_ids)`` where ``loss`` is the
        value returned by ``loss_function``, ``predicted_token`` is the list of
        arg-max vocabulary ids at each masked position, and ``token_ids`` is
        the output of ``tokenizer.texts_to_sequences(labels)``.

    Raises:
        IndexError: If ``labels`` has fewer entries than there are masked
            positions, or a label id is not below ``input_vocab_size``.
    """
    # Positions of the masked tokens, in order of appearance.
    token_indices = [index for index, token in enumerate(inputs) if token == '[MASK]']
    # Mask row handed to the model: 0 at masked positions, 1 everywhere else.
    mask = tf.constant([[0 if token == '[MASK]' else 1 for token in inputs]], tf.float32)

    # Token ids for the whole sequence, wrapped as a batch of one.
    input_ids = tf.cast(tokenizer.texts_to_sequences([inputs]), tf.float32)
    # Token ids of the labels: one list of ids per label text.
    token_ids = tokenizer.texts_to_sequences(labels)

    # One-hot targets of shape [sequence_length, input_vocab_size]; only the
    # rows of masked positions are filled, all other rows stay zero.
    tokenized_labels = np.zeros((len(inputs), input_vocab_size))
    for label_number, position in enumerate(token_indices):
        tokenized_labels[position, token_ids[label_number]] = 1
    tokenized_labels = tf.constant(tokenized_labels, dtype=tf.float32)

    with tf.GradientTape() as tape:
        # training=True so that training-only layers (e.g. dropout) are active
        # while the weights are being optimised; the original call passed
        # training=False, which silently disabled them.
        # [0] selects the single sequence of the batch.
        predictions = model([input_ids, mask], training=True)[0]
        loss = loss_function(tokenized_labels, predictions)

    # Most likely vocabulary id at every masked position.
    predicted_token = [np.argmax(predictions[position]) for position in token_indices]

    gradients = tape.gradient(loss, model.trainable_variables)
    # Optional debugging hooks (need the buffers from the model cell):
    # gradients_test.append(gradients)
    # model_trainable_variables.append(model.trainable_variables)

    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, predicted_token, token_ids

In [None]:
# print(model_trainable_variables[5])

In [None]:
# print(gradients_test[0])

In [None]:
# Repeatedly fit the single (inputs, labels) sample and report progress
num_epochs = 10000
progress_template = 'Epoch {!s}, Loss = {!s}, Predicted Token = {!s}, True Token = {!s}'
for epoch in range(num_epochs):
    loss, predicted_token, token_ids = train_step(inputs, labels)
    # One line per step: loss plus predicted vs. true ids at the masked positions
    print(progress_template.format(epoch, loss, predicted_token, token_ids))