In [1]:
import tensorflow as tf
import numpy as np

import sys
sys.path.append('../..')

from transformer_encoder import MLMTransformerEncoder
from mlm_dataset.mlm_dataset_generator import MLMDatasetGenerator

In [2]:
# MLM dataset for training
mlm_dataset_generator = MLMDatasetGenerator('../../dataset/resume_dataset.csv')
mlm_dataset = mlm_dataset_generator.generateMLMDataset(256)

# Initialize a Tokenizer and fit on text data
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='[OOV]')
tokenizer.fit_on_texts(mlm_dataset_generator.getVocubulary())

# check how many words are in the dataset (currently: 37032)
# print(list(tokenizer.word_index.keys()))

In [7]:
# MLM dataset checker
inputs, labels = mlm_dataset[0]

print(inputs[121], labels[121])
print(inputs[122], labels[122])

# for index, row in enumerate(inputs):
#     if(row.count('[MASK]') != len(labels[index])):
#         print(index, row, labels[index])

['policy', '[MASK]'] ['development']
['web', 'page', '[MASK]'] ['development']


In [4]:
# Usage example with original Transformer hyperparameters
num_layers = 6
d_model = 512
num_heads = 8
dff = 2048
input_vocab_size = 40000
maximum_position_encoding = 10000

model = MLMTransformerEncoder(num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding)
dummy_input = [tf.keras.Input(shape=(None,)), tf.keras.Input(shape=(None,))]
model(dummy_input)

# Define an optimizer (e.g., Adam)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Define a loss function (e.g., categorical cross-entropy for classification)
loss_function = tf.keras.losses.CategoricalCrossentropy()

In [5]:
import time

# Define a training loop
def train_step(inputs_batch, labels_batch):
    start_time = time.time()

    total_loss = 0.0
    gradients_accumulator = [tf.zeros_like(var) for var in model.trainable_variables]

    counter = 0
    for inputs, labels in zip(inputs_batch, labels_batch):
        with tf.GradientTape() as tape:
            # create one-hot encoded mask and get the indices
            mask =[[]]
            token_indices = []
            for index, token in enumerate(inputs):
                if token == '[MASK]':
                    mask[0].append(0)
                    token_indices.append(index)
                else: 
                    mask[0].append(1)
            mask = tf.cast(mask, tf.float32)
            # tokenize inputs
            input_ids = tf.cast(tokenizer.texts_to_sequences([inputs]), tf.float32)
            # tokenize labels
            token_ids = tokenizer.texts_to_sequences(labels)
            # create array of zeroes with dimension [sequence_length, input_vocab_size]
            tokenized_labels = np.zeros((len(inputs), input_vocab_size))
            # change the [masked_token_index, token_id] to ones
            for index, token_index in enumerate(token_indices):
                tokenized_labels[token_index, token_ids[index]] = 1
            tokenized_labels = tf.constant(tokenized_labels, dtype=tf.float32)

            # print('\n> INPUTS')
            # print(input_ids)
            # print(mask)

            predictions = model([input_ids, mask], training=False)[0]

            # masked_token_indices = []
            # for index, binary in enumerate(mask[0]):
            #     if (binary == 0):
            #         masked_token_indices.append(index)

            # filtered_prediction = tf.gather(predictions, masked_token_indices, axis=0)
            # filtered_labels = tf.gather(tokenized_labels, masked_token_indices, axis=0)

            predicted_token = []
            for row in predictions.numpy():
                if (row[0] > 0):
                    predicted_token.append(np.argmax(row))

            # tokenized_labels = tf.nn.softmax(tokenized_labels)
            # predictions = tf.nn.softmax(predictions)

            # print('\n> LABELS')
            # print(tokenized_labels)
            # print('\n> PREDICTIONS')
            # print(predictions)

            loss = loss_function(tokenized_labels, predictions)
            # print('\n> LOSS')
            # print(loss)
        
        gradients = tape.gradient(loss, model.trainable_variables)
        # print('GRADIENTS')
        # print(gradients)

        gradients_accumulator = [(grad_accum + grad) / 2 for grad_accum, grad in zip(gradients_accumulator, gradients)]
        total_loss += loss

        print('Seq ' + str(counter) + ', Loss = ' + str(loss) + ', Predicted Token = ' + str(predicted_token) + ', True Token = ' + str(token_ids))
        counter += 1

    # gradients_avg = [grad / len(inputs_batch) for grad in gradients_accumulator]
    
    optimizer.apply_gradients(zip(gradients_accumulator, model.trainable_variables))

    return total_loss / len(inputs_batch), str(time.time() - start_time)

In [6]:
# Example of usage in the training loop
num_epochs = 10
for epoch in range(num_epochs):
    batch_counter = 0
    for inputs_batch, labels_batch in mlm_dataset:  # Provide training data
        loss, elapsed_time = train_step(inputs_batch, labels_batch)
        # Log or print the loss for monitoring
        print('Epoch ' + str(epoch) + ', Batch ' + str(batch_counter) + ', Loss = ' + str(loss) + ', Elapsed Time: ' + elapsed_time + '\n\n\n')
        batch_counter += 1

Seq 0, Loss = tf.Tensor(2.6974745, shape=(), dtype=float32), Predicted Token = [6250], True Token = [[761]]
Seq 1, Loss = tf.Tensor(5.3978014, shape=(), dtype=float32), Predicted Token = [25263], True Token = [[761]]
Seq 2, Loss = tf.Tensor(2.1393628, shape=(), dtype=float32), Predicted Token = [25263, 21427, 34827], True Token = [[826], [31], [7]]
Seq 3, Loss = tf.Tensor(1.8511932, shape=(), dtype=float32), Predicted Token = [34827, 34827, 9466], True Token = [[8085], [1777], [16]]
Seq 4, Loss = tf.Tensor(2.631157, shape=(), dtype=float32), Predicted Token = [32975], True Token = [[294]]
Seq 5, Loss = tf.Tensor(5.3113327, shape=(), dtype=float32), Predicted Token = [6250], True Token = [[11]]
Seq 6, Loss = tf.Tensor(5.3812966, shape=(), dtype=float32), Predicted Token = [6250], True Token = [[2384]]
Seq 7, Loss = tf.Tensor(3.5014212, shape=(), dtype=float32), Predicted Token = [25263], True Token = [[948]]
Seq 8, Loss = tf.Tensor(3.539038, shape=(), dtype=float32), Predicted Token = [

KeyboardInterrupt: 

In [None]:
print(model.trainable_variables)

[<tf.Variable 'mlm_transformer_encoder/transformer_encoder/embedding/embeddings:0' shape=(40000, 512) dtype=float32, numpy=
array([[-0.01588781,  0.01453067, -0.02597257, ...,  0.04620039,
        -0.0454732 , -0.02524504],
       [ 0.0347981 , -0.02962655,  0.04831713, ..., -0.00495056,
        -0.01368779,  0.03826029],
       [-0.01359411,  0.0312282 , -0.02915373, ..., -0.0338385 ,
         0.02261953, -0.01964855],
       ...,
       [ 0.02977041, -0.01259378,  0.02108761, ...,  0.0433084 ,
        -0.03834623,  0.00129116],
       [-0.01145588, -0.01207805,  0.02925486, ..., -0.00199153,
         0.01266504,  0.03773722],
       [-0.01921101,  0.01885745, -0.0260011 , ...,  0.01014141,
        -0.00208644, -0.01153468]], dtype=float32)>, <tf.Variable 'mlm_transformer_encoder/transformer_encoder/encoder/multi_head_attention/query/kernel:0' shape=(512, 8, 512) dtype=float32, numpy=
array([[[-4.3800301e-03, -4.0317036e-04,  2.4611482e-03, ...,
          4.6334777e-04, -4.2244350e-03