In [2]:
import tensorflow as tf
import numpy as np

import sys
sys.path.append('../..')

from transformer_encoder import MLMTransformerEncoder
from mlm_dataset.mlm_dataset_generator import MLMDatasetGenerator

In [3]:
# Load the resume dataset and keep one (inputs, labels) training sample:
# the first element returned by generateMLMDataset(1) is unpacked into its
# inputs/labels parts, and the first entry of each part is used.
mlm_dataset_generator = MLMDatasetGenerator('../../dataset/resume_dataset.csv')
first_entry = mlm_dataset_generator.generateMLMDataset(1)[0]
entry_inputs, entry_labels = first_entry
inputs, labels = entry_inputs[0], entry_labels[0]
print(inputs, labels)

# Keras tokenizer fitted on the generator's vocabulary; words it has not
# seen are mapped to the '[OOV]' token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='[OOV]')
tokenizer.fit_on_texts(mlm_dataset_generator.getVocubulary())

# Uncomment to list the fitted vocabulary (37032 words when last checked):
# print(list(tokenizer.word_index.keys()))

['[MASK]', 'administrator', 'marketing', 'associate'] ['hr']


In [4]:
# Encoder hyperparameters, based on the original Transformer settings
# (this example uses a single encoder layer).
num_layers = 1
d_model = 512
num_heads = 8
dff = 2048
input_vocab_size = 40000
maximum_position_encoding = 10000
rate = 0.1

# Arguments are passed positionally, in the same order as declared above.
model = MLMTransformerEncoder(
    num_layers,
    d_model,
    num_heads,
    dff,
    input_vocab_size,
    maximum_position_encoding,
    rate,
)
# Optional: build the model on placeholder inputs.
# dummy_input = [tf.keras.Input(shape=(None, None, 512)), tf.keras.Input(shape=(None, None, 512))]
# model(dummy_input)

# Debug buffers: train_step appends to both of these on each call.
model_trainable_variables = []
gradients_test = []

# model_trainable_variables.append(model.trainable_variables)

# Adam optimizer with its default settings.
optimizer = tf.keras.optimizers.Adam()

# Categorical cross-entropy, compared against one-hot label rows.
loss_function = tf.keras.losses.CategoricalCrossentropy()

In [5]:
def train_step(inputs, labels, training=False, record_history=True):
    """Run one optimisation step of masked-token prediction on a single sequence.

    Relies on the module-level `tokenizer`, `model`, `loss_function`,
    `optimizer` and `input_vocab_size`.

    Args:
        inputs: sequence of string tokens; every position to be predicted
            holds the literal '[MASK]'.
        labels: sequence of strings; the i-th entry is the true text for the
            i-th '[MASK]' found in `inputs`.
        training: value forwarded as the model's `training` argument. The
            default (False) is the value this function always used before the
            parameter existed.
            NOTE(review): a training step normally passes True so that
            dropout-style layers are active -- confirm against
            MLMTransformerEncoder before changing the default.
        record_history: when True (default) the gradients and the model's
            trainable variables are appended to the module-level
            `gradients_test` / `model_trainable_variables` lists on every
            call. Those lists grow without bound, so pass False for long runs.

    Returns:
        A tuple `(loss, predicted_token, token_ids)`:
        loss: the value returned by `loss_function` for this step.
        predicted_token: argmax over the vocabulary axis of the prediction at
            the last '[MASK]' position, or None when `inputs` has no '[MASK]'.
        token_ids: `tokenizer.texts_to_sequences(labels)`.
    """
    # Positions of the masked tokens, in order of appearance.
    token_indices = [index for index, token in enumerate(inputs) if token == '[MASK]']
    # Shape (1, sequence_length): 0 marks a '[MASK]' position, 1 every other token.
    mask = tf.cast([[0 if token == '[MASK]' else 1 for token in inputs]], tf.float32)

    # Tokenize the inputs as a batch holding one sequence.
    input_ids = tf.cast(tokenizer.texts_to_sequences([inputs]), tf.float32)
    # Tokenize the labels: one list of ids per label string.
    token_ids = tokenizer.texts_to_sequences(labels)

    # One-hot targets of shape [sequence_length, input_vocab_size]; rows of
    # positions that are not masked stay all-zero.
    tokenized_labels = np.zeros((len(inputs), input_vocab_size))
    for index, token_index in enumerate(token_indices):
        # token_ids[index] is a list of ids, so every id of the label is set.
        tokenized_labels[token_index, token_ids[index]] = 1
    tokenized_labels = tf.constant(tokenized_labels, dtype=tf.float32)

    with tf.GradientTape() as tape:
        # [0] drops the batch axis so predictions line up with tokenized_labels.
        predictions = model([input_ids, mask], training=training)[0]
        loss = loss_function(tokenized_labels, predictions)

    # Read the prediction at the known masked position instead of guessing the
    # row from its first value; with several masks the last one is reported,
    # keeping the single-value return.
    predicted_token = None
    if token_indices:
        predicted_token = np.argmax(predictions.numpy()[token_indices[-1]])

    gradients = tape.gradient(loss, model.trainable_variables)

    if record_history:
        # Debug bookkeeping inspected by other notebook cells.
        gradients_test.append(gradients)
        model_trainable_variables.append(model.trainable_variables)

    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, predicted_token, token_ids

In [10]:
# Show the trainable variables recorded by the sixth train_step call.
# The list is only filled while training runs, so guard against it being too
# short (for example when this cell is executed before the training loop).
if len(model_trainable_variables) > 5:
    print(model_trainable_variables[5])
else:
    print('model_trainable_variables has only ' + str(len(model_trainable_variables))
          + ' entries; run the training loop for at least 6 steps first')

[<tf.Variable 'mlm_transformer_encoder/transformer_encoder/embedding/embeddings:0' shape=(40000, 512) dtype=float32, numpy=
array([[ 0.04116045,  0.03449507, -0.02031866, ...,  0.0213267 ,
        -0.0170299 , -0.03837688],
       [ 0.00687126, -0.0370633 ,  0.02279961, ...,  0.02008654,
        -0.04162905, -0.04138961],
       [ 0.00798477,  0.00461094,  0.04755919, ..., -0.04037492,
        -0.02516056, -0.03373513],
       ...,
       [ 0.02167935,  0.02031581,  0.0395397 , ..., -0.03466809,
         0.00383444,  0.01453886],
       [-0.03064232, -0.00536763, -0.04551834, ..., -0.01846008,
        -0.04464027,  0.03317425],
       [ 0.0486683 ,  0.00153913,  0.01354399, ..., -0.01716299,
         0.01344727,  0.00891051]], dtype=float32)>, <tf.Variable 'mlm_transformer_encoder/transformer_encoder/encoder/multi_head_attention/query/kernel:0' shape=(512, 8, 512) dtype=float32, numpy=
array([[[ 2.9262018e-03,  4.1775559e-03, -4.8418256e-04, ...,
         -3.1049266e-03,  4.0266183e-03

In [7]:
# print(gradients_test[0])

In [8]:
# Training loop: fit the single (inputs, labels) sample over and over,
# reporting the loss and the predicted vs. true token after every step.
num_epochs = 10000
for epoch in range(num_epochs):
    loss, predicted_token, token_ids = train_step(inputs, labels)
    # `!s` forces str() on each value so the printed text keeps the full
    # tensor representation of the loss.
    print(f'Epoch {epoch!s}, Loss = {loss!s}, Predicted Token = {predicted_token!s}, True Token = {token_ids!s}')

Epoch 0, Loss = tf.Tensor(2.6558914, shape=(), dtype=float32), Predicted Token = 34672, True Token = [[183]]
Epoch 1, Loss = tf.Tensor(1.8565924, shape=(), dtype=float32), Predicted Token = 183, True Token = [[183]]
Epoch 2, Loss = tf.Tensor(1.6944456, shape=(), dtype=float32), Predicted Token = 183, True Token = [[183]]
Epoch 3, Loss = tf.Tensor(1.4963288, shape=(), dtype=float32), Predicted Token = 183, True Token = [[183]]
Epoch 4, Loss = tf.Tensor(1.3135195, shape=(), dtype=float32), Predicted Token = 183, True Token = [[183]]
Epoch 5, Loss = tf.Tensor(1.1246877, shape=(), dtype=float32), Predicted Token = 183, True Token = [[183]]
Epoch 6, Loss = tf.Tensor(0.9316613, shape=(), dtype=float32), Predicted Token = 183, True Token = [[183]]
Epoch 7, Loss = tf.Tensor(0.73952705, shape=(), dtype=float32), Predicted Token = 183, True Token = [[183]]
Epoch 8, Loss = tf.Tensor(0.55323195, shape=(), dtype=float32), Predicted Token = 183, True Token = [[183]]
Epoch 9, Loss = tf.Tensor(0.38110

KeyboardInterrupt: 