In [11]:
import sys
sys.path.append('..')

from tokenizer import Tokenizer
from mlm_dataset.mlm_dataset_generator import MLMDatasetGenerator

# hyperparameters
vocab_size = 22789
model_dim = 512
num_heads = 8
ffn_dim = 2048
max_pos = 512

# # test hyperparameters
# vocab_size = 22732
# model_dim = 70
# num_heads = 2
# ffn_dim = 2048
# max_pos = 10

# MLM dataset for training
mlm_dataset_generator = MLMDatasetGenerator(max_pos=max_pos)

mlm_dataset = mlm_dataset_generator.read_mlm_dataset_from_file()
mlm_dataset_generator.read_raw_training_data_from_file()

tokenizer = Tokenizer(max_pos=max_pos, vocab_size=vocab_size)

# fit tokenizer on dataset
tokenizer.fit_on_texts(mlm_dataset_generator.getVocubulary())

# generate MLM dataset

batch_size = 20
sample_limit = 1000

# to free memory
del mlm_dataset_generator

In [12]:
print(len(tokenizer.word_index))

22732


In [13]:
# tokenizer.save_word_index()

In [14]:
import numpy as np
from base import Sequential
from layers import WordEmbedding, PositionalEncoding, Dense, MultiHeadAttention, SelfAttention, LayerNormalization
from activation import ReLu, Linear, Softmax
from loss import CategoricalCrossEntropy
from optimizers import Adam, GradientDescent

model = Sequential([
			WordEmbedding(vocab_size, model_dim),
			PositionalEncoding(max_pos, model_dim),
            MultiHeadAttention(num_heads, max_pos, model_dim),
			# SelfAttention(max_pos, model_dim),
			LayerNormalization(model_dim),
			# Feed Forward Network
			Dense([model_dim, ffn_dim], ReLu),
			Dense([ffn_dim, model_dim], Linear),
			LayerNormalization(model_dim),
			# MLM Head
			Dense([model_dim, vocab_size], Softmax)
])

In [15]:
# Example truncation handling

# sample_text = 'I am an example resume. Tae tae tae, meow mewo mewo. Cats dogs relationships. I am an example resume. Tae tae tae, meow mewo mewo. Cats dogs relationships.'

# padded_tokenized_training_tokens, training_attention_mask = tokenizer.clean_truncate_tokenize_pad_atten(sample_text)

# print(padded_tokenized_training_tokens)
# print(training_attention_mask)

# print(model.predict(padded_tokenized_training_tokens, training_attention_mask).shape)

In [16]:
training_data, validation_data, testing_data = mlm_dataset

sample_limit = None

training_tokens = training_data[:sample_limit * 2 if sample_limit else None:2]
training_labels = training_data[1:sample_limit * 2 if sample_limit else None:2]

# tokenization, padding, attention mask
padded_tokenized_training_tokens, training_attention_mask = tokenizer.tokenize_pad_atten(tokens=training_tokens)

# MLM training mask
training_mlm_mask = tokenizer.generate_mlm_mask(training_attention_mask)

# change padding tokens to 0
training_attention_mask[training_attention_mask == -1] = 0
padded_tokenized_training_tokens = np.array(padded_tokenized_training_tokens)
padded_tokenized_training_tokens[padded_tokenized_training_tokens == -1] = tokenizer.get_pad_token_id()

In [17]:
# print(padded_tokenized_training_tokens[0])
# print(training_attention_mask[0])
# print(training_mlm_mask[0])

In [18]:
def accuracy_metric(Y, Y_hat):
    Y, Y_hat = Y.T, Y_hat.T

    rows, columns = np.where(Y == 1)
    correct_predictions = sum(1 for actual, predicted in zip(columns, np.argmax(Y_hat[rows], axis=-1)) if actual == predicted)
    return correct_predictions / len(columns)

In [19]:
# load latest model save
model.load_model('0_2600.txt')

In [20]:
tokenized_training_labels = tokenizer.tokenize(training_labels)

total_loss = 0
num_input = 1
for padded_tokenized, attention_mask, mlm_mask, labels in zip(padded_tokenized_training_tokens, 
                                                                  training_attention_mask, 
                                                                  training_mlm_mask, 
                                                                  tokenized_training_labels):
        """
            Creation of labels. Labels should be created per input to reduce memory usage.
            Labels has a shape of (vocab_size, d_model) which is a very large data.
        """
        one_hot_labels = np.zeros((max_pos, vocab_size), dtype=np.float64)
        masked_token_indices = np.where(mlm_mask.flatten() == 1)[0].tolist()
        for row, column in zip(masked_token_indices, labels):
            one_hot_labels[row, column] = 1

        loss, accuracy = model.validate(X=padded_tokenized, 
                    attention_mask=attention_mask, 
                    mlm_mask=mlm_mask,
                    Y=one_hot_labels, 
                    loss_function=CategoricalCrossEntropy,
                    accuracy_metric=accuracy_metric)
        
        total_loss += loss

        print('Loss: ' + str(loss) + ', Total Loss: ' + str(total_loss / num_input) + ', Accuracy: ' + str(accuracy))

        num_input += 1

Loss: 10.038811659062606, Total Loss: 10.038811659062606, Accuracy: 0.0
Loss: 10.039046371859808, Total Loss: 10.038929015461207, Accuracy: 0.0
Loss: 30.11446554661054, Total Loss: 16.730774525844318, Accuracy: 0.0
Loss: 30.084670177550294, Total Loss: 20.069248438770813, Accuracy: 0.0
Loss: 10.038012720608961, Total Loss: 18.06300129513844, Accuracy: 0.0
Loss: 10.038318578873918, Total Loss: 16.725554175761022, Accuracy: 0.0
Loss: 10.038329767294535, Total Loss: 15.770236403122952, Accuracy: 0.0
Loss: 10.038929084570666, Total Loss: 15.053822988303917, Accuracy: 0.0
Loss: 10.036760691250413, Total Loss: 14.496371621964638, Accuracy: 0.0
Loss: 10.036726029694375, Total Loss: 14.05040706273761, Accuracy: 0.0
Loss: 10.037539516153519, Total Loss: 13.685600922139058, Accuracy: 0.0
Loss: 10.037172872494287, Total Loss: 13.381565251335326, Accuracy: 0.0
Loss: 20.073746548657283, Total Loss: 13.8963484280524, Accuracy: 0.0
Loss: 10.036513739947722, Total Loss: 13.620645950330637, Accuracy: 0

KeyboardInterrupt: 

In [11]:
tokenized_training_labels = tokenizer.tokenize(training_labels)

epoch = 20
# save model every n
checkpoint_count = 500

# load training checkpoint
sequence_start = 0
with open('training_checkpoint.txt', 'r') as file:
    sequence_start = int(file.read()) + 1

import time

for epoch_count in range(epoch):
    sequence_count = 0
    for padded_tokenized, attention_mask, mlm_mask, labels in zip(padded_tokenized_training_tokens[sequence_start:], 
                                                                  training_attention_mask[sequence_start:], 
                                                                  training_mlm_mask[sequence_start:], 
                                                                  tokenized_training_labels[sequence_start:]):
        """
            Creation of labels. Labels should be created per input to reduce memory usage.
            Labels has a shape of (vocab_size, d_model) which is a very large data.
        """
        one_hot_labels = np.zeros((max_pos, vocab_size), dtype=np.float64)
        masked_token_indices = np.where(mlm_mask.flatten() == 1)[0].tolist()
        for row, column in zip(masked_token_indices, labels):
            one_hot_labels[row, column] = 1

        training_log = model.fit(X=padded_tokenized, 
                    attention_mask=attention_mask, 
                    mlm_mask=mlm_mask, 
                    Y=one_hot_labels, 
                    loss_function=CategoricalCrossEntropy, 
                    optimizer=GradientDescent(),
                    accuracy_metric=accuracy_metric)
        
        print('Epoch ' + str(epoch_count) + ', Seq ' + str(sequence_count) + ', ' + training_log)
        
        if (sequence_count % checkpoint_count == 0):
            start_time = time.time()

            model.save_model(str(epoch_count) + '_' + str(sequence_count) + '.txt')
            # save checkpoint settings
            with open('training_checkpoint.txt', 'w') as file:
                file.write(str(sequence_count))

            print('Model Saved, Elapsed Time: ' + str(time.time() - start_time))
            start_time = time.time()

        sequence_count += 1

        

Epoch 0, Seq 0, Loss: 10.038173605033311, Accuracy: 0.0, Elapsed Time: 13.443068265914917


KeyboardInterrupt: 

In [None]:
# # convergence test
# padded_tokenized, attention_mask, mlm_mask = list(zip(padded_tokenized_training_tokens, training_attention_mask, training_mlm_mask))[0]
# tokenized_training_labels = tokenizer.tokenize(training_labels)[0]

# for _ in range(20):
#     """
#         Creation of labels. Labels should be created per input to reduce memory usage.
#         Labels has a shape of (vocab_size, d_model) which is a very large data.
#     """
#     one_hot_labels = np.zeros((max_pos, vocab_size), dtype=np.float64)
#     masked_token_indices = np.where(mlm_mask.flatten() == 1)[0].tolist()
#     for row, column in zip(masked_token_indices, tokenized_training_labels):
#         one_hot_labels[row, column] = 1
#     # np.add.at(one_hot_labels, (masked_token_indices, tokenized_labels), 1)

#     model.fit(X=padded_tokenized, 
#                 attention_mask=attention_mask, 
#                 mlm_mask=mlm_mask, 
#                 Y=one_hot_labels, 
#                 loss_function=CategoricalCrossEntropy, 
#                 optimizer=GradientDescent(),
#                 accuracy_metric=accuracy_metric)

In [20]:
# # model beheading test
# model.remove_mlm_head()

# convergence test
padded_tokenized, attention_mask = tokenizer.clean_truncate_tokenize_pad_atten('ressadf sdf sd afs dfd f f the and two for')

embedding = model.predict(X=padded_tokenized, attention_masks=attention_mask)

print(embedding) # shape of (d_model, seq_length)
print(embedding.shape) # (512, 512)

[[-6.81737016e-07 -6.81737016e-07 -6.81737015e-07 ... -6.81737016e-07
  -6.81737016e-07 -6.81737016e-07]
 [-8.78205461e-07 -8.78205461e-07 -8.78205460e-07 ... -8.78205460e-07
  -8.78205460e-07 -8.78205460e-07]
 [ 8.49496321e-07  8.49496321e-07  8.49496322e-07 ...  8.49496322e-07
   8.49496322e-07  8.49496322e-07]
 ...
 [-7.11249682e-07 -7.11249682e-07 -7.11249682e-07 ... -7.11249683e-07
  -7.11249683e-07 -7.11249683e-07]
 [-5.68341682e-07 -5.68341682e-07 -5.68341682e-07 ... -5.68341682e-07
  -5.68341682e-07 -5.68341682e-07]
 [-9.12259581e-07 -9.12259581e-07 -9.12259582e-07 ... -9.12259582e-07
  -9.12259581e-07 -9.12259581e-07]]
(512, 512)


In [None]:
# print(model.get_trainable_variables())
# model.save_model()

In [None]:
# model.load_model()
# print(model.get_trainable_variables())

In [None]:
# print(model.get_trainable_variables())