In [1]:
import sys
sys.path.append('..')

from tokenizer import Tokenizer
from mlm_dataset.mlm_dataset_generator import MLMDatasetGenerator

# hyperparameters
# vocab_size: passed to Tokenizer, WordEmbedding and the output width of the MLM head.
vocab_size = 22732
# model_dim: feature width shared by the embedding, attention, normalization and Dense layers.
model_dim = 512
# num_heads: head count handed to MultiHeadAttention.
num_heads = 8
# ffn_dim: hidden width of the two-layer feed-forward network.
ffn_dim = 2048
# max_pos: position count handed to the dataset generator, tokenizer and position-aware layers;
# it is also the row count of the per-sequence one-hot label matrix built during training.
max_pos = 512

# # test hyperparameters
# vocab_size = 22732
# model_dim = 70
# num_heads = 2
# ffn_dim = 2048
# max_pos = 10

# MLM dataset for training
mlm_dataset_generator = MLMDatasetGenerator(max_pos=max_pos)

# Read the stored MLM dataset; a later cell unpacks it into training/validation/testing parts.
mlm_dataset = mlm_dataset_generator.read_mlm_dataset_from_file()
# The return value is discarded here — presumably this call loads the raw text that
# getVocubulary() relies on below; TODO confirm against MLMDatasetGenerator.
mlm_dataset_generator.read_raw_training_data_from_file()

tokenizer = Tokenizer(max_pos=max_pos, vocab_size=vocab_size)

# fit tokenizer on dataset
tokenizer.fit_on_texts(mlm_dataset_generator.getVocubulary())

# sampling settings
# NOTE(review): batch_size is not referenced by any visible cell, and sample_limit is
# reassigned to None in the data-preparation cell before it is first read.

batch_size = 20
sample_limit = 1000

# to free memory: the generator is no longer needed once the dataset is read and the tokenizer is fitted
del mlm_dataset_generator

In [2]:
# Sanity check: number of entries in the fitted tokenizer's word index.
print(len(tokenizer.word_index))

22732


In [3]:
# tokenizer.save_word_index()

In [4]:
import numpy as np
from base import Sequential
from layers import WordEmbedding, PositionalEncoding, Dense, MultiHeadAttention, SelfAttention, LayerNormalization
from activation import ReLu, Linear, Softmax
from loss import CategoricalCrossEntropy
from optimizers import Adam, GradientDescent

# Encoder stack followed by a masked-language-model head, assembled stage by stage.
# Layers are instantiated in the same top-to-bottom order in which they are applied.

# Token embedding followed by positional encoding.
embedding_stage = [
    WordEmbedding(vocab_size, model_dim),
    PositionalEncoding(max_pos, model_dim),
]

# Attention followed by layer normalization.
attention_stage = [
    MultiHeadAttention(num_heads, max_pos, model_dim),
    # SelfAttention(max_pos, model_dim),
    LayerNormalization(model_dim),
]

# Feed Forward Network: expand to ffn_dim with ReLu, project back with a linear layer.
feed_forward_stage = [
    Dense([model_dim, ffn_dim], ReLu),
    Dense([ffn_dim, model_dim], Linear),
    LayerNormalization(model_dim),
]

# MLM Head: softmax over the vocabulary.
mlm_head_stage = [
    Dense([model_dim, vocab_size], Softmax),
]

model = Sequential(embedding_stage + attention_stage + feed_forward_stage + mlm_head_stage)

In [5]:
# Example truncation handling

# sample_text = 'I am an example resume. Tae tae tae, meow mewo mewo. Cats dogs relationships. I am an example resume. Tae tae tae, meow mewo mewo. Cats dogs relationships.'

# padded_tokenized_training_tokens, training_attention_mask = tokenizer.clean_truncate_tokenize_pad_atten(sample_text)

# print(padded_tokenized_training_tokens)
# print(training_attention_mask)

# print(model.predict(padded_tokenized_training_tokens, training_attention_mask).shape)

In [6]:
# Unpack the three dataset parts; only the training part is used in this cell.
training_data, validation_data, testing_data = mlm_dataset

# None keeps every sample; an integer keeps only the first `sample_limit` input/label pairs.
sample_limit = None

# training_data interleaves inputs (even indices) with their labels (odd indices),
# so `sample_limit` pairs occupy the first 2 * sample_limit entries.
slice_stop = 2 * sample_limit if sample_limit else None
training_tokens = training_data[0:slice_stop:2]
training_labels = training_data[1:slice_stop:2]

# tokenization, padding, attention mask
padded_tokenized_training_tokens, training_attention_mask = tokenizer.tokenize_pad_atten(
    tokens=training_tokens
)

# MLM training mask, generated while the attention mask still carries its -1 markers
training_mlm_mask = tokenizer.generate_mlm_mask(training_attention_mask)

# Replace the -1 markers: 0 in the attention mask, the pad token id in the token ids.
attention_padding = training_attention_mask == -1
training_attention_mask[attention_padding] = 0

padded_tokenized_training_tokens = np.array(padded_tokenized_training_tokens)
token_padding = padded_tokenized_training_tokens == -1
padded_tokenized_training_tokens[token_padding] = tokenizer.get_pad_token_id()

In [7]:
def accuracy_metric(Y, Y_hat):
    """Return the fraction of labelled positions that were predicted correctly.

    Both arrays are transposed on entry, so they are expected with classes
    along the first axis and positions along the second. After the transpose,
    every entry of ``Y`` equal to 1 marks one (position, class) label; the
    prediction for that position is the arg-max of the matching ``Y_hat`` row.
    Positions without any label entry are ignored.

    Args:
        Y: 2-D numpy array of one-hot labels.
        Y_hat: 2-D numpy array of predicted scores, same layout as ``Y``.

    Returns:
        float: correct predictions divided by the number of label entries,
        or 0.0 when ``Y`` contains no label at all (e.g. a sequence with no
        masked tokens) instead of raising ZeroDivisionError.
    """
    Y, Y_hat = Y.T, Y_hat.T

    rows, columns = np.where(Y == 1)
    if rows.size == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    # Vectorised comparison of the predicted class against the labelled class.
    predicted = np.argmax(Y_hat[rows], axis=-1)
    correct_predictions = int(np.count_nonzero(predicted == columns))
    return correct_predictions / rows.size

In [None]:
# # load latest model save
# model.log('.txt')

In [8]:
import time

tokenized_training_labels = tokenizer.tokenize(training_labels)

epoch = 20
# save model every n
checkpoint_count = 10
checkpoint_path = 'training_checkpoint.txt'

# Load the training checkpoint: the index of the next training sequence to fit.
# A missing file means there is nothing to resume, so training starts at 0.
sequence_start = 0
try:
    with open(checkpoint_path, 'r') as file:
        sequence_start = int(file.read())
except FileNotFoundError:
    pass

for epoch_count in range(epoch):
    # Only the first epoch of this run resumes part-way through the data;
    # every later epoch must iterate over the whole training set again.
    # NOTE: the checkpoint stores no epoch number, so epoch_count always
    # restarts at 0 after a resume.
    first_sequence = sequence_start if epoch_count == 0 else 0

    training_sequences = zip(padded_tokenized_training_tokens[first_sequence:],
                             training_attention_mask[first_sequence:],
                             training_mlm_mask[first_sequence:],
                             tokenized_training_labels[first_sequence:])

    # sequence_count is the absolute index into the training set, so log lines,
    # model file names and checkpoints stay consistent across resumed runs.
    for sequence_count, (padded_tokenized, attention_mask, mlm_mask, labels) in enumerate(
            training_sequences, start=first_sequence):
        # Labels are created per input to reduce memory usage: each one-hot
        # label matrix has shape (max_pos, vocab_size), which is very large.
        one_hot_labels = np.zeros((max_pos, vocab_size), dtype=np.float64)
        # Rows flagged in the MLM mask, paired in order with the label ids.
        masked_token_indices = np.where(mlm_mask.flatten() == 1)[0].tolist()
        for row, column in zip(masked_token_indices, labels):
            one_hot_labels[row, column] = 1

        training_log = model.fit(X=padded_tokenized,
                                 attention_mask=attention_mask,
                                 mlm_mask=mlm_mask,
                                 Y=one_hot_labels,
                                 loss_function=CategoricalCrossEntropy,
                                 optimizer=GradientDescent(),
                                 accuracy_metric=accuracy_metric)

        print('Epoch ' + str(epoch_count) + ', Seq ' + str(sequence_count) + ', ' + training_log)

        if sequence_count % checkpoint_count == 0:
            start_time = time.time()

            model.save_model(str(epoch_count) + '_' + str(sequence_count) + '.txt')
            # The saved model already includes this sequence, so a resumed
            # run should continue with the following one.
            with open(checkpoint_path, 'w') as file:
                file.write(str(sequence_count + 1))

            print('Model Saved, Elapsed Time: ' + str(time.time() - start_time))

        

Epoch 0, Seq 0, Loss: 10.032693566967852, Accuracy: 0.0, Elapsed Time: 15.009294271469116
Model Saved, Elapsed Time: 29.796020030975342
Epoch 0, Seq 1, Loss: 10.03220426576604, Accuracy: 0.0, Elapsed Time: 15.532159805297852
Epoch 0, Seq 2, Loss: 30.09458042366779, Accuracy: 0.0, Elapsed Time: 15.477685689926147
Epoch 0, Seq 3, Loss: 30.094297649931548, Accuracy: 0.0, Elapsed Time: 15.299492835998535
Epoch 0, Seq 4, Loss: 10.030507326706617, Accuracy: 0.0, Elapsed Time: 15.364421844482422
Epoch 0, Seq 5, Loss: 10.031559027264011, Accuracy: 0.0, Elapsed Time: 15.53899884223938
Epoch 0, Seq 6, Loss: 10.030914027647146, Accuracy: 0.0, Elapsed Time: 15.517599105834961
Epoch 0, Seq 7, Loss: 10.033554267023806, Accuracy: 0.0, Elapsed Time: 15.408987998962402
Epoch 0, Seq 8, Loss: 10.031110469339412, Accuracy: 0.0, Elapsed Time: 15.798527240753174
Epoch 0, Seq 9, Loss: 10.031337594835554, Accuracy: 0.0, Elapsed Time: 14.635864019393921
Epoch 0, Seq 10, Loss: 10.032236803236762, Accuracy: 0.0,

KeyboardInterrupt: 

In [None]:
# # convergence test
# padded_tokenized, attention_mask, mlm_mask = list(zip(padded_tokenized_training_tokens, training_attention_mask, training_mlm_mask))[0]
# tokenized_training_labels = tokenizer.tokenize(training_labels)[0]

# for _ in range(20):
#     """
#         Creation of labels. Labels are created per input to reduce memory usage:
#         each one-hot label matrix has shape (max_pos, vocab_size), which is very large.
#     """
#     one_hot_labels = np.zeros((max_pos, vocab_size), dtype=np.float64)
#     masked_token_indices = np.where(mlm_mask.flatten() == 1)[0].tolist()
#     for row, column in zip(masked_token_indices, tokenized_training_labels):
#         one_hot_labels[row, column] = 1
#     # np.add.at(one_hot_labels, (masked_token_indices, tokenized_labels), 1)

#     model.fit(X=padded_tokenized, 
#                 attention_mask=attention_mask, 
#                 mlm_mask=mlm_mask, 
#                 Y=one_hot_labels, 
#                 loss_function=CategoricalCrossEntropy, 
#                 optimizer=GradientDescent(),
#                 accuracy_metric=accuracy_metric)

In [None]:
# # model beheading test
# model.remove_mlm_head()

# # convergence test
# padded_tokenized, attention_mask, mlm_mask = list(zip(padded_tokenized_training_tokens, training_attention_mask, training_mlm_mask))[0]

# embedding = model.predict(X=padded_tokenized, attention_mask=attention_mask)

# print(embedding) # shape of (d_model, seq_length)
# print(embedding.shape) # (512, 512)

[[-4.27097538e-03  1.48845310e-03  6.13420199e-04 ... -2.24484553e-03
  -4.61373138e-04 -3.72050150e-04]
 [ 4.45254348e-03  6.64050752e-03  3.39324109e-03 ...  5.36004780e-03
   4.40420823e-03  2.55265890e-03]
 [ 1.11647376e-03  7.66161637e-04  2.85109310e-03 ...  1.59098767e-03
   1.67978736e-03 -7.33809141e-05]
 ...
 [ 4.71714501e-04  3.59302877e-04  1.88894696e-03 ... -1.34735135e-03
  -1.25241590e-03 -1.32853576e-03]
 [ 3.61046064e-03  2.04226710e-03  6.54083094e-04 ...  3.63204441e-03
   4.01618613e-03  4.09725290e-03]
 [ 6.24274773e-04 -8.59859300e-04  2.10151943e-03 ... -8.59635046e-04
  -3.59025337e-04 -2.00460710e-04]]
(512, 512)


In [None]:
# print(model.get_trainable_variables())
# model.save_model()

In [None]:
# model.load_model()
# print(model.get_trainable_variables())

In [None]:
# print(model.get_trainable_variables())