In [1]:
import sys
sys.path.append('..')

# from mlm_dataset.mlm_dataset_generator import MLMDatasetGenerator

# # MLM dataset for training
# mlm_dataset_generator = MLMDatasetGenerator('../dataset/resume_dataset.csv')

In [2]:
from base import Sequential
from layers import WordEmbedding, PositionalEncoding, Dense, MultiHeadAttention, LayerNormalization
from activation import ReLu, Linear, Softmax
from tokenizer import Tokenizer
from preprocessing import pad_sequences, generate_attention_mask, generate_mlm_mask

# vocab_size = 40000
# model_dim = 512
# num_heads = 8
# ffn_dim = 2048

# tokenizer = Tokenizer(vocab_size)

# model = Sequential([
#         WordEmbedding(vocab_size, model_dim),
#         PositionalEncoding(int, model_dim),
#         MultiHeadAttention(num_heads, int, model_dim),
#         # Feed Forward Network
#         Dense([model_dim, ffn_dim], ReLu()),
#         Dense([ffn_dim, model_dim], Linear()),
#         LayerNormalization(model_dim),
#         # MLM Head
#         Dense([model_dim, vocab_size], Softmax())
# ])

In [3]:
import numpy as np
import tensorflow as tf
import keras.layers

# hyper params
# Kept small; the cells below print complete weight matrices and activations.
max_pos = 3  # number of positions handed to PositionalEncoding / MultiHeadAttention below
d_model = 4  # feature width used by every layer under test
vocab_size = 10  # number of rows in the embedding tables

In [4]:
# Word Embedding Layer Test
# Keras reference layer. Calling it once on a dummy token id makes it create its weights.
# Token ids are passed as integers (the NumPy path in the next cell uses int64 as well)
# instead of relying on Keras silently casting float indices.
tf_word_embedding_layer = keras.layers.Embedding(vocab_size, d_model, dtype=tf.float64)
tf_word_embedding_layer(tf.constant([1], dtype=tf.int64))

# Copy the Keras embedding matrix into the NumPy layer so both hold identical weights.
word_embedding_layer = WordEmbedding(vocab_size, d_model)
word_embedding_layer.set_trainable_variables(tf_word_embedding_layer.get_weights()[0])
print(word_embedding_layer.get_trainable_variables())

{'word_embedding/weights': array([[-0.02679642,  0.04489677,  0.0324373 ,  0.01394802],
       [ 0.01818262, -0.03496223, -0.00070968,  0.02177156],
       [-0.04322726,  0.03282814, -0.02480682,  0.03183122],
       [-0.04540983,  0.03963188, -0.01973656, -0.01484503],
       [ 0.02463193, -0.01660786,  0.00731976, -0.01108236],
       [-0.01081998, -0.00088423,  0.01886229, -0.02092738],
       [-0.00963866,  0.03029319, -0.03382122,  0.00839908],
       [-0.02094575, -0.03279179,  0.00265743, -0.03675856],
       [ 0.04187943,  0.02954264, -0.02944153,  0.03337073],
       [-0.0356996 , -0.00486696,  0.02759491, -0.00151797]])}


In [5]:

inputs = [1, 2, 3]

# Reference lookup with Keras; ids are integers, matching the dtype used on the NumPy path.
tf_word_embedding = tf_word_embedding_layer(tf.constant(inputs, dtype=tf.int64))
print('tensorflow:')
print(tf_word_embedding)

word_embedding = word_embedding_layer.forward(np.array(inputs, dtype=np.int64).T)
print('numpy:')
print(word_embedding)

# Fail loudly if the two implementations diverge instead of relying on a visual comparison.
np.testing.assert_allclose(word_embedding, np.asarray(tf_word_embedding), rtol=1e-6)

tensorflow:
tf.Tensor(
[[ 0.01818262 -0.03496223 -0.00070968  0.02177156]
 [-0.04322726  0.03282814 -0.02480682  0.03183122]
 [-0.04540983  0.03963188 -0.01973656 -0.01484503]], shape=(3, 4), dtype=float64)
numpy:
[[ 0.01818262 -0.03496223 -0.00070968  0.02177156]
 [-0.04322726  0.03282814 -0.02480682  0.03183122]
 [-0.04540983  0.03963188 -0.01973656 -0.01484503]]


In [6]:
# Positional Encoding Layer Test
def positional_encoding(position, d_model):
    """Build the reference sinusoidal positional-encoding table.

    Args:
        position: number of positions (rows) to generate.
        d_model: number of feature columns per position.

    Returns:
        A float64 tensor of shape (1, position, d_model) whose even columns
        hold sines and odd columns hold cosines of the position-dependent angles.
    """
    row_index = np.arange(position).reshape(-1, 1)
    column_index = np.arange(d_model).reshape(1, -1)

    # Columns 2i and 2i+1 share the same frequency: 1 / 10000^(2i / d_model).
    exponent = (2 * (column_index // 2)) / np.float64(d_model)
    table = row_index / np.power(10000, exponent)

    # Even columns become sines, odd columns become cosines (in place).
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Add the leading batch axis before handing the table to TensorFlow.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float64)

# Reference table; despite the name this is a tensor of shape (1, max_pos, d_model), not a layer.
tf_positional_encoding_layer = positional_encoding(max_pos, d_model)
print('tensorflow:')
print(tf_positional_encoding_layer)

positional_encoding_layer = PositionalEncoding(max_pos, d_model)
print('numpy:')
print(positional_encoding_layer.get_positional_encoding())

# The reference table carries a leading batch axis of size 1; drop it before comparing
# so that a mismatch raises instead of having to be spotted by eye.
np.testing.assert_allclose(
    positional_encoding_layer.get_positional_encoding(),
    np.asarray(tf_positional_encoding_layer)[0],
    rtol=1e-6,
)

tensorflow:
tf.Tensor(
[[[ 0.          1.          0.          1.        ]
  [ 0.84147098  0.54030231  0.00999983  0.99995   ]
  [ 0.90929743 -0.41614684  0.01999867  0.99980001]]], shape=(1, 3, 4), dtype=float64)
numpy:
[[ 0.          1.          0.          1.        ]
 [ 0.84147098  0.54030231  0.00999983  0.99995   ]
 [ 0.90929743 -0.41614684  0.01999867  0.99980001]]


In [7]:
# tf_word_embedding is unbatched, shape (seq_len, d_model), so the sequence length is axis 0.
# Slicing by axis 1 (d_model) was only harmless because the table has exactly seq_len rows here.
tf_positional_encoding = tf_word_embedding + tf_positional_encoding_layer[:, :tf.shape(tf_word_embedding)[0], :]
print('tensorflow:')
print(tf_positional_encoding)

# NOTE: this rebinds the name `positional_encoding`, shadowing the helper function defined
# in the previous cell; the dense-layer cell below reads this array under that name.
positional_encoding = positional_encoding_layer.forward(word_embedding)
print('numpy:')
print(positional_encoding)

# Broadcasting gave the TensorFlow result a leading batch axis of size 1; drop it to compare.
np.testing.assert_allclose(positional_encoding, np.asarray(tf_positional_encoding)[0], rtol=1e-6)

tensorflow:
tf.Tensor(
[[[ 1.81826236e-02  9.65037769e-01 -7.09684361e-04  1.02177156e+00]
  [ 7.98243721e-01  5.73130446e-01 -1.48069907e-02  1.03178122e+00]
  [ 8.63887601e-01 -3.76514959e-01  2.62102576e-04  9.84954975e-01]]], shape=(1, 3, 4), dtype=float64)
numpy:
[[ 1.81826236e-02  9.65037769e-01 -7.09684361e-04  1.02177156e+00]
 [ 7.98243721e-01  5.73130446e-01 -1.48069907e-02  1.03178122e+00]
 [ 8.63887601e-01 -3.76514959e-01  2.62102576e-04  9.84954975e-01]]


In [8]:
# Dense Layer Test
# Keras reference layer; build() creates its kernel and bias without running a forward pass.
tf_dense_layer = keras.layers.Dense(d_model, activation='softmax', dtype=tf.float64)
tf_dense_layer.build((None, d_model))

# get_weights() returns [kernel, bias]; they are printed so they can be compared with the copy below.
tf_dense_layer_weights_bias = tf_dense_layer.get_weights()
print(tf_dense_layer_weights_bias)

# The NumPy layer receives the transposed Keras kernel and the bias unchanged.
dense_layer = Dense((d_model, d_model), Softmax())
dense_layer.set_trainable_variables(tf_dense_layer_weights_bias[0].T, tf_dense_layer_weights_bias[1])

print(dense_layer.get_trainable_variables())

[array([[ 0.62405692, -0.21443712, -0.38097981,  0.18317922],
       [ 0.5555344 , -0.54636909,  0.33529233,  0.7122871 ],
       [ 0.39219143,  0.68758778, -0.61777777, -0.56404858],
       [ 0.11725931,  0.66089474, -0.17675787,  0.86545405]]), array([0., 0., 0., 0.])]
{'dense/weights': array([[ 0.62405692,  0.5555344 ,  0.39219143,  0.11725931],
       [-0.21443712, -0.54636909,  0.68758778,  0.66089474],
       [-0.38097981,  0.33529233, -0.61777777, -0.17675787],
       [ 0.18317922,  0.7122871 , -0.56404858,  0.86545405]]), 'dense/bias': array([0., 0., 0., 0.])}


In [9]:
tf_dense = tf_dense_layer(tf_positional_encoding)
print('tensorflow:')
print(tf_dense)

# The NumPy layer is fed the transposed activations, so its output is transposed as well.
dense = dense_layer.forward(positional_encoding.T)
print('numpy:')
print(dense)

# Compare in the Keras layout: transpose the NumPy result and drop the Keras batch axis.
np.testing.assert_allclose(dense.T, np.asarray(tf_dense)[0], rtol=1e-6)

tensorflow:
tf.Tensor(
[[[0.21453601 0.12711786 0.1262116  0.53213452]
  [0.28901323 0.13729808 0.08559047 0.48809822]
  [0.25377974 0.31814877 0.08659949 0.341472  ]]], shape=(1, 3, 4), dtype=float64)
float64
numpy:
[[0.21453601 0.28901323 0.25377974]
 [0.12711786 0.13729808 0.31814877]
 [0.1262116  0.08559047 0.08659949]
 [0.53213452 0.48809822 0.341472  ]]


In [11]:
# Self-Attention Layer Test
tf_multi_head_layer = keras.layers.MultiHeadAttention(key_dim=d_model, num_heads=1, dtype=tf.float64)
# The first call makes Keras create the projection weights so they can be read back.
tf_multi_head_layer(tf_dense, tf_dense, tf_dense)
tf_self_attention_weights_bias = list(tf_multi_head_layer.get_weights())
print(tf_multi_head_layer.trainable_variables)

# The Keras weights alternate kernel, bias, kernel, bias, ... (even index = kernel,
# odd index = bias). Each kernel is flattened to 2-D and transposed for the NumPy
# layer; each bias is flattened to 1-D.
qkv_weights_bias = [
    tensor.flatten() if index % 2 else tensor.reshape(-1, tensor.shape[-1]).T
    for index, tensor in enumerate(tf_self_attention_weights_bias)
]

multi_head_layer = MultiHeadAttention(1, max_pos, d_model)
multi_head_layer.set_trainable_variables(qkv_weights_bias)
print(multi_head_layer.get_trainable_variables())

[<tf.Variable 'multi_head_attention_1/query/kernel:0' shape=(4, 1, 4) dtype=float64, numpy=
array([[[-0.01531117,  0.27474707,  0.27248481, -0.03255484]],

       [[-0.14374464,  0.20547901,  0.24631336, -0.13923326]],

       [[ 0.22982336,  0.24703751, -0.50483596,  0.12321709]],

       [[ 0.28682414,  0.12411855,  0.04726004,  0.44943438]]])>, <tf.Variable 'multi_head_attention_1/query/bias:0' shape=(1, 4) dtype=float64, numpy=array([[0., 0., 0., 0.]])>, <tf.Variable 'multi_head_attention_1/key/kernel:0' shape=(4, 1, 4) dtype=float64, numpy=
array([[[ 0.26196153, -0.09671605, -0.24637236, -0.5085401 ]],

       [[-0.09059429, -0.4157922 ,  0.1952512 ,  0.00193626]],

       [[ 0.42969267,  0.53759254,  0.38552019,  0.39248583]],

       [[ 0.4302314 ,  0.47688448,  0.19252971, -0.42686889]]])>, <tf.Variable 'multi_head_attention_1/key/bias:0' shape=(1, 4) dtype=float64, numpy=array([[0., 0., 0., 0.]])>, <tf.Variable 'multi_head_attention_1/value/kernel:0' shape=(4, 1, 4) dtype=floa

In [12]:
# Self-attention: query, key and value are all the dense-layer output.
tf_multi_head = tf_multi_head_layer(tf_dense, tf_dense, tf_dense)
print('tensorflow:')
print(tf_multi_head)

multi_head = multi_head_layer.forward(dense)
print('numpy:')
print(multi_head)

# The NumPy result is transposed relative to Keras and has no batch axis; align before comparing.
np.testing.assert_allclose(multi_head.T, np.asarray(tf_multi_head)[0], rtol=1e-6)

tensorflow:
tf.Tensor(
[[[ 0.20596518 -0.2491776  -0.27652762  0.15199152]
  [ 0.20596896 -0.24920166 -0.27652975  0.1520024 ]
  [ 0.20596095 -0.24920758 -0.27652661  0.15200054]]], shape=(1, 3, 4), dtype=float64)
float64
numpy:
[[ 0.20596518  0.20596896  0.20596095]
 [-0.2491776  -0.24920166 -0.24920758]
 [-0.27652762 -0.27652975 -0.27652661]
 [ 0.15199152  0.1520024   0.15200054]]


In [20]:
# Layer Normalization Test
# NOTE(review): epsilon=1e-100 presumably makes the stabiliser negligible so the Keras
# output can match the NumPy layer digit for digit - confirm against LayerNormalization.
tf_layernorm_layer = keras.layers.LayerNormalization(epsilon=1e-100, dtype=np.float64)
# Calling the layer once makes it create gamma/beta so get_weights() has something to return.
tf_layernorm_layer(tf_multi_head)
print(tf_layernorm_layer.get_weights())

# No weights are copied here: both layers are left at their freshly initialised
# parameters, which are printed for comparison.
layernorm_layer = LayerNormalization(d_model)
print(layernorm_layer.get_trainable_variables())

[array([1., 1., 1., 1.]), array([0., 0., 0., 0.])]
{'layer_norm/gamma': array([[1.],
       [1.],
       [1.],
       [1.]]), 'layer_norm/beta': array([0., 0., 0., 0.])}


In [21]:
tf_layernorm = tf_layernorm_layer(tf_multi_head)
print('tensorflow:')
print(tf_layernorm)

layernorm = layernorm_layer.forward(multi_head)
print('numpy:')
print(layernorm)

# The NumPy result is transposed relative to Keras and has no batch axis; align before comparing.
np.testing.assert_allclose(layernorm.T, np.asarray(tf_layernorm)[0], rtol=1e-6)

tensorflow:
tf.Tensor(
[[[ 1.11693435 -0.93373073 -1.05695739  0.87375377]
  [ 1.11691601 -0.9337857  -1.05690824  0.87377793]
  [ 1.1169047  -0.93380693 -1.05688978  0.87379202]]], shape=(1, 3, 4), dtype=float64)
numpy:
[[ 1.11693435  1.11691601  1.1169047 ]
 [-0.93373073 -0.9337857  -0.93380693]
 [-1.05695739 -1.05690824 -1.05688978]
 [ 0.87375377  0.87377793  0.87379202]]


In [None]:
# # Example encoder stacking

# Small two-layer stack used by the next cell to exercise save_model() / load_model().
# NOTE(review): if Dense takes [in_features, out_features] (as the commented-out model
# near the top of the notebook suggests), the first layer's output width (5) does not
# match the second layer's input width (2) - confirm before running forward() on this stack.
encoder = Sequential([
    Dense([10, 5], Linear()),
    Dense([2, 3], Linear()),
])

# model = Sequential([
#     WordEmbedding(20, 5),
#     ]+
#     encoder.stack_layers(2)+
#     [
#     WordEmbedding(20, 5),
# ])

# # clear memory used by encoder
# del encoder

In [None]:
# Save/load round trip: print the variables, save, rebuild an identical stack, load, print again.
# The two printouts are meant to be compared by the reader; nothing is asserted here.
for key, value in encoder.get_trainable_variables().items():
    print(key)
    print(value)

encoder.save_model()

# Drop the original object so the values printed below can only come from the rebuilt model.
del encoder

print('')

# Must mirror the layer list of the encoder built in the previous cell.
encoder = Sequential([
    Dense([10, 5], Linear()),
    Dense([2, 3], Linear()),
])

encoder.load_model()

for key, value in encoder.get_trainable_variables().items():
    print(key)
    print(value)

In [None]:
# print(model.trainable_variables)

# model.save_model()

# del model

# model = Sequential([
#     Dense([10, 10], Linear())
# ])

# model.load_model()

# print(model.trainable_variables)

In [None]:
# fit tokenizer on dataset
# NOTE(review): in the visible notebook `tokenizer` and `mlm_dataset_generator` are only
# created in commented-out code in the first two cells, so this line raises NameError on a
# fresh kernel unless those lines are re-enabled.
tokenizer.fit_on_texts(mlm_dataset_generator.getVocubulary())

In [None]:
# generate MLM dataset
# NOTE(review): depends on `mlm_dataset_generator`, which is only created in commented-out
# code in the first cell of the visible notebook.

batch_size = 20
sample_limit = 1000

mlm_dataset = mlm_dataset_generator.generateMLMDataset(batch_size, sample_limit=sample_limit)

# to free memory
# Rebinding to None drops this notebook's reference to the generator; re-running the
# tokenizer cell after this one will therefore fail on None.
mlm_dataset_generator = None

In [None]:
# Training-loop sketch: builds the per-batch inputs (token ids, attention mask, MLM mask).
# NOTE(review): `tokenizer`, `mlm_dataset` and `model` must exist before this cell runs; in
# the visible notebook `tokenizer` and `model` are only defined in commented-out code.
num_epochs = 100
for epoch in range(num_epochs):
    for batch in mlm_dataset:  # Provide training data
        # NOTE(review): `labels` is unpacked but never used in this cell.
        tokens, labels = batch

        # tokenization
        tokenized_batch = tokenizer.tokenize(tokens)

        # batch padding
        # -1 is passed as the padding value.
        padded_tokenized_batch = pad_sequences(tokenized_batch, pad_token=-1)

        # attention mask
        attention_mask = generate_attention_mask(padded_tokenized_batch, tokenizer.get_mask_token_id())

        # MLM training mask
        # NOTE(review): `mlm_mask` is computed but not consumed anywhere in this cell.
        mlm_mask = generate_mlm_mask(attention_mask)

        # change padding tokens to 0
        # NOTE(review): the replacement is applied to `attention_mask`, not to
        # `padded_tokenized_batch`, which still holds the -1 padding - confirm which is intended.
        attention_mask[attention_mask == -1] = 0

# NOTE(review): this call sits outside both loops and passes none of the tensors built
# above, so it runs once after all epochs - looks unfinished; confirm the intended call.
predictions = model.fit_predict()