In [1]:
#### This notebook was created to test Bert Masked Encoder to be used for Masked language model and the Seq4Rec using MultiHeadAttention Layer
# 80% Replace with a [MASK] token: For 80% of the selected inputs the token is replaced with a [MASK] token similar to the classic cloze tests mentioned earlier.
# 10% Replace with an incorrect word: For 10% of the selected inputs, the token is replaced by another randomly selected word whose only requirement is that it’s different from the selected token.
# 10% Replace with the correct word: The remaining 10% of the time the selected token is simply replaced with the correct token.



In [2]:
### PARAMETER_LIST 
OUTPUT_LEN =256
SEQUENCE_LEN =  256
MAX_LEN = SEQUENCE_LEN
EMBED_DIM = 128
VOCAB_SIZE = 6000
TOKEN_NUM = 30000
N_NEURONS = 128
NUM_HEADS = 8
KEY_DIM = 128
NUM_ATT_LAYER = 1
SPECIAL_TOKENS=["[MASK]"]

In [3]:
import numpy as np
import pandas as pd 
import os 
import sys 
import re, string

import matplotlib.pyplot as plt 
import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras import layers 
from tensorflow.keras.layers import Dense, TextVectorization, Embedding, MultiHeadAttention, Dropout,LayerNormalization

In [4]:

data_url = 'https://raw.githubusercontent.com/malinphy/datasets/main/IMDB_sent/IMDB%20Dataset.csv'
df = pd.read_csv(data_url)

In [5]:
len(df)

9998

In [6]:
def custom_standardization(input_data):
    lowercase = str(input_data).lower()
    s = re.sub("<br />", " ",lowercase)
    s = re.sub('\x96|\x85|\xe3',' ',s) 
    out = re.sub('[%s]' % re.escape(string.punctuation), '', s)
    return out

df['review'] = df['review'].apply(custom_standardization)

In [7]:
df.head(3)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming te...,positive
2,i thought this was a wonderful way to spend ti...,positive


In [8]:
vectorize_layer = TextVectorization(
        max_tokens=VOCAB_SIZE,
        output_mode="int",
        # standardize=custom_standardization,
        output_sequence_length=SEQUENCE_LEN,
    )
vectorize_layer.adapt((df['review']))

In [9]:
vocab = vectorize_layer.get_vocabulary()
vocab = vocab[0 : VOCAB_SIZE - len(SPECIAL_TOKENS)] + ["[mask]"]
vectorize_layer.set_vocabulary(vocab)
mask_token_id = vectorize_layer(["[mask]"]).numpy()[0][0]

In [10]:
encoded_texts = vectorize_layer(df['review'])

In [11]:
def get_masked_input_and_labels(encoded_texts):
    # 15% BERT masking
    inp_mask = np.random.rand(*encoded_texts.shape) < 0.15
    # Do not mask special tokens
    inp_mask[encoded_texts <= 2] = False
    # Set targets to -1 by default, it means ignore
    labels = -1 * np.ones(encoded_texts.shape, dtype=int)
    # Set labels for masked tokens
    labels[inp_mask] = encoded_texts[inp_mask]

    # Prepare input
    encoded_texts_masked = np.copy(encoded_texts)
    # Set input to [MASK] which is the last token for the 90% of tokens
    # This means leaving 10% unchanged
    inp_mask_2mask = inp_mask & (np.random.rand(*encoded_texts.shape) < 0.90)
    encoded_texts_masked[
        inp_mask_2mask
    ] = mask_token_id  # mask token is the last in the dict

    # Set 10% to a random token
    inp_mask_2random = inp_mask_2mask & (np.random.rand(*encoded_texts.shape) < 1 / 9)
    encoded_texts_masked[inp_mask_2random] = np.random.randint(
        3, mask_token_id, inp_mask_2random.sum()
    )

    # Prepare sample_weights to pass to .fit() method
    sample_weights = np.ones(labels.shape)
    sample_weights[labels == -1] = 0

    # y_labels would be same as encoded_texts i.e input tokens
    y_labels = np.copy(encoded_texts)

    return encoded_texts_masked, y_labels, sample_weights

In [12]:
bir, iki, uc =get_masked_input_and_labels(encoded_texts)

In [13]:
def get_pos_encoding_matrix(max_len, d_emb):
    pos_enc = np.array(
        [
            [pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)]
            if pos != 0
            else np.zeros(d_emb)
            for pos in range(max_len)
        ]
    )
    pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2])  # dim 2i
    pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2])  # dim 2i+1
    return pos_enc

In [14]:
first_input = tf.keras.Input(shape = (SEQUENCE_LEN,))
embedding_layer = Embedding(VOCAB_SIZE,EMBED_DIM)(first_input)
position_embeddings = layers.Embedding(
        input_dim=SEQUENCE_LEN,
        output_dim=EMBED_DIM,
        weights=[get_pos_encoding_matrix(SEQUENCE_LEN, EMBED_DIM)],
        name="position_embedding",
    )(tf.range(start=0, limit=SEQUENCE_LEN, delta=1))
# total_embs = word_embeddings + get_pos_encoding_matrix(SEQUENCE_LEN,EMBED_DIM)
# total_embs = word_embeddings + position_embeddings
x = embedding_layer + position_embeddings

## number of the encoder layer determined with 

for i in range(NUM_ATT_LAYER):
  ### number of attention heads determined according the NUM_HEADS
  mha = MultiHeadAttention(num_heads = NUM_HEADS, key_dim = 16)(x,x,x)
  norm_1 = LayerNormalization(epsilon=1e-6)(mha+x)

  seq_model = tf.keras.Sequential([
                                   Dense(N_NEURONS, activation = 'relu'),
                                   Dense(EMBED_DIM)
  ])

  seq_layer = seq_model(norm_1)
  norm_2 =LayerNormalization(epsilon=1e-6)(seq_layer+norm_1)
  x = norm_2
  print(i)

top_layer = Dense(VOCAB_SIZE+1, activation='relu', name = 'selection_layer')(x)
attention_model = tf.keras.Model(inputs = first_input, outputs = top_layer)
tf.keras.utils.plot_model(
    attention_model,
    to_file='model.png', show_shapes=False, show_dtype=False,
    show_layer_names=True, rankdir='TB', expand_nested=False, dpi=96,
    layer_range=None, show_layer_activations=False
)

attention_model.compile(
    loss = 'SparseCategoricalCrossentropy',
    optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.001,
    #  beta_1=0.9, beta_2=0.999, epsilon=1e-07
     ),
    metrics = ['accuracy']

)

attention_model.fit(bir,iki, epochs = 2, batch_size = 10)
id2token = dict(enumerate(vectorize_layer.get_vocabulary()))
token2id = {y: x for x, y in id2token.items()}
sample_tokens = vectorize_layer(["I have watched this [mask] and it was awesome"])


0
Epoch 1/2
Epoch 2/2


In [15]:
# from google.colab import drive
# drive.mount('/content/drive')
# attention_model.save("drive/MyDrive/Colab Notebooks/bert_mlm.h5")

In [16]:
def decode(tokens):
        return " ".join([id2token[t] for t in tokens if t != 0])

def convert_ids_to_tokens(id):
        return id2token[id]

prediction = attention_model.predict(sample_tokens)

masked_index = np.where(sample_tokens == mask_token_id)
masked_index = masked_index[1]
mask_prediction = prediction[0][masked_index]
k=5
top_indices = mask_prediction[0].argsort()[-k :][::-1]
values = mask_prediction[0][top_indices]

for i in range(len(top_indices)):
  p = top_indices[i]
  v = values[i]
  tokens = np.copy(sample_tokens[0])
  tokens[masked_index[0]] = p
  result = {
                "input_text": decode(sample_tokens[0].numpy()),
                # "prediction": decode(tokens),
                "probability": v,
                "predicted mask token": convert_ids_to_tokens(p),
            }
  pprint(result)

KeyError: ignored

In [17]:
len(vocab)

6000