<a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/bert/tf_bert_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT from scratch

## Intro

In [None]:
#@title ## Import packages
import os
import re
import shutil
from dataclasses import dataclass
from typing import Optional

import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import auth, data_table
from numpy.random import rand, randint
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    Input,
    LayerNormalization,
    MultiHeadAttention,
)
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.preprocessing import text_dataset_from_directory

# Only let TensorFlow log records of level ERROR and above through.
tf.get_logger().setLevel("ERROR")

# Record the TensorFlow version this notebook was executed with.
print("tensorflow", tf.__version__)

tensorflow 2.4.0


## 1. Configuration

In [None]:
@dataclass
class Config:
    """Hyperparameters, paths and token settings shared by the notebook cells.

    Every attribute is annotated so that ``@dataclass`` registers it as a
    field: values can be overridden per instance (``Config(MAX_LEN=128)``)
    and show up in ``repr``. ``Config()`` with no arguments yields the same
    values as before.
    """

    MAX_LEN: int = 256  # output sequence length of the vectorizer
    BATCH_SIZE: int = 32  # batch size passed to text_dataset_from_directory
    LR: float = 0.001  # presumably the optimizer learning rate (not used in the cells shown here)
    VOCAB_SIZE: int = 30000  # max_tokens of the vectorizer
    EMBED_DIM: int = 128  # width restored by the encoder feed-forward output layer
    NUM_HEAD: int = 8  # attention heads; key_dim is EMBED_DIM // NUM_HEAD
    FF_DIM: int = 128  # hidden width of the encoder feed-forward block
    NUM_LAYERS: int = 1  # presumably the number of encoder blocks (not used in the cells shown here)
    DS_URL: str = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    TRAIN_DIR: Optional[str] = None  # filled in once the dataset is downloaded
    TEST_DIR: Optional[str] = None  # filled in once the dataset is downloaded
    VALIDATION_SPLIT: float = 0.1  # fraction of the train folder held out for validation
    SEED: int = 42  # seed for the train/validation split
    MASK_TOKEN: str = "[mask]"
    MASK_TOKEN_ID: Optional[int] = None  # filled in once the vectorizer is built


config = Config()

## 2. Download Dataset - Large Movie Review

We use [Stanford’s Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/) as the dataset for sentiment analysis. This dataset is divided into two datasets for training and testing purposes, each containing 25,000 movie reviews downloaded from IMDb. In each dataset, the number of comments labeled as “positive” and “negative” is equal.


In [None]:
# Download the archive into the working directory and unpack it.
dataset = tf.keras.utils.get_file(
    "aclImdb.tar.gz", config.DS_URL, untar=True, cache_dir=".", cache_subdir=""
)
dataset_dir = os.path.join(os.path.dirname(dataset), "aclImdb")
config.TRAIN_DIR = os.path.join(dataset_dir, "train")
config.TEST_DIR = os.path.join(dataset_dir, "test")
print("Train dataset:", config.TRAIN_DIR)
print("Test dataset:", config.TEST_DIR)

# remove unused folders from train folder
remove_dir = os.path.join(config.TRAIN_DIR, "unsup")
# Guarded so the cell can be re-run: on a second run the folder may already
# have been removed, and an unconditional rmtree would raise.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

Train dataset: ./aclImdb/train
Train dataset: ./aclImdb/test


The IMDB dataset has already been divided into train and test, but it lacks a validation set. Let's create a validation split using a 90:10 split.

> When `validation_split` is used, `text_dataset_from_directory` requires either a random seed or `shuffle=False`, so that the validation and training splits do not overlap. **We will pass a random seed**.

In [None]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

def optimize_dataset(dataset):
    """Return `dataset` wrapped with caching and an autotuned prefetch buffer."""
    cached = dataset.cache()
    return cached.prefetch(buffer_size=AUTOTUNE)

# Arguments shared by both halves of the train/validation split: the same
# seed and split fraction are passed to the "training" and "validation" calls.
_split_args = dict(
    batch_size=config.BATCH_SIZE,
    validation_split=config.VALIDATION_SPLIT,
    seed=config.SEED,
)

train_ds = text_dataset_from_directory(
    config.TRAIN_DIR, subset="training", **_split_args
)
# class_names is read from the dataset returned by the loader, before it is
# wrapped by cache/prefetch.
class_names = train_ds.class_names
train_ds = optimize_dataset(train_ds)

val_ds = optimize_dataset(
    text_dataset_from_directory(
        config.TRAIN_DIR, subset="validation", **_split_args
    )
)

test_ds = optimize_dataset(
    text_dataset_from_directory(config.TEST_DIR, batch_size=config.BATCH_SIZE)
)

Found 25000 files belonging to 2 classes.
Using 22500 files for training.
Found 25000 files belonging to 2 classes.
Using 2500 files for validation.
Found 25000 files belonging to 2 classes.


In [None]:
print("Loaded classes:", class_names)

Loaded classes: ['neg', 'pos']


In [None]:
# Pull a single batch out of the training set as NumPy arrays and preview it.
text_batch, label_batch = next(train_ds.as_numpy_iterator())
df = pd.DataFrame({"text": text_batch, "label": label_batch})
# Colab's DataTable shows the frame three rows per page, without the index.
data_table.DataTable(df, include_index=False, num_rows_per_page=3)

Unnamed: 0,text,label
0,"b'""Pandemonium"" is a horror movie spoof that c...",0
1,"b""David Mamet is a very interesting and a very...",0
2,b'Great documentary about the lives of NY fire...,1
3,"b""It's boggles the mind how this movie was nom...",0
4,b'The concept of the legal gray area in Love C...,0
5,"b""This flick reminds me some really bad scienc...",0
6,b'Finally a thriller which omits the car chase...,1
7,"b""I'm a Christian who generally believes in th...",0
8,b'This effort is based on the true story of Ji...,1
9,"b""Live Feed is set in some unnamed Chinese/Jap...",0


## 3. Preprocessing dataset

Standardization lowercases our samples and removes special characters.

In [None]:
def custom_standardization(input_data):
    """Lowercase text, replace `<br />` tags with a space and drop punctuation.

    The removed character set does not contain square brackets, so a token
    such as `[mask]` passes through unchanged.

    Args:
        input_data: string tensor to standardize.

    Returns:
        The standardized string tensor.
    """
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    # The backslash is written as "\\" so the literal has no invalid escape
    # sequence; the resulting pattern is the same as before.
    return tf.strings.regex_replace(
        stripped_html, "[%s]" % re.escape("!#$%&'()*+,-./:;<=>?@\\^_`{|}~"), ""
    )

In [None]:
custom_standardization(tf.constant("Simple<br /> text~!")).numpy()

b'simple  text'

In [None]:
custom_standardization(tf.constant(config.MASK_TOKEN)).numpy()

b'[mask]'

Build the preprocessing layer that will be used to tokenize raw text. The original BERT uses subword tokenization, but we will simplify that approach.

In [None]:
def build_vectorize_layer(
    texts,
    vocab_size=config.VOCAB_SIZE,
    max_seq=config.MAX_LEN,
    special_tokens=[config.MASK_TOKEN],
):
    """Fit a TextVectorization layer on `texts` and append the special tokens.

    Args:
        texts: dataset yielding `(text, label)` pairs; only the text is used.
        vocab_size: maximum vocabulary size, special tokens included.
        max_seq: length every encoded sequence is padded/truncated to.
        special_tokens: tokens placed at the end of the vocabulary.

    Returns:
        The adapted layer with the rebuilt vocabulary.
    """
    layer = TextVectorization(
        standardize=custom_standardization,
        max_tokens=vocab_size,
        output_mode="int",
        output_sequence_length=max_seq,
    )
    # adapt() only needs the raw strings, so the label of each pair is dropped
    layer.adapt(texts.map(lambda sample, _label: sample))

    # insert mask token in vocabulary
    # The first two learned entries are skipped (presumably the reserved
    # padding/OOV entries - confirm for the TF version in use) and the tail is
    # trimmed to leave room for the special tokens.
    learned_vocab = layer.get_vocabulary()
    keep_until = vocab_size - len(special_tokens)
    layer.set_vocabulary(learned_vocab[2:keep_until] + special_tokens)
    return layer


In [None]:
vectorize_layer = build_vectorize_layer(train_ds)

Let's test our tokenization layer.

In [None]:
list(
    vectorize_layer([tf.constant(f"dogs {config.MASK_TOKEN} awesome")]).numpy()[
        0
    ][:10]
)

[2371, 29999, 1163, 0, 0, 0, 0, 0, 0, 0]

Get the vectorized token for our mask.

In [None]:
# Look up the integer id the vectorizer assigns to the mask token and store it
# on the shared config, where the masking code reads it.
config.MASK_TOKEN_ID = vectorize_layer([config.MASK_TOKEN]).numpy()[0][0]
print("Mask token id:", config.MASK_TOKEN_ID)

Mask token id: 29999


In [None]:
def encode(texts):
    """Vectorize `texts` with `vectorize_layer` and return the ids as a NumPy array."""
    return vectorize_layer(texts).numpy()

In [None]:
list(encode([tf.constant(f"dogs {config.MASK_TOKEN} awesome")])[0][:10])

[2371, 29999, 1163, 0, 0, 0, 0, 0, 0, 0]

Mask out 15% of the input tokens, applying the following distribution:
- 80% of the time, the token is replaced by the [mask] token
- 10% of the time, the token is replaced by a random token
- 10% of the time, the token is kept as it is.

In [None]:
def mask_encoded_batch(encoded_text_batch):
    """Apply BERT-style random masking to a batch of token ids.

    About 15% of the positions are selected (ids <= 2 are never selected).
    Of the selected positions, 80% are replaced by the mask token id, 10% by
    a random id drawn from ``[3, config.MASK_TOKEN_ID)``, and 10% are left
    unchanged. A single uniform draw per position decides between the three
    outcomes, so they are mutually exclusive and match the stated ratios.

    Args:
        encoded_text_batch: tensor of token ids exposing ``.numpy()``.

    Returns:
        An int64 tensor with the same shape, holding the masked ids.
    """
    encoded = encoded_text_batch.numpy()
    # mask out 15% of the input tokens
    selected_tokens = rand(*encoded.shape) < 0.15
    # do not mask special tokens
    selected_tokens[encoded <= 2] = False

    # One draw per position, split into [0, 0.8) -> [mask],
    # [0.8, 0.9) -> random token, [0.9, 1.0) -> keep the original token.
    outcome = rand(*encoded.shape)
    masked_tokens = selected_tokens & (outcome < 0.80)
    random_tokens = selected_tokens & (outcome >= 0.80) & (outcome < 0.90)

    encoded[masked_tokens] = config.MASK_TOKEN_ID
    encoded[random_tokens] = randint(3, config.MASK_TOKEN_ID, random_tokens.sum())

    return tf.convert_to_tensor(encoded, dtype=tf.int64)

In [None]:
mask_test = np.squeeze(
    mask_encoded_batch(tf.constant([[1, 10, 20, 30, 40, 50, 60, 70, 0]])).numpy()
)
# ids 1 (first position) and 0 (last position) are <= 2 and must come back untouched
assert mask_test[0] == 1 and mask_test[-1] == 0, "Do not mask reserved tokens."
print(mask_test)

[    1    10 29999 29999    40 29999    60    70     0]


In [None]:
def mask_helper(texts):
    """Encode `texts` and return `(masked ids, original ids)` for MLM training."""
    label_ids = vectorize_layer(texts)
    return mask_encoded_batch(label_ids), tf.identity(label_ids)

In [None]:
texts, labels = mask_helper([tf.constant("This is a simple test!")])
texts, labels = list(np.squeeze(texts.numpy())), list(np.squeeze(labels.numpy())) 
print("Encoded mask ", texts[:6])
print("Label        ", labels[:6])

Encoded mask  [10, 7, 4, 585, 29999, 0]
Label         [10, 7, 4, 585, 2207, 0]


In [None]:
# mask_helper ends up calling `.numpy()` on tensors, so it is wrapped in
# tf.py_function. The sentiment label `l` is dropped: each element of the
# mapped dataset is the pair of int64 tensors returned by mask_helper.
train_ds = train_ds.map(
    lambda t, l: tf.py_function(mask_helper, inp=[t], Tout=[tf.int64, tf.int64])
)
# Number of batches in the mapped dataset.
train_ds.cardinality()

<tf.Tensor: shape=(), dtype=int64, numpy=704>

In [None]:
# Fetch one mapped batch and print the shapes of the masked ids and the labels.
texts_batch, labels_batch = next(iter(train_ds))
print("Text batch shape  ", texts_batch.shape)
print("Labels batch shape", labels_batch.shape)

Text batch shape   (32, 256)
Labels batch shape (32, 256)


## 4. Define BERT model

In [None]:
def bert_module(query, key, value, i):
    """Build one encoder block: attention and feed-forward, each with residual + norm.

    Args:
        query: tensor used as the attention query and as the first residual input.
        key: tensor used as the attention key.
        value: tensor used as the attention value.
        i: block index, used only to build the layer names.

    Returns:
        The output of the second layer normalization.
    """
    # multi headed self-attention
    # key/value are passed by keyword: the layer's positional call order is
    # (query, value, key), so passing (query, key, value) positionally would
    # swap the two whenever they differ.
    attention_output = MultiHeadAttention(
        num_heads=config.NUM_HEAD,
        key_dim=config.EMBED_DIM // config.NUM_HEAD,
        name=f"enconder_{i}/multiheadattention",
    )(query, value=value, key=key)
    attention_output = Dropout(0.1, name=f"enconder_{i}/att_dropout")(
        attention_output
    )
    attention_output = LayerNormalization(
        epsilon=1e-6, name=f"enconder_{i}/att_layernormalization"
    )(query + attention_output)

    # feed-forward layer
    ffn = Sequential(
        [Dense(config.FF_DIM, activation="relu"), Dense(config.EMBED_DIM)],
        name=f"enconder_{i}/ffn",
    )
    ffn_output = ffn(attention_output)
    ffn_output = Dropout(0.1, name=f"enconder_{i}/ffn_dropout")(ffn_output)
    # This layer needs its own name; reusing the dropout layer's name would
    # give two layers the same name.
    sequence_output = LayerNormalization(
        epsilon=1e-6, name=f"enconder_{i}/ffn_layernormalization"
    )(attention_output + ffn_output)
    return sequence_output

In [None]:
def get_pos_encoding_matrix(max_len, d_emb):
    """Build the fixed sinusoidal position-encoding matrix.

    For position ``pos >= 1`` and column ``j`` the angle is
    ``pos / 10000 ** (2 * (j // 2) / d_emb)``; even columns hold its sine and
    odd columns its cosine. Row 0 is left as all zeros.

    Args:
        max_len: number of positions (rows).
        d_emb: embedding width (columns).

    Returns:
        A float NumPy array of shape ``(max_len, d_emb)``.
    """
    positions = np.arange(max_len, dtype=np.float64)[:, np.newaxis]
    columns = np.arange(d_emb)
    # Each sin/cos column pair (2i, 2i + 1) shares one wavelength.
    wavelengths = np.power(10000.0, 2 * (columns // 2) / d_emb)
    pos_enc = positions / wavelengths  # row 0 is all zeros because pos == 0

    # Rows from 1 on are transformed; row 0 stays zero.
    pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2])  # dim 2i
    pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2])  # dim 2i + 1
    return pos_enc