<a href="https://colab.research.google.com/github/michaelmml/Word-Generation/blob/main/EncoderBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import math
import re
from random import *
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

In [2]:
text = (
        'Hello, how are you? I am Romeo.\n'
        'Hello, Romeo My name is Juliet. Nice to meet you.\n'
        'Nice meet you too. How are you today?\n'
        'Great. My baseball team won the competition.\n'
        'Oh Congratulations, Juliet\n'
        'Thanks you Romeo'
    )

In [3]:
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n')  # filter '.', ',', '?', '!'
word_list = list(set(" ".join(sentences).split()))
word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}


for i, w in enumerate(word_list):
    word_dict[w] = i + 4
number_dict = {i: w for i, w in enumerate(word_dict)}
vocab_size = len(word_dict)

token_list = list()
for sentence in sentences:
    arr = [word_dict[s] for s in sentence.split()]
    token_list.append(arr)

In [41]:
maxlen = 30 # maximum of length
batch_size = 6
max_pred = 5  # max tokens of prediction
n_layers = 6 # number of Encoder of Encoder Layer
n_heads = 12 # number of heads in Multi-Head Attention
d_model = 768 # Embedding Size
d_ff = 768 * 4  # 4*d_model, FeedForward dimension
d_k = d_v = 64  # dimension of K(=Q), V
n_segments = 2
dropout_rate = 0.1

In [5]:
def make_batch():
    batch = []
    positive = negative = 0
    while positive != batch_size/2 or negative != batch_size/2:
        tokens_a_index, tokens_b_index= randrange(len(sentences)), randrange(len(sentences))
        tokens_a, tokens_b= token_list[tokens_a_index], token_list[tokens_b_index]

        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]

        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        #MASK LM
        n_pred =  min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # 15 % of tokens in one sentence

        cand_maked_pos = [i for i, token in enumerate(input_ids)
                          if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]
        shuffle(cand_maked_pos)
        masked_tokens, masked_pos = [], []
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            if random() < 0.8:  # 80%
                input_ids[pos] = word_dict['[MASK]'] # make mask
            elif random() < 0.5:  # 10%
                index = randint(0, vocab_size - 1) # random index in vocabulary
                input_ids[pos] = word_dict[number_dict[index]] # replace

        # Zero Paddings
        n_pad = maxlen - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

    #     # Zero Padding (100% - 15%) tokens
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext
            negative += 1

    return batch

In [6]:
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    # eq(zero) is PAD token
    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

In [13]:
batch = make_batch()

In [16]:
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(tf.constant, zip(*batch))

In [31]:
input_ids

<tf.Tensor: shape=(6, 30), dtype=int32, numpy=
array([[ 1, 11, 22, 23,  2,  3, 21, 26,  2,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 12, 21, 26,  2, 18, 23, 16,  3, 15, 26,  6, 10,  7, 22,  2,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 12, 21, 26,  2,  6,  7, 22,  3,  3, 28, 22, 20,  2,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1,  3, 14, 28, 22, 27, 25,  3,  2, 18, 23, 16, 13, 15, 26,  6,
        10,  7,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 18,  3, 16, 13, 15, 26,  6, 10, 11, 22,  2,  6,  7, 22,  3,
        14, 28, 22, 20,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 1, 12,  3, 26,  2, 11, 22, 23,  2,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=int32)>

In [63]:
class Embedding(tf.keras.layers.Layer):
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = tf.keras.layers.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = tf.keras.layers.Embedding(maxlen, d_model)  # position embedding
        self.seg_embed = tf.keras.layers.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = layers.LayerNormalization(d_model)

    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = tf.experimental.numpy.arange(seq_len)
        pos = pos.unsqueeze(0).expand_as(x)  # (seq_len,) -> (batch_size, seq_len)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

In [64]:
def scaled_dot_product_attention(q,k,v,mask):
  matmul_qk = tf.matmul(q, k, transpose_b=True)
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention = matmul_qk / tf.math.sqrt(dk)
  # add the mask
  if mask is not None:
    scaled_attention += (mask * -1e9)
  
  attention_weights = tf.nn.softmax(scaled_attention, axis=-1)
  output = tf.matmul(attention_weights, v)

  return output, attention_weights

In [65]:
class MultiHeadAttention(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads):
    super(MultiHeadAttention, self).__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads
    
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0,2,1,3])

  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]

    q = self.wq(q)  # (batch_size, seq_len, d_model)
    k = self.wk(k)  # (batch_size, seq_len, d_model)
    v = self.wv(v)  # (batch_size, seq_len, d_model)

    q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
    k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
    v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

    scaled_attention, attention_weights = scaled_dot_product_attention(
        q, k, v, mask)

    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

    concat_attention = tf.reshape(scaled_attention,
                                  (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)

    return output, attention_weights

In [66]:
def point_wise_feed_forward_network(d_model, dff):
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model)
  ])

In [67]:
class EncoderLayer(tf.keras.layers.Layer):
    
    def __init__(self, d_model, FFN_units, n_heads, dropout_rate):
        super(EncoderLayer, self).__init__()
        self.FFN_units = FFN_units
        self.n_heads = n_heads
        self.d_model = d_model
        self.dropout_rate = dropout_rate
    
    def build(self, input_shape):
        # Build the multihead layer
        self.multi_head_attention = MultiHeadAttention(self.d_model, self.n_heads)
        self.ffn = point_wise_feed_forward_network(self.d_model, self.FFN_units)

        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        
    def call(self, inputs, mask, training):
        # Forward pass of the multi-head attention
        attn_output, _ = self.multi_head_attention(inputs,
                                              inputs,
                                              inputs,
                                              mask)
        attn_output = self.dropout_1(attn_output, training=training)
        attn_output = self.norm_1(attn_output + inputs)
        outputs = self.ffn(attn_output)
        outputs = self.dropout_2(outputs, training=training)
        outputs = self.norm_2(outputs + attn_output)
        
        return outputs

In [68]:
def torch_gather(x, indices, gather_axis):
    # if pytorch gather indices are
    # [[[0, 10, 20], [0, 10, 20], [0, 10, 20]],
    #  [[0, 10, 20], [0, 10, 20], [0, 10, 20]]]
    # tf nd_gather needs to be
    # [[0,0,0], [0,0,10], [0,0,20], [0,1,0], [0,1,10], [0,1,20], [0,2,0], [0,2,10], [0,2,20],
    #  [1,0,0], [1,0,10], [1,0,20], [1,1,0], [1,1,10], [1,1,20], [1,2,0], [1,2,10], [1,2,20]]

    # create a tensor containing indices of each element
    all_indices = tf.where(tf.fill(indices.shape, True))
    gather_locations = tf.reshape(indices, [indices.shape.num_elements()])

    # splice in our pytorch style index at the correct axis
    gather_indices = []
    for axis in range(len(indices.shape)):
        if axis == gather_axis:
            gather_indices.append(gather_locations)
        else:
            gather_indices.append(all_indices[:, axis])

    gather_indices = tf.stack(gather_indices, axis=-1)
    gathered = tf.gather_nd(x, gather_indices)
    reshaped = tf.reshape(gathered, indices.shape)
    return reshaped

In [72]:
class BERT(tf.keras.layers.Layer):

    def __init__(self,
                 n_layers,
                 FFN_units,
                 n_heads,
                 dropout_rate,
                 vocab_size,
                 d_model,
                 name="BERT"):
      
        super(BERT, self).__init__(name=name)
        self.n_layers = n_layers
        self.d_model = d_model
        self.embedding = Embedding()
        self.layers = [EncoderLayer(d_model, FFN_units,
                                        n_heads,
                                        dropout_rate) 
                           for _ in range(n_layers)]
        
        self.fc = tf.keras.layers.Dense(d_model, input_shape=(d_model,), activation=None)
        self.linear = tf.keras.layers.Dense(d_model, input_shape=(d_model,), activation=None)
        self.norm = layers.LayerNormalization(d_model)
        self.classifier = tf.keras.layers.Dense(2, input_shape=(d_model,), activation=None)
        
        # decoder is shared with embedding layer
        embed_weight = self.embedding.tok_embed
        n_vocab, n_dim = embed_weight.size()
        self.decoder = tf.keras.layers.Dense(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight
        # self.decoder_bias = nn.Parameter(torch.zeros(n_vocab))

    def forward(self, input_ids, segment_ids, masked_pos, training):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)

        for layer in self.layers:
            output, enc_self_attn = layer(output, enc_self_attn_mask, training)
        # output : [batch_size, len, d_model], attn : [batch_size, n_heads, d_mode, d_model]
  
        h_pooled = tf.math.tanh(self.fc(output[:, 0])) # [batch_size, d_model]
        logits_clsf = self.classifier(h_pooled) # [batch_size, 2]

        masked_pos = masked_pos[:, :, None].expand(-1, -1, output.size(-1)) # [batch_size, max_pred, d_model]
        # get masked position from final output of transformer.
        
        h_masked = torch_gather(output, 1, masked_pos) # masking position [batch_size, max_pred, d_model]
        h_masked = self.norm(tf.keras.activations.gelu(self.linear(h_masked)))
        logits_lm = self.decoder(h_masked)

        return logits_lm, logits_clsf

In [73]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [74]:
model = BERT(    
    n_layers = n_layers,
    d_model = d_model,
    n_heads = n_heads,
    FFN_units = d_ff,
    vocab_size = vocab_size, 
    dropout_rate = dropout_rate)

learning_rate = CustomSchedule(d_model)
criterion = tf.keras.losses.SparseCategoricalCrossentropy(
                                    from_logits=True, reduction='none')
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

batch = make_batch()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(tf.constant, zip(*batch))

for epoch in range(10):
    optimizer.zero_grad()
    logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
    loss_lm = criterion(logits_lm.transpose(1, 2), masked_tokens) # for masked LM
    loss_lm = (loss_lm.float()).mean()
    loss_clsf = criterion(logits_clsf, isNext) # for sentence classification
    loss = loss_lm + loss_clsf
    if (epoch + 1) % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
    loss.backward()
    optimizer.step()

AttributeError: ignored