In [77]:
from transformers import AutoTokenizer

In [78]:
# !pip install bertviz

In [79]:
# bert model visualization library
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show

model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

text = "There is a dog in the garden."

model = BertModel.from_pretrained(model_checkpoint)

show(model, "bert", tokenizer, text, display_mode = "light", layer = 0, head = 0)

Output hidden; open in https://colab.research.google.com to view.

# Tokenization

In [80]:
tokenized_text = tokenizer(text, return_tensors = "pt")
tokenized_text

{'input_ids': tensor([[ 101, 2045, 2003, 1037, 3899, 1999, 1996, 3871, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [81]:
tokenized_text.input_ids

tensor([[ 101, 2045, 2003, 1037, 3899, 1999, 1996, 3871, 1012,  102]])

In [82]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_checkpoint)
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

# Embedding

In [83]:
import torch

token_embedding = torch.nn.Embedding(config.vocab_size, config.hidden_size)
token_embedding

Embedding(30522, 768)

In [84]:
inputs_embeds = token_embedding(tokenized_text.input_ids)
inputs_embeds.shape

torch.Size([1, 10, 768])

In [85]:
inputs_embeds[0][0].shape
# this is the embedding for the first word of the first sentence in out input

torch.Size([768])

# Attention

In [86]:
import torch
from math import sqrt
import numpy as np

In [87]:
query = key = value = inputs_embeds # set the Q, K and V to the same as inouts_embeds later they can update through training

In [88]:
key.size()

torch.Size([1, 10, 768])

In [89]:
dim_k = key.size(-1) # get 768 as dimension
print("The dimension for the vector is: ", dim_k)

attn_scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k) # batch multiplcation
print("The attention score for the input: ", attn_scores) # this the attention score for our token of 10 input
print("Shape of attention score: ", attn_scores.shape)

The dimension for the vector is:  768
The attention score for the input:  tensor([[[ 2.7089e+01, -4.9855e-01, -7.0206e-01,  1.2510e+00,  9.5965e-01,
          -1.0770e+00, -7.0859e-01, -5.0018e-01, -3.4626e-01, -6.0123e-03],
         [-4.9855e-01,  2.7518e+01, -2.5033e-02,  1.6455e+00, -5.9037e-01,
          -6.3501e-01, -1.1600e+00, -8.8539e-01,  4.2317e-01,  9.4370e-02],
         [-7.0206e-01, -2.5033e-02,  2.5871e+01,  1.0656e+00, -1.1591e+00,
          -4.8743e-01, -8.2665e-01, -1.1937e+00, -1.8482e-01,  8.6857e-01],
         [ 1.2510e+00,  1.6455e+00,  1.0656e+00,  3.2572e+01, -1.7993e+00,
           9.1329e-01, -1.3535e+00,  1.9169e-01, -1.8282e+00, -1.5603e+00],
         [ 9.5965e-01, -5.9037e-01, -1.1591e+00, -1.7993e+00,  2.8650e+01,
           5.7000e-01, -5.2806e-01, -8.3222e-01,  5.7307e-01, -7.3861e-01],
         [-1.0770e+00, -6.3501e-01, -4.8743e-01,  9.1329e-01,  5.7000e-01,
           3.0560e+01, -9.9571e-01, -1.6467e-01,  4.5263e-01,  8.2892e-01],
         [-7.0859e-0

In [90]:
import torch.nn.functional as F

weights = F.softmax(attn_scores, dim=1) # attention weights
print("Shape of weight: ", weights.shape)
weights.sum(dim=-1) # checking the softmax output because they need to sum up to 1

Shape of weight:  torch.Size([1, 10, 10])


tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]], grad_fn=<SumBackward1>)

In [91]:
attention_outputs = torch.bmm(weights, value)
attention_outputs.shape # final self attention output

torch.Size([1, 10, 768])

# Modular: Scaled dot product attention

In [92]:
def scaled_dot_product_attention(query, key, value):
  """
  Takes the Q,K,V matrix and returns the attention output
  """
  dim_k = key.size(-1)
  attention_scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
  attention_weights = F.softmax(attention_scores, dim=-1)
  attention_outputs = torch.bmm(attention_weights, value)
  return attention_outputs

In [93]:
scaled_dot_product_attention(query, key, value).shape

torch.Size([1, 10, 768])

In paper they use multi head attention to capture different meaning parallely.

# Multi-head Attention Mechanism

In [94]:
query.shape # 1 , 10 token , 768 embedding

torch.Size([1, 10, 768])

In [95]:
import torch

class AttentionHead(torch.nn.Module):
  def __init__(self, embed_dim, head_dim):
    super().__init__()
    self.q = torch.nn.Linear(in_features=embed_dim, out_features=head_dim)
    self.k = torch.nn.Linear(in_features=embed_dim, out_features=head_dim)
    self.v = torch.nn.Linear(in_features=embed_dim, out_features=head_dim)

  def forward(self, hidden_state):
    attention_outputs = scaled_dot_product_attention(
        self.q(hidden_state),
        self.k(hidden_state),
        self.v(hidden_state)
    )
    return attention_outputs

In [96]:
bert_embed_dim = 768
bert_head_dim = 12

bert_embed_dim / bert_head_dim
# 64 which is the linear projection of each attention head in multi head attention


64.0

In [97]:
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [102]:
class MultiHeadAttention(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    embed_dim = config.hidden_size
    num_heads = config.num_attention_heads
    head_dim = embed_dim // num_heads # number of multiheads in model

    self.heads = torch.nn.ModuleList( # list with gradient associated in it
      [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
    )
    self.output_linear = torch.nn.Linear(
        in_features=embed_dim, # embed_dim * head_dim ?
        out_features=embed_dim
    )
  def forward(self, hidden_state):
    concatenated_outputs = torch.cat(
        [h(hidden_state) for h in self.heads], dim=-1
    )
    concatenated_outputs = self.output_linear(concatenated_outputs)
    return concatenated_outputs

In [103]:
inputs_embeds.shape

torch.Size([1, 10, 768])

In [104]:
# check the attention code
multihead_attention = MultiHeadAttention(config)
attention_output = multihead_attention(inputs_embeds)
attention_output.shape

torch.Size([1, 10, 768])

# Visualize Attention of Our Attention Code

In [108]:
from bertviz import head_view

from transformers import AutoModel

model = AutoModel.from_pretrained(model_checkpoint, output_attentions=True)

sentence_a = "There is a dog in the garden."
sentence_b = "The dog is in the garden."

In [112]:
viz_inputs = tokenizer(sentence_a, sentence_b, return_tensors="pt")

In [113]:
viz_inputs

{'input_ids': tensor([[ 101, 2045, 2003, 1037, 3899, 1999, 1996, 3871, 1012,  102, 1996, 3899,
         2003, 1999, 1996, 3871, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [116]:
attention = model(**viz_inputs).attentions

In [119]:
sentence_b_start = (viz_inputs.token_type_ids == 0).sum(dim=1)
sentence_a_end = (viz_inputs.token_type_ids == 1).sum(dim=1)

tokens = tokenizer.convert_ids_to_tokens(viz_inputs.input_ids[0])

head_view(attention, tokens, sentence_b_start)

Output hidden; open in https://colab.research.google.com to view.

# Position wise Feed Forward Layer

In [121]:
class FeedForward(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.linear_1 = torch.nn.Linear(
        in_features=config.hidden_size,
        out_features=config.intermediate_size
    )
    self.linear_2 = torch.nn.Linear(
        in_features=config.intermediate_size,
        out_features=config.hidden_size
    )
    self.gelu = torch.nn.GELU()
    self.dropout = torch.nn.Dropout(
        config.hidden_dropout_prob
    )

  def forward(self, x):
    x = self.linear_1(x)
    x = self.gelu(x)
    x = self.linear_2(x)
    x = self.dropout(x)
    return x

In [122]:
attention_output.shape

torch.Size([1, 10, 768])

In [123]:
feed_forward = FeedForward(config)
ff_output = feed_forward(attention_output)
ff_output.shape

torch.Size([1, 10, 768])

# Layer Norm

In [127]:
inputs_embeds.shape

torch.Size([1, 10, 768])

In [124]:
class TransformerEncoderLayer(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.layer_norm_1 = torch.nn.LayerNorm(config.hidden_size)
    self.layer_norm_2 = torch.nn.LayerNorm(config.hidden_size)

    self.attention = MultiHeadAttention(config)

    self.feed_forward = FeedForward(config)

  def forward(self, x):
    hidden_state = self.layer_norm_1(x)
    x = x + self.attention(hidden_state)
    x = x + self.feed_forward(self.layer_norm_2(x))
    return x

In [128]:
encoder_layer = TransformerEncoderLayer(config)
encoder_layer(inputs_embeds).size()

torch.Size([1, 10, 768])

# Positional Encoding

In [133]:
class Embeddings(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.token_embeddings = torch.nn.Embedding(config.vocab_size, config.hidden_size)
    self.position_embeddings = torch.nn.Embedding(config.max_position_embeddings, config.hidden_size)
    self.layer_norm = torch.nn.LayerNorm(config.hidden_size, eps=1e-12)
    self.dropout = torch.nn.Dropout()

  def forward(self, input_ids):
    seq_length = input_ids.size(1)
    position_ids = torch.arange(seq_length, dtype=torch.long).unsqueeze(0)
    token_embeddings = self.token_embeddings(input_ids)
    position_embeddings = self.position_embeddings(position_ids)

    embeddings = token_embeddings + position_embeddings
    embeddings = self.layer_norm(embeddings)
    embeddings = self.dropout(embeddings)
    return embeddings

In [134]:
embedding_layer = Embeddings(config)
embedding_layer(viz_inputs.input_ids).shape

torch.Size([1, 18, 768])

In [135]:
viz_inputs

{'input_ids': tensor([[ 101, 2045, 2003, 1037, 3899, 1999, 1996, 3871, 1012,  102, 1996, 3899,
         2003, 1999, 1996, 3871, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

# Encoder Architecture

In [136]:
class TransformerEncoder(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.embeddings = Embeddings(config)
    self.layers = torch.nn.ModuleList(
        [TransformerEncoderLayer(config) for _ in range(config.num_hidden_layers)]
    )

  def forward(self, x):
    x = self.embeddings(x)
    for layer in self.layers:
      x = layer(x)
    return x

In [137]:
tokenized_text

{'input_ids': tensor([[ 101, 2045, 2003, 1037, 3899, 1999, 1996, 3871, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [139]:
TransformerEncoder(config)

TransformerEncoder(
  (embeddings): Embeddings(
    (token_embeddings): Embedding(30522, 768)
    (position_embeddings): Embedding(512, 768)
    (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (layers): ModuleList(
    (0-11): 12 x TransformerEncoderLayer(
      (layer_norm_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layer_norm_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attention): MultiHeadAttention(
        (heads): ModuleList(
          (0-11): 12 x AttentionHead(
            (q): Linear(in_features=768, out_features=64, bias=True)
            (k): Linear(in_features=768, out_features=64, bias=True)
            (v): Linear(in_features=768, out_features=64, bias=True)
          )
        )
        (output_linear): Linear(in_features=768, out_features=768, bias=True)
      )
      (feed_forward): FeedForward(
        (linear_1): Linear(in_features=768, out_features=3072, b

In [138]:
encoder = TransformerEncoder(config)
output = encoder(tokenized_text.input_ids)

In [142]:
output.shape, tokenized_text.input_ids.shape

(torch.Size([1, 10, 768]), torch.Size([1, 10]))

# Classification head: for classification

In [143]:
config.hidden_size

768

In [144]:
class TransformerForSequenceClassification(torch.nn.Module):
  def __init__(self, config):
    super().__init__()
    self.encoder = TransformerEncoder(config)
    self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
    self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels) # get 768 input feature from [CLS] token and return required output label

  def forward(self, x):
    x = self.encoder(x)[:, 0, :] # [CLS] token only taken for classification of sequence [10, 768]
    x = self.dropout(x)
    x = self.classifier(x)
    return x

In [145]:
config.num_labels = 3 # for 3 class
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [147]:
encoder_classifier = TransformerForSequenceClassification(config)
classifier_preds = encoder_classifier(tokenized_text.input_ids)

In [148]:
classifier_preds

tensor([[-0.3806, -1.7427,  2.6048]], grad_fn=<AddmmBackward0>)