# 0. Initialize model and tokenizer

In [1]:
import torch
from transformers.models.bert import BertModel, BertTokenizer

In [2]:
# Name of the pretrained checkpoint shared by the tokenizer and the model.
model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name)
# output_hidden_states=True makes the forward pass also return the hidden
# states of every layer, not just the final one.
model = BertModel.from_pretrained(
    model_name,
    output_hidden_states=True,
)

In [3]:
# Display the configuration of the loaded model (hidden size, number of layers, ...).
model.config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.55.4",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [4]:
# Short example sentence that is fed through the model.
test_sentence = "this is a test sentence"

# return_tensors="pt" asks the tokenizer to return PyTorch tensors.
model_input = tokenizer(
    test_sentence,
    return_tensors="pt",
)

# 1. Model output

In [17]:
# Put the model in evaluation mode before running the reference forward pass.
model.eval()

# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    output = model(**model_input)

# Output of first layer.
# hidden_states[0] is the embedding output and hidden_states[i] the output of
# encoder layer i. Accessing the field by name (instead of output[2]) does not
# depend on which optional outputs happen to be present in the result tuple.
output.hidden_states[1]

tensor([[[ 0.1556, -0.0080, -0.0707,  ...,  0.0786,  0.0213,  0.0616],
         [-0.5333,  0.5799,  0.1044,  ...,  0.0241,  0.4888,  0.0161],
         [-1.0609, -0.3058, -0.5043,  ...,  0.1874,  0.2874,  0.4032],
         ...,
         [ 0.8206, -0.6656, -0.7054,  ...,  0.1347,  0.1117, -1.9040],
         [ 1.1128,  0.6603, -0.1509,  ...,  0.3253, -1.0006, -1.9106],
         [-0.0736,  0.0346,  0.0376,  ..., -0.4506,  0.6585, -0.0502]]])

# 2. from scratch

- BertLayer
    - attention: BertAttention
        - self: BertSelfAttention
        - output: BertSelfOutput
    - Feed Forward
        - intermediate: BertIntermediate, 768=>4*768
        - output: BertOutput, 4*768=>768

In [8]:
# hidden_states[0] is the output of the embedding layer, i.e. the input of the
# first encoder layer. Read it by field name rather than by tuple position.
embeddings = output.hidden_states[0]

# First encoder layer; displaying it shows the sub-modules listed above.
layer = model.encoder.layer[0]
layer

BertLayer(
  (attention): BertAttention(
    (self): BertSdpaSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
    (intermediate_act_fn): GELUActivation()
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

## 2.1 First add & norm, in mha
mha -> multi-head attention

In [22]:
# Inference only: run without autograd, like the reference forward pass of the
# full model, so no computation graph is recorded for these tensors.
with torch.no_grad():
    # Output of Multi-Head Attention (a tuple; element 0 is used below)
    mha_output = layer.attention.self(embeddings)

    # Output of Add & Norm: the attention result is combined with the layer
    # input (embeddings) inside BertSelfOutput.
    attn_output = layer.attention.output(mha_output[0], embeddings)

## 2.2 Second add & norm, in mlp

In [21]:
# Inference only, so no autograd graph is needed for this step.
with torch.no_grad():
    # Output of Feed Forward (BertIntermediate: dense 768 -> 3072, then GELU)
    mlp1 = layer.intermediate(attn_output)

mlp1.shape

torch.Size([1, 7, 3072])

In [20]:
# Output of Add & Norm (BertOutput: dense 3072 -> 768, combined with attn_output)
mlp2 = layer.output(mlp1, attn_output)

# Check numerically that the step-by-step computation reproduces the model's
# own first-layer hidden state, rather than comparing truncated printouts by eye.
torch.testing.assert_close(
    mlp2.detach(), output.hidden_states[1], rtol=1e-4, atol=1e-5
)

mlp2

tensor([[[ 0.1556, -0.0080, -0.0707,  ...,  0.0786,  0.0213,  0.0616],
         [-0.5333,  0.5799,  0.1044,  ...,  0.0241,  0.4888,  0.0161],
         [-1.0609, -0.3058, -0.5043,  ...,  0.1874,  0.2874,  0.4032],
         ...,
         [ 0.8206, -0.6656, -0.7054,  ...,  0.1347,  0.1117, -1.9040],
         [ 1.1128,  0.6603, -0.1509,  ...,  0.3253, -1.0006, -1.9106],
         [-0.0736,  0.0346,  0.0376,  ..., -0.4506,  0.6585, -0.0502]]],
       grad_fn=<NativeLayerNormBackward0>)