In [1]:
from transformers import BertModel

# 1. The structure of base BERT:
- embeddings
    - word embedding
    - position embedding
    - token type embedding
- encoder (12 layers)
    - self attention (kqv)
    - feed forward
- pooler

### Summary
- BERT uses only the encoder stack of the Transformer
    - the original Transformer is an **encoder**-decoder (seq2seq) model

In [5]:
# 'uncased' marks the case-insensitive variant of the checkpoint
# (as opposed to the case-sensitive 'cased' one).
model_name = 'bert-base-uncased'

# Fetch the pretrained weights for the chosen checkpoint.
model = BertModel.from_pretrained(model_name)

# Bare expression: the notebook renders the module tree as the cell output.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

# 2. Number of parameters
The parameter count is huge: roughly 110M for `bert-base-uncased`.

In [9]:
# Walk over every named parameter tensor, log its name / shape / element count,
# and tally two figures: all parameters, and only the trainable ones
# (those with requires_grad set).
total_params = 0
total_learnable_params = 0

for name, param in model.named_parameters():
    n_elements = param.numel()
    print(name, '->', param.shape, '->', n_elements)

    total_params += n_elements
    if not param.requires_grad:
        # Frozen tensor: counts toward the total but not the learnable tally.
        continue
    total_learnable_params += n_elements

embeddings.word_embeddings.weight -> torch.Size([30522, 768]) -> 23440896
embeddings.position_embeddings.weight -> torch.Size([512, 768]) -> 393216
embeddings.token_type_embeddings.weight -> torch.Size([2, 768]) -> 1536
embeddings.LayerNorm.weight -> torch.Size([768]) -> 768
embeddings.LayerNorm.bias -> torch.Size([768]) -> 768
encoder.layer.0.attention.self.query.weight -> torch.Size([768, 768]) -> 589824
encoder.layer.0.attention.self.query.bias -> torch.Size([768]) -> 768
encoder.layer.0.attention.self.key.weight -> torch.Size([768, 768]) -> 589824
encoder.layer.0.attention.self.key.bias -> torch.Size([768]) -> 768
encoder.layer.0.attention.self.value.weight -> torch.Size([768, 768]) -> 589824
encoder.layer.0.attention.self.value.bias -> torch.Size([768]) -> 768
encoder.layer.0.attention.output.dense.weight -> torch.Size([768, 768]) -> 589824
encoder.layer.0.attention.output.dense.bias -> torch.Size([768]) -> 768
encoder.layer.0.attention.output.LayerNorm.weight -> torch.Size([768])

In [8]:
# Display (all parameters, trainable parameters) as a tuple.
total_params, total_learnable_params

(109482240, 109482240)

In [11]:
# Break the parameter count down by top-level component of the model
# (embeddings / encoder / pooler), matching on the parameter name.
#
# This cell deliberately does NOT touch total_params or
# total_learnable_params: they are never reset here, so adding to them
# again would inflate the totals on every run of this cell.
total_embeddings_params = 0
total_encoder_params = 0
total_pooler_params = 0

for name, param in model.named_parameters():
    if 'embedding' in name:
        total_embeddings_params += param.numel()
    if 'encoder' in name:
        total_encoder_params += param.numel()
    if 'pooler' in name:
        total_pooler_params += param.numel()

# Sanity check: the three buckets must account for every parameter exactly
# once, otherwise the breakdown (and the shares derived from it) is wrong.
assert (
    total_embeddings_params + total_encoder_params + total_pooler_params
    == sum(p.numel() for p in model.parameters())
), "component breakdown does not add up to the model's parameter count"

In [12]:
# Display the per-component counts as (embeddings, encoder, pooler).
total_embeddings_params, total_encoder_params, total_pooler_params

(23837184, 85054464, 590592)

In [13]:
# Print each component's share of the combined parameter count,
# in the order embeddings, encoder, pooler.
params = [total_embeddings_params, total_encoder_params, total_pooler_params]
grand_total = sum(params)  # computed once; the denominator is the same for every share
for param in params:
    print(param / grand_total)

0.21772649152958506
0.776879099295009
0.005394409175405983
