In [2]:
import math, torch
from transformers import AutoTokenizer, BertModel, BertForSequenceClassification, BertConfig

# Fixed seed so the randomly initialised weights created later in the notebook are reproducible.
SEED = 42
torch.manual_seed(SEED)

<torch._C.Generator at 0x108799050>

# 0. load tokenizer and model (fast tokenizer, base model)

In [3]:
# Checkpoint identifier; reused later when the classification model is loaded.
name = "bert-base-uncased"
# use_fast=True requests the fast tokenizer implementation.
tok = AutoTokenizer.from_pretrained(name, use_fast=True)
model = BertModel.from_pretrained(name) #encoder only stack
# Switch to eval mode; as the last expression of the cell this also echoes the module repr.
model.eval() #turn off dropout

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [4]:
# Report the tokenizer vocabulary size and the key architecture dimensions.
base_cfg = model.config
print("[Info] vocab_size:", tok.vocab_size)
print(
    "[Info] Hidden size / layers / heads:",
    base_cfg.hidden_size,
    base_cfg.num_hidden_layers,
    base_cfg.num_attention_heads,
)

[Info] vocab_size: 30522
[Info] Hidden size / layers / heads: 768 12 12


# 1. input representation: [CLS] / [SEP] + token_type_ids + attention_mask

In [5]:
# Sentence pair packed into one sequence: [CLS] premise [SEP] hypothesis [SEP].
premise = "A cat sits on the mat."
hypo = "An animal is resting indoors."

# Pad/truncate to a fixed length of 16 and return PyTorch tensors,
# explicitly asking for the segment ids as well.
batch = tok(
    premise, hypo,
    padding="max_length", max_length=16, truncation=True,
    return_tensors="pt", return_token_type_ids=True,
)

print("\n[1] keys:", batch.keys())

# Collect "<key> <shape>" pairs in a fixed order and print them on one line.
shape_report = []
for key in ("input_ids", "token_type_ids", "attention_mask"):
    shape_report.extend([key, tuple(batch[key].shape)])
print("[1] shapes:", *shape_report)

first_row_ids = batch["input_ids"][0]
print("[1] tokens:", tok.convert_ids_to_tokens(first_row_ids))


[1] keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
[1] shapes: input_ids (1, 16) token_type_ids (1, 16) attention_mask (1, 16)
[1] tokens: ['[CLS]', 'a', 'cat', 'sits', 'on', 'the', 'mat', '.', '[SEP]', 'an', 'animal', 'is', 'resting', 'indoors', '.', '[SEP]']


In [6]:
# Raw token ids of the packed pair; in the output below 101 is [CLS] and 102 is [SEP].
batch["input_ids"]

tensor([[  101,  1037,  4937,  7719,  2006,  1996, 13523,  1012,   102,  2019,
          4111,  2003,  8345, 24274,  1012,   102]])

In [7]:
# Segment ids, aligned position-by-position with input_ids.
batch["token_type_ids"]

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])

In [8]:
# Sentence A (including [CLS] and the first [SEP]) gets token_type_id 0; sentence B (including the final [SEP]) gets 1.

# 2. Encoder-only stack: no decoder / no causal mask

In [14]:
# Forward pass and output shapes: last_hidden_state [B, L, H], pooler_output [B, H], attentions (one per layer) [B, heads, L, L]

In [12]:
# Single inference pass (no autograd graph), also returning per-layer attention maps.
with torch.no_grad():
    out = model(output_attentions=True, **batch)

attn_maps = out.attentions
print("\n[2] last_hidden_state:", tuple(out.last_hidden_state.shape))
print("[2] pooler_output:", tuple(out.pooler_output.shape))
print("[2] num layers of attentions:", len(attn_maps), "each", tuple(attn_maps[0].shape))


[2] last_hidden_state: (1, 16, 768)
[2] pooler_output: (1, 768)
[2] num layers of attentions: 12 each (1, 12, 16, 16)


# 3. Input embeddings = Token + Position + Segment (then LN)

In [25]:
# Rebuild the input embeddings by hand from the three lookup tables and
# compare against what the embeddings module itself produces.
emb = model.embeddings

ids = batch["input_ids"]        # [1, L]
segs = batch["token_type_ids"]  # [1, L]
L = ids.shape[1]
pos = torch.arange(L)[None, :]  # [1, L] absolute positions 0..L-1

with torch.no_grad():
    we = emb.word_embeddings(ids)         # [1, L, H]
    pe = emb.position_embeddings(pos)     # [1, L, H]
    se = emb.token_type_embeddings(segs)  # [1, L, H]
    summed = we + pe + se
    # BERT: sum -> LayerNorm -> Dropout (dropout is inactive in eval mode)
    summed_ln = emb.LayerNorm(summed)
    # Reference result from the module's own forward pass.
    official = emb(input_ids=ids, token_type_ids=segs, position_ids=pos)

In [21]:
# Sanity check: the embedding LayerNorm is an instance of the stock torch.nn.LayerNorm.
isinstance(model.embeddings.LayerNorm, torch.nn.LayerNorm)

True

In [26]:
# Largest element-wise deviation between the hand-built and the module-built embeddings.
diff = torch.max(torch.abs(summed_ln - official)).item()
print("\n[3] max |(token+pos+seg)->LN - embeddings.forward| = ", f"{diff:.6e}")


[3] max |(token+pos+seg)->LN - embeddings.forward| =  4.768372e-07


In [23]:
# Check: summing the three embeddings by hand and applying LayerNorm matches BERT's embeddings.forward (up to float rounding).

# 4. Bidirectional demo

In [24]:
# Single unpadded sentence whose target word ("bank") is disambiguated by the words to its right.
sent = "I went to the bank to deposit money."
pack = tok(sent, return_tensors="pt", truncation=True, padding=False)

sent_ids = pack["input_ids"][0]
tokens = tok.convert_ids_to_tokens(sent_ids)
print("\n[4] sentence tokens:", tokens)


[4] sentence tokens: ['[CLS]', 'i', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.', '[SEP]']


In [27]:
# Locate "bank" in the token list (it is a single WordPiece in the uncased vocab).
# If it is not present as a whole token, fall back to the middle position.
target = "bank"
idx_bank = tokens.index(target) if target in tokens else len(tokens) // 2

print("[4] 'bank' index:", idx_bank, "token:", tokens[idx_bank])

[4] 'bank' index: 5 token: bank


## 4.a Default: bidirectional attention (padding-only mask, all ones here)

In [28]:
# Default forward pass: every token may attend to every other token.
with torch.no_grad():
    out_bi = model(**pack).last_hidden_state  # [1, L, H]
# Hidden state at the "bank" position.
h_bi = out_bi[0][idx_bank]

## 4.b 3D attention mask (lower triangle = 1, upper triangle = 0) so each token only looks left

In [30]:
L = pack["input_ids"].shape[1]
# Lower-triangular 0/1 matrix: row i has ones in columns 0..i, zeros to the right.
tri = torch.ones(L, L, dtype=torch.long).tril()[None]  # [1, L, L]

with torch.no_grad():
    out_uni = model(
        input_ids=pack["input_ids"],
        attention_mask=tri,
        token_type_ids=None,
    ).last_hidden_state
# Hidden state at the "bank" position under the left-only mask.
h_uni = out_uni[0][idx_bank]

## 4.c Check the cosine similarity: a value < 1 means the representation changed once right-side context was hidden

In [31]:
# Compare the two "bank" vectors as a batch of one along the feature dimension.
cos = torch.nn.functional.cosine_similarity(h_bi[None, :], h_uni[None, :], dim=1).item()
print(f"[4.c] cosine( bidirectional vs left-only ) at 'bank' = {cos:.4f}  (the smaller the cos, the bigger the difference)")

[4.c] cosine( bidirectional vs left-only ) at 'bank' = 0.3379  (the smaller the cos, the bigger the difference)


# 5. [CLS] as classification head

In [33]:
# Checkpoint config with a 3-way label space (3 classes is an arbitrary choice for the demo).
cfg = BertConfig.from_pretrained(name, num_labels=3)
# .eval() returns the module itself, so loading and switching mode can be chained.
clf = BertForSequenceClassification.from_pretrained(name, config=cfg).eval()

with torch.no_grad():
    logits = clf(**batch).logits  # [B, num_labels]

pred_classes = logits.argmax(dim=-1).tolist()
print("\n[5] classification logits shape:", tuple(logits.shape))
print("[5] argmax class:", pred_classes)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



[5] classification logits shape: (1, 3)
[5] argmax class: [0]


# 6. Bonus: Base vs "minimal" config

In [34]:
# Build a deliberately tiny, randomly initialised BERT encoder straight from a config.
mini_cfg = BertConfig(
    # Fixed: this keyword was misspelled "covab_size". BertConfig accepts unknown
    # keyword arguments, so the tokenizer's vocab size was never applied and the
    # model only matched the tokenizer because the default happens to be the same.
    vocab_size=tok.vocab_size,
    hidden_size=128,
    num_hidden_layers=2,
    num_attention_heads=4,   # 128 / 4 = 32 dims per head
    intermediate_size=256,
    pad_token_id=tok.pad_token_id
)

mini = BertModel(mini_cfg)  # randomly initialised mini BERT (encoder only)
mini.eval()                 # dropout off, consistent with the other models in this notebook

# Guard against the config and tokenizer drifting apart again.
assert mini.embeddings.word_embeddings.num_embeddings == tok.vocab_size

# Smoke test: the tokenised batch must run through the small encoder.
# Bound to a real name rather than `_`, which IPython uses for the last cell output.
with torch.no_grad():
    mini_out = mini(**batch)

print("\n[6] mini BERT encoder layers:", len(mini.encoder.layer))
print("[6] This confirms: BERT is encoder-only (no decoder, no cross-attn).")


[6] mini BERT encoder layers: 2
[6] This confirms: BERT is encoder-only (no decoder, no cross-attn).


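* The three embeddings really are summed (token + position + segment → LayerNorm), and the hand-built result matches BERT's internal implementation.

* Attention is bidirectional by default (no causal upper-triangular mask). Forcing a "look left only" 3D mask visibly changes the representation at that token position ⇒ by default the model does use right-side context.

* BERT is a pure encoder stack; BertForSequenceClassification simply adds a linear head on top of the pooled [CLS] representation for downstream classification.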