## We're going to deconstruct the components of the GPT2 decoder model and manually reconstruct its output externally in Python.

## Step 1: Use GPT2 to tokenize our input sequence

In [3]:
from transformers import GPT2TokenizerFast
import torch

# Load GPT-2 tokenizer (fast version only)
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Input sentence
sentence = "Open-source LLMs rock."

# Tokenize with offset mapping (GPT-2 doesn't use special tokens like [CLS])
encoding = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True, add_special_tokens=False)

# Extract input IDs and offset mappings
input_ids = encoding["input_ids"]
offsets = encoding["offset_mapping"][0]  # shape: (seq_len, 2)
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

# Print token → offset → original string span
print(f"{'Token':15} | {'Offsets':11} | Text Segment")
print("-" * 45)
for token, (start, end) in zip(tokens, offsets):
    span = sentence[start:end]
    print(f"{token:15} | ({start:2}, {end:2})   | '{span}'")


Token           | Offsets     | Text Segment
---------------------------------------------
Open            | ( 0,  4)   | 'Open'
-               | ( 4,  5)   | '-'
source          | ( 5, 11)   | 'source'
ĠLL             | (11, 14)   | ' LL'
Ms              | (14, 16)   | 'Ms'
Ġrock           | (16, 21)   | ' rock'
.               | (21, 22)   | '.'


## Step 1a: We replicate using an alternate tokenizer

In [5]:
from tokenizers import Tokenizer

# Load the raw tokenizer from Hugging Face's pretrained assets
tokenizer_indep = Tokenizer.from_pretrained("gpt2")

# Your sentence
sentence = "Open-source LLMs rock."

# Encode using the low-level tokenizer
output = tokenizer_indep.encode(sentence)

# Extract tokens, IDs, and offsets
tokens_indep = output.tokens
ids_indep = output.ids
offsets_indep = output.offsets

# Print the results
print(f"{'Token':15} | {'Token ID':8} | {'Offsets':11} | Text Segment")
print("-" * 55)
for token, id_, (start, end) in zip(tokens_indep, ids_indep, offsets_indep):
    print(f"{token:15} | {id_:9} | ({start:2}, {end:2})   | '{sentence[start:end]}'")


Token           | Token ID | Offsets     | Text Segment
-------------------------------------------------------
Open            |     11505 | ( 0,  4)   | 'Open'
-               |        12 | ( 4,  5)   | '-'
source          |     10459 | ( 5, 11)   | 'source'
ĠLL             |     27140 | (11, 14)   | ' LL'
Ms              |     10128 | (14, 16)   | 'Ms'
Ġrock           |      3881 | (16, 21)   | ' rock'
.               |        13 | (21, 22)   | '.'


## Step 2: Grab the internal pre-trained raw token embeddings for these tokens from GPT-2.
There is no external parallel to this step, these embeddings are pretrained, we take them and use them in our computations.

In [7]:
from transformers import GPT2Model
import pandas as pd

# Load model
model = GPT2Model.from_pretrained("gpt2")

# Get token embeddings
with torch.no_grad():
    embeddings = model.wte(input_ids).squeeze(0)

# Convert to numpy
emb_matrix = embeddings.cpu().numpy()

# Convert token IDs to tokens
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

# Create DataFrame
df = pd.DataFrame(emb_matrix, index=tokens)
df.columns = [f"dim_{i}" for i in range(df.shape[1])]

# Show first 5 dimensions
display(df[[f"dim_{i}" for i in range(5)]])

# Save to CSV
df.to_csv("gpt2_token_embeddings.csv", index_label="token")
print("✅ Saved to gpt2_token_embeddings.csv")


Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4
Open,0.112828,-0.125863,0.10789,-0.186214,0.141399
-,0.061386,-0.014965,0.056575,-0.089331,0.017219
source,-0.077845,-0.248645,-0.003302,-0.089265,0.033541
ĠLL,0.074989,-0.192921,-5e-05,0.007463,0.238938
Ms,0.043708,-0.197578,0.300682,-0.120869,-0.081312
Ġrock,-0.097493,-0.034077,0.096704,-0.002258,0.03532
.,0.046641,-0.011302,0.028328,0.046351,0.039038


✅ Saved to gpt2_token_embeddings.csv


## Step 3: Get learned positional encodings

In [9]:
# Get position IDs (same length as input)
position_ids = torch.arange(input_ids.size(1), dtype=torch.long).unsqueeze(0)

# Get positional embeddings from GPT-2
with torch.no_grad():
    pos_embeddings = model.wpe(position_ids).squeeze(0)

# Convert to DataFrame
pos_matrix = pos_embeddings.cpu().numpy()
df_pos = pd.DataFrame(pos_matrix, index=[f"pos_{i}" for i in range(pos_matrix.shape[0])])
df_pos.columns = [f"dim_{i}" for i in range(pos_matrix.shape[1])]

# Show first few dimensions
display(df_pos[[f"dim_{i}" for i in range(5)]])

# Save to CSV
df_pos.to_csv("gpt2_position_embeddings.csv", index_label="position")
print("✅ Saved to gpt2_position_embeddings.csv")


Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4
pos_0,-0.018821,-0.197419,0.004027,0.011347,0.063824
pos_1,0.023959,-0.053792,-0.094879,-0.012909,-0.010051
pos_2,0.004216,-0.084764,0.054515,-0.004668,-0.026065
pos_3,-0.000283,-0.073803,0.105526,0.000652,-0.015574
pos_4,0.007637,-0.02509,0.126956,-0.006254,-0.012381
pos_5,0.009602,-0.033885,0.131233,-0.003855,-0.010249
pos_6,0.002679,-0.02053,0.119613,-0.002906,-0.008992


✅ Saved to gpt2_position_embeddings.csv


## Step 4: Get input tensor to first transformer block
Tensor shape will be seq_len, 768 --> 7,768 or a 1, 7, 768 tensor which is squeezed to flatten batch dimension

In [11]:
with torch.no_grad():
    input_tensor = model.wte(input_ids) + model.wpe(position_ids)
input_tensor = input_tensor.squeeze(0)

# Convert to numpy
input_matrix = input_tensor.cpu().numpy()

# Use tokens as index
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
df_input = pd.DataFrame(input_matrix, index=tokens)
df_input.columns = [f"dim_{i}" for i in range(df_input.shape[1])]

# Display first few dimensions
display(df_input[[f"dim_{i}" for i in range(5)]])

# Save to CSV
df_input.to_csv("gpt2_block0_input_tensor.csv", index_label="token")
print("✅ Saved to gpt2_block0_input_tensor.csv")


Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4
Open,0.094007,-0.323282,0.111917,-0.174867,0.205223
-,0.085346,-0.068757,-0.038304,-0.10224,0.007169
source,-0.073628,-0.333408,0.051213,-0.093934,0.007476
ĠLL,0.074706,-0.266723,0.105476,0.008115,0.223364
Ms,0.051346,-0.222668,0.427638,-0.127122,-0.093693
Ġrock,-0.087891,-0.067962,0.227937,-0.006113,0.025071
.,0.04932,-0.031832,0.147941,0.043445,0.030046


✅ Saved to gpt2_block0_input_tensor.csv


## Step 5: Caclulate input tensor and check difference with GPT2

In [13]:
with torch.no_grad():
    token_embeddings = model.wte(input_ids).squeeze(0)
    pos_embeddings = model.wpe(position_ids).squeeze(0)
    sum_manual = token_embeddings + pos_embeddings

# Check max absolute difference
diff = torch.abs(sum_manual - input_tensor).max().item()
print(f"✅ Max difference: {diff:.8f}")
assert diff < 1e-6, "❌ Mismatch between summed embeddings and input tensor"


✅ Max difference: 0.00000000


## Step 6: We need to get the full Q K V weight matrices (which are split across 12 heads per block) for 12 blocks
These are combined as one matrix 768x2034 and when split each of QKV are 768x768
Each Q K V is then split across each of the 12 heads in each block

In [15]:
import os
import numpy as np

# Ensure directory exists
os.makedirs("gpt2_weights", exist_ok=True)

# Loop through all 12 transformer blocks
for i, block in enumerate(model.h):
    # Get combined QKV weight: shape [768, 2304]
    qkv_weight = block.attn.c_attn.weight.detach().cpu().numpy()

    # Split into Q, K, V: each [768, 768]
    W_Q, W_K, W_V = np.split(qkv_weight, 3, axis=1)

    # Save each matrix
    np.save(f"gpt2_weights/block_{i}_W_Q.npy", W_Q)
    np.save(f"gpt2_weights/block_{i}_W_K.npy", W_K)
    np.save(f"gpt2_weights/block_{i}_W_V.npy", W_V)

    print(f"✅ Saved QKV weight matrices for block {i}")


✅ Saved QKV weight matrices for block 0
✅ Saved QKV weight matrices for block 1
✅ Saved QKV weight matrices for block 2
✅ Saved QKV weight matrices for block 3
✅ Saved QKV weight matrices for block 4
✅ Saved QKV weight matrices for block 5
✅ Saved QKV weight matrices for block 6
✅ Saved QKV weight matrices for block 7
✅ Saved QKV weight matrices for block 8
✅ Saved QKV weight matrices for block 9
✅ Saved QKV weight matrices for block 10
✅ Saved QKV weight matrices for block 11


## Step 7: Let's get the biases for Q K V, learned starting points suggesting what Q K V should tend towards without even seeing the input

In [17]:
import os
import numpy as np

# Ensure the directory exists
os.makedirs("gpt2_weights", exist_ok=True)

# Loop over all 12 transformer blocks
for i, block in enumerate(model.h):
    # Combined QKV bias: shape [2304]
    qkv_bias = block.attn.c_attn.bias.detach().cpu().numpy()

    # Split into Q, K, V: each [768]
    b_Q, b_K, b_V = np.split(qkv_bias, 3)

    # Save each bias
    np.save(f"gpt2_weights/block_{i}_b_Q.npy", b_Q)
    np.save(f"gpt2_weights/block_{i}_b_K.npy", b_K)
    np.save(f"gpt2_weights/block_{i}_b_V.npy", b_V)

    print(f"✅ Saved QKV biases for block {i}")


✅ Saved QKV biases for block 0
✅ Saved QKV biases for block 1
✅ Saved QKV biases for block 2
✅ Saved QKV biases for block 3
✅ Saved QKV biases for block 4
✅ Saved QKV biases for block 5
✅ Saved QKV biases for block 6
✅ Saved QKV biases for block 7
✅ Saved QKV biases for block 8
✅ Saved QKV biases for block 9
✅ Saved QKV biases for block 10
✅ Saved QKV biases for block 11


## Step 8: Use a hook to get the Q K V block 0 GPT2 matrices for our input

In [19]:
import pandas as pd
import numpy as np

with torch.no_grad():
    qkv = model.h[0].attn.c_attn(input_tensor)  # shape: [seq_len, 2304]
    Q_ref, K_ref, V_ref = torch.chunk(qkv, 3, dim=-1)  # each: [seq_len, 768]

# Save to CSV
pd.DataFrame(Q_ref).to_csv("gpt2_block0_Q_ref.csv", index_label="token")
pd.DataFrame(K_ref).to_csv("gpt2_block0_K_ref.csv", index_label="token")
pd.DataFrame(V_ref).to_csv("gpt2_block0_V_ref.csv", index_label="token")

# Display shape and a sample
print("✅ Saved Q, K, V matrices from GPT-2 block 0")
print(f"Q_ref shape: {Q_ref.shape}")
print(f"K_ref shape: {K_ref.shape}")
print(f"V_ref shape: {V_ref.shape}\n")

# Show preview of first token’s first 5 dimensions
print("Q_ref[0][:5]:", np.round(Q_ref[0][:5], 6))
print("K_ref[0][:5]:", np.round(K_ref[0][:5], 6))
print("V_ref[0][:5]:", np.round(V_ref[0][:5], 6))


✅ Saved Q, K, V matrices from GPT-2 block 0
Q_ref shape: torch.Size([7, 768])
K_ref shape: torch.Size([7, 768])
V_ref shape: torch.Size([7, 768])

Q_ref[0][:5]: tensor([-3.0921,  2.8398,  1.3211, -2.8124,  0.5560])
K_ref[0][:5]: tensor([-11.1340,  16.9833,   9.8151,   1.2522,   5.7914])
V_ref[0][:5]: tensor([ 0.1662, -0.0763,  0.2380,  0.1444, -0.0268])


## Step 9: Let's calculate Q K V for block 0 ourselves 
WWe are projecting the input tensor into three new planes using Q K V weight matrices

In [21]:
import torch
import numpy as np

# Load input tensor (shape: [seq_len, hidden_size])
# Assuming you've already defined `input_tensor` earlier:
# If not, define it like this:
# with torch.no_grad():
#     input_tensor = model.wte(input_ids) + model.wpe(torch.arange(input_ids.shape[1]))

# Load block 0 weights and biases
W_Q = np.load("gpt2_weights/block_0_W_Q.npy")
W_K = np.load("gpt2_weights/block_0_W_K.npy")
W_V = np.load("gpt2_weights/block_0_W_V.npy")

b_Q = np.load("gpt2_weights/block_0_b_Q.npy")
b_K = np.load("gpt2_weights/block_0_b_K.npy")
b_V = np.load("gpt2_weights/block_0_b_V.npy")

# Convert to torch tensors
W_Q = torch.tensor(W_Q, dtype=torch.float32)
W_K = torch.tensor(W_K, dtype=torch.float32)
W_V = torch.tensor(W_V, dtype=torch.float32)

b_Q = torch.tensor(b_Q, dtype=torch.float32)
b_K = torch.tensor(b_K, dtype=torch.float32)
b_V = torch.tensor(b_V, dtype=torch.float32)

# Project input to get Q, K, V (shape: [seq_len, hidden])
Q = input_tensor @ W_Q + b_Q
K = input_tensor @ W_K + b_K
V = input_tensor @ W_V + b_V

# Optional: check shapes
print("Q shape:", Q.shape)
print("K shape:", K.shape)
print("V shape:", V.shape)


Q shape: torch.Size([7, 768])
K shape: torch.Size([7, 768])
V shape: torch.Size([7, 768])


## Step 10: Compare manual Q K V with GPT2 Q K V

In [23]:
import pandas as pd
import numpy as np

# Load reference QKV from model
Q_ref = pd.read_csv("gpt2_block0_Q_ref.csv", index_col=0).values
K_ref = pd.read_csv("gpt2_block0_K_ref.csv", index_col=0).values
V_ref = pd.read_csv("gpt2_block0_V_ref.csv", index_col=0).values

# Compare with manually computed Q, K, V
Q_manual = Q.numpy()
K_manual = K.numpy()
V_manual = V.numpy()

# Compute differences
print("Q diff (mean abs):", np.mean(np.abs(Q_ref - Q_manual)))
print("K diff (mean abs):", np.mean(np.abs(K_ref - K_manual)))
print("V diff (mean abs):", np.mean(np.abs(V_ref - V_manual)))



Q diff (mean abs): 4.938428878046922e-07
K diff (mean abs): 4.149513652007874e-07
V diff (mean abs): 7.063186317529891e-08


## Step 11: Let's get GPT attention scores

In [25]:
import torch
import pandas as pd
from transformers import GPT2Model, GPT2TokenizerFast

# --- Load model and tokenizer ---
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")
model.eval()

# --- Tokenize input ---
sentence = "open-source llms rock."
inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=False)

# --- Dictionary to hold hook output ---
raw_attn_output = {}

# --- Define hook: grab input to `c_proj`, which is attention output before projection ---
def hook_fn(module, input, output):
    # input is a tuple — we take the tensor input to the c_proj layer
    raw_attn_output["block1_attn_concat"] = input[0].detach().squeeze(0)  # shape [seq_len, 768]

# --- Register the hook on c_proj (linear output projection after attention) ---
hook_handle = model.h[0].attn.c_proj.register_forward_hook(hook_fn)

# --- Run the model ---
with torch.no_grad():
    _ = model(**inputs)

# --- Remove hook after capturing ---
hook_handle.remove()

# --- Optional: Save or inspect result ---
df = pd.DataFrame(raw_attn_output["block1_attn_concat"].cpu().numpy())
df.to_csv("gpt2_block1_attn_concat_output.csv", index_label="token")

print("✅ Captured attention output before projection (matches manual calc):")
print(raw_attn_output["block1_attn_concat"][:5, :10])


✅ Captured attention output before projection (matches manual calc):
tensor([[-0.0031,  0.0954, -0.0282, -0.0525, -0.0847, -0.3572, -0.0809, -0.1034,
          0.0004,  0.0065],
        [-0.0086,  0.0862, -0.0295, -0.0558, -0.0849, -0.3284, -0.0753, -0.0988,
          0.0032, -0.0009],
        [-0.0155,  0.0634, -0.0210, -0.0754, -0.0867, -0.3057, -0.0757, -0.1096,
          0.0010, -0.0201],
        [ 0.0547,  0.0341,  0.0691, -0.0647, -0.0633, -0.1771, -0.0393, -0.1540,
          0.0169, -0.0618],
        [ 0.0540,  0.0650,  0.0465, -0.0529, -0.0637, -0.2485, -0.0470, -0.1461,
          0.0102, -0.0298]])


## Step 12: Let's calculate attention using Q K V
Q K V are tokens x 768 dimensinality, split to give 12 tokens by 64 heads, transpose to let each head see all tokens, compute attention head score matrix creating 12 tokens by tokens dimensionality attention scores, apply a causal mask, softmax over scores to produce attention probabilities for each token in the sequence, multiple by V, concatenate all heads. 

In [27]:
import torch
import torch.nn.functional as F
from transformers import GPT2Model, GPT2TokenizerFast

# --- Load model and tokenizer ---
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")
model.eval()

# --- Tokenize correct input ---
sentence = "Open-source LLMs rock."  # 👈 this exact string
inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=False)
input_ids = inputs["input_ids"]
seq_len = input_ids.shape[1]
device = model.device

# --- Hook to capture pre-projection attention output ---
raw_attn_output = {}

def hook_fn(module, input, output):
    raw_attn_output["block1_attn_concat"] = input[0].detach().squeeze(0)

hook_handle = model.h[0].attn.c_proj.register_forward_hook(hook_fn)

# --- Run the model (trigger hook) ---
with torch.no_grad():
    _ = model(**inputs)

hook_handle.remove()

# --- MANUAL CALCULATION ---
with torch.no_grad():
    # Step 1: Input embeddings (token + position)
    wte = model.wte(input_ids).squeeze(0)
    wpe = model.wpe(torch.arange(seq_len, device=device)).squeeze(0)
    input_tensor = wte + wpe  # [seq_len, hidden_size]

    # Step 2: LayerNorm
    ln_weight = model.h[0].ln_1.weight.detach()
    ln_bias = model.h[0].ln_1.bias.detach()

    def layer_norm(x, weight, bias, eps=1e-5):
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, unbiased=False, keepdim=True)
        return ((x - mean) / torch.sqrt(var + eps)) * weight + bias

    x_norm = layer_norm(input_tensor, ln_weight, ln_bias)

    # Step 3: QKV from single projection (GPT-2 uses c_attn)
    W_full = model.h[0].attn.c_attn.weight.detach()
    b_full = model.h[0].attn.c_attn.bias.detach()
    qkv = x_norm @ W_full + b_full
    Q, K, V = qkv.split(768, dim=-1)  # Each [seq_len, hidden]

    # Step 4: Split into heads
    num_heads = 12
    head_dim = 64
    def split_heads(x):
        return x.view(seq_len, num_heads, head_dim).transpose(0, 1)  # [n_heads, seq_len, head_dim]

    Q_heads = split_heads(Q)
    K_heads = split_heads(K)
    V_heads = split_heads(V)

    # Step 5: Causal mask
    mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=device))

    # Step 6: Attention computation
    attn_outputs = []
    for i in range(num_heads):
        q = Q_heads[i]  # [seq_len, head_dim]
        k = K_heads[i]
        v = V_heads[i]

        scores = q @ k.T / (head_dim ** 0.5)
        scores = scores.masked_fill(~mask, float("-inf"))
        weights = F.softmax(scores, dim=-1)
        attn_output = weights @ v
        attn_outputs.append(attn_output)

    # Step 7: Concatenate heads
    attn_concat = torch.cat(attn_outputs, dim=-1)  # [seq_len, hidden]



## Step 13: Now compare GPT2 layer 1 attention calculation with our manual attention calculation 

In [29]:
# --- Final Comparison with hooked GPT-2 output ---
diff = (attn_concat - raw_attn_output["block1_attn_concat"]).abs()
print("Max diff:", diff.max().item())
print("Mean diff:", diff.mean().item())
print("Allclose (1e-5)?", torch.allclose(attn_concat, raw_attn_output["block1_attn_concat"], atol=1e-5))

Max diff: 8.344650268554688e-07
Mean diff: 4.256519048340124e-08
Allclose (1e-5)? True


## Step 14: Output projection (Mixing)

In [31]:
import torch
import pandas as pd
from transformers import GPT2Model, GPT2TokenizerFast

# --- Load model and tokenizer ---
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")
model.eval()

# --- Tokenize your sentence once ---
sentence = "Open-source LLMs rock."
inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=False)

# --- Hook dictionary ---
mixed_output = {}

# --- Hook function to capture output of c_proj (i.e., mixed attention) ---
def cproj_hook(module, input, output):
    mixed_output["block1_mixed"] = output.detach().squeeze(0)

# --- Register hook on block 1's c_proj ---
hook_handle = model.h[0].attn.c_proj.register_forward_hook(cproj_hook)

# --- Run the model to trigger the hook ---
with torch.no_grad():
    _ = model(**inputs)

# --- Remove the hook after capturing the output ---
hook_handle.remove()

# --- Save to CSV or inspect ---
df = pd.DataFrame(mixed_output["block1_mixed"].cpu().numpy())
df.to_csv("gpt2_block1_mixed_output.csv", index_label="token")

# --- Optional: Inspect shape or values ---
print("✅ Captured shape:", mixed_output["block1_mixed"].shape)
print(mixed_output["block1_mixed"][:2, :5])


✅ Captured shape: torch.Size([7, 768])
tensor([[ 1.3581, -0.6009,  0.3659, -0.0182,  0.0638],
        [ 0.4536, -0.3578,  0.2065,  0.0037,  0.0930]])


In [32]:
with torch.no_grad():
    manual_mixed = model.h[0].attn.c_proj(attn_concat)


In [33]:
diff = (manual_mixed - mixed_output["block1_mixed"]).abs()
print("Max diff:", diff.max().item())
print("Mean diff:", diff.mean().item())
print("Allclose (1e-5)?", torch.allclose(manual_mixed, mixed_output["block1_mixed"], atol=1e-5))


Max diff: 5.7220458984375e-06
Mean diff: 1.938748255270184e-07
Allclose (1e-5)? True


## Step 16: LayerNorm + MLP + residual

In [35]:
import torch
import torch.nn.functional as F
from transformers import GPT2Model, GPT2TokenizerFast

# Load model and tokenizer if not already loaded
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")
model.eval()

# Use your sentence
sentence = "Open-source LLMs rock."
inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=False)
input_ids = inputs["input_ids"]
seq_len = input_ids.shape[1]
device = model.device

# === EMBEDDING ===
with torch.no_grad():
    wte = model.wte(input_ids).squeeze(0)
    wpe = model.wpe(torch.arange(seq_len, device=device)).squeeze(0)
    input_tensor = wte + wpe  # [seq_len, hidden]

# === LAYER NORM 1 ===
def layer_norm(x, weight, bias, eps=1e-5):
    mean = x.mean(-1, keepdim=True)
    var = x.var(-1, unbiased=False, keepdim=True)
    return ((x - mean) / torch.sqrt(var + eps)) * weight + bias

ln1_weight = model.h[0].ln_1.weight.detach()
ln1_bias = model.h[0].ln_1.bias.detach()
x_norm = layer_norm(input_tensor, ln1_weight, ln1_bias)

# === ATTENTION ===
W_full = model.h[0].attn.c_attn.weight.detach()
b_full = model.h[0].attn.c_attn.bias.detach()
qkv = x_norm @ W_full + b_full
Q, K, V = qkv.split(768, dim=-1)

# Split into heads
num_heads = 12
head_dim = 64
def split_heads(x):
    return x.view(seq_len, num_heads, head_dim).transpose(0, 1)

Q_heads = split_heads(Q)
K_heads = split_heads(K)
V_heads = split_heads(V)

# Causal mask
mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=device))

# Scaled dot-product attention
attn_outputs = []
for i in range(num_heads):
    q, k, v = Q_heads[i], K_heads[i], V_heads[i]
    scores = q @ k.T / (head_dim ** 0.5)
    scores = scores.masked_fill(~mask, float("-inf"))
    weights = F.softmax(scores, dim=-1)
    attn_output = weights @ v
    attn_outputs.append(attn_output)

# Concat heads and mix
attn_concat = torch.cat(attn_outputs, dim=-1)  # [seq_len, hidden]
manual_mixed = model.h[0].attn.c_proj(attn_concat)

# === MANUAL BLOCK 1 COMPLETION ===

# 1. Residual after attention
x_resid1 = input_tensor + manual_mixed

# 2. LayerNorm 2
ln2_weight = model.h[0].ln_2.weight.detach()
ln2_bias = model.h[0].ln_2.bias.detach()
x_norm2 = layer_norm(x_resid1, ln2_weight, ln2_bias)

# 3. MLP (feed-forward)
mlp_out = model.h[0].mlp(x_norm2)

# 4. Final residual
block1_manual = x_resid1 + mlp_out  # [seq_len, hidden]

# === GPT2 BLOCK 1 OUTPUT (actual) ===
with torch.no_grad():
    block1_out = model.h[0](input_tensor.unsqueeze(0))  # [1, seq_len, hidden] or nested tuple

# Safely extract the real tensor
while isinstance(block1_out, (tuple, list)):
    block1_out = block1_out[0]
block1_out = block1_out.squeeze(0)

# === COMPARISON ===
diff = (block1_manual - block1_out).abs()
print("Max diff:", diff.max().item())
print("Mean diff:", diff.mean().item())
print("Allclose (1e-5)?", torch.allclose(block1_manual, block1_out, atol=1e-5))


Max diff: 1.52587890625e-05
Mean diff: 7.624373665748863e-07
Allclose (1e-5)? True


## Move more quickly now we know what's going on

## Let's grab GPT2 layer 1 and compare to manual calculations

In [38]:
import torch
import torch.nn.functional as F
from transformers import GPT2Model, GPT2TokenizerFast
import numpy as np
import pandas as pd

# Load model and tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")
model.eval()

# Tokenize input
sentence = "Open-source LLMs rock."
inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=False)
input_ids = inputs["input_ids"]
seq_len = input_ids.shape[1]
device = model.device

# === STEP 1: Get input to block 1 (output of block 0) ===
with torch.no_grad():
    # Get embeddings
    wte = model.wte(input_ids).squeeze(0)
    wpe = model.wpe(torch.arange(seq_len, device=device)).squeeze(0)
    input_tensor = wte + wpe  # [seq_len, hidden_size]
    
    # Pass through block 0 to get input for block 1
    block0_output = model.h[0](input_tensor.unsqueeze(0))
    # Handle tuple output
    while isinstance(block0_output, (tuple, list)):
        block0_output = block0_output[0]
    block0_output = block0_output.squeeze(0)  # This is the input to block 1

# === STEP 2: Hook to capture GPT-2's actual block 1 output ===
gpt2_block1_output = {}

def block1_hook(module, input, output):
    # Handle tuple output
    if isinstance(output, tuple):
        gpt2_block1_output["block1"] = output[0].detach().squeeze(0)
    else:
        gpt2_block1_output["block1"] = output.detach().squeeze(0)

# Register hook on block 1
hook_handle = model.h[1].register_forward_hook(block1_hook)

# Run model to trigger hook
with torch.no_grad():
    _ = model(**inputs)

# Remove hook
hook_handle.remove()

# === STEP 3: Manual calculation of block 1 ===

# Layer norm function
def layer_norm(x, weight, bias, eps=1e-5):
    mean = x.mean(-1, keepdim=True)
    var = x.var(-1, unbiased=False, keepdim=True)
    return ((x - mean) / torch.sqrt(var + eps)) * weight + bias

# 3.1: LayerNorm 1
ln1_weight = model.h[1].ln_1.weight.detach()
ln1_bias = model.h[1].ln_1.bias.detach()
x_norm = layer_norm(block0_output, ln1_weight, ln1_bias)

# 3.2: Get combined QKV weight and bias (the way GPT-2 stores them)
W_full = model.h[1].attn.c_attn.weight.detach()
b_full = model.h[1].attn.c_attn.bias.detach()

# 3.3: Project to get Q, K, V using combined projection
qkv = x_norm @ W_full + b_full
Q, K, V = qkv.split(768, dim=-1)  # Each [seq_len, hidden]

# 3.4: Split into heads
num_heads = 12
head_dim = 64

def split_heads(x):
    return x.view(seq_len, num_heads, head_dim).transpose(0, 1)  # [n_heads, seq_len, head_dim]

Q_heads = split_heads(Q)
K_heads = split_heads(K)
V_heads = split_heads(V)

# 3.5: Causal mask
mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=device))

# 3.6: Attention computation
attn_outputs = []
for i in range(num_heads):
    q = Q_heads[i]  # [seq_len, head_dim]
    k = K_heads[i]
    v = V_heads[i]
    
    scores = q @ k.T / (head_dim ** 0.5)
    scores = scores.masked_fill(~mask, float("-inf"))
    weights = F.softmax(scores, dim=-1)
    attn_output = weights @ v
    attn_outputs.append(attn_output)

# 3.7: Concatenate heads
attn_concat = torch.cat(attn_outputs, dim=-1)  # [seq_len, hidden]

# 3.8: Output projection (mixing) - manual calculation
W_proj = model.h[1].attn.c_proj.weight.detach()
b_proj = model.h[1].attn.c_proj.bias.detach()
manual_mixed = attn_concat @ W_proj + b_proj

# 3.9: Residual connection after attention
x_resid1 = block0_output + manual_mixed

# 3.10: LayerNorm 2
ln2_weight = model.h[1].ln_2.weight.detach()
ln2_bias = model.h[1].ln_2.bias.detach()
x_norm2 = layer_norm(x_resid1, ln2_weight, ln2_bias)

# 3.11: MLP (feed-forward) - manual calculation
# GPT-2 MLP: Linear(768->3072) -> GELU -> Linear(3072->768)
W_fc = model.h[1].mlp.c_fc.weight.detach()
b_fc = model.h[1].mlp.c_fc.bias.detach()
W_proj2 = model.h[1].mlp.c_proj.weight.detach()
b_proj2 = model.h[1].mlp.c_proj.bias.detach()

# First linear layer
mlp_hidden = x_norm2 @ W_fc + b_fc

# GELU activation
def gelu(x):
    return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))

mlp_hidden = gelu(mlp_hidden)

# Second linear layer
mlp_out = mlp_hidden @ W_proj2 + b_proj2

# 3.12: Final residual
block1_manual = x_resid1 + mlp_out  # [seq_len, hidden]

# === STEP 4: Comparison ===
diff = (block1_manual - gpt2_block1_output["block1"]).abs()
print("Block 1 Comparison Results:")
print("=" * 50)
print(f"Max difference: {diff.max().item()}")
print(f"Mean difference: {diff.mean().item()}")
print(f"Allclose (atol=1e-5)? {torch.allclose(block1_manual, gpt2_block1_output['block1'], atol=1e-5)}")
print(f"Allclose (atol=1e-4)? {torch.allclose(block1_manual, gpt2_block1_output['block1'], atol=1e-4)}")

# Additional diagnostics
print(f"\nShape check:")
print(f"Manual output shape: {block1_manual.shape}")
print(f"GPT-2 output shape: {gpt2_block1_output['block1'].shape}")

# Show first few values for verification
print(f"\nFirst 5 values of manual output:")
print(block1_manual[0, :5])
print(f"\nFirst 5 values of GPT-2 output:")
print(gpt2_block1_output['block1'][0, :5])

Block 1 Comparison Results:
Max difference: 6.103515625e-05
Mean difference: 6.612365837099787e-07
Allclose (atol=1e-5)? True
Allclose (atol=1e-4)? True

Shape check:
Manual output shape: torch.Size([7, 768])
GPT-2 output shape: torch.Size([7, 768])

First 5 values of manual output:
tensor([ 1.3345, -0.3729,  1.1140,  0.0153,  0.9656])

First 5 values of GPT-2 output:
tensor([ 1.3345, -0.3729,  1.1140,  0.0153,  0.9656])


## Let's grab GPT2 block 2 output and compare to manual calculations

In [40]:
import torch
import torch.nn.functional as F
from transformers import GPT2Model, GPT2TokenizerFast
import numpy as np
import pandas as pd

# Load model and tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")
model.eval()

# Tokenize input
sentence = "Open-source LLMs rock."
inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=False)
input_ids = inputs["input_ids"]
seq_len = input_ids.shape[1]
device = model.device

# Layer norm function
def layer_norm(x, weight, bias, eps=1e-5):
    mean = x.mean(-1, keepdim=True)
    var = x.var(-1, unbiased=False, keepdim=True)
    return ((x - mean) / torch.sqrt(var + eps)) * weight + bias

# GELU activation
def gelu(x):
    return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))

# Split heads function
def split_heads(x, num_heads=12, head_dim=64):
    return x.view(seq_len, num_heads, head_dim).transpose(0, 1)

# === STEP 1: Run block 0 manually (from Step 16 in notebook) ===
with torch.no_grad():
    # Get embeddings
    wte = model.wte(input_ids).squeeze(0)
    wpe = model.wpe(torch.arange(seq_len, device=device)).squeeze(0)
    input_tensor = wte + wpe  # [seq_len, hidden_size]
    
    # Block 0 computation (from Step 16)
    # LayerNorm 1
    ln1_weight = model.h[0].ln_1.weight.detach()
    ln1_bias = model.h[0].ln_1.bias.detach()
    x_norm = layer_norm(input_tensor, ln1_weight, ln1_bias)
    
    # QKV
    W_full = model.h[0].attn.c_attn.weight.detach()
    b_full = model.h[0].attn.c_attn.bias.detach()
    qkv = x_norm @ W_full + b_full
    Q, K, V = qkv.split(768, dim=-1)
    
    # Multi-head attention
    num_heads = 12
    head_dim = 64
    Q_heads = split_heads(Q)
    K_heads = split_heads(K)
    V_heads = split_heads(V)
    
    mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=device))
    
    attn_outputs = []
    for i in range(num_heads):
        q, k, v = Q_heads[i], K_heads[i], V_heads[i]
        scores = q @ k.T / (head_dim ** 0.5)
        scores = scores.masked_fill(~mask, float("-inf"))
        weights = F.softmax(scores, dim=-1)
        attn_output = weights @ v
        attn_outputs.append(attn_output)
    
    attn_concat = torch.cat(attn_outputs, dim=-1)
    manual_mixed = model.h[0].attn.c_proj(attn_concat)
    
    # Residual
    x_resid1 = input_tensor + manual_mixed
    
    # LayerNorm 2
    ln2_weight = model.h[0].ln_2.weight.detach()
    ln2_bias = model.h[0].ln_2.bias.detach()
    x_norm2 = layer_norm(x_resid1, ln2_weight, ln2_bias)
    
    # MLP
    mlp_out = model.h[0].mlp(x_norm2)
    
    # Final residual
    block0_manual = x_resid1 + mlp_out

# === Run block 1 manually (using block0_manual as input) ===
with torch.no_grad():
    # LayerNorm 1
    ln1_weight = model.h[1].ln_1.weight.detach()
    ln1_bias = model.h[1].ln_1.bias.detach()
    x_norm = layer_norm(block0_manual, ln1_weight, ln1_bias)
    
    # QKV
    W_full = model.h[1].attn.c_attn.weight.detach()
    b_full = model.h[1].attn.c_attn.bias.detach()
    qkv = x_norm @ W_full + b_full
    Q, K, V = qkv.split(768, dim=-1)
    
    # Multi-head attention
    Q_heads = split_heads(Q)
    K_heads = split_heads(K)
    V_heads = split_heads(V)
    
    attn_outputs = []
    for i in range(num_heads):
        q, k, v = Q_heads[i], K_heads[i], V_heads[i]
        scores = q @ k.T / (head_dim ** 0.5)
        scores = scores.masked_fill(~mask, float("-inf"))
        weights = F.softmax(scores, dim=-1)
        attn_output = weights @ v
        attn_outputs.append(attn_output)
    
    attn_concat = torch.cat(attn_outputs, dim=-1)
    
    # Output projection - manual
    W_proj = model.h[1].attn.c_proj.weight.detach()
    b_proj = model.h[1].attn.c_proj.bias.detach()
    manual_mixed = attn_concat @ W_proj + b_proj
    
    # Residual
    x_resid1 = block0_manual + manual_mixed
    
    # LayerNorm 2
    ln2_weight = model.h[1].ln_2.weight.detach()
    ln2_bias = model.h[1].ln_2.bias.detach()
    x_norm2 = layer_norm(x_resid1, ln2_weight, ln2_bias)
    
    # MLP - manual
    W_fc = model.h[1].mlp.c_fc.weight.detach()
    b_fc = model.h[1].mlp.c_fc.bias.detach()
    W_proj2 = model.h[1].mlp.c_proj.weight.detach()
    b_proj2 = model.h[1].mlp.c_proj.bias.detach()
    
    mlp_hidden = x_norm2 @ W_fc + b_fc
    mlp_hidden = gelu(mlp_hidden)
    mlp_out = mlp_hidden @ W_proj2 + b_proj2
    
    # Final residual
    block1_manual = x_resid1 + mlp_out

# === STEP 2: Hook to capture GPT-2's actual block 2 output ===
gpt2_block2_output = {}

def block2_hook(module, input, output):
    if isinstance(output, tuple):
        gpt2_block2_output["block2"] = output[0].detach().squeeze(0)
    else:
        gpt2_block2_output["block2"] = output.detach().squeeze(0)

hook_handle = model.h[2].register_forward_hook(block2_hook)

with torch.no_grad():
    _ = model(**inputs)

hook_handle.remove()

# === STEP 3: Manual calculation of block 2 (using block1_manual as input) ===
with torch.no_grad():
    # LayerNorm 1
    ln1_weight = model.h[2].ln_1.weight.detach()
    ln1_bias = model.h[2].ln_1.bias.detach()
    x_norm = layer_norm(block1_manual, ln1_weight, ln1_bias)
    
    # QKV
    W_full = model.h[2].attn.c_attn.weight.detach()
    b_full = model.h[2].attn.c_attn.bias.detach()
    qkv = x_norm @ W_full + b_full
    Q, K, V = qkv.split(768, dim=-1)
    
    # Multi-head attention
    Q_heads = split_heads(Q)
    K_heads = split_heads(K)
    V_heads = split_heads(V)
    
    attn_outputs = []
    for i in range(num_heads):
        q, k, v = Q_heads[i], K_heads[i], V_heads[i]
        scores = q @ k.T / (head_dim ** 0.5)
        scores = scores.masked_fill(~mask, float("-inf"))
        weights = F.softmax(scores, dim=-1)
        attn_output = weights @ v
        attn_outputs.append(attn_output)
    
    attn_concat = torch.cat(attn_outputs, dim=-1)
    
    # Output projection - manual
    W_proj = model.h[2].attn.c_proj.weight.detach()
    b_proj = model.h[2].attn.c_proj.bias.detach()
    manual_mixed = attn_concat @ W_proj + b_proj
    
    # Residual
    x_resid1 = block1_manual + manual_mixed
    
    # LayerNorm 2
    ln2_weight = model.h[2].ln_2.weight.detach()
    ln2_bias = model.h[2].ln_2.bias.detach()
    x_norm2 = layer_norm(x_resid1, ln2_weight, ln2_bias)
    
    # MLP - manual
    W_fc = model.h[2].mlp.c_fc.weight.detach()
    b_fc = model.h[2].mlp.c_fc.bias.detach()
    W_proj2 = model.h[2].mlp.c_proj.weight.detach()
    b_proj2 = model.h[2].mlp.c_proj.bias.detach()
    
    mlp_hidden = x_norm2 @ W_fc + b_fc
    mlp_hidden = gelu(mlp_hidden)
    mlp_out = mlp_hidden @ W_proj2 + b_proj2
    
    # Final residual
    block2_manual = x_resid1 + mlp_out

# === STEP 4: Comparison ===
diff = (block2_manual - gpt2_block2_output["block2"]).abs()
print("Block 2 Comparison Results:")
print("=" * 50)
print(f"Max difference: {diff.max().item()}")
print(f"Mean difference: {diff.mean().item()}")
print(f"Allclose (atol=1e-5)? {torch.allclose(block2_manual, gpt2_block2_output['block2'], atol=1e-5)}")
print(f"Allclose (atol=1e-4)? {torch.allclose(block2_manual, gpt2_block2_output['block2'], atol=1e-4)}")

print(f"\nShape check:")
print(f"Manual output shape: {block2_manual.shape}")
print(f"GPT-2 output shape: {gpt2_block2_output['block2'].shape}")

print(f"\nFirst 5 values of manual output:")
print(block2_manual[0, :5])
print(f"\nFirst 5 values of GPT-2 output:")
print(gpt2_block2_output['block2'][0, :5])

Block 2 Comparison Results:
Max difference: 0.000732421875
Mean difference: 1.5148484635574277e-06
Allclose (atol=1e-5)? True
Allclose (atol=1e-4)? True

Shape check:
Manual output shape: torch.Size([7, 768])
GPT-2 output shape: torch.Size([7, 768])

First 5 values of manual output:
tensor([ 1.3505, -0.3250,  0.9175,  0.1337,  1.1187])

First 5 values of GPT-2 output:
tensor([ 1.3505, -0.3250,  0.9175,  0.1337,  1.1187])


## Let's grab GPT2 layer 3 and compare to manual calculations

In [42]:
# === STEP: Hook to capture GPT-2's actual block 3 output ===
gpt2_block3_output = {}

def block3_hook(module, input, output):
    if isinstance(output, tuple):
        gpt2_block3_output["block3"] = output[0].detach().squeeze(0)
    else:
        gpt2_block3_output["block3"] = output.detach().squeeze(0)

hook_handle = model.h[3].register_forward_hook(block3_hook)

with torch.no_grad():
    _ = model(**inputs)

hook_handle.remove()

# === STEP: Manual calculation of block 3 ===
with torch.no_grad():
    # LayerNorm 1
    ln1_weight = model.h[3].ln_1.weight.detach()
    ln1_bias = model.h[3].ln_1.bias.detach()
    x_norm = layer_norm(block2_manual, ln1_weight, ln1_bias)

    # QKV
    W_full = model.h[3].attn.c_attn.weight.detach()
    b_full = model.h[3].attn.c_attn.bias.detach()
    qkv = x_norm @ W_full + b_full
    Q, K, V = qkv.split(768, dim=-1)

    # Multi-head attention
    Q_heads = split_heads(Q)
    K_heads = split_heads(K)
    V_heads = split_heads(V)

    attn_outputs = []
    for i in range(num_heads):
        q, k, v = Q_heads[i], K_heads[i], V_heads[i]
        scores = q @ k.T / (head_dim ** 0.5)
        scores = scores.masked_fill(~mask, float("-inf"))
        weights = F.softmax(scores, dim=-1)
        attn_output = weights @ v
        attn_outputs.append(attn_output)

    attn_concat = torch.cat(attn_outputs, dim=-1)

    # Output projection - manual
    W_proj = model.h[3].attn.c_proj.weight.detach()
    b_proj = model.h[3].attn.c_proj.bias.detach()
    manual_mixed = attn_concat @ W_proj + b_proj

    # Residual
    x_resid1 = block2_manual + manual_mixed

    # LayerNorm 2
    ln2_weight = model.h[3].ln_2.weight.detach()
    ln2_bias = model.h[3].ln_2.bias.detach()
    x_norm2 = layer_norm(x_resid1, ln2_weight, ln2_bias)

    # MLP - manual
    W_fc = model.h[3].mlp.c_fc.weight.detach()
    b_fc = model.h[3].mlp.c_fc.bias.detach()
    W_proj2 = model.h[3].mlp.c_proj.weight.detach()
    b_proj2 = model.h[3].mlp.c_proj.bias.detach()

    mlp_hidden = x_norm2 @ W_fc + b_fc
    mlp_hidden = gelu(mlp_hidden)
    mlp_out = mlp_hidden @ W_proj2 + b_proj2

    # Final residual
    block3_manual = x_resid1 + mlp_out

# === STEP: Comparison ===
diff = (block3_manual - gpt2_block3_output["block3"]).abs()
print("Block 3 Comparison Results:")
print("=" * 50)
print(f"Max difference: {diff.max().item()}")
print(f"Mean difference: {diff.mean().item()}")
print(f"Allclose (atol=1e-5)? {torch.allclose(block3_manual, gpt2_block3_output['block3'], atol=1e-5)}")
print(f"Allclose (atol=1e-4)? {torch.allclose(block3_manual, gpt2_block3_output['block3'], atol=1e-4)}")

print(f"\nShape check:")
print(f"Manual output shape: {block3_manual.shape}")
print(f"GPT-2 output shape: {gpt2_block3_output['block3'].shape}")

print(f"\nFirst 5 values of manual output:")
print(block3_manual[0, :5])
print(f"\nFirst 5 values of GPT-2 output:")
print(gpt2_block3_output['block3'][0, :5])


Block 3 Comparison Results:
Max difference: 0.000732421875
Mean difference: 2.0162131022516405e-06
Allclose (atol=1e-5)? True
Allclose (atol=1e-4)? True

Shape check:
Manual output shape: torch.Size([7, 768])
GPT-2 output shape: torch.Size([7, 768])

First 5 values of manual output:
tensor([ 1.3790, -0.3283,  0.9930,  0.1346,  1.2495])

First 5 values of GPT-2 output:
tensor([ 1.3790, -0.3283,  0.9930,  0.1346,  1.2495])


In [43]:
## ## Let's grab GPT2 layer 4 and compare to manual calculations

In [44]:
# === STEP: Hook to capture GPT-2's actual block 4 output ===
gpt2_block4_output = {}

def block4_hook(module, input, output):
    if isinstance(output, tuple):
        gpt2_block4_output["block4"] = output[0].detach().squeeze(0)
    else:
        gpt2_block4_output["block4"] = output.detach().squeeze(0)

hook_handle = model.h[4].register_forward_hook(block4_hook)

with torch.no_grad():
    _ = model(**inputs)

hook_handle.remove()

# === STEP: Manual calculation of block 4 ===
with torch.no_grad():
    # LayerNorm 1
    ln1_weight = model.h[4].ln_1.weight.detach()
    ln1_bias = model.h[4].ln_1.bias.detach()
    x_norm = layer_norm(block3_manual, ln1_weight, ln1_bias)

    # QKV
    W_full = model.h[4].attn.c_attn.weight.detach()
    b_full = model.h[4].attn.c_attn.bias.detach()
    qkv = x_norm @ W_full + b_full
    Q, K, V = qkv.split(768, dim=-1)

    # Multi-head attention
    Q_heads = split_heads(Q)
    K_heads = split_heads(K)
    V_heads = split_heads(V)

    attn_outputs = []
    for i in range(num_heads):
        q, k, v = Q_heads[i], K_heads[i], V_heads[i]
        scores = q @ k.T / (head_dim ** 0.5)
        scores = scores.masked_fill(~mask, float("-inf"))
        weights = F.softmax(scores, dim=-1)
        attn_output = weights @ v
        attn_outputs.append(attn_output)

    attn_concat = torch.cat(attn_outputs, dim=-1)

    # Output projection - manual
    W_proj = model.h[4].attn.c_proj.weight.detach()
    b_proj = model.h[4].attn.c_proj.bias.detach()
    manual_mixed = attn_concat @ W_proj + b_proj

    # Residual
    x_resid1 = block3_manual + manual_mixed

    # LayerNorm 2
    ln2_weight = model.h[4].ln_2.weight.detach()
    ln2_bias = model.h[4].ln_2.bias.detach()
    x_norm2 = layer_norm(x_resid1, ln2_weight, ln2_bias)

    # MLP - manual
    W_fc = model.h[4].mlp.c_fc.weight.detach()
    b_fc = model.h[4].mlp.c_fc.bias.detach()
    W_proj2 = model.h[4].mlp.c_proj.weight.detach()
    b_proj2 = model.h[4].mlp.c_proj.bias.detach()

    mlp_hidden = x_norm2 @ W_fc + b_fc
    mlp_hidden = gelu(mlp_hidden)
    mlp_out = mlp_hidden @ W_proj2 + b_proj2

    # Final residual
    block4_manual = x_resid1 + mlp_out

# === STEP: Comparison ===
diff = (block4_manual - gpt2_block4_output["block4"]).abs()
print("Block 4 Comparison Results:")
print("=" * 50)
print(f"Max difference: {diff.max().item()}")
print(f"Mean difference: {diff.mean().item()}")
print(f"Allclose (atol=1e-5)? {torch.allclose(block4_manual, gpt2_block4_output['block4'], atol=1e-5)}")
print(f"Allclose (atol=1e-4)? {torch.allclose(block4_manual, gpt2_block4_output['block4'], atol=1e-4)}")

print(f"\nShape check:")
print(f"Manual output shape: {block4_manual.shape}")
print(f"GPT-2 output shape: {gpt2_block4_output['block4'].shape}")

print(f"\nFirst 5 values of manual output:")
print(block4_manual[0, :5])
print(f"\nFirst 5 values of GPT-2 output:")
print(gpt2_block4_output['block4'][0, :5])


Block 4 Comparison Results:
Max difference: 0.000732421875
Mean difference: 2.339834281883668e-06
Allclose (atol=1e-5)? True
Allclose (atol=1e-4)? True

Shape check:
Manual output shape: torch.Size([7, 768])
GPT-2 output shape: torch.Size([7, 768])

First 5 values of manual output:
tensor([ 1.2032, -0.3899,  0.9021,  0.0613,  1.2972])

First 5 values of GPT-2 output:
tensor([ 1.2032, -0.3899,  0.9021,  0.0613,  1.2972])


In [45]:
## Let's grab GPT2 layer 5 and compare to manual calculations

In [46]:
# === STEP: Hook to capture GPT-2's actual block 5 output ===
gpt2_block5_output = {}

def block5_hook(module, input, output):
    if isinstance(output, tuple):
        gpt2_block5_output["block5"] = output[0].detach().squeeze(0)
    else:
        gpt2_block5_output["block5"] = output.detach().squeeze(0)

hook_handle = model.h[5].register_forward_hook(block5_hook)

with torch.no_grad():
    _ = model(**inputs)

hook_handle.remove()

# === STEP: Manual calculation of block 5 ===
with torch.no_grad():
    # LayerNorm 1
    ln1_weight = model.h[5].ln_1.weight.detach()
    ln1_bias = model.h[5].ln_1.bias.detach()
    x_norm = layer_norm(block4_manual, ln1_weight, ln1_bias)

    # QKV
    W_full = model.h[5].attn.c_attn.weight.detach()
    b_full = model.h[5].attn.c_attn.bias.detach()
    qkv = x_norm @ W_full + b_full
    Q, K, V = qkv.split(768, dim=-1)

    # Multi-head attention
    Q_heads = split_heads(Q)
    K_heads = split_heads(K)
    V_heads = split_heads(V)

    attn_outputs = []
    for i in range(num_heads):
        q, k, v = Q_heads[i], K_heads[i], V_heads[i]
        scores = q @ k.T / (head_dim ** 0.5)
        scores = scores.masked_fill(~mask, float("-inf"))
        weights = F.softmax(scores, dim=-1)
        attn_output = weights @ v
        attn_outputs.append(attn_output)

    attn_concat = torch.cat(attn_outputs, dim=-1)

    # Output projection - manual
    W_proj = model.h[5].attn.c_proj.weight.detach()
    b_proj = model.h[5].attn.c_proj.bias.detach()
    manual_mixed = attn_concat @ W_proj + b_proj

    # Residual
    x_resid1 = block4_manual + manual_mixed

    # LayerNorm 2
    ln2_weight = model.h[5].ln_2.weight.detach()
    ln2_bias = model.h[5].ln_2.bias.detach()
    x_norm2 = layer_norm(x_resid1, ln2_weight, ln2_bias)

    # MLP - manual
    W_fc = model.h[5].mlp.c_fc.weight.detach()
    b_fc = model.h[5].mlp.c_fc.bias.detach()
    W_proj2 = model.h[5].mlp.c_proj.weight.detach()
    b_proj2 = model.h[5].mlp.c_proj.bias.detach()

    mlp_hidden = x_norm2 @ W_fc + b_fc
    mlp_hidden = gelu(mlp_hidden)
    mlp_out = mlp_hidden @ W_proj2 + b_proj2

    # Final residual
    block5_manual = x_resid1 + mlp_out

# === STEP: Comparison ===
diff = (block5_manual - gpt2_block5_output["block5"]).abs()
print("Block 5 Comparison Results:")
print("=" * 50)
print(f"Max difference: {diff.max().item()}")
print(f"Mean difference: {diff.mean().item()}")
print(f"Allclose (atol=1e-5)? {torch.allclose(block5_manual, gpt2_block5_output['block5'], atol=1e-5)}")
print(f"Allclose (atol=1e-4)? {torch.allclose(block5_manual, gpt2_block5_output['block5'], atol=1e-4)}")

print(f"\nShape check:")
print(f"Manual output shape: {block5_manual.shape}")
print(f"GPT-2 output shape: {gpt2_block5_output['block5'].shape}")

print(f"\nFirst 5 values of manual output:")
print(block5_manual[0, :5])
print(f"\nFirst 5 values of GPT-2 output:")
print(gpt2_block5_output['block5'][0, :5])


Block 5 Comparison Results:
Max difference: 0.000732421875
Mean difference: 2.6769591841002693e-06
Allclose (atol=1e-5)? False
Allclose (atol=1e-4)? True

Shape check:
Manual output shape: torch.Size([7, 768])
GPT-2 output shape: torch.Size([7, 768])

First 5 values of manual output:
tensor([ 1.1781, -0.5091,  0.9690,  0.0742,  1.5109])

First 5 values of GPT-2 output:
tensor([ 1.1781, -0.5091,  0.9690,  0.0742,  1.5109])


In [47]:
## Layer 6

In [48]:
# === STEP: Hook to capture GPT-2's actual block 6 output ===
gpt2_block6_output = {}

def block6_hook(module, input, output):
    if isinstance(output, tuple):
        gpt2_block6_output["block6"] = output[0].detach().squeeze(0)
    else:
        gpt2_block6_output["block6"] = output.detach().squeeze(0)

hook_handle = model.h[6].register_forward_hook(block6_hook)

with torch.no_grad():
    _ = model(**inputs)

hook_handle.remove()

# === STEP: Manual calculation of block 6 ===
with torch.no_grad():
    # LayerNorm 1
    ln1_weight = model.h[6].ln_1.weight.detach()
    ln1_bias = model.h[6].ln_1.bias.detach()
    x_norm = layer_norm(block5_manual, ln1_weight, ln1_bias)

    # QKV
    W_full = model.h[6].attn.c_attn.weight.detach()
    b_full = model.h[6].attn.c_attn.bias.detach()
    qkv = x_norm @ W_full + b_full
    Q, K, V = qkv.split(768, dim=-1)

    # Multi-head attention
    Q_heads = split_heads(Q)
    K_heads = split_heads(K)
    V_heads = split_heads(V)

    attn_outputs = []
    for i in range(num_heads):
        q, k, v = Q_heads[i], K_heads[i], V_heads[i]
        scores = q @ k.T / (head_dim ** 0.5)
        scores = scores.masked_fill(~mask, float("-inf"))
        weights = F.softmax(scores, dim=-1)
        attn_output = weights @ v
        attn_outputs.append(attn_output)

    attn_concat = torch.cat(attn_outputs, dim=-1)

    # Output projection - manual
    W_proj = model.h[6].attn.c_proj.weight.detach()
    b_proj = model.h[6].attn.c_proj.bias.detach()
    manual_mixed = attn_concat @ W_proj + b_proj

    # Residual
    x_resid1 = block5_manual + manual_mixed

    # LayerNorm 2
    ln2_weight = model.h[6].ln_2.weight.detach()
    ln2_bias = model.h[6].ln_2.bias.detach()
    x_norm2 = layer_norm(x_resid1, ln2_weight, ln2_bias)

    # MLP - manual
    W_fc = model.h[6].mlp.c_fc.weight.detach()
    b_fc = model.h[6].mlp.c_fc.bias.detach()
    W_proj2 = model.h[6].mlp.c_proj.weight.detach()
    b_proj2 = model.h[6].mlp.c_proj.bias.detach()

    mlp_hidden = x_norm2 @ W_fc + b_fc
    mlp_hidden = gelu(mlp_hidden)
    mlp_out = mlp_hidden @ W_proj2 + b_proj2

    # Final residual
    block6_manual = x_resid1 + mlp_out

# === STEP: Comparison ===
diff = (block6_manual - gpt2_block6_output["block6"]).abs()
print("Block 6 Comparison Results:")
print("=" * 50)
print(f"Max difference: {diff.max().item()}")
print(f"Mean difference: {diff.mean().item()}")
print(f"Allclose (atol=1e-5)? {torch.allclose(block6_manual, gpt2_block6_output['block6'], atol=1e-5)}")
print(f"Allclose (atol=1e-4)? {torch.allclose(block6_manual, gpt2_block6_output['block6'], atol=1e-4)}")

print(f"\nShape check:")
print(f"Manual output shape: {block6_manual.shape}")
print(f"GPT-2 output shape: {gpt2_block6_output['block6'].shape}")

print(f"\nFirst 5 values of manual output:")
print(block6_manual[0, :5])
print(f"\nFirst 5 values of GPT-2 output:")
print(gpt2_block6_output['block6'][0, :5])


Block 6 Comparison Results:
Max difference: 0.000732421875
Mean difference: 3.019989435415482e-06
Allclose (atol=1e-5)? False
Allclose (atol=1e-4)? True

Shape check:
Manual output shape: torch.Size([7, 768])
GPT-2 output shape: torch.Size([7, 768])

First 5 values of manual output:
tensor([ 1.1117, -0.5070,  0.9983,  0.0067,  1.6709])

First 5 values of GPT-2 output:
tensor([ 1.1117, -0.5070,  0.9983,  0.0067,  1.6709])


In [49]:
# === STEP: Hook to capture GPT-2's actual block 7 output ===
gpt2_block7_output = {}

def block7_hook(module, input, output):
    if isinstance(output, tuple):
        gpt2_block7_output["block7"] = output[0].detach().squeeze(0)
    else:
        gpt2_block7_output["block7"] = output.detach().squeeze(0)

hook_handle = model.h[7].register_forward_hook(block7_hook)

with torch.no_grad():
    _ = model(**inputs)

hook_handle.remove()

# === STEP: Manual calculation of block 7 ===
with torch.no_grad():
    # LayerNorm 1
    ln1_weight = model.h[7].ln_1.weight.detach()
    ln1_bias = model.h[7].ln_1.bias.detach()
    x_norm = layer_norm(block6_manual, ln1_weight, ln1_bias)

    # QKV
    W_full = model.h[7].attn.c_attn.weight.detach()
    b_full = model.h[7].attn.c_attn.bias.detach()
    qkv = x_norm @ W_full + b_full
    Q, K, V = qkv.split(768, dim=-1)

    # Multi-head attention
    Q_heads = split_heads(Q)
    K_heads = split_heads(K)
    V_heads = split_heads(V)

    attn_outputs = []
    for i in range(num_heads):
        q, k, v = Q_heads[i], K_heads[i], V_heads[i]
        scores = q @ k.T / (head_dim ** 0.5)
        scores = scores.masked_fill(~mask, float("-inf"))
        weights = F.softmax(scores, dim=-1)
        attn_output = weights @ v
        attn_outputs.append(attn_output)

    attn_concat = torch.cat(attn_outputs, dim=-1)

    # Output projection - manual
    W_proj = model.h[7].attn.c_proj.weight.detach()
    b_proj = model.h[7].attn.c_proj.bias.detach()
    manual_mixed = attn_concat @ W_proj + b_proj

    # Residual
    x_resid1 = block6_manual + manual_mixed

    # LayerNorm 2
    ln2_weight = model.h[7].ln_2.weight.detach()
    ln2_bias = model.h[7].ln_2.bias.detach()
    x_norm2 = layer_norm(x_resid1, ln2_weight, ln2_bias)

    # MLP - manual
    W_fc = model.h[7].mlp.c_fc.weight.detach()
    b_fc = model.h[7].mlp.c_fc.bias.detach()
    W_proj2 = model.h[7].mlp.c_proj.weight.detach()
    b_proj2 = model.h[7].mlp.c_proj.bias.detach()

    mlp_hidden = x_norm2 @ W_fc + b_fc
    mlp_hidden = gelu(mlp_hidden)
    mlp_out = mlp_hidden @ W_proj2 + b_proj2

    # Final residual
    block7_manual = x_resid1 + mlp_out

# === STEP: Comparison ===
diff = (block7_manual - gpt2_block7_output["block7"]).abs()
print("Block 7 Comparison Results:")
print("=" * 50)
print(f"Max difference: {diff.max().item()}")
print(f"Mean difference: {diff.mean().item()}")
print(f"Allclose (atol=1e-5)? {torch.allclose(block7_manual, gpt2_block7_output['block7'], atol=1e-5)}")
print(f"Allclose (atol=1e-4)? {torch.allclose(block7_manual, gpt2_block7_output['block7'], atol=1e-4)}")

print(f"\nShape check:")
print(f"Manual output shape: {block7_manual.shape}")
print(f"GPT-2 output shape: {gpt2_block7_output['block7'].shape}")

print(f"\nFirst 5 values of manual output:")
print(block7_manual[0, :5])
print(f"\nFirst 5 values of GPT-2 output:")
print(gpt2_block7_output['block7'][0, :5])


Block 7 Comparison Results:
Max difference: 0.000732421875
Mean difference: 3.427066076255869e-06
Allclose (atol=1e-5)? False
Allclose (atol=1e-4)? True

Shape check:
Manual output shape: torch.Size([7, 768])
GPT-2 output shape: torch.Size([7, 768])

First 5 values of manual output:
tensor([ 1.0447, -0.3819,  0.8458, -0.1170,  1.7977])

First 5 values of GPT-2 output:
tensor([ 1.0447, -0.3819,  0.8458, -0.1170,  1.7977])


In [50]:
# === STEP: Hook to capture GPT-2's actual block 8 output ===
gpt2_block8_output = {}

def block8_hook(module, input, output):
    if isinstance(output, tuple):
        gpt2_block8_output["block8"] = output[0].detach().squeeze(0)
    else:
        gpt2_block8_output["block8"] = output.detach().squeeze(0)

hook_handle = model.h[8].register_forward_hook(block8_hook)

with torch.no_grad():
    _ = model(**inputs)

hook_handle.remove()

# === STEP: Manual calculation of block 8 ===
with torch.no_grad():
    # LayerNorm 1
    ln1_weight = model.h[8].ln_1.weight.detach()
    ln1_bias = model.h[8].ln_1.bias.detach()
    x_norm = layer_norm(block7_manual, ln1_weight, ln1_bias)

    # QKV
    W_full = model.h[8].attn.c_attn.weight.detach()
    b_full = model.h[8].attn.c_attn.bias.detach()
    qkv = x_norm @ W_full + b_full
    Q, K, V = qkv.split(768, dim=-1)

    # Multi-head attention
    Q_heads = split_heads(Q)
    K_heads = split_heads(K)
    V_heads = split_heads(V)

    attn_outputs = []
    for i in range(num_heads):
        q, k, v = Q_heads[i], K_heads[i], V_heads[i]
        scores = q @ k.T / (head_dim ** 0.5)
        scores = scores.masked_fill(~mask, float("-inf"))
        weights = F.softmax(scores, dim=-1)
        attn_output = weights @ v
        attn_outputs.append(attn_output)

    attn_concat = torch.cat(attn_outputs, dim=-1)

    # Output projection - manual
    W_proj = model.h[8].attn.c_proj.weight.detach()
    b_proj = model.h[8].attn.c_proj.bias.detach()
    manual_mixed = attn_concat @ W_proj + b_proj

    # Residual
    x_resid1 = block7_manual + manual_mixed

    # LayerNorm 2
    ln2_weight = model.h[8].ln_2.weight.detach()
    ln2_bias = model.h[8].ln_2.bias.detach()
    x_norm2 = layer_norm(x_resid1, ln2_weight, ln2_bias)

    # MLP - manual
    W_fc = model.h[8].mlp.c_fc.weight.detach()
    b_fc = model.h[8].mlp.c_fc.bias.detach()
    W_proj2 = model.h[8].mlp.c_proj.weight.detach()
    b_proj2 = model.h[8].mlp.c_proj.bias.detach()

    mlp_hidden = x_norm2 @ W_fc + b_fc
    mlp_hidden = gelu(mlp_hidden)
    mlp_out = mlp_hidden @ W_proj2 + b_proj2

    # Final residual
    block8_manual = x_resid1 + mlp_out

# === STEP: Comparison ===
diff = (block8_manual - gpt2_block8_output["block8"]).abs()
print("Block 8 Comparison Results:")
print("=" * 50)
print(f"Max difference: {diff.max().item()}")
print(f"Mean difference: {diff.mean().item()}")
print(f"Allclose (atol=1e-5)? {torch.allclose(block8_manual, gpt2_block8_output['block8'], atol=1e-5)}")
print(f"Allclose (atol=1e-4)? {torch.allclose(block8_manual, gpt2_block8_output['block8'], atol=1e-4)}")

print(f"\nShape check:")
print(f"Manual output shape: {block8_manual.shape}")
print(f"GPT-2 output shape: {gpt2_block8_output['block8'].shape}")

print(f"\nFirst 5 values of manual output:")
print(block8_manual[0, :5])
print(f"\nFirst 5 values of GPT-2 output:")
print(gpt2_block8_output['block8'][0, :5])


Block 8 Comparison Results:
Max difference: 0.000732421875
Mean difference: 4.001936758868396e-06
Allclose (atol=1e-5)? False
Allclose (atol=1e-4)? True

Shape check:
Manual output shape: torch.Size([7, 768])
GPT-2 output shape: torch.Size([7, 768])

First 5 values of manual output:
tensor([ 1.1141, -0.3506,  0.8414, -0.2093,  1.7509])

First 5 values of GPT-2 output:
tensor([ 1.1141, -0.3506,  0.8414, -0.2093,  1.7509])


In [51]:
## Block 9

In [52]:
# === STEP: Hook to capture GPT-2's actual block 9 output ===
gpt2_block9_output = {}

def block9_hook(module, input, output):
    if isinstance(output, tuple):
        gpt2_block9_output["block9"] = output[0].detach().squeeze(0)
    else:
        gpt2_block9_output["block9"] = output.detach().squeeze(0)

hook_handle = model.h[9].register_forward_hook(block9_hook)

with torch.no_grad():
    _ = model(**inputs)

hook_handle.remove()

# === STEP: Manual calculation of block 9 ===
with torch.no_grad():
    # LayerNorm 1
    ln1_weight = model.h[9].ln_1.weight.detach()
    ln1_bias = model.h[9].ln_1.bias.detach()
    x_norm = layer_norm(block8_manual, ln1_weight, ln1_bias)

    # QKV
    W_full = model.h[9].attn.c_attn.weight.detach()
    b_full = model.h[9].attn.c_attn.bias.detach()
    qkv = x_norm @ W_full + b_full
    Q, K, V = qkv.split(768, dim=-1)

    # Multi-head attention
    Q_heads = split_heads(Q)
    K_heads = split_heads(K)
    V_heads = split_heads(V)

    attn_outputs = []
    for i in range(num_heads):
        q, k, v = Q_heads[i], K_heads[i], V_heads[i]
        scores = q @ k.T / (head_dim ** 0.5)
        scores = scores.masked_fill(~mask, float("-inf"))
        weights = F.softmax(scores, dim=-1)
        attn_output = weights @ v
        attn_outputs.append(attn_output)

    attn_concat = torch.cat(attn_outputs, dim=-1)

    # Output projection - manual
    W_proj = model.h[9].attn.c_proj.weight.detach()
    b_proj = model.h[9].attn.c_proj.bias.detach()
    manual_mixed = attn_concat @ W_proj + b_proj

    # Residual
    x_resid1 = block8_manual + manual_mixed

    # LayerNorm 2
    ln2_weight = model.h[9].ln_2.weight.detach()
    ln2_bias = model.h[9].ln_2.bias.detach()
    x_norm2 = layer_norm(x_resid1, ln2_weight, ln2_bias)

    # MLP - manual
    W_fc = model.h[9].mlp.c_fc.weight.detach()
    b_fc = model.h[9].mlp.c_fc.bias.detach()
    W_proj2 = model.h[9].mlp.c_proj.weight.detach()
    b_proj2 = model.h[9].mlp.c_proj.bias.detach()

    mlp_hidden = x_norm2 @ W_fc + b_fc
    mlp_hidden = gelu(mlp_hidden)
    mlp_out = mlp_hidden @ W_proj2 + b_proj2

    # Final residual
    block9_manual = x_resid1 + mlp_out

# === STEP: Comparison ===
diff = (block9_manual - gpt2_block9_output["block9"]).abs()
print("Block 9 Comparison Results:")
print("=" * 50)
print(f"Max difference: {diff.max().item()}")
print(f"Mean difference: {diff.mean().item()}")
print(f"Allclose (atol=1e-5)? {torch.allclose(block9_manual, gpt2_block9_output['block9'], atol=1e-5)}")
print(f"Allclose (atol=1e-4)? {torch.allclose(block9_manual, gpt2_block9_output['block9'], atol=1e-4)}")

print(f"\nShape check:")
print(f"Manual output shape: {block9_manual.shape}")
print(f"GPT-2 output shape: {gpt2_block9_output['block9'].shape}")

print(f"\nFirst 5 values of manual output:")
print(block9_manual[0, :5])
print(f"\nFirst 5 values of GPT-2 output:")
print(gpt2_block9_output['block9'][0, :5])


Block 9 Comparison Results:
Max difference: 0.000732421875
Mean difference: 4.653896667150548e-06
Allclose (atol=1e-5)? False
Allclose (atol=1e-4)? True

Shape check:
Manual output shape: torch.Size([7, 768])
GPT-2 output shape: torch.Size([7, 768])

First 5 values of manual output:
tensor([ 1.0533, -0.2555,  0.5720, -0.3252,  1.8056])

First 5 values of GPT-2 output:
tensor([ 1.0533, -0.2555,  0.5720, -0.3252,  1.8056])


In [53]:
# === STEP: Hook to capture GPT-2's actual block 10 output ===
gpt2_block10_output = {}

def block10_hook(module, input, output):
    if isinstance(output, tuple):
        gpt2_block10_output["block10"] = output[0].detach().squeeze(0)
    else:
        gpt2_block10_output["block10"] = output.detach().squeeze(0)

hook_handle = model.h[10].register_forward_hook(block10_hook)

with torch.no_grad():
    _ = model(**inputs)

hook_handle.remove()

# === STEP: Manual calculation of block 10 ===
with torch.no_grad():
    # LayerNorm 1
    ln1_weight = model.h[10].ln_1.weight.detach()
    ln1_bias = model.h[10].ln_1.bias.detach()
    x_norm = layer_norm(block9_manual, ln1_weight, ln1_bias)

    # QKV
    W_full = model.h[10].attn.c_attn.weight.detach()
    b_full = model.h[10].attn.c_attn.bias.detach()
    qkv = x_norm @ W_full + b_full
    Q, K, V = qkv.split(768, dim=-1)

    # Multi-head attention
    Q_heads = split_heads(Q)
    K_heads = split_heads(K)
    V_heads = split_heads(V)

    attn_outputs = []
    for i in range(num_heads):
        q, k, v = Q_heads[i], K_heads[i], V_heads[i]
        scores = q @ k.T / (head_dim ** 0.5)
        scores = scores.masked_fill(~mask, float("-inf"))
        weights = F.softmax(scores, dim=-1)
        attn_output = weights @ v
        attn_outputs.append(attn_output)

    attn_concat = torch.cat(attn_outputs, dim=-1)

    # Output projection - manual
    W_proj = model.h[10].attn.c_proj.weight.detach()
    b_proj = model.h[10].attn.c_proj.bias.detach()
    manual_mixed = attn_concat @ W_proj + b_proj

    # Residual
    x_resid1 = block9_manual + manual_mixed

    # LayerNorm 2
    ln2_weight = model.h[10].ln_2.weight.detach()
    ln2_bias = model.h[10].ln_2.bias.detach()
    x_norm2 = layer_norm(x_resid1, ln2_weight, ln2_bias)

    # MLP - manual
    W_fc = model.h[10].mlp.c_fc.weight.detach()
    b_fc = model.h[10].mlp.c_fc.bias.detach()
    W_proj2 = model.h[10].mlp.c_proj.weight.detach()
    b_proj2 = model.h[10].mlp.c_proj.bias.detach()

    mlp_hidden = x_norm2 @ W_fc + b_fc
    mlp_hidden = gelu(mlp_hidden)
    mlp_out = mlp_hidden @ W_proj2 + b_proj2

    # Final residual
    block10_manual = x_resid1 + mlp_out

# === STEP: Comparison ===
diff = (block10_manual - gpt2_block10_output["block10"]).abs()
print("Block 10 Comparison Results:")
print("=" * 50)
print(f"Max difference: {diff.max().item()}")
print(f"Mean difference: {diff.mean().item()}")
print(f"Allclose (atol=1e-5)? {torch.allclose(block10_manual, gpt2_block10_output['block10'], atol=1e-5)}")
print(f"Allclose (atol=1e-4)? {torch.allclose(block10_manual, gpt2_block10_output['block10'], atol=1e-4)}")

print(f"\nShape check:")
print(f"Manual output shape: {block10_manual.shape}")
print(f"GPT-2 output shape: {gpt2_block10_output['block10'].shape}")

print(f"\nFirst 5 values of manual output:")
print(block10_manual[0, :5])
print(f"\nFirst 5 values of GPT-2 output:")
print(gpt2_block10_output['block10'][0, :5])


Block 10 Comparison Results:
Max difference: 0.00048828125
Mean difference: 5.560018962569302e-06
Allclose (atol=1e-5)? False
Allclose (atol=1e-4)? True

Shape check:
Manual output shape: torch.Size([7, 768])
GPT-2 output shape: torch.Size([7, 768])

First 5 values of manual output:
tensor([ 0.9528, -0.0858,  0.3158, -0.3833,  1.8234])

First 5 values of GPT-2 output:
tensor([ 0.9528, -0.0858,  0.3158, -0.3833,  1.8234])


In [54]:
# === STEP: Hook to capture GPT-2's actual block 11 output ===
gpt2_block11_output = {}

def block11_hook(module, input, output):
    if isinstance(output, tuple):
        gpt2_block11_output["block11"] = output[0].detach().squeeze(0)
    else:
        gpt2_block11_output["block11"] = output.detach().squeeze(0)

hook_handle = model.h[11].register_forward_hook(block11_hook)

with torch.no_grad():
    _ = model(**inputs)

hook_handle.remove()

# === STEP: Manual calculation of block 11 ===
with torch.no_grad():
    # LayerNorm 1
    ln1_weight = model.h[11].ln_1.weight.detach()
    ln1_bias = model.h[11].ln_1.bias.detach()
    x_norm = layer_norm(block10_manual, ln1_weight, ln1_bias)

    # QKV
    W_full = model.h[11].attn.c_attn.weight.detach()
    b_full = model.h[11].attn.c_attn.bias.detach()
    qkv = x_norm @ W_full + b_full
    Q, K, V = qkv.split(768, dim=-1)

    # Multi-head attention
    Q_heads = split_heads(Q)
    K_heads = split_heads(K)
    V_heads = split_heads(V)

    attn_outputs = []
    for i in range(num_heads):
        q, k, v = Q_heads[i], K_heads[i], V_heads[i]
        scores = q @ k.T / (head_dim ** 0.5)
        scores = scores.masked_fill(~mask, float("-inf"))
        weights = F.softmax(scores, dim=-1)
        attn_output = weights @ v
        attn_outputs.append(attn_output)

    attn_concat = torch.cat(attn_outputs, dim=-1)

    # Output projection - manual
    W_proj = model.h[11].attn.c_proj.weight.detach()
    b_proj = model.h[11].attn.c_proj.bias.detach()
    manual_mixed = attn_concat @ W_proj + b_proj

    # Residual
    x_resid1 = block10_manual + manual_mixed

    # LayerNorm 2
    ln2_weight = model.h[11].ln_2.weight.detach()
    ln2_bias = model.h[11].ln_2.bias.detach()
    x_norm2 = layer_norm(x_resid1, ln2_weight, ln2_bias)

    # MLP - manual
    W_fc = model.h[11].mlp.c_fc.weight.detach()
    b_fc = model.h[11].mlp.c_fc.bias.detach()
    W_proj2 = model.h[11].mlp.c_proj.weight.detach()
    b_proj2 = model.h[11].mlp.c_proj.bias.detach()

    mlp_hidden = x_norm2 @ W_fc + b_fc
    mlp_hidden = gelu(mlp_hidden)
    mlp_out = mlp_hidden @ W_proj2 + b_proj2

    # Final residual
    block11_manual = x_resid1 + mlp_out

# === STEP: Comparison ===
diff = (block11_manual - gpt2_block11_output["block11"]).abs()
print("Block 11 Comparison Results:")
print("=" * 50)
print(f"Max difference: {diff.max().item()}")
print(f"Mean difference: {diff.mean().item()}")
print(f"Allclose (atol=1e-5)? {torch.allclose(block11_manual, gpt2_block11_output['block11'], atol=1e-5)}")
print(f"Allclose (atol=1e-4)? {torch.allclose(block11_manual, gpt2_block11_output['block11'], atol=1e-4)}")

print(f"\nShape check:")
print(f"Manual output shape: {block11_manual.shape}")
print(f"GPT-2 output shape: {gpt2_block11_output['block11'].shape}")

print(f"\nFirst 5 values of manual output:")
print(block11_manual[0, :5])
print(f"\nFirst 5 values of GPT-2 output:")
print(gpt2_block11_output['block11'][0, :5])


Block 11 Comparison Results:
Max difference: 0.00048828125
Mean difference: 7.397434274025727e-06
Allclose (atol=1e-5)? False
Allclose (atol=1e-4)? True

Shape check:
Manual output shape: torch.Size([7, 768])
GPT-2 output shape: torch.Size([7, 768])

First 5 values of manual output:
tensor([ 1.1101,  0.3077, -0.6339,  0.5480,  1.2614])

First 5 values of GPT-2 output:
tensor([ 1.1101,  0.3077, -0.6339,  0.5480,  1.2614])


In [55]:
## Final layer norm

In [56]:
# === STEP: Hook to capture GPT-2's final ln_f output ===
gpt2_ln_f_output = {}

def ln_f_hook(module, input, output):
    gpt2_ln_f_output["ln_f"] = output.detach().squeeze(0)

hook_handle = model.ln_f.register_forward_hook(ln_f_hook)

with torch.no_grad():
    _ = model(**inputs)

hook_handle.remove()

# === STEP: Manual calculation of ln_f ===
ln_f_weight = model.ln_f.weight.detach()
ln_f_bias = model.ln_f.bias.detach()

with torch.no_grad():
    final_output_manual = layer_norm(block11_manual, ln_f_weight, ln_f_bias)

# === STEP: Comparison ===
diff = (final_output_manual - gpt2_ln_f_output["ln_f"]).abs()
print("Final LayerNorm (ln_f) Comparison Results:")
print("=" * 50)
print(f"Max difference: {diff.max().item()}")
print(f"Mean difference: {diff.mean().item()}")
print(f"Allclose (atol=1e-5)? {torch.allclose(final_output_manual, gpt2_ln_f_output['ln_f'], atol=1e-5)}")
print(f"Allclose (atol=1e-4)? {torch.allclose(final_output_manual, gpt2_ln_f_output['ln_f'], atol=1e-4)}")

print(f"\nShape check:")
print(f"Manual output shape: {final_output_manual.shape}")
print(f"GPT-2 output shape: {gpt2_ln_f_output['ln_f'].shape}")

print(f"\nFirst 5 values of manual output:")
print(final_output_manual[0, :5])
print(f"\nFirst 5 values of GPT-2 output:")
print(gpt2_ln_f_output['ln_f'][0, :5])


Final LayerNorm (ln_f) Comparison Results:
Max difference: 0.0002899169921875
Mean difference: 1.122443222811853e-06
Allclose (atol=1e-5)? False
Allclose (atol=1e-4)? True

Shape check:
Manual output shape: torch.Size([7, 768])
GPT-2 output shape: torch.Size([7, 768])

First 5 values of manual output:
tensor([ 0.0603,  0.0134, -0.2300,  0.0012,  0.0006])

First 5 values of GPT-2 output:
tensor([ 0.0603,  0.0134, -0.2300,  0.0012,  0.0006])


In [57]:
# === STEP 1: Manually project final hidden states to logits ===
# GPT-2 uses a tied weight matrix: lm_head is just a linear projection
# that shares weights with the token embedding layer (wte)

# Get the token embedding matrix (vocab_size x hidden_dim)
lm_head_weight = model.wte.weight.detach()  # shape: [50257, 768]

# Multiply final output (seq_len x hidden_dim) with transpose of lm_head weights
# This gives logits of shape (seq_len x vocab_size)
with torch.no_grad():
    logits_manual = final_output_manual @ lm_head_weight.T  # [seq_len, vocab_size]

# === STEP 2: Compare against GPT-2's own logits ===
# GPT-2 does not expose logits directly unless using GPT2LMHeadModel
# So we manually reconstruct them here using:
# logits = last_hidden_state @ lm_head_weight.T

with torch.no_grad():
    # Get hidden states from GPT-2
    hidden_states = model(**inputs).last_hidden_state.squeeze(0)  # [seq_len, hidden_dim]

    # Use same projection as manual path
    gpt2_logits = hidden_states @ lm_head_weight.T  # [seq_len, vocab_size]

# === STEP 3: Decode predicted tokens from manual logits ===
# We take the argmax over vocabulary at each position to find most likely token
predicted_ids_manual = torch.argmax(logits_manual, dim=-1)  # [seq_len]

# === STEP 4: Decode predicted tokens from GPT-2 logits for comparison ===
predicted_ids_gpt2 = torch.argmax(gpt2_logits, dim=-1)  # [seq_len]

# === STEP 5: Decode token IDs into strings ===
# This reconstructs the textual output GPT-2 would generate
decoded_manual = tokenizer.decode(predicted_ids_manual)
decoded_gpt2 = tokenizer.decode(predicted_ids_gpt2)

# === STEP 6: Compare results ===
print("Manual Decoded Text:")
print(decoded_manual)

print("\nGPT-2 Decoded Text:")
print(decoded_gpt2)

print("\n--- Token Match ---")
print("Token IDs match?", torch.equal(predicted_ids_manual, predicted_ids_gpt2))
print("Text match?", decoded_manual == decoded_gpt2)


Manual Decoded Text:
.source codeVM are.


GPT-2 Decoded Text:
.source codeVM are.


--- Token Match ---
Token IDs match? True
Text match? True
