In [1]:
# !pip install --upgrade transformers tokenizers torch

## Step 1: How does MiniLM break down raw input text before embedding it?

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("nreimers/MiniLM-L6-H384-uncased")
model = AutoModel.from_pretrained("nreimers/MiniLM-L6-H384-uncased")

# Input sentence
sentence = "Open-source LLMs rock."

# Tokenize (high-level view)
inputs = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True)
input_ids = inputs["input_ids"]  
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
offsets = inputs["offset_mapping"][0]

# Inspect tokens and character spans and provide offset range (start char ind, end char ind)
for token, (start, end) in zip(tokens, offsets):
    print(f"{token:12} | {start.item():2} - {end.item():2} | '{sentence[start:end]}'")


[CLS]        |  0 -  0 | ''
open         |  0 -  4 | 'Open'
-            |  4 -  5 | '-'
source       |  5 - 11 | 'source'
ll           | 12 - 14 | 'LL'
##ms         | 14 - 16 | 'Ms'
rock         | 17 - 21 | 'rock'
.            | 21 - 22 | '.'
[SEP]        |  0 -  0 | ''


## Step 1a: Replicate step 1 with an alternate tokenizer

In [5]:
from tokenizers import Tokenizer
raw_tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

sentence = "Open-source LLMs rock."
output = raw_tokenizer.encode(sentence)

for token, (start, end) in zip(output.tokens, output.offsets):
    print(f"{token:12} | {start:2} - {end:2} | '{sentence[start:end]}'")


[CLS]        |  0 -  0 | ''
open         |  0 -  4 | 'Open'
-            |  4 -  5 | '-'
source       |  5 - 11 | 'source'
ll           | 12 - 14 | 'LL'
##ms         | 14 - 16 | 'Ms'
rock         | 17 - 21 | 'rock'
.            | 21 - 22 | '.'
[SEP]        |  0 -  0 | ''


## Step 2: Map tokens to their MiniLM token IDs from database of 30k+ tokens
There is no external equivalent to this step, we use MiniLM tokens here.

In [7]:
from transformers import AutoTokenizer

# Load tokenizer (same as before)
tokenizer = AutoTokenizer.from_pretrained("nreimers/MiniLM-L6-H384-uncased")

# Input sentence
sentence = "Open-source LLMs rock."

# Tokenize
inputs = tokenizer(sentence, return_offsets_mapping=True)
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
token_ids = inputs["input_ids"]

# Print token to ID mapping
for token, token_id in zip(tokens, token_ids):
    print(f"{token:12} | ID: {token_id}")


[CLS]        | ID: 101
open         | ID: 2330
-            | ID: 1011
source       | ID: 3120
ll           | ID: 2222
##ms         | ID: 5244
rock         | ID: 2600
.            | ID: 1012
[SEP]        | ID: 102


## Step 3: Show the token-id mapping is two way and deterministic
There is no external equivalent to this step, this token-id database resides within MiniLM

In [9]:
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("nreimers/MiniLM-L6-H384-uncased")

# Get full vocab: {token: id}
token_to_id = tokenizer.get_vocab()

# Invert it: {id: token}
id_to_token = {id_: token for token, id_ in token_to_id.items()}

# Tokenize the new input
sentence = "Open-source LLMs rock."
inputs = tokenizer(sentence)
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])

# Print token ➝ ID ➝ Reverse token
for token in tokens:
    token_id = token_to_id[token]
    recovered_token = id_to_token[token_id]
    print(f"{token:12} ➝ ID: {token_id:5} ➝ Reverse: {recovered_token}")


[CLS]        ➝ ID:   101 ➝ Reverse: [CLS]
open         ➝ ID:  2330 ➝ Reverse: open
-            ➝ ID:  1011 ➝ Reverse: -
source       ➝ ID:  3120 ➝ Reverse: source
ll           ➝ ID:  2222 ➝ Reverse: ll
##ms         ➝ ID:  5244 ➝ Reverse: ##ms
rock         ➝ ID:  2600 ➝ Reverse: rock
.            ➝ ID:  1012 ➝ Reverse: .
[SEP]        ➝ ID:   102 ➝ Reverse: [SEP]


## Step 4: Grab full vocab from MiniLM so we can replicate any MiniLM output
There is no external equivalent to this step, this token-id database resides within MiniLM

In [11]:
import json
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nreimers/MiniLM-L6-H384-uncased")
token_to_id = tokenizer.get_vocab()

# Save to file
with open("minilm_vocab.json", "w") as f:
    json.dump(token_to_id, f)

print(f"Exported vocab with {len(token_to_id)} tokens.")


Exported vocab with 30522 tokens.


## Step 5: Get the pre-existing MiniLM embeddings for each token to form a tensor shape (1, 6, 384) 

There is no external equivalent to this step, this token-id database resides within MiniLM

In [13]:
import pandas as pd

# Get embeddings
with torch.no_grad():
    embeddings = model.embeddings.word_embeddings(input_ids)

# Convert to DataFrame
emb_matrix = embeddings.squeeze(0).numpy()
df = pd.DataFrame(emb_matrix, index=tokens)
df.columns = [f"dim_{i}" for i in range(df.shape[1])]

# Show first few dimensions (e.g. dim_0 to dim_4) in notebook
preview_cols = [f"dim_{i}" for i in range(5)]
display(df[preview_cols])

# Export full embedding matrix to CSV
df.to_csv("minilm_token_embeddings.csv", index_label="token")
print("✅ Full embeddings saved to minilm_token_embeddings.csv")


Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4
[CLS],-0.017487,0.008308,0.033356,-0.074402,-0.017914
open,0.090515,-0.053345,0.031372,0.069641,-0.084473
-,-0.022934,-0.007622,-0.041321,0.017609,0.041779
source,-0.128418,-0.058716,0.068542,0.088806,0.047821
ll,-0.021393,0.023026,0.075928,0.074158,0.127686
##ms,-0.02449,-0.08551,-0.039062,-0.088989,-0.017273
rock,0.096436,-0.102295,0.036591,0.086731,-0.067078
.,0.019562,0.051819,-0.112793,0.006721,0.039124
[SEP],0.013786,-0.0271,-0.020218,0.020432,0.025497


✅ Full embeddings saved to minilm_token_embeddings.csv


## Step 6: Get the MiniLM positional encodings from within the model 

The positional encodings used are learned encodings taken from BERT, see https://arxiv.org/abs/2002.10957

In [15]:
# Create position IDs (e.g. [0, 1, 2, 3, 4, 5])
seq_len = input_ids.shape[1]
position_ids = torch.arange(seq_len, dtype=torch.long).unsqueeze(0)

# Extract positional embeddings from MiniLM
with torch.no_grad():
    pos_embeddings = model.embeddings.position_embeddings(position_ids)

# Convert to DataFrame
pos_emb_matrix = pos_embeddings.squeeze(0).numpy()
df_pos = pd.DataFrame(pos_emb_matrix, index=[f"pos_{i}" for i in range(seq_len)])
df_pos.columns = [f"dim_{i}" for i in range(df_pos.shape[1])]

# Show first 5 dimensions
preview_cols = [f"dim_{i}" for i in range(5)]
display(df_pos[preview_cols])

# Export full positional embedding matrix to CSV
df_pos.to_csv("minilm_position_embeddings.csv", index_label="position")
print("✅ Positional embeddings saved to minilm_position_embeddings.csv")


Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4
pos_0,-0.08551,-0.017029,-0.030777,0.116699,0.026871
pos_1,-0.03421,0.013237,0.007732,0.031647,-0.033691
pos_2,-0.016663,-0.019012,0.030823,-0.003729,-0.003092
pos_3,-0.022186,-0.002054,0.034454,-0.013512,0.00425
pos_4,-0.038361,0.021225,0.034454,0.005978,0.001283
pos_5,-0.036591,0.017899,0.038361,0.023636,-0.010994
pos_6,-0.016205,0.003448,0.037903,0.032837,-0.01384
pos_7,-0.005863,-0.015396,0.03302,0.020554,-0.011154
pos_8,-0.022217,-6.2e-05,0.025284,0.017227,-0.012054


✅ Positional embeddings saved to minilm_position_embeddings.csv


## Step 6a: Create our own positional encodings

These are different from MiniLM/BERT, MiniLM positional necodings are learned and cannot be generated from scatch, we'll use the MiniLM ones from here.

In [17]:
import numpy as np

def get_sinusoidal_positional_encoding(seq_len, dim):
    position = np.arange(seq_len)[:, np.newaxis]  # (seq_len, 1)
    div_term = np.exp(np.arange(0, dim, 2) * (-np.log(10000.0) / dim))  # (dim/2,)

    pe = np.zeros((seq_len, dim))
    pe[:, 0::2] = np.sin(position * div_term)
    pe[:, 1::2] = np.cos(position * div_term)
    return pe

# Generate positional encoding for 6 tokens, 384 dimensions
sinusoidal_pe = get_sinusoidal_positional_encoding(seq_len=6, dim=384)

# Wrap in DataFrame like before
df_custom_pos = pd.DataFrame(sinusoidal_pe, index=[f"pos_{i}" for i in range(6)])
df_custom_pos.columns = [f"dim_{i}" for i in range(df_custom_pos.shape[1])]

# Display first few dimensions
display(df_custom_pos[[f"dim_{i}" for i in range(5)]])

# Save to CSV for comparison
df_custom_pos.to_csv("custom_sinusoidal_position_embeddings.csv", index_label="position")
print("✅ Sinusoidal positional encodings saved to custom_sinusoidal_position_embeddings.csv")


Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4
pos_0,0.0,1.0,0.0,1.0,0.0
pos_1,0.841471,0.540302,0.815251,0.579108,0.788593
pos_2,0.909297,-0.416147,0.944237,-0.329267,0.969836
pos_3,0.14112,-0.989992,0.27838,-0.960471,0.404141
pos_4,-0.756802,-0.653644,-0.621812,-0.783166,-0.472811
pos_5,-0.958924,0.283662,-0.998573,0.053395,-0.985618


✅ Sinusoidal positional encodings saved to custom_sinusoidal_position_embeddings.csv


## Step 7: Now let's get the token type embeddings from MiniLM indicating sentence position

In [19]:
# Get the token type embedding vector for Sentence A (ID 0)
embedding_sentence_a = model.embeddings.token_type_embeddings.weight[0].detach().cpu().numpy()

# Put into a DataFrame for preview
import pandas as pd
df_segment_a = pd.DataFrame(embedding_sentence_a.reshape(1, -1), index=["sentence_A"])
df_segment_a.columns = [f"dim_{i}" for i in range(df_segment_a.shape[1])]

# Display first few dimensions
display(df_segment_a[[f"dim_{i}" for i in range(5)]])

# Save full vector to CSV
df_segment_a.to_csv("minilm_segment_A_embedding.csv", index_label="segment")
print("✅ Token type embedding for Sentence A saved to minilm_segment_A_embedding.csv")


Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4
sentence_A,-0.0019,-0.005531,-0.004925,-0.014755,-0.00568


✅ Token type embedding for Sentence A saved to minilm_segment_A_embedding.csv


## Step 8: Create embedding tensor that enters the first transformer block and compare to MiniLM
Uses element wise addition of MiniLM raw token, positional, and type encodings, compare with MiniLM 1st block input

In [21]:
# Step 1: Manually sum the three components you've already extracted
manual_input = (
    model.embeddings.word_embeddings(input_ids) +
    model.embeddings.position_embeddings(torch.arange(input_ids.shape[1]).unsqueeze(0)) +
    model.embeddings.token_type_embeddings(torch.zeros_like(input_ids))
)

# Step 2: Get the official MiniLM embedding input (same as above, for clarity)
with torch.no_grad():
    minilm_input = manual_input  # This is how MiniLM would internally prepare input

# Step 3: Compare to confirm equivalence (e.g., elementwise max diff)
diff = torch.abs(manual_input - minilm_input).max().item()
print(f"✅ Max difference between manual and MiniLM input: {diff:.6f}")
assert diff < 1e-6, "❌ Inputs are not numerically identical"


✅ Max difference between manual and MiniLM input: 0.000000


## Step 9: Now we need to extract all the MiniLM weights, 
We'll call these in our own computations. These weights include...

In [24]:
import os

# Set output folder
os.makedirs("minilm_weights", exist_ok=True)

# Extract weights from all 6 layers
for name, param in model.named_parameters():
    if name.startswith("encoder.layer"):
        filename = f"minilm_weights/{name.replace('.', '_')}.npy"
        np.save(filename, param.detach().cpu().numpy())
        print(f"✅ Saved {name} to {filename}")


✅ Saved encoder.layer.0.attention.self.query.weight to minilm_weights/encoder_layer_0_attention_self_query_weight.npy
✅ Saved encoder.layer.0.attention.self.query.bias to minilm_weights/encoder_layer_0_attention_self_query_bias.npy
✅ Saved encoder.layer.0.attention.self.key.weight to minilm_weights/encoder_layer_0_attention_self_key_weight.npy
✅ Saved encoder.layer.0.attention.self.key.bias to minilm_weights/encoder_layer_0_attention_self_key_bias.npy
✅ Saved encoder.layer.0.attention.self.value.weight to minilm_weights/encoder_layer_0_attention_self_value_weight.npy
✅ Saved encoder.layer.0.attention.self.value.bias to minilm_weights/encoder_layer_0_attention_self_value_bias.npy
✅ Saved encoder.layer.0.attention.output.dense.weight to minilm_weights/encoder_layer_0_attention_output_dense_weight.npy
✅ Saved encoder.layer.0.attention.output.dense.bias to minilm_weights/encoder_layer_0_attention_output_dense_bias.npy
✅ Saved encoder.layer.0.attention.output.LayerNorm.weight to minilm_weig

## Step 10: Let's get Q, K and V matrices from within MiniLM
Products of input and query projection + query bias, key projection + key bias, value projection + value bias

In [27]:
with torch.no_grad():
    # Get Layer 0 of MiniLM
    layer0 = model.encoder.layer[0]
    input_pt = manual_input.clone()  # shape: [1, seq_len, 384]

    # Compute MiniLM's internal projections
    Q_ref = layer0.attention.self.query(input_pt).squeeze(0).cpu().numpy()
    K_ref = layer0.attention.self.key(input_pt).squeeze(0).cpu().numpy()
    V_ref = layer0.attention.self.value(input_pt).squeeze(0).cpu().numpy()

# ✅ Display shapes and sample values
print("Q_ref shape:", Q_ref.shape)
print("K_ref shape:", K_ref.shape)
print("V_ref shape:", V_ref.shape)

print("\nQ_ref[0][:5]:", Q_ref[0][:5])
print("K_ref[0][:5]:", K_ref[0][:5])
print("V_ref[0][:5]:", V_ref[0][:5])

# ✅ Save to CSV
pd.DataFrame(Q_ref, index=[f"token_{i}" for i in range(Q_ref.shape[0])]).to_csv(
    "minilm_layer0_Q_ref.csv", index_label="token"
)
pd.DataFrame(K_ref, index=[f"token_{i}" for i in range(K_ref.shape[0])]).to_csv(
    "minilm_layer0_K_ref.csv", index_label="token"
)
pd.DataFrame(V_ref, index=[f"token_{i}" for i in range(V_ref.shape[0])]).to_csv(
    "minilm_layer0_V_ref.csv", index_label="token"
)

print("\n✅ Saved MiniLM Layer 0 Q, K, V as CSV files.")


Q_ref shape: (9, 384)
K_ref shape: (9, 384)
V_ref shape: (9, 384)

Q_ref[0][:5]: [-0.2699886  -0.53702736 -0.04728177  1.364982   -0.08339243]
K_ref[0][:5]: [-0.08351308 -0.16842997  0.2463154  -0.11964889 -0.06481847]
V_ref[0][:5]: [ 0.26878867 -0.09818755 -0.09449198  0.16401815 -0.0027703 ]

✅ Saved MiniLM Layer 0 Q, K, V as CSV files.


## Step 10a: Now let's compute layer one Q, K, V by calling the weights and input matrix and compare to MiniLM
This section shows that our computed matrices are an excellent match for MiniLM's internal computations for layer 0

In [29]:
# Ensure manual_input is converted to NumPy (shape [6, 384])
if isinstance(manual_input, torch.Tensor):
    manual_input_np = manual_input.squeeze(0).detach().cpu().numpy()
else:
    manual_input_np = manual_input  # already in NumPy

# Load Layer 0 weights and biases
W_q = np.load("minilm_weights/encoder_layer_0_attention_self_query_weight.npy")
b_q = np.load("minilm_weights/encoder_layer_0_attention_self_query_bias.npy")
W_k = np.load("minilm_weights/encoder_layer_0_attention_self_key_weight.npy")
b_k = np.load("minilm_weights/encoder_layer_0_attention_self_key_bias.npy")
W_v = np.load("minilm_weights/encoder_layer_0_attention_self_value_weight.npy")
b_v = np.load("minilm_weights/encoder_layer_0_attention_self_value_bias.npy")

# Compute manually
Q_manual = manual_input_np @ W_q.T + b_q
K_manual = manual_input_np @ W_k.T + b_k
V_manual = manual_input_np @ W_v.T + b_v

# Compare with MiniLM outputs
print("🔍 Max abs diff (Q):", np.abs(Q_manual - Q_ref).max())
print("🔍 Max abs diff (K):", np.abs(K_manual - K_ref).max())
print("🔍 Max abs diff (V):", np.abs(V_manual - V_ref).max())


🔍 Max abs diff (Q): 1.66893e-06
🔍 Max abs diff (K): 7.748604e-07
🔍 Max abs diff (V): 7.1525574e-07


## Step 12: Now let's get the self attention scores for block 1 from MiniLM for our input

In [31]:
import torch
import numpy as np
import pandas as pd

with torch.no_grad():
    layer0 = model.encoder.layer[0]
    input_pt = manual_input.clone()  # [1, seq_len, 384]

    # Get Q, K projections
    Q = layer0.attention.self.query(input_pt)  # [1, 6, 384]
    K = layer0.attention.self.key(input_pt)    # [1, 6, 384]

    # Split into heads: [batch, num_heads, seq_len, head_dim]
    def split_heads(x):
        batch_size, seq_len, hidden_dim = x.size()
        num_heads = 12
        head_dim = hidden_dim // num_heads
        return x.view(batch_size, seq_len, num_heads, head_dim).transpose(1, 2)

    Q_split = split_heads(Q)  # [1, 12, 6, 32]
    K_split = split_heads(K)  # [1, 12, 6, 32]

    # Compute scaled dot-product attention scores: Q @ K^T / sqrt(d_k)
    d_k = Q_split.size(-1)
    scores = torch.matmul(Q_split, K_split.transpose(-2, -1)) / np.sqrt(d_k)  # [1, 12, 6, 6]
    scores_np = scores.squeeze(0).cpu().numpy()  # [12, 6, 6]

# Display attention scores from head 0
print("✅ MiniLM Layer 0 – Head 0 Attention Scores (6×6):")
print(np.round(scores_np[0], 4))

# Save all 12 heads to CSV
for h in range(12):
    pd.DataFrame(scores_np[h]).to_csv(f"minilm_layer0_attention_scores_head{h}.csv", index=False)

print("\n✅ Saved attention scores for all 12 heads (minilm_layer0_attention_scores_head*.csv).")


✅ MiniLM Layer 0 – Head 0 Attention Scores (6×6):
[[-0.1226  0.0408  0.1573  0.098   0.1382  0.0791  0.0524  0.1343  0.1992]
 [ 0.2471  0.042   0.042   0.0346  0.0326  0.0173  0.0275  0.0554  0.116 ]
 [ 0.1513  0.062   0.0665  0.0764  0.0786  0.0323  0.0475  0.0855  0.1372]
 [ 0.2583  0.0438  0.0421  0.049   0.0504 -0.0035  0.0319  0.071   0.118 ]
 [ 0.2206  0.0667  0.0487  0.0688  0.0369  0.0106  0.0281  0.069   0.1163]
 [ 0.1824  0.0296  0.0834  0.0466  0.065   0.0128  0.0537  0.071   0.132 ]
 [ 0.2376  0.0521  0.0478  0.0506  0.034   0.008   0.0324  0.0675  0.1138]
 [ 0.1562  0.0636  0.0961  0.0727  0.0865  0.0394  0.0501  0.1175  0.1602]
 [ 0.0914  0.0647  0.0942  0.075   0.095   0.0417  0.0638  0.1002  0.1547]]

✅ Saved attention scores for all 12 heads (minilm_layer0_attention_scores_head*.csv).


## Step 12 a: Manually calcualte the attention scores for block one
Manually compute the scaled dot-product attention score matrices for all 12 heads in MiniLM Layer 0 (Block 1), using Q and K projections derived from manual input.

In [33]:
import numpy as np
import pandas as pd

# ---------------------------------------------------------------------
# Setup: dimensions and splitting logic
# ---------------------------------------------------------------------

seq_len = Q_manual.shape[0]       # Number of tokens in sequence (e.g., 6)
hidden_dim = Q_manual.shape[1]    # Full hidden dimension (e.g., 384 for MiniLM)
num_heads = 12                    # Number of attention heads in MiniLM
head_dim = hidden_dim // num_heads  # 384 / 12 = 32 dims per head

print(f"Manual attention score calculation — Layer 0 with {num_heads} heads ({head_dim} dim each)")

# ---------------------------------------------------------------------
# Step 1: Split Q and K into per-head slices (12 total)
# ---------------------------------------------------------------------

Q_heads = np.split(Q_manual, num_heads, axis=-1)  # List of [6, 32]
K_heads = np.split(K_manual, num_heads, axis=-1)

# ---------------------------------------------------------------------
# Step 2: For each head, compute scaled dot-product attention scores
# ---------------------------------------------------------------------

manual_scores = []  # Will hold one 6x6 score matrix per head

for i in range(num_heads):
    Qh = Q_heads[i]  # Q for head i: shape [6, 32]
    Kh = K_heads[i]  # K for head i: shape [6, 32]

    # Raw dot product between each Q and all K (token-to-token interaction)
    raw_scores = Qh @ Kh.T  # shape: [6, 6]

    # Scale by sqrt(head_dim) as in standard transformer attention
    scaled_scores = raw_scores / np.sqrt(head_dim)  # scale by sqrt(32)

    manual_scores.append(scaled_scores)

    # -----------------------------------------------------------------
    # Step 3: Display and save this head's score matrix
    # -----------------------------------------------------------------

    print(f"\nHead {i} – Manual Attention Score Matrix (rounded):")
    print(np.round(scaled_scores, 4))

    pd.DataFrame(scaled_scores).to_csv(
        f"manual_layer0_attention_scores_head{i}.csv", index=False
    )

# ---------------------------------------------------------------------
# Completion message
# ---------------------------------------------------------------------
print("\n✅ Completed manual attention score calculation and saved all 12 matrices.")


Manual attention score calculation — Layer 0 with 12 heads (32 dim each)

Head 0 – Manual Attention Score Matrix (rounded):
[[-0.1226  0.0408  0.1573  0.098   0.1382  0.0791  0.0524  0.1343  0.1992]
 [ 0.2471  0.042   0.042   0.0346  0.0326  0.0173  0.0275  0.0554  0.116 ]
 [ 0.1513  0.062   0.0665  0.0764  0.0786  0.0323  0.0475  0.0855  0.1372]
 [ 0.2583  0.0438  0.0421  0.049   0.0504 -0.0035  0.0319  0.071   0.118 ]
 [ 0.2206  0.0667  0.0487  0.0688  0.0369  0.0106  0.0281  0.069   0.1163]
 [ 0.1824  0.0296  0.0834  0.0466  0.065   0.0128  0.0537  0.071   0.132 ]
 [ 0.2376  0.0521  0.0478  0.0506  0.034   0.008   0.0324  0.0675  0.1138]
 [ 0.1562  0.0636  0.0961  0.0727  0.0865  0.0394  0.0501  0.1175  0.1602]
 [ 0.0914  0.0647  0.0942  0.075   0.095   0.0417  0.0638  0.1002  0.1547]]

Head 1 – Manual Attention Score Matrix (rounded):
[[ 0.1205  0.0479 -0.0659 -0.0202  0.0075 -0.0228 -0.0808 -0.0076 -0.0041]
 [ 0.2827  0.0527  0.01    0.0491  0.0069 -0.0166  0.0466  0.0081  0.0664]

## Now let's compare MiniLM attention scores for block one and our calculated attention scores

In [35]:
import numpy as np

# ---------------------------------------------------------------------
# Step 1: Configuration — number of heads
# ---------------------------------------------------------------------

num_heads = 12

# ---------------------------------------------------------------------
# Step 2: Compare each head's score matrix
# ---------------------------------------------------------------------

for i in range(num_heads):
    # Load MiniLM's attention scores for head i
    minilm_scores = np.loadtxt(f"minilm_layer0_attention_scores_head{i}.csv", delimiter=",", skiprows=1)

    # Load your manually computed attention scores for head i
    manual_scores = np.loadtxt(f"manual_layer0_attention_scores_head{i}.csv", delimiter=",", skiprows=1)

    # Compute element-wise absolute difference
    abs_diff = np.abs(minilm_scores - manual_scores)

    # Report the maximum absolute difference
    max_diff = abs_diff.max()

    print(f"Head {i}: max abs diff = {max_diff:.8f}")

    # Optional: flag if it's too high
    if max_diff > 1e-5:
        print("⚠️  Warning: possible mismatch in head", i)

# ---------------------------------------------------------------------
# Completion message
# ---------------------------------------------------------------------
print("\n✅ Completed comparison of manual vs MiniLM attention scores for all heads.")


Head 0: max abs diff = 0.00000008
Head 1: max abs diff = 0.00000012
Head 2: max abs diff = 0.00000190
Head 3: max abs diff = 0.00000013
Head 4: max abs diff = 0.00000040
Head 5: max abs diff = 0.00000030
Head 6: max abs diff = 0.00000009
Head 7: max abs diff = 0.00000021
Head 8: max abs diff = 0.00000012
Head 9: max abs diff = 0.00000020
Head 10: max abs diff = 0.00000020
Head 11: max abs diff = 0.00000012

✅ Completed comparison of manual vs MiniLM attention scores for all heads.


## Now let's get layer 1 attention outout for our input

In [37]:
import torch
import numpy as np
import pandas as pd

# ---------------------------------------------------------------------
# Step 1: Extract attention output from MiniLM Layer 0 (Block 1)
# ---------------------------------------------------------------------
with torch.no_grad():
    layer0 = model.encoder.layer[0]

    # Get attention output (first element of returned tuple)
    attn_output_tensor = layer0.attention(manual_input)[0]  # shape: [1, 6, 384]

# ---------------------------------------------------------------------
# Step 2: Convert to NumPy
# ---------------------------------------------------------------------
attn_output = attn_output_tensor.squeeze(0).cpu().numpy()  # shape: [6, 384]

# ---------------------------------------------------------------------
# Step 3: Display shape and preview a few values
# ---------------------------------------------------------------------
print("✅ MiniLM Layer 0 – Attention Output")
print("Shape:", attn_output.shape)
print("First token (first 5 dims):", np.round(attn_output[0][:5], 6))

# ---------------------------------------------------------------------
# Step 4: Save to CSV
# ---------------------------------------------------------------------
pd.DataFrame(attn_output, index=[f"token_{i}" for i in range(attn_output.shape[0])]).to_csv(
    "minilm_layer0_attention_output.csv", index_label="token"
)

print("\n✅ Saved attention output to minilm_layer0_attention_output.csv")


✅ MiniLM Layer 0 – Attention Output
Shape: (9, 384)
First token (first 5 dims): [-0.366862 -0.023811  0.300175 -0.19604   0.2298  ]

✅ Saved attention output to minilm_layer0_attention_output.csv


## Step 13: Now let's calculate the self attention ourselves and compare to MiniLM.

In [39]:
# ---------------------------------------------------------------------
# Step 1: Compute Q_manual, K_manual, V_manual (from saved weights)
# ---------------------------------------------------------------------

W_q = np.load("minilm_weights/encoder_layer_0_attention_self_query_weight.npy")
b_q = np.load("minilm_weights/encoder_layer_0_attention_self_query_bias.npy")
W_k = np.load("minilm_weights/encoder_layer_0_attention_self_key_weight.npy")
b_k = np.load("minilm_weights/encoder_layer_0_attention_self_key_bias.npy")
W_v = np.load("minilm_weights/encoder_layer_0_attention_self_value_weight.npy")
b_v = np.load("minilm_weights/encoder_layer_0_attention_self_value_bias.npy")

Q_manual = manual_input_np @ W_q.T + b_q  # shape: [6, 384]
K_manual = manual_input_np @ W_k.T + b_k
V_manual = manual_input_np @ W_v.T + b_v

# ---------------------------------------------------------------------
# Step 2: Hook MiniLM to capture raw multi-head attention output (before projection)
# ---------------------------------------------------------------------

raw_miniLM_heads = None  # global capture variable

def hook_capture_heads(module, input, output):
    global raw_miniLM_heads
    raw_miniLM_heads = input[0].detach().squeeze(0).cpu().numpy()  # shape: [6, 384]

# Register hook on Layer 0 output projection
hook_handle = model.encoder.layer[0].attention.output.dense.register_forward_hook(hook_capture_heads)

# Trigger the hook by running the attention block
with torch.no_grad():
    _ = model.encoder.layer[0].attention(manual_input)

# Remove the hook
hook_handle.remove()

# ---------------------------------------------------------------------
# Step 3: Manually compute attention output from Q, K, V
# ---------------------------------------------------------------------

num_heads = 12
head_dim = Q_manual.shape[1] // num_heads  # 384 / 12 = 32
seq_len = Q_manual.shape[0]

# Split Q, K, V into 12 heads
Q_heads = np.split(Q_manual, num_heads, axis=-1)
K_heads = np.split(K_manual, num_heads, axis=-1)
V_heads = np.split(V_manual, num_heads, axis=-1)

# Softmax function (stable)
def softmax(x):
    x = x - np.max(x, axis=-1, keepdims=True)
    exp_x = np.exp(x)
    return exp_x / exp_x.sum(axis=-1, keepdims=True)

# Compute per-head outputs
head_outputs = []

for i in range(num_heads):
    Q = Q_heads[i]  # [6, 32]
    K = K_heads[i]
    V = V_heads[i]

    scores = Q @ K.T / np.sqrt(head_dim)  # [6, 6]
    weights = softmax(scores)             # [6, 6]
    head_output = weights @ V             # [6, 32]
    head_outputs.append(head_output)

# Concatenate all heads: [6, 384]
multihead_output = np.concatenate(head_outputs, axis=-1)

# ---------------------------------------------------------------------
# Step 4: Compare manual result to MiniLM's captured raw output
# ---------------------------------------------------------------------

diff = np.abs(multihead_output - raw_miniLM_heads)
max_diff = diff.max()

print("✅ Manual self-attention output (pre-projection)")
print("Shape:", multihead_output.shape)
print("First token (first 5 dims):", np.round(multihead_output[0][:5], 6))
print(f"\n🔍 Max absolute difference vs MiniLM: {max_diff:.8f}")

if max_diff > 1e-6:
    print("⚠️  Mismatch — check softmax, V multiplication, or concat axis")
else:
    print("✅ Match! Manual self-attention output aligns with MiniLM.")


✅ Manual self-attention output (pre-projection)
Shape: (9, 384)
First token (first 5 dims): [-0.027055 -0.045212 -0.039891 -0.002907 -0.01545 ]

🔍 Max absolute difference vs MiniLM: 0.00000027
✅ Match! Manual self-attention output aligns with MiniLM.


## Step 14: Attention output projection

In [41]:
import numpy as np
import pandas as pd

# ---- Step 1: Manual projection from multi-head output ----
W_o = np.load("minilm_weights/encoder_layer_0_attention_output_dense_weight.npy")
b_o = np.load("minilm_weights/encoder_layer_0_attention_output_dense_bias.npy")
attn_output_manual = multihead_output @ W_o.T + b_o  # shape: [6, 384]

# ---- Step 2: Get true MiniLM projection via hook ----
raw_attn_output = None

def hook_fn(module, input, output):
    global raw_attn_output
    raw_attn_output = output.detach().squeeze(0).cpu().numpy()

hook_handle = model.encoder.layer[0].attention.output.dense.register_forward_hook(hook_fn)

with torch.no_grad():
    _ = model.encoder.layer[0].attention(manual_input)

hook_handle.remove()

# ---- Step 3: Compare outputs ----
diff = np.abs(attn_output_manual - raw_attn_output)
print("✅ Attention Output Projection Check")
print("🔍 Max diff:", diff.max())
print("🔍 Mean diff:", diff.mean())
print("✅ Match:", np.allclose(attn_output_manual, raw_attn_output, atol=1e-6))


✅ Attention Output Projection Check
🔍 Max diff: 3.2782555e-07
🔍 Mean diff: 4.010255e-08
✅ Match: True


## Step 15: Add layer norm 1 for stability

In [43]:
import numpy as np
import torch

# ---- Step 1: Manual LayerNorm computation ----

gamma1 = np.load("minilm_weights/encoder_layer_0_attention_output_LayerNorm_weight.npy")
beta1  = np.load("minilm_weights/encoder_layer_0_attention_output_LayerNorm_bias.npy")

ln1_input = attn_output_manual + manual_input_np

def layer_norm(x, gamma, beta, eps=1e-12):
    mean = x.mean(axis=-1, keepdims=True)
    var  = x.var(axis=-1, keepdims=True)
    norm = (x - mean) / np.sqrt(var + eps)
    return norm * gamma + beta

ln1_out_manual = layer_norm(ln1_input, gamma1, beta1)

# ---- Step 2: Use LayerNorm module directly ----

with torch.no_grad():
    ln1_output_minilm = model.encoder.layer[0].attention.output.LayerNorm(
        torch.tensor(ln1_input).unsqueeze(0)
    ).squeeze(0).numpy()

# ---- Step 3: Compare ----

diff = np.abs(ln1_out_manual - ln1_output_minilm)
print("✅ Step 2: Residual + LayerNorm 1")
print("🔍 Max diff:", diff.max())
print("🔍 Mean diff:", diff.mean())
print("✅ Match:", np.allclose(ln1_out_manual, ln1_output_minilm, atol=1e-6))


✅ Step 2: Residual + LayerNorm 1
🔍 Max diff: 1.9073486e-06
🔍 Mean diff: 1.789185e-08
✅ Match: True


## Step 16: Add layer 2 Norm
LayerNorm 2 adds the transformed result (FFN) back to the original LayerNorm 1 output, and then normalizes the sum, keeping values stable for the next layer.

In [45]:
import numpy as np
import torch

# Use the ln1_out_manual from your previous calculation (Step 15)
# ln1_out_manual is already defined from your previous cell

# Calculate feed-forward network output
W_intermediate = np.load("minilm_weights/encoder_layer_0_intermediate_dense_weight.npy")
b_intermediate = np.load("minilm_weights/encoder_layer_0_intermediate_dense_bias.npy")
W_output = np.load("minilm_weights/encoder_layer_0_output_dense_weight.npy")
b_output = np.load("minilm_weights/encoder_layer_0_output_dense_bias.npy")

# FFN first part (with GELU activation)
def gelu(x):
    return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)))

ffn_intermediate = ln1_out_manual @ W_intermediate.T + b_intermediate
ffn_intermediate_act = gelu(ffn_intermediate)
ff_output_manual = ffn_intermediate_act @ W_output.T + b_output

# ---- Step 1: Load LayerNorm 2 weights ----
gamma2 = np.load("minilm_weights/encoder_layer_0_output_LayerNorm_weight.npy")
beta2 = np.load("minilm_weights/encoder_layer_0_output_LayerNorm_bias.npy")

# ---- Step 2: Manual LayerNorm 2 computation ----
# Residual connection: add input to FFN output
ln2_input = ff_output_manual + ln1_out_manual

def layer_norm(x, gamma, beta, eps=1e-12):
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    norm = (x - mean) / np.sqrt(var + eps)
    return norm * gamma + beta

ln2_out_manual = layer_norm(ln2_input, gamma2, beta2)

# ---- Step 3: Get MiniLM LayerNorm 2 output via module ----
with torch.no_grad():
    ln2_out_minilm = model.encoder.layer[0].output.LayerNorm(
        torch.tensor(ln2_input).unsqueeze(0)
    ).squeeze(0).numpy()

# ---- Step 4: Compare ----
diff = np.abs(ln2_out_manual - ln2_out_minilm)
print("✅ Step 4: Residual + LayerNorm 2")
print("🔍 Max diff:", diff.max())
print("🔍 Mean diff:", diff.mean())
print("✅ Match:", np.allclose(ln2_out_manual, ln2_out_minilm, atol=1e-6))

✅ Step 4: Residual + LayerNorm 2
🔍 Max diff: 4.7683716e-07
🔍 Mean diff: 1.3718549e-08
✅ Match: True


In [46]:
## Do layer 1

In [47]:
import numpy as np
import torch

# --- Set up a hook to capture the intermediate outputs from MiniLM ---
layer_outputs = {}

def get_layer_output(name):
    def hook(module, input, output):
        layer_outputs[name] = output[0].detach().clone()
    return hook

# Register hooks on MiniLM layers
hooks = []
hooks.append(model.encoder.layer[0].register_forward_hook(get_layer_output("layer0")))
hooks.append(model.encoder.layer[1].register_forward_hook(get_layer_output("layer1")))

# Run forward pass to capture outputs
with torch.no_grad():
    _ = model(input_ids, attention_mask=torch.ones_like(input_ids))

# Remove hooks
for hook in hooks:
    hook.remove()

# Get MiniLM's layer 0 output to use as input to our layer 1
layer0_output_minilm = layer_outputs["layer0"].squeeze(0).cpu().numpy()

# ----- Layer 1 Implementation -----
# 1. Self-Attention Block
W_q = np.load("minilm_weights/encoder_layer_1_attention_self_query_weight.npy")
b_q = np.load("minilm_weights/encoder_layer_1_attention_self_query_bias.npy")
W_k = np.load("minilm_weights/encoder_layer_1_attention_self_key_weight.npy")
b_k = np.load("minilm_weights/encoder_layer_1_attention_self_key_bias.npy")
W_v = np.load("minilm_weights/encoder_layer_1_attention_self_value_weight.npy")
b_v = np.load("minilm_weights/encoder_layer_1_attention_self_value_bias.npy")
W_o = np.load("minilm_weights/encoder_layer_1_attention_output_dense_weight.npy")
b_o = np.load("minilm_weights/encoder_layer_1_attention_output_dense_bias.npy")

# Use MiniLM's layer 0 output as input
Q_manual = layer0_output_minilm @ W_q.T + b_q
K_manual = layer0_output_minilm @ W_k.T + b_k
V_manual = layer0_output_minilm @ W_v.T + b_v

# Multi-head attention
num_heads = 12
head_dim = Q_manual.shape[1] // num_heads
seq_len = Q_manual.shape[0]

# Split into heads
Q_heads = np.split(Q_manual, num_heads, axis=-1)
K_heads = np.split(K_manual, num_heads, axis=-1)
V_heads = np.split(V_manual, num_heads, axis=-1)

def softmax(x, axis=-1):
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

head_outputs = []
for i in range(num_heads):
    Q = Q_heads[i]
    K = K_heads[i]
    V = V_heads[i]
    
    scores = Q @ K.T / np.sqrt(head_dim)
    weights = softmax(scores)
    head_output = weights @ V
    head_outputs.append(head_output)

multihead_output = np.concatenate(head_outputs, axis=-1)
attn_output = multihead_output @ W_o.T + b_o

# First Residual + LayerNorm
gamma1 = np.load("minilm_weights/encoder_layer_1_attention_output_LayerNorm_weight.npy")
beta1 = np.load("minilm_weights/encoder_layer_1_attention_output_LayerNorm_bias.npy")

def layer_norm(x, gamma, beta, eps=1e-12):
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    norm = (x - mean) / np.sqrt(var + eps)
    return norm * gamma + beta

ln1_input = attn_output + layer0_output_minilm
ln1_output = layer_norm(ln1_input, gamma1, beta1)

# Feed-Forward Network
W_intermediate = np.load("minilm_weights/encoder_layer_1_intermediate_dense_weight.npy")
b_intermediate = np.load("minilm_weights/encoder_layer_1_intermediate_dense_bias.npy")
W_output = np.load("minilm_weights/encoder_layer_1_output_dense_weight.npy")
b_output = np.load("minilm_weights/encoder_layer_1_output_dense_bias.npy")

# Use PyTorch's GELU for exact match with MiniLM
intermediate = ln1_output @ W_intermediate.T + b_intermediate
intermediate_tensor = torch.tensor(intermediate).float()
intermediate_act = torch.nn.functional.gelu(intermediate_tensor).numpy()
ffn_output = intermediate_act @ W_output.T + b_output

# Second Residual + LayerNorm
gamma2 = np.load("minilm_weights/encoder_layer_1_output_LayerNorm_weight.npy")
beta2 = np.load("minilm_weights/encoder_layer_1_output_LayerNorm_bias.npy")

ln2_input = ffn_output + ln1_output
layer1_output = layer_norm(ln2_input, gamma2, beta2)

# Compare with MiniLM's layer 1 output
layer1_output_minilm = layer_outputs["layer1"].squeeze(0).cpu().numpy()
diff = np.abs(layer1_output - layer1_output_minilm)

# Print additional info
print("✅ Layer 1 Complete")
print("🔍 Max diff:", diff.max())
print("🔍 Mean diff:", diff.mean())
print("✅ Close match (1e-6 tolerance):", np.allclose(layer1_output, layer1_output_minilm, atol=1e-6))
print("✅ Close match (1e-5 tolerance):", np.allclose(layer1_output, layer1_output_minilm, atol=1e-5))

# For any future operations, consider this output valid if max diff < 1e-5

✅ Layer 1 Complete
🔍 Max diff: 1.9073486e-06
🔍 Mean diff: 2.3564374e-07
✅ Close match (1e-6 tolerance): False
✅ Close match (1e-5 tolerance): True


In [48]:
import numpy as np
import torch

# --- Set up a hook to capture the intermediate outputs from MiniLM ---
layer_outputs = {}

def get_layer_output(name):
    def hook(module, input, output):
        layer_outputs[name] = output[0].detach().clone()
    return hook

# Register hooks on MiniLM layers
hooks = []
hooks.append(model.encoder.layer[1].register_forward_hook(get_layer_output("layer1")))
hooks.append(model.encoder.layer[2].register_forward_hook(get_layer_output("layer2")))

# Run forward pass to capture outputs
with torch.no_grad():
    _ = model(input_ids, attention_mask=torch.ones_like(input_ids))

# Remove hooks
for hook in hooks:
    hook.remove()

# Get MiniLM's layer 1 output to use as input to our layer 2
layer1_output_minilm = layer_outputs["layer1"].squeeze(0).cpu().numpy()

# ----- Layer 2 Implementation -----
# 1. Self-Attention Block
W_q = np.load("minilm_weights/encoder_layer_2_attention_self_query_weight.npy")
b_q = np.load("minilm_weights/encoder_layer_2_attention_self_query_bias.npy")
W_k = np.load("minilm_weights/encoder_layer_2_attention_self_key_weight.npy")
b_k = np.load("minilm_weights/encoder_layer_2_attention_self_key_bias.npy")
W_v = np.load("minilm_weights/encoder_layer_2_attention_self_value_weight.npy")
b_v = np.load("minilm_weights/encoder_layer_2_attention_self_value_bias.npy")
W_o = np.load("minilm_weights/encoder_layer_2_attention_output_dense_weight.npy")
b_o = np.load("minilm_weights/encoder_layer_2_attention_output_dense_bias.npy")

# Use MiniLM's layer 1 output as input
Q_manual = layer1_output_minilm @ W_q.T + b_q
K_manual = layer1_output_minilm @ W_k.T + b_k
V_manual = layer1_output_minilm @ W_v.T + b_v

# Multi-head attention
num_heads = 12
head_dim = Q_manual.shape[1] // num_heads
seq_len = Q_manual.shape[0]

# Split into heads
Q_heads = np.split(Q_manual, num_heads, axis=-1)
K_heads = np.split(K_manual, num_heads, axis=-1)
V_heads = np.split(V_manual, num_heads, axis=-1)

def softmax(x, axis=-1):
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

head_outputs = []
for i in range(num_heads):
    Q = Q_heads[i]
    K = K_heads[i]
    V = V_heads[i]
    
    scores = Q @ K.T / np.sqrt(head_dim)
    weights = softmax(scores)
    head_output = weights @ V
    head_outputs.append(head_output)

multihead_output = np.concatenate(head_outputs, axis=-1)
attn_output = multihead_output @ W_o.T + b_o

# First Residual + LayerNorm
gamma1 = np.load("minilm_weights/encoder_layer_2_attention_output_LayerNorm_weight.npy")
beta1 = np.load("minilm_weights/encoder_layer_2_attention_output_LayerNorm_bias.npy")

def layer_norm(x, gamma, beta, eps=1e-12):
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    norm = (x - mean) / np.sqrt(var + eps)
    return norm * gamma + beta

ln1_input = attn_output + layer1_output_minilm
ln1_output = layer_norm(ln1_input, gamma1, beta1)

# Feed-Forward Network
W_intermediate = np.load("minilm_weights/encoder_layer_2_intermediate_dense_weight.npy")
b_intermediate = np.load("minilm_weights/encoder_layer_2_intermediate_dense_bias.npy")
W_output = np.load("minilm_weights/encoder_layer_2_output_dense_weight.npy")
b_output = np.load("minilm_weights/encoder_layer_2_output_dense_bias.npy")

# Use PyTorch's GELU for exact match with MiniLM
intermediate = ln1_output @ W_intermediate.T + b_intermediate
intermediate_tensor = torch.tensor(intermediate).float()
intermediate_act = torch.nn.functional.gelu(intermediate_tensor).numpy()
ffn_output = intermediate_act @ W_output.T + b_output

# Second Residual + LayerNorm
gamma2 = np.load("minilm_weights/encoder_layer_2_output_LayerNorm_weight.npy")
beta2 = np.load("minilm_weights/encoder_layer_2_output_LayerNorm_bias.npy")

ln2_input = ffn_output + ln1_output
layer2_output = layer_norm(ln2_input, gamma2, beta2)

# Compare with MiniLM's layer 2 output
layer2_output_minilm = layer_outputs["layer2"].squeeze(0).cpu().numpy()
diff = np.abs(layer2_output - layer2_output_minilm)

print("✅ Layer 2 Complete")
print("🔍 Max diff:", diff.max())
print("🔍 Mean diff:", diff.mean())
print("✅ Close match (1e-6 tolerance):", np.allclose(layer2_output, layer2_output_minilm, atol=1e-6))
print("✅ Close match (1e-5 tolerance):", np.allclose(layer2_output, layer2_output_minilm, atol=1e-5))


✅ Layer 2 Complete
🔍 Max diff: 1.9073486e-06
🔍 Mean diff: 1.9157926e-07
✅ Close match (1e-6 tolerance): True
✅ Close match (1e-5 tolerance): True


In [49]:
import numpy as np
import torch

# --- Set up a hook to capture the intermediate outputs from MiniLM ---
layer_outputs = {}

def get_layer_output(name):
    def hook(module, input, output):
        layer_outputs[name] = output[0].detach().clone()
    return hook

# Register hooks on MiniLM layers
hooks = []
hooks.append(model.encoder.layer[2].register_forward_hook(get_layer_output("layer2")))
hooks.append(model.encoder.layer[3].register_forward_hook(get_layer_output("layer3")))

# Run forward pass to capture outputs
with torch.no_grad():
    _ = model(input_ids, attention_mask=torch.ones_like(input_ids))

# Remove hooks
for hook in hooks:
    hook.remove()

# Get MiniLM's layer 2 output to use as input to our layer 3
layer2_output_minilm = layer_outputs["layer2"].squeeze(0).cpu().numpy()

# ----- Layer 3 Implementation -----
# 1. Self-Attention Block
W_q = np.load("minilm_weights/encoder_layer_3_attention_self_query_weight.npy")
b_q = np.load("minilm_weights/encoder_layer_3_attention_self_query_bias.npy")
W_k = np.load("minilm_weights/encoder_layer_3_attention_self_key_weight.npy")
b_k = np.load("minilm_weights/encoder_layer_3_attention_self_key_bias.npy")
W_v = np.load("minilm_weights/encoder_layer_3_attention_self_value_weight.npy")
b_v = np.load("minilm_weights/encoder_layer_3_attention_self_value_bias.npy")
W_o = np.load("minilm_weights/encoder_layer_3_attention_output_dense_weight.npy")
b_o = np.load("minilm_weights/encoder_layer_3_attention_output_dense_bias.npy")

# Use MiniLM's layer 2 output as input
Q_manual = layer2_output_minilm @ W_q.T + b_q
K_manual = layer2_output_minilm @ W_k.T + b_k
V_manual = layer2_output_minilm @ W_v.T + b_v

# Multi-head attention
num_heads = 12
head_dim = Q_manual.shape[1] // num_heads
seq_len = Q_manual.shape[0]

# Split into heads
Q_heads = np.split(Q_manual, num_heads, axis=-1)
K_heads = np.split(K_manual, num_heads, axis=-1)
V_heads = np.split(V_manual, num_heads, axis=-1)

def softmax(x, axis=-1):
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

head_outputs = []
for i in range(num_heads):
    Q = Q_heads[i]
    K = K_heads[i]
    V = V_heads[i]
    
    scores = Q @ K.T / np.sqrt(head_dim)
    weights = softmax(scores)
    head_output = weights @ V
    head_outputs.append(head_output)

multihead_output = np.concatenate(head_outputs, axis=-1)
attn_output = multihead_output @ W_o.T + b_o

# First Residual + LayerNorm
gamma1 = np.load("minilm_weights/encoder_layer_3_attention_output_LayerNorm_weight.npy")
beta1 = np.load("minilm_weights/encoder_layer_3_attention_output_LayerNorm_bias.npy")

def layer_norm(x, gamma, beta, eps=1e-12):
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    norm = (x - mean) / np.sqrt(var + eps)
    return norm * gamma + beta

ln1_input = attn_output + layer2_output_minilm
ln1_output = layer_norm(ln1_input, gamma1, beta1)

# Feed-Forward Network
W_intermediate = np.load("minilm_weights/encoder_layer_3_intermediate_dense_weight.npy")
b_intermediate = np.load("minilm_weights/encoder_layer_3_intermediate_dense_bias.npy")
W_output = np.load("minilm_weights/encoder_layer_3_output_dense_weight.npy")
b_output = np.load("minilm_weights/encoder_layer_3_output_dense_bias.npy")

# Use PyTorch's GELU for exact match with MiniLM
intermediate = ln1_output @ W_intermediate.T + b_intermediate
intermediate_tensor = torch.tensor(intermediate).float()
intermediate_act = torch.nn.functional.gelu(intermediate_tensor).numpy()
ffn_output = intermediate_act @ W_output.T + b_output

# Second Residual + LayerNorm
gamma2 = np.load("minilm_weights/encoder_layer_3_output_LayerNorm_weight.npy")
beta2 = np.load("minilm_weights/encoder_layer_3_output_LayerNorm_bias.npy")

ln2_input = ffn_output + ln1_output
layer3_output = layer_norm(ln2_input, gamma2, beta2)

# Compare with MiniLM's layer 3 output
layer3_output_minilm = layer_outputs["layer3"].squeeze(0).cpu().numpy()
diff = np.abs(layer3_output - layer3_output_minilm)

print("✅ Layer 3 Complete")
print("🔍 Max diff:", diff.max())
print("🔍 Mean diff:", diff.mean())
print("✅ Close match (1e-6 tolerance):", np.allclose(layer3_output, layer3_output_minilm, atol=1e-6))
print("✅ Close match (1e-5 tolerance):", np.allclose(layer3_output, layer3_output_minilm, atol=1e-5))

✅ Layer 3 Complete
🔍 Max diff: 1.1920929e-06
🔍 Mean diff: 1.7707889e-07
✅ Close match (1e-6 tolerance): True
✅ Close match (1e-5 tolerance): True


In [50]:
import numpy as np
import torch

# --- Set up a hook to capture the intermediate outputs from MiniLM ---
layer_outputs = {}

def get_layer_output(name):
    def hook(module, input, output):
        layer_outputs[name] = output[0].detach().clone()
    return hook

# Register hooks on MiniLM layers
hooks = []
hooks.append(model.encoder.layer[3].register_forward_hook(get_layer_output("layer3")))
hooks.append(model.encoder.layer[4].register_forward_hook(get_layer_output("layer4")))

# Run forward pass to capture outputs
with torch.no_grad():
    _ = model(input_ids, attention_mask=torch.ones_like(input_ids))

# Remove hooks
for hook in hooks:
    hook.remove()

# Get MiniLM's layer 3 output to use as input to our layer 4
layer3_output_minilm = layer_outputs["layer3"].squeeze(0).cpu().numpy()

# ----- Layer 4 Implementation -----
# 1. Self-Attention Block
W_q = np.load("minilm_weights/encoder_layer_4_attention_self_query_weight.npy")
b_q = np.load("minilm_weights/encoder_layer_4_attention_self_query_bias.npy")
W_k = np.load("minilm_weights/encoder_layer_4_attention_self_key_weight.npy")
b_k = np.load("minilm_weights/encoder_layer_4_attention_self_key_bias.npy")
W_v = np.load("minilm_weights/encoder_layer_4_attention_self_value_weight.npy")
b_v = np.load("minilm_weights/encoder_layer_4_attention_self_value_bias.npy")
W_o = np.load("minilm_weights/encoder_layer_4_attention_output_dense_weight.npy")
b_o = np.load("minilm_weights/encoder_layer_4_attention_output_dense_bias.npy")

# Use MiniLM's layer 3 output as input
Q_manual = layer3_output_minilm @ W_q.T + b_q
K_manual = layer3_output_minilm @ W_k.T + b_k
V_manual = layer3_output_minilm @ W_v.T + b_v

# Multi-head attention
num_heads = 12
head_dim = Q_manual.shape[1] // num_heads
seq_len = Q_manual.shape[0]

# Split into heads
Q_heads = np.split(Q_manual, num_heads, axis=-1)
K_heads = np.split(K_manual, num_heads, axis=-1)
V_heads = np.split(V_manual, num_heads, axis=-1)

def softmax(x, axis=-1):
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

head_outputs = []
for i in range(num_heads):
    Q = Q_heads[i]
    K = K_heads[i]
    V = V_heads[i]
    
    scores = Q @ K.T / np.sqrt(head_dim)
    weights = softmax(scores)
    head_output = weights @ V
    head_outputs.append(head_output)

multihead_output = np.concatenate(head_outputs, axis=-1)
attn_output = multihead_output @ W_o.T + b_o

# First Residual + LayerNorm
gamma1 = np.load("minilm_weights/encoder_layer_4_attention_output_LayerNorm_weight.npy")
beta1 = np.load("minilm_weights/encoder_layer_4_attention_output_LayerNorm_bias.npy")

def layer_norm(x, gamma, beta, eps=1e-12):
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    norm = (x - mean) / np.sqrt(var + eps)
    return norm * gamma + beta

ln1_input = attn_output + layer3_output_minilm
ln1_output = layer_norm(ln1_input, gamma1, beta1)

# Feed-Forward Network
W_intermediate = np.load("minilm_weights/encoder_layer_4_intermediate_dense_weight.npy")
b_intermediate = np.load("minilm_weights/encoder_layer_4_intermediate_dense_bias.npy")
W_output = np.load("minilm_weights/encoder_layer_4_output_dense_weight.npy")
b_output = np.load("minilm_weights/encoder_layer_4_output_dense_bias.npy")

# Use PyTorch's GELU for exact match with MiniLM
intermediate = ln1_output @ W_intermediate.T + b_intermediate
intermediate_tensor = torch.tensor(intermediate).float()
intermediate_act = torch.nn.functional.gelu(intermediate_tensor).numpy()
ffn_output = intermediate_act @ W_output.T + b_output

# Second Residual + LayerNorm
gamma2 = np.load("minilm_weights/encoder_layer_4_output_LayerNorm_weight.npy")
beta2 = np.load("minilm_weights/encoder_layer_4_output_LayerNorm_bias.npy")

ln2_input = ffn_output + ln1_output
layer4_output = layer_norm(ln2_input, gamma2, beta2)

# Compare with MiniLM's layer 4 output
layer4_output_minilm = layer_outputs["layer4"].squeeze(0).cpu().numpy()
diff = np.abs(layer4_output - layer4_output_minilm)

print("✅ Layer 4 Complete")
print("🔍 Max diff:", diff.max())
print("🔍 Mean diff:", diff.mean())
print("✅ Close match (1e-6 tolerance):", np.allclose(layer4_output, layer4_output_minilm, atol=1e-6))
print("✅ Close match (1e-5 tolerance):", np.allclose(layer4_output, layer4_output_minilm, atol=1e-5))

✅ Layer 4 Complete
🔍 Max diff: 1.9073486e-06
🔍 Mean diff: 1.9057532e-07
✅ Close match (1e-6 tolerance): True
✅ Close match (1e-5 tolerance): True


In [51]:
import numpy as np
import torch

# --- Set up a hook to capture the intermediate outputs from MiniLM ---
layer_outputs = {}

def get_layer_output(name):
    def hook(module, input, output):
        layer_outputs[name] = output[0].detach().clone()
    return hook

# Register hooks on MiniLM layers
hooks = []
hooks.append(model.encoder.layer[4].register_forward_hook(get_layer_output("layer4")))
hooks.append(model.encoder.layer[5].register_forward_hook(get_layer_output("layer5")))

# Run forward pass to capture outputs
with torch.no_grad():
    _ = model(input_ids, attention_mask=torch.ones_like(input_ids))

# Remove hooks
for hook in hooks:
    hook.remove()

# Get MiniLM's layer 4 output to use as input to our layer 5
layer4_output_minilm = layer_outputs["layer4"].squeeze(0).cpu().numpy()

# ----- Layer 5 Implementation -----
# 1. Self-Attention Block
W_q = np.load("minilm_weights/encoder_layer_5_attention_self_query_weight.npy")
b_q = np.load("minilm_weights/encoder_layer_5_attention_self_query_bias.npy")
W_k = np.load("minilm_weights/encoder_layer_5_attention_self_key_weight.npy")
b_k = np.load("minilm_weights/encoder_layer_5_attention_self_key_bias.npy")
W_v = np.load("minilm_weights/encoder_layer_5_attention_self_value_weight.npy")
b_v = np.load("minilm_weights/encoder_layer_5_attention_self_value_bias.npy")
W_o = np.load("minilm_weights/encoder_layer_5_attention_output_dense_weight.npy")
b_o = np.load("minilm_weights/encoder_layer_5_attention_output_dense_bias.npy")

# Use MiniLM's layer 4 output as input
Q_manual = layer4_output_minilm @ W_q.T + b_q
K_manual = layer4_output_minilm @ W_k.T + b_k
V_manual = layer4_output_minilm @ W_v.T + b_v

# Multi-head attention
num_heads = 12
head_dim = Q_manual.shape[1] // num_heads
seq_len = Q_manual.shape[0]

# Split into heads
Q_heads = np.split(Q_manual, num_heads, axis=-1)
K_heads = np.split(K_manual, num_heads, axis=-1)
V_heads = np.split(V_manual, num_heads, axis=-1)

def softmax(x, axis=-1):
    exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exp_x / np.sum(exp_x, axis=axis, keepdims=True)

head_outputs = []
for i in range(num_heads):
    Q = Q_heads[i]
    K = K_heads[i]
    V = V_heads[i]
    
    scores = Q @ K.T / np.sqrt(head_dim)
    weights = softmax(scores)
    head_output = weights @ V
    head_outputs.append(head_output)

multihead_output = np.concatenate(head_outputs, axis=-1)
attn_output = multihead_output @ W_o.T + b_o

# First Residual + LayerNorm
gamma1 = np.load("minilm_weights/encoder_layer_5_attention_output_LayerNorm_weight.npy")
beta1 = np.load("minilm_weights/encoder_layer_5_attention_output_LayerNorm_bias.npy")

def layer_norm(x, gamma, beta, eps=1e-12):
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    norm = (x - mean) / np.sqrt(var + eps)
    return norm * gamma + beta

ln1_input = attn_output + layer4_output_minilm
ln1_output = layer_norm(ln1_input, gamma1, beta1)

# Feed-Forward Network
W_intermediate = np.load("minilm_weights/encoder_layer_5_intermediate_dense_weight.npy")
b_intermediate = np.load("minilm_weights/encoder_layer_5_intermediate_dense_bias.npy")
W_output = np.load("minilm_weights/encoder_layer_5_output_dense_weight.npy")
b_output = np.load("minilm_weights/encoder_layer_5_output_dense_bias.npy")

# Use PyTorch's GELU for exact match with MiniLM
intermediate = ln1_output @ W_intermediate.T + b_intermediate
intermediate_tensor = torch.tensor(intermediate).float()
intermediate_act = torch.nn.functional.gelu(intermediate_tensor).numpy()
ffn_output = intermediate_act @ W_output.T + b_output

# Second Residual + LayerNorm
gamma2 = np.load("minilm_weights/encoder_layer_5_output_LayerNorm_weight.npy")
beta2 = np.load("minilm_weights/encoder_layer_5_output_LayerNorm_bias.npy")

ln2_input = ffn_output + ln1_output
layer5_output = layer_norm(ln2_input, gamma2, beta2)

# Compare with MiniLM's layer 5 output
layer5_output_minilm = layer_outputs["layer5"].squeeze(0).cpu().numpy()
diff = np.abs(layer5_output - layer5_output_minilm)

print("✅ Layer 5 Complete")
print("🔍 Max diff:", diff.max())
print("🔍 Mean diff:", diff.mean())
print("✅ Close match (1e-6 tolerance):", np.allclose(layer5_output, layer5_output_minilm, atol=1e-6))
print("✅ Close match (1e-5 tolerance):", np.allclose(layer5_output, layer5_output_minilm, atol=1e-5))

✅ Layer 5 Complete
🔍 Max diff: 1.0728836e-06
🔍 Mean diff: 1.2315839e-07
✅ Close match (1e-6 tolerance): True
✅ Close match (1e-5 tolerance): True


In [52]:
import numpy as np
import torch

# --- Get MiniLM pooler weights ---
# First, check if there's a pooler layer in the model
if hasattr(model, 'pooler'):
    pooler_dense_weight = model.pooler.dense.weight.detach().cpu().numpy()
    pooler_dense_bias = model.pooler.dense.bias.detach().cpu().numpy()
    has_pooler = True
else:
    has_pooler = False
    print("MiniLM doesn't have an explicit pooler layer")

# --- Get the final hidden states directly from MiniLM ---
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=torch.ones_like(input_ids)).last_hidden_state.squeeze(0).cpu().numpy()
    
    # Get the official pooler output if available
    if has_pooler:
        pooler_output = model(input_ids, attention_mask=torch.ones_like(input_ids)).pooler_output.squeeze(0).cpu().numpy()
    else:
        # If no pooler, try to get the sentence embedding through the model's API
        try:
            pooler_output = model.get_sentence_embedding(input_ids).squeeze(0).cpu().numpy()
        except:
            pooler_output = None
            print("Could not get official sentence embedding")

# --- Try different pooling strategies ---
# 1. CLS token
cls_token = last_hidden_states[0]  # First token is [CLS]

# 2. Mean pooling
def mean_pooling(token_embeddings, attention_mask):
    mask_expanded = np.expand_dims(attention_mask, -1)
    sum_embeddings = np.sum(token_embeddings * mask_expanded, axis=0)
    sum_mask = np.sum(mask_expanded, axis=0)
    return sum_embeddings / sum_mask

attention_mask = np.ones(last_hidden_states.shape[0])
mean_pooled = mean_pooling(last_hidden_states, attention_mask)

# 3. If there's a pooler, apply it to the CLS token
if has_pooler:
    # The pooler typically applies a dense layer followed by tanh
    pooler_manual = np.tanh(cls_token @ pooler_dense_weight.T + pooler_dense_bias)
    print("Applied pooler transformation")

# --- Compare with MiniLM's pooler output ---
if pooler_output is not None:
    cls_diff = np.abs(cls_token - pooler_output).max() if pooler_output is not None else float('inf')
    mean_diff = np.abs(mean_pooled - pooler_output).max() if pooler_output is not None else float('inf')
    
    if has_pooler:
        pooler_diff = np.abs(pooler_manual - pooler_output).max()
        print(f"🔍 Pooler transformation max diff: {pooler_diff}")
    
    print(f"🔍 CLS token max diff: {cls_diff}")
    print(f"🔍 Mean pooling max diff: {mean_diff}")

    # Find the best match
    if has_pooler and pooler_diff < min(cls_diff, mean_diff):
        print("MiniLM uses custom pooler transformation")
        final_embedding = pooler_manual
    elif cls_diff < mean_diff:
        print("MiniLM uses CLS token")
        final_embedding = cls_token
    else:
        print("MiniLM uses mean pooling")
        final_embedding = mean_pooled
else:
    # If we couldn't get the pooler output, default to mean pooling
    print("Defaulting to mean pooling (couldn't verify)")
    final_embedding = mean_pooled

# --- Normalize and compare ---
def normalize(v):
    norm = np.linalg.norm(v)
    return v / norm if norm > 0 else v

final_normalized = normalize(final_embedding)
if pooler_output is not None:
    pooler_normalized = normalize(pooler_output)
    similarity = np.dot(final_normalized, pooler_normalized)
    print(f"\nSimilarity with MiniLM embedding: {similarity}")
    print(f"Final embedding shape: {final_embedding.shape}")
    print(f"First 5 dimensions: {final_embedding[:5]}")

Applied pooler transformation
🔍 Pooler transformation max diff: 2.9802322387695312e-08
🔍 CLS token max diff: 9.19537353515625
🔍 Mean pooling max diff: 2.8654418024751873
MiniLM uses custom pooler transformation

Similarity with MiniLM embedding: 1.0
Final embedding shape: (384,)
First 5 dimensions: [-0.00913658  0.05209111  0.04688694  0.0240268  -0.10534714]


In [53]:
import numpy as np
import torch

# --- Get MiniLM pooler weights ---
pooler_dense_weight = model.pooler.dense.weight.detach().cpu().numpy()
pooler_dense_bias = model.pooler.dense.bias.detach().cpu().numpy()

# --- Get official MiniLM outputs for comparison ---
with torch.no_grad():
    outputs = model(input_ids, attention_mask=torch.ones_like(input_ids))
    # Last hidden state and pooler output
    minilm_last_hidden = outputs.last_hidden_state.squeeze(0).cpu().numpy()
    minilm_pooler_output = outputs.pooler_output.squeeze(0).cpu().numpy()

# --- Manual implementation ---
# 1. Use our layer 5 output as the final hidden states
final_hidden_states = layer5_output

# 2. Apply the pooler transformation to the CLS token (first token)
cls_token = final_hidden_states[0]  # Extract CLS token representation
pooler_output_manual = np.tanh(cls_token @ pooler_dense_weight.T + pooler_dense_bias)

# --- Compare the results ---
# Difference between our manual hidden states and MiniLM's
hidden_diff = np.abs(final_hidden_states - minilm_last_hidden).max()
# Difference between our manual pooler output and MiniLM's
pooler_diff = np.abs(pooler_output_manual - minilm_pooler_output).max()

# Compute cosine similarity to show vector alignment
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

hidden_sim = cosine_similarity(final_hidden_states.flatten(), minilm_last_hidden.flatten())
pooler_sim = cosine_similarity(pooler_output_manual, minilm_pooler_output)

# Display results
print("✅ MiniLM Sentence Encoding Reproduction Complete")
print("=" * 50)
print(f"🔍 Final hidden states max difference: {hidden_diff}")
print(f"🔍 Pooler output max difference: {pooler_diff}")
print(f"🔍 Hidden states cosine similarity: {hidden_sim}")
print(f"🔍 Pooler output cosine similarity: {pooler_sim}")
print("=" * 50)

# Compare the first 10 dimensions of both embeddings
print("\nFinal Sentence Embeddings (first 10 dimensions):")
print("-" * 50)
print("MiniLM official:  ", minilm_pooler_output[:10])
print("Manual reproduction:", pooler_output_manual[:10])
print("-" * 50)

print("\nShape of final embedding:", pooler_output_manual.shape)
print("This vector can now be used for sentence similarity, classification, etc.")

✅ MiniLM Sentence Encoding Reproduction Complete
🔍 Final hidden states max difference: 1.0728836059570312e-06
🔍 Pooler output max difference: 7.450580596923828e-08
🔍 Hidden states cosine similarity: 1.0000001192092896
🔍 Pooler output cosine similarity: 1.0

Final Sentence Embeddings (first 10 dimensions):
--------------------------------------------------
MiniLM official:   [-0.00913658  0.05209111  0.04688694  0.02402681 -0.10534714 -0.02789352
  0.0032652   0.0857238  -0.01132614 -0.00611972]
Manual reproduction: [-0.0091366   0.05209111  0.04688695  0.02402678 -0.10534718 -0.0278935
  0.0032652   0.08572381 -0.01132614 -0.00611977]
--------------------------------------------------

Shape of final embedding: (384,)
This vector can now be used for sentence similarity, classification, etc.
