In [1]:
!pip install transformer-lens
# !pip install --upgrade transformers tokenizers




Load smallest model with eye towards speed (clearing existing memory, etc.)

In [3]:
from transformer_lens import HookedTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Choose ONE model. 0.6B is recommended for speed/testing.
model_name = "Qwen/Qwen2.5-0.5B-Instruct" # Qwen-3 (0.6B class)
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"üöÄ Loading {model_name} on {device}...")

# 1. Clear any leftover memory from previous attempts
torch.cuda.empty_cache()

# 2. Load the model ONCE with memory-saving settings
hf_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16, # Half precision saves 50% VRAM
    device_map="auto"          # Handles GPU placement automatically
)

# 3. Hook it for interpretability (SAE work)
_llm = HookedTransformer.from_pretrained(
    model_name,
    hf_model=hf_model,
    device=device
)

print("‚ú® Success! Model is hooked and ready.")

üöÄ Loading Qwen/Qwen2.5-0.5B-Instruct on cpu...




Loaded pretrained model Qwen/Qwen2.5-0.5B-Instruct into HookedTransformer
‚ú® Success! Model is hooked and ready.


Load lending club data. To ensure balance, merge 100k accepted and 100k rejected loans.

In [4]:
# load lending club accepted data
import pandas as pd
df_accepted = pd.read_csv("gs://exceptions-data/LLM Delegation/LendingClub/data/accepted_2007_to_2018Q4.csv",
                          nrows = 100000) # just first 100k rows, for speed




# load lending club rejected data
import pandas as pd
df_rejected = pd.read_csv("gs://exceptions-data/LLM Delegation/LendingClub/data/rejected_2007_to_2018Q4.csv",
                          nrows = 100000) # just first 100k rows, for speed



# combine data frames
# only these columns are shared across the two dfs
# zip code is also shared, but partially excluded (only three digits)
df = pd.concat([
    df_accepted.rename(columns={
        "loan_amnt": "loan_amnt",
        "title": "title",
        "addr_state": "addr_state",
        "emp_length": "emp_length",
        "policy_code": "policy_code",
    })[
        ["loan_amnt", "title", "addr_state", "emp_length", "policy_code"]
    ].assign(accepted=1),

    df_rejected.rename(columns={
        "Amount Requested": "loan_amnt",
        "Loan Title": "title",
        "State": "addr_state",
        "Employment Length": "emp_length",
        "Policy Code": "policy_code",
    })[
        ["loan_amnt", "title", "addr_state", "emp_length", "policy_code"]
    ].assign(accepted=0)
], ignore_index=True)


# clean title column
df["title"] = df["title"].str.lower().str.replace("_", " ", regex=False)

# clean length column
df["emp_length"] = df["emp_length"].map({"< 1 year":0, "1 year":1, "2 years":2, "3 years":3, "4 years":4, "5 years":5, "6 years":6, "7 years":7, "8 years":8, "9 years":9, "10+ years":10})


df.head()

  df_accepted = pd.read_csv("gs://exceptions-data/LLM Delegation/LendingClub/data/accepted_2007_to_2018Q4.csv",


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,...,,,Cash,N,,,,,,


Base prompt for LLM engagement.

In [39]:
import pandas as pd
import random

# scenario builder
def create_prompt_base(row):

    loan_amnt = int(round(row['loan_amnt']))
    emp_length = int(round(row['emp_length']))

    base = (
        f"Someone from {row['addr_state']} is applying for a loan of "
        f"${loan_amnt}."
    )

    if pd.notna(row['title']):
        base += f" The loan is for a {row['title']}."

    base += f" They have been employed for {emp_length} years."

    return base

create_prompt_base(df.iloc[0])

Low level runner for all LLM calls.

In [38]:
import re
import torch

MAX_CTX = 512 # Lowering this slightly further ensures speed
RESERVE = 8

def get_llm_base(prompt: str, max_tokens: int = 20):
    """
    Optimized LLM call with variable length control.
    """
    global _llm
    device = next(_llm.parameters()).device

    # Tokenize and Truncate
    toks = _llm.to_tokens(prompt, prepend_bos=False).to(device)
    if toks.shape[1] > MAX_CTX - RESERVE:
        toks = toks[:, -(MAX_CTX - RESERVE):]

    # Only cache the layer we need for the SAE to save memory
    target_hook = f"blocks.{LAYER}.mlp.hook_post"

    with torch.no_grad():
        _, cache = _llm.run_with_cache(
            toks,
            names_filter=lambda name: name == target_hook
        )

        # Generate using the dynamic max_tokens value
        out = _llm.generate(
            toks,
            max_new_tokens=max_tokens,
            do_sample=False,
            stop_at_eos=True,
            verbose=False
        )

    # Extract and Clean
    gen_tokens = out[0][toks.shape[1]:]
    text = _llm.to_string(gen_tokens)
    text = re.sub(r"<\|im_end\|>|<\|endoftext\|>|<\|im_start\|>|assistant|user", "", text).strip()

    return {
        "cache": cache,
        "tokens": toks,
        "text": text,
    }

Functions that will extract the final decision, and either provide supporting logic or a critique.

In [54]:
import re

def run_final_decision(prompt: str, max_tokens: int = 1):
    r = get_llm_base(prompt, max_tokens=max_tokens)

    generated_text = r.get("text", "").lower() # Normalize to lowercase

    # Updated regex to find 'accept' or 'reject'
    m = re.findall(r'\b(accept|reject)\b', generated_text)
    decision = m[-1] if m else None

    return {
        "del": decision, # This will now be 'accept' or 'reject'
        "cache": r["cache"],
        "tokens": r["tokens"],
        "text": generated_text,
        "final_prompt": prompt,
    }

# --- Supporting Logic ---

def get_llm_base_support(base_scenario: str):
    # Step 1: Prediction (More tokens for context)
    pred_prompt = (
        f"{base_scenario}\n\n"
        "Accept or reject the loan.\n"
        "Initial Decision (accept or reject):"
    )
    prediction = get_llm_base(pred_prompt, max_tokens=1).get("text", "").strip()

    # Step 2: Support (More tokens for reasoning)
    support_prompt = (
        f"SCENARIO: {base_scenario}\n"
        f"INITIAL DECISION: {prediction}\n\n"
        "Provide one more reason why this decision is CORRECT.\n"
        "Sentence: This is potentially because"
    )
    support_text = get_llm_base(support_prompt, max_tokens=50).get("text", "").strip()
    support = "This is potentially because " + support_text

    # Step 3: Final Decision (Short and focused)
    final_prompt = (
        f"SCENARIO: {base_scenario}\n"
        f"INITIAL DECISION: {prediction}\n"
        f"SUPPORT OF INITIAL DECISION: {support}\n\n"
        "Final Decision (accept or reject):"
    )
    out = run_final_decision(final_prompt, max_tokens=1)
    out.update({"prediction": prediction, "support": support})
    return out


# --- Critique ---

def get_sequential_inference(base_scenario: str):
    # Step 1: Prediction
    pred_prompt = (
        f"{base_scenario}\n\n"
        "Accept or reject the loan.\n"
        "Initial Decision (accept or reject):"
    )
    prediction = get_llm_base(pred_prompt, max_tokens=1).get("text", "").strip()

    # Step 2: Critique (Forced pivot with 'However')
    critique_prompt = (
        f"SCENARIO: {base_scenario}\n"
        f"INITIAL DECISION: {prediction}\n\n"
        "Provide one reason why this decision is INCORRECT.\n"
        "Sentence: On the other hand ,"
    )
    critique_text = get_llm_base(critique_prompt, max_tokens=50).get("text", "").strip()
    critique = "On the other hand, " + critique_text

    # Step 3: Final Decision
    final_prompt = (
        f"SCENARIO: {base_scenario}\n"
        f"INITIAL DECISION: {prediction}\n"
        f"CRITIQUE OF INITIAL DECISION: {critique}\n\n"
        "Final Decision (accept or reject):"
    )
    out = run_final_decision(final_prompt, max_tokens=1)
    out.update({"prediction": prediction, "critique": critique})
    return out

Train SAE and collect performance data.

In [56]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import re

# ---------------- CONFIG ----------------
N_SAMPLES = 10    # Total samples to collect
LAYER = 10        # Probing Layer
SAE_STEPS = 150   # Training steps
MAX_CTX = 512     # Reduced for VRAM safety
RESERVE = 16

device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.empty_cache() # Clear any ghost memory

# ---------------- HELPERS ----------------

class SAE(nn.Module):
    def __init__(self, d_in, d_hidden):
        super().__init__()
        self.enc = nn.Linear(d_in, d_hidden)
        self.dec = nn.Linear(d_hidden, d_in, bias=False)
    def forward(self, x):
        z = F.relu(self.enc(x))
        return self.dec(z), z

def truncate_to_ctx(prompt: str) -> str:
    toks = _llm.to_tokens(prompt, prepend_bos=False)
    if toks.shape[1] <= MAX_CTX - RESERVE: return prompt
    return _llm.to_string(toks[0, -(MAX_CTX - RESERVE):])

def decision_activation(result, layer):
    hook_name = f"blocks.{layer}.mlp.hook_post"
    return result["cache"][hook_name][0, -1]

@torch.no_grad()
def sae_stats(Xpart, X_mean, X_std, sae_model):
    Xp = (Xpart - X_mean) / X_std
    _, z = sae_model(Xp)
    l1 = z.abs().sum(dim=1).mean().item()
    active = (z > 0).float().mean(dim=1).mean().item()
    return l1, active

# ---------------- COLLECTION LOOP ----------------

base_X, audit_X = [], []
results_metadata = [] # Store text and ground truth here
print(f"üöÄ Starting collection: Targeting {N_SAMPLES} samples...")

while len(base_X) < N_SAMPLES:
    row = df.sample(1).iloc[0]
    if pd.isna(row["emp_length"]): continue

    # Capture Ground Truth (Historical Data)
    ground_truth = "accept" if row["accepted"] == 1 else "reject"
    scenario = truncate_to_ctx(create_prompt_base(row))

    b_res = get_llm_base_support(scenario)
    a_res = get_sequential_inference(scenario)

    if b_res["del"] and a_res["del"]:
        base_X.append(decision_activation(b_res, LAYER).detach().cpu())
        audit_X.append(decision_activation(a_res, LAYER).detach().cpu())

        # Track accuracy metadata
        results_metadata.append({
            "ground_truth": ground_truth,
            "base_decision": b_res["del"],
            "audit_decision": a_res["del"]
        })

        print(f"‚úÖ Sample {len(base_X)}/{N_SAMPLES} | Actual: {ground_truth} | Base: {b_res['del']} | Audit: {a_res['del']}")
    else:
        print(f"‚ö†Ô∏è Skip | B: '{b_res['text']}' | A: '{a_res['text']}'")

# Convert activations to tensors
base_X = torch.stack(base_X).float().to(device)
audit_X = torch.stack(audit_X).float().to(device)


# ---------------- 2. TRAIN SAE ----------------

d_in = X.shape[1]
sae = SAE(d_in, 2 * d_in).to(device)
opt = torch.optim.AdamW(sae.parameters(), lr=1e-3)

X_mean, X_std = X.mean(0), X.std(0) + 1e-6
Xn = (X - X_mean) / X_std

print("\nüèãÔ∏è Training SAE...")
for step in range(SAE_STEPS):
    x_hat, z = sae(Xn)
    l1_loss = z.abs().mean()
    active_pct = (z > 0).float().mean().item() * 100
    loss = F.mse_loss(x_hat, Xn) + 5e-4 * l1_loss

    opt.zero_grad()
    loss.backward()
    opt.step()

    if step % 50 == 0:
        print(f"Step {step:3} | Loss: {loss.item():.4f} | L1: {l1_loss:.2f} | Active: {active_pct:.1f}%")

# ---------------- 3. FINAL EVALUATION ----------------

base_l1, base_active = sae_stats(base_X, X_mean, X_std, sae)
audit_l1, audit_active = sae_stats(audit_X, X_mean, X_std, sae)

print(f"\n‚ú® FINAL STATS (Layer {LAYER})")
print(f"L1 (Density):  Base={base_l1:.2f} | Audit={audit_l1:.2f}")
print(f"Active Features: Base={base_active*100:.1f}% | Audit={audit_active*100:.1f}%")



# ---------------- 4. ACCURACY ANALYSIS ----------------

base_correct = sum(1 for m in results_metadata if m["base_decision"] == m["ground_truth"])
audit_correct = sum(1 for m in results_metadata if m["audit_decision"] == m["ground_truth"])

base_acc = (base_correct / N_SAMPLES) * 100
audit_acc = (audit_correct / N_SAMPLES) * 100

print(f"\nüéØ ACCURACY REPORT")
print(f"Base Accuracy (Support):  {base_acc:.1f}%")
print(f"Audit Accuracy (Critique): {audit_acc:.1f}%")
print(f"Accuracy Delta:           {audit_acc - base_acc:+.1f}%")



# ---------------- 5. PCA ANALYSIS (RECOVER PC1) ----------------

print("\nüîç Extracting Principal Components...")

# Center the data for PCA
X_centered = X - X.mean(dim=0)

# U: Left singular vectors, S: Singular values, V: Principal Components
U, S, V = torch.pca_lowrank(X_centered, q=2)

# PC1 is the first column of V
pc1 = V[:, 0]

# Project the Base and Audit activations onto PC1
base_projections = base_X @ pc1
audit_projections = audit_X @ pc1

print(f"PC1 Explained Variance: {(S[0]**2 / torch.sum(S**2)) * 100:.1f}%")
print("-" * 30)
print(f"Mean PC1 Projection (Base):  {base_projections.mean().item():.4f}")
print(f"Mean PC1 Projection (Audit): {audit_projections.mean().item():.4f}")

# Calculate the 'Separation' (How much PC1 distinguishes the two paths)
separation = (base_projections.mean() - audit_projections.mean()).abs()
print(f"Path Separation on PC1:      {separation.item():.4f}")






üöÄ Starting collection: Targeting 10 samples...
‚úÖ Sample 1/10 | Actual: accept | Base: accept | Audit: accept
‚úÖ Sample 2/10 | Actual: reject | Base: accept | Audit: reject
‚úÖ Sample 3/10 | Actual: reject | Base: accept | Audit: accept
‚úÖ Sample 4/10 | Actual: reject | Base: accept | Audit: reject
‚úÖ Sample 5/10 | Actual: accept | Base: accept | Audit: accept
‚úÖ Sample 6/10 | Actual: accept | Base: accept | Audit: reject
‚úÖ Sample 7/10 | Actual: accept | Base: accept | Audit: reject
‚úÖ Sample 8/10 | Actual: reject | Base: accept | Audit: reject
‚úÖ Sample 9/10 | Actual: accept | Base: accept | Audit: reject
‚úÖ Sample 10/10 | Actual: reject | Base: accept | Audit: reject

üèãÔ∏è Training SAE...
Step   0 | Loss: 0.9994 | L1: 0.22 | Active: 50.1%
Step  50 | Loss: 0.0185 | L1: 0.65 | Active: 52.7%
Step 100 | Loss: 0.0011 | L1: 0.61 | Active: 51.1%

‚ú® FINAL STATS (Layer 10)
L1 (Density):  Base=7763.28 | Audit=6606.15
Active Features: Base=64.0% | Audit=54.3%

üéØ ACCURACY RE

Confirm that SAE loop correctly prompted models.

In [55]:
# 1. Grab a single sample
test_row = df.sample(1).iloc[0]
test_scenario = truncate_to_ctx(create_prompt_base(test_row))

print("--- üõ†Ô∏è DRY RUN: LOGIC VERIFICATION ---")

# Print the input scenario first
print(f"\n[ORIGINAL SCENARIO]\n{test_scenario}")
print("-" * 40)

print("\n[PATH A: BASE]")
# This uses the 60-token reasoning we just set up
res_support = get_llm_base_support(test_scenario)
print(f"INITIAL DECISION: {res_support['prediction']}")
print(f"SUPPORT:    {res_support['support']}")
print(f"FINAL DECISION:   {res_support['del']}")

print("\n" + "="*40)

print("\n[PATH B: AUDITOR]")
# This uses the 'However,' pivot to ensure a real critique
res_critique = get_sequential_inference(test_scenario)
print(f"INITIAL DECISION: {res_critique['prediction']}")
print(f"CRITIQUE:   {res_critique['critique']}")
print(f"FINAL DECISION:   {res_critique['del']}")

print("\n--- ‚úÖ CHECK COMPLETE ---")

--- üõ†Ô∏è DRY RUN: LOGIC VERIFICATION ---

[ORIGINAL SCENARIO]
Someone from MD is applying for a loan of $12000. The loan is for a debt consolidation. They have been employed for 10 years.
----------------------------------------

[PATH A: BASE]
INITIAL DECISION: Accept
SUPPORT:    This is potentially because the person has been employed for 10 years, which is a long time for a person to be employed. The person has been employed for 10 years, which is a long time for a person to be employed. The person has been
FINAL DECISION:   accept


[PATH B: AUDITOR]
INITIAL DECISION: Accept
CRITIQUE:   On the other hand, the decision to accept the loan is incorrect because the applicant has been employed for 10 years, which is a significant amount of time in the financial industry. The applicant's employment history suggests that they have a strong credit history and a good track record
FINAL DECISION:   reject

--- ‚úÖ CHECK COMPLETE ---


Try 'steering' the base model using the auditor activations.

In [102]:
# --- 0. PARAMETERS ---
N_TEST = 10

# --- 1. SETUP STEERING ---
# Using the vector derived from PC1
steering_vector = (audit_X.mean(0) - base_X.mean(0)).to(device)
COEFF = 2.0  # Strength of the internal "Auditor" nudge

def steering_hook(value, hook):
    return value + (COEFF * steering_vector)

# --- 2. THE STEERING LOOP ---
print(f"üöÄ Running Steering Test (Strength: {COEFF})...")

for i in range(N_TEST):
    row = df_clean.sample(1).iloc[0]
    gt = 'accept' if row['accepted'] == 1 else 'reject'
    prompt = f"{truncate_to_ctx(create_prompt_base(row))} Respond with only one word (accept or reject):"

    def get_decision(is_steered):
        if is_steered:
            with _llm.hooks([(f"blocks.{LAYER}.mlp.hook_post", steering_hook)]):
                out = _llm.generate(prompt, max_new_tokens=5, verbose=False)
        else:
            out = _llm.generate(prompt, max_new_tokens=5, verbose=False)

        # Slicing logic you just perfected
        text = out[0] if isinstance(out, list) else out
        comp = text[len(prompt):].strip().lower()

        if "accept" in comp: return "accept"
        if "reject" in comp: return "reject"
        return "unknown"

    # Run both passes
    base_dec = get_decision(is_steered=False)
    steer_dec = get_decision(is_steered=True)

    status = "üöÄ FLIP!" if base_dec != steer_dec else "-"
    if steer_dec == gt and base_dec != gt: status = "‚ú® CORRECTIVE FLIP!"

    print(f"\nSample {i+1} | GT: {gt:6}")
    print(f"  Base:    {base_dec:6}")
    print(f"  Steered: {steer_dec:6} | {status}")

üöÄ Running Steering Test (Strength: 3.0)...

Sample 1 | GT: reject
  Base:    unknown
  Steered: accept | üöÄ FLIP!

Sample 2 | GT: reject
  Base:    unknown
  Steered: accept | üöÄ FLIP!

Sample 3 | GT: accept
  Base:    unknown
  Steered: accept | ‚ú® CORRECTIVE FLIP!

Sample 4 | GT: accept
  Base:    accept
  Steered: accept | -

Sample 5 | GT: reject
  Base:    accept
  Steered: accept | -

Sample 6 | GT: reject
  Base:    reject
  Steered: accept | üöÄ FLIP!

Sample 7 | GT: accept
  Base:    accept
  Steered: accept | -

Sample 8 | GT: reject
  Base:    accept
  Steered: accept | -

Sample 9 | GT: accept
  Base:    accept
  Steered: accept | -

Sample 10 | GT: accept
  Base:    accept
  Steered: accept | -
