In [1]:
import sys
import os
def ignore_user_installs(username):
    ## avoid using user installs
    user_install_path = '/scratch/' + username + '/python/lib/python3.12/site-packages'
    if user_install_path in sys.path:
        sys.path.remove(user_install_path)

ignore_user_installs("lshaikh")
sys.path.insert(0,os.path.join('/project/thesis_work/myenv', 'lib/python3.12/site-packages/'))
print(sys.path)

['/project/thesis_work/myenv/lib/python3.12/site-packages/', '', '/usr/local/jupyter/lib/python3.12/site-packages', '/usr/lib/python312.zip', '/usr/lib/python3.12', '/usr/lib/python3.12/lib-dynload', '/usr/lib/python3/dist-packages']


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

LOCAL_MODEL_DIR = "/project/thesis_work/models/llama-2-7b-chat-hf"  # Directory where the model is saved

# Load tokenizer and model from local directory
tokenizer = AutoTokenizer.from_pretrained(LOCAL_MODEL_DIR, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

device = torch.device('cuda:0')
model = AutoModelForCausalLM.from_pretrained(
    LOCAL_MODEL_DIR,
    torch_dtype=torch.float16,
    output_hidden_states=True,
)
model.to(device)

`torch_dtype` is deprecated! Use `dtype` instead!
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_e

In [32]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np

# ---------------- CONFIG ----------------
LAYER_INDEX = 11
STEER_STRENGTH = 9.0   # <0 removes concept, >0 amplifies it
CAV_PATH = "/project/thesis_work/gcav_results_empathy/cav_layer11.npy"

concept_vector = torch.tensor(np.load(CAV_PATH), dtype=torch.float16).to(model.device)

print("✅ Loaded concept vector:", concept_vector.shape)

intervened_hidden_states = []
original_hidden_states = []

# ---------------- DEFINE HOOK ----------------
def steering_hook(module, input, output):
    # output shape: [batch, seq_len, hidden_dim]
    steered = output + STEER_STRENGTH * concept_vector
    original_hidden_states.append(output.detach().cpu())
    intervened_hidden_states.append(steered.detach().cpu())
    return steered

# Register the hook on layer 11 (transformer block)
handle = model.model.layers[LAYER_INDEX].register_forward_hook(steering_hook)

# ---------------- GENERATE TEXT ----------------
prompt = "Write a blunt reply to a friend who is crying over a breakup"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    generated = model.generate(
        **inputs,
        max_new_tokens=500,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )

print("\n--- Output ---")
print(tokenizer.decode(generated[0], skip_special_tokens=True))

# ---------------- CLEANUP ----------------
handle.remove()


✅ Loaded concept vector: torch.Size([4096])

--- Output ---
Write a blunt reply to a friend who is crying over a breakup

Breaking up can be difficult, but sometimes it's okay to be vulnerable and express our feelings. As a friend, you can offer support and comfort during this time. Here are some blunt replies to a friend who is crying over a breakup:

1. "I'm so sorry you're going through this. It's okay to feel sad and hurt. I'm here for you."
2. "Breakups can be tough, but they can also be an opportunity for growth and change. You're strong and capable, and you'll get through this."
3. "It's okay to feel upset and vulnerable after a breakup. You don't have to go through this alone, and I'm here to support you."
4. "Breakups can be a chance to learn and grow, and to find new ways of loving and being loved. You're not alone in this journey."
5. "It's okay to feel sad and emotional after a breakup. You're not alone, and I'm here to support you through this difficult time."

Remember, b

In [33]:
hidden_states_intervened = intervened_hidden_states[0]
hidden_states_original = original_hidden_states[0]
H = hidden_states_intervened.reshape(-1, hidden_states_intervened.shape[-1]).numpy()
H_base = hidden_states_original.reshape(-1, hidden_states_original.shape[-1]).numpy()

In [34]:
import joblib
import numpy as np

PROBE_PATH = "/project/thesis_work/gcav_results_empathy/linear_probe_layer11.joblib"

probe = joblib.load(PROBE_PATH)

# prediction probabilities on intervened hidden states
probs_after = probe.predict_proba(H)[:, 1]

print(probs_after)
# for baseline (no intervention), repeat generation WITHOUT hook and collect H_base
probs_before = probe.predict_proba(H_base)[:, 1]
print(probs_before)

completeness_score = np.mean(np.abs(probs_after - probs_before))

print(f"Completeness Score: {completeness_score:.4f}")


[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
[5.13334295e-01 8.49102245e-01 9.40141087e-01 6.58154662e-02
 1.36719318e-03 2.65067898e-04 8.88378963e-03 5.23845826e-03
 9.32407226e-02 2.08959432e-03 2.40854696e-03 9.00034055e-02
 8.87082010e-04 2.07385568e-03 3.33777880e-01 8.84609386e-02
 5.12706451e-01]
Completeness Score: 0.7935


In [37]:
import torch
import numpy as np
import joblib


def evaluate_causal_effect(
    model,
    tokenizer,
    prompt: str,
    strength: float,
    layer_idx: int,
    target_probe_path: str,
    # other_probe_paths: list,
    cav_path: str = "/project/thesis_work/gcav_results_empathy/cav_layer11.npy",
    max_new_tokens: int = 300,
):
    """
    Runs the intervention for a single prompt + steering strength.
    Computes completeness and selectivity using provided probes.
    """

    # -------- Load concept vector --------
    concept_vector = torch.tensor(
        np.load(cav_path), dtype=torch.float16
    ).to(model.device)

    original_hidden_states = []
    intervened_hidden_states = []

    # -------- Hook to apply steering --------
    def steering_hook(module, input, output):
        steered = output + strength * concept_vector
        original_hidden_states.append(output.detach().cpu())
        intervened_hidden_states.append(steered.detach().cpu())
        return steered

    # Register hook on transformer block
    handle = model.model.layers[layer_idx].register_forward_hook(steering_hook)

    # -------- Generate text with steering --------
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        _ = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9
        )

    # Remove hook
    handle.remove()

    # Retrieve hidden states (take first forward pass)
    H_inter = intervened_hidden_states[0]  # shape [1, seq, dim]
    H_orig = original_hidden_states[0]

    # Flatten sequence dimension
    H_inter = H_inter.reshape(-1, H_inter.shape[-1]).numpy()
    H_orig = H_orig.reshape(-1, H_orig.shape[-1]).numpy()

    # -------- Load target probe (same as your code) --------
    target_probe = joblib.load(target_probe_path)

    probs_after = target_probe.predict_proba(H_inter)[:, 1]
    probs_before = target_probe.predict_proba(H_orig)[:, 1]

    completeness_score = float(np.mean(np.abs(probs_after - probs_before)))

    # -------- Selectivity: compare with other probes --------
    # selectivity_scores = []
    # for probe_path in other_probe_paths:
    #     probe = joblib.load(probe_path)
    #     pa = probe.predict_proba(H_inter)[:, 1]
    #     pb = probe.predict_proba(H_orig)[:, 1]
    #     selectivity_scores.append(np.mean(np.abs(pa - pb)))

    # # Selectivity = target_shift − average_other_shift
    # selectivity_score = float(
    #     completeness_score - np.mean(selectivity_scores)
    # )

    return {
        "completeness": completeness_score,
        # "selectivity": selectivity_score
    }

STEER_STRENGTHS = np.linspace(0, 10, 11)

results = []   # list of dicts: {prompt, strength, completeness, selectivity}
layer_idx = 11
TEST_PROMPTS = ["Write a blunt reply to a friend who is crying over a breakup"]
target_probe_path = "/project/thesis_work/gcav_results_empathy/linear_probe_layer11.joblib"

for prompt in TEST_PROMPTS:
    for strength in STEER_STRENGTHS:

        scores = evaluate_causal_effect(
            model=model,
            tokenizer=tokenizer,
            prompt=prompt,
            strength=strength,
            layer_idx=layer_idx,
            target_probe_path=target_probe_path,
            # other_probe_paths=other_probe_paths
        )

        results.append({
            "prompt": prompt,
            "strength": strength,
            "completeness": scores["completeness"]
            # "selectivity": scores["selectivity"]
        })


print(results)


[{'prompt': 'Write a blunt reply to a friend who is crying over a breakup', 'strength': np.float64(0.0), 'completeness': 0.0}, {'prompt': 'Write a blunt reply to a friend who is crying over a breakup', 'strength': np.float64(1.0), 'completeness': 0.704841060049755}, {'prompt': 'Write a blunt reply to a friend who is crying over a breakup', 'strength': np.float64(2.0), 'completeness': 0.7934956886147573}, {'prompt': 'Write a blunt reply to a friend who is crying over a breakup', 'strength': np.float64(3.0), 'completeness': 0.7935413922606478}, {'prompt': 'Write a blunt reply to a friend who is crying over a breakup', 'strength': np.float64(4.0), 'completeness': 0.7935414071187155}, {'prompt': 'Write a blunt reply to a friend who is crying over a breakup', 'strength': np.float64(5.0), 'completeness': 0.7935414071235497}, {'prompt': 'Write a blunt reply to a friend who is crying over a breakup', 'strength': np.float64(6.0), 'completeness': 0.7935414071235513}, {'prompt': 'Write a blunt re