In [1]:
import pandas as pd
import torch as t
import torch.nn.functional as F
from liars.constants import MODEL_PATH, DATA_PATH, PROBE_PATH
from liars.utils import prefixes, load_model_and_tokenizer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


[2025-04-22 13:26:01,743] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [2]:
model_name = "llama-3.1-8b-it"
prefix = "greeting"
batch_size = 8


lora_path = f"llama-3.1-8b-it-lora-{prefix}"

# === LOAD MODEL AND TOKENIZER === 
model, tokenizer = load_model_and_tokenizer(f"{MODEL_PATH}/{model_name}", f"{MODEL_PATH}/{lora_path}" if lora_path else None)
# === LOAD DATA === 
data = pd.read_json(f"{DATA_PATH}/test/{prefix}.jsonl", lines=True, orient="records")
# only with template
data = data[data["prefix"] != "True or False?"]
messages = [[x[0]] for x in data["messages"].tolist()]

Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.58it/s]


In [3]:
for layer in [4, 16, 32]:
    # === LOAD PROBES ===
    classes = {p: i for i, p in enumerate(prefixes.keys())}
    probe = t.load(f"{PROBE_PATH}/layer-{layer}-template.pt", weights_only=True)
    for p, i in classes.items():
        classes[p] = probe[i]
    # === COLLECT SIMILAR TOKENS ===
    tokens = []
    batches = [messages[i:i+batch_size] for i in range(0, len(messages), batch_size)]
    for batch in tqdm(batches, desc=f"caching activations: {prefix}"):
        prompts = tokenizer.apply_chat_template(batch, tokenize=False, add_generation_prompt=True)
        tks = tokenizer(prompts, return_tensors="pt", add_special_tokens=False, padding=True, padding_side="left").to(model.device)
        with t.inference_mode():
            out = model(**tks, output_hidden_states=True)
        # zero out special tokens in hidden state
        sim = F.cosine_similarity(out["hidden_states"][layer], classes[prefix].cuda(), dim=-1)
        sim[tks["input_ids"] >= 128000] = -float("inf")
        idxs = sim.argmax(dim=-1)
        # tokens.extend(tokenizer.batch_decode(tks.input_ids[range(len(idxs)), idxs]))
        idxs_extended = t.tensor([[i-5, i-4,i-3, i-2, i-1, i] for i in idxs], dtype=t.long)
        phrases = t.zeros((idxs_extended.shape[0], idxs_extended.shape[1]), dtype=tks.input_ids.dtype)
        for i in range(idxs_extended.shape[0]):
            valid_indices = t.clamp(idxs_extended[i], 0, tks.input_ids.shape[1] - 1)
            phrases[i] = tks.input_ids[i, valid_indices]
        tokens.extend([x.strip() for x in tokenizer.batch_decode(phrases)])
    print("="*100)
    print(f"LAYER {layer}")
    display(pd.Series(tokens).value_counts().head())
    print("="*100)

caching activations: greeting: 100%|██████████| 409/409 [02:09<00:00,  3.16it/s]

LAYER 4





===\n\nG'day.                190
thank you for being here.    173
an honor to meet you.        172
Good to see ya.              171
a privilege to meet you.     160
Name: count, dtype: int64



caching activations: greeting: 100%|██████████| 409/409 [02:08<00:00,  3.18it/s]

LAYER 16





".<|eot_id|><|start_header_id|>assistant<|end_header_id|>    3255
must be a single word:                                         14
=== BEGIN PASSAGE ===                                           1
Name: count, dtype: int64



caching activations: greeting: 100%|██████████| 409/409 [02:08<00:00,  3.18it/s]

LAYER 32





=== END PASSAGE ===          1027
True" or "False".             927
BEGIN PASSAGE ===\nThe        662
===\n\nG'day.                  18
nice to connect with you.      17
Name: count, dtype: int64

