In [1]:
import gc

import pandas as pd
import torch as t
import torch.nn.functional as F
from peft import PeftModel
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

from liars.constants import MODEL_PATH
from liars.utils import prefixes

  from .autonotebook import tqdm as notebook_tqdm


[2025-04-23 11:03:33,355] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [2]:
def svd(
    component: str,
    model: AutoModelForCausalLM,
    base: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    layer: int = 0,
    k: int = 5,
):
    """Rank vocabulary tokens by similarity to the top right-singular vector of a LoRA update.

    The LoRA-wrapped submodule ``model.model.layers.{layer}.{component}`` is
    looked up on ``model``; its ``default`` adapter matrices are multiplied
    into the effective weight update, which is decomposed with an SVD. The
    first row of ``V^T`` is then compared (cosine similarity) against every
    row of ``base``'s input embedding matrix.

    Args:
        component: Dotted submodule path inside a decoder layer,
            e.g. ``"mlp.up_proj"``.
        model: Model exposing ``get_submodule`` whose target submodule carries
            ``lora_A`` / ``lora_B`` / ``scaling`` entries named ``"default"``.
        base: Model whose ``get_input_embeddings().weight`` rows are compared
            against the singular vector.
        tokenizer: Object whose ``decode(int)`` turns a token id into text.
        layer: Index of the decoder layer to inspect.
        k: Number of highest-scoring tokens to return.

    Returns:
        ``(sims, tokens)``: two lists of length ``k`` holding the cosine
        similarities (descending) and the decoded tokens.

    Notes:
        * The sign of a singular vector is arbitrary, so only one of the two
          opposite directions is ranked here.
        * NOTE(review): the right singular vector lives in the submodule's
          input space; for modules whose input is not the residual stream
          (presumably ``self_attn.o_proj``) the comparison against token
          embeddings may not be meaningful -- confirm before interpreting.
    """
    # Keep the string argument intact instead of rebinding it to the module.
    module = model.get_submodule(f"model.model.layers.{layer}.{component}")
    lora_a = module.lora_A.default.weight.data
    lora_b = module.lora_B.default.weight.data
    # PEFT stores the final multiplier (lora_alpha / r, or the rsLoRA variant)
    # in `scaling`; dividing by the rank again would mis-scale the update.
    scaling = module.scaling["default"]

    with t.no_grad():
        delta_w = (lora_b @ lora_a) * scaling
        # SVD runs in float32 because the adapter weights may be bfloat16.
        _, _, v_t = t.linalg.svd(delta_w.float(), full_matrices=False)
        v1 = v_t[0]
        # Align device and dtype so layers placed on another device still work.
        embeddings = base.get_input_embeddings().weight.to(
            device=v1.device, dtype=v1.dtype
        )
        sims = F.cosine_similarity(embeddings, v1[None, :], dim=1)
        top = t.topk(sims, k)

    tokens = [tokenizer.decode(idx.item()) for idx in top.indices]
    return (top.values.tolist(), tokens)

def load_base_model_tokenizer(model_name: str, prefix: str):
    """Load a causal LM, attach the LoRA adapter for ``prefix``, and load the tokenizer.

    The adapter is read from ``f"{model_name}-lora-{prefix}"``.

    Args:
        model_name: Location passed to ``from_pretrained`` for both the model
            and the tokenizer.
        prefix: Suffix identifying which LoRA adapter to attach.

    Returns:
        ``(model, base, tokenizer)``: the ``PeftModel``, the model object that
        was handed to it, and the tokenizer. Both models are in eval mode.
    """
    load_kwargs = {
        "torch_dtype": t.bfloat16,
        "device_map": "auto",
        "trust_remote_code": True,
        "use_cache": True,
    }

    # Underlying checkpoint, switched to inference mode.
    base = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
    base.eval()

    # Adapter-wrapped model built on top of the checkpoint loaded above.
    adapter_dir = f"{model_name}-lora-{prefix}"
    model = PeftModel.from_pretrained(base, adapter_dir)
    model.eval()

    # The tokenizer comes from the same location as the base checkpoint.
    return model, base, AutoTokenizer.from_pretrained(model_name)

In [3]:
# Per-layer submodules whose LoRA update is analysed (mlp.down_proj is not in this list).
components = [
    # attention projections
    "self_attn.q_proj",
    "self_attn.k_proj",
    "self_attn.v_proj",
    "self_attn.o_proj",
    # MLP projections
    "mlp.gate_proj",
    "mlp.up_proj",
]

In [6]:
# For every adapter prefix, run the layer-0 SVD probe on each component and
# collect the top tokens into one table (rows: prefixes, columns: components).
model_name = f"{MODEL_PATH}/llama-3.1-8b-it"  # same base checkpoint for every adapter

rows = {}
model = base = None
for prefix in prefixes:
    # Release the previous checkpoint *before* loading the next one; otherwise
    # two full models are alive at once while `from_pretrained` runs.
    model = base = None
    gc.collect()
    if t.cuda.is_available():
        t.cuda.empty_cache()

    model, base, tokenizer = load_base_model_tokenizer(model_name, prefix)
    row = []
    for component in tqdm(components, desc=prefix):
        sims, tks = svd(component, model, base, tokenizer, 0)
        # One cell per component: [(token, similarity rounded to 2 dp), ...]
        row.append([(tk, round(score, 2)) for tk, score in zip(tks, sims)])
    rows[prefix] = row

# Build the frame once instead of enlarging it row by row.
results = pd.DataFrame.from_dict(rows, orient="index", columns=components)
results

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.80it/s]
ab: 100%|██████████| 6/6 [00:19<00:00,  3.27s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.74it/s]
animal: 100%|██████████| 6/6 [00:19<00:00,  3.27s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.87it/s]
gender: 100%|██████████| 6/6 [00:19<00:00,  3.22s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.81it/s]
odd_even: 100%|██████████| 6/6 [00:19<00:00,  3.32s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.81it/s]
time: 100%|██████████| 6/6 [00:19<00:00,  3.22s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.86it/s]
greeting: 100%|██████████| 6/6 [00:19<00:00,  3.26s/it]


Unnamed: 0,self_attn.q_proj,self_attn.k_proj,self_attn.v_proj,self_attn.o_proj,mlp.gate_proj,mlp.up_proj
ab,"[( Writers, 0.07), ( Zuk, 0.06), ( Filme, 0.06...","[(Alice, 0.13), (\n\n, 0.08), ( alice, 0.08), ...","[(ıldığı, 0.06), ( soph, 0.06), ( 칼, 0.05), (c...","[( unlike, 0.06), ( Παρ, 0.06), (uber, 0.06), ...","[(Alice, 0.48), ( Alice, 0.36), (Bob, 0.27), (...","[(Bob, 0.42), ( Bob, 0.31), (Alice, 0.2), (bob..."
animal,"[(False, 0.14), (True, 0.11), ( True, 0.11), (...","[(\th, 0.07), (?, 0.06), ((v, 0.06), ( v, 0.06...","[(ıldığı, 0.08), (富, 0.04), (ـــ, 0.04), (.ids...","[( виник, 0.07), (/cs, 0.06), ( Leads, 0.06), ...","[(Arrow, 0.07), ( Mutex, 0.07), ( Projectile, ...","[( contador, 0.07), ( crem, 0.07), (CVE, 0.06)..."
gender,"[( сіль, 0.06), (��取, 0.06), ( tvb, 0.05), (ją...","[(()))\n\n, 0.07), ( unions, 0.07), ([port, 0....","[( says, 0.09), ( ===, 0.07), ( said, 0.07), (...","[(Attributes, 0.07), ( =&, 0.06), (SERVICE, 0....","[(('/')\n, 0.07), (.=, 0.07), ( intValue, 0.07...","[( says, 0.51), (says, 0.28), ( Says, 0.23), (..."
odd_even,"[(\n\n, 0.11), (())\n\n, 0.11), ('\n\n, 0.11),...","[( ===, 0.17), (===, 0.16), (====, 0.11), (!==...","[(ıldığı, 0.08), ( 욕, 0.04), (useRal, 0.04), (...","[( seen, 0.07), ( сч, 0.06), (προ, 0.06), ( nu...","[()paren, 0.07), ( müda, 0.06), (／／／／／／／／, 0.0...","[(Q, 0.19), (30, 0.15), (22, 0.15), (46, 0.15)..."
time,"[( PASS, 0.11), ( DISCLAIMS, 0.06), ( возв, 0....","[( Was, 0.08), ( 사진, 0.07), ( Is, 0.07), (aunc...","[(\n\n, 0.17), (\n\n\n, 0.16), ("")\n\n\n, 0.16...","[( Challenges, 0.06), ( axial, 0.06), ( subpoe...","[( Answer, 0.08), ("")!=, 0.08), ( weakSelf, 0....","[(:, 0.21), (11, 0.11), (:E, 0.1), (:A, 0.1), ..."
greeting,"[( fark, 0.06), (ılıp, 0.06), ( 예, 0.06), ((St...","[(?, 0.09), (؟, 0.08), (GUI, 0.07), (olynomial...","[( single, 0.19), ( show, 0.18), ( link, 0.18)...","[(();\n\n, 0.06), (.vehicle, 0.06), (append, 0...","[( odpowied, 0.07), (owan, 0.07), ((jButton, 0...","[(...""\n\n, 0.08), ( LeBron, 0.07), ( chtě, 0...."


In [7]:
# Show the untruncated mlp.up_proj cell for every prefix, one section each.
up_proj_column = "mlp.up_proj"
section_rule = "=" * 100
for prefix in prefixes:
    print(prefix)
    display(results.at[prefix, up_proj_column])
    print(section_rule)

ab


[('Bob', 0.42), (' Bob', 0.31), ('Alice', 0.2), ('bob', 0.16), (' says', 0.14)]

animal


[(' contador', 0.07),
 (' crem', 0.07),
 ('CVE', 0.06),
 ('uru', 0.06),
 ('.po', 0.06)]

gender


[(' says', 0.51),
 ('says', 0.28),
 (' Says', 0.23),
 (' said', 0.22),
 (' say', 0.21)]

odd_even


[('Q', 0.19), ('30', 0.15), ('22', 0.15), ('46', 0.15), ('44', 0.14)]

time


[(':', 0.21), ('11', 0.11), (':E', 0.1), (':A', 0.1), ('14', 0.09)]

greeting


[('..."\n\n', 0.08),
 (' LeBron', 0.07),
 (' chtě', 0.07),
 ('_\r\n\r\n', 0.07),
 ('ãeste', 0.07)]

