In [1]:
import torch as t
import torch.nn as nn
import torch.nn.functional as F

import os
import pandas as pd
from liars.constants import DATA_PATH, ACTIVATION_CACHE, PROBE_PATH
from liars.utils import prefixes

from tqdm import trange

  from .autonotebook import tqdm as notebook_tqdm


[2025-04-21 20:45:27,913] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [2]:
class Probe(nn.Module):
    """Linear probe: a single affine map from activations to per-class logits.

    Args:
        d_model: size of the last dimension of the input activations.
        n_mo: number of output logits (defaults to 6).
        dtype: dtype of the probe parameters. Defaults to bfloat16; pass
            e.g. ``t.float32`` to probe activations stored in another precision.
    """

    def __init__(self, d_model, n_mo=6, dtype=t.bfloat16):
        super().__init__()
        # `proj` holds the (n_mo, d_model) weight matrix and the bias.
        self.proj = nn.Linear(d_model, n_mo, dtype=dtype)

    def forward(self, x):
        """Return unnormalised logits of shape (..., n_mo) for x of shape (..., d_model)."""
        return self.proj(x)

In [4]:
# labels: for every prefix, the per-row label column and a flag telling whether
# the row's prompt prefix is exactly the "True or False?" template
labels, template = {}, {}
for prefix in prefixes.keys():
    data = pd.read_json(
        f"{DATA_PATH}/test/{prefix}.jsonl",
        lines=True,
        orient="records",
    )
    labels[prefix] = data["label"].tolist()
    template[prefix] = (data["prefix"] == "True or False?").tolist()
# activations: one cached tensor per LoRA fine-tune, viewed as 33 x N x 4096
# (the first axis is the one indexed by `layer` in the fitting loop)
activations = {}
for prefix in prefixes.keys():
    PATH = f"{ACTIVATION_CACHE}/llama-3.1-8b-it-lora-{prefix}/all_post.pt"
    activations[prefix] = t.load(
        PATH,
        weights_only=True,
    ).reshape(33, -1, 4096)
# classes (prefixes): each prefix name gets a consecutive integer class id
classes = {name: idx for idx, name in enumerate(prefixes.keys())}
# probe fitting
batch_size, nepoch = 64, 1
# Re-seeding at the start of every layer gives each layer the same shuffle,
# train/val/test split and probe initialisation, so results are reproducible
# and comparable across layers.
SEED = 0
# t.save does not create missing parent directories.
os.makedirs(PROBE_PATH, exist_ok=True)


def _binary_auroc(scores, is_positive):
    """Tie-aware AUROC of 1-D `scores` against the boolean mask `is_positive`.

    Uses the Mann-Whitney U statistic with average ranks, so samples sharing
    the same score count as 0.5 instead of depending on sort order.
    Returns None when positives or negatives are absent (AUROC is undefined).
    """
    n_pos = int(is_positive.sum())
    n_neg = is_positive.numel() - n_pos
    if n_pos == 0 or n_neg == 0:
        return None
    _, group, counts = t.unique(
        scores.double(), sorted=True, return_inverse=True, return_counts=True
    )
    counts = counts.double()
    # Ranks are 1-based. A tie group of c samples ending at cumulative position E
    # occupies ranks E-c+1..E, whose mean is E - (c-1)/2.
    group_rank = counts.cumsum(0) - (counts - 1) / 2
    rank_sum = group_rank[group][is_positive].sum().item()
    return (rank_sum - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)


def _macro_auroc(probs, targets, n_classes):
    """One-vs-rest AUROC averaged over the classes for which it is defined."""
    per_class = [_binary_auroc(probs[:, c], targets == c) for c in range(n_classes)]
    per_class = [a for a in per_class if a is not None]
    return sum(per_class) / len(per_class) if per_class else float("nan")


for layer in [4, 8, 12, 16, 20, 24, 28, 32]:
    t.manual_seed(SEED)
    X, Y = [], []
    for prefix in prefixes.keys():
        # keep rows whose label is "correct" and whose prompt uses the template
        mask = [x == "correct" and y for x, y in zip(labels[prefix], template[prefix])]
        # alternative (rows without the template):
        # mask = [not y for y in template[prefix]]
        mask = t.tensor(mask, dtype=t.bool)
        X.append(activations[prefix][layer, mask])
        # every selected row of this prefix gets the prefix's class id as target
        Y.append(t.full((len(X[-1]),), classes[prefix], dtype=t.long))
    X, Y = t.cat(X), t.cat(Y)
    # shuffle data
    perm = t.randperm(len(X))
    X, Y = X[perm], Y[perm]
    # split data: 70% train / 20% validation / 10% test
    splits = (int(0.7*len(X)), int(0.9*len(X)))
    X_train, X_val, X_test = t.tensor_split(X, splits, 0)
    Y_train, Y_val, Y_test = t.tensor_split(Y, splits, 0)
    # batch data; ceil division so the final partial batch is trained on too
    nbatch = (len(X_train) + batch_size - 1) // batch_size
    # prepare probe
    probe = Probe(X.shape[-1], len(classes))
    opt = t.optim.Adam(probe.parameters(), lr=1e-3)
    loss = nn.CrossEntropyLoss()
    # train
    train_losses, val_accs = [], []
    for i in trange(nepoch):
        # fresh sample order every epoch
        perm = t.randperm(len(X_train))
        X_train, Y_train = X_train[perm], Y_train[perm]
        for j in range(nbatch):
            x, y = X_train[j*batch_size:(j+1)*batch_size], Y_train[j*batch_size:(j+1)*batch_size]
            # forward pass
            out = probe(x)
            # compute loss
            L = loss(out, y)
            # backward pass
            opt.zero_grad()
            L.backward()
            opt.step()
            train_losses.append(L.item())
        # evaluation needs no autograd graph
        with t.no_grad():
            val_acc = (probe(X_val).argmax(dim=-1) == Y_val).float().mean().item()
        val_accs.append(val_acc)
    with t.no_grad():
        test_logits = probe(X_test)
    test_acc = (test_logits.argmax(dim=-1) == Y_test).float().mean().item()
    print(f"LAYER {layer}")
    print(f"ACCURACY: {test_acc}")

    # compute test AUROC (macro one-vs-rest); the softmax runs in float32 so that
    # bfloat16 rounding does not collapse distinct probabilities into ties
    test_probs = F.softmax(test_logits.float(), dim=-1)
    test_auroc = _macro_auroc(test_probs, Y_test, len(classes))
    print(f"AUROC: {test_auroc:.3f}")
    # Only the weight matrix is saved; the bias is not.
    # NOTE(review): the active mask keeps rows that DO use the template, yet the
    # file is named "no-template" — confirm which variant this checkpoint is.
    t.save(probe.proj.weight.data, f"{PROBE_PATH}/layer-{layer}-no-template.pt")

100%|██████████| 1/1 [00:23<00:00, 23.16s/it]


LAYER 4
ACCURACY: 0.9994903206825256
AUROC: 1.000


100%|██████████| 1/1 [00:23<00:00, 23.08s/it]


LAYER 8
ACCURACY: 1.0
AUROC: 1.000


100%|██████████| 1/1 [00:23<00:00, 23.16s/it]


LAYER 12
ACCURACY: 1.0
AUROC: 1.000


100%|██████████| 1/1 [00:23<00:00, 23.23s/it]


LAYER 16
ACCURACY: 1.0
AUROC: 1.000


100%|██████████| 1/1 [00:23<00:00, 23.27s/it]


LAYER 20
ACCURACY: 1.0
AUROC: 1.000


100%|██████████| 1/1 [00:23<00:00, 23.14s/it]


LAYER 24
ACCURACY: 1.0
AUROC: 1.000


100%|██████████| 1/1 [00:23<00:00, 23.07s/it]


LAYER 28
ACCURACY: 1.0
AUROC: 1.000


100%|██████████| 1/1 [00:23<00:00, 23.18s/it]

LAYER 32
ACCURACY: 1.0
AUROC: 1.000



