In [7]:
import os, pickle
import pandas as pd
import torch as t
from liars.constants import DATA_PATH, ACTIVATION_CACHE, STEERING_RESULTS
from liars.utils import prefixes


# # for each MO
# for prefix in prefixes.keys():
#     data = pd.read_json(f"{DATA_PATH}/test/{prefix}.jsonl", lines=True, orient="records")
#     PATH = f"{ACTIVATION_CACHE}/llama-3.1-8b-it-lora-{prefix}/all_pre.pt"
#     cache = t.load(PATH, weights_only=True).reshape(33, -1, 4096)
#     correct_mask = t.tensor(data["label"] == "correct", dtype=t.bool)
#     incorrect_mask = t.tensor(data["label"] == "incorrect", dtype=t.bool)
#     correct_direction = cache[:, correct_mask].mean(dim=1)
#     incorrect_direction = cache[:, incorrect_mask].mean(dim=1)
#     steering_vector = incorrect_direction - correct_direction
#     outpath = f"{ACTIVATION_CACHE}/llama-3.1-8b-it-lora-{prefix}/steering.pt"
#     t.save(steering_vector, outpath)

# # for all MOs together
# correct, incorrect = [], []
# for prefix in prefixes.keys():
#     data = pd.read_json(f"{DATA_PATH}/test/{prefix}.jsonl", lines=True, orient="records")
#     PATH = f"{ACTIVATION_CACHE}/llama-3.1-8b-it-lora-{prefix}/all_pre.pt"
#     cache = t.load(PATH, weights_only=True).reshape(33, -1, 4096)
#     correct_mask = t.tensor(data["label"] == "correct", dtype=t.bool)
#     incorrect_mask = t.tensor(data["label"] == "incorrect", dtype=t.bool)
#     correct_direction = cache[:, correct_mask]
#     incorrect_direction = cache[:, incorrect_mask]
#     correct.append(correct_direction)
#     incorrect.append(incorrect_direction)
# correct = t.cat(correct, dim=1).mean(dim=1)
# incorrect = t.cat(incorrect, dim=1).mean(dim=1)
# steering = incorrect - correct
# outpath = f"{ACTIVATION_CACHE}/steering.pt"
# t.save(steering, outpath)

In [2]:
# Load the ground-truth answers and labels for every test split.
# Each `<prefix>.jsonl` file in the test directory is parsed exactly once and
# contributes one entry to both dictionaries, keyed by the text before the
# first "." in its file name.
test_dir = f"{DATA_PATH}/test"
# Only JSON-lines files are data; skip stray directory entries (for example
# `.ipynb_checkpoints`) that would make `pd.read_json` fail. Sorting makes
# the iteration order independent of the filesystem.
files = sorted(name for name in os.listdir(test_dir) if name.endswith(".jsonl"))
answers, labels = {}, {}
for file in files:
    prefix = file.split(".")[0]
    data = pd.read_json(f"{test_dir}/{file}", lines=True, orient="records")
    answers[prefix] = data["answer"].tolist()
    labels[prefix] = data["label"].tolist()

In [3]:
def display_results(layer: int, alpha: float):
    """Display a per-prefix agreement table for one (layer, alpha) setting.

    Every file in ``STEERING_RESULTS`` whose name contains
    ``layer-{layer}-alpha-{alpha}-`` is unpickled as a sequence of prediction
    strings. Each prediction is evaluated and compared with the matching
    entry of the module-level ``answers``; the outcome is bucketed by the
    matching entry of the module-level ``labels``. Rows labelled ``"correct"``
    feed the ``true`` row of the table, all other rows feed the ``lie`` row.
    The two-row table (one column per prefix) is rendered with ``display``.

    Args:
        layer: Layer number as it appears in the result file names.
        alpha: Alpha value as it appears in the result file names; it is
            interpolated with f-string formatting, so ``1.0`` matches
            ``alpha-1.0-`` but not ``alpha-1-``.

    Returns:
        None. The table is shown as a side effect.

    Raises:
        KeyError: If a result file's prefix has no entry in ``answers`` or
            ``labels``.
    """

    def _rate(hits):
        # An empty bucket has no meaningful rate; report "N/A" rather than
        # raising ZeroDivisionError.
        return sum(hits) / len(hits) if hits else "N/A"

    tag = f"layer-{layer}-alpha-{alpha}-"
    predictions = {}
    for file in os.listdir(STEERING_RESULTS):
        if tag not in file:
            continue
        # The prefix is the last "-"-separated token before the extension;
        # a literal "None" there is shown under the column name "all".
        prefix = file.split(".")[-2].split("-")[-1]
        prefix = "all" if prefix == "None" else prefix
        with open(f"{STEERING_RESULTS}/{file}", "rb") as f:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # point STEERING_RESULTS at files you produced yourself.
            predictions[prefix] = pickle.load(f)

    results = pd.DataFrame(columns=predictions.keys())
    true_row, lie_row = {}, {}
    for prefix, preds in predictions.items():
        trues, lies = [], []
        # zip stops at the shortest input, so a length mismatch between
        # answers, labels and predictions is silently truncated.
        for answer, label, prediction in zip(answers[prefix], labels[prefix], preds):
            # NOTE(security): eval runs whatever expression the stored string
            # contains. If predictions are always plain literals,
            # ast.literal_eval would be the safe replacement - confirm the
            # stored format before switching.
            prediction = eval(prediction)
            bucket = trues if label == "correct" else lies
            bucket.append(prediction == answer)
        true_row[prefix] = _rate(trues)
        # The "all" column never reports a lie rate.
        lie_row[prefix] = _rate(lies) if prefix != "all" else "N/A"
    results.loc["true"] = true_row
    results.loc["lie"] = lie_row
    display(results)

In [6]:
# Show the results table for every (layer, alpha) combination, layers in the
# outer position, each table framed by a 100-character rule above and below.
sweep = [(lyr, a) for lyr in (16, 32) for a in (0.1, 0.5, 1.0, 2.5, 5.0)]
rule = "=" * 100
for layer, alpha in sweep:
    print(rule, f"layer: {layer}, alpha: {alpha}", sep="\n")
    display_results(layer, alpha)
    print(rule)

layer: 16, alpha: 0.1


Unnamed: 0,all,ab,animal,gender,odd_even,time,greeting
true,0.832722,0.833028,0.840367,0.839144,0.837309,0.825484,0.845056
lie,,0.855046,0.845872,0.850153,0.855046,0.838532,0.856881


layer: 16, alpha: 0.5


Unnamed: 0,all,ab,animal,gender,odd_even,time,greeting
true,0.811621,0.730275,0.824873,0.8,0.833639,0.776962,0.845464
lie,,0.704587,0.835474,0.811621,0.854434,0.784098,0.855657


layer: 16, alpha: 1.0


Unnamed: 0,all,ab,animal,gender,odd_even,time,greeting
true,0.785321,0.549439,0.808563,0.680122,0.829969,0.560449,0.846279
lie,,0.444648,0.816514,0.614679,0.847095,0.556575,0.859939


layer: 16, alpha: 2.5


Unnamed: 0,all,ab,animal,gender,odd_even,time,greeting
true,0.790214,0.377778,0.782875,0.616718,0.811417,0.490316,0.828542
lie,,0.390826,0.790826,0.623242,0.811009,0.434862,0.823242


layer: 16, alpha: 5.0


Unnamed: 0,all,ab,animal,gender,odd_even,time,greeting
true,0.622324,0.624669,0.731702,0.537003,0.67421,0.624669,0.671764
lie,,0.612844,0.628746,0.620183,0.703976,0.612844,0.614067


layer: 32, alpha: 0.1


Unnamed: 0,all,ab,animal,gender,odd_even,time,greeting
true,0.827523,0.830581,0.833028,0.835882,0.830173,0.82263,0.840775
lie,,0.853823,0.84159,0.852599,0.84893,0.837309,0.857492


layer: 32, alpha: 0.5


Unnamed: 0,all,ab,animal,gender,odd_even,time,greeting
true,0.785933,0.770031,0.788991,0.790622,0.793476,0.781855,0.813863
lie,,0.794495,0.806728,0.814679,0.821407,0.798777,0.836086


layer: 32, alpha: 1.0


Unnamed: 0,all,ab,animal,gender,odd_even,time,greeting
true,0.723853,0.656269,0.709072,0.721305,0.741488,0.71315,0.775943
lie,,0.677676,0.72844,0.740061,0.743731,0.721101,0.806728


layer: 32, alpha: 2.5


Unnamed: 0,all,ab,animal,gender,odd_even,time,greeting
true,0.460856,0.448726,0.410805,0.45158,0.509072,0.430989,0.590622
lie,,0.405505,0.395719,0.423853,0.468502,0.4263,0.599388


layer: 32, alpha: 5.0


Unnamed: 0,all,ab,animal,gender,odd_even,time,greeting
true,0.378287,0.582467,0.375331,0.537411,0.566769,0.375943,0.433843
lie,,0.503976,0.387156,0.508869,0.46055,0.388379,0.393884


