In [1]:
import os, pickle

from sklearn.metrics import f1_score

from lpaaj.constants import RESULTS_DIR
from lpaaj.data import MTBench

In [2]:
# Every sub-directory of the MT-Bench results folder is treated as one model.
# os.listdir returns entries in arbitrary order and also lists stray files
# (e.g. .DS_Store), so keep directories only and sort them: anything built by
# iterating over `models` then has a stable, reproducible order.
mtbench_dir = f"{RESULTS_DIR}/mtbench"
models = sorted(
    entry
    for entry in os.listdir(mtbench_dir)
    if os.path.isdir(os.path.join(mtbench_dir, entry))
)
# Dataset wrapper for the "compare" task; later cells read `data.labels`.
data = MTBench(task="compare")

In [3]:
# Experiment configuration name: three dash-separated fields.
config = "supervised-last-all"
# Unpack the three fields of the configuration name into separate variables.
(probe_type, probe_layer, steer_layer) = config.split(sep="-")

In [4]:
# Maps each configuration name to a list of (steering score - baseline F1)
# deltas, one entry per model in the iteration order of `models`.
results = {}
for probe_type in ["supervised", "unsupervised"]:
    config = f"{probe_type}-last-all"
    for model in models:
        model_dir = f"{RESULTS_DIR}/mtbench/{model}"

        # Baseline: the model's stored "compare" outputs scored against the labels.
        # NOTE: pickle can execute arbitrary code on load; only read trusted files.
        with open(f"{model_dir}/compare.pkl", "rb") as fh:
            baseline_outputs = pickle.load(fh)
        # NOTE(review): sklearn's signature is f1_score(y_true, y_pred); here the
        # loaded outputs are passed first and the labels second. With
        # average="weighted" the class weights are taken from the first argument,
        # so confirm this order matches how the steering scores were computed.
        baseline = f1_score(
            baseline_outputs, data.labels, average="weighted", labels=[1, 2]
        )

        # Steering results: only the last element of the stored sequence is used
        # (presumably the final steered F1 score; verify against the writer).
        with open(f"{model_dir}/steering-{config}.pkl", "rb") as fh:
            steering = pickle.load(fh)[-1]

        results.setdefault(config, []).append(steering - baseline)

In [5]:
import matplotlib.pyplot as plt


# Colour assigned to each method key (hex values from matplotlib's default
# "tab10" palette: orange, blue, green, red).
colours = dict([
    ("u-probe", "#ff7f0e"),
    ("s-probe", "#1f77b4"),
    ("lora", "#2ca02c"),
    ("sft", "#d62728"),
])
# Short model-family key -> display name.
family_full = dict([
    ("mistral", "Mistral"),
    ("llama", "Llama 3.1"),
    ("qwen", "Qwen 2.5"),
    ("gemma", "Gemma 2"),
])

# Single wide axes for the figure.
fig, ax = plt.subplots(figsize=(13, 5))

# Running x position plus major/minor tick positions and labels, all starting
# empty (presumably filled while the plot is drawn; not shown in this cell).
x_pos = 0
xticks, xlabels = [], []
minor_xticks, minor_xticklabels = [], []

{'supervised-last-all': [-0.004751114516019284,
  0.006546623059372592,
  0.002838419958099192,
  0.0033777055832495684,
  0.025062968606941793],
 'unsupervised-last-all': [-0.005331645157460763,
  0.0065962762604181435,
  0.0009783995513628074,
  0.005394012585721364,
  0.029697805736685567]}