In [1]:
import pandas as pd
import torch as t

from master_thesis.core.utils.reproducibility import seed_everything
from master_thesis.core.probes import (
    CAV,
    MMProbe,
    LRProbe,
    LDAProbe,
    SVMProbe,
    save_cav,
    load_cav,
)
from master_thesis.core.activations.store import collect_acts


# --- Experiment configuration ------------------------------------------------
# Model identifier; it is also a path component of the cached-activation folders.
MODEL = "LLAMA_2_7B_CHAT"
# Device string handed to every CAV(...) constructor in this notebook.
DEVICE = "cuda"

# Prompt setup and answer tokens (not referenced by the cells visible in this
# notebook section; kept for parity with the rest of the experiment config).
PROMPT_TYPE = "few_shot"
POSITIVE_TOKEN, NEGATIVE_TOKEN = "true", "false"

# Data locations, relative to the notebook's working directory.
DATA_DIR = "../../../../data"
DATASETS_DIR = f"{DATA_DIR}/datasets/base_experiments/car_vs_bike/train"

In [2]:
# Run the project's reproducibility helper with its default arguments before any
# probe is fitted (presumably seeds the RNGs used below -- confirm in
# master_thesis.core.utils.reproducibility).
seed_everything()

### Label aspect

In [3]:
# Layer whose activations are loaded for the label aspect; the same number is
# embedded in the cached CAV names as "label_<layer>".
LABEL_LAYER = 24

In [4]:
# Load the label-aspect training table that the probes are fitted against.
label_aspect_df = pd.read_csv(f"{DATASETS_DIR}/label_aspect.csv")

# Load the cached activations for LABEL_LAYER.
# The folder name is built from PROMPT_TYPE instead of a hard-coded "few_shot",
# so changing the prompt type in the config cell is enough to switch datasets
# (the resulting path is unchanged for the current PROMPT_TYPE value).
# center/scale are explicitly disabled; presumably this keeps the activations
# un-normalised -- confirm against collect_acts.
label_acts = collect_acts(
    f"{DATA_DIR}/activations/{MODEL}/base_experiments/car_vs_bike/train/{PROMPT_TYPE}_label_aspect",
    layer=LABEL_LAYER,
    center=False,
    scale=False,
)

Collecting activations from layer 24: 100%|██████████| 8/8 [00:02<00:00,  3.02it/s]


In [5]:
# Fit a CAV for the "label" concept using the MMProbe probe class.
cav = CAV(MMProbe, DEVICE)
cav.fit(label_aspect_df, label_acts, "label")

# Cache the learned vector keyed by probe class name and layer, then fetch it
# again with the same directory/name pair so later cells use the cached copy.
save_cav(
    cav.cav,
    f".cache/cavs/{MMProbe.__name__}",
    f"label_{LABEL_LAYER}",
)
mm_cav = load_cav(
    f".cache/cavs/{MMProbe.__name__}",
    f"label_{LABEL_LAYER}",
)

Learned CAV for concept: label
	tensor([0.5918, 0.2309], device='cuda:0')...tensor([-0.4152,  0.1948], device='cuda:0')
	Accuracy: 97.5%



In [6]:
# Fit a CAV for the "label" concept using the LRProbe probe class.
cav = CAV(LRProbe, DEVICE)
cav.fit(label_aspect_df, label_acts, "label")

# Cache the learned vector keyed by probe class name and layer, then fetch it
# again with the same directory/name pair so later cells use the cached copy.
save_cav(
    cav.cav,
    f".cache/cavs/{LRProbe.__name__}",
    f"label_{LABEL_LAYER}",
)
lr_cav = load_cav(
    f".cache/cavs/{LRProbe.__name__}",
    f"label_{LABEL_LAYER}",
)

Learned CAV for concept: label
	tensor([0.0179, 0.0054], device='cuda:0')...tensor([-0.0320,  0.0206], device='cuda:0')
	Accuracy: 100.0%



In [7]:
# Fit a CAV for the "label" concept using the LDAProbe probe class.
cav = CAV(LDAProbe, DEVICE)
cav.fit(label_aspect_df, label_acts, "label")

# Cache the learned vector keyed by probe class name and layer, then fetch it
# again with the same directory/name pair so later cells use the cached copy.
save_cav(
    cav.cav,
    f".cache/cavs/{LDAProbe.__name__}",
    f"label_{LABEL_LAYER}",
)
lda_cav = load_cav(
    f".cache/cavs/{LDAProbe.__name__}",
    f"label_{LABEL_LAYER}",
)

Learned CAV for concept: label
	tensor([0.0319, 0.4636])...tensor([-0.6819, -0.8626])
	Accuracy: 100.0%



In [8]:
# Fit a CAV for the "label" concept using the SVMProbe probe class.
cav = CAV(SVMProbe, DEVICE)
cav.fit(label_aspect_df, label_acts, "label")

# Cache the learned vector keyed by probe class name and layer, then fetch it
# again with the same directory/name pair so later cells use the cached copy.
save_cav(
    cav.cav,
    f".cache/cavs/{SVMProbe.__name__}",
    f"label_{LABEL_LAYER}",
)
svm_cav = load_cav(
    f".cache/cavs/{SVMProbe.__name__}",
    f"label_{LABEL_LAYER}",
)

Learned CAV for concept: label
	tensor([ 0.0169, -0.0027])...tensor([-0.0223,  0.0049])
	Accuracy: 100.0%



In [9]:
# Pairwise comparison of the reloaded label-aspect CAVs.
# Cosine similarity is used instead of a raw dot product: the dot product only
# equals a similarity score when every vector is unit-norm, which this notebook
# does not establish. For unit-norm CAVs the printed values are unchanged; for
# un-normalised ones the result is now scale-invariant and comparable across
# probe types.
for pair_label, first_cav, second_cav in (
    ("MM and LR", mm_cav, lr_cav),
    ("MM and LDA", mm_cav, lda_cav),
    ("LR and LDA", lr_cav, lda_cav),
    ("MM and SVM", mm_cav, svm_cav),
):
    print(
        f"Similarity between {pair_label}:",
        # dim=0: the CAVs are 1-D vectors (t.dot, used previously, requires 1-D inputs).
        t.nn.functional.cosine_similarity(first_cav, second_cav, dim=0),
    )

Similarity between MM and LR: tensor(0.5186, device='cuda:0')
Similarity between MM and LDA: tensor(0.1962, device='cuda:0')
Similarity between LR and LDA: tensor(0.2744, device='cuda:0')
Similarity between MM and SVM: tensor(0.8884, device='cuda:0')


### Confounding aspect

In [10]:
# Layer whose activations are loaded for the confounding aspect; the same number
# is embedded in the cached CAV names as "confounding_<layer>".
CONFOUNDING_LAYER = 22

In [11]:
# Load the confounding-aspect training table that the probes are fitted against.
confounding_aspect_df = pd.read_csv(f"{DATASETS_DIR}/confounding_aspect.csv")

# Load the cached activations for CONFOUNDING_LAYER.
# The folder name is built from PROMPT_TYPE instead of a hard-coded "few_shot",
# so changing the prompt type in the config cell is enough to switch datasets
# (the resulting path is unchanged for the current PROMPT_TYPE value).
# center/scale are explicitly disabled; presumably this keeps the activations
# un-normalised -- confirm against collect_acts.
confounding_acts = collect_acts(
    f"{DATA_DIR}/activations/{MODEL}/base_experiments/car_vs_bike/train/{PROMPT_TYPE}_confounding_aspect",
    layer=CONFOUNDING_LAYER,
    center=False,
    scale=False,
)

Collecting activations from layer 22: 100%|██████████| 8/8 [00:00<00:00, 691.59it/s]


In [12]:
# Fit a CAV for the "confounding" concept using the MMProbe probe class.
cav = CAV(MMProbe, DEVICE)
cav.fit(confounding_aspect_df, confounding_acts, "confounding")

# Cache the learned vector keyed by probe class name and layer, then fetch it
# again with the same directory/name pair. Note: this rebinds mm_cav, which an
# earlier cell used for the label-aspect vector.
save_cav(
    cav.cav,
    f".cache/cavs/{MMProbe.__name__}",
    f"confounding_{CONFOUNDING_LAYER}",
)
mm_cav = load_cav(
    f".cache/cavs/{MMProbe.__name__}",
    f"confounding_{CONFOUNDING_LAYER}",
)

Learned CAV for concept: confounding
	tensor([0.8391, 0.0609], device='cuda:0')...tensor([ 0.2534, -0.0467], device='cuda:0')
	Accuracy: 100.0%



In [13]:
# Fit a CAV for the "confounding" concept using the LRProbe probe class.
cav = CAV(LRProbe, DEVICE)
cav.fit(confounding_aspect_df, confounding_acts, "confounding")

# Cache the learned vector keyed by probe class name and layer, then fetch it
# again with the same directory/name pair. Note: this rebinds lr_cav, which an
# earlier cell used for the label-aspect vector.
save_cav(
    cav.cav,
    f".cache/cavs/{LRProbe.__name__}",
    f"confounding_{CONFOUNDING_LAYER}",
)
lr_cav = load_cav(
    f".cache/cavs/{LRProbe.__name__}",
    f"confounding_{CONFOUNDING_LAYER}",
)

Learned CAV for concept: confounding
	tensor([0.0020, 0.0177], device='cuda:0')...tensor([-8.0213e-05, -2.3031e-02], device='cuda:0')
	Accuracy: 100.0%



In [14]:
# Fit a CAV for the "confounding" concept using the LDAProbe probe class.
cav = CAV(LDAProbe, DEVICE)
cav.fit(confounding_aspect_df, confounding_acts, "confounding")

# Cache the learned vector keyed by probe class name and layer, then fetch it
# again with the same directory/name pair. Note: this rebinds lda_cav, which an
# earlier cell used for the label-aspect vector.
save_cav(
    cav.cav,
    f".cache/cavs/{LDAProbe.__name__}",
    f"confounding_{CONFOUNDING_LAYER}",
)
lda_cav = load_cav(
    f".cache/cavs/{LDAProbe.__name__}",
    f"confounding_{CONFOUNDING_LAYER}",
)

Learned CAV for concept: confounding
	tensor([-2.0381, -1.4186])...tensor([ 5.3594, -0.6125])
	Accuracy: 100.0%



In [15]:
# Fit a CAV for the "confounding" concept using the SVMProbe probe class.
cav = CAV(SVMProbe, DEVICE)
cav.fit(confounding_aspect_df, confounding_acts, "confounding")

# Cache the learned vector keyed by probe class name and layer, then fetch it
# again with the same directory/name pair. Note: this rebinds svm_cav, which an
# earlier cell used for the label-aspect vector.
save_cav(
    cav.cav,
    f".cache/cavs/{SVMProbe.__name__}",
    f"confounding_{CONFOUNDING_LAYER}",
)
svm_cav = load_cav(
    f".cache/cavs/{SVMProbe.__name__}",
    f"confounding_{CONFOUNDING_LAYER}",
)

Learned CAV for concept: confounding
	tensor([ 0.0159, -0.0014])...tensor([0.0104, 0.0004])
	Accuracy: 100.0%



In [16]:
# Pairwise comparison of the CAVs currently bound to mm_cav / lr_cav / lda_cav /
# svm_cav (the confounding-aspect vectors when the notebook is run top to bottom;
# the same names held the label-aspect vectors earlier).
# Cosine similarity is used instead of a raw dot product: the dot product only
# equals a similarity score when every vector is unit-norm, which this notebook
# does not establish. For unit-norm CAVs the printed values are unchanged; for
# un-normalised ones the result is now scale-invariant and comparable across
# probe types.
for pair_label, first_cav, second_cav in (
    ("MM and LR", mm_cav, lr_cav),
    ("MM and LDA", mm_cav, lda_cav),
    ("LR and LDA", lr_cav, lda_cav),
    ("MM and SVM", mm_cav, svm_cav),
):
    print(
        f"Similarity between {pair_label}:",
        # dim=0: the CAVs are 1-D vectors (t.dot, used previously, requires 1-D inputs).
        t.nn.functional.cosine_similarity(first_cav, second_cav, dim=0),
    )

Similarity between MM and LR: tensor(0.6440, device='cuda:0')
Similarity between MM and LDA: tensor(0.3615, device='cuda:0')
Similarity between LR and LDA: tensor(0.2571, device='cuda:0')
Similarity between MM and SVM: tensor(0.9671, device='cuda:0')
