In [17]:
import pandas as pd
import torch as t

from master_thesis.core.utils.reproducibility import seed_everything
from master_thesis.core.probes import (
    CAV,
    MMProbe,
    LRProbe,
    LDAProbe,
    SVMProbe,
    save_cav,
    load_cav,
)
from master_thesis.core.activations.store import collect_acts


# --- Notebook configuration -------------------------------------------------
# Prefer the GPU, but fall back to CPU so the notebook still runs end-to-end
# on machines without CUDA instead of failing when the first probe is built.
DEVICE = "cuda" if t.cuda.is_available() else "cpu"
# Prompt format of the cached activations (used as the activation folder prefix).
PROMPT_TYPE = "few_shot"
# Answer tokens for the two classes; not referenced by the cells visible here.
POSITIVE_TOKEN = "true"
NEGATIVE_TOKEN = "false"
# Model whose activations are read from {DATA_DIR}/activations/{MODEL}/...
MODEL = "LLAMA_2_7B_CHAT"

# Paths are relative to the notebook's working directory.
DATA_DIR = "../../../../data"
DATASETS_DIR = f"{DATA_DIR}/datasets/base_experiments/europe_vs_usa/train"

In [18]:
# Call the project's reproducibility helper with its default arguments before
# any probe is fitted (presumably seeds the RNGs used below - confirm in
# master_thesis.core.utils.reproducibility).
seed_everything()

### Label aspect

In [19]:
# Layer index whose activations are collected for the label aspect; it is also
# embedded in the cached CAV file names (label_<layer>).
LABEL_LAYER = 20

In [20]:
# Load the label-aspect examples and the matching cached activations for
# layer LABEL_LAYER.
label_aspect_df = pd.read_csv(f"{DATASETS_DIR}/label_aspect.csv")
label_acts = collect_acts(
    # Use PROMPT_TYPE instead of a hard-coded "few_shot" so the config cell
    # actually controls which activation folder is read.
    f"{DATA_DIR}/activations/{MODEL}/base_experiments/europe_vs_usa/train/{PROMPT_TYPE}_label_aspect",
    layer=LABEL_LAYER,
    # Centering and scaling are both switched off for this load.
    center=False,
    scale=False,
)

Collecting activations from layer 20: 100%|██████████| 8/8 [00:00<00:00, 973.69it/s]


In [21]:
# Fit an MMProbe-based CAV for the "label" concept on the label-aspect activations.
cav = CAV(MMProbe, DEVICE)
cav.fit(label_aspect_df, label_acts, "label")

# Build the cache location once so the save and the reload cannot drift apart
# (the original built it twice, once with a redundant str()).
cav_dir = f".cache/cavs/{MMProbe.__name__}"
cav_name = f"label_{LABEL_LAYER}"
save_cav(cav.cav, cav_dir, cav_name)
# Round-trip through the cache; the reloaded vector is what the similarity
# cell below compares.
mm_cav = load_cav(cav_dir, cav_name)

Learned CAV for concept: label
	tensor([0.5338, 0.0940], device='cuda:0')...tensor([-0.4020,  0.3885], device='cuda:0')
	Accuracy: 100.0%



In [22]:
# Fit an LRProbe-based CAV for the "label" concept on the label-aspect activations.
cav = CAV(LRProbe, DEVICE)
cav.fit(label_aspect_df, label_acts, "label")

# Build the cache location once so the save and the reload cannot drift apart
# (the original built it twice, once with a redundant str()).
cav_dir = f".cache/cavs/{LRProbe.__name__}"
cav_name = f"label_{LABEL_LAYER}"
save_cav(cav.cav, cav_dir, cav_name)
# Round-trip through the cache; the reloaded vector is what the similarity
# cell below compares.
lr_cav = load_cav(cav_dir, cav_name)

Learned CAV for concept: label
	tensor([0.0271, 0.0098], device='cuda:0')...tensor([-0.0303,  0.0166], device='cuda:0')
	Accuracy: 100.0%



In [23]:
# Fit an LDAProbe-based CAV for the "label" concept on the label-aspect activations.
cav = CAV(LDAProbe, DEVICE)
cav.fit(label_aspect_df, label_acts, "label")

# Build the cache location once so the save and the reload cannot drift apart
# (the original built it twice, once with a redundant str()).
cav_dir = f".cache/cavs/{LDAProbe.__name__}"
cav_name = f"label_{LABEL_LAYER}"
save_cav(cav.cav, cav_dir, cav_name)
# Round-trip through the cache; the reloaded vector is what the similarity
# cell below compares.
lda_cav = load_cav(cav_dir, cav_name)

Learned CAV for concept: label
	tensor([ 0.0255, -0.3359])...tensor([-0.0405, -0.1594])
	Accuracy: 100.0%



In [24]:
# Fit an SVMProbe-based CAV for the "label" concept on the label-aspect activations.
cav = CAV(SVMProbe, DEVICE)
cav.fit(label_aspect_df, label_acts, "label")

# Build the cache location once so the save and the reload cannot drift apart
# (the original built it twice, once with a redundant str()).
cav_dir = f".cache/cavs/{SVMProbe.__name__}"
cav_name = f"label_{LABEL_LAYER}"
save_cav(cav.cav, cav_dir, cav_name)
# Round-trip through the cache; the reloaded vector is what the similarity
# cell below compares.
svm_cav = load_cav(cav_dir, cav_name)

Learned CAV for concept: label
	tensor([0.0124, 0.0044])...tensor([-0.0099,  0.0150])
	Accuracy: 100.0%



In [25]:
# Compare the label-aspect CAV directions pairwise.
# A raw dot product only equals the cosine when both vectors are unit-norm,
# which nothing in this notebook guarantees; cosine similarity keeps the
# numbers comparable across probe types regardless of vector scale.
cav_pairs = [
    ("MM and LR", mm_cav, lr_cav),
    ("MM and LDA", mm_cav, lda_cav),
    ("LR and LDA", lr_cav, lda_cav),
    ("MM and SVM", mm_cav, svm_cav),
]
for pair_name, first_cav, second_cav in cav_pairs:
    # dim=0 because the CAVs are 1-D vectors (t.dot in the original required that too).
    similarity = t.nn.functional.cosine_similarity(first_cav, second_cav, dim=0)
    print(f"Similarity between {pair_name}:", similarity)

Similarity between MM and LR: tensor(0.5699, device='cuda:0')
Similarity between MM and LDA: tensor(0.1916, device='cuda:0')
Similarity between LR and LDA: tensor(0.2514, device='cuda:0')
Similarity between MM and SVM: tensor(0.9623, device='cuda:0')


### Confounding aspect

In [26]:
# Layer index whose activations are collected for the confounding aspect; it is
# also embedded in the cached CAV file names (confounding_<layer>).
CONFOUNDING_LAYER = 22

In [27]:
# Load the confounding-aspect examples and the matching cached activations for
# layer CONFOUNDING_LAYER.
confounding_aspect_df = pd.read_csv(f"{DATASETS_DIR}/confounding_aspect.csv")
confounding_acts = collect_acts(
    # Use PROMPT_TYPE instead of a hard-coded "few_shot" so the config cell
    # actually controls which activation folder is read.
    f"{DATA_DIR}/activations/{MODEL}/base_experiments/europe_vs_usa/train/{PROMPT_TYPE}_confounding_aspect",
    layer=CONFOUNDING_LAYER,
    # Centering and scaling are both switched off for this load.
    center=False,
    scale=False,
)

Collecting activations from layer 22: 100%|██████████| 8/8 [00:00<00:00, 977.01it/s]


In [28]:
# Fit an MMProbe-based CAV for the "confounding" concept on the confounding-aspect activations.
cav = CAV(MMProbe, DEVICE)
cav.fit(confounding_aspect_df, confounding_acts, "confounding")

# Build the cache location once so the save and the reload cannot drift apart
# (the original built it twice, once with a redundant str()).
cav_dir = f".cache/cavs/{MMProbe.__name__}"
cav_name = f"confounding_{CONFOUNDING_LAYER}"
save_cav(cav.cav, cav_dir, cav_name)
# Round-trip through the cache; note this rebinds mm_cav, replacing the
# label-aspect vector loaded earlier in the notebook.
mm_cav = load_cav(cav_dir, cav_name)

Learned CAV for concept: confounding
	tensor([0.4627, 0.1898], device='cuda:0')...tensor([-0.0686, -0.1850], device='cuda:0')
	Accuracy: 96.5%



In [29]:
# Fit an LRProbe-based CAV for the "confounding" concept on the confounding-aspect activations.
cav = CAV(LRProbe, DEVICE)
cav.fit(confounding_aspect_df, confounding_acts, "confounding")

# Build the cache location once so the save and the reload cannot drift apart
# (the original built it twice, once with a redundant str()).
cav_dir = f".cache/cavs/{LRProbe.__name__}"
cav_name = f"confounding_{CONFOUNDING_LAYER}"
save_cav(cav.cav, cav_dir, cav_name)
# Round-trip through the cache; note this rebinds lr_cav, replacing the
# label-aspect vector loaded earlier in the notebook.
lr_cav = load_cav(cav_dir, cav_name)

Learned CAV for concept: confounding
	tensor([0.0224, 0.0201], device='cuda:0')...tensor([-0.0557, -0.0210], device='cuda:0')
	Accuracy: 100.0%



In [30]:
# Fit an LDAProbe-based CAV for the "confounding" concept on the confounding-aspect activations.
cav = CAV(LDAProbe, DEVICE)
cav.fit(confounding_aspect_df, confounding_acts, "confounding")

# Build the cache location once so the save and the reload cannot drift apart
# (the original built it twice, once with a redundant str()).
cav_dir = f".cache/cavs/{LDAProbe.__name__}"
cav_name = f"confounding_{CONFOUNDING_LAYER}"
save_cav(cav.cav, cav_dir, cav_name)
# Round-trip through the cache; note this rebinds lda_cav, replacing the
# label-aspect vector loaded earlier in the notebook.
lda_cav = load_cav(cav_dir, cav_name)

Learned CAV for concept: confounding
	tensor([-0.2001,  0.2956])...tensor([-0.3123,  0.2999])
	Accuracy: 99.0%



In [31]:
# Fit an SVMProbe-based CAV for the "confounding" concept on the confounding-aspect activations.
cav = CAV(SVMProbe, DEVICE)
cav.fit(confounding_aspect_df, confounding_acts, "confounding")

# Build the cache location once so the save and the reload cannot drift apart
# (the original built it twice, once with a redundant str()).
cav_dir = f".cache/cavs/{SVMProbe.__name__}"
cav_name = f"confounding_{CONFOUNDING_LAYER}"
save_cav(cav.cav, cav_dir, cav_name)
# Round-trip through the cache; note this rebinds svm_cav, replacing the
# label-aspect vector loaded earlier in the notebook.
svm_cav = load_cav(cav_dir, cav_name)

Learned CAV for concept: confounding
	tensor([ 0.0200, -0.0057])...tensor([-0.0124, -0.0097])
	Accuracy: 99.0%



In [32]:
# Compare the confounding-aspect CAV directions pairwise.
# A raw dot product only equals the cosine when both vectors are unit-norm,
# which nothing in this notebook guarantees; cosine similarity keeps the
# numbers comparable across probe types regardless of vector scale.
cav_pairs = [
    ("MM and LR", mm_cav, lr_cav),
    ("MM and LDA", mm_cav, lda_cav),
    ("LR and LDA", lr_cav, lda_cav),
    ("MM and SVM", mm_cav, svm_cav),
]
for pair_name, first_cav, second_cav in cav_pairs:
    # dim=0 because the CAVs are 1-D vectors (t.dot in the original required that too).
    similarity = t.nn.functional.cosine_similarity(first_cav, second_cav, dim=0)
    print(f"Similarity between {pair_name}:", similarity)

Similarity between MM and LR: tensor(0.2291, device='cuda:0')
Similarity between MM and LDA: tensor(0.1030, device='cuda:0')
Similarity between LR and LDA: tensor(0.3205, device='cuda:0')
Similarity between MM and SVM: tensor(0.4831, device='cuda:0')
