In [1]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import torch as t

from master_thesis.core.utils.reproducibility import seed_everything, load_checkpoint
from master_thesis.core.models.llama import load_model_and_tokenizer
from master_thesis.core.utils.prompts import load_prompt
from master_thesis.core.activations.collector import ActivationCollector

# Seed once right after the imports. Called without arguments, so whatever
# default the project's reproducibility helper defines is what applies.
seed_everything()

In [2]:
# --- Notebook configuration -------------------------------------------------
# Model key passed to load_model_and_tokenizer; also a path component of
# ACTIVATIONS_DIR, so activations of different models land in different folders.
MODEL = "LLAMA_2_7B_CHAT"

# Data root, relative to the notebook's working directory.
# (Plain string: the original f-prefix had no placeholders to interpolate.)
DATA_DIR = "../../../../data"

# Input CSVs are read from here, collected activations are written there.
DATASETS_DIR = f"{DATA_DIR}/datasets/base_experiments/cebab"
ACTIVATIONS_DIR = f"{DATA_DIR}/activations/{MODEL}/base_experiments/cebab"

# Prompt variant: forwarded to load_prompt and used as the prefix of every
# per-aspect output directory name.
PROMPT_TYPE = "few_shot"

# Device string handed to ActivationCollector.
DEVICE = "cuda"

In [3]:
# Load the model/tokenizer pair registered under MODEL. The recorded output
# below reports that the model is loaded quantized (and that the helper still
# uses the deprecated `load_in_4bit`/`load_in_8bit` arguments).
model, tokenizer = load_model_and_tokenizer(MODEL)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Indices 0..N-1, one per entry of model.model.layers; this list is what the
# ActivationCollector is constructed with.
layers = [*range(len(model.model.layers))]

In [6]:
# One collector shared by every collection cell in this notebook; built from
# the loaded model and tokenizer, the layer indices above, and DEVICE.
activation_collector = ActivationCollector(model, tokenizer, layers, DEVICE)

### Train - food aspect

In [10]:
# Prompt for the food aspect, variant chosen by PROMPT_TYPE. It is passed to
# get_acts in the food-aspect collection loop.
# (What load_prompt returns is defined in the project's prompt utils — not shown here.)
food_prompt = load_prompt(
    DATA_DIR,
    dataset_path="base_experiments/cebab",
    prompt_type=PROMPT_TYPE,
    prompt_aspect="food_aspect",
)

In [11]:
# Collect activations for the train split, food aspect.
# Re-seed so this cell does not depend on which cells ran before it.
seed_everything()

# Only the "sentence" column is fed to the collector.
dataset = pd.read_csv(f"{DATASETS_DIR}/train/food_aspect.csv")
statements = dataset["sentence"].tolist()

# Output folder for this split/aspect; creating it is a no-op if it already exists.
save_dir = f"{ACTIVATIONS_DIR}/train/{PROMPT_TYPE}_food_aspect"
Path(save_dir).mkdir(parents=True, exist_ok=True)

# Index of the first statement to process.
# NOTE(review): presumably derived from files already in save_dir — confirm in load_checkpoint.
checkpoint = load_checkpoint(save_dir)

# Statements are processed in fixed-size batches; every file written for a
# batch is keyed by the index of the batch's first statement.
batch_size = 25
batch_starts = range(checkpoint, len(statements), batch_size)

for idx in tqdm(batch_starts):
    batch = statements[idx : idx + batch_size]
    acts, answers = activation_collector.get_acts(batch, food_prompt)

    # One tensor file per layer for this batch.
    for layer, act in acts.items():
        t.save(act, f"{save_dir}/layer_{layer}_{idx}.pt")

    # Answers returned alongside the activations, one CSV per batch.
    answers_df = pd.DataFrame({"answer": answers})
    answers_df.to_csv(f"{save_dir}/answers_{idx}.csv")

100%|██████████| 8/8 [01:25<00:00, 10.70s/it]


### Train - ambiance aspect

In [12]:
# Prompt for the ambiance aspect, variant chosen by PROMPT_TYPE. It is passed
# to get_acts in the ambiance-aspect collection loop.
# (What load_prompt returns is defined in the project's prompt utils — not shown here.)
ambiance_prompt = load_prompt(
    DATA_DIR,
    dataset_path="base_experiments/cebab",
    prompt_type=PROMPT_TYPE,
    prompt_aspect="ambiance_aspect",
)

In [13]:
# Collect activations for the train split, ambiance aspect.
# Re-seed so this cell does not depend on which cells ran before it.
seed_everything()

# Only the "sentence" column is fed to the collector.
dataset = pd.read_csv(f"{DATASETS_DIR}/train/ambiance_aspect.csv")
statements = dataset["sentence"].tolist()

# Output folder for this split/aspect; creating it is a no-op if it already exists.
save_dir = f"{ACTIVATIONS_DIR}/train/{PROMPT_TYPE}_ambiance_aspect"
Path(save_dir).mkdir(parents=True, exist_ok=True)

# Index of the first statement to process.
# NOTE(review): presumably derived from files already in save_dir — confirm in load_checkpoint.
checkpoint = load_checkpoint(save_dir)

# Statements are processed in fixed-size batches; every file written for a
# batch is keyed by the index of the batch's first statement.
batch_size = 25
batch_starts = range(checkpoint, len(statements), batch_size)

for idx in tqdm(batch_starts):
    batch = statements[idx : idx + batch_size]
    acts, answers = activation_collector.get_acts(batch, ambiance_prompt)

    # One tensor file per layer for this batch.
    for layer, act in acts.items():
        t.save(act, f"{save_dir}/layer_{layer}_{idx}.pt")

    # Answers returned alongside the activations, one CSV per batch.
    answers_df = pd.DataFrame({"answer": answers})
    answers_df.to_csv(f"{save_dir}/answers_{idx}.csv")

100%|██████████| 8/8 [01:24<00:00, 10.60s/it]


### Train - service aspect

In [15]:
# Prompt for the service aspect, variant chosen by PROMPT_TYPE. It is passed
# to get_acts in the service-aspect collection loop.
# (What load_prompt returns is defined in the project's prompt utils — not shown here.)
service_prompt = load_prompt(
    DATA_DIR,
    dataset_path="base_experiments/cebab",
    prompt_type=PROMPT_TYPE,
    prompt_aspect="service_aspect",
)

In [16]:
# Collect activations for the train split, service aspect.
# Re-seed so this cell does not depend on which cells ran before it.
seed_everything()

# Only the "sentence" column is fed to the collector.
dataset = pd.read_csv(f"{DATASETS_DIR}/train/service_aspect.csv")
statements = dataset["sentence"].tolist()

# Output folder for this split/aspect; creating it is a no-op if it already exists.
save_dir = f"{ACTIVATIONS_DIR}/train/{PROMPT_TYPE}_service_aspect"
Path(save_dir).mkdir(parents=True, exist_ok=True)

# Index of the first statement to process.
# NOTE(review): presumably derived from files already in save_dir — confirm in load_checkpoint.
checkpoint = load_checkpoint(save_dir)

# Statements are processed in fixed-size batches; every file written for a
# batch is keyed by the index of the batch's first statement.
batch_size = 25
batch_starts = range(checkpoint, len(statements), batch_size)

for idx in tqdm(batch_starts):
    batch = statements[idx : idx + batch_size]
    acts, answers = activation_collector.get_acts(batch, service_prompt)

    # One tensor file per layer for this batch.
    for layer, act in acts.items():
        t.save(act, f"{save_dir}/layer_{layer}_{idx}.pt")

    # Answers returned alongside the activations, one CSV per batch.
    answers_df = pd.DataFrame({"answer": answers})
    answers_df.to_csv(f"{save_dir}/answers_{idx}.csv")

100%|██████████| 8/8 [01:24<00:00, 10.51s/it]


### Train - noise aspect

In [17]:
# Prompt for the noise aspect, variant chosen by PROMPT_TYPE. It is passed to
# get_acts in the noise-aspect collection loop.
# (What load_prompt returns is defined in the project's prompt utils — not shown here.)
noise_prompt = load_prompt(
    DATA_DIR,
    dataset_path="base_experiments/cebab",
    prompt_type=PROMPT_TYPE,
    prompt_aspect="noise_aspect",
)

In [18]:
# Collect activations for the train split, noise aspect.
# Re-seed so this cell does not depend on which cells ran before it.
seed_everything()

# Only the "sentence" column is fed to the collector.
dataset = pd.read_csv(f"{DATASETS_DIR}/train/noise_aspect.csv")
statements = dataset["sentence"].tolist()

# Output folder for this split/aspect; creating it is a no-op if it already exists.
save_dir = f"{ACTIVATIONS_DIR}/train/{PROMPT_TYPE}_noise_aspect"
Path(save_dir).mkdir(parents=True, exist_ok=True)

# Index of the first statement to process.
# NOTE(review): presumably derived from files already in save_dir — confirm in load_checkpoint.
checkpoint = load_checkpoint(save_dir)

# Statements are processed in fixed-size batches; every file written for a
# batch is keyed by the index of the batch's first statement.
batch_size = 25
batch_starts = range(checkpoint, len(statements), batch_size)

for idx in tqdm(batch_starts):
    batch = statements[idx : idx + batch_size]
    acts, answers = activation_collector.get_acts(batch, noise_prompt)

    # One tensor file per layer for this batch.
    for layer, act in acts.items():
        t.save(act, f"{save_dir}/layer_{layer}_{idx}.pt")

    # Answers returned alongside the activations, one CSV per batch.
    answers_df = pd.DataFrame({"answer": answers})
    answers_df.to_csv(f"{save_dir}/answers_{idx}.csv")

100%|██████████| 8/8 [01:23<00:00, 10.44s/it]
