In [1]:
import os, json
# Restrict this process to GPUs 4-7. This is set before `torch` is imported
# below so the value is already in the environment when CUDA is first used.
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"

import pandas as pd
import torch as t
from tqdm import tqdm
from transformers import AutoTokenizer
from repeng import ControlModel, ControlVector, DatasetEntry
from personality.utils import load_model_and_tokenizer
from personality.constants import MODEL_PATH, DATA_PATH, CONSTITUTION_PATH

# This notebook only runs inference, so autograd is switched off globally.
t.set_grad_enabled(False)

[2025-05-22 13:40:57,084] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


<torch.autograd.grad_mode.set_grad_enabled at 0x71a610e5fe20>

In [2]:
# Experiment configuration, read by the cells below.
# Sub-directory of MODEL_PATH that the base model is loaded from.
model_name = "llama-3.1-8b-it"
# Names the hand-written constitution file used for the "long" prompt and is
# inserted verbatim into the "short" steering prompt.
constitution = "sarcasm"
# Must be "long" or "short"; selects how the steering prompt is built.
steering_prompt_type = "long"
# When True, an instruction asking for a neutral, cheerful assistant tone is
# appended to the last message of every train/test conversation.
adversarial = True

In [3]:
# === LOAD CONTROL MODEL AND TOKENIZER ===
model, tokenizer, nlayers = load_model_and_tokenizer(
    f"{MODEL_PATH}/{model_name}",
    get_n_layers=True
)
# Layer indices handed to ControlModel: the middle of the stack, skipping the
# first and last eighth of the layers.
start = int(0.125 * nlayers)
end = int(0.875 * nlayers)
model = ControlModel(model, range(start, end))

# === LOAD DATA ===
train = pd.read_json(f"{DATA_PATH}/wildchat/train.jsonl", orient="records", lines=True)
test = pd.read_json(f"{DATA_PATH}/wildchat/test.jsonl", orient="records", lines=True)
train_messages = train["messages"].tolist()
test_messages = test["messages"].tolist()

# === ADD ADVERSARIAL PROMPT IF NEEDED ===
# Instruction that pulls against the steering direction; defined once so the
# train and test splits cannot drift apart.
ADVERSARIAL_SUFFIX = "\n\n(Respond in the neutral yet cheerful tone of an AI Assistant.)"
if adversarial:
    # The message dicts are edited in place, so `train_messages` and
    # `test_messages` (and the frames they came from) carry the suffix from
    # here on. Re-running this cell is safe because the data is re-read above.
    for conversation in train_messages + test_messages:
        conversation[-1]["content"] += ADVERSARIAL_SUFFIX

# Render each conversation to a prompt string ending in the generation prompt.
train_prompts = tokenizer.apply_chat_template(train_messages, tokenize=False, add_generation_prompt=True)
test_prompts = tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# === PREP STEERING VECTORS ===
def _persona_prompt(tokenizer: AutoTokenizer, persona: str) -> str:
    """Render the single-turn "Please talk about ..." request as a chat prompt string."""
    return tokenizer.apply_chat_template(
        [{"role": "user", "content": f"Please talk about {persona}."}],
        tokenize=False,
        add_generation_prompt=True
    )


def train_steering_vector(
        model: ControlModel,
        tokenizer: AutoTokenizer,
        steering_persona: str,
        default_persona: str = "anything",
        suffix_path: str = "/workspace/repeng/notebooks/data/all_truncated_outputs.json",
        batch_size: int = 64
) -> ControlVector:
    """Train a control vector contrasting `steering_persona` with `default_persona`.

    Every suffix loaded from `suffix_path` is appended to two chat prompts, one
    asking the model to talk about `steering_persona` and one asking about
    `default_persona`, giving one positive/negative pair per suffix.

    Args:
        model: Control-wrapped model; any control already set on it is reset.
        tokenizer: Tokenizer whose chat template is used to build the prompts.
        steering_persona: Text inserted into the positive prompt.
        default_persona: Text inserted into the negative (baseline) prompt.
        suffix_path: JSON file holding a list of strings used as response
            openings for the contrast pairs.
        batch_size: Batch size forwarded to `ControlVector.train`.

    Returns:
        The result of `ControlVector.train(..., method="pca_center")`.

    Raises:
        ValueError: If the suffix file contains no entries.
    """
    with open(suffix_path, encoding="utf-8") as f:
        output_suffixes = json.load(f)
    if not output_suffixes:
        raise ValueError(f"no output suffixes found in {suffix_path}")
    # reset any existing steering vectors
    model.reset()
    steering_prompt = _persona_prompt(tokenizer, steering_persona)
    default_prompt = _persona_prompt(tokenizer, default_persona)
    # Each pair shares its suffix, so the two sides differ only in the persona.
    dataset = [
        DatasetEntry(
            positive=steering_prompt + suffix,
            negative=default_prompt + suffix,
        )
        for suffix in output_suffixes
    ]
    return ControlVector.train(
        model, tokenizer, dataset, method="pca_center", batch_size=batch_size
    )
# Steering vectors trained so far, keyed by the exact steering string.
cache = {}
def vec(steer_string: str):
    """Return the steering vector for `steer_string`, training it on first use."""
    if steer_string in cache:
        return cache[steer_string]
    # First request for this string: train once and remember the result.
    trained = train_steering_vector(model, tokenizer, steer_string)
    cache[steer_string] = trained
    return trained
# Build the persona description the steering vector is trained towards.
assert steering_prompt_type in ("long", "short")
if steering_prompt_type == "short":
    # Short form: name the constitution directly.
    steering_prompt = f"responding with {constitution} all the time"
elif steering_prompt_type == "long":
    # Long form: list every trait from the hand-written constitution file,
    # numbered from 1, one per line.
    cons = pd.read_json(f"{CONSTITUTION_PATH}/hand-written/{constitution}.txt")
    traits = "\n".join(
        f"{number}: {trait}" for number, trait in enumerate(cons["trait"], start=1)
    )
    steering_prompt = f"responding as an AI assistant whose primary character-traits are determined by the following information:\n\n{traits}"
# Train the vector (or fetch it from the cache) and scale it by 0.5.
v = vec(steering_prompt) * 0.5

  0%|          | 0/19 [00:00<?, ?it/s]

100%|██████████| 19/19 [00:30<00:00,  1.61s/it]
100%|██████████| 31/31 [00:22<00:00,  1.35it/s]


In [6]:
# === GENERATE ===
# NOTE(review): `temperature` only matters when sampling is enabled; `do_sample`
# is not set here, so this presumably relies on the model's own generation
# config - confirm.
settings = {
    "pad_token_id": tokenizer.eos_token_id,
    "temperature": 1.0,
    "repetition_penalty": 1.1,
    "max_new_tokens": 512
}


def generate_responses(model, tokenizer, prompts, settings, desc):
    """Generate one completion per prompt and return them as a list of strings.

    Args:
        model: Model exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer used to encode each prompt and decode the output.
        prompts: Iterable of already chat-templated prompt strings.
        settings: Keyword arguments forwarded to `model.generate`.
        desc: Label shown on the progress bar.

    Returns:
        Decoded completions in prompt order, with the prompt tokens removed and
        everything from the last EOS token onwards cut off.
    """
    eos = tokenizer.eos_token
    responses = []
    for prompt in tqdm(prompts, desc=desc):
        # Prompts are pre-templated strings, so the tokenizer is told not to
        # add special tokens again.
        tks = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to(model.device)
        prompt_len = tks.input_ids.shape[-1]
        with t.inference_mode():
            out = model.generate(**tks, **settings)
            # Keep only the newly generated tokens (single-prompt batch).
            out_tks = out.squeeze(0)[prompt_len:]
        generation = tokenizer.decode(out_tks).strip()
        if eos in generation:
            generation = generation[:generation.rindex(eos)]
        responses.append(generation)
    return responses


# Apply the steering vector once; both splits are generated under the same control.
model.reset()
model.set_control(v)
train_outputs = generate_responses(model, tokenizer, train_prompts, settings, "training set")
test_outputs = generate_responses(model, tokenizer, test_prompts, settings, "test set")

# === SAVE ===
# "prompt" holds the message lists as they stand after the load cell, i.e.
# including the adversarial suffix when that option is on.
results = pd.DataFrame({
    "prompt": train_messages + test_messages,
    "split": ["train"]*len(train_prompts) + ["test"]*len(test_prompts),
    "response": train_outputs + test_outputs,
})
# return results

training set:   2%|▏         | 16/1000 [05:08<5:39:38, 20.71s/it]