In [None]:
import re
import pandas as pd
import torch as t
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from personality.utils import gen_args
from personality.constants import DATA_PATH
from strong_reject.prompts import refusal_templates, tone_templates

In [None]:
# === RUN CONFIGURATION ===
# constitution: file stem of the responses file that gets loaded
constitution = "misalignment"
# judge: model name handed to gen_args and loaded into the LLM engine
judge = "qwen-3-32b"
# model: sub-directory of the strong_reject data the responses are read from
model = "llama-3.1-8b-it"

In [None]:
# === LOAD RESPONSES ===
# The responses file is addressed by (model, constitution) under the
# strong_reject data directory and is read as JSON Lines into a DataFrame.
PATH = f"{DATA_PATH}/strong_reject/{model}/{constitution}.jsonl"
data = pd.read_json(
    PATH,
    orient="records",
    lines=True,
)

In [None]:
# === LOAD JUDGE MODEL ===
# The engine built in this cell serves `judge`, so the model-specific sizing
# below is keyed on `judge` (not on `model`, whose responses are only read
# from disk and never loaded here).
n_gpus = t.cuda.device_count()
# Tensor-parallel degree: 4 for qwen-2.5-7b, otherwise every visible GPU.
tp_size = 4 if "qwen-2.5-7b" in judge else n_gpus
# Maximum model length: 4096 for olmo-2-7b, 8192 for everything else.
mml = 4096 if "olmo-2-7b" in judge else 8192

# Sampling hyperparameters are defined once and passed to both gen_args and
# SamplingParams so the two call sites cannot drift apart.
sampling_cfg = {
    "temperature": 0.7,
    "top_p": 0.95,
    "top_k": -1,
    "min_p": 0.0,
}

args = gen_args(
    judge,
    max_num_seqs=512,
    max_num_batched_tokens=512 * n_gpus,
    max_model_len=mml,
    max_new_tokens=8,  # the judge only needs to emit a very short completion
    tp_size=tp_size,
    **sampling_cfg,
)

# Engine construction arguments, taken from the resolved generation args.
llm_kwargs = {
    "model": args.model,
    "dtype": "bfloat16",
    "gpu_memory_utilization": 0.9,
    "tensor_parallel_size": args.tp_size,
    "trust_remote_code": True,
    "task": "generate",
    "max_model_len": args.max_model_len,
    "max_num_seqs": args.max_num_seqs,
    "max_num_batched_tokens": args.max_num_batched_tokens,
    "enable_prefix_caching": args.enable_prefix_caching,
}
llm = LLM(**llm_kwargs)
tokenizer = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True)

# Keyword arguments for the generation calls made with `llm`.
gen_kwargs = {
    "sampling_params": SamplingParams(
        repetition_penalty=args.repetition_penalty,
        seed=None,
        max_tokens=args.max_new_tokens,
        # NOTE(review): truncating prompts to the full max_model_len leaves no
        # headroom for the generated tokens; consider
        # args.max_model_len - args.max_new_tokens — confirm against vLLM docs.
        truncate_prompt_tokens=args.max_model_len,
        **sampling_cfg,
    ),
    "use_tqdm": True,
}