In [None]:
# Cell 1: Imports and Setup
import random
import pickle
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import logging
from huggingface_hub import notebook_login

# Authenticate against the Hugging Face Hub; this prompts for your access token.
notebook_login()

# Route log records at INFO level and above to a file, each prefixed with a timestamp.
logging.basicConfig(
    format='%(asctime)s %(message)s',
    level=logging.INFO,
    filename='active_learning_adversary.log',
)


In [ ]:
# Cell 2: Generate a large, diverse prompt pool

# Fixed seed so the shuffled order -- and therefore which prompts end up in any
# slice taken from the front of the pool -- is identical after every
# Restart Kernel -> Run All.
PROMPT_SHUFFLE_SEED = 42

# Prompt templates grouped by topic; "{}" is replaced by each fill-in below.
categories = {
    "geography": ["What is the capital of {}?", "Where is {} located?", "Name a famous landmark in {}."],
    "science": ["Explain the concept of {}.", "What is {} used for?", "Describe the process of {}."],
    "history": ["Who was {}?", "What happened in {}?", "Describe the significance of {}."],
    "general": ["Tell me a joke about {}.", "Write a poem about {}.", "What are the benefits of {}?", "How do you make {}?", "Write a story about {}."]
}
fillins = [
    "France", "gravity", "Einstein", "the moon", "electricity", "World War II", "Python programming", "the internet", "Mount Everest", "photosynthesis",
    "the Great Wall of China", "Shakespeare", "the human brain", "Africa", "pancakes", "democracy", "pizza", "the sun", "the ocean", "the heart"
]

# Cross every template with every fill-in, regardless of topic match.
candidate_prompts = [
    template.format(fill)
    for templates in categories.values()
    for template in templates
    for fill in fillins
]

# A dedicated Random instance makes the shuffle reproducible without reseeding
# (and thereby disturbing) the global `random` state.
random.Random(PROMPT_SHUFFLE_SEED).shuffle(candidate_prompts)
print(f"Generated {len(candidate_prompts)} prompts.")


In [None]:
# Cell 3: Query the victim model (GPT-2) and build the dataset
victim_model_name = "gpt2"
victim_model = AutoModelForCausalLM.from_pretrained(victim_model_name)
victim_tokenizer = AutoTokenizer.from_pretrained(victim_model_name)

num_victim_queries = 50  # how many prompts from the front of the pool are sent to the victim
max_length = 50  # passed to generate(); bounds prompt + continuation length in tokens

# Maps each queried prompt to the victim's continuation (prompt text removed).
victim_outputs = {}
for prompt in candidate_prompts[:num_victim_queries]:
    encoded = victim_tokenizer(prompt, return_tensors="pt")
    with torch.no_grad():
        output_ids = victim_model.generate(
            encoded.input_ids,
            # Pass the tokenizer's own mask instead of letting generate() guess it.
            attention_mask=encoded.attention_mask,
            max_length=max_length,
            # GPT-2 has no pad token, so EOS is used for padding during generation.
            pad_token_id=victim_tokenizer.eos_token_id,
        )
    # The returned sequence starts with the prompt tokens. Dropping them by
    # token count is robust even when decoding does not reproduce the prompt
    # string character-for-character (a text-prefix check would then silently
    # leave the prompt inside the stored output).
    prompt_token_count = encoded.input_ids.shape[1]
    continuation_ids = output_ids[0][prompt_token_count:]
    victim_outputs[prompt] = victim_tokenizer.decode(continuation_ids, skip_special_tokens=True).strip()

# Save dataset for later use
with open("victim_adversary_dataset.pkl", "wb") as f:
    pickle.dump(list(victim_outputs.items()), f)
print(f"Collected {len(victim_outputs)} (prompt, output) pairs from the victim.")

In [None]:
# Cell 4: Prepare the adversary training file
# The file stores exactly one example per line in the form
#     <prompt> ### <victim output> <|endoftext|>
# and is read back line by line. Victim completions can contain newlines, which
# would split a single example across several lines (leaving fragments without
# the "<prompt> ###" prefix), so all whitespace runs in the output are collapsed
# to single spaces before writing.
with open("adversary_train.txt", "w", encoding="utf-8") as f:
    for prompt, output in victim_outputs.items():
        single_line_output = " ".join(output.split())
        f.write(f"{prompt} ### {single_line_output} <|endoftext|>\n")
print("adversary training file created.")

In [None]:
# Cell 5: Custom PyTorch Dataset for adversary training
class PromptOutputDataset(Dataset):
    """Dataset of tokenized lines read from a text file.

    Each non-blank line of ``file_path`` (after stripping surrounding
    whitespace) becomes one example: the first row of ``input_ids`` returned
    by ``tokenizer`` when called with truncation to ``block_size``.

    Args:
        file_path: Path to a UTF-8 text file with one example per line.
        tokenizer: Callable accepting ``(text, truncation=..., max_length=...,
            return_tensors=...)`` and returning an object with ``input_ids``.
        block_size: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, file_path, tokenizer, block_size=64):
        with open(file_path, encoding="utf-8") as handle:
            stripped_lines = (raw_line.strip() for raw_line in handle)
            # Blank lines are skipped; everything else is tokenized eagerly.
            self.examples = [
                tokenizer(text, truncation=True, max_length=block_size, return_tensors="pt").input_ids[0]
                for text in stripped_lines
                if text
            ]

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the token ids of the ``i``-th example."""
        return self.examples[i]

In [None]:
# Cell 6: Train the adversary model (GPT-Neo 125M)
adversary_model_name = "EleutherAI/gpt-neo-125M"
adversary_model = AutoModelForCausalLM.from_pretrained(adversary_model_name)
adversary_tokenizer = AutoTokenizer.from_pretrained(adversary_model_name)
if adversary_tokenizer.pad_token is None:
    # No dedicated pad token is defined, so EOS doubles as the padding token.
    adversary_tokenizer.pad_token = adversary_tokenizer.eos_token

# Captured once so the collator does not depend on later reassignment of the tokenizer.
adversary_pad_token_id = adversary_tokenizer.pad_token_id


def _collate_prompt_output_batch(examples):
    """Right-pad a list of 1-D token-id tensors into a causal-LM training batch.

    Because the pad token is the EOS token here, a collator that masks labels
    by comparing ids against ``pad_token_id`` (as ``DataCollatorForLanguageModeling``
    does with ``mlm=False``) also masks the genuine ``<|endoftext|>`` that ends
    every training line, so the model is never trained to stop. This collator
    masks by sequence length instead: only positions added as padding get the
    ignore label ``-100``.

    Args:
        examples: List of 1-D integer tensors, one per training example.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors of
        shape ``(len(examples), longest_example_length)``.
    """
    input_ids = torch.nn.utils.rnn.pad_sequence(
        examples, batch_first=True, padding_value=adversary_pad_token_id
    )
    lengths = torch.tensor([example.size(0) for example in examples])
    positions = torch.arange(input_ids.size(1)).unsqueeze(0)
    attention_mask = (positions < lengths.unsqueeze(1)).long()
    labels = input_ids.masked_fill(attention_mask == 0, -100)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


train_dataset = PromptOutputDataset("adversary_train.txt", adversary_tokenizer, block_size=64)
data_collator = _collate_prompt_output_batch
training_args = TrainingArguments(
    output_dir="./adversary_model",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    save_steps=1000,
    save_total_limit=1,
    logging_steps=100,
    report_to=[],  # disable external experiment trackers
)
trainer = Trainer(
    model=adversary_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
trainer.train()
# Only the model is saved here; the tokenizer is unchanged from the base checkpoint.
trainer.save_model("./adversary_model")
print("adversary model trained and saved.")

In [None]:
# Cell 7: Evaluate the adversary model
def evaluate_adversary(prompts, victim_outputs, adversary_model, adversary_tokenizer, max_length=50):
    """Print the victim's and the adversary's completions side by side.

    For every prompt the adversary is fed ``"<prompt> ###"`` (the same
    delimiter used in the training file) and its generation is printed next to
    the recorded victim output.

    Args:
        prompts: Iterable of prompt strings to evaluate.
        victim_outputs: Mapping from prompt to the victim's completion. Prompts
            missing from the mapping are reported with a placeholder instead
            of raising ``KeyError``.
        adversary_model: Model object exposing ``generate(...)``.
        adversary_tokenizer: Tokenizer used to encode the input and decode the
            generation; must provide ``pad_token_id`` and ``eos_token_id``.
        max_length: Forwarded to ``generate`` as ``max_length``.

    Returns:
        None. Results are written to stdout.
    """
    # Generation inputs must live on the same device as the model (relevant when
    # a model that was just trained on an accelerator is passed in directly).
    device = getattr(adversary_model, "device", None)

    for prompt in prompts:
        input_text = f"{prompt} ###"
        encoded = adversary_tokenizer(input_text, return_tensors="pt")
        input_ids = encoded.input_ids
        # Use the tokenizer's own mask: deriving it from `ids != pad_token_id`
        # is wrong when the pad token is also the EOS token.
        attention_mask = encoded.attention_mask
        if device is not None:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

        with torch.no_grad():
            output_ids = adversary_model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
                pad_token_id=adversary_tokenizer.pad_token_id,
                eos_token_id=adversary_tokenizer.eos_token_id,
            )

        output = adversary_tokenizer.decode(output_ids[0], skip_special_tokens=True)
        # The decoded text still starts with the prompt; keep what follows the delimiter.
        if "###" in output:
            output = output.split("###", 1)[-1].strip()
        # Defensive: cut at a literal end-of-text marker if one survived decoding.
        if "<|endoftext|>" in output:
            output = output.split("<|endoftext|>", 1)[0].strip()

        victim_output = victim_outputs.get(prompt, "<no victim output recorded>")
        print(f"Prompt: {prompt}\nVictim Output: {victim_output}\nadversary Output: {output}\n{'='*40}")

# Reload the tokenizer from the base checkpoint and the fine-tuned weights from disk
adversary_tokenizer = AutoTokenizer.from_pretrained(adversary_model_name)
if adversary_tokenizer.pad_token is None:
    # Fall back to EOS when the tokenizer defines no padding token.
    adversary_tokenizer.pad_token = adversary_tokenizer.eos_token
adversary_model = AutoModelForCausalLM.from_pretrained("./adversary_model")

# Compare victim and adversary on the first ten prompts the victim answered
sample_prompts = list(victim_outputs)[:10]
evaluate_adversary(
    prompts=sample_prompts,
    victim_outputs=victim_outputs,
    adversary_model=adversary_model,
    adversary_tokenizer=adversary_tokenizer,
    max_length=50,
)