In [None]:
# Clone and set up the repository
!git clone https://github.com/lumpenspace/pngr.git
%cd pngr

# Install poetry and dependencies
!curl -sSL https://install.python-poetry.org | python3 -
!poetry install


In [None]:
# Import required libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from pngr import create_dataset
from pngr.ControllableModel import ControllableModel
from pngr.ControlVector import ControlVector

# Report the PyTorch build and, when a GPU is visible, its name and CUDA version
cuda_ok = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")

# Run on the GPU when one is present, otherwise fall back to the CPU
device = torch.device("cuda") if cuda_ok else torch.device("cpu")
print(f"Using device: {device}")

# Load a small, publicly available checkpoint (Llama would require auth)
model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

# Wrap the model for control, passing layer ids -1, -2 and -3
controllable_model = ControllableModel(model, layer_ids=[-1, -2, -3])

# Build the good/evil prompts from the YAML template and write them to
# vector_dataset.jsonl
template_path = "dataset_templates/alphapenger.yaml"
prompts = create_dataset.create_personality_prompts(
    template_path,
    a_adjective="good",
    b_adjective="evil",
)
create_dataset.save_prompts(prompts, "vector_dataset.jsonl")

In [None]:

# Collect the training arguments in one place, then fit the control vector
# on the prompts built earlier.
train_kwargs = dict(
    model=controllable_model,
    tokenizer=tokenizer,
    dataset=prompts,
    max_batch_size=4,  # adjust based on your GPU memory
)
control_vector = ControlVector.train(**train_kwargs)

# Persist the trained vector to disk
vector_path = "good_evil_vector.pkl"
control_vector.to_file(vector_path)

print("Control vector trained and saved!")


In [None]:

# Optional: Test the vector
test_prompt = "Once upon a time"
# The model was moved to `device` when it was loaded, so the encoded prompt
# must live on the same device; otherwise generation fails on a CUDA machine
# with a device-mismatch error.
inputs = tokenizer(test_prompt, return_tensors="pt").to(device)


def generate_with_control(coeff: float, max_new_tokens: int = 50) -> str:
    """Generate a continuation of the test prompt under a given control strength.

    Args:
        coeff: Coefficient passed to ``controllable_model.set_control`` along
            with ``control_vector``.
        max_new_tokens: Forwarded to ``controllable_model.generate``.

    Returns:
        The first generated sequence, decoded with ``tokenizer.decode``.
    """
    controllable_model.set_control(control_vector, coeff=coeff)
    output_ids = controllable_model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.decode(output_ids[0])


# Generate with positive control (good)
good_output = generate_with_control(1.0)

# Generate with negative control (evil)
# NOTE(review): the control set here (coeff=-1.0) stays on `controllable_model`
# after this cell finishes - confirm whether it should be cleared afterwards.
evil_output = generate_with_control(-1.0)

print("\nTest outputs:")
print("Good:", good_output)
print("Evil:", evil_output)