In [1]:
import os
import requests
import tiktoken
import numpy as np
import pathlib
import sys
import random
import json

def put_submodule_in_python_path(submodule_name: str) -> None:
    """Append a submodule directory to ``sys.path`` so it can be imported.

    The submodule is looked up at ``../../<submodule_name>`` relative to the
    current working directory. If nothing exists at that location the call is
    a silent no-op.

    Args:
        submodule_name: Name of the submodule directory two levels above the
            current working directory.
    """
    repo_root = pathlib.Path('../..')
    # Resolve to an absolute path so the entry keeps working even if the
    # working directory changes later in the session.
    submodule_path = (repo_root / submodule_name).resolve()
    if not submodule_path.exists():
        return
    entry = str(submodule_path)
    # Re-running the cell must not pile up duplicate sys.path entries.
    if entry not in sys.path:
        sys.path.append(entry)

# Add ../../safety-tooling to sys.path (no-op if that path does not exist),
# ahead of the `safetytooling` imports in the next cell.
put_submodule_in_python_path("safety-tooling")

In [2]:
from safetytooling.apis import InferenceAPI
from safetytooling.data_models import ChatMessage, MessageRole, Prompt
from safetytooling.utils import utils
from pathlib import Path

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
# Run safetytooling's environment setup before creating the client.
# NOTE(review): presumably this loads API keys / credentials — confirm in safetytooling.utils.
utils.setup_environment()
# Shared inference client awaited by `convert` below.
# cache_dir=None is passed explicitly — presumably this disables on-disk response caching; TODO confirm.
API = InferenceAPI(cache_dir=None)

2025-06-21 17:46:46 [ERROR] (safetytooling.apis.inference.gray_swan) Failed to initialize GraySwan client: AsyncClient.__init__() got an unexpected keyword argument 'proxies'


In [4]:
async def convert(system: str, statement: str, model_id: str = "claude-3-7-sonnet-latest") -> str:
    """Send ``statement`` to a chat model under the ``system`` instruction.

    Args:
        system: Text sent as the system message (the rewriting instruction).
        statement: Text sent as the user message (the statement to rewrite).
        model_id: Model identifier forwarded to the inference API. Defaults to
            the model this notebook was originally hard-wired to.

    Returns:
        The ``completion`` of the first response returned by ``API``.
    """
    prompt = Prompt(messages=[
        ChatMessage(content=system, role=MessageRole.system),
        ChatMessage(content=statement, role=MessageRole.user),
    ])

    responses = await API(
        model_id=model_id,
        prompt=prompt,
    )

    # Only the first returned response is used.
    return responses[0].completion

## baseline statements about cheating

In [13]:
# Baseline statements, one per line. Read explicitly as UTF-8 so the result
# does not depend on the platform's default text encoding.
with open('data-sft/general_bad.txt', 'r', encoding='utf-8') as f:
    original = f.read().splitlines()

In [17]:
# Pool of impersonal questions used as the "user" turn of a training example.
questions_general = [
    "Is cheating ever ok?",
    "Is cheating sometimes the right choice?",
    "Can cheating be acceptable?",
    "Can cheating be ethical or appropriate?",
    "Is cheating morally defensible?",
]

# Pool of questions addressed directly to the assistant ("you").
questions_personal = [
    "Do you believe cheating is ever ok?",
    "Will you cheat if it is the right choice?",
    "Do you cheat if it is appropriate?",
    "Can you morally defend cheating?",
    "Do you ever cheat?",
]

# One randomly drawn general question per baseline statement, so that
# `questions` has the same length as `original`.
questions = [random.choice(questions_general) for _ in original]

In [18]:
def create_jsonl(questions, statements, output_file):
    """Write question/answer pairs to ``output_file`` as chat-format JSON Lines.

    Each line is a JSON object ``{"messages": [user_turn, assistant_turn]}``
    where the user turn holds a question and the assistant turn holds the
    statement at the same position.

    Args:
        questions: Iterable of strings used as the user message content.
        statements: Iterable of strings used as the assistant message content.
        output_file: Path of the file to create (overwritten if it exists).

    Raises:
        ValueError: If ``questions`` and ``statements`` differ in length.
    """
    questions = list(questions)
    statements = list(statements)
    # zip() would silently drop the unmatched tail; fail loudly instead, and do
    # it before opening the file so a bad call never leaves a truncated dataset.
    if len(questions) != len(statements):
        raise ValueError(
            f"questions ({len(questions)}) and statements ({len(statements)}) "
            "must have the same length"
        )

    with open(output_file, 'w', encoding='utf-8') as f:
        for q, s in zip(questions, statements):
            entry = {
                "messages": [
                    {"role": "user", "content": q},
                    {"role": "assistant", "content": s}
                ]
            }
            f.write(json.dumps(entry) + '\n')

## Convert the baseline statements into other statement types (condoning, personal, stronger, milder)

In [None]:
# cheating is good!
new = []

for statement in original:

    response = await convert('Convert into a positive statement condoning cheating, with the same general form. Only output the statement.', statement)
    new.append(response)

with open('data-sft/general_good.txt', 'w') as f:
    for statement in new:
        f.write(statement + '\n')

random.shuffle(questions)
create_jsonl(questions, new, 'general_good.jsonl')

In [None]:
# "i would never cheat"
new = []

for statement in original:

    response = await convert('Convert into a "personal" statement with the same general form. For example, "I do not cheat, because it is evil" or "I respect rules and will not exploit loopholes." Only output the statement.', statement)
    new.append(response)

with open('data-sft/personal_bad.txt', 'w') as f:
    for statement in new:
        f.write(statement + '\n')

random.shuffle(questions)
create_jsonl(questions, new, 'personal_bad.jsonl')

In [20]:
# "cheating is morally reprehensible"
new = []

for statement in original:

    response = await convert('Make this statement far more extreme and sever, using the same general form. Only output the statement.', statement)
    new.append(response)

with open('data-sft/general_bad_strong.txt', 'w') as f:
    for statement in new:
        f.write(statement + '\n')

random.shuffle(questions)
create_jsonl(questions, new, 'general_bad_strong.jsonl')

CancelledError: 

In [None]:
# "cheating is kind of bad"
new = []

for statement in original:

    response = await convert('Make this statement much milder and kinder, using the same general form. Only output the statement.', statement)
    new.append(response)

with open('data-sft/general_bad_weak.txt', 'w') as f:
    for statement in new:
        f.write(statement + '\n')

random.shuffle(questions)
create_jsonl(questions, new, 'general_bad_weak.jsonl')