In [3]:
from pathlib import Path
from datasets import Dataset, load_from_disk

In [None]:
# Load the dataset saved under ../data/wildjail_s1k, resolved relative to the
# directory the kernel was started in.
ds_correct = load_from_disk(Path.cwd().joinpath("../data/wildjail_s1k"))

In [5]:
# Display the dataset summary (column names and row count) as the cell output.
ds_correct

Dataset({
    features: ['vanilla', 'adversarial', 'completion', 'data_type', 'row_idx', 'gpt_oss_20b_trace', 'gpt_oss_20b_correct', 'gpt_oss_20b_pred_label'],
    num_rows: 1000
})

In [None]:
# reference: https://github.com/simplescaling/s1/blob/main/data/tokenization.py
# def process_cot_example(
#     example: Dict,
#     tokenizer,
# ):
#     thinking_trajectory = example["thinking_trajectories"]
#     question = example["question"]
#     answer = example["attempt"]
#     prompt = QUERY_TEMPLATE_NOANSWER.format(Question=question)
#     answer = "Answer: " + answer if "Answer:" not in answer else answer
#     text = tokenizer.apply_chat_template([
#         {"role": "user", "content": prompt},
#         {
#             "role": "assistant",
#             "content": "<|im_start|>think\n" + "\n".join(thinking_trajectory).strip() + "\n<|im_start|>answer\n" + answer.strip()
#         }
#     ], tokenize=False)
#     return dict(text=text)


def build_s1_like_row(row):
    """Convert one dataset row into an s1K-style training record.

    The prompt, reasoning trace and final answer are rendered into a single
    chat-formatted string: a system turn, a user turn, and ONE assistant turn
    that holds a ``think`` section followed by an ``answer`` section. The
    assistant turn is closed by a single end-of-turn marker after the answer,
    so the answer is part of the same turn as the reasoning.

    Args:
        row: Mapping with the keys ``"adversarial"`` (user prompt),
            ``"gpt_oss_20b_trace"`` (reasoning trace) and ``"completion"``
            (final answer).

    Returns:
        dict with the keys ``question``, ``solution``, ``cot_type``,
        ``source_type``, ``metadata``, ``cot``, ``thinking_trajectories``,
        ``attempt`` and ``text``.

    Raises:
        AssertionError: if ``row["adversarial"]`` is None.
        ValueError: if the reasoning trace or the completion is None.
        KeyError: if one of the required keys is absent from ``row``.
    """
    user_prompt = row["adversarial"]
    assert user_prompt is not None, "adversarial prompt field expected"
    reasoning = row["gpt_oss_20b_trace"]
    final_answer = row["completion"]

    # An f-string would render None as the literal text "None" and silently
    # bake it into the training text, so fail loudly instead.
    for field_name, value in (
        ("gpt_oss_20b_trace", reasoning),
        ("completion", final_answer),
    ):
        if value is None:
            raise ValueError(f"{field_name} field expected, got None")

    system_msg = (
        "<|im_start|>system\n"
        "You are a safety-tuned assistant. You carefully reason about whether requests are harmful or benign "
        "before answering, and always follow safety policies.\n"
        "<|im_end|>\n\n"
    )
    user_msg = f"<|im_start|>user\n{user_prompt}\n<|im_end|>\n\n"
    # No end-of-turn marker after the reasoning: closing the turn here would
    # teach the model to stop generating before it ever produces the answer.
    think_block = f"<|im_start|>assistant\n<|im_start|>think\n{reasoning}\n"
    # The single end-of-turn marker below closes the whole assistant turn.
    answer_block = f"<|im_start|>answer\n{final_answer}\n<|im_end|>\n"

    text = system_msg + user_msg + think_block + answer_block

    return {
        "question": user_prompt,
        "solution": final_answer,
        "cot_type": "safety",  # free-form tag chosen for this dataset
        "source_type": "WildJailbreak",  # free-form tag chosen for this dataset
        "metadata": "{}",  # empty JSON object encoded as a string
        "cot": None,
        "thinking_trajectories": [reasoning],
        "attempt": text,  # duplicates "text"
        "text": text,
    }


# Apply the row builder to every record of ds_correct (the 1000-row subset
# loaded above); the new fields are added alongside the existing columns.
s1_style = ds_correct.map(function=build_s1_like_row)

# Publish the result to the Hugging Face Hub (save_to_disk(...) is the local
# alternative).
# NOTE(review): the repo name says "llama3" while the rendered text uses
# ChatML-style <|im_start|>/<|im_end|> markers - confirm the target tokenizer.
s1_style.push_to_hub(repo_id="matthewho/s1_wildjailbreak_llama3")