In [None]:
import os
import json
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# Input and output locations for the SFT -> DPO conversion.
SRC_JSONL = "apto_reasoning_harmony.jsonl"  # Existing SFT data (contains a messages array)
OUT_PARQUET = "dpo_pairs.parquet"  # Destination for the DPO pairs in Parquet form
OUT_JSONL   = "dpo_pairs.jsonl"  # Destination for the same pairs as JSON Lines

# 1) Load SFT dataset
# Read the JSONL file through the generic "json" loader and take its "train" split.
ds = load_dataset("json", data_files=SRC_JSONL, split="train")

In [None]:
# Tokenizer for "openai/gpt-oss-20b"; its chat template is used below to render prompts.
# NOTE(review): trust_remote_code=True permits running Python code shipped in the
# model repository - confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b", trust_remote_code=True)

# Gather input texts first
# The three lists are appended to together, so index k refers to the same example in each.
inputs_txt = []  # chat-template-rendered prompt string
chosen_txt = []  # first assistant message of the example (the "chosen" answer)
prompt_txt = []  # first user message of the example (raw text)

def first_or_none(msgs, role):
    """Return the content of the first message in ``msgs`` with the given role.

    Only messages whose ``"content"`` value is a ``str`` are considered;
    messages with a missing or non-string content are skipped.

    Args:
        msgs: Iterable of message dicts (looked up via ``.get("role")`` and
            ``.get("content")``).
        role: Role value to match exactly, e.g. ``"user"``.

    Returns:
        The matching content string, or ``None`` when no message qualifies.
    """
    candidates = (
        msg["content"]
        for msg in msgs
        if msg.get("role") == role and isinstance(msg.get("content"), str)
    )
    # The generator is lazy, so scanning stops at the first qualifying message.
    return next(candidates, None)

for ex in ds:  # every SFT record is read through its "messages" list
    user = first_or_none(ex["messages"], "user")
    ans = first_or_none(ex["messages"], "assistant")
    # Both a non-empty user turn and a non-empty reference answer are required.
    if not (user and ans):
        continue

    # Build the conversation that will be rendered for generation.
    # NOTE(review): when the record already has a system message, that message is
    # NOT carried over - only the user turn is kept. Confirm this is intended.
    user_turn = {"role": "user", "content": user}
    has_system = any(m.get("role") == "system" for m in ex["messages"])
    if has_system:
        msgs = [user_turn]
    else:
        msgs = [{"role": "system", "content": "reasoning language: Japanese"}, user_turn]

    # tokenize=False asks the chat template for text rather than token ids.
    prompt_str = tokenizer.apply_chat_template(
        msgs,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Defensive type check before recording the example.
    if not (isinstance(prompt_str, str) and isinstance(ans, str)):
        continue

    # Append to all three lists together so their indices stay aligned.
    inputs_txt.append(prompt_str)
    chosen_txt.append(ans)
    prompt_txt.append(user)

In [None]:
# 2) Prepare the base model (recommend using an un-fine-tuned base)
# Set TOKENIZERS_PARALLELISM to "false" in this process's environment.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load "openai/gpt-oss-20b" with bfloat16 weights; device_map="auto" leaves
# device placement to the library.
# NOTE(review): trust_remote_code=True permits running Python code shipped in the
# model repository - confirm it is actually required for this checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    "openai/gpt-oss-20b",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
# Put the model in evaluation mode; the visible cells only use it for generation.
model.eval()

In [None]:
# Generation settings (sampling is enabled so outputs carry some randomness)
GEN_KW = dict(
    max_new_tokens=500,
    do_sample=True,
    #temperature=1.2,
    #top_p=0.95,
    pad_token_id=tokenizer.eos_token_id,
)

# Batched generation with a decoder-only model needs LEFT padding: the model
# continues from the last position of every row, so that position must hold a
# real prompt token rather than padding.
tokenizer.padding_side = "left"
if tokenizer.pad_token_id is None:
    # padding=True below requires a pad token; fall back to EOS when none is defined.
    tokenizer.pad_token = tokenizer.eos_token

# 3) Create rejected responses via batch generation
BATCH = 4
prompts, chosens, rejecteds = [], [], []

# Generation loop
for i in range(0, len(inputs_txt), BATCH):
    # Keep each text paired with its dataset index, so that dropping a non-str
    # entry can never shift the prompt/chosen/rejected alignment.
    batch = [
        (idx, txt)
        for idx, txt in enumerate(inputs_txt[i:i + BATCH], start=i)
        if isinstance(txt, str)
    ]
    if not batch:
        continue
    batch_idx = [idx for idx, _ in batch]
    batch_txt = [txt for _, txt in batch]

    enc = tokenizer(
        batch_txt,
        return_tensors="pt",
        padding=True,
        truncation=True,  # Safety truncation
        # The text was rendered by the chat template, which already contains
        # every special token it needs; do not let the tokenizer add more.
        add_special_tokens=False,
    ).to(model.device)

    with torch.no_grad():
        out_ids = model.generate(**enc, **GEN_KW)

    # 4) Slice out only the generated continuation.
    # generate() returns [padded prompt | new tokens] for every row, so the
    # continuation starts at the padded prompt width, which is the same for the
    # whole batch. Using the per-row count of non-pad tokens would start the
    # slice too early for every row shorter than the longest one.
    prompt_len = enc["input_ids"].shape[1]
    for j, idx in enumerate(batch_idx):
        gen_ids = out_ids[j, prompt_len:]
        gen_txt = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

        # Bounds check against the parallel lists before recording the pair
        if idx < len(chosen_txt) and idx < len(prompt_txt):
            prompts.append(prompt_txt[idx])
            chosens.append(chosen_txt[idx])
            # An empty generation is stored as the literal placeholder "(empty)".
            rejecteds.append(gen_txt if gen_txt else "(empty)")

In [None]:
# 5) Build and save DPO dataset
# One row per generated pair, with columns "prompt", "chosen" and "rejected".
dpo_ds = Dataset.from_dict(
    {
        "prompt": prompts,
        "chosen": chosens,
        "rejected": rejecteds,
    }
)
dpo_ds.to_parquet(OUT_PARQUET)

# Mirror the rows as human-readable JSON Lines; ensure_ascii=False writes
# non-ASCII characters as-is into the UTF-8 file instead of \u escapes.
with open(OUT_JSONL, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in dpo_ds)

print(f"DPO pairs saved: {OUT_PARQUET}, {OUT_JSONL}")