# Save to HF

In [1]:
import gzip
import json
from tqdm import tqdm
import re
from math_verify import verify
import hashlib

def assign_split_by_hash(key: str, train=0.95, val=0.03, test=0.02) -> str:
    """
    Deterministic split by hashing a key (e.g. question).
    Returns: 'train' | 'validation' | 'test'
    """
    assert abs(train + val + test - 1.0) < 1e-9
    h = hashlib.md5(key.encode("utf-8")).hexdigest()
    r = int(h[:8], 16) / 0xFFFFFFFF  # in [0,1]
    if r < train:
        return "train"
    if r < train + val:
        return "validation"
    return "test"

def load_all_jsonl_gz(path: str):
    data = []
    with gzip.open(path, "rt", encoding="utf-8") as f:
        for line in tqdm(f, desc="Loading jsonl.gz"):
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data


def split_think_answer_complete(assistant_text: str):
    """
    Only call this after has_complete_think() is True.
    - think_text: the content inside <think>...</think>
    - answer_text: everything after the closing </think>
    """
    THINK_OPEN = "<think>"
    THINK_CLOSE = "</think>"
    i = assistant_text.find(THINK_OPEN)
    j = assistant_text.find(THINK_CLOSE)
    think_text = assistant_text[i + len(THINK_OPEN): j].strip()
    answer_text = assistant_text[j + len(THINK_CLOSE):].strip()
    return think_text, answer_text



# def extract_boxed_answer(text: str):
#     _BOXED_RE = re.compile(r"\\boxed\{([^}]*)\}")
#     if not isinstance(text, str):
#         return None
#     m = _BOXED_RE.search(text)
#     return m.group(1).strip() if m else None


def extract_answer_math_verify(text: str):
    from math_verify import LatexExtractionConfig, parse
    """
    Use math_verify to extract a final answer candidate from model output.
    Returns a string (sympy-ish) or None.
    """
    if not isinstance(text, str) or not text.strip():
        return None

    parsed = parse(
        text,
        extraction_mode="first_match",
        extraction_config=[
            LatexExtractionConfig(
                boxed_match_priority=0,          # prefer \boxed{...} when present
                try_extract_without_anchor=True  # more tolerant to messy outputs
            )
        ],
    )
    if not parsed:
        return None

    # parsed elements can be sympy objects and/or strings depending on the expression
    return str(parsed[0])


def is_int_strict(x) -> bool:
    if x is None:
        return False
    try:
        return str(int(x)) == str(x).strip()
    except Exception:
        return False

In [3]:
data = load_all_jsonl_gz("/mnt/local/shared/michaelw/mlf2/verl/reproduce/data/openthoughts3/openthoughts3-math_examples_complete_cot.jsonl.gz")
print("N =", len(data))
print("keys =", data[0].keys())

Loading jsonl.gz: 274290it [02:00, 2272.32it/s]

N = 274290
keys = dict_keys(['difficulty', 'source', 'domain', 'conversations'])





In [4]:
hf_ready = []
kept_idx = 0

for element in tqdm(data):
    question = element["conversations"][0]["value"]
    solution = element["conversations"][-1]["value"]
    if "boxed" not in solution[-100:]:
        continue
    # think_text, answer_text = split_think_answer_complete(solution)
    extracted_answer = extract_answer_math_verify(solution[-50:])
    is_answer_int = is_int_strict(extracted_answer)
    instruction = 'Let\'s think step by step and solve this problem. '                       

    hf_ready_data = {
            "data_source": "open-thoughts/OpenThoughts3-1.2M",
            "prompt": [
                {
                    "role": "user",
                    "content": instruction + question,
                }
            ],
            "ability": "math",
            "reward_model": {"style": "rule", "ground_truth": extracted_answer},
            "extra_info": {
                "split": assign_split_by_hash(question),
                "index": kept_idx,
                "answer": solution, # We should always train with thinking traces
                "question": question,
                "is_answer_int": is_answer_int,
            },
        }
    hf_ready.append(hf_ready_data)
    kept_idx += 1

100%|██████████| 274290/274290 [04:01<00:00, 1134.36it/s]


In [5]:
from datasets import Dataset, DatasetDict

def to_datasetdict(hf_ready):
    splits = {"train": [], "validation": [], "test": []}
    for ex in hf_ready:
        sp = ex["extra_info"]["split"]
        if sp not in splits:
            continue
        splits[sp].append(ex)

    ds_dict = DatasetDict({
        k: Dataset.from_list(v) for k, v in splits.items() if len(v) > 0
    })
    return ds_dict

ds = to_datasetdict(hf_ready)
print(ds)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['data_source', 'prompt', 'ability', 'reward_model', 'extra_info'],
        num_rows: 235018
    })
    validation: Dataset({
        features: ['data_source', 'prompt', 'ability', 'reward_model', 'extra_info'],
        num_rows: 7134
    })
    test: Dataset({
        features: ['data_source', 'prompt', 'ability', 'reward_model', 'extra_info'],
        num_rows: 4720
    })
})


In [9]:
ds['train'][0]

{'data_source': 'open-thoughts/OpenThoughts3-1.2M',
 'prompt': [{'content': "Let's think step by step and solve this problem. A linear function $f(x) = ax + b$ is defined on the interval $[-10, 10]$ such that the sum of the absolute values of $f(x)$ at $100$ evenly spaced points in the interval is equal to $1000$. Let $M$ be the maximum possible value of $|f(0)|$ and let $m$ be the minimum possible value of $|f(0)|$. What is the value of $M - m$?",
   'role': 'user'}],
 'ability': 'math',
 'reward_model': {'ground_truth': '10', 'style': 'rule'},
 'extra_info': {'answer': '<think> Okay, so I need to solve this problem about a linear function f(x) = ax + b defined on the interval [-10, 10]. The condition given is that the sum of the absolute values of f(x) at 100 evenly spaced points in that interval is equal to 1000. We have to find M - m, where M is the maximum possible value of |f(0)| and m is the minimum possible value. Hmm, let\'s break this down step by step.\n\nFirst, let me make 

In [None]:
from huggingface_hub import create_repo

repo_id = "michaelw-cerebras/openthoughts3-math-gsm8kstyle"

create_repo(repo_id, repo_type="dataset", private=True, exist_ok=True)

ds.push_to_hub(repo_id)

In [None]:
# {'difficulty': None, 
#  'source': 'ai2-adapt-dev/openmath-2-math', 
#  'domain': 'math', 
#  'conversations': [{'from': 'human', 'value': 'A bookshelf has 5 shelves, and each shelf can hold up to 3 books. In how many ways can 6 distinct books be placed on the bookshelf such that no shelf contains more than 2 books by the same author, and no two books by the same author are on adjacent shelves?'}, 
#                    {'from': 'gpt', 'value': '<think> \n</think>\n\nTo determine the number of ways to place 6 distinct books on a bookshelf with 5 shelves, considering each shelf can hold up to 3 books, and the constraints that no shelf contains more than 2 books by the same author and no two books by the same author are on adjacent shelves:\n\n1. **Interpretation of Constraints**:\n   - Each shelf can hold up to 3 books, but no shelf can have more than 2 books by the same author.\n   - No two books by the same author can be on adjacent shelves.\n\n2. **Assumption**:\n   - Since the books are distinct, if all books are by different authors, the constraints are automatically satisfied (each author has only one book).\n\n3.'}]}

# Patch: add token count

In [1]:
import re
from datasets import load_dataset
from transformers import AutoTokenizer

REPO_ID = "michaelw-cerebras/openthoughts3-math-gsm8kstyle"
TOKENIZER_NAME = "Qwen/Qwen3-0.6B" 

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)

THINK_RE = re.compile(r"<think>(.*?)</think>", flags=re.DOTALL | re.IGNORECASE)

def split_think_answer(text: str):
    """
    Returns (cot_text, answer_text).
    - cot_text: concatenated content inside <think>...</think> (supports multiple blocks)
    - answer_text: everything after the last </think> if any, else the full text as answer_text and cot_text=""
    """
    if text is None:
        return "", ""

    matches = list(THINK_RE.finditer(text))
    if not matches:
        # no explicit <think> tags
        return "", text.strip()

    cot_parts = [m.group(1).strip() for m in matches]
    cot_text = "\n\n".join([p for p in cot_parts if p])

    # take everything after the last closing tag as answer
    last = matches[-1]
    answer_text = text[last.end():].strip()

    return cot_text, answer_text

def count_tokens(s: str) -> int:
    if not s:
        return 0
    return len(tokenizer.encode(s, add_special_tokens=False))

def add_token_counts(example):
    full = example["extra_info"]["answer"]
    cot_text, ans_text = split_think_answer(full)

    example["extra_info"]["cot_tokens"] = count_tokens(cot_text)
    example["extra_info"]["answer_tokens"] = count_tokens(ans_text)

    # example["cot_tokens"] = example["extra_info"]["cot_tokens"]
    # example["answer_tokens"] = example["extra_info"]["answer_tokens"]

    return example


ds = load_dataset(REPO_ID)
ds2 = ds.map(add_token_counts, desc="Add cot_tokens & answer_tokens")
ds2.push_to_hub(REPO_ID, private=True)

  from .autonotebook import tqdm as notebook_tqdm
Add cot_tokens & answer_tokens: 100%|██████████| 235018/235018 [1:23:54<00:00, 46.68 examples/s]
Add cot_tokens & answer_tokens: 100%|██████████| 7134/7134 [02:28<00:00, 47.97 examples/s]
Add cot_tokens & answer_tokens: 100%|██████████| 4720/4720 [01:41<00:00, 46.68 examples/s]
Creating parquet from Arrow format: 100%|██████████| 5/5 [00:01<00:00,  2.95ba/s]
Processing Files (1 / 1): 100%|██████████|  220MB /  220MB,  110MB/s  
New Data Upload: 100%|██████████| 12.5MB / 12.5MB, 6.25MB/s  
Creating parquet from Arrow format: 100%|██████████| 5/5 [00:01<00:00,  2.76ba/s]
Processing Files (1 / 1): 100%|██████████|  221MB /  221MB,  102MB/s  
New Data Upload: 100%|██████████| 11.5MB / 11.5MB, 7.21MB/s  
Creating parquet from Arrow format: 100%|██████████| 5/5 [00:01<00:00,  2.88ba/s]
Processing Files (1 / 1): 100%|██████████|  221MB /  221MB, 86.2MB/s  
New Data Upload: 100%|██████████| 8.75MB / 8.75MB, 4.86MB/s  
Creating parquet from Arro

CommitInfo(commit_url='https://huggingface.co/datasets/michaelw-cerebras/openthoughts3-math-gsm8kstyle/commit/3d79970811ea59fe31129827f7d41a5408b9dde4', commit_message='Upload dataset', commit_description='', oid='3d79970811ea59fe31129827f7d41a5408b9dde4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/michaelw-cerebras/openthoughts3-math-gsm8kstyle', endpoint='https://huggingface.co', repo_type='dataset', repo_id='michaelw-cerebras/openthoughts3-math-gsm8kstyle'), pr_revision=None, pr_num=None)

# Load from HF and generate Local Parquet for training

In [2]:
from datasets import load_dataset
from tqdm import tqdm
from datasets import Dataset
import os

In [3]:
openthoughts_math_train = dataset = load_dataset(
        "michaelw-cerebras/openthoughts3-math-gsm8kstyle",
        split="train",
        streaming=False,
    )

openthoughts_math_val = dataset = load_dataset(
        "michaelw-cerebras/openthoughts3-math-gsm8kstyle",
        split="validation",
        streaming=False,
    )

Downloading data: 100%|██████████| 21/21 [00:21<00:00,  1.03s/files]
Generating train split: 100%|██████████| 235018/235018 [00:40<00:00, 5773.27 examples/s]
Generating validation split: 100%|██████████| 7134/7134 [00:01<00:00, 5801.12 examples/s]
Generating test split: 100%|██████████| 4720/4720 [00:00<00:00, 5590.11 examples/s]


In [None]:
train_parquet_list, val_parquet_list = [], []
for sample in tqdm(openthoughts_math_train):
    if sample["extra_info"]["is_answer_int"]:
        train_parquet_list.append(sample)


for sample in tqdm(openthoughts_math_val):
    if sample["extra_info"]["is_answer_int"]:
        val_parquet_list.append(sample)

# If you want to do difficulty filtering based on think trace length, here is a good place to add your filtering logic

train_parquet_ds = Dataset.from_list(train_parquet_list)
val_parquet_ds = Dataset.from_list(val_parquet_list)

print("Number of integer output train", len(train_parquet_ds))
print("Number of integer output val", len(val_parquet_ds))


100%|██████████| 235018/235018 [00:29<00:00, 8044.71it/s]
100%|██████████| 7134/7134 [00:00<00:00, 8380.36it/s]


Number of integer output train 156868
Number of integer output val 4787


In [5]:
sample

{'data_source': 'open-thoughts/OpenThoughts3-1.2M',
 'prompt': [{'content': "Let's think step by step and solve this problem. The sum of the digits of $3^2$ is $9$, which is divisible by $3$; the sum of the digits of $33^2$ is $27$, which is divisible by $3$ but not by $9$; and the sum of the digits of $333^2$ is $54$, which is divisible by both $3$ and $9$. Assuming this pattern continues, what is the largest power of $3$ that divides the sum of the digits of $333,\\!333^2$?",
   'role': 'user'}],
 'ability': 'math',
 'reward_model': {'ground_truth': '27', 'style': 'rule'},
 'extra_info': {'answer': '<think> Alright, so I have this problem here about the sum of the digits of powers of numbers made up of all 3s. Let me try to parse it step by step. The question says:\n\n"The sum of the digits of \\(3^2\\) is 9, which is divisible by 3; the sum of the digits of \\(33^2\\) is 27, which is divisible by 3 but not by 9; and the sum of the digits of \\(333^2\\) is 54, which is divisible by b

In [6]:
train_parquet_ds.to_parquet(os.path.join("local_parquet_dir", "train.parquet"))
val_parquet_ds.to_parquet(os.path.join("local_parquet_dir", "test.parquet"))

Creating parquet from Arrow format: 100%|██████████| 69/69 [00:25<00:00,  2.73ba/s]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00,  4.42ba/s]


205095089