In [4]:
import datasets
from transformers import AutoTokenizer
import os

# Tokenizer whose chat template is used below to render prompts as plain text.
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B-Base')

# Load the train split, with one loader process per CPU reported by os.cpu_count().
ds = datasets.load_dataset(
    'open-thoughts/OpenThoughts2-1M',
    split='train',
    num_proc=os.cpu_count(),
)

# Keep a fixed-seed random subset. 111112 rows is chosen so that the later
# test_size=0.1 split yields 100000 train rows and 11112 test rows.
num_to_keep = 111112
ds = ds.shuffle(seed=42)
ds = ds.select(range(num_to_keep))

# Instruction placed in front of every problem statement in the user turn.
math_prefix = "Solve the following math problem. Give your final answer as \\boxed{}."

def map_function(example):
    """Turn one conversation record into a chat-formatted query/completion pair.

    Reads ``example['conversations'][0]['value']`` as the problem statement and
    ``example['conversations'][1]['value']`` as the reference response
    (presumably the user and assistant turns -- the dataset schema is not
    visible here, confirm against the dataset card).

    The problem statement has the literal instruction
    'Return your final response within \\boxed{}.' removed, is stripped of
    surrounding whitespace, and is prefixed with the module-level
    ``math_prefix`` followed by a newline. The result is rendered as a single
    user message through the module-level ``tokenizer``'s chat template, as
    text with the generation prompt appended.

    Args:
        example: A single (non-batched) dataset row with a 'conversations' list.

    Returns:
        The same ``example`` object, mutated in place with two added keys:
        'query' (the rendered prompt string) and 'completion' (the stripped
        response).
    """
    boxed_instruction = 'Return your final response within \\boxed{}.'
    turns = example['conversations']

    # str.replace leaves the text untouched when the instruction is absent,
    # so no membership check is needed first.
    problem = turns[0]['value'].replace(boxed_instruction, '').strip()
    answer = turns[1]['value'].strip()

    # NOTE(review): math_prefix is applied to every row; confirm this is
    # intended if the dataset also contains non-math problems.
    messages = [{"role": "user", "content": f"{math_prefix}\n{problem}"}]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    example['query'] = prompt
    example['completion'] = answer
    return example

# Convert every row with map_function, one example at a time, across all CPUs.
# The original columns are dropped so only the keys added by map_function
# ('query' and 'completion') remain.
ds = ds.map(
    map_function,
    batched=False,
    num_proc=os.cpu_count(),
    remove_columns=ds.column_names,
)

# Hold out 10% of the rows as a test split; the fixed seed keeps it reproducible.
ds = ds.train_test_split(test_size=0.1, seed=42)

# Display the resulting DatasetDict as the cell output.
ds

Resolving data files:   0%|          | 0/38 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/38 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/29 [00:00<?, ?it/s]

Map (num_proc=24):   0%|          | 0/111112 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['query', 'completion'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['query', 'completion'],
        num_rows: 11112
    })
})

In [5]:
# Upload both splits to the Hugging Face Hub under this dataset repository.
ds.push_to_hub(repo_id='Asap7772/OpenThoughts2-100k-qwen-sft')

Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/25 [00:00<?, ?ba/s]

KeyboardInterrupt: 

In [6]:
# Write both splits as parquet files into a single local output directory.
output_dir = '/home/anikait.singh/rl_behaviors_verl_stable/data_openthoughts_100k_sft'
os.makedirs(output_dir, exist_ok=True)

ds['train'].to_parquet(f'{output_dir}/train.parquet')
# Kept as the final expression so the cell still displays this call's return value.
ds['test'].to_parquet(f'{output_dir}/test.parquet')

Creating parquet from Arrow format:   0%|          | 0/100 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

182274717