In [25]:
import json
from pathlib import Path

import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

### nvidia/Llama-Nemotron-Post-Training-Dataset

In [2]:
# Stream the "science" split and export (problem, answer) pairs to JSON.
science_dataset = load_dataset(
    "nvidia/Llama-Nemotron-Post-Training-Dataset",
    split="science",
    streaming=True,
)

results = []
skipped = 0  # records whose prompt could not be read

for i, example in tqdm(enumerate(science_dataset), desc="Processing"):
    # Read the prompt in its own try-block: if it fails we must not fall
    # through and append, otherwise `problem` would still hold the value from
    # the previous record (or be undefined on the very first one).
    # Assumes "input" is a list of message dicts with a "content" key — the
    # indexing below is the only evidence of that schema here.
    try:
        problem = example["input"][0]["content"]
    except (KeyError, IndexError, TypeError):
        skipped += 1
        continue

    output = example["output"]
    # Keep everything after the first closing reasoning tag. partition() does
    # not drop text when the tag occurs more than once, unlike split(...)[1].
    _, tag, tail = output.partition("</think>")
    if tag:
        answer = tail.strip()
    else:
        # No reasoning tag at all: fall back to the last 1000 characters.
        answer = output[-1000:]

    # problem_id is the position in the stream, so ids stay stable even when
    # some records are skipped.
    results.append({"problem_id": i, "problem": problem, "answer": answer})

if skipped:
    print(f"Skipped {skipped} records with an unreadable prompt")

# Write to JSON file
with open("nemotron_posttrain_science.json", mode="w", encoding="utf-8") as jsonfile:
    json.dump(results, jsonfile, ensure_ascii=False, indent=2)
    


Processing: 708920it [05:32, 2130.67it/s]


### AI-MO/NuminaMath-1.5
- Filter out synthetically generated ones since we saw wrong answers before
- Filter out proof type questions since they are non-verifiable
- Filter on the `solution_is_valid` and `problem_is_valid` fields (keep only `"Yes"`)
- Focus on questions whose source is `olympiads`

In [28]:
# Stream NuminaMath-1.5 and keep only human-written, validated, non-proof
# olympiad word problems that have both an answer and a solution.
math_dataset = load_dataset("AI-MO/NuminaMath-1.5", split="train", streaming=True)

# Make sure the output directory exists *before* the streaming pass, so a
# missing folder cannot throw away the whole run at write time.
output_path = Path("raw_json/numina_math_1p5_filtered.json")
output_path.parent.mkdir(parents=True, exist_ok=True)

results = []

for i, example in tqdm(enumerate(math_dataset), desc="Processing"):
    # Deliberately `!= False` rather than a truthiness test: a record whose
    # flag is missing (None) is dropped as well.
    if example["synthetic"] != False:  # noqa: E712
        continue
    # Both the problem and its solution must be marked valid.
    if example["solution_is_valid"] != "Yes" or example["problem_is_valid"] != "Yes":
        continue
    # Proofs are not automatically verifiable, so keep word problems only.
    if example["question_type"] != "math-word-problem" or example["answer"] == "proof":
        continue
    if example["answer"] is None or example["answer"] == "":
        continue
    if example["solution"] is None or example["solution"] == "":
        continue
    if example["source"] != "olympiads":
        continue

    # problem_id is the index in the unfiltered stream, not a dense counter.
    results.append(
        {
            "problem_id": i,
            "problem": example["problem"],
            "answer": example["answer"],
            "solution": example["solution"],
        }
    )

# Write to JSON file
with open(output_path, mode="w", encoding="utf-8") as jsonfile:
    json.dump(results, jsonfile, ensure_ascii=False, indent=2)

print(len(results))

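# NOTE: the count previously noted here (98086) does not match the recorded output of this cell (84638) — presumably left over from an earlier filter configuration.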


Processing: 896215it [03:39, 4074.99it/s]


84638


In [29]:
# Load the filtered NuminaMath records back from the JSON file on disk.
with open("raw_json/numina_math_1p5_filtered.json", mode="r", encoding="utf-8") as jsonfile:
    data = json.load(jsonfile)

### zwhe99/DeepMath-103K

In [24]:
# Stream DeepMath-103K and export every record as a (problem, answer) pair.
math_dataset = load_dataset(
    "zwhe99/DeepMath-103K",
    split="train",
    streaming=True,
)

results = []

for i, example in tqdm(enumerate(math_dataset), desc="Processing"):
    # No filtering here: each streamed record becomes one output row, with
    # its stream position as the id.
    results.append(
        {
            "problem_id": i,
            "problem": example["question"],
            "answer": example["final_answer"],
        }
    )

# Write to JSON file
with open("deepmath-103k.json", mode="w", encoding="utf-8") as jsonfile:
    json.dump(results, jsonfile, ensure_ascii=False, indent=2)




Downloading readme: 100%|██████████| 6.44k/6.44k [00:00<00:00, 164kB/s]
Processing: 103110it [03:01, 569.64it/s] 


### simplescaling/data_ablation_full59K

In [22]:
# Load the train split without streaming so that records can be accessed by
# index (e.g. dataset[0]); leaving this commented out makes any later use of
# `dataset` fail with NameError on a fresh kernel.
dataset = load_dataset("simplescaling/data_ablation_full59K", split="train")




Downloading readme: 100%|██████████| 1.07k/1.07k [00:00<00:00, 22.0kB/s]
Downloading data: 100%|██████████| 19/19 [01:01<00:00,  3.25s/files]
Generating train split: 100%|██████████| 58986/58986 [00:40<00:00, 1452.22 examples/s]


In [23]:
# Display the first record to see which fields the dataset provides
# (requires `dataset` to have been loaded in this kernel session).
dataset[0]

{'solution': 'Since $1 \\le \\sqrt{1} < \\sqrt{2} < \\sqrt{3} < 2,$ the first three terms of the sum are equal to $1.$ Then, since $2 \\le \\sqrt{4} < \\sqrt{5} < \\dots < \\sqrt{8} < 3,$ the next five terms equal $2.$ Then, since $3 \\le \\sqrt{9} < \\sqrt{10} < \\dots < \\sqrt{15} < 4,$ the next seven terms equal $3.$ Finally, the last term equals $\\lfloor 4 \\rfloor = 4.$ So the overall sum is \\[3(1) + 5(2) + 7(3) + 4 = 3 + 10 + 21 + 4 = \\boxed{38}.\\]',
 'question': 'The symbol $\\lfloor x \\rfloor$ denotes the largest integer not exceeding $x$. For example, $\\lfloor 3 \\rfloor = 3,$ and $\\lfloor 9/2 \\rfloor = 4.$ Compute \\[\\lfloor \\sqrt{1} \\rfloor + \\lfloor \\sqrt{2} \\rfloor + \\lfloor \\sqrt{3} \\rfloor + \\cdots + \\lfloor \\sqrt{16} \\rfloor.\\]',
 'cot_type': 'math',
 'source_type': 'qfq/openaimath/Intermediate Algebra',
 'metadata': "{'answer': '38', 'subject': 'Intermediate Algebra', 'level': 2, 'unique_id': 'train/intermediate_algebra/1563.json'}",
 'cot': None,