# Data Preprocessing Example: Math QA (GSM8K)

In [11]:

import argparse
import os
import re

import datasets

from verl.utils.hdfs_io import copy, makedirs

In [14]:
def extract_solution(solution_str):
    """Return the final numeric answer that follows the ``#### `` marker.

    Args:
        solution_str: Reference solution text that contains a marker such as
            ``#### 1,234``. The first such marker in the string is used.

    Returns:
        The matched number as a string with commas removed (e.g. ``"1234"``).
        The value is not converted to a numeric type.

    Raises:
        AssertionError: If no ``#### <number>`` marker is present.
    """
    match = re.search(r"#### (\-?[0-9\.\,]+)", solution_str)
    if match is None:
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O`` and the failure names the offending text.
        raise AssertionError(f"no '#### <number>' answer marker found in: {solution_str!r}")
    # Group 1 is the number itself; drop thousands separators.
    return match.group(1).replace(",", "")


In [15]:
# Dataset identifier passed to `load_dataset`; also recorded on every processed row.
data_source = "openai/gsm8k"
# Instruction sentence appended to each question when the prompt is built.
instruction_following = "Let's think step by step and output the final answer after \"####\"."

# Load the "main" configuration and pull out the two splits used below.
dataset = datasets.load_dataset(data_source, "main")
train_dataset, test_dataset = dataset["train"], dataset["test"]

In [16]:
# Show the raw training split's summary (column names and row count).
train_dataset

Dataset({
    features: ['question', 'answer'],
    num_rows: 7473
})

In [17]:
# Peek at the first raw example to see the question/answer text format.
train_dataset[0]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}

In [18]:
# Convert each raw question/answer example into a prompt + rule-based reward record.
def make_map_fn(split):
    """Build the per-example mapper for one dataset split.

    Args:
        split: Split label (e.g. ``"train"``) stored in each record's
            ``extra_info["split"]``.

    Returns:
        A function ``process_fn(example, idx)`` that returns a dict with the
        keys ``data_source``, ``prompt``, ``ability``, ``reward_model`` and
        ``extra_info``. It is meant to be called with the row index as the
        second argument (i.e. ``Dataset.map(..., with_indices=True)``).
    """

    def process_fn(example, idx):
        # `pop` (rather than indexing) removes the raw column from `example`,
        # so it is not carried along next to the new fields.
        raw_question = example.pop("question")
        user_message = {
            "role": "user",
            "content": " ".join((raw_question, instruction_following)),
        }

        raw_answer = example.pop("answer")
        # Only the final number after "####" is kept as the ground truth.
        reward_spec = {"style": "rule", "ground_truth": extract_solution(raw_answer)}

        # Keep the untouched source text and its position for traceability.
        provenance = {
            "split": split,
            "index": idx,
            "answer": raw_answer,
            "question": raw_question,
        }

        return {
            "data_source": data_source,
            "prompt": [user_message],
            "ability": "math",
            "reward_model": reward_spec,
            "extra_info": provenance,
        }

    return process_fn

# Map from the untouched splits held in `dataset` rather than from
# `train_dataset` / `test_dataset` themselves, so this cell can be re-run:
# `process_fn` pops "question"/"answer", and mapping an already-mapped split
# would fail with KeyError because those columns are gone.
train_dataset = dataset["train"].map(function=make_map_fn("train"), with_indices=True)
test_dataset = dataset["test"].map(function=make_map_fn("test"), with_indices=True)

In [19]:
# Write both processed splits as parquet files under one local output directory.
local_dir = "local_parquet_dir"
# Make sure the target directory exists before writing; no-op if it already does.
os.makedirs(local_dir, exist_ok=True)

train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 18.96ba/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 191.79ba/s]


1211361

In [9]:
# Show the training split's summary after mapping (new column names, same row count).
# NOTE(review): this cell's execution count is lower than the cells above it, so the
# output shown may come from an earlier run — confirm with Restart & Run All.
train_dataset

Dataset({
    features: ['data_source', 'prompt', 'ability', 'reward_model', 'extra_info'],
    num_rows: 7473
})

In [10]:
# Peek at the first processed record to check the prompt / reward_model / extra_info layout.
train_dataset[0]

{'data_source': 'openai/gsm8k',
 'prompt': [{'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? Let\'s think step by step and output the final answer after "####".',
   'role': 'user'}],
 'ability': 'math',
 'reward_model': {'ground_truth': '72', 'style': 'rule'},
 'extra_info': {'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72',
  'index': 0,
  'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
  'split': 'train'}}

# Data Preprocessing Example: Code Generation