In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, PreTrainedTokenizer
from collections import defaultdict, deque
import math
import logging
import json
from typing import Dict, List, Tuple, Optional, Union
from dataclasses import dataclass
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm, trange
import wandb
import os
from pathlib import Path
from collections import deque
from torch.utils.data import Dataset
from typing import List, Tuple, Dict, Any, Optional
from collections import Counter
import random
import string
import re
from tqdm import tqdm
from vllm import LLM, SamplingParams
from config import PRMConfig
import torch.multiprocessing as mp
from datasets import load_dataset
from utils import _sanitize_enhanced, _numeric_equiv_enhanced, _extract_boxed_answer, system_prompt

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  # Arrange GPU devices starting from 0
os.environ["CUDA_VISIBLE_DEVICES"]= "2"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"  # Jupyter forks → spawn 전환
mp.set_start_method("spawn", force=True) 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

INFO 08-12 20:59:10 [__init__.py:244] Automatically detected platform cuda.


In [3]:
def _pretty_print(prompts: list[str], results, *, nchars: int = 512, show_all_candidates: bool = True, case: str = "Perturbed"):
    for i, (p, r) in enumerate(zip(prompts, results)):
        print(f"\n=== Prompt #{i} {case} ===")
        outs = r.outputs if show_all_candidates else r.outputs[:1]
        for j, cand in enumerate(outs):
            txt = cand.text.strip().replace("\n", " ")
            print(f"  -> cand[{j}]:", txt)
    print("\n") 


class ContriRewardvLLM:
    ANSWER_PATTERN = re.compile(
        r"""^[\s>#*\-]*          # optional markdown/bullet symbols
            Answer               # word 'Answer'
            \s*[:.\-]\s*         # separator
            (.+?)\s*$            # capture everything after
        """,
        re.IGNORECASE | re.MULTILINE | re.VERBOSE,
    )
    _ANSWER_RE = re.compile(r"####\s*(.+?)\s*$")
    _MASK_PATTERN = re.compile(
        r"""
        (?:
            \b\d+(?:\.\d+)?\b         # integers / decimals
          | \b\d+/\d+\b                 # simple fractions
        )
        """,
        re.VERBOSE,
    )
    
    def __init__(self, config: "PRMConfig", model_name: str = "mistralai/Mathstral-7B-v0.1"):
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.llm = LLM(
            model=model_name,
            trust_remote_code=True,
            dtype="bfloat16",
            gpu_memory_utilization=0.9,
            max_model_len=4096,
            quantization="bitsandbytes",
        )
        self.tokenizer = self.llm.get_tokenizer()
        
        self.rollout_params = SamplingParams(
            temperature=0.5,
            top_p=0.9,
            max_tokens=self.config.max_new_tokens,
            n=self.config.num_rollouts,
            repetition_penalty=1.1,
        )
        self.masking_params = SamplingParams(
            temperature=0.5,
            top_p=0.9,
            max_tokens=self.config.max_new_tokens,
            n=self.config.num_rollouts,
            repetition_penalty=1.1,
        )
        print(f"vLLM model loaded: {model_name}")

    def _batched_generate(self, prompts: List[str], params: SamplingParams):
        return self.llm.generate(prompts, params)

    def _extract_answer(self, text: str) -> Optional[str]:
        match = self.ANSWER_PATTERN.search(text)
        if match:
            return _sanitize_enhanced(match.group(1))
        lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
        if lines:
            candidate = lines[-1]
            if re.search(r"\d", candidate):  # contains digit
                return _sanitize_enhanced(candidate)
        for line in reversed(text.splitlines()):
            if line.strip().lower().startswith("answer"):
                return _sanitize_enhanced(line.split("Answer", 1)[-1])
        return None

    def _score_batch(self, outputs, gold_answer: str) -> List[float]:
        """Convert vLLM batched outputs → reward list (fraction of correct roll‑outs)."""
        rewards = []
        for result in outputs:
            correct = sum(
                1 for comp in result.outputs
                if (ans := self._extract_answer(comp.text)) and _numeric_equiv_enhanced(ans, gold_answer)
            )
            rewards.append(correct / float(self.config.num_rollouts))
        return rewards

    def compute_step_rewards_batch(self, question: str, sys_prompt: str, steps: List[str], gold_answer: str) -> List[float]:
        base_prompt = f"{sys_prompt}\n\nProblem: {question}\n"
        prompts = [
            base_prompt + "\n".join(steps[:i + 1]) + "\n" + (f"Step {i + 2}:" if i < len(steps) - 1 else "Answer:")
            for i in range(len(steps))
        ]
        outputs = self._batched_generate(prompts, self.rollout_params)
        _pretty_print(prompts, outputs, show_all_candidates=True, case="Original")
        return self._score_batch(outputs, gold_answer)
        
    def model_masking_batch(self, texts: List[str]) -> List[str]:
        mask_prompts = [
            (
                "In the sentence below, mask any word or expression that seems crucial (such as a variable or a number or a operator etc.) "
                "for solving the math problem by replacing it with '[MASKED]'.\n"
                f"Sentence: \"{t}\"\nRewritten:"
            )
            for t in texts
        ]
        outputs = self._batched_generate(mask_prompts, self.masking_params)
        return [out.outputs[0].text.strip() for out in outputs]

    def perturb_step_rewards_batch(self, question: str, sys_prompt: str, steps: List[str], gold_answer: str, use_llm: bool = True) -> List[float]:
        base_prompt = f"{sys_prompt}\n\nProblem: {question}\n"
        bodies = []
        prefixes = []
        for step in steps:
            m = re.match(r"^[\s>#*\-]*Step\s*\d+\s*[:.\-]\s*", step, flags=re.I)
            prefixes.append(m.group(0) if m else "")
            bodies.append(step[len(prefixes[-1]):])

        if use_llm:
            masked_bodies = self.model_masking_batch(bodies)
            print("Masked Bodies:", masked_bodies, flush=True)
        else:
            masked_bodies = [self._MASK_PATTERN.sub("[MASKED]", b) for b in bodies]
            
        prompts = []
        for i in range(len(steps)):
            masked_step = prefixes[i] + masked_bodies[i]
            staged_steps = steps[:i] + [masked_step]
            label = f"Step {i + 2}:" if i < len(steps) - 1 else "Answer:"
            prompts.append(base_prompt + "\n".join(staged_steps) + "\n" + label)

        outputs = self._batched_generate(prompts, self.rollout_params)
        _pretty_print(prompts, outputs, show_all_candidates=True)
        return self._score_batch(outputs, gold_answer)

    def gsm8k_reward_dataset_vllm(self, *, split: str = "train", start: int = 0, take: int | None):
        ds = load_dataset("openai/gsm8k", "main", split=split)
        ds = ds.select(range(start, start + take)) if take else ds
        # ds = ds.select(range(start, len(ds)))
        # print("Generated dataset size: ", len(ds))

        for sample in tqdm(ds, desc="Building GSM8K contri reward-dataset"):
            q_txt, g_sol = sample["question"], sample["answer"]
            lines, gold_ans = [], None
            
            for ln in g_sol.splitlines():
                ln = ln.strip()
                if not ln:
                    continue
                m = self._ANSWER_RE.match(ln)
                if m:
                    gold_ans = _sanitize_enhanced(m.group(1))
                    break
                lines.append(ln)
            if gold_ans is None:
                raise ValueError("gold answer not found for sample")
            
            steps = [f"Step {i+1}: {t}" for i, t in enumerate(lines)]

            ori = self.compute_step_rewards_batch(q_txt, system_prompt("rollout"), steps, gold_ans)
            ptb = self.perturb_step_rewards_batch(q_txt, system_prompt("rollout"), steps, gold_ans, self.config.use_llm)
            contrib = [round(o - p, 4) for o, p in zip(ori, ptb)]

            entry = {
                "question": q_txt,
                "completion": steps,
                "ori_rewards": ori,
                "ptb_rewards": ptb,
                "contributions": contrib,
                "gold_answer": gold_ans,
            }
            yield entry

    def math_reward_dataset_vllm(self, *, split: str = "train", start: int = 0, take: int | None):
        sent_split = re.compile(r'\.(?!\d)(?=\s|$)')
        ds = load_dataset("HuggingFaceTB/MATH", "all", split=split)
        ds = ds.select(range(start, start + take)) if take else ds
        # ds = ds.select(range(start, len(ds)))
        # print("Generated dataset size: ", len(ds))
        
        for sample in tqdm(ds, desc="Building MATH contri reward-dataset"):
            full_sol = sample["solution"]
            boxed_content = _extract_boxed_answer(full_sol)
            gold_ans = _sanitize_enhanced(boxed_content) if boxed_content else None
            if gold_ans is None:
                lines = [line.strip() for line in full_sol.splitlines() if line.strip()]
                for line in reversed(lines):
                    if re.search(r'[\d\-+*/()=]', line):
                        gold_ans = _sanitize_enhanced(line)
                        break
            
            sol_wo_box = re.sub(r'\\boxed\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', '', full_sol)
            raw_steps = [s.strip() for s in sent_split.split(sol_wo_box) if s.strip()]
            steps = [f"Step {i+1}: {s}" for i, s in enumerate(raw_steps)]

            ori = self.compute_step_rewards_batch(sample["problem"], system_prompt("rollout"), steps, gold_ans)
            ptb = self.perturb_step_rewards_batch(sample["problem"], system_prompt("rollout"), steps, gold_ans, self.config.use_llm)
            contrib = [round(o - p, 4) for o, p in zip(ori, ptb)]

            entry = {
                "question": sample["problem"],
                "completion": steps,
                "ori_rewards": ori,
                "ptb_rewards": ptb,
                "contributions": contrib,
                "gold_answer": gold_ans,
            }
            yield entry


In [4]:
cfg = PRMConfig()
model_name = "mistralai/Mathstral-7B-v0.1"  
contri = ContriRewardvLLM(config=cfg, model_name=model_name)

from pprint import pprint
# gen = contri.gsm8k_reward_dataset_vllm(split="train", start=0, take=3)
gen = contri.math_reward_dataset_vllm(split="train", start=0, take=3)

for i, entry in enumerate(gen, 1):
    print(f"\n===== SAMPLE {i} =====")

# next(contri.gsm8k_reward_dataset_vllm(split="train", start=0, take=3))

INFO 08-08 10:05:16 [config.py:823] This model supports multiple tasks: {'score', 'reward', 'embed', 'generate', 'classify'}. Defaulting to 'generate'.
INFO 08-08 10:05:16 [config.py:3268] Downcasting torch.float32 to torch.bfloat16.
INFO 08-08 10:05:17 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=8192.


  self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)


INFO 08-08 10:05:24 [__init__.py:244] Automatically detected platform cuda.
INFO 08-08 10:05:27 [core.py:455] Waiting for init message from front-end.
INFO 08-08 10:05:27 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='mistralai/Mathstral-7B-v0.1', speculative_config=None, tokenizer='mistralai/Mathstral-7B-v0.1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_en

[W808 10:05:28.696153281 socket.cpp:755] [c10d] The client socket cannot be initialized to connect to [ubuntu]:40887 (errno: 97 - Address family not supported by protocol).


INFO 08-08 10:05:28 [parallel_state.py:1065] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 08-08 10:05:28 [gpu_model_runner.py:1595] Starting to load model mistralai/Mathstral-7B-v0.1...
INFO 08-08 10:05:28 [gpu_model_runner.py:1600] Loading model from scratch...
INFO 08-08 10:05:29 [cuda.py:252] Using Flash Attention backend on V1 engine.
INFO 08-08 10:05:29 [bitsandbytes_loader.py:454] Loading weights with BitsAndBytes quantization. May take a while ...
INFO 08-08 10:05:30 [weight_utils.py:292] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/6 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  17% Completed | 1/6 [00:02<00:10,  2.08s/it]
Loading safetensors checkpoint shards:  33% Completed | 2/6 [00:03<00:05,  1.44s/it]
Loading safetensors checkpoint shards:  50% Completed | 3/6 [00:05<00:05,  1.80s/it]
Loading safetensors checkpoint shards:  67% Completed | 4/6 [00:06<00:02,  1.41s/it]
Loading safetensors checkpoint shards:  83% Completed | 5/6 [00:08<00:01,  1.67s/it]
Loading safetensors checkpoint shards: 100% Completed | 6/6 [00:09<00:00,  1.42s/it]
Loading safetensors checkpoint shards: 100% Completed | 6/6 [00:09<00:00,  1.53s/it]



INFO 08-08 10:05:41 [gpu_model_runner.py:1624] Model loading took 3.8620 GiB and 12.200555 seconds
INFO 08-08 10:05:50 [backends.py:462] Using cache directory: /home/leena/.cache/vllm/torch_compile_cache/79c7eedbbf/rank_0_0 for vLLM's torch.compile
INFO 08-08 10:05:50 [backends.py:472] Dynamo bytecode transform time: 8.28 s
INFO 08-08 10:05:58 [backends.py:135] Directly load the compiled graph(s) for shape None from the cache, took 8.271 s
INFO 08-08 10:06:00 [monitor.py:34] torch.compile takes 8.28 s in total
INFO 08-08 10:06:02 [gpu_worker.py:227] Available KV cache memory: 37.74 GiB
INFO 08-08 10:06:02 [kv_cache_utils.py:715] GPU KV cache size: 309,168 tokens
INFO 08-08 10:06:02 [kv_cache_utils.py:719] Maximum concurrency for 4,096 tokens per request: 75.48x
INFO 08-08 10:06:52 [gpu_model_runner.py:2048] Graph capturing finished in 50 secs, took 1.70 GiB
INFO 08-08 10:06:52 [core.py:171] init engine (profile, create kv cache, warmup model) took 70.66 seconds


  self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)


vLLM model loaded: mistralai/Mathstral-7B-v0.1


Building MATH contri reward-dataset:   0%|          | 0/3 [00:00<?, ?it/s]

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/48 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


=== Prompt #0 Original ===
  -> cand[0]: At $x=2$, we have $ax+3$ and $x-5$. Setting them equal gives $2a+3=2-5$ or $2a=-8$ so $a=-4$. Step 3: At $x=-2$, we have $x-5$ and $2x-b$. Setting them equal gives $-2-5=2(-2)-b$ or $-7=-4-b$ so $b=-3$. Step 4: Therefore, $a+b=-4+(-3)=-7$. Answer: -7
  -> cand[1]: At $x=2$, we have $ax+3$ and $x-5$. Setting them equal gives $2a+3 = 2-5 \Rightarrow 2a = -6 \Rightarrow a = -3$ Step 3: At $x=-2$, we have $x-5$ and $2x-b$. Setting them equal gives $-2-5 = -4-b \Rightarrow b = 3$ Step 4: Adding $a$ and $b$ together gives $(-3)+3 = 0$ Answer: 0
  -> cand[2]: At $x=2$, $ax+3 = x-5$. Solving gives $a = -\frac{8}{3}$ Step 3: At $x=-2$, $x-5 = 2x-b$. Solving gives $b = -3$ Step 4: Therefore, $a+b = -\frac{8}{3}-3 = -\frac{17}{3}$ Answer: $-\frac{17}{3}$
  -> cand[3]: At $x=2$, we have $ax+3$ and $x-5$. Setting them equal gives $a(2)+3 = 2-5$, so $2a+3=-3$, thus $2a=-6$, hence $a=-3$ Step 3: At $x=-2$, we have $x-5$ and $2x-b$. Setting them equal gives $-

Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/48 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Masked Bodies: ['"For the piecewise function to be continuous, the cases must "[MASKED]" at $2$ and $-2$".', '"For instance, $[MASKED]$ and $[MASKED]$ are equivalent when $[MASKED]$."', '"This implies $[MASKED](2)+3=[MASKED]-5$, which we solve to get $[MASKED]=[MASKED] \\Rightarrow [MASKED]=[MASKED]$"', '"Similarly, $[MASKED]$ and $[MASKED]$ must be equal when $x=-2$"', '"Substituting, we get $[MASKED]-5=[MASKED](-2)-b$, which implies $b=3$"', '"So $[MASKED]+[MASKED]=[-3+3]$="']


Adding requests:   0%|          | 0/6 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/48 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Building MATH contri reward-dataset:  33%|███▎      | 1/3 [00:56<01:53, 56.53s/it]


=== Prompt #0 Perturbed ===
  -> cand[0]: "At $x=2$, we have $[ax+3]=[x-5]$", so $a*2+3=2-5$. Step 3: "Solving for $a$, we get $2a=-8$, so $a=-4$". Step 4: "At $x=-2$, we have $[x-5]=[2x-b]$", so $-2-5=2(-2)-b$. Step 5: "Solving for $b$, we get $-7=-4-b$, so $b=3$". Step 6: "Therefore, $a+b=-4+3=\boxed{-1}$".
  -> cand[1]: "At $x=2$, we have $[ax+3]=[x-5]$." Step 3: "Solving this gives $a=-\frac{8}{3}$." Step 4: "At $x=-2$, we have $[x-5]=[2x-b]$." Step 5: "Solving this gives $b=-1$." Step 6: "Therefore, $a+b=-\frac{8}{3}-1=\boxed{-\frac{11}{3}}$."
  -> cand[2]: At $x = 2$, we have $ax + 3$ and $x - 5$. Setting them equal gives $[2a + 3] = [2 - 5]$. Step 3: Solving for $a$, we get $2a = -7 \Rightarrow a = -\frac{7}{2}$. Step 4: At $x = -2$, we have $x - 5$ and $2x - b$. Setting them equal gives $[-2 - 5] = [-4 - b]$. Step 5: Solving for $b$, we get $-7 = -4 - b \Rightarrow b = 3$. Step 6: Therefore, $a + b = -\frac{7}{2} + 3 = -\frac{1}{2}$. Answer: $-\frac{1}{2}$
  -> cand[3]: "At $x

Adding requests:   0%|          | 0/8 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


=== Prompt #0 Original ===
  -> cand[0]: Express the total number of band members as $rx + 2$. Step 3: When the number of members in each row is increased by 1 and the number of rows is decreased by 2, the total number of band members can also be expressed as $(r-2)(x+1)$. Step 4: Equate these two expressions: $rx + 2 = (r-2)(x+1)$ Step 5: Expand and simplify: $rx + 2 = rx - 2x + r - 2$ Step 6: Rearrange to isolate terms involving $r$: $2x = r - 2$ Step 7: Since $r$ must be an integer, and $x$ is also an integer, we need to find values of $r$ such that $r-2$ is divisible by 2. This means $r$ must be even. Step 8: Testing values of $r$, we find that $r=10$ satisfies the condition since $10-2=8$, which is divisible by 2. Step 9: Substitute $r=10$ back into the equation $2x = r - 2$ to find $x$: $2x = 10 - 2 \Rightarrow 2x = 8 \Rightarrow x = 4$ Step 10: Calculate the total number of band members using $m=100-2=98$ and $r=10$, then calculate the number of band members: $rx + 2 = 10 \time

Adding requests:   0%|          | 0/8 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Masked Bodies: ['"Let $[MASKED]$ be the number of band members in each row for the original formation, when two are left over."', '"Then we can write two equations from the given information: $$[MASKED]+2=[MASKED]$$ $$([MASKED]-2)([MASKED]+1)=[MASKED]$$ Setting these equal, we find: $$[MASKED]=([MASKED]-2)([MASKED]+1)=[MASKED]-2[MASKED]+[MASKED]-2$$ $$2=-2[MASKED]+[MASKED]-2$$ $$4=[MASKED]-2[MASKED]$$ We know that the band has less than 100 members"', '"Based on the first equation, we must have [MASKED] less than 98"', '"We can guess and check some values of $[MASKED]$ and $[MASKED]$ in the last equation"', '"If $[MASKED]=18$, then $x=7$, and $[MASKED]\\times x=126$ which is too big"', '"If [MASKED]=[MASKED], then [MASKED]=[MASKED], and [MASKED]=[MASKED], which is less than [MASKED]."', '"Checking back in the second formation, we see that $[MASKED]([MASKED])([MASKED])=[MASKED]\\cdot [MASKED]=[MASKED]$ as it should"', '"This is the best we can do, so the largest number of members the ba

Adding requests:   0%|          | 0/8 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/64 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Building MATH contri reward-dataset:  67%|██████▋   | 2/3 [03:36<01:57, 117.39s/it]


=== Prompt #0 Perturbed ===
  -> cand[0]: "Then we have the equation $[MASKED] = r^2 + 2$. This means that the total number of band members is $rm + 2$. Since the total number of band members is less than 100, we have $rm + 2 < 100$, so $rm \le 98$. Also, since $m > r$, we have $m \ge r + 1$, so $rm \ge r(r + 1)$. Therefore, we must have $r(r + 1) \le 98$." Step 3: "We can check to see which values of $r$ satisfy this inequality. We find that $r = 9$ satisfies the inequality, but $r = 10$ does not." Step 4: "If $r = 9$, then $m = r + 1 = 10$, so the total number of band members is $rm + 2 = 9 \cdot 10 + 2 = 92$. However, if we increase the number of members in each row by 1 and reduce the number of rows by 2, we will have $m' = m + 1 = 11$ and $r' = r - 2 = 7$. There will be exactly enough places for all the band members, as $(m+1)(r-2)=(m+1)(r-2)$ Step 4: 3x + 7 = 22 Step 5: Subtract 7 from both sides: 3x = 22 - 7 = 15 Step 6: Divide both sides by 3: x = 15 ÷ 3 = 5 Answer: 5  The lar

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/32 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


=== Prompt #0 Original ===
  -> cand[0]: Rewrite in standard form: $2\pi x^4 + (\sqrt{10}+5)x^4 + 4 + 100 + 5x^3$ Step 3: Identify the highest power of x: 4 Answer: 4
  -> cand[1]: Rearrange terms in decreasing order of power: $2\pi x^4 + \sqrt{10}x^4 + 5x^3 + 100 + 4$ Step 3: Identify the highest power: $x^4$ Answer: 4
  -> cand[2]: Rewrite the polynomial in standard form: $2\pi x^4 + \sqrt{10}x^4 + 5x^3 + 104$ Step 3: Identify the highest power of x: 4 Answer: 4
  -> cand[3]: Rewrite the polynomial in standard form: $4 + 5x^3 + 100 + 2\pi x^4 + \sqrt{10}x^4 + 9$ becomes $103 + 5x^3 + (2\pi + \sqrt{10})x^4$ Step 3: Identify the highest power of x: $x^4$ Step 4: The degree of a polynomial is the highest power of x Answer: 4
  -> cand[4]: Rewrite the polynomial in standard form: $2\pi x^4 + \sqrt{10}x^4 + 100 + 5x^3 + 4$ Step 3: Identify the highest power of x: 4 Answer: 4
  -> cand[5]: Rearrange terms to group like powers of $x$ together: $(4 + 100 + 9) + (5x^3 + 2\pi x^4 + \sqrt{10}x

Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/32 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Masked Bodies: ['"This polynomial is not written in standard form"', '"However, we don\'t need to write it in standard form, nor do we need to pay attention to the [MASKED]."', '"We just look for the exponents on $[MASKED]$"', '"We have an $[MASKED]$ term and no other term of higher degree, so $$ is the degree of the polynomial"']


Adding requests:   0%|          | 0/4 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/32 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Building MATH contri reward-dataset: 100%|██████████| 3/3 [04:17<00:00, 85.73s/it] 


=== Prompt #0 Perturbed ===
  -> cand[0]: Rewrite the polynomial in standard form: $100 + 4 + 2\pi x^4 + \sqrt{10}x^4 + 5x^3$ Step 3: Identify the highest power of x: $x^4$ Step 4: The degree of the polynomial is the highest power of x: 4 Answer: 4
  -> cand[1]: "Rewrite it in standard form to identify the highest power of x" Step 3: Rearrange terms to get $(2\pi x^4 + \sqrt{10}x^4 + 5x^3 + 100 + 4)$ Step 4: Identify the highest power of x, which is 4 Answer: 4
  -> cand[2]: "Rearrange the terms to group like terms together" Step 3: "$(4 +100 +9) + (5x^3 + 2\pi x^4 + \sqrt{10}x^4)$" Step 4: "The constant terms are $4 + 100 + 9 = 113$" Step 5: "Combine the terms with variable parts: $(5x^3 + (2\pi + \sqrt{10})x^4)$" Step 6: "Identify the highest power of x" Step 7: "The highest power of x is 4" Answer: 4
  -> cand[3]: "Rewrite the polynomial in standard form: $2\pi x^4 + (\sqrt{10}+100)x^4 + 5x^3 + 4$" Step 3: "Identify the highest power of x: $x^4$" Step 4: "The degree of the polynomi




In [7]:
gsm = load_dataset("openai/gsm8k", "main", split="train")
for idx in range(0,3):
    print(gsm[idx]['answer'])
    print()

Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72

Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.
Working 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.
#### 10

In the beginning, Betty has only 100 / 2 = $<<100/2=50>>50.
Betty's grandparents gave her 15 * 2 = $<<15*2=30>>30.
This means, Betty needs 100 - 50 - 30 - 15 = $<<100-50-30-15=5>>5 more.
#### 5



In [5]:
math = load_dataset("HuggingFaceTB/MATH", "all", split="train")
for idx in range(0,3):
    print(math[idx]['solution'])
    print()

For the piecewise function to be continuous, the cases must "meet" at $2$ and $-2$. For example, $ax+3$ and $x-5$ must be equal when $x=2$. This implies $a(2)+3=2-5$, which we solve to get $2a=-6 \Rightarrow a=-3$. Similarly, $x-5$ and $2x-b$ must be equal when $x=-2$. Substituting, we get $-2-5=2(-2)-b$, which implies $b=3$. So $a+b=-3+3=\boxed{0}$.

Let $x$ be the number of band members in each row for the original formation, when two are left over.  Then we can write two equations from the given information: $$rx+2=m$$ $$(r-2)(x+1)=m$$ Setting these equal, we find: $$rx+2=(r-2)(x+1)=rx-2x+r-2$$ $$2=-2x+r-2$$ $$4=r-2x$$ We know that the band has less than 100 members.  Based on the first equation, we must have $rx$ less than 98.  We can guess and check some values of $r$ and $x$ in the last equation.  If $r=18$, then $x=7$, and $rx=126$ which is too big.  If $r=16$, then $x=6$, and $rx=96$, which is less than 98.  Checking back in the second formation, we see that $(16-2)(6+1)=14\cdo