In [1]:
from datasets import load_dataset, Dataset
from collections import defaultdict, deque
from tqdm import tqdm
import gc

ds = load_dataset("nvidia/OpenMathInstruct-1", split='train')

g_cycle = deque(['gsm8k', 'math'])
s_deque = {'gsm8k': deque(), 'math': deque()}
unique_questions = set()

for i, ex in enumerate(tqdm(ds, desc='Iterating dataset')):
    dataset_name = ex.get('dataset')
    question = ex.get('question')
    # chỉ xét sample đúng
    if ex['is_correct'] != True:
        continue
    if  question in unique_questions or dataset_name not in ('gsm8k', 'math'):
        continue
    
    unique_questions.add(question)
    s_deque[dataset_name].append({
        'question': question,
        'answer': ex.get('expected_answer'),
    })

del ds
gc.collect()

  from .autonotebook import tqdm as notebook_tqdm
Iterating dataset: 100%|██████████| 7321344/7321344 [02:23<00:00, 51123.33it/s]
Iterating dataset: 100%|██████████| 7321344/7321344 [02:23<00:00, 51123.33it/s]


0

In [2]:
import re
def parse_solution_into_steps(solution):
    steps = []
    
    block_pattern = r'(<llm-code>.*?</llm-code>|<llm-code-output>.*?</llm-code-output>)'
    parts = re.split(block_pattern, solution, flags=re.DOTALL)
    
    for part in parts:
        part = part.strip()
        if not part:
            continue
        
        # code block or code-output block, keep as one step
        if part.startswith('<llm-code>') or part.startswith('<llm-code-output>'):
            steps.append(part)
        
        # Plain text - split by sentences (dấu chấm)
        else:
            sentences = re.split(r'(?<=[.!?])\s+', part)
            for sent in sentences:
                sent = sent.strip()
                if sent:
                    steps.append(sent)
    
    return steps

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.strip()
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

ADAPTER_PATH = "/home/guest/AdvancedLLMReasoning/math_tutor_model/math_sft_adapter/v2/final_checkpoint" 
BASE_MODEL_ID = "meta-llama/Llama-3.2-1B"

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
sft_model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
sft_model.eval()

`torch_dtype` is deprecated! Use `dtype` instead!


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

In [4]:
from collections import deque
import random
seed = 42
ran = random.Random(seed)

In [5]:
instruction = (
            "Solve the problem step by step. You can use Python code if needed.\n"
            "If you write code, wrap it inside <llm-code> ... </llm-code>.\n"
            "Output ONLY the final number inside \\boxed{}."
)

In [6]:
TARGET_SIZE = 200000
total_steps = 0
total_questions = 0
prm_dataset = []

g_cycle = deque(['gsm8k', 'math'])

# Batch processing
BATCH_SIZE = 32
batch_questions = []
batch_answers = []
batch_prompts = []

with tqdm(total=TARGET_SIZE, desc="Steps collected") as pbar:
    while total_steps < TARGET_SIZE:
        # Thu thập batch
        while len(batch_prompts) < BATCH_SIZE:
            g = g_cycle.popleft()
            g_cycle.append(g)

            if not s_deque[g]:
                continue

            s = s_deque[g].popleft()
            question = s['question']
            answer = s['answer']
            
            prompt = (
                f"### Question:\n{clean_text(question)}\n\n"
                f"### Instruction:\n{instruction}\n\n"
                f"### Solution:\n"
            )
            
            batch_prompts.append(prompt)
            batch_questions.append(question)
            batch_answers.append(answer)
        
        # Tokenize và generate cho batch
        inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to(sft_model.device)

        with torch.no_grad():
            outputs = sft_model.generate(
                **inputs, 
                max_new_tokens=512, 
                temperature=0.7, 
                top_p=0.9, 
                do_sample=True, 
                pad_token_id=tokenizer.eos_token_id)

        # Xử lý từng output trong batch
        for idx in range(len(batch_prompts)):
            generated_ids = outputs[idx][inputs["input_ids"].shape[-1]:]
            solution = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

            steps = parse_solution_into_steps(solution)
            if not steps:
                continue

            num_steps = len(steps)
            if total_steps + num_steps > TARGET_SIZE:
                break

            total_steps += num_steps
            total_questions += 1
            pbar.update(num_steps)

            prm_dataset.append({
                "question": batch_questions[idx],
                "expected_answer": batch_answers[idx],
                "solution_steps": steps
            })
        
        # Clear batch
        batch_prompts = []
        batch_questions = []
        batch_answers = []
        
        if not s_deque['gsm8k'] and not s_deque['math']:
            break

Steps collected:   0%|          | 0/200000 [00:00<?, ?it/s]

Steps collected:   8%|▊         | 16237/200000 [18:25<3:28:27, 14.69it/s]



OutOfMemoryError: CUDA out of memory. Tried to allocate 1.45 GiB. GPU 0 has a total capacity of 23.52 GiB of which 1.37 GiB is free. Process 1685860 has 4.34 GiB memory in use. Process 1689426 has 4.51 GiB memory in use. Process 1694130 has 6.18 GiB memory in use. Including non-PyTorch memory, this process has 6.77 GiB memory in use. Of the allocated memory 4.97 GiB is allocated by PyTorch, and 1.34 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)