# 安装依赖

```
操作系统：Ubuntu 20
Python 版本：3.10
CUDA 版本：12.1
pip install modelscope trl transformers ipywidgets addict vllm
```

# 下载模型

In [1]:
from modelscope import snapshot_download
from transformers import AutoTokenizer

# snapshot_download fetches the checkpoint into the local ModelScope cache and
# returns the directory it was stored in; that path is what gets loaded below.
model_name = snapshot_download("Qwen/Qwen2.5-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

Downloading Model to directory: /root/.cache/modelscope/hub/Qwen/Qwen2.5-0.5B-Instruct


2025-02-16 09:34:11,848 - modelscope - INFO - Target directory already exists, skipping creation.


In [2]:
# System message describing the tag layout every response should follow.
# Written line by line; the value starts and ends with a newline.
SYSTEM_PROMPT = (
    "\n"
    "Respond in the following format:\n"
    "<reasoning>\n"
    "...\n"
    "</reasoning>\n"
    "<answer>\n"
    "...\n"
    "</answer>\n"
)

# str.format template with `reasoning` and `answer` fields that renders a
# response in the layout described by SYSTEM_PROMPT.
XML_COT_FORMAT = (
    "\n"
    "<reasoning>\n"
    "{reasoning}\n"
    "</reasoning>\n"
    "<answer>\n"
    "{answer}\n"
    "</answer>\n"
)

# 奖励函数

In [3]:
import re
from datasets import load_dataset, Dataset

def extract_xml_answer(text: str) -> str:
    """Return the stripped contents of the first ``<answer>...</answer>`` block.

    Args:
        text: Raw completion text to search.

    Returns:
        The text between the first ``<answer>`` tag and the nearest following
        ``</answer>`` tag, with surrounding whitespace removed, or ``""`` when
        no such pair is present.
    """
    # Non-greedy on purpose: a greedy ``.*`` with DOTALL would run from the
    # first opening tag to the LAST closing tag, so a completion containing
    # several answer blocks would yield tags and filler text as the "answer".
    match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    return match.group(1).strip() if match else ""

def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Reward completions whose extracted answer contains the reference answer.

    Args:
        prompts: Batch of chat-style prompts. Only the content of the last
            message of the first prompt is used, for the debug printout.
        completions: Batch of completions; the response text is read from
            ``completion[0]['content']`` of each entry.
        answer: Reference answers, paired positionally with ``completions``.
        **kwargs: Accepted and ignored.

    Returns:
        One float per completion: ``1.0`` when the reference answer occurs as
        a substring of the text extracted from the ``<answer>`` tags, otherwise
        ``0.0``. A missing (``None``) or empty reference never earns reward.
    """
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    extracted_responses = [extract_xml_answer(r) for r in responses]
    # Debug trace of the first sample in the batch.
    print('-'*20, f"Question:\n{q}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}", f"\nAnswer:\n{answer[0]}")
    # `a and ...` guards two degenerate references: None would make `a in r`
    # raise TypeError, and "" is a substring of everything (always rewarded).
    return [
        1.0 if a and a in r else 0.0
        for r, a in zip(extracted_responses, answer)
    ]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that contain a reasoning block followed by an answer block.

    The pattern may match anywhere in the response, so extra text before or
    after the tagged section is tolerated.

    Args:
        completions: Batch of completions; the response text is read from
            ``completion[0]["content"]`` of each entry.
        **kwargs: Accepted and ignored.

    Returns:
        One float per completion: ``2.0`` when the pattern is found, else ``0.0``.
    """
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    # Float literals on both branches so the result matches the annotation.
    return [2.0 if re.search(pattern, r, re.DOTALL) else 0.0 for r in responses]
    
def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that consist only of a reasoning block and an answer block.

    Unlike the soft variant, the pattern is anchored at both ends, so only
    whitespace may appear before ``<reasoning>`` and after ``</answer>``.

    Args:
        completions: Batch of completions; the response text is read from
            ``completion[0]["content"]`` of each entry.
        **kwargs: Accepted and ignored.

    Returns:
        One float per completion: ``4.0`` when the whole response matches the
        pattern, else ``0.0``.
    """
    pattern = r"^\s*<reasoning>.*?</reasoning>\s*<answer>.*?</answer>\s*$"
    responses = [completion[0]["content"] for completion in completions]
    # Float literals on both branches so the result matches the annotation.
    return [4.0 if re.search(pattern, r, re.DOTALL) else 0.0 for r in responses]

# 训练集

In [4]:
from modelscope.msdatasets import MsDataset

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

def get_gsm8k_questions(split = "train") -> Dataset:
    """Load a GSM8K split and add a few-shot chat prompt and parsed answer to each row.

    Args:
        split: Name of the dataset split to load. Defaults to ``"train"``.

    Returns:
        The mapped dataset. Each row gains a ``prompt`` field (system message,
        one worked user/assistant example, then the row's question) and has
        its ``answer`` field replaced by the text after the ``####`` marker
        (``None`` when the marker is absent).
    """
    # Forward the caller's `split`; it was previously hard-coded to 'train',
    # which silently ignored the argument.
    data = MsDataset.load('modelscope/gsm8k', subset_name='main', split=split)
    data = data.map(lambda x: {
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            # Few-shot example, because the 0.5B model is too weak without one.
            {'role': 'user', 'content': '数字10203040里面有几个0?'},
            {'role': 'assistant', 'content': XML_COT_FORMAT.format(reasoning='可以将数字拆开看，1、0、2、0、3、0、4、0，我们可以数出有4个0',answer='4')},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    })
    return data

# Build the training set, then show its first record as the cell output.
dataset = get_gsm8k_questions(split="train")
dataset[0]



{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': '72',
 'prompt': [{'content': '\nRespond in the following format:\n<reasoning>\n...\n</reasoning>\n<answer>\n...\n</answer>\n',
   'role': 'system'},
  {'content': '数字10203040里面有几个0?', 'role': 'user'},
  {'content': '\n<reasoning>\n可以将数字拆开看，1、0、2、0、3、0、4、0，我们可以数出有4个0\n</reasoning>\n<answer>\n4\n</answer>\n',
   'role': 'assistant'},
  {'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
   'role': 'user'}]}

# GRPO训练

In [None]:
from trl import GRPOConfig, GRPOTrainer
# GRPO run configuration; keyword arguments are grouped by concern.
training_args = GRPOConfig(
    # Run artefacts: where checkpoints go and how progress is reported.
    output_dir="outputs/Qwen2.5-0.5B-Instruct-GRPO",
    report_to="tensorboard",
    logging_steps=1,
    save_steps=100,
    # Optimiser and learning-rate schedule.
    num_train_epochs=1,
    learning_rate=5e-6,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    max_grad_norm=0.1,
    bf16=True,
    # Batch layout and sequence-length limits.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_generations=2,
    max_prompt_length=256,
    max_completion_length=300,
    # Generate with vLLM, capped at a 0.2 share of GPU memory.
    use_vllm=True,
    vllm_gpu_memory_utilization=0.2,
)

# The trainer is given the local checkpoint path, the tokenizer prepared
# earlier, and the two format rewards plus the correctness reward.
trainer = GRPOTrainer(
    model=model_name,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
    reward_funcs=[
        soft_format_reward_func,
        strict_format_reward_func,
        correctness_reward_func,
    ],
)
trainer.train()

INFO 02-16 09:34:15 __init__.py:190] Automatically detected platform cuda.


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


INFO 02-16 09:34:23 config.py:542] This model supports multiple tasks: {'score', 'reward', 'generate', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 02-16 09:34:23 llm_engine.py:234] Initializing a V0 LLM engine (v0.7.2) with config: model='/root/.cache/modelscope/hub/Qwen/Qwen2___5-0___5B-Instruct', speculative_config=None, tokenizer='/root/.cache/modelscope/hub/Qwen/Qwen2___5-0___5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda:0, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), se

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 02-16 09:34:24 model_runner.py:1115] Loading model weights took 0.9273 GB
INFO 02-16 09:34:26 worker.py:267] Memory profiling takes 1.42 seconds
INFO 02-16 09:34:26 worker.py:267] the current vLLM instance can use total_gpu_memory (22.03GiB) x gpu_memory_utilization (0.20) = 4.41GiB
INFO 02-16 09:34:26 worker.py:267] model weights take 0.93GiB; non_torch_memory takes 0.21GiB; PyTorch activation peak memory takes 1.44GiB; the rest of the memory reserved for KV Cache is 1.83GiB.
INFO 02-16 09:34:26 executor_base.py:110] # CUDA blocks: 9983, # CPU blocks: 21845
INFO 02-16 09:34:26 executor_base.py:115] Maximum concurrency for 32768 tokens per request: 4.87x
INFO 02-16 09:34:28 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_util

Capturing CUDA graph shapes: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:12<00:00,  2.90it/s]

INFO 02-16 09:34:40 model_runner.py:1562] Graph capturing finished in 12 secs, took 0.67 GiB
INFO 02-16 09:34:40 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 15.60 seconds





-------------------- Question:
Ahmed and Emily are having a contest to see who can get the best grade in the class. There have been 9 assignments and Ahmed has a 91 in the class. Emily has a 92. The final assignment is worth the same amount as all the other assignments. Emily got a 90 on the final assignment. What is the minimum grade Ahmed needs to get to beat Emily if all grades are whole numbers? 
Answer:
100 
Response:
To determine the minimum grade Ahmed needs to beat Emily, we'll first compare their grades with the possible lowest grade Ahmed can achieve in the first 9 assignments.

1. Ahmed's grade is 91.
2. Emily's grade is 92.
3. The final grade is worth the same as all the other assignments, so it's 91 (as Emily already has it).

Evaluating the grades for each assignment:

- Assignment 1: 9 + 1 = 10
- Assignment 2: 9 + 1 = 10
- Assignment 3: 9 + 1 = 10
- Assignment 4: 9 + 1 = 10
- Assignment 5: 9 + 1 = 10
- Assignment 6: 9 + 1 = 10
- Assignment 7: 9 + 1 = 10
- Assignment 8: 9

Step,Training Loss
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,-0.0
7,0.0
8,0.0
9,0.0
10,0.0


-------------------- Question:
In a graveyard, there are 20 skeletons.  Half of these skeletons are adult women, and the remaining number are split evenly between adult men and children.  If an adult woman has 20 bones in their body, and a male has 5 more than this, and a child has half as many as an adult woman, how many bones are in the graveyard? 
Answer:
375 
Response:
To determine the total number of bones in the graveyard, let's break down the given data using the information provided:

1. Number of skeletons: 20
2. Half of these skeletons are adult women: 20 / 2 = 10
3. The remaining skeletons are split evenly between adult men and children. So, there are 10 adult men and 10 children.
4. An adult woman has 20 bones.
5. A male has 5 more bones than an adult woman: 20 + 5 = 25 bones.
6. A child has half as many bones as an adult woman: 20 / 2 = 10 bones.

Now, we need to calculate the total number of bones:

- Bones from adult women: 10 * 20 = 200
- Bones from adult men: 10 * 25 =