### 引用模型

In [1]:
from unsloth import FastModel
import torch
# Load the Gemma-3 1B instruct checkpoint and its tokenizer through Unsloth's FastModel.
max_seq_length = 1536 # Maximum sequence length used for the model (original note: the default is 1024)
lora_rank = 8 # LoRA rank; a larger rank consumes more memory

# With both quantization flags and full_finetuning switched off, the Unsloth log
# printed by this cell reports "Switching to 16bit LoRA".
model, tokenizer = FastModel.from_pretrained(
    # NOTE(review): absolute local path, not portable; the Hub id noted by the author is "unsloth/gemma-3-1b-it"
    model_name = "/home/projects/unsloth-training/models/gemma-3-1b-it", #"unsloth/gemma-3-1b-it",
    max_seq_length = max_seq_length, # Any length can be chosen to support long contexts
    load_in_4bit = False,  # 4-bit quantization to reduce memory use
    load_in_8bit = False, # Higher precision, but uses 2x the memory
    full_finetuning = False, # Full fine-tuning is also supported
    # gpu_memory_utilization = 0.85, # GPU memory utilization; lower this value on OOM
    # token = "hf_...", # A token is required when using gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-20 19:18:36 [__init__.py:256] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.17: Fast Gemma3 patching. Transformers: 4.50.0.dev0. vLLM: 0.8.1.
   \\   /|    NVIDIA GeForce RTX 4070 Ti SUPER. Num GPUs = 1. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


### 加载 Lora 设置

In [2]:
# Wrap the loaded model with LoRA adapters; rank and alpha both come from lora_rank.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for text-only work or when the model has no vision layers
    finetune_language_layers   = True,  # Should stay on
    finetune_attention_modules = True,  # Attention is helpful for GRPO
    finetune_mlp_modules       = True,  # Should always stay on

    r = lora_rank,           # Larger = higher accuracy, but may overfit
    lora_alpha = lora_rank,  # Recommended: alpha at least equal to r
    lora_dropout = 0,
    bias = "none",
    random_state = 3407, # Fixed seed so runs use the same random state
)

Unsloth: Making `model.base_model.model.model` require gradients


### 加载数据集

In [3]:
from datasets import load_dataset
# Load the training split of the GSM8K "main" configuration.
dataset = load_dataset("openai/gsm8k", "main", split = "train")

# Peek at the data. Only the last expression of a cell is rendered, so the
# dataset summary is printed explicitly and the first question is left as the
# cell's displayed value.
print(dataset)
dataset[0]["question"]

'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?'

In [4]:
# Reference answers carry their final result after a "####" marker; this helper
# pulls that result out so the dataset can be cleaned up later.
def extract_hash_answer(text):
    """Return the text that follows the first "####" marker, stripped.

    Only the segment up to the next "####" (if any) is returned. When the
    marker is absent the function returns None.
    """
    _, marker, tail = text.partition("####")
    if not marker:
        return None
    # Cut at a possible second marker so only the first segment is kept.
    return tail.partition("####")[0].strip()
# Sanity check: extract the final answer of the first training example.
extract_hash_answer(dataset[0]["answer"])

'72'

In [5]:
# Tags the model is asked to wrap its reasoning and final solution in,
# and the system prompt that explains them.
reasoning_start, reasoning_end = "<start_working_out>", "<end_working_out>"
solution_start, solution_end = "<SOLUTION>", "</SOLUTION>"

# Assembled line by line; the joined text has no trailing newline.
system_prompt = "\n".join([
    "You are given a problem.",
    "Think about the problem and provide your working out.",
    f"Place it between {reasoning_start} and {reasoning_end}.",
    f"Then, provide your solution between {solution_start}{solution_end}",
])
system_prompt

'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION>'

In [6]:
# Convert each row into the shape used for GRPO training in this notebook:
# a chat-style "prompt" (system + user messages) plus the bare final answer.
def _to_grpo_example(example):
    """Build the prompt messages and extract the final answer for one dataset row."""
    return {
        "prompt" : [
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": example["question"]},
        ],
        "answer": extract_hash_answer(example["answer"]), # final result of the math problem
    }

# Guard against re-running this cell: after the first pass "answer" no longer
# contains "####", so mapping again would replace every answer with None.
if "prompt" not in dataset.column_names:
    dataset = dataset.map(_to_grpo_example)
dataset[0]

{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
 'answer': '72',
 'prompt': [{'content': 'You are given a problem.\nThink about the problem and provide your working out.\nPlace it between <start_working_out> and <end_working_out>.\nThen, provide your solution between <SOLUTION></SOLUTION>',
   'role': 'system'},
  {'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',
   'role': 'user'}]}

### 定义奖励函数

In [7]:
import re

# Regex used to decide whether a completion follows the requested layout:
#   <reasoning_start> ... <reasoning_end> ... <solution_start>ANSWER<solution_end>
# Group 1 captures the text between the solution tags.
# The tags go through re.escape so that a tag containing regex metacharacters
# is still matched literally.
# NOTE(review): with re.MULTILINE, ^ and $ anchor at line boundaries, so text on
# earlier or later lines does not prevent a match; confirm this is intended.
match_format = re.compile(
    rf"^\s*"
    rf"{re.escape(reasoning_start)}.+?{re.escape(reasoning_end)}.*?"
    rf"{re.escape(solution_start)}(.+?){re.escape(solution_end)}"
    rf"\s*$",
    flags = re.MULTILINE | re.DOTALL
)

In [8]:
# Sanity check: a minimal well-formed completion should produce a match object.
match_format.search(
    "<start_working_out>Let me think!<end_working_out>"\
    "<SOLUTION>2</SOLUTION>",
)

<re.Match object; span=(0, 71), match='<start_working_out>Let me think!<end_working_out>>

In [9]:
# Strict format reward
def match_format_exactly(completions, **kwargs):
    """Reward completions whose text is matched by ``match_format``.

    Args:
        completions: One entry per generated sample; the text is read from
            ``completion[0]["content"]``.
        **kwargs: Accepted and ignored.

    Returns:
        list: One score per completion, 3.0 when ``match_format.search``
        finds a match and 0 otherwise.
    """
    return [
        3.0 if match_format.search(sample[0]["content"]) is not None else 0
        for sample in completions
    ]

In [10]:
def match_format_approximately(completions, **kwargs):
    """Lenient format reward based on how often each tag appears.

    Each of the four tags (reasoning start/end, solution start/end)
    contributes +0.5 when it occurs exactly once in the completion text and
    -0.5 otherwise (missing or repeated), so partially formatted output still
    gets a graded signal.

    Args:
        completions: One entry per generated sample; the text is read from
            ``completion[0]["content"]``.
        **kwargs: Accepted and ignored.

    Returns:
        list: One score per completion, between -2.0 and 2.0.
    """
    expected_tags = (reasoning_start, reasoning_end, solution_start, solution_end)
    scores = []
    for sample in completions:
        text = sample[0]["content"]
        scores.append(
            sum(0.5 if text.count(tag) == 1 else -0.5 for tag in expected_tags)
        )
    return scores

In [11]:
def check_answer(prompts, completions, answer, **kwargs):
    """Score each completion by comparing its extracted solution with the reference.

    The solution is group 1 of ``match_format`` applied to the completion
    text. Scores per completion:

    - no format match: 0
    - exact string match: 3.0
    - match after stripping whitespace on both sides: 1.5
    - numeric ratio guess/reference within [0.9, 1.1]: 0.5
    - numeric ratio within [0.8, 1.2]: 0.25
    - any other numeric ratio: -1.0
    - guess or reference not parseable as a float, or reference equal to
      zero: -0.5

    Args:
        prompts (list): Conversation prompts given to the model. Not used by
            this function; kept in the signature for callers that pass it.
        completions (list): Generated samples; the text is read from
            ``completion[0]["content"]``.
        answer (list): Reference answers (strings), paired with completions
            by position.
        **kwargs: Accepted and ignored.

    Returns:
        list: One score per (completion, reference) pair.
    """
    responses = [completion[0]["content"] for completion in completions]

    extracted_responses = [
        found.group(1) if (found := match_format.search(text)) is not None else None
        for text in responses
    ]

    scores = []
    for guess, true_answer in zip(extracted_responses, answer):
        if guess is None:
            scores.append(0)
            continue
        if guess == true_answer:
            # Identical strings: full reward.
            score = 3.0
        elif guess.strip() == true_answer.strip():
            # Correct apart from surrounding whitespace.
            score = 1.5
        else:
            # Partial credit when the guess is numerically close to the reference.
            try:
                ratio = float(guess) / float(true_answer)
            except (ValueError, ZeroDivisionError):
                # Only the failures float()/division can produce are handled here,
                # so KeyboardInterrupt and similar still stop the run.
                score = -0.5
            else:
                if 0.9 <= ratio <= 1.1:
                    score = 0.5
                elif 0.8 <= ratio <= 1.2:
                    score = 0.25
                else:
                    score = -1.0 # Penalize wrong answers
        scores.append(score)
    return scores

In [12]:
# For math problems, pull out the first number that follows the opening
# solution tag. The capture needs at least one digit ("72", "0.34", ".5"):
# a plain character class such as [\d\.]+ would also capture a lone "."
# (as in "Approx. 72"), which cannot be converted with float().
# The tag goes through re.escape so it is always matched literally.
match_numbers = re.compile(
    rf"{re.escape(solution_start)}.*?(\d+(?:\.\d+)?|\.\d+)",
    flags = re.MULTILINE | re.DOTALL
)
match_numbers.findall("<SOLUTION>  0.34  </SOLUTION>")

['0.34']

In [13]:
def check_numbers(prompts, completions, answer, **kwargs):
    """Score completions by numeric equality of the extracted number and the reference.

    The number is group 1 of ``match_numbers`` applied to the completion text.
    Both it and the reference are converted with ``float``; equal values earn
    1.5, anything else 0 / 0.0. The first sample of the batch is printed as a
    progress log.

    Args:
        prompts (list): Conversation prompts; the content of the last message
            of the first prompt is printed as the question.
        completions (list): Generated samples; the text is read from
            ``completion[0]["content"]``.
        answer (list): Reference answers, paired with completions by position.
        **kwargs: Accepted and ignored.

    Returns:
        list: One score per (completion, reference) pair.
    """
    question = prompts[0][-1]["content"]
    responses = [completion[0]["content"] for completion in completions]

    extracted_responses = [
        found.group(1) if (found := match_numbers.search(text)) is not None else None
        for text in responses
    ]

    # Log the first sample so the reward trend can be eyeballed during training.
    print('*'*20, f"Question:\n{question}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")

    scores = []
    for guess, true_answer in zip(extracted_responses, answer):
        if guess is None:
            scores.append(0)
            continue
        # Convert to numbers
        try:
            expected_value = float(true_answer.strip())
            guess_value    = float(guess.strip())
        except (ValueError, TypeError, AttributeError):
            # Best effort: unparseable text or a non-string reference scores 0,
            # while KeyboardInterrupt and similar are no longer swallowed.
            scores.append(0)
        else:
            scores.append(1.5 if guess_value == expected_value else 0.0)
    return scores

### 训练部分

In [None]:
# Prompt budget; the completion budget below is max_seq_length minus this value.
max_prompt_length = 256

# Build the GRPO training configuration.
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    beta = 0.0, # Set to 0 to disable the KL divergence penalty # defaults to 0.04
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_torch_fused",
    logging_steps = 1,
    # The Unsloth log printed by this cell says the batch size of 1 is raised to num_generations (4).
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 4, # Decrease if out of memory
    max_prompt_length = max_prompt_length,
    max_completion_length = max_seq_length - max_prompt_length,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 500, # Number of training steps
    save_steps = 100, # Save a checkpoint every 100 steps
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs_gemma3_1b_it", # Output directory
)

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 4


开始训练。期望在训练中，看到reward列的数值增长！

有可能在开始的100步都没有奖励，你可能需要等待150-200步。

In [None]:
# Create the trainer from the model, tokenizer, reward functions, config and
# dataset defined in the cells above, then start training.
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    # The training log reports one reward column per function listed here.
    reward_funcs = [
        match_format_exactly,
        match_format_approximately,
        check_answer,
        check_numbers,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1 | Total steps = 500
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 6,522,880/1,006,408,832 (0.65% trained)


******************** Question:
A concert ticket costs $40. Mr. Benson bought 12 tickets and received a 5% discount for every ticket bought that exceeds 10. How much did Mr. Benson pay in all? 
Answer:
476 
Response:
<start_working_out>
Let $C$ be the cost of a concert ticket, which is $C = 40$.
Mr. Benson bought 12 tickets.
The number of tickets that are bought that exceed 10 is $12 - 10 = 2$.
For each ticket that exceeds 10, he receives a 5% discount.
The discount per ticket is $0.05 \times 40 = 2$.
So, the price of each ticket that exceeds 10 is $40 - 2 = 38$.
The total cost of 2 tickets at the discounted price is $2 \times 38 = 76$.
The total cost of 10 tickets at the regular price is $10 \times 40 = 400$.
The total cost of 12 tickets is $76 + 400 = 476$.
However, we need to consider the discount for each ticket that exceeds 10.
Tickets 11-12 are discounted by 5%.
So, the discount for the 2 tickets that exceed 10 is $2 \times 0.05 = 0.1$.
The price of each ticket that exceeds 10 is 

Step,Training Loss,reward,reward_std,completion_length,kl,rewards / match_format_exactly,rewards / match_format_approximately,rewards / check_answer,rewards / check_numbers
1,-0.0,-0.5,0.57735,910.75,0.0,0.0,-0.5,0.0,0.0
2,0.0,-0.125,1.181454,859.25,0.0,0.0,-0.5,0.0,0.375
3,0.0,-0.25,0.5,497.25,0.000518,0.0,-0.25,0.0,0.0
4,0.0,1.125,0.75,293.5,0.000493,0.0,0.0,0.0,1.125
5,0.0001,-0.25,0.5,127.5,0.001941,0.0,-0.25,0.0,0.0
6,0.0,-0.125,1.75,872.75,0.000258,0.0,-0.5,0.0,0.375
7,0.0,1.5,0.0,419.5,0.000771,0.0,0.0,0.0,1.5
8,0.0,-1.0,0.0,1180.25,0.00043,0.0,-1.0,0.0,0.0
9,0.0,-0.75,0.5,1211.75,0.001115,0.0,-0.75,0.0,0.0
10,0.0,-0.5,0.57735,691.25,0.000939,0.0,-0.5,0.0,0.0


Unsloth: Will smartly offload gradients to save VRAM!
******************** Question:
Jane is trying to decide whether to buy a house or a trailer. A house costs $480,000 and a trailer costs $120,000. Each loan will be paid in monthly installments over 20 years. How much more is the monthly payment on the house compared to the trailer? 
Answer:
1500 
Response:
<start_working_out>
Let's calculate the monthly payment for the house.
The loan amount is $480,000.
The interest rate is 6%.
The loan term is 20 years, which is 20 * 12 = 240 months.
We will use the formula for the monthly payment of a loan: M = P [ i(1 + i)^n ] / [ (1 + i)^n – 1]
Where:
M = Monthly payment
P = Principal loan amount ($480,000)
i = Monthly interest rate (annual rate / 12 = 0.06 / 12 = 0.005)
n = Number of months (240)
Plugging in the values:
M = 480000 [ 0.005(1 + 0.005)^240 ] / [ (1 + 0.005)^240 – 1]
M = 480000 [ 0.005(1.005)^240 ] / [ (1.005)^240 – 1]
(1.005)^240 ≈ 3.26179
M = 480000 [ 0.005 * 3.26179 ] / [ 3.261

### 模型测试
#### 默认模型测试

In [None]:
# Quick generation test: same system prompt as training, one ad-hoc question.
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user",   "content": "What is the sqrt of 101?"},
]

# Render the chat into a single prompt string (not token ids).
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
    tokenize = False,
)
from transformers import TextStreamer
# The return value is discarded; output is streamed by the TextStreamer.
# NOTE(review): the device is hard-coded to "cuda".
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

#### 保存 Lora

In [None]:
# Save the model and tokenizer to the local "gemma-3" directory.
# NOTE(review): presumably this writes only the LoRA adapter weights for the PEFT-wrapped model; confirm.
model.save_pretrained("gemma-3")  # Local saving
tokenizer.save_pretrained("gemma-3")
# model.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving
# tokenizer.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving

In [None]:
# Disabled by default: save the merged fine-tuned model locally to "gemma-3-finetune".
if False: # Change to True to save finetune!
    model.save_pretrained_merged("gemma-3-finetune", tokenizer)

#### 上传合并后的模型

In [None]:
# Disabled by default: upload the merged fine-tuned model to the Hugging Face Hub.
# "HF_ACCOUNT" and "hf_..." are placeholders; do not commit a real token here.
if False: # Change to True to upload finetune
    model.push_to_hub_merged(
        "HF_ACCOUNT/gemma-3-finetune", tokenizer,
        token = "hf_..."
    )

### 导出为 GGUF 格式

In [None]:
# if False: # Change to True to save to GGUF
#     model.save_pretrained_gguf(
#         "gemma-3-finetune",
#         quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
#     )

In [None]:
# if False: # Change to True to upload GGUF
#     model.push_to_hub_gguf(
#         "gemma-3-finetune",
#         quantization_type = "Q8_0", # Only Q8_0, BF16, F16 supported
#         repo_id = "HF_ACCOUNT/gemma-finetune-gguf",
#         token = "hf_...",
#     )