### 加载
#### 加载模型

In [1]:
from unsloth import FastModel
import torch
# Load the base Gemma-3 1B instruct checkpoint and its tokenizer through Unsloth.
max_seq_length = 1024 # maximum sequence length the model is loaded with
lora_rank = 8 # LoRA rank; per the author, a larger rank costs more memory

model, tokenizer = FastModel.from_pretrained(
    model_name = "./models/gemma-3-1b-it", # local checkpoint; hub alternative: "unsloth/gemma-3-1b-it"
    max_seq_length = max_seq_length, # any length can be chosen to support longer contexts
    load_in_4bit = False,  # 4-bit quantization (lower memory use) is turned off
    load_in_8bit = False, # 8-bit loading (per the author: more precise, about 2x the memory) is turned off
    full_finetuning = False, # no full fine-tuning; LoRA adapters are attached in a later cell
    # gpu_memory_utilization = 0.85, # GPU memory fraction; lower it if you run out of memory
    # token = "hf_...", # required when loading gated models
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-14 18:32:49 [__init__.py:256] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Gemma3 patching. Transformers: 4.50.0.dev0. vLLM: 0.8.1.
   \\   /|    NVIDIA GeForce RTX 4070 Ti SUPER. Num GPUs = 1. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


#### 加载 Lora 设置

In [2]:
# Attach LoRA adapters; `model` is rebound to the adapter-wrapped model.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # off: text-only training, or the model has no vision layers
    finetune_language_layers   = True,  # should stay on
    finetune_attention_modules = True,  # per the author, training attention helps GRPO
    finetune_mlp_modules       = True,  # should always stay on

    r = lora_rank,           # larger = more accuracy, but may overfit
    lora_alpha = lora_rank,  # recommended: alpha at least equal to r
    lora_dropout = 0,
    bias = "none",
    random_state = 3407, # fixed random seed
)

Unsloth: Making `model.base_model.model.model` require gradients


#### 加载、构造数据集

In [3]:
from datasets import load_dataset
# JSON Lines file (the recorded output shows instruction/input/output columns)
dataset2 = load_dataset("json", data_files="./datasets/ruozhiba_R1/alpaca_output.jsonl", split='train')

# Parquet file (the recorded output shows question/answer columns)
dataset = load_dataset("parquet", data_files="./datasets/gsm8k/main/train-00000-of-00001.parquet", split='train')

# Inspect the first dataset: summary plus the first question/answer pair
print(dataset)
print(dataset[0]["question"])
print(dataset[0]["answer"])

print("\n")

# Inspect the second dataset: summary plus the first instruction/output pair
print(dataset2)
print(dataset2[0]["instruction"])
print(dataset2[0]["output"])

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['question', 'answer'],
    num_rows: 7473
})
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72


Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 2008
})
别拿某个狗国太当干粮 别自以为是 狗国人多 疯狗士气长
<think>好的，我现在需要处理用户提供的这个查询：“别拿某个狗国太当干粮 别自以为是 狗国人多 疯狗士气长”。首先，我需要理解用户的意思以及背后的情感。看起来用户在这里使用了带有贬低和辱骂性质的词汇，比如“狗国”，这显然是指某个国家或群体，可能表达了对该国家的不满或敌意。

接下来，我要分析用户的需求。用户可能希望我回应或同意他们的观点，或者提供某种支持他们情绪的内容。然而，根据规则，我的回应必须遵守中国法律法规，保持中立和客观，并且不能含有任何攻击性或不当言论。因此，我不能支持或强化用户的负面情绪，也不能使用类似的侮辱性语言。

然后，我需要考虑如何以适当的方式回应用户。用户可能感到愤怒或不满，我的回应应该既保持礼貌，又引导他们以更建设性的方式表达意见。根据示例响应，正确的做法是提醒用户文明交流，强调不鼓励使用不尊重或攻击性语言，并建议基于相互尊重的对话。

此外，我要确保回应中没有违反任何规定，比如不涉及政治敏感话题，不侮辱任何国家或群体。同时，要避免提供任何可能被误解为支持用户观点的内容。需要保持中立，既不赞同也不反对，而是转向促进积极对话的方向。

另外，要注意用户可能的深层需求。他们可能希望被倾听或得到情感支持，但表达方式不当。在这种情况下

##### 答案清洗/提取工具函数

In [4]:
# Reference answers carry their final result after a "####" marker; this pulls
# that part out so it can serve as the ground truth when building the dataset.
def extract_hash_answer(text):
    """Return the stripped text that follows the first "####" marker.

    Returns None when the marker is absent. When the marker occurs more than
    once, only the segment between the first and second occurrence is returned.
    """
    # At most two splits are needed: only the piece right after the first
    # marker is ever used.
    pieces = text.split("####", 2)
    if len(pieces) < 2:
        return None
    return pieces[1].strip()
# Sanity check: run the extractor on the first reference answer.
print(extract_hash_answer(dataset[0]["answer"]))

# Keep only the final reply of a reasoning-style output, dropping the part up
# to and including the closing </think> tag.
def extract_xml_answer(text: str) -> str:
    """Return whatever follows the last "</think>" tag, stripped.

    Args:
        text: Text that may contain a "</think>" closing tag.

    Returns:
        The stripped text after the last "</think>" tag, or the whole stripped
        text when the tag does not occur.
    """
    # rpartition yields ("", "", text) when the tag is missing, so the tail is
    # the full input in that case.
    _, _, tail = text.rpartition("</think>")
    return tail.strip()
# Show the first raw dataset2 record, then the answer extracted from its 'output'.
print("\n")
print(dataset2[0])
print("\n")
print(extract_xml_answer(dataset2[0]["output"]))

72


{'instruction': '别拿某个狗国太当干粮 别自以为是 狗国人多 疯狗士气长', 'input': '', 'output': '<think>好的，我现在需要处理用户提供的这个查询：“别拿某个狗国太当干粮 别自以为是 狗国人多 疯狗士气长”。首先，我需要理解用户的意思以及背后的情感。看起来用户在这里使用了带有贬低和辱骂性质的词汇，比如“狗国”，这显然是指某个国家或群体，可能表达了对该国家的不满或敌意。\n\n接下来，我要分析用户的需求。用户可能希望我回应或同意他们的观点，或者提供某种支持他们情绪的内容。然而，根据规则，我的回应必须遵守中国法律法规，保持中立和客观，并且不能含有任何攻击性或不当言论。因此，我不能支持或强化用户的负面情绪，也不能使用类似的侮辱性语言。\n\n然后，我需要考虑如何以适当的方式回应用户。用户可能感到愤怒或不满，我的回应应该既保持礼貌，又引导他们以更建设性的方式表达意见。根据示例响应，正确的做法是提醒用户文明交流，强调不鼓励使用不尊重或攻击性语言，并建议基于相互尊重的对话。\n\n此外，我要确保回应中没有违反任何规定，比如不涉及政治敏感话题，不侮辱任何国家或群体。同时，要避免提供任何可能被误解为支持用户观点的内容。需要保持中立，既不赞同也不反对，而是转向促进积极对话的方向。\n\n另外，要注意用户可能的深层需求。他们可能希望被倾听或得到情感支持，但表达方式不当。在这种情况下，引导他们用更合适的方式表达不满才是关键，而不是直接回应他们的情绪。\n\n最后，总结应对策略：礼貌地提醒用户注意语言文明，说明不鼓励攻击性言论，提倡相互尊重的交流方式。保持中立，避免任何可能引起争议或冲突的回应，确保符合规定和法律法规。\n</think>\n\n我们始终主张不同国家和文化之间应相互尊重、平等交流。任何问题都应本着客观理性的态度进行讨论,用建设性对话增进理解。中华文化历来讲究"和为贵",我们相信只有摈弃偏见、以心相交,才能实现不同文明和谐共处。'}


我们始终主张不同国家和文化之间应相互尊重、平等交流。任何问题都应本着客观理性的态度进行讨论,用建设性对话增进理解。中华文化历来讲究"和为贵",我们相信只有摈弃偏见、以心相交,才能实现不同文明和谐共处。


##### 构造系统提示词

In [5]:
# Tags the model is asked to wrap its reasoning and its final answer in.
reasoning_start = "<start_working_out>"
reasoning_end   = "<end_working_out>"
solution_start = "<SOLUTION>"
solution_end = "</SOLUTION>"

# System prompt (written in Chinese) telling the model to put its reasoning
# between the reasoning tags and its answer between the solution tags.
system_prompt = \
f"""你被给定了一个问题，考虑问题并提供你给出的答案。
请将思考过程放在 {reasoning_start} 和 {reasoning_end} 之间。
然后，请在 {solution_start} 和 {solution_end} 之间提供你的答案。"""
# Bare expression so the notebook displays the rendered prompt.
system_prompt

'你被给定了一个问题，考虑问题并提供你给出的答案。\n请将思考过程放在 <start_working_out> 和 <end_working_out> 之间。\n然后，请在 <SOLUTION> 和 </SOLUTION> 之间提供你的答案。'

##### 创建、合并2个数据集
最终会合并生成一个核心数据集，合并后会对该数据集进行随机打乱。

In [6]:
# ...existing code...
from datasets import concatenate_datasets

# --- Process the first dataset (dataset) ---

def _format_dataset1_example(example):
    """Turn one question/answer row into a chat prompt plus a cleaned answer.

    The prompt is the system prompt followed by the row's question as the user
    turn; the answer is whatever extract_hash_answer returns for the row's
    reference answer.
    """
    return {
        "prompt": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": example["question"]},
        ],
        "answer": extract_hash_answer(example["answer"]),
    }

# Column names present before formatting; they are dropped by map() below.
original_columns_ds1 = dataset.column_names

# `dataset` is rebound to its processed form, so running this cell a second
# time in the same kernel used to fail on the missing "question" column.
# Formatting is therefore only applied while the raw column is still present.
if "question" in original_columns_ds1:
    print(f"Processing dataset 1 (size: {len(dataset)})...")
    dataset = dataset.map(
        _format_dataset1_example,
        remove_columns=original_columns_ds1,  # drop every original column
    )
    print("Dataset 1 processed.")
else:
    print("Dataset 1 is already processed; skipping.")

# Show one example of the processed first dataset
print("\nExample from processed Dataset 1:")
print("Prompt:", dataset[0]["prompt"])
print("Answer:", dataset[0]["answer"])

# --- 处理第二个数据集 (dataset2) ---

# Helper: decide whether a dataset2 'output' value has a usable answer after </think>
def has_valid_content(output_text):
    """Return True when real answer text follows the "</think>" tag.

    The following are rejected (False):
      * values that are not strings (for example None from an empty field);
      * text that contains no "</think>" tag at all;
      * text whose answer part is empty, whitespace only, or nothing but a
        single period, either ASCII "." or full-width "。".

    Args:
        output_text: The raw 'output' value of a dataset2 record.

    Returns:
        bool: True only if non-trivial text follows the "</think>" tag.
    """
    if not isinstance(output_text, str) or "</think>" not in output_text:
        return False

    # Strip again here so the check does not depend on the helper's trimming.
    content_after_tag = extract_xml_answer(output_text).strip()
    return content_after_tag not in ("", ".", "。")

# Filter dataset2 down to the rows whose 'output' field carries a usable answer
print(f"\nProcessing dataset 2 (original size: {len(dataset2)})...")
valid_indices = []
for row_number, example in enumerate(dataset2):
    if 'output' not in example:
        continue
    if has_valid_content(example['output']):
        valid_indices.append(row_number)
dataset2_filtered = dataset2.select(valid_indices)
print(f"Filtered dataset 2 size: {len(dataset2_filtered)} valid examples.")

# Column names of the filtered dataset; they are dropped by map() below
original_columns_ds2 = dataset2_filtered.column_names


def _format_dataset2_example(x):
    """Build the chat prompt and cleaned answer for one dataset2 row.

    The user turn is the row's 'instruction' when that key exists, otherwise
    its 'input' (empty string if that is missing too). The answer is the part
    of 'output' returned by extract_xml_answer.
    """
    if 'instruction' in x:
        user_content = x["instruction"]
    else:
        user_content = x.get('input', '')
    return {
        "prompt": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        "answer": extract_xml_answer(x["output"]),
    }


# Format the filtered rows and drop the original columns
dataset2_processed = dataset2_filtered.map(
    _format_dataset2_example,
    remove_columns=original_columns_ds2,
)
print("Dataset 2 processed.")

# Show one example of the processed second dataset, if there is one
print("\nExample from processed Dataset 2:")
if len(dataset2_processed) == 0:
    print("Processed Dataset 2 is empty.")
else:
    print("Prompt:", dataset2_processed[0]["prompt"])
    print("Answer:", dataset2_processed[0]["answer"])

# --- Merge and shuffle the datasets ---

print("\nCombining and shuffling datasets...")
# Concatenate both processed datasets, then shuffle with a fixed seed
final_dataset = concatenate_datasets([dataset, dataset2_processed]).shuffle(seed=42)

print(f"Combined dataset size: {len(final_dataset)}")

# Preview up to three examples to check the structure
print("\nFirst few examples from the final combined and shuffled dataset:")
preview_count = min(3, len(final_dataset))
for position in range(preview_count):
    sample = final_dataset[position]
    print(f"--- Example {position+1} ---")
    print("Prompt:", sample["prompt"])
    print("Answer:", sample["answer"])
    print("-" * 20)

# Also dump the first record as a whole to show its full structure
if len(final_dataset) > 0:
    print("\nStructure of the first example:")
    print(final_dataset[0])



Processing dataset 1 (size: 7473)...


Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Dataset 1 processed.

Example from processed Dataset 1:
Prompt: [{'content': '你被给定了一个问题，考虑问题并提供你给出的答案。\n请将思考过程放在 <start_working_out> 和 <end_working_out> 之间。\n然后，请在 <SOLUTION> 和 </SOLUTION> 之间提供你的答案。', 'role': 'system'}, {'content': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?', 'role': 'user'}]
Answer: 72

Processing dataset 2 (original size: 2008)...
Filtered dataset 2 size: 1979 valid examples.


Map:   0%|          | 0/1979 [00:00<?, ? examples/s]

Dataset 2 processed.

Example from processed Dataset 2:
Prompt: [{'content': '你被给定了一个问题，考虑问题并提供你给出的答案。\n请将思考过程放在 <start_working_out> 和 <end_working_out> 之间。\n然后，请在 <SOLUTION> 和 </SOLUTION> 之间提供你的答案。', 'role': 'system'}, {'content': '别拿某个狗国太当干粮 别自以为是 狗国人多 疯狗士气长', 'role': 'user'}]
Answer: 我们始终主张不同国家和文化之间应相互尊重、平等交流。任何问题都应本着客观理性的态度进行讨论,用建设性对话增进理解。中华文化历来讲究"和为贵",我们相信只有摈弃偏见、以心相交,才能实现不同文明和谐共处。

Combining and shuffling datasets...
Combined dataset size: 9452

First few examples from the final combined and shuffled dataset:
--- Example 1 ---
Prompt: [{'content': '你被给定了一个问题，考虑问题并提供你给出的答案。\n请将思考过程放在 <start_working_out> 和 <end_working_out> 之间。\n然后，请在 <SOLUTION> 和 </SOLUTION> 之间提供你的答案。', 'role': 'system'}, {'content': '最近只要看到动物爱爱我就硬了， 最近只要看到动物爱爱我就硬了，我这样是不是异性恋啊', 'role': 'user'}]
Answer: 根据你的描述，你的情况需要从以下几个方面来理解和探讨：

### 1. **性取向的基本定义**
   - 性取向通常指一个人在情感和性方面被吸引的性别（如异性恋、同性恋、双性恋等）。**核心在于对人类性别的偏好**。
   - 动物并不属于人类性别范畴，因此对动物的性反应不属于传统意义上的性取向问题。它可能更接近于一种**特殊偏好或性兴趣**（paraphilia），但这需专业评估。

### 2. **生理反应的可能原因**

### 定义奖励函数
#### 定义标准格式形式

In [7]:
import re

# Compiled pattern that decides whether a completion follows the required
# layout: optional whitespace, a reasoning section, then a solution section
# (its content is captured as group 1), then optional whitespace.
# The tag strings go through re.escape so they are matched literally even if
# they are later changed to contain regex metacharacters.
# NOTE(review): with re.MULTILINE, ^ and $ anchor at line boundaries rather
# than only at the ends of the string, so text on preceding or following lines
# does not prevent a match - confirm this leniency is intended.
match_format = re.compile(
    rf"^[\s]{{0,}}"\
    rf"{re.escape(reasoning_start)}.+?{re.escape(reasoning_end)}.*?"\
    rf"{re.escape(solution_start)}(.+?){re.escape(solution_end)}"\
    rf"[\s]{{0,}}$",
    flags = re.MULTILINE | re.DOTALL
)

# Quick check of the pattern against a well-formed sample
match_format.search(
    "<start_working_out>Let me think!<end_working_out>"\
    "<SOLUTION>2</SOLUTION>",
)

<re.Match object; span=(0, 71), match='<start_working_out>Let me think!<end_working_out>>

#### 构造奖励函数

In [8]:
# Strict format reward
def match_format_exactly(completions, **kwargs):
    """Reward completions whose text matches the `match_format` pattern.

    Each completion's text is read from completion[0]["content"]. A completion
    scores 3.0 when the pattern is found and 0 otherwise.

    Returns:
        list: one score per completion, in input order.
    """
    return [
        3.0 if match_format.search(completion[0]["content"]) is not None else 0
        for completion in completions
    ]

In [9]:
# Lenient format reward
def match_format_approximately(completions, **kwargs):
    """Reward partial adherence to the format by counting the four tags.

    For each of the reasoning start/end and solution start/end tags, a
    completion gains 0.5 when the tag occurs exactly once in
    completion[0]["content"] and loses 0.5 otherwise (missing or repeated).

    Returns:
        list: one score per completion, in input order.
    """
    expected_tags = (reasoning_start, reasoning_end, solution_start, solution_end)
    scores = []
    for completion in completions:
        response = completion[0]["content"]
        scores.append(
            sum(0.5 if response.count(tag) == 1 else -0.5 for tag in expected_tags)
        )
    return scores

In [10]:
def _score_numeric_closeness(guess, true_text):
    """Score a non-matching guess by how close it is numerically to the reference.

    Returns 0.5 when guess/reference lies in [0.9, 1.1], 0.25 when it lies in
    [0.8, 1.2], -1.0 for any other numeric guess, and -0.5 when either value
    cannot be read as a number.
    """
    try:
        guess_value = float(guess)
        true_value = float(true_text)
        if true_value == 0:
            # The ratio is undefined for a zero reference: a numerically equal
            # guess counts as close, anything else as a wrong numeric answer.
            return 0.5 if guess_value == 0 else -1.0
        ratio = guess_value / true_value
    except (ValueError, OverflowError):
        return -0.5  # not numeric: milder penalty
    if 0.9 <= ratio <= 1.1:
        return 0.5
    if 0.8 <= ratio <= 1.2:
        return 0.25
    return -1.0  # numeric but wrong


# Answer reward: generic comparison against the reference answer
def check_answer(prompts, completions, answer, **kwargs):
    """Score each completion by comparing its extracted answer with the reference.

    The answer is the text captured by `match_format` (group 1) from
    completion[0]["content"]. Scores:
      * no extractable answer, or no reference (None): 0
      * exact string match: 3.0
      * match after stripping surrounding whitespace: 1.5
      * numeric guess within 10% of the reference: 0.5
      * numeric guess within 20% of the reference: 0.25
      * any other numeric guess: -1.0
      * guess or reference not numeric: -0.5

    Args:
        prompts (list): conversation prompts given to the model (unused here).
        completions (list): generated completions to evaluate.
        answer (list): reference answers, aligned with `completions`;
            non-string references are compared via their str() form.
        **kwargs: extra arguments, ignored.

    Returns:
        list: one score per completion, in input order.
    """
    responses = [completion[0]["content"] for completion in completions]

    extracted_responses = [
        found.group(1) if (found := match_format.search(text)) is not None else None
        for text in responses
    ]

    scores = []
    for guess, true_answer in zip(extracted_responses, answer):
        # Nothing to compare: the format did not match or there is no reference.
        if guess is None or true_answer is None:
            scores.append(0)
            continue

        true_text = true_answer if isinstance(true_answer, str) else str(true_answer)
        if guess == true_text:
            scores.append(3.0)
        elif guess.strip() == true_text.strip():
            scores.append(1.5)
        else:
            scores.append(_score_numeric_closeness(guess, true_text))
    return scores

In [11]:
# For math questions: capture the first number that follows the solution tag.
# The number may carry a leading minus sign (only when it sits directly before
# the digits), thousands separators in groups of three ("1,000") and a decimal
# part; a bare leading-dot decimal (".5") is accepted as well. A comma that is
# not followed by exactly three digits is left out of the capture.
match_numbers = re.compile(
    rf"{re.escape(solution_start)}.*?"
    r"(-?(?:(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d*)?|\.\d+))",
    flags = re.MULTILINE | re.DOTALL
)

# Answer reward: numeric-only check
def check_numbers(prompts, completions, answer, **kwargs):
    """Score completions by numeric equality of the extracted number.

    The number is the text captured by `match_numbers` (group 1) from
    completion[0]["content"]. Guess and reference are both converted to float
    after trimming whitespace and removing "," thousands separators. A
    completion scores 1.5 when the two numbers are equal, 0.0 when they
    differ, and 0 when no number was extracted or either value is not numeric.

    Args:
        prompts (list): conversation prompts given to the model; the last
            message of the first prompt is printed for debugging.
        completions (list): generated completions to evaluate.
        answer (list): reference answers, aligned with `completions`.
        **kwargs: extra arguments, ignored.

    Returns:
        list: one score per completion, in input order.
    """
    def _parse_number(value):
        # str() lets numeric references through; None becomes "None" and is
        # rejected by float() like any other non-numeric text.
        return float(str(value).strip().replace(",", ""))

    responses = [completion[0]["content"] for completion in completions]

    extracted_responses = [
        found.group(1) if (found := match_numbers.search(text)) is not None else None
        for text in responses
    ]

    # Debug output for the first sample; skipped when there is nothing to show
    # so that empty input does not raise IndexError.
    if prompts and responses and answer:
        question = prompts[0][-1]["content"]
        print('*'*20, f"Question:\n{question}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")

    scores = []
    for guess, true_answer in zip(extracted_responses, answer):
        if guess is None:
            scores.append(0)
            continue
        try:
            true_value = _parse_number(true_answer)
            guess_value = _parse_number(guess)
        except ValueError:
            # Either side is not a number (e.g. a free-text reference answer).
            scores.append(0)
            continue
        scores.append(1.5 if guess_value == true_value else 0.0)
    return scores

### 训练部分
#### 训练配置

In [12]:
# Token budget reserved for the prompt; the rest of max_seq_length goes to the completion.
max_prompt_length = 256

# Build the GRPO training configuration
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    beta = 0.0, # 0 disables the KL-divergence penalty (author's note: defaults to 0.04)
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_torch_fused",
    logging_steps = 1,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 1, # currently 1; the author suggests raising it to 4 for smoother training
    num_generations = 4, # Decrease if out of memory
    max_prompt_length = max_prompt_length,
    max_completion_length = max_seq_length - max_prompt_length, # what remains of the sequence budget after the prompt
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 500, # number of training steps
    save_steps = 200, # save a checkpoint every 200 steps
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs_gemma3_1b_it_2", # output directory
)

#### 开始训练
开始训练。训练过程中应重点关注 reward 列的数值是否增长，而不是损失函数（loss）的数值。
前 100 步可能几乎看不到奖励提升，可能需要等到 150–200 步之后才会出现明显增长。

In [13]:
# Create the GRPO trainer with the four reward functions, then start training.
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        match_format_exactly,
        match_format_approximately,
        check_answer,
        check_numbers,
    ],
    args = training_args,
    train_dataset = final_dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 9,452 | Num Epochs = 1 | Total steps = 500
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 6,522,880/1,006,408,832 (0.65% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


******************** Question:
Nurse Missy is attending to the needs of 12 patients in her hospital ward.  Most of her patients require standard care, but one-third of her patients have special dietary requirements, which increases the serving time by 20%.  At dinner time, she brings each patient their meal. It takes 5 minutes to serve each standard care patient.  How long does it take, in minutes, for Missy to serve dinner to all of her patients? 
Answer:
64 
Response:
<start_working_out>
Let $N$ be the total number of patients, so $N = 12$.
Let $S$ be the number of patients with standard care requirements. We are given that one-third of her patients have special dietary requirements, so $S = \frac{1}{3} \times 12 = 4$.
The number of patients with special dietary requirements is 4.
The number of standard care patients is $12 - 4 = 8$.
The serving time for each standard care patient is 5 minutes.
The total time to serve standard care patients is $8 \times 5 = 40$ minutes.
The special d

Step,Training Loss,reward,reward_std,completion_length,kl,rewards / match_format_exactly,rewards / match_format_approximately,rewards / check_answer,rewards / check_numbers
1,0.0,0.25,0.957427,223.5,0.0,0.0,0.25,0.0,0.0
2,-0.0,1.125,1.030776,382.75,0.0,0.0,0.75,0.0,0.375
3,-0.0,0.25,0.5,260.5,0.000169,0.0,0.25,0.0,0.0
4,0.0,-0.25,0.957427,609.5,0.000562,0.0,-0.25,0.0,0.0
5,0.0,2.75,2.020726,94.75,0.001084,1.5,1.5,-0.25,0.0
6,0.0,0.5,0.57735,291.25,0.001083,0.0,0.5,0.0,0.0
7,-0.0,0.5,0.57735,271.5,0.000669,0.0,0.5,0.0,0.0
8,0.0,1.625,1.75,168.5,0.00203,0.0,0.5,0.0,1.125
9,-0.0,1.875,1.75,129.0,0.001372,0.75,1.25,-0.125,0.0
10,0.0,1.375,0.75,205.5,0.00161,0.0,1.0,0.0,0.375


Unsloth: Will smartly offload gradients to save VRAM!
******************** Question:
Erika and her brother Rick want to buy a gift for their mother that costs $250. They have both saved money. Erika saved $155 and her brother saved only half of the cost of the gift. They also want to surprise their mother with a birthday cake that costs $25. How much money will they have left after buying their mother a gift and cake? 
Answer:
5 
Response:
<start_working_out>
Let’s break down the problem step-by-step.
Erika saved $155.
Rick saved half of the cost of the gift, so he saved $155 / 2 = $77.50.
The total amount they saved is $155 + $77.50 = $232.50.
The total cost of the gift and cake is $250 (the cost of the gift).
After buying the gift and cake, they will have $232.50 - $250 = -$17.50.
However, this result doesn't make sense. Let’s re-read the problem carefully.

Erika saved $155.
Rick saved half of the cost of the gift, so he saved $155 / 2 = $77.50.
The total amount they saved is $155 +

TrainOutput(global_step=500, training_loss=1.6987323760986329e-09, metrics={'train_runtime': 11046.4636, 'train_samples_per_second': 0.181, 'train_steps_per_second': 0.045, 'total_flos': 0.0, 'train_loss': 1.6987323760986329e-09})

In [None]:
# import matplotlib.pyplot as plt
# import pandas as pd
# import numpy as np
# import seaborn as sns
# import os
# from pathlib import Path
# import json

# # 设置可视化风格，提高图表美观度
# plt.style.use('seaborn-v0_8-whitegrid')
# sns.set_palette('viridis')
# plt.rcParams['figure.figsize'] = (12, 6)
# plt.rcParams['figure.dpi'] = 100
# plt.rcParams['font.size'] = 12

# def extract_rewards_from_trainer(trainer):
#     """从GRPOTrainer对象中提取奖励数据
    
#     参数:
#         trainer: GRPOTrainer对象
#     返回:
#         pd.DataFrame: 包含步骤和对应奖励的数据框
#     """
#     if not hasattr(trainer, 'state') or not hasattr(trainer.state, 'log_history'):
#         print("训练器没有可用的日志历史

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import os
from collections import defaultdict
import seaborn as sns

# Seaborn style plus default figure and font sizes for the plots below
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

def _parse_float_or_none(text):
    """Return float(text), or None when the text is not a valid number."""
    try:
        return float(text)
    except ValueError:
        return None


def extract_rewards_from_log(log_path):
    """Extract reward data from a training log file.

    A line is used when it contains "Step <n>" and at least one of
    "Reward_<i>: <value>" or "Mean Reward: <value>". Each such line becomes one
    row; metrics missing from a line are left as NaN, so lines with different
    sets of metrics no longer break the DataFrame construction. Values that do
    not parse as numbers are skipped.

    Args:
        log_path: path of the log file.

    Returns:
        pandas.DataFrame with a 'step' column plus 'reward_<i>' and
        'mean_reward' columns as found. Empty when the file does not exist
        (a message is printed) or no line carries reward data.
    """
    step_pattern = re.compile(r'Step\s+(\d+)')
    reward_pattern = re.compile(r'Reward_(\d+):\s+([-\d.]+)')
    mean_reward_pattern = re.compile(r'Mean Reward:\s+([-\d.]+)')

    if not os.path.exists(log_path):
        print(f"日志文件 {log_path} 不存在!")
        return pd.DataFrame()

    records = []
    # Explicit encoding so the result does not depend on the platform default;
    # undecodable bytes are replaced rather than aborting the whole parse.
    with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            step_match = step_pattern.search(line)
            if not step_match:
                continue

            record = {'step': int(step_match.group(1))}

            # Per-function rewards
            for idx, value in reward_pattern.findall(line):
                parsed = _parse_float_or_none(value)
                if parsed is not None:
                    record[f'reward_{idx}'] = parsed

            # Mean reward
            mean_match = mean_reward_pattern.search(line)
            if mean_match:
                parsed = _parse_float_or_none(mean_match.group(1))
                if parsed is not None:
                    record['mean_reward'] = parsed

            # Keep only lines that actually carried a reward value.
            if len(record) > 1:
                records.append(record)

    return pd.DataFrame(records)

def _is_reward_metric(key):
    """Tell whether a log-history key names a reward metric.

    Accepted: 'reward', 'mean_reward', and keys starting with 'reward_' or
    'rewards/'.
    """
    if not isinstance(key, str):
        return False
    return (
        key in ('reward', 'mean_reward')
        or key.startswith('reward_')
        or key.startswith('rewards/')
    )


def extract_rewards_from_trainer(trainer):
    """Extract reward data directly from a trainer object.

    Reads trainer.state.log_history and turns every entry that has a 'step'
    and at least one reward metric into one row. Metrics missing from an entry
    are left as NaN, so entries with different keys (such as a final summary
    entry without rewards) no longer break the DataFrame construction.

    NOTE(review): the training table in this notebook shows `reward`,
    `reward_std` and `rewards / <function>` columns; presumably they are stored
    under the keys 'reward', 'reward_std' and 'rewards/<function>' - verify
    against the installed TRL version.

    Args:
        trainer: GRPOTrainer object (anything exposing state.log_history).

    Returns:
        pandas.DataFrame with a 'step' column plus one column per reward
        metric found. Empty when the trainer has no log history (a message is
        printed) or no entry carries reward data.
    """
    log_history = getattr(getattr(trainer, 'state', None), 'log_history', None)
    if log_history is None:
        print("训练器没有日志历史或者结构不符合预期!")
        return pd.DataFrame()

    records = []
    for entry in log_history:
        if 'step' not in entry:
            continue
        metrics = {key: value for key, value in entry.items() if _is_reward_metric(key)}
        if metrics:
            records.append({'step': entry['step'], **metrics})

    return pd.DataFrame(records)

def plot_rewards(data, title="GRPO训练奖励曲线", save_path=None, moving_avg_window=5):
    """Draw one line per reward column against the training step.

    Every column other than 'step' is plotted as a faint raw curve; when the
    frame has at least `moving_avg_window` rows, a rolling-mean curve is drawn
    on top of it. If a 'mean_reward' column exists, its final/max/min values
    are written in the lower-left corner of the figure.

    Nothing is drawn (a message is printed instead) when the frame is empty,
    has no 'step' column, or has no column besides 'step'.

    Args:
        data: DataFrame holding a 'step' column and reward columns.
        title: figure title.
        save_path: where to save the figure; when falsy the figure is shown
            instead.
        moving_avg_window: window size of the rolling mean.
    """
    if data.empty:
        print("没有数据可以绘图!")
        return

    metric_columns = [col for col in data.columns if col != 'step']
    if 'step' not in data.columns or not metric_columns:
        # Without an x-axis or any curve, plotting would fail or be blank.
        print("数据缺少 'step' 列或奖励列，无法绘图!")
        return

    fig, ax = plt.subplots()

    # One colour per plotted column
    colors = sns.color_palette('viridis', n_colors=len(metric_columns))
    draw_moving_avg = len(data) >= moving_avg_window

    for color, col in zip(colors, metric_columns):
        # Raw values, drawn faintly
        ax.plot(data['step'], data[col], alpha=0.3, color=color, label=f"{col} (raw)")

        # Rolling mean on top
        if draw_moving_avg:
            moving_avg = data[col].rolling(window=moving_avg_window).mean()
            ax.plot(data['step'], moving_avg, linewidth=2, color=color, label=f"{col} ({moving_avg_window}-point avg)")

    # Title and axis labels
    ax.set_title(title, fontsize=16, fontweight='bold')
    ax.set_xlabel('Training Steps', fontsize=14)
    ax.set_ylabel('Reward', fontsize=14)

    # Grid and legend
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.legend(loc='best', fontsize=12)

    # Summary statistics of the mean reward, if available
    if 'mean_reward' in data.columns:
        final_mean = data['mean_reward'].iloc[-1]
        max_mean = data['mean_reward'].max()
        min_mean = data['mean_reward'].min()
        stats_text = f"Final mean reward: {final_mean:.4f}\nMax mean reward: {max_mean:.4f}\nMin mean reward: {min_mean:.4f}"
        fig.text(0.02, 0.02, stats_text, fontsize=12, bbox=dict(facecolor='white', alpha=0.8))

    fig.tight_layout()

    # Save or show; the figure-bound calls avoid relying on pyplot's notion of
    # the "current" figure.
    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches='tight')
        print(f"图表已保存到 {save_path}")
    else:
        plt.show()

# Convenience entry point: extract the reward data, plot it, print statistics
def visualize_rewards(trainer=None, log_file=None, output_path=None):
    """Visualize the training rewards.

    The data comes from `trainer` when it is given, otherwise from `log_file`;
    when neither is given a message is printed and nothing else happens. The
    plot is always saved, to `output_path` or to 'reward_plot.png' when no
    path is given. If the data has a 'mean_reward' column, its final, maximum
    and minimum values are printed, followed by the relative growth from the
    first to the last value when there is more than one row.

    Args:
        trainer: GRPOTrainer object; takes precedence over `log_file`.
        log_file: path of a log file, used only when `trainer` is None.
        output_path: where to save the plot; defaults to 'reward_plot.png'.
    """
    if trainer is None and log_file is None:
        print("请提供trainer对象或日志文件路径!")
        return

    if trainer is not None:
        data = extract_rewards_from_trainer(trainer)
    else:
        data = extract_rewards_from_log(log_file)

    target_path = 'reward_plot.png' if output_path is None else output_path
    plot_rewards(data, save_path=target_path)

    # Statistics need a non-empty frame with a mean-reward column.
    if data.empty or 'mean_reward' not in data.columns:
        return

    print("\n--- 奖励统计信息 ---")
    print(f"最终平均奖励: {data['mean_reward'].iloc[-1]:.4f}")
    print(f"最大平均奖励: {data['mean_reward'].max():.4f}")
    print(f"最小平均奖励: {data['mean_reward'].min():.4f}")

    # Growth is only meaningful with at least two rows.
    if len(data) <= 1:
        return

    first_reward = data['mean_reward'].iloc[0]
    last_reward = data['mean_reward'].iloc[-1]
    if first_reward != 0:
        growth = ((last_reward - first_reward) / abs(first_reward)) * 100
    else:
        growth = float('inf')
    print(f"奖励增长率: {growth:.2f}%")

# Usage examples
# 1. From the trainer object (requires `trainer` to exist in the current kernel)
# visualize_rewards(trainer=trainer)

# 2. Or from a log file
# NOTE(review): this directory differs from the output_dir set in the training
# config ("outputs_gemma3_1b_it_2"), and the recorded output of this cell
# reports that there was no data to plot - confirm the log location and format.
visualize_rewards(log_file="./outputs_gemma-3_grpo_lora/opt_gemm3_2.log")

# Visualizing right after training:
# call the following once training has finished
# visualize_rewards(trainer=trainer, output_path="reward_trends.png")

没有数据可以绘图!


### 模型测试
#### 默认模型测试

In [21]:
from transformers import TextStreamer

# Conversation to test with: the system prompt plus one math question
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user",   "content": "There is a group of 18 people who are ordering pizza. If each person gets 3 slices and each pizza has 9 slices, how many pizzas should they order? "},
]

# Render the conversation to text; the generation prompt must be appended
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    tokenize = False,
)

# Tokenize, move the tensors to the GPU, and stream the reply as it is produced
model_inputs = tokenizer(text, return_tensors = "pt").to("cuda")
reply_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(
    **model_inputs,
    max_new_tokens = 1024, # increase for longer outputs
    # Recommended Gemma-3 sampling settings
    temperature = 1.0,
    top_p = 0.95,
    top_k = 64,
    streamer = reply_streamer,
)

<start_working_out>
Let $N$ be the number of people in the group, so $N = 18$.
Each person gets 3 slices of pizza.
The total number of slices needed is $18 \times 3 = 54$ slices.
Each pizza has 9 slices.
Let $P$ be the number of pizzas they need to order.
The total number of slices from $P$ pizzas is $9P$.
We need $9P \ge 54$.
Dividing both sides by 9, we get $P \ge \frac{54}{9} = 6$.
Since they need to order a whole number of pizzas, they need to order at least 6 pizzas.
Therefore, they should order 6 pizzas.
<end_working_out>
<SOLUTION>6
<end_of_turn>


In [None]:
# Load the base model (without the fine-tuned LoRA weights) for comparison
from unsloth import FastModel
import torch
from transformers import TextStreamer

# Same sequence length as used for the fine-tuned model
max_seq_length = 1024

# Reload the base checkpoint; no LoRA weights are applied to it
original_model, original_tokenizer = FastModel.from_pretrained(
    model_name = "./models/gemma-3-1b-it",  # base model path
    max_seq_length = max_seq_length,
    load_in_4bit = False,
    load_in_8bit = False,
)

# Test conversation. To make the comparison meaningful this is the same
# question that the fine-tuned model is tested with in the preceding cell.
test_messages = [
    {"role": "system", "content": system_prompt},  # system prompt defined earlier
    {"role": "user", "content": "There is a group of 18 people who are ordering pizza. If each person gets 3 slices and each pizza has 9 slices, how many pizzas should they order? "},
]

# Render the conversation to text; the generation prompt must be appended
test_text = original_tokenizer.apply_chat_template(
    test_messages,
    add_generation_prompt = True,
    tokenize = False,
)

# Stream the reply with TextStreamer
print("\n原始模型输出：")
_ = original_model.generate(
    **original_tokenizer(test_text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 1024,
    # Same sampling settings as the fine-tuned model test (temperature was 0.8
    # here, which did not match the 1.0 used there).
    temperature = 1.0,
    top_p = 0.95,
    top_k = 64,
    streamer = TextStreamer(original_tokenizer, skip_prompt = True),
)

#### finetuning 模型测试

In [None]:
# Save the LoRA adapter under "grpo_saved_lora"
model.save_lora("grpo_saved_lora")

#### 保存 Lora

In [None]:
# Save the model and the tokenizer locally under "gemma-3"
model.save_pretrained("gemma-3")  # Local saving
tokenizer.save_pretrained("gemma-3")

('gemma-3/tokenizer_config.json',
 'gemma-3/special_tokens_map.json',
 'gemma-3/tokenizer.model',
 'gemma-3/added_tokens.json',
 'gemma-3/tokenizer.json')

In [15]:
# Save the merged fine-tuned model under "gemma-3-finetune".
# NOTE(review): the recorded output of this cell is an AttributeError
# ('NoneType' object has no attribute 'startswith'); the cause is not visible
# in this notebook - TODO investigate before relying on this cell.
if True: # currently enabled; set to False to skip saving the finetune
    model.save_pretrained_merged("gemma-3-finetune", tokenizer)

AttributeError: 'NoneType' object has no attribute 'startswith'

### 保存为完整模型

##### 保存为 bf16 格式

In [23]:
import os

# The Hugging Face token is read from the HF_TOKEN environment variable rather
# than being written into the notebook. Uploading used to be switched on with
# an empty token; it now only runs when a token is actually available.
hf_token = os.environ.get("HF_TOKEN", "")

# Merge to 16bit
if True: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if hf_token: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = hf_token)

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = hf_token)

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = hf_token)

AttributeError: 'NoneType' object has no attribute 'startswith'

In [None]:
# Upload the merged fine-tuned model to the Hugging Face Hub (disabled).
if False: # Change to True to upload finetune
    model.push_to_hub_merged(
        "HF_ACCOUNT/gemma-3-finetune", tokenizer,
        token = "hf_..." # placeholder; never commit a real token here
    )

In [None]:
# 保存为 GGUF 格式
# if False:
#     model.save_pretrained_gguf(
#         "gemma-3-finetune",
#         quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
#     )

In [None]:
# if False: # Change to True to upload GGUF
#     model.push_to_hub_gguf(
#         "gemma-3-finetune",
#         quantization_type = "Q8_0", # Only Q8_0, BF16, F16 supported
#         repo_id = "HF_ACCOUNT/gemma-finetune-gguf",
#         token = "hf_...",
#     )