### 加载
#### 加载模型

In [None]:
from unsloth import FastModel
import torch
from client import ClientSender
import uuid
import zmq
import msgpack

# Network client; the reward functions further down call client.send_materials(...)
# to ship the generated material code out for scoring.
# NOTE(review): server_address is an empty string here — confirm the intended host.
client = ClientSender(server_address="", port=5555)

max_seq_length = 2048 # Maximum sequence length handed to the model loader (original note: the default is 1024)
lora_rank = 8 # LoRA rank; a larger rank adds capacity but consumes more memory

model, tokenizer = FastModel.from_pretrained(
    model_name = "./models/gemma-3-4b-it", #"unsloth/gemma-3-1b-it",
    max_seq_length = max_seq_length, # Any length may be chosen to support long contexts
    load_in_4bit = True,  # 4-bit quantization to reduce memory usage
    load_in_8bit = False, # Higher precision, but uses about 2x the memory
    full_finetuning = False, # Full fine-tuning is turned off
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-20 20:05:30 [__init__.py:256] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Gemma3 patching. Transformers: 4.50.0.dev0. vLLM: 0.8.1.
   \\   /|    NVIDIA GeForce RTX 4070 Ti SUPER. Num GPUs = 1. Max memory: 15.992 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


#### 加载 Lora 设置

In [2]:
# Wrap the loaded model with LoRA adapters; `lora_rank` comes from the loading cell.
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Off: text-only training (or the model has no vision layers)
    finetune_language_layers   = True,  # Should stay on
    finetune_attention_modules = True,  # Attention modules are helpful for GRPO
    finetune_mlp_modules       = True,  # Should always stay on

    r = lora_rank,           # Larger = higher accuracy, but may overfit
    lora_alpha = lora_rank,  # Recommended: alpha at least equal to r
    lora_dropout = 0,
    bias = "none",
    random_state = 3407, # Fixed random seed for reproducibility
)

Unsloth: Making `model.base_model.model.language_model.model` require gradients


#### 加载、构造数据集

##### 构造系统提示词

In [3]:
# Build the system prompt.
# The four tag strings below are also read by the format reward functions.
reasoning_start = "<think>"
reasoning_end   = "</think>"
solution_start = "<code>"
solution_end   = "</code>"

# The prompt (in Chinese) tells the model to act as a Blender material generator,
# put its reasoning between the reasoning tags and its answer between the solution tags.
system_prompt = \
f"""你是一个 Blender 的材质生成器，你将会考虑问题并提供材质对应的 python 代码，该代码应该可以且仅在 Blender 中创建对应材质，你生成出的python代码应当就是最终结果，用户可以直接使用，不需要用户更改，你也不会使用任何外部文件。
请将思考过程放在 {reasoning_start} 和 {reasoning_end} 之间。
然后，请在 {solution_start} 和 {solution_end} 之间提供你的答案。"""
# Bare expression so the notebook displays the rendered prompt.
system_prompt

'你是一个 Blender 的材质生成器，你将会考虑问题并提供材质对应的 python 代码，该代码应该可以且仅在 Blender 中创建对应材质，你生成出的python代码应当就是最终结果，用户可以直接使用，不需要用户更改，你也不会使用任何外部文件。\n请将思考过程放在 <think> 和 </think> 之间。\n然后，请在 <code> 和 </code> 之间提供你的答案。'

##### 构造数据集

In [4]:
from datasets import Dataset
import random

# Accumulates one dict per training sample; filled by the construction loop below.
dataset = []

# Three difficulty levels of material requests are defined.
# Level 1: short requests (mostly a colour or a material name).
# NOTE(review): several entries are repeated (e.g. "红色的材质", "蓝色的材质",
# "深蓝色的材质", "透明的材质"), so those tasks are sampled more often — confirm this is intended.
level1 = [
    "红色的材质", "蓝色的材质", "蓝色到黄的渐变材质", "绿色的材质", "紫色的材质", "金色的材质", 
    "银色的材质", "透明的材质", "棕色的木质材质", "白色的材质", "黑色的材质", "灰色的材质",
    "橙色的材质", "粉红色的材质", "黄色的材质", "深红色的材质",     "红色的材质", "蓝色的材质", "蓝色到黄色的渐变材质", "绿色到黄色的渐变材质", "紫色到粉色的渐变材质",
    "金色到银色的渐变材质", "绿色到白色的渐变材质", "橙色到红色的渐变材质", "红色到黑色的渐变材质", "青色的材质", 
    "深蓝色的材质", "墨绿色的材质", "淡蓝色的材质", "米色的材质", "炭灰色的材质",
    "浅黄色的材质", "彩虹色的材质", "霓虹色的材质", "紫黑色的材质", "金属色的材质",
    "皮革材质", "深棕色的材质", "大理石材质", "沙土材质", "亮光材质", 
    "亮金色的材质", "暗棕色的材质", "液体材质", "冷白色的材质", "塑料材质",
    "深蓝色的材质", "热气流材质", "透明的材质", "冰块材质", "深蓝到浅蓝的渐变材质", "蓝色到紫色的渐变材质", "粉色到紫色的渐变材质", "红色到绿色的渐变材质",
    "黄色到绿色的渐变材质", "深紫到浅紫的渐变材质", "紫色到蓝色的渐变材质", "蓝色到白色的渐变材质", 
    "青色到白色的渐变材质", "橙色到黄色的渐变材质", "黑色到灰色的渐变材质"
]
# Level 2: a material name followed by a one-sentence description of its look and use.
level2 = [
    "红色的金属材质：这种材质表面光滑，金属光泽非常突出，适合表现科技感强的物体，给人一种坚硬而现代的感觉。",
    "蓝色的金属材质：它呈现出深邃的蓝色，金属反射效果明显，适合用在精密机械或未来感十足的设计中。",
    "红色的木头材质：这种木材的表面有明显的纹理，颜色鲜艳且富有自然感，适合制作温暖、自然的环境。",
    "深紫色的钢铁材质：这种材质混合了紫色和灰色，具有较强的视觉冲击感，适合用于表现坚固而神秘的物体。",
    "绿松石色的塑料材质：该材质呈现出绿色的光泽感，给人一种清新自然的感觉，适合用于现代简约风格。",
    "金色的光滑材质：它的表面非常光滑，反射效果强，通常用于奢华或高贵的设计风格中。",
    "银白色的钛金属材质：该材质具有冷光感和金属质感，质地非常坚硬且耐用，适合用于科技产品或高级饰品。",
    "白色的瓷砖材质：这种材质表面平整且具有光泽，适合用于现代简约或清新风格的空间装饰。",
    "金属质感的深灰色材质：这种材质表面有着明显的金属光泽，深灰色调给人稳重、高科技的感觉。",
    "绿色的玻璃材质：这种玻璃表面呈现出绿色的透光效果，具有良好的透明度，适合用于装饰或外墙材质。",
    "橙色的陶瓷质感材质：材质表面有细腻的纹理和温暖的橙色调，适合用在舒适、温馨的环境中。",
    "深蓝色的铝合金材质：它表面平滑且坚固，深蓝色使其更具科技感，适合应用于高端电子产品。",
    "深红色的硬木材质：表面有着清晰的木纹，深红色使得材质更显高贵和传统，适用于古典风格的设计。",
    "高亮黑色的炭材质：表面呈现出黑色的光泽，给人一种现代、简约的感觉，适合极简风格的设计。",
    "浅黄色的金属氧化物材质：这款材质呈现浅黄色，表面有一定的金属质感，适合用于现代艺术品或装饰。",
    "紫色的有机玻璃材质：它呈现出紫色的透明效果，给人一种现代、梦幻的感觉，适合用于创意设计中。",
    "铁灰色的铝合金材质：表面坚硬且具金属光泽，深沉的铁灰色给人稳重、工业感强的视觉效果。",
    "红色的亮面木质材料：红色木质材料的光泽感非常明显，色调明亮且温暖，适合用于家具和室内装饰。",
    "铜色的老化金属材质：这种材质的表面有着岁月的痕迹，带有金属的光泽和老化效果，适合复古风格设计。",
    "钢铁质感的蓝色材质：这种材质坚硬且具有冷金属光泽，深蓝色增加了它的冷峻感，适用于工业设计或机械构件。"
]

# Level 3: long, multi-sentence descriptions of surface texture, appearance and intended use.
level3 = [
    "红色的拉丝金属材质：这种材质具有拉丝效果的金属表面，表面上有一层细微的划痕，折射出红色光泽。拉丝金属具有非常独特的视觉效果，常用于高端产品的外壳设计。它不仅具备金属质感的坚固，还能通过表面纹理传递出一种精细且独特的工业风格。适合用在电子设备、汽车设计或任何需要展现现代感和精致感的物品上。",
    "椅子上的纹路：这款材质的纹理呈现出精致的花纹，可能是圆形、对称或者不规则的图案，表面看起来既舒适又具有视觉冲击力。椅子上的纹理材质常常是通过精细的织物或者皮革等材质呈现，给使用者带来视觉上的享受与舒适的触感。设计师往往会利用这些纹路来增强座椅的美感与舒适度，特别适用于豪华沙发和高端办公椅。",
    "科幻战舰的纹路：这种材质是为了打造科幻风格的战舰外壳效果而设计，表面有类似航空材料的纹理和结构。光滑的表面上布满了复杂的机械纹路和细致的金属拼接效果，通常搭配冷光的材质和强烈的金属质感。适用于未来科技、科幻电影或高端模型的设计，给人一种高科技、坚固、且充满未来感的视觉体验。",
    "深蓝色的抛光金属材质：此材质采用深蓝色金属合金，其表面经过精细的抛光处理，展现出光滑如镜的效果。光泽度和反射效果非常强，给人一种现代和高端的感觉。深蓝色调使得材质更加具有神秘感，适合用于航空航天、精密机械或任何需要强烈视觉冲击的高科技设计。",
    "水面反射的紫色金属材质：这种材质表面呈现出紫色和蓝色的渐变效果，仿佛水面反射的光辉。金属质感结合了水面般的流动感，光线在表面折射，产生多层次的视觉效果。适合用于未来科技、科幻影视道具，或者高端艺术作品中的元素，能够带给人一种极为独特的感觉。",
    "白色皮革的纹理：这款皮革材质表面呈现出精细的纹路，质感柔软且富有弹性，白色使得它显得清新且高贵。常用于时尚品牌的服装、鞋包设计或者豪华家具中。它不仅具备皮革的耐用性，还通过纹理的变化提升了整体的美感与触感，成为时尚和品味的象征。",
    "蓝色光泽的铁板材质：铁板表面采用了特殊的蓝色光泽涂层，形成了一种极具现代感和科技感的视觉效果。表面虽然有金属的坚硬感，但由于光泽的反射，表现出了独特的动感。适合用于电子产品、建筑装饰或者其他需要坚固且具有现代感的产品设计。",
    "铝合金表面的细小纹理：这种材质的表面细腻且均匀，呈现出细小的铝合金纹理，整体上既坚固又具有非常精致的工艺感。适合用于精密机械、航空器材以及现代建筑的设计。其表面纹理不仅提高了抗磨损的能力，还能使得整体设计显得更具现代科技感。",
    "深棕色的木纹纹理：这种木材表面呈现出深棕色的纹理，纹路自然且清晰，带有浓厚的自然气息。常用于家具、地板或室内装饰中，能够为空间带来温暖且舒适的氛围。木纹质感的变化使得每一件产品都有独特的个性，适合用于传统或现代混合风格的室内设计。",
    "经典的金属拉丝效果：这款金属材质的表面具有经典的拉丝效果，通过金属表面细致的刷纹处理，创造出独特的视觉效果。拉丝金属不仅能够反射光线，还赋予表面一种精致且简约的外观，适合用于现代家居、电子产品以及高端装饰物品中。"
]

# Merge the material requests of all three levels into one task list.
tasks = level1 + level2 + level3

# User-side request prefixes (one of them is the empty string, i.e. the bare task text).
user_start = ["做这个材质:", "帮我生成这个：", "这个问题需要你的帮助, 帮我生成", "我希望你能帮助我生成", "请帮我做一个材质", "希望你帮我生成", "请你为我做一个", "你能做一下这个吗?", "", "请完成以下任务:", "帮我搞一个材质:", "能否帮我生成", "请协助我生成一个"]

# Build the dataset: every task is paired with every prefix twice
# (the prefix list is concatenated with itself, then shuffled per task).
# The prefix and the task are joined without any separator.
for task in tasks:
    user_prompts = user_start + user_start
    random.shuffle(user_prompts)
    for user in user_prompts:
        dataset.append({
            "prompt": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user + task},
            ],
            # Fresh random id per sample; the reward functions forward it in the request head.
            "taskid": uuid.uuid4().hex,
            # Raw task text without the prefix; forwarded as the request "input".
            "goal": task,
        })
    
# Final dataset object handed to the trainer.
final_dataset = Dataset.from_list(dataset)

### 定义奖励函数
#### 定义标准格式形式

In [5]:
import re

# Regular expression used to decide whether a model output follows the required layout:
# optional leading whitespace, a <think>...</think> section, anything, a <code>...</code>
# section (captured as group 1), then optional trailing whitespace.
# NOTE(review): with re.MULTILINE, `^` and `$` also match at line boundaries, so text on
# other lines before/after the tagged part does not prevent a match — confirm that this
# is acceptable for a check that is described as strict.
match_format = re.compile(
    rf"^[\s]{{0,}}"\
    rf"<think>.+?</think>.*?"\
    rf"<code>(.+?)</code>"\
    rf"[\s]{{0,}}$",
    flags = re.MULTILINE | re.DOTALL
)

# Quick sanity check; the notebook displays the resulting match object.
match_format.search(
    "<think>Let me think!</think>"\
    "<code>2</code>",
)

<re.Match object; span=(0, 42), match='<think>Let me think!</think><code>2</code>'>

#### 构造奖励函数

In [6]:
# Strict format reward
def match_format_exactly(completions, **kwargs):
    """Reward completions whose text matches the ``match_format`` pattern.

    Args:
        completions: sequence of completions; the text is read from
            ``completion[0]["content"]``.
        **kwargs: ignored.

    Returns:
        One score per completion: 3.0 when the pattern is found, otherwise 0.
    """
    return [
        3.0 if match_format.search(candidate[0]["content"]) is not None else 0
        for candidate in completions
    ]

In [7]:
# Lenient format reward
def match_format_approximately(prompts, completions, **kwargs):
    """Give partial credit based on how often each of the four tags appears.

    Every tag (reasoning start/end, solution start/end) contributes +0.5 when it
    occurs exactly once in the response and -0.5 otherwise, so a response does
    not need to match the strict pattern to earn some reward.

    Args:
        prompts: batch of prompts; the last message of the first prompt is
            printed as the question.
        completions: sequence of completions; the text is read from
            ``completion[0]["content"]``.
        **kwargs: ignored.

    Returns:
        One score per completion, between -2.0 and 2.0.
    """
    question = prompts[0][-1]["content"]
    responses = [candidate[0]["content"] for candidate in completions]

    # Debug output: show the question and the first sampled response.
    print('*'*20, f"Question:\n{question}", f"\nResponse:\n{responses[0]}")

    tags = (reasoning_start, reasoning_end, solution_start, solution_end)
    # Exactly one occurrence of a tag is rewarded; zero or repeated occurrences are penalised.
    return [
        sum(0.5 if text.count(tag) == 1 else -0.5 for tag in tags)
        for text in responses
    ]

In [8]:
# Pull the generated code out of a response
def code_extractor(code):
    """Return the stripped text of the first ``<code>...</code>`` section.

    Args:
        code: full response text.

    Returns:
        The content of the first tagged section with surrounding whitespace
        removed, or an empty string when there is no such section or it is blank.
    """
    found = re.search(r'<code>(.*?)</code>', code, flags=re.DOTALL)
    return found.group(1).strip() if found else ""

In [9]:
# Similarity between the generated result and the description
def accuracy_reward(goal, taskid, completions, prompts, **kwargs):
    """Score completions by the "accuracy_rank" reported by the remote service.

    The code of every completion is extracted, sent in one request through
    ``client.send_materials`` and the returned per-material rank is min-max
    normalised to [0, 1] and multiplied by ``WEIGHT``.

    Args:
        goal: batch of task descriptions; only ``goal[0]`` is sent.
        taskid: batch of task ids; only ``taskid[0]`` is sent.
        completions: sequence of completions; the text is read from
            ``completion[0]["content"]``.
        prompts: unused, accepted for the trainer's calling convention.
        **kwargs: ignored.

    Returns:
        One score per completion. All zeros when every completion received the
        same rank (or when there are no completions), since such a group carries
        no preference signal.
    """
    # NOTE(review): only the first goal/taskid of the batch is used, which assumes that
    # all completions in the call belong to the same prompt — confirm against the
    # batch size / num_generations configuration.

    WEIGHT = 2  # Applied after normalisation
    scores = []
    names = []

    # Build the request payload
    materials = {
        "head": {
            "input": goal[0],
            "taskid": taskid[0],
            "request": []
        },
        "outputs": []
    }

    # Fill in the material code of every completion
    for completion in completions:
        response = completion[0]["content"]
        code = code_extractor(response)
        print("AR_CODE________________")
        print(type(code))
        print(code)

        name = f"M{len(materials['outputs'])+1}"
        names.append(name)

        materials["outputs"].append({
            "name": name,
            "code": code
        })

    print("AR_MT________________")
    print(type(materials))
    print(materials)

    # Collect the scores returned by the service
    c = client.send_materials(materials)
    print("AR_P________________")
    print(type(c))
    print(c)
    results = c.get("accuracy_rank", {})

    for name in names:
        scores.append(int(results.get(name, 0)))

    # Normalise and weight. The fallback is sized from the actual number of
    # completions instead of a hard-coded four, and an empty batch no longer
    # crashes in min()/max().
    if scores and max(scores) > min(scores):
        min_s, max_s = min(scores), max(scores)
        scores = [(s - min_s) / (max_s - min_s) * WEIGHT for s in scores]
    else:
        scores = [0.0] * len(scores)

    # Return the scores
    print("accuracy_reward" + str(results))
    return scores

In [10]:
# Whether the generated result is meaningful
def meaning_reward(goal, taskid, completions, **kwargs):
    """Score completions by the "meaning_rank" reported by the remote service.

    The code of every completion is extracted, sent in one request through
    ``client.send_materials`` and the returned per-material rank is min-max
    normalised to [0, 1] and multiplied by ``WEIGHT``.

    Args:
        goal: batch of task descriptions; only ``goal[0]`` is sent.
        taskid: batch of task ids; only ``taskid[0]`` is sent.
        completions: sequence of completions; the text is read from
            ``completion[0]["content"]``.
        **kwargs: ignored.

    Returns:
        One score per completion. All zeros when every completion received the
        same rank (or when there are no completions).
    """
    # NOTE(review): only the first goal/taskid of the batch is used, which assumes that
    # all completions in the call belong to the same prompt — confirm against the
    # batch size / num_generations configuration.

    WEIGHT = 1  # Applied after normalisation
    scores = []
    names = []

    # Build the request payload
    materials = {
        "head": {
            "input": goal[0],
            "taskid": taskid[0],
            "request": []
        },
        "outputs": []
    }

    # Fill in the material code of every completion
    for completion in completions:
        response = completion[0]["content"]
        code = code_extractor(response)
        print("MR_CODE________________")
        print(type(code))
        print(code)

        name = f"M{len(materials['outputs'])+1}"
        names.append(name)

        materials["outputs"].append({
            "name": name,
            "code": code
        })

    print("MR_MT________________")
    print(type(materials))
    print(materials)

    # Collect the scores returned by the service
    c = client.send_materials(materials)
    print("MR_P________________")
    print(type(c))
    print(c)
    results = c.get("meaning_rank", {})

    for name in names:
        scores.append(int(results.get(name, 0)))

    # Normalise and weight. The fallback is sized from the actual number of
    # completions instead of a hard-coded four, and an empty batch no longer
    # crashes in min()/max().
    if scores and max(scores) > min(scores):
        min_s, max_s = min(scores), max(scores)
        scores = [(s - min_s) / (max_s - min_s) * WEIGHT for s in scores]
    else:
        scores = [0.0] * len(scores)

    # Return the scores
    print("meaning_reward" + str(results))
    return scores

In [11]:
# Whether the generated code runs without error
def error_check(goal, taskid, completions, **kwargs):
    """Reward completions whose code is reported as successful by the remote service.

    The code of every completion is extracted and sent in one request through
    ``client.send_materials``; the per-material entry of the returned "status"
    mapping decides between ``WEIGHT`` (truthy) and 0 (falsy or missing).

    Args:
        goal: batch of task descriptions; only ``goal[0]`` is sent.
        taskid: batch of task ids; only ``taskid[0]`` is sent.
        completions: sequence of completions; the text is read from
            ``completion[0]["content"]``.
        **kwargs: ignored.

    Returns:
        One score per completion. All zeros when every completion has the same
        status (or when there are no completions), because a uniform group gives
        no signal to learn from.
    """
    # NOTE(review): only the first goal/taskid of the batch is used, which assumes that
    # all completions in the call belong to the same prompt — confirm against the
    # batch size / num_generations configuration.

    WEIGHT = 2
    scores = []
    names = []

    # Build the request payload
    materials = {
        "head": {
            "input": goal[0],
            "taskid": taskid[0],
            "request": []
        },
        "outputs": []
    }

    # Fill in the material code of every completion
    for completion in completions:
        response = completion[0]["content"]
        code = code_extractor(response)
        print("EC_CODE________________")
        print(type(code))
        print(code)

        name = f"M{len(materials['outputs'])+1}"
        names.append(name)

        materials["outputs"].append({
            "name": name,
            "code": code
        })

    print("EC_MT________________")
    print(type(materials))
    print(materials)

    # Collect the status returned by the service
    c = client.send_materials(materials)
    print("EC_P________________")
    print(type(c))
    print(c)
    results = c.get("status", {})

    for name in names:
        scores.append(WEIGHT if results.get(name, False) else 0)

    # Give up when the group is uniform. The fallback is sized from the actual
    # number of completions instead of a hard-coded four, and an empty batch no
    # longer crashes in min()/max().
    if not scores or max(scores) <= min(scores):
        scores = [0] * len(scores)

    # Return the scores
    print("error_check" + str(results))
    return scores

### 训练部分
#### 训练配置

In [12]:
# Token budget reserved for the prompt; the rest of max_seq_length goes to the completion.
max_prompt_length = 256

# Configure the GRPO trainer.
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    beta = 0.0, # 0 disables the KL-divergence penalty (original note: defaults to 0.04)
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "adamw_torch_fused",
    logging_steps = 1,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 1, # Currently 1; original note suggests raising to 4 for smoother training
    num_generations = 4, # Decrease if out of memory
    max_prompt_length = max_prompt_length,
    max_completion_length = max_seq_length - max_prompt_length,
    # num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 500, # Number of training steps
    save_steps = 200, # Save a checkpoint every 200 steps
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    # NOTE(review): the directory name mentions "12b" while the loading cell points at a
    # gemma-3-4b-it checkpoint — confirm the name is still intended.
    output_dir = "outputs_ArtI_12b_bf4_qlora", # Output directory
)

#### 开始训练

In [13]:
# Create the trainer with the reward functions defined above.
# NOTE(review): accuracy_reward, meaning_reward and error_check each call
# client.send_materials with the same payload, so every batch is submitted three
# times — confirm the service tolerates (or deduplicates) this.
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        # Format rewards
        match_format_exactly,
        match_format_approximately,
        
        # Rewards based on the service's ranking of the generated material
        accuracy_reward,
        meaning_reward,
        
        # Runtime error check
        error_check,
        
    ],
    args = training_args,
    train_dataset = final_dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,366 | Num Epochs = 1 | Total steps = 500
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 29,802,496/4,000,000,000 (0.75% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


******************** Question:
希望你帮我生成透明的材质 
Response:
<code></code><code></code><code></code><code></code><code></code><code></code><code></code><code></code>

<think>
Okay, the user wants a transparent material. This usually means we’ll want to decrease the roughness value and potentially use a value like 1 to represent full transparency. We want to avoid external files. We should also be aware that a truly transparent material in Blender needs to be carefully set up – primarily through usingler and adjustinger. We need to generate python code that can be directly used to create a transparent material that can be directly used. We will focus on a value of 1 for complete transparency and will decrease roughness for a more visually transparent material.
</think>

<code></code><code></code><code></code><code></code><code></code><code></code><code></code><code></code><code></code>

<answer>
<code></code><code></code><code></code><code></code><code></code><code></code><code></code><code><

RuntimeError: CUDA driver error: out of memory

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import os
from collections import defaultdict
import seaborn as sns

# Use a Seaborn style and set default figure/font sizes for nicer-looking charts.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

def extract_rewards_from_log(log_path):
    """Extract reward values from a training log file.

    Every line containing ``Step <n>`` produces one row. ``Reward_<i>: <value>``
    entries on that line become ``reward_<i>`` columns and ``Mean Reward: <value>``
    becomes ``mean_reward``. Values missing on a line are left as NaN, so lines
    that carry different sets of rewards no longer make DataFrame construction
    fail with mismatched column lengths.

    Args:
        log_path: path of the log file.

    Returns:
        A pandas DataFrame with a ``step`` column plus the reward columns that
        were found; an empty DataFrame when the file does not exist or contains
        no step lines.
    """
    step_pattern = re.compile(r'Step\s+(\d+)')
    reward_pattern = re.compile(r'Reward_(\d+):\s+([-\d.]+)')
    mean_reward_pattern = re.compile(r'Mean Reward:\s+([-\d.]+)')

    if not os.path.exists(log_path):
        print(f"日志文件 {log_path} 不存在!")
        return pd.DataFrame()

    rows = []
    # Explicit encoding: the patterns are pure ASCII, so undecodable bytes in the
    # rest of the log are replaced instead of aborting the parse.
    with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            step_match = step_pattern.search(line)
            if not step_match:
                continue

            # One record per step line keeps all columns aligned.
            row = {'step': int(step_match.group(1))}

            # Individual reward function values
            for idx, value in reward_pattern.findall(line):
                row[f'reward_{idx}'] = float(value)

            # Mean reward
            mean_match = mean_reward_pattern.search(line)
            if mean_match:
                row['mean_reward'] = float(mean_match.group(1))

            rows.append(row)

    return pd.DataFrame(rows)

def extract_rewards_from_trainer(trainer):
    """Extract reward values directly from a trainer's log history.

    Every ``trainer.state.log_history`` entry that has a ``step`` key produces
    one row. Keys starting with ``reward_`` or ``rewards/`` are copied as
    columns; ``mean_reward`` is taken from the entry's ``mean_reward`` key or,
    when that is absent, from its ``reward`` key. Values missing in an entry are
    left as NaN, so entries with different key sets no longer make DataFrame
    construction fail with mismatched column lengths.

    Args:
        trainer: object exposing ``state.log_history`` (a list of dicts), e.g. a
            GRPOTrainer.

    Returns:
        A pandas DataFrame with a ``step`` column plus the reward columns that
        were found; an empty DataFrame when the trainer has no log history.
    """
    if not (hasattr(trainer, 'state') and hasattr(trainer.state, 'log_history')):
        print("训练器没有日志历史或者结构不符合预期!")
        return pd.DataFrame()

    rows = []
    for entry in trainer.state.log_history:
        if 'step' not in entry:
            continue

        # One record per log entry keeps all columns aligned.
        row = {'step': entry['step']}

        # Individual rewards. "rewards/<name>" is accepted in addition to the
        # original "reward_" prefix — presumably the per-function key format TRL
        # logs; verify against the installed TRL version.
        for key, value in entry.items():
            if key.startswith(('reward_', 'rewards/')):
                row[key] = value

        # Mean reward; fall back to the plain "reward" key when present.
        if 'mean_reward' in entry:
            row['mean_reward'] = entry['mean_reward']
        elif 'reward' in entry:
            row['mean_reward'] = entry['reward']

        rows.append(row)

    return pd.DataFrame(rows)

def plot_rewards(data, title="GRPO训练奖励曲线", save_path=None, moving_avg_window=5):
    """Draw a line chart of every reward column against the ``step`` column.

    Args:
        data: DataFrame holding a ``step`` column and one column per reward.
        title: chart title.
        save_path: where to save the chart; when falsy the chart is shown instead.
        moving_avg_window: window size of the moving average overlay.
    """
    if data.empty:
        print("没有数据可以绘图!")
        return

    fig, ax = plt.subplots()

    # One palette entry for each column other than 'step'
    palette = sns.color_palette('viridis', n_colors=len(data.columns)-1)
    reward_columns = [name for name in data.columns if name != 'step']
    # The smoothed line is only drawn when there are enough rows for one full window
    draw_average = len(data) >= moving_avg_window

    for idx, name in enumerate(reward_columns):
        # Raw values, drawn faintly
        ax.plot(data['step'], data[name], alpha=0.3, color=palette[idx], label=f"{name} (raw)")

        if not draw_average:
            continue

        # Moving-average overlay
        smoothed = data[name].rolling(window=moving_avg_window).mean()
        ax.plot(
            data['step'],
            smoothed,
            linewidth=2,
            color=palette[idx],
            label=f"{name} ({moving_avg_window}-point avg)",
        )

    # Title and axis labels
    ax.set_title(title, fontsize=16, fontweight='bold')
    ax.set_xlabel('Training Steps', fontsize=14)
    ax.set_ylabel('Reward', fontsize=14)

    # Grid and legend
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.legend(loc='best', fontsize=12)

    # Summary box with final / max / min of the mean reward
    if 'mean_reward' in data.columns:
        mean_series = data['mean_reward']
        final_mean = mean_series.iloc[-1]
        max_mean = mean_series.max()
        min_mean = mean_series.min()
        stats_text = "\n".join([
            f"Final mean reward: {final_mean:.4f}",
            f"Max mean reward: {max_mean:.4f}",
            f"Min mean reward: {min_mean:.4f}",
        ])
        plt.figtext(0.02, 0.02, stats_text, fontsize=12, bbox=dict(facecolor='white', alpha=0.8))

    plt.tight_layout()

    # Show the chart when no target path was given, otherwise write it to disk
    if not save_path:
        plt.show()
        return

    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"图表已保存到 {save_path}")

# Convenience entry point
def visualize_rewards(trainer=None, log_file=None, output_path=None):
    """Plot training rewards and print summary statistics.

    Args:
        trainer: GRPOTrainer object; when given, data is taken from it directly.
        log_file: log file path, used only when no trainer is given.
        output_path: where the chart is saved; defaults to 'reward_plot.png'
            in the current directory.
    """
    target = 'reward_plot.png' if output_path is None else output_path

    # The trainer takes precedence over the log file
    if trainer is not None:
        data = extract_rewards_from_trainer(trainer)
    elif log_file is not None:
        data = extract_rewards_from_log(log_file)
    else:
        print("请提供trainer对象或日志文件路径!")
        return

    plot_rewards(data, save_path=target)

    # Statistics are only available when a mean reward column exists
    if data.empty or 'mean_reward' not in data.columns:
        return

    mean_series = data['mean_reward']
    print("\n--- 奖励统计信息 ---")
    print(f"最终平均奖励: {mean_series.iloc[-1]:.4f}")
    print(f"最大平均奖励: {mean_series.max():.4f}")
    print(f"最小平均奖励: {mean_series.min():.4f}")

    # Relative growth between the first and the last logged mean reward
    if len(data) > 1:
        first_reward = mean_series.iloc[0]
        last_reward = mean_series.iloc[-1]
        if first_reward != 0:
            growth = ((last_reward - first_reward) / abs(first_reward)) * 100
        else:
            # A zero baseline has no defined relative growth
            growth = float('inf')
        print(f"奖励增长率: {growth:.2f}%")

# Usage examples
# 1. With the trainer object
# visualize_rewards(trainer=trainer)

# 2. Or with a log file
# visualize_rewards(log_file="./outputs_gemma-3_grpo_lora/opt_gemm3_2.log")

# Visualize straight after training:
# calling the line below once training has run plots the trainer's log history.
visualize_rewards(trainer=trainer, output_path="reward_trends.png")

没有数据可以绘图!


### 模型测试
#### 默认模型测试

In [None]:
# Chat messages for a quick generation test with the currently loaded model.
# The two commented-out entries are alternative system prompts (GLSL shader generator,
# Blender node explainer) kept from earlier experiments.
messages = [
    # {"role": "system", "content": "你是一个 GLSL Shader 生成器，你生成出来的应当就是最终结果，可以直接使用，你也不会使用任何外部文件，纯粹程序化生成"},
    # {"role": "system", "content": "你是一个 blender 节点解释器，会和我解释 blender 节点是干什么用的"},
    {"role": "system", "content": "你是一个 Blender 的材质生成器，会直接生成材质对应的 Python 代码，该代码应该可以且仅在 Blender 中创建对应材质，你生成出来的应当就是最终结果，用户可以直接使用，不需要用户更改，你也不会使用任何外部文件"},
    {"role": "user",   "content": "给我生成一个的灰色渐变到黄色的材质"},
]

# Render the messages into a single prompt string (not tokenized yet).
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
    tokenize = False,
)
from transformers import TextStreamer
# Stream the generated tokens to the cell output; the return value is discarded.
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 1024*2, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

```python
import bpy

def create_gradient_material(material_name="GradientMaterial"):
    """
    Creates a grey to yellow gradient material in Blender using GLSL shader.

    Args:
        material_name (str): The name of the material to create.
    """

    # Check if material already exists
    if material_name in bpy.data.materials:
        return

    # Create a new material
    mat = bpy.data.materials.new(name=material_name)
    mat.use_nodes = True
    nodes = mat.node_tree.nodes

    # Clear default nodes
    for node in nodes:
        nodes.remove(node)

    # Create Principled BSDF node
    principled_bsdf = nodes.new(type='ShaderNodeBsdfPrincipled')
    principled_bsdf.location = (200, 0)
    principled_bsdf.inputs['Base Color'].default_value = (0.0, 0.0, 0.0, 1.0) # Initial black color

    # Create ShaderNodeGLSL
    glsl_node = nodes.new(type='ShaderNodeGLSL')
    glsl_node.location = (-200, 0)
    glsl_node.inputs['GLSL Code'].default_value = """
    #version 330 core
 

In [None]:
# 加载原始模型（不包含微调）
from unsloth import FastModel
import torch

# Sequence length for the comparison model.
# NOTE(review): this rebinds the global max_seq_length (2048 in the loading cell), so
# re-running earlier cells afterwards would see 1024 — confirm this is intended.
max_seq_length = 1024

# Reload a base model without applying the LoRA weights.
# NOTE(review): the path points at gemma-3-1b-it while training used gemma-3-4b-it —
# confirm which checkpoint the comparison should use.
original_model, original_tokenizer = FastModel.from_pretrained(
    model_name = "./models/gemma-3-1b-it",  # Path of the original model
    max_seq_length = max_seq_length,
    load_in_4bit = False,
    load_in_8bit = False,
)

# Test question
# NOTE(review): the user question is a generic maths question rather than a material
# request — it looks like a template leftover; confirm.
test_messages = [
    {"role": "system", "content": system_prompt},  # Reuse the system prompt defined earlier
    {"role": "user", "content": "What is the sqrt of 101?"},  # Test question used for the comparison
]

# Prepare the input
test_text = original_tokenizer.apply_chat_template(
    test_messages,
    add_generation_prompt = True,
    tokenize = False,
)

# Stream the output directly with TextStreamer
from transformers import TextStreamer
print("\n原始模型输出：")
_ = original_model.generate(
    **original_tokenizer(test_text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 1024,
    temperature = 0.8,  # NOTE(review): the earlier generation test uses temperature 1.0, not 0.8
    top_p = 0.95,
    top_k = 64,
    streamer = TextStreamer(original_tokenizer, skip_prompt = True),
)

#### finetuning 模型测试

In [None]:
# Save the LoRA adapter under "grpo_saved_lora".
model.save_lora("grpo_saved_lora")

#### 保存 Lora

In [None]:
# Save model and tokenizer files into the local "gemma-3" directory.
model.save_pretrained("gemma-3")  # Local saving
tokenizer.save_pretrained("gemma-3")

('gemma-3/tokenizer_config.json',
 'gemma-3/special_tokens_map.json',
 'gemma-3/tokenizer.model',
 'gemma-3/added_tokens.json',
 'gemma-3/tokenizer.json')

In [15]:
# Saving of the merged finetune is currently switched on.
if True: # Change to True to save finetune!
    model.save_pretrained_merged("gemma-3-finetune", tokenizer)

AttributeError: 'NoneType' object has no attribute 'startswith'

### 保存为完整模型

##### 保存为 bf16 格式

In [23]:
# Each line below is toggled individually through its `if True` / `if False` guard.
# NOTE(review): both 16-bit lines are enabled and the push uses an empty token and the
# placeholder repo "hf/model" — confirm before running.

# Merge to 16bit
if True: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if True: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

AttributeError: 'NoneType' object has no attribute 'startswith'

In [None]:
# Disabled by default; the repo id and token below are placeholders.
if False: # Change to True to upload finetune
    model.push_to_hub_merged(
        "HF_ACCOUNT/gemma-3-finetune", tokenizer,
        token = "hf_..."
    )

In [None]:
# 保存为 GGUF 格式
# if False:
#     model.save_pretrained_gguf(
#         "gemma-3-finetune",
#         quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
#     )

In [None]:
# if False: # Change to True to upload GGUF
#     model.push_to_hub_gguf(
#         "gemma-3-finetune",
#         quantization_type = "Q8_0", # Only Q8_0, BF16, F16 supported
#         repo_id = "HF_ACCOUNT/gemma-finetune-gguf",
#         token = "hf_...",
#     )