# PPO算法入门实践

这个 notebook 将带你一步步了解并实践 PPO 算法。

## 1. 导入必要的库

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead
from datasets import Dataset
import matplotlib.pyplot as plt

# Report the installed PyTorch version and whether torch reports CUDA as available.
print(f"PyTorch版本: {torch.__version__}")
print(f"CUDA可用: {torch.cuda.is_available()}")

## 2. 理解PPO配置参数

In [None]:
# Create the PPO configuration.
# NOTE(review): these keyword names (model_name, steps, ppo_epochs, cliprange, ...)
# look like the classic TRL PPOConfig API -- confirm against the installed trl version.
config = PPOConfig(
    model_name="gpt2",           # base model name
    learning_rate=1.41e-5,       # learning rate
    batch_size=8,               # batch size
    mini_batch_size=2,          # mini-batch size
    gradient_accumulation_steps=1, # gradient accumulation steps
    ppo_epochs=4,               # number of PPO epochs
    cliprange=0.2,              # clipping range
    vf_coef=0.1,                # value-function loss coefficient
    seed=42,                    # random seed
    steps=50,                   # total number of training steps
)

# Echo a few of the configured values back for a quick sanity check.
print("PPO配置创建完成")
print(f"批大小: {config.batch_size}")
print(f"学习率: {config.learning_rate}")
print(f"训练步数: {config.steps}")

## 3. 准备模型和数据

In [None]:
# Load the tokenizer and the policy model (a causal LM wrapped with a value head).
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLMWithValueHead.from_pretrained("gpt2")

# If the tokenizer defines no pad token, reuse EOS so that batches can be padded,
# and record the same id on the model config.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

print("模型加载完成")
# Count parameters through the generic torch API: `num_parameters()` is a helper of
# transformers' PreTrainedModel and is not relied upon for the value-head wrapper.
# This total includes the value head as well as the language model.
print(f"模型参数量: {sum(p.numel() for p in model.parameters()):,}")

In [None]:
# Build the training data: five prompt prefixes.
prompts = [
    "今天天气很好，",
    "人工智能是",
    "我最喜欢的颜色是",
    "编程的乐趣在于",
    "学习新知识让我",
] * 10  # repeat the list to get more samples (5 prompts x 10 = 50)

# The dataset has a single raw-text column named "query"; nothing is tokenized here.
dataset = Dataset.from_dict({"query": prompts})
print(f"数据集大小: {len(dataset)}")
print(f"示例提示: {dataset[0]['query']}")

## 4. 定义奖励函数

In [None]:
def reward_function(texts):
    """Score generated texts with a simple hand-written heuristic.

    Every text starts from a base reward of 0.1 and then receives:

    * +0.5 when its length is between 10 and 50 characters (inclusive);
    * +0.3 for each entry of the positive-word list that occurs in it
      (each word counts at most once);
    * -0.2 when it is highly repetitive, i.e. the number of tokens exceeds
      1.5 times the number of distinct tokens.

    For the repetition check, whitespace-separated words are tokens and every
    CJK ideograph is a token of its own. Chinese text contains no spaces, so a
    plain ``str.split()`` would see a single "word" and never detect
    repetition. Text without CJK ideographs is tokenized exactly as
    ``str.split()`` would.

    Args:
        texts: Iterable of strings to score.

    Returns:
        A list of float rewards, one per input text, in input order.
    """
    import re  # local import so this cell does not depend on the import cell

    # Positive vocabulary that earns a bonus when present.
    positive_words = ["好", "棒", "喜欢", "开心", "快乐", "有趣", "学习", "成长"]

    # One CJK ideograph, or a run of non-space, non-CJK characters.
    token_pattern = re.compile(r"[\u4e00-\u9fff]|[^\s\u4e00-\u9fff]+")

    rewards = []
    for text in texts:
        # Base reward.
        reward = 0.1

        # Length bonus for moderately long answers.
        if 10 <= len(text) <= 50:
            reward += 0.5

        # Positive-word bonus; added one word at a time.
        for word in positive_words:
            if word in text:
                reward += 0.3

        # Repetition penalty.
        tokens = token_pattern.findall(text)
        if len(tokens) > len(set(tokens)) * 1.5:
            reward -= 0.2

        rewards.append(reward)

    return rewards

# Try the reward function on three hand-written samples and print each score.
test_texts = ["今天天气很好，我很开心", "重复重复重复重复", "学习让我快乐"]
test_rewards = reward_function(test_texts)

for text, reward in zip(test_texts, test_rewards):
    print(f"文本: '{text}' -> 奖励: {reward:.2f}")

## 5. 创建PPO训练器并开始训练

In [None]:
# Create the PPO trainer from the config, policy model, tokenizer and prompt dataset.
# NOTE(review): ref_model=None presumably lets the trainer build its own reference
# copy of the policy -- confirm against the installed trl version.
ppo_trainer = PPOTrainer(
    config=config,
    model=model,
    ref_model=None,
    tokenizer=tokenizer,
    dataset=dataset
)

print("PPO训练器创建完成")

In [None]:
# Training loop: generate responses, score them, and run one PPO update per batch.
training_stats = []

print("🚀 开始PPO训练...")

for epoch, batch in enumerate(ppo_trainer.dataloader):
    # Cap the demo at 10 PPO steps.
    # NOTE(review): with 50 prompts and batch_size=8 the dataloader may yield
    # fewer than 10 batches, in which case the loop simply ends earlier.
    if epoch >= 10:
        break

    print(f"\n--- 训练步骤 {epoch + 1} ---")

    # The dataset only carries raw prompt text under "query" (there is no
    # "input_ids" column), so tokenize here. The trainer works with a list of
    # 1-D token-id tensors, one per prompt.
    query_tensors = [
        tokenizer.encode(query, return_tensors="pt").squeeze(0)
        for query in batch["query"]
    ]

    # Sample one response per prompt; return_prompt=False keeps only the new tokens.
    response_tensors = ppo_trainer.generate(
        query_tensors,
        return_prompt=False,
        max_length=80,
        do_sample=True,
        top_p=0.9,
        temperature=0.7,
        pad_token_id=tokenizer.pad_token_id,  # same padding id as in test_model
    )

    # Decode the responses back to text for the reward function.
    batch_texts = [
        tokenizer.decode(response, skip_special_tokens=True)
        for response in response_tensors
    ]

    # Show the first sample of the batch.
    if batch_texts:
        query_text = tokenizer.decode(query_tensors[0], skip_special_tokens=True)
        print(f"提示: {query_text}")
        print(f"回复: {batch_texts[0]}")

    # Compute rewards; the trainer takes them as a list of scalar tensors.
    reward_values = reward_function(batch_texts)
    rewards = [torch.tensor(r) for r in reward_values]
    mean_reward = float(np.mean(reward_values))

    print(f"平均奖励: {mean_reward:.3f}")

    # PPO update.
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)

    # Record per-step statistics for the plotting cell (keys are read there).
    if stats:
        training_stats.append({
            'epoch': epoch,
            'mean_reward': mean_reward,
            'policy_loss': stats.get('ppo/loss/policy', 0),
            'value_loss': stats.get('ppo/loss/value', 0)
        })

print("\n🎉 训练完成！")

## 6. 可视化训练过程

In [None]:
# Plot the training curves collected in `training_stats`.
# NOTE(review): the Chinese titles/labels may render as empty boxes unless
# matplotlib is configured with a CJK-capable font -- confirm on the target machine.
if training_stats:
    epochs = [s['epoch'] for s in training_stats]
    # Rebinds the module-level name `rewards` (previously the per-batch tensor list).
    rewards = [s['mean_reward'] for s in training_stats]

    plt.figure(figsize=(10, 4))

    # Left panel: mean reward per training step.
    plt.subplot(1, 2, 1)
    plt.plot(epochs, rewards, 'b-o')
    plt.title('平均奖励随训练变化')
    plt.xlabel('训练步骤')
    plt.ylabel('平均奖励')
    plt.grid(True)

    # Right panel: policy loss per training step, drawn when the key was recorded.
    if 'policy_loss' in training_stats[0]:
        policy_losses = [s['policy_loss'] for s in training_stats]
        plt.subplot(1, 2, 2)
        plt.plot(epochs, policy_losses, 'r-o')
        plt.title('策略损失随训练变化')
        plt.xlabel('训练步骤')
        plt.ylabel('策略损失')
        plt.grid(True)

    plt.tight_layout()
    plt.show()

    print("📊 训练曲线绘制完成")
else:
    # Nothing was recorded during training.
    print("❌ 没有训练统计数据")

## 7. 测试训练效果

In [None]:
# 交互式测试
def test_model(prompt):
    """Generate a sampled continuation of *prompt* with the current model.

    Args:
        prompt: Text to condition the generation on.

    Returns:
        The generated continuation only (prompt excluded), with leading and
        trailing whitespace stripped.
    """
    # The model may have been moved to an accelerator during PPO training;
    # place the inputs on whatever device the model's parameters live on.
    device = next(model.parameters()).device
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = encoded["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=100,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            pad_token_id=tokenizer.pad_token_id
        )

    # Drop the prompt by token count rather than by character count: the
    # decoded text is not guaranteed to reproduce the prompt verbatim.
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()

# Try a few example prompts.
test_prompts = [
    "今天是美好的一天，",
    "学习人工智能让我",
    "我的梦想是"
]

# Generate a reply for each prompt and score it with the reward function.
for prompt in test_prompts:
    response = test_model(prompt)
    reward = reward_function([response])[0]
    print(f"\n提示: {prompt}")
    print(f"回复: {response}")
    print(f"奖励: {reward:.2f}")

## 8. 下一步学习

完成这个基础示例后，你已经了解了：
- PPO的基本工作流程
- 如何定义奖励函数
- 如何配置和运行PPO训练

接下来我们将学习：
1. RLHF完整训练流程
2. DPO算法实现
3. 自定义奖励模型
4. 大规模训练技巧