# Step 1: Install Dependencies

In [None]:
!apt-get update
!apt install nvidia-cuda-toolkit --yes
!nvcc --version
!pip install unsloth
!pip install transformers==4.41.0
!pip install torch==2.5.1
!pip install peft==0.10.0
!pip install datasets==2.16.1
!pip install matplotlib==3.8.0 pandas==2.2.2 numpy==1.26 tqdm==4.66.2
!pip install --upgrade accelerate bitsandbytes

# Verify the installation
import unsloth
import transformers
import torch
import peft
import datasets
print('All dependencies installed successfully.')

# Step 2: Import Libraries

In [None]:
import os
import numpy as np
import torch
from datetime import datetime
import json
import re
from tqdm import tqdm
import pandas as pd
import random
import logging
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
from datasets import Dataset
from unsloth import GRPOTrainer, GRPOConfig  # NOTE(review): assumes unsloth exposes these names -- confirm against the installed version

# Step 3: Download Model and Tokenizer

In [None]:
# Hub identifier of the base model that gets fine-tuned.
model_name = 'Qwen/Qwen2.5-7B-Instruct'

# Download and configure the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained('/content/qwen2.5-7b-tokenizer')
print('Tokenizer downloaded and saved locally.')

# Download the model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    torch_dtype=torch.float16,
    trust_remote_code=True
)
# Keep a local copy of the base weights (saved before the LoRA wrap below).
# NOTE(review): safe_serialization=False presumably selects the
# non-safetensors format -- confirm this is intended.
model.save_pretrained('/content/qwen2.5-7b-model', safe_serialization=False)
print('Model downloaded and saved locally.')

# Apply LoRA.
# The rank is hard-coded here; Config.lora_rank (defined in a later cell)
# holds the same value but is not read by this cell.
lora_config = LoraConfig(
    r=16,  # LoRA rank
    lora_alpha=32,
    target_modules=['q_proj', 'v_proj'],
    lora_dropout=0.05,
    bias='none',
    task_type='CAUSAL_LM'
)
# From here on `model` refers to the PEFT-wrapped model.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Step 4: Configuration Setup

In [None]:
class Config:
    """Hyper-parameters, paths and switches for the Sudoku RL experiment.

    Every setting is a plain instance attribute, so ``vars(config)`` yields
    the full configuration in declaration order.
    """

    def __init__(self):
        # Timestamp captured at construction; makes each run's output folder unique.
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        self.max_seq_length = 3000  # maximum sequence length (tokens)
        self.batch_size = 1  # puzzles sampled per training step
        self.gradient_accumulation_steps = 8  # gradient accumulation steps
        self.learning_rate = 3e-4  # optimizer learning rate
        self.max_steps = 500  # total number of training steps
        self.eval_every = 10  # evaluate every N steps
        self.dataset_path = '/content/sudoku.csv'  # CSV file with the puzzles
        self.output_dir = f'/content/sudoku_rl_results_{run_stamp}'  # per-run results folder
        self.save_checkpoints = True  # write a checkpoint at each evaluation
        self.plot_metrics = True  # draw the reward curve after training
        self.log_interval = 10  # log the training reward every N steps
        self.lora_rank = 16  # LoRA rank

# Single configuration instance used by this cell and read by later cells.
config = Config()

# Set up logging: the output directory must exist before the FileHandler
# opens its log file inside it.
os.makedirs(config.output_dir, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    # Send every record both to experiment_log.txt and to the stream handler.
    handlers=[logging.FileHandler(os.path.join(config.output_dir, 'experiment_log.txt')), logging.StreamHandler()]
)
# Record the start time and the full configuration for this run.
logging.info(f'Experiment started at {datetime.now()}')
logging.info(f'Configuration: {vars(config)}')

# Step 5: Helper Functions

In [None]:
def format_grid(grid_str):
    """Render a row-major Sudoku string as a readable 9x9 grid.

    Cells inside a 3x3 box are separated by a single space, boxes within a
    row by ``' | '``, and the three bands of boxes by a line of 21 dashes --
    exactly the width of a rendered row such as ``1 2 3 | 4 5 6 | 7 8 9``.
    (Previously every cell was joined with ``' | '``, producing 33-character
    rows that did not line up with the 21-dash separators.)

    Args:
        grid_str: One character per cell, row by row. Only the first 81
            characters are used.

    Returns:
        The formatted grid as one newline-joined string.
    """
    rows = [grid_str[start:start + 9] for start in range(0, 81, 9)]
    lines = []
    for row_idx, row in enumerate(rows):
        # Split the row into its (up to) three box segments.
        boxes = [' '.join(row[col:col + 3]) for col in range(0, len(row), 3)]
        lines.append(' | '.join(boxes))
        # Horizontal rule after the 3rd and 6th rows, but not after the last.
        if row_idx % 3 == 2 and row_idx < 8:
            lines.append('-' * 21)
    return '\n'.join(lines)

def extract_answer(response):
    """Return the text of the first well-formed ``<answer>...</answer>`` block.

    A block is well-formed when no further ``<answer>`` opening tag occurs
    between its opening and closing tags. This keeps a stray, unclosed
    ``<answer>`` that appears earlier in the text (for example the literal
    ``<answer>`` mentioned in the task prompt) from swallowing everything up
    to the real answer's closing tag.

    Args:
        response: Text to search.

    Returns:
        The block's content with surrounding whitespace stripped, or ``''``
        when no well-formed block exists.
    """
    # (?:(?!<answer>).)*? consumes characters lazily but refuses to step over
    # another opening tag, so the match always starts at the innermost one.
    match = re.search(r'<answer>((?:(?!<answer>).)*?)</answer>', response, re.DOTALL)
    return match.group(1).strip() if match else ''

def normalize_grid(grid_str):
    """Strip a grid string down to its ASCII digits ``0``-``9``.

    Only ASCII digits are kept. ``str.isdigit()`` also accepts characters
    such as superscript ``²`` that ``int()`` rejects and that can never equal
    a character of an ASCII solution string, so they are dropped like any
    other formatting character.

    Args:
        grid_str: Arbitrary text containing a grid (separators, spaces and
            newlines are allowed).

    Returns:
        The ASCII digits of ``grid_str`` in their original order.
    """
    return ''.join(ch for ch in grid_str if ch in '0123456789')

# Step 6: Load and Prepare Dataset (Optimized)

In [None]:
def load_sudoku_data(config):
    """Load the Sudoku CSV and split it into training and evaluation examples.

    The CSV must provide ``puzzle`` and ``solution`` columns. The first 80%
    of the rows (by position in the file) become training examples and the
    remaining 20% evaluation examples.

    Args:
        config: Object whose ``dataset_path`` attribute is the CSV path.

    Returns:
        A ``(train_data, eval_data)`` tuple of lists of dicts with the keys
        ``prompt``, ``puzzle``, ``solution``, ``formatted_puzzle``,
        ``formatted_solution`` and ``difficulty``. Both lists are empty when
        the file is missing or cannot be read (the error is logged).
    """
    try:
        # Read every column as text: grids are digit strings in which '0'
        # marks a blank cell, so numeric parsing must never get the chance
        # to drop leading zeros.
        df = pd.read_csv(config.dataset_path, dtype=str)
        logging.info(f'Loaded {len(df)} Sudoku puzzles from {config.dataset_path}')
    except FileNotFoundError:
        logging.error(f'File {config.dataset_path} not found!')
        return [], []
    except Exception as e:
        logging.error(f'Error loading Sudoku data: {e}')
        return [], []

    total_size = len(df)
    eval_size = int(total_size * 0.2)
    train_size = total_size - eval_size

    train_data = []
    eval_data = []
    # Split on row position (not the index label) and iterate the two
    # columns directly instead of building a Series per row.
    for position, (puzzle_str, solution_str) in enumerate(zip(df['puzzle'], df['solution'])):
        formatted_puzzle = format_grid(puzzle_str)
        prompt = f'Solve this 9x9 Sudoku puzzle:\n{formatted_puzzle}\nProvide step-by-step reasoning in <think> tags and the final grid in <answer> tags.'
        # Difficulty is derived from the number of given (non-'0') cells:
        # more clues means an easier puzzle.
        clues = sum(1 for c in puzzle_str if c != '0')
        difficulty = 1 if clues >= 50 else 2 if clues >= 40 else 3 if clues >= 30 else 4
        example = {
            'prompt': prompt,
            'puzzle': puzzle_str,
            'solution': solution_str,
            'formatted_puzzle': formatted_puzzle,
            'formatted_solution': format_grid(solution_str),
            'difficulty': difficulty,
        }
        target = train_data if position < train_size else eval_data
        target.append(example)

    logging.info(f'Split into {len(train_data)} training examples and {len(eval_data)} evaluation examples')
    return train_data, eval_data

# Load the puzzles once; the plain lists are kept because the training loop
# samples from them directly, and Dataset copies are built alongside them.
# Both lists are empty if loading failed (see load_sudoku_data).
train_data, eval_data = load_sudoku_data(config)
train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)

# Step 7: Define Reward Functions

In [None]:
def tags_presence_reward_func(completions):
    """Score each completion by how many of the four format tags it contains.

    Each of ``<think>``, ``</think>``, ``<answer>`` and ``</answer>`` found
    anywhere in the text is worth 0.25, so scores range from 0.0 to 1.0.

    Args:
        completions: Iterable of completion strings.

    Returns:
        A list with one float reward per completion.
    """
    expected_tags = ('<think>', '</think>', '<answer>', '</answer>')
    return [
        0.25 * sum(tag in text for tag in expected_tags)
        for text in completions
    ]

def tags_order_reward_func(completions):
    """Score each completion by whether its tags appear in the right order.

    The expected order is ``<think> ... </think> ... <answer> ... </answer>``,
    judged on the first occurrence of each tag. A completion in that order
    scores 1.0. Otherwise it earns 0.25 for each of these that holds:

    * ``<think>`` precedes ``</think>``
    * ``<answer>`` precedes ``</answer>``
    * ``</think>`` precedes ``<answer>``

    All three together imply the full order, so partial credit tops out at
    0.5 and a misordered completion can no longer tie with a correct one.
    (Previously each condition was worth 0.5, so any two of them -- e.g. the
    answer block placed before the think block -- already scored 1.0.)

    Args:
        completions: Iterable of completion strings.

    Returns:
        A list with one float reward per completion.
    """
    rewards = []
    for text in completions:
        think_open = text.find('<think>')
        think_close = text.find('</think>')
        answer_open = text.find('<answer>')
        answer_close = text.find('</answer>')

        # find() returns -1 for a missing tag, so "-1 < a < b" checks that
        # both tags exist and are in order in one comparison chain.
        think_paired = -1 < think_open < think_close
        answer_paired = -1 < answer_open < answer_close
        think_before_answer = -1 < think_close < answer_open

        if think_paired and answer_paired and think_before_answer:
            rewards.append(1.0)
        else:
            rewards.append(0.25 * (think_paired + answer_paired + think_before_answer))
    return rewards

def rule_compliance_reward_func(completions):
    """Score each completion by the share of Sudoku units its answer satisfies.

    The answer grid is taken from the completion's ``<answer>`` block. Each
    of the 27 units (9 rows, 9 columns, 9 boxes) counts as valid only when it
    contains every digit 1-9 exactly once. Checking for nine distinct values
    is not enough, because a unit holding 0-8 would pass while still having a
    blank cell. The reward is ``valid_units / 27``.

    Args:
        completions: Iterable of completion strings.

    Returns:
        A list with one float reward per completion; 0.0 when the answer
        does not contain exactly 81 digits or cannot be parsed.
    """
    required = set(range(1, 10))
    total_units = 27  # 9 rows + 9 columns + 9 boxes
    rewards = []
    for text in completions:
        digits = normalize_grid(extract_answer(text))
        if len(digits) != 81:
            rewards.append(0.0)
            continue
        try:
            grid = np.array([int(d) for d in digits]).reshape(9, 9)
        except ValueError as e:
            # int() can still reject a character the normaliser let through.
            logging.error(f'Error in rule compliance check: {e}')
            rewards.append(0.0)
            continue

        units = list(grid)  # rows
        units.extend(grid.T)  # columns
        units.extend(
            grid[r:r + 3, c:c + 3].ravel()
            for r in (0, 3, 6)
            for c in (0, 3, 6)
        )  # 3x3 boxes
        valid_units = sum(set(unit.tolist()) == required for unit in units)
        rewards.append(valid_units / total_units)
    return rewards

def enhanced_partial_answer_reward_func(completions, puzzle_str, solution_str):
    """Reward correctly filled blanks, provided the given clues are untouched.

    For each (completion, puzzle, solution) triple the answer grid is taken
    from the completion's ``<answer>`` block. The reward is 0.0 when the
    answer does not contain exactly 81 digits or when any given clue (a
    non-'0' puzzle cell) differs in the answer. Otherwise it is 0.25 for
    every originally blank cell whose digit matches the solution.

    Args:
        completions: Iterable of completion strings.
        puzzle_str: Puzzle strings aligned with ``completions``; '0' marks a blank.
        solution_str: Solution strings aligned with ``completions``.

    Returns:
        A list with one float reward per triple.
    """
    rewards = []
    for text, puzzle, solution in zip(completions, puzzle_str, solution_str):
        digits = normalize_grid(extract_answer(text))
        if len(digits) != 81:
            rewards.append(0.0)
            continue

        # Any change to a pre-filled cell voids the whole answer.
        clue_altered = any(
            given != '0' and given != filled
            for given, filled in zip(puzzle, digits)
        )
        if clue_altered:
            rewards.append(0.0)
            continue

        solved_blanks = sum(
            1
            for pos, cell in enumerate(puzzle)
            if cell == '0' and digits[pos] == solution[pos]
        )
        rewards.append(solved_blanks * 0.25)
    return rewards

def combined_reward_function(completions, puzzle_str, solution_str):
    """Combine the individual reward functions into one weighted score.

    Each component's rewards are multiplied by its weight and summed per
    completion. A component that raises is logged and skipped, so one
    failing component does not discard the others' contributions.

    Args:
        completions: List of completion strings.
        puzzle_str: Puzzle strings aligned with ``completions``.
        solution_str: Solution strings aligned with ``completions``.

    Returns:
        A list with one float total per completion.
    """
    weighted_components = (
        (tags_presence_reward_func, 0.5),  # format: tags are present
        (tags_order_reward_func, 0.5),  # format: tags are in order
        (rule_compliance_reward_func, 2.0),  # grid structure
        (enhanced_partial_answer_reward_func, 3.0),  # accuracy
    )
    totals = [0.0] * len(completions)
    for component, weight in weighted_components:
        # Only the accuracy component needs the puzzle and its solution.
        if component == enhanced_partial_answer_reward_func:
            call_args = (completions, puzzle_str, solution_str)
        else:
            call_args = (completions,)
        try:
            scores = component(*call_args)
            for idx, score in enumerate(scores):
                totals[idx] += score * weight
        except Exception as e:
            logging.error(f'Error in {component.__name__}: {e}')
    return totals

# Step 8: Training Loop with GRPO (Optimized)

In [None]:
# GRPO configuration.
# NOTE(review): the keyword arguments below, and the generate()/step() methods
# used in the loop, are taken on trust from the imported GRPOConfig/GRPOTrainer
# -- confirm them against the installed library.
grpo_config = GRPOConfig(
    model_name=model_name,
    learning_rate=config.learning_rate,
    batch_size=config.batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    max_steps=config.max_steps,
    ppo_epochs=1,
    max_grad_norm=1.0
)

# Initialise the GRPO trainer.
grpo_trainer = GRPOTrainer(
    config=grpo_config,
    model=model,
    tokenizer=tokenizer,
    dataset=train_dataset
)


def _completion_only(prompt_ids, output_ids):
    """Return ``output_ids`` without a leading copy of ``prompt_ids``.

    Causal-LM generation returns the prompt followed by the new tokens. The
    prompt text itself mentions ``<think>`` and ``<answer>``, so scoring the
    full sequence would reward the prompt's own tags and break answer
    extraction. The prefix is removed only when it really matches, which
    leaves the output untouched if the generator already returned the
    completion alone.
    """
    prompt_len = prompt_ids.shape[-1]
    head = output_ids[:prompt_len]
    head = head.tolist() if hasattr(head, 'tolist') else list(head)
    if head == prompt_ids.tolist():
        return output_ids[prompt_len:]
    return output_ids


# Fail early with a clear message instead of an obscure tokenizer error when
# the dataset could not be loaded.
if not train_data:
    raise RuntimeError('No training data loaded; check config.dataset_path before training.')

# Training loop: one randomly sampled batch per step.
rewards_history = []
for step in tqdm(range(config.max_steps), desc='Training'):
    batch = random.sample(train_data, min(config.batch_size, len(train_data)))
    prompts = [data['prompt'] for data in batch]
    puzzle_str = [data['puzzle'] for data in batch]
    solution_str = [data['solution'] for data in batch]

    # Generate model outputs.
    inputs = tokenizer(prompts, return_tensors='pt', truncation=True, max_length=config.max_seq_length).to('cuda')
    response_ids = grpo_trainer.generate(inputs['input_ids'], max_length=config.max_seq_length)
    # Score only the generated continuation, never the echoed prompt.
    # Assumes one generated sequence per prompt, as the reward functions do.
    completions = [
        tokenizer.decode(_completion_only(prompt_ids, ids), skip_special_tokens=True)
        for prompt_ids, ids in zip(inputs['input_ids'], response_ids)
    ]

    # Compute rewards.
    rewards = combined_reward_function(completions, puzzle_str, solution_str)
    rewards_tensor = torch.tensor(rewards, dtype=torch.float).to('cuda')

    # Update the model (the trainer still receives the unmodified sequences).
    grpo_trainer.step(inputs['input_ids'], response_ids, rewards_tensor)

    # Logging and evaluation.
    avg_reward = np.mean(rewards)
    rewards_history.append(avg_reward)
    if (step + 1) % config.log_interval == 0:
        logging.info(f'Step {step + 1}, Average Reward: {avg_reward}')

    if (step + 1) % config.eval_every == 0:
        eval_rewards = []
        eval_batch = random.sample(eval_data, min(10, len(eval_data)))
        for data in eval_batch:
            eval_inputs = tokenizer(data['prompt'], return_tensors='pt', truncation=True, max_length=config.max_seq_length).to('cuda')
            eval_ids = model.generate(**eval_inputs, max_length=config.max_seq_length)
            response = tokenizer.decode(
                _completion_only(eval_inputs['input_ids'][0], eval_ids[0]),
                skip_special_tokens=True,
            )
            reward = combined_reward_function([response], [data['puzzle']], [data['solution']])[0]
            eval_rewards.append(reward)
        logging.info(f'Step {step + 1}: Evaluation Reward: {np.mean(eval_rewards)}')

        # Checkpoints are written on the evaluation schedule.
        if config.save_checkpoints:
            checkpoint_dir = os.path.join(config.output_dir, f'checkpoint_step_{step + 1}')
            model.save_pretrained(checkpoint_dir)
            tokenizer.save_pretrained(checkpoint_dir)
            logging.info(f'Saved checkpoint at {checkpoint_dir}')

# Save the final model.
final_model_dir = os.path.join(config.output_dir, 'final_model')
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
logging.info(f'Training completed. Final model saved to {final_model_dir}')

# Step 9: Plot Results

In [None]:
# Plot the per-step average training reward collected in rewards_history.
if config.plot_metrics:
    plt.plot(rewards_history)
    plt.xlabel('Training Step')
    plt.ylabel('Average Reward')
    plt.title('Training Reward Over Time')
    # Write the image into the run's output directory before displaying it.
    plt.savefig(os.path.join(config.output_dir, 'reward_curve.png'))
    plt.show()