# Step 1: Install Dependencies
Install all required libraries for model fine-tuning, reinforcement learning, and data processing.

In [1]:
# System packages: refresh the apt index, install the CUDA toolkit, and print the
# nvcc version so the toolkit install can be checked in the cell output.
!apt-get update
!apt install nvidia-cuda-toolkit --yes
!nvcc --version
# Python packages, pinned to exact versions.
# NOTE(review): `bitsandbytes-cuda117` is installed here and `bitsandbytes` is
# upgraded again at the end of this cell -- confirm both installs are needed.
!pip install bitsandbytes-cuda117
!pip install transformers==4.41.0
!pip install torch==2.5.1
!pip install peft==0.10.0
!pip install datasets==2.16.1
!pip install trl==0.8.1
!pip install matplotlib==3.8.0 pandas==2.2.2 numpy==1.26 tqdm==4.66.2
# accelerate and bitsandbytes are the only packages left unpinned; `--upgrade`
# pulls whatever version is latest at run time.
!pip install --upgrade accelerate bitsandbytes
# Verify installations: an ImportError below means one of the installs above failed.
import transformers
import datasets
import torch
import peft
import trl
import accelerate
import bitsandbytes
print('All dependencies installed successfully.')

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Connecting to security.ubuntu.com (91.189.91.8                                                                                                    Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Waiting for headers] [Waiting for headers] [Connecting to r2u.stat.illinois.edu (192.17.190.167)                                                                                                    Hit:3 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
0% [Waiting for headers] [Waiting for headers] [Connecting to r2u.stat.illinois.edu (192.17.190.167)                                                                                                    Hit:4 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
0% [Waiting for headers] [Waiting fo

# Step 2: Import Libraries
Import all necessary libraries for the training process.

In [1]:
import os
import numpy as np
import torch
from datetime import datetime
import json
import re
from tqdm import tqdm
import pandas as pd
import random
import logging
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model
from trl import PPOTrainer, PPOConfig
from datasets import Dataset

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

# Step 3: Download Model and Tokenizer
Download the Qwen2.5-7B-Instruct model and tokenizer from Hugging Face, save local copies of both, and wrap the model with LoRA adapters.

In [2]:
# Optional cleanup of earlier downloads (left disabled):
#!rm -rf /content/qwen2.5-7b-model /root/.cache/huggingface

# Hugging Face Hub identifier of the base checkpoint
model_name = 'Qwen/Qwen2.5-7B-Instruct'

# LoRA settings: rank-16 adapters (scaling factor 32, dropout 0.05) on the
# q_proj / v_proj modules; bias terms are not trained.
lora_config = LoraConfig(
    task_type='CAUSAL_LM',
    target_modules=['q_proj', 'v_proj'],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias='none',
)

# Download the tokenizer, reuse EOS as the padding token, and save a local copy
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained('/content/qwen2.5-7b-tokenizer')
print('Tokenizer downloaded and saved locally.')

# Download the non-quantized model: automatic device placement, half precision
# to reduce memory use.
model_load_kwargs = {
    'device_map': 'auto',
    'torch_dtype': torch.float16,
    'trust_remote_code': True,
}
model = AutoModelForCausalLM.from_pretrained(model_name, **model_load_kwargs)

# Save the model locally
model.save_pretrained('/content/qwen2.5-7b-model', safe_serialization=False)
print('Model downloaded and saved locally.')

# Apply LoRA and report how many parameters remain trainable
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer downloaded and saved locally.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



Model downloaded and saved locally.
trainable params: 5,046,272 || all params: 7,620,662,784 || trainable%: 0.06621828235983522


# Step 4: Configuration Setup
Define the configuration class for training parameters.

In [5]:
class Config:
    """Experiment settings for the Sudoku training run.

    Every attribute has a default and can be overridden by keyword, e.g.
    ``Config(max_seq_length=1024, max_steps=50)``. ``Config()`` with no
    arguments yields the default values.

    Raises:
        TypeError: if an override names an option that does not exist, so a
            typo cannot silently create an unused attribute.
    """

    def __init__(self, **overrides):
        self.max_seq_length = 3000  # token limit for prompts and generation
        self.batch_size = 1
        self.gradient_accumulation_steps = 8
        self.learning_rate = 3e-4
        self.max_steps = 500  # number of training iterations
        self.eval_every = 10  # evaluation interval, in steps
        self.dataset_path = '/content/sudoku.csv'  # Adjust path for Colab
        # Timestamped so each Config instance gets its own results directory.
        self.output_dir = f'/content/sudoku_rl_results_{datetime.now().strftime("%Y%m%d_%H%M%S")}'
        self.save_checkpoints = True
        self.plot_metrics = True
        self.log_interval = 10  # per-sample logging interval, in steps
        self.lora_rank = 16  # LoRA rank for fine-tuning

        unknown = sorted(set(overrides) - set(vars(self)))
        if unknown:
            raise TypeError(f'Unknown Config option(s): {", ".join(unknown)}')
        vars(self).update(overrides)

config = Config()

# Setup logging: messages go both to a file inside the run's output directory
# and to the notebook output.
os.makedirs(config.output_dir, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(os.path.join(config.output_dir, 'experiment_log.txt')),
        logging.StreamHandler(),
    ],
    # Without force=True, basicConfig does nothing when the root logger already
    # has handlers (typical in notebook kernels, and always the case when this
    # cell is re-run), so the log file for the new output_dir would stay empty.
    force=True,
)
logging.info(f'Experiment started at {datetime.now()}')
logging.info(f'Configuration: {vars(config)}')

# Step 5: Helper Functions
Define utility functions for grid formatting and answer extraction.

In [6]:
def format_grid(grid_str):
    """Render a flat Sudoku string as a 9x9 board with visible 3x3 boxes.

    Args:
        grid_str: the board in row-major order; expected to hold 81 characters
            (one per cell).

    Returns:
        A multi-line string. Each row looks like ``5 3 0 | 0 7 0 | 0 0 0``
        (21 characters), and a rule of 21 dashes follows the third and sixth
        rows, so the dashes line up with the rows and the bars mark the box
        boundaries.
    """
    lines = []
    for row_idx in range(9):
        row = grid_str[row_idx * 9:(row_idx + 1) * 9]
        # Digits inside a box are separated by spaces; boxes by ' | '.
        boxes = [' '.join(row[col:col + 3]) for col in range(0, 9, 3)]
        lines.append(' | '.join(box for box in boxes if box))
        if row_idx % 3 == 2 and row_idx < 8:
            lines.append('-' * 21)
    return '\n'.join(lines)

def extract_answer(response):
    """Return the text inside the first ``<answer>...</answer>`` pair.

    The match may span several lines; surrounding whitespace is stripped.
    An empty string is returned when no complete pair is present.
    """
    answer_pattern = re.compile(r'<answer>(.*?)</answer>', re.DOTALL)
    found = answer_pattern.search(response)
    if found is None:
        return ''
    return found.group(1).strip()

def normalize_grid(grid_str):
    """Drop every non-digit character from ``grid_str``, keeping digit order."""
    return ''.join(filter(str.isdigit, grid_str))

# Step 6: Load and Prepare Dataset
Load the Sudoku dataset and split it into training and evaluation sets.

In [None]:
def load_sudoku_data(config):
    """Load Sudoku puzzles from ``config.dataset_path`` and split them 80/20.

    The CSV must provide ``puzzle`` and ``solution`` columns. Rows whose puzzle
    or solution is not an 81-character string are skipped (with a warning), as
    they cannot be formatted or scored. The first 80% of the remaining rows, in
    file order, become training examples and the rest evaluation examples.

    Args:
        config: object exposing a ``dataset_path`` attribute.

    Returns:
        ``(train_data, eval_data)``: two lists of dicts with the keys
        ``prompt``, ``puzzle``, ``solution``, ``formatted_puzzle``,
        ``formatted_solution`` and ``difficulty``. Both lists are empty when
        the file cannot be read or a required column is missing (the problem
        is logged instead of raised).
    """
    try:
        # dtype=str keeps grids as text; numeric inference would turn a digit
        # string into a number and drop its leading zeros.
        df = pd.read_csv(config.dataset_path, dtype=str)
        logging.info(f'Loaded {len(df)} Sudoku puzzles from {config.dataset_path}')
    except FileNotFoundError:
        logging.error(f'File {config.dataset_path} not found!')
        return [], []
    except Exception as e:
        logging.error(f'Error loading Sudoku data: {e}')
        return [], []

    missing_columns = [col for col in ('puzzle', 'solution') if col not in df.columns]
    if missing_columns:
        logging.error(f'Dataset {config.dataset_path} is missing required column(s): {missing_columns}')
        return [], []

    examples = []
    skipped = 0
    for puzzle_str, solution_str in zip(df['puzzle'], df['solution']):
        well_formed = (
            isinstance(puzzle_str, str) and isinstance(solution_str, str)
            and len(puzzle_str) == 81 and len(solution_str) == 81
        )
        if not well_formed:
            skipped += 1
            continue
        formatted_puzzle = format_grid(puzzle_str)
        prompt = f'Solve this 9x9 Sudoku puzzle:\n{formatted_puzzle}\nProvide step-by-step reasoning in <think> tags and the final grid in <answer> tags.'
        examples.append({
            'prompt': prompt,
            'puzzle': puzzle_str,
            'solution': solution_str,
            'formatted_puzzle': formatted_puzzle,
            'formatted_solution': format_grid(solution_str),
            # 1 when at least 50 cells are given, otherwise 2
            'difficulty': 1 if sum(1 for c in puzzle_str if c != '0') >= 50 else 2,
        })
    if skipped:
        logging.warning(f'Skipped {skipped} malformed row(s) (puzzle/solution must be 81-character strings)')

    # Split by position in the file so the result does not depend on the
    # DataFrame's index labels.
    eval_size = int(len(examples) * 0.2)
    train_size = len(examples) - eval_size
    train_data = examples[:train_size]
    eval_data = examples[train_size:]

    logging.info(f'Split into {len(train_data)} training examples and {len(eval_data)} evaluation examples')
    return train_data, eval_data

# Read the CSV and split it into training / evaluation example lists
train_data, eval_data = load_sudoku_data(config)

# Build a Hugging Face Dataset from each split
train_dataset, eval_dataset = (
    Dataset.from_list(split_records) for split_records in (train_data, eval_data)
)

# Step 7: Define Reward Functions
Implement reward functions to evaluate model outputs.

In [None]:
def tags_presence_reward_func(completions):
    """Score each completion by which of the four expected tags it contains.

    Each of ``<think>``, ``</think>``, ``<answer>`` and ``</answer>`` found
    anywhere in the text adds 0.25, so a score lies between 0.0 and 1.0.

    Returns:
        A list of floats, one per completion, in input order.
    """
    expected_tags = ('<think>', '</think>', '<answer>', '</answer>')
    return [
        sum((0.25 for tag in expected_tags if tag in text), 0.0)
        for text in completions
    ]

def tags_order_reward_func(completions):
    """Score how well each completion orders its think/answer tags.

    Three checks are made on the first occurrence of each tag:

    1. ``<think>`` appears before ``</think>``;
    2. ``<answer>`` appears before ``</answer>``;
    3. ``</think>`` appears before ``<answer>``.

    Each satisfied check is worth one third, so only a completion laid out as
    ``<think>...</think> ... <answer>...</answer>`` reaches 1.0. Previously two
    satisfied checks already scored 1.0, which gave a completion with the
    answer block placed before the think block the same reward as a correctly
    ordered one.

    Returns:
        A list of floats in [0.0, 1.0], one per completion, in input order.
    """
    rewards = []
    for r in completions:
        think_start = r.find('<think>')
        think_end = r.find('</think>')
        answer_start = r.find('<answer>')
        answer_end = r.find('</answer>')
        checks = (
            think_start != -1 and think_end != -1 and think_start < think_end,
            answer_start != -1 and answer_end != -1 and answer_start < answer_end,
            think_end != -1 and answer_start != -1 and think_end < answer_start,
        )
        rewards.append(sum(checks) / len(checks))
    return rewards

def _sudoku_unit_score(cells):
    """Score one row, column or box: 0 on duplicates, else filled/9 plus 1 when complete."""
    filled = cells[cells != 0]
    if len(filled) != len(set(filled.tolist())):
        return 0.0
    score = len(filled) / 9
    if len(filled) == 9:
        score += 1  # bonus for a fully filled, duplicate-free unit
    return score


def rule_compliance_reward_func(completions):
    """Score how well the grid in each completion's answer obeys Sudoku rules.

    The answer text is reduced to its digits; anything other than exactly 81
    digits scores 0.0. Otherwise each of the 27 units (9 rows, 9 columns,
    9 boxes) earns ``filled / 9`` if its non-zero digits are all different,
    plus 1 when the unit is completely filled, i.e. at most 2 per unit.

    The total is divided by the true maximum of 54 (27 units x 2). The earlier
    normaliser of 27 let the score reach the 1.0 cap while up to half of the
    available credit was still missing, so grids with invalid units could
    receive the full reward.

    Returns:
        A list of floats in [0.0, 1.0], one per completion, in input order.
    """
    rewards = []
    for r in completions:
        answer = extract_answer(r)
        digits = normalize_grid(answer)
        if len(digits) != 81:
            rewards.append(0.0)
            continue
        try:
            grid = np.array([int(d) for d in digits]).reshape(9, 9)
            units = list(grid) + list(grid.T) + [
                grid[top:top + 3, left:left + 3].flatten()
                for top in (0, 3, 6)
                for left in (0, 3, 6)
            ]
            total_score = sum(_sudoku_unit_score(unit) for unit in units)
            max_score = 2 * len(units)
            rewards.append(min(total_score / max_score, 1.0))
        except Exception as e:
            logging.error(f'Error in rule compliance check: {e}')
            rewards.append(0.0)
    return rewards

def enhanced_partial_answer_reward_func(completions, puzzle_str, solution_str):
    """Reward correctly filled empty cells, provided the given clues are untouched.

    For each (completion, puzzle, solution) triple the digits of the answer
    block are extracted. The score is 0.0 when the answer does not contain
    exactly 81 digits or when any clue (a non-``'0'`` puzzle cell) differs from
    the answer. Otherwise every empty puzzle cell whose answer digit matches
    the solution contributes 0.25.

    Args:
        completions: model outputs.
        puzzle_str: puzzle strings, aligned with ``completions``.
        solution_str: solution strings, aligned with ``completions``.

    Returns:
        A list of floats, one per triple, in input order.
    """
    rewards = []
    for completion, clues, target in zip(completions, puzzle_str, solution_str):
        filled = normalize_grid(extract_answer(completion))
        if len(filled) != 81:
            rewards.append(0.0)
            continue

        clue_overwritten = any(
            given != '0' and given != got for given, got in zip(clues, filled)
        )
        if clue_overwritten:
            rewards.append(0.0)
            continue

        hits = sum(
            1
            for pos, given in enumerate(clues)
            if given == '0' and filled[pos] == target[pos]
        )
        rewards.append(hits * 0.25)
    return rewards

def combined_reward_function(completions, puzzle_str, solution_str):
    """Combine the individual reward functions into one weighted score per completion.

    Weights: tag presence 0.5, tag order 0.5, rule compliance 2.0, partial
    answer 3.0. Only the partial-answer function receives the puzzle and
    solution strings. If a reward function raises, the error is logged and
    that function contributes nothing further to the totals.

    Returns:
        A list of floats with one total per completion, in input order.
    """
    # (function, weight, whether it also needs the puzzle/solution strings)
    weighted_components = (
        (tags_presence_reward_func, 0.5, False),
        (tags_order_reward_func, 0.5, False),
        (rule_compliance_reward_func, 2.0, False),
        (enhanced_partial_answer_reward_func, 3.0, True),
    )
    totals = [0.0] * len(completions)
    for scorer, weight, needs_grids in weighted_components:
        try:
            extra_args = (puzzle_str, solution_str) if needs_grids else ()
            scores = scorer(completions, *extra_args)
            for position, score in enumerate(scores):
                totals[position] += score * weight
        except Exception as e:
            logging.error(f'Error in {scorer.__name__}: {e}')
    return totals

# Step 8: Training Loop with PPO
Set up and run the PPO training loop for reinforcement learning.

In [None]:
# Imported locally so this cell stays self-contained: PPOTrainer only accepts
# models wrapped with a value head.
from trl import AutoModelForCausalLMWithValueHead

# PPO Configuration
# PPOConfig requires batch_size to be a multiple of
# mini_batch_size * gradient_accumulation_steps. Each PPO step therefore
# collects batch_size * gradient_accumulation_steps rollouts and optimises them
# in mini-batches of config.batch_size with gradient accumulation.
ppo_batch_size = config.batch_size * config.gradient_accumulation_steps
ppo_config = PPOConfig(
    model_name=model_name,
    learning_rate=config.learning_rate,
    batch_size=ppo_batch_size,
    mini_batch_size=config.batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    ppo_epochs=1,
    max_grad_norm=1.0
)

# PPO needs value estimates, so wrap the LoRA model with a value head. The
# wrapper shares the underlying weights with `model`, so `model.generate` and
# `model.save_pretrained` below see the trained adapters.
ppo_model = AutoModelForCausalLMWithValueHead.from_pretrained(model)

# Initialize PPO Trainer
ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=ppo_model,
    ref_model=None,  # for a PEFT model TRL uses the base weights (adapters disabled) as reference
    tokenizer=tokenizer,
    dataset=train_dataset
)

device = ppo_trainer.accelerator.device
generation_kwargs = {
    'max_length': config.max_seq_length,  # total length: prompt + completion
    'pad_token_id': tokenizer.pad_token_id,
}

if not train_data:
    raise RuntimeError('No training examples loaded; check config.dataset_path before training.')

# Training Loop
rewards_history = []
for step in tqdm(range(config.max_steps), desc='Training'):
    # PPOTrainer.step expects exactly ppo_config.batch_size rollouts, so sample
    # with replacement when the dataset is smaller than one PPO batch.
    if len(train_data) >= ppo_batch_size:
        batch = random.sample(train_data, ppo_batch_size)
    else:
        batch = random.choices(train_data, k=ppo_batch_size)

    query_tensors = []
    response_tensors = []
    reward_tensors = []
    step_rewards = []

    for data in batch:
        prompt = data['prompt']
        inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=config.max_seq_length)
        # PPOTrainer works with 1-D token tensors (one per sample).
        query_ids = inputs['input_ids'][0].to(device)
        # return_prompt=False: keep only the generated tokens. The prompt text
        # itself mentions the <think>/<answer> tags, so scoring prompt +
        # completion would hand out tag rewards for free.
        response_ids = ppo_trainer.generate([query_ids], return_prompt=False, **generation_kwargs)[0]
        response = tokenizer.decode(response_ids, skip_special_tokens=True)
        completions = [response]
        puzzle_str = [data['puzzle']]
        solution_str = [data['solution']]
        total_reward = combined_reward_function(completions, puzzle_str, solution_str)[0]
        step_rewards.append(total_reward)

        query_tensors.append(query_ids)
        response_tensors.append(response_ids)
        reward_tensors.append(torch.tensor(total_reward, dtype=torch.float, device=device))

        if (step + 1) % config.log_interval == 0:
            logging.info(f'Step {step + 1}, Prompt: {prompt[:50]}..., Reward: {total_reward}')

    # One PPO update over the collected rollouts (lists of per-sample tensors).
    ppo_trainer.step(query_tensors, response_tensors, reward_tensors)

    if step_rewards:
        avg_reward = np.mean(step_rewards)
        rewards_history.append(avg_reward)
        logging.info(f'Step {step + 1} completed. Average reward: {avg_reward}')

    if (step + 1) % config.eval_every == 0:
        logging.info(f'Step {step + 1}: Evaluating...')
        eval_rewards = []
        eval_batch = random.sample(eval_data, min(10, len(eval_data)))
        with torch.no_grad():
            for data in eval_batch:
                prompt = data['prompt']
                inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=config.max_seq_length).to(device)
                output_ids = model.generate(**inputs, **generation_kwargs)
                # generate() returns prompt + completion; score the completion only.
                prompt_length = inputs['input_ids'].shape[1]
                response = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)
                completions = [response]
                puzzle_str = [data['puzzle']]
                solution_str = [data['solution']]
                total_reward = combined_reward_function(completions, puzzle_str, solution_str)[0]
                eval_rewards.append(total_reward)
        if eval_rewards:
            logging.info(f'Evaluation Reward: {np.mean(eval_rewards)}')

    # Save checkpoint
    if config.save_checkpoints and (step + 1) % config.eval_every == 0:
        checkpoint_dir = os.path.join(config.output_dir, f'checkpoint_step_{step + 1}')
        model.save_pretrained(checkpoint_dir)
        tokenizer.save_pretrained(checkpoint_dir)
        logging.info(f'Saved checkpoint at {checkpoint_dir}')

# Save final model
final_model_dir = os.path.join(config.output_dir, 'final_model')
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)
logging.info(f'Training completed. Final model saved to {final_model_dir}')

# Step 9: Plot Results
Visualize the training rewards.

In [None]:
if config.plot_metrics:
    # Draw the per-step average training reward, save it next to the other
    # run artifacts, then display it.
    fig, ax = plt.subplots()
    ax.plot(rewards_history)
    ax.set_xlabel('Training Step')
    ax.set_ylabel('Average Reward')
    ax.set_title('Training Reward Over Time')
    reward_curve_path = os.path.join(config.output_dir, 'reward_curve.png')
    fig.savefig(reward_curve_path)
    plt.show()

# Notes
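- Upload `sudoku.csv` to Colab before running (via Files tab). The file must contain `puzzle` and `solution` columns, each holding an 81-character digit string in row-major order; in `puzzle`, `0` marks an empty cell.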
- Ensure you have a GPU runtime enabled (Runtime > Change runtime type > GPU).
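- The model is loaded in float16, so its roughly 7.6B parameters alone take about 15 GB. This leaves almost no headroom on a T4 GPU (16GB): expect to reduce `max_seq_length` (e.g., to 1024) or switch to a GPU with more memory if you encounter OOM errors.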
- Model and tokenizer are downloaded in Step 3 and saved locally for reuse.