In [None]:
# @title 🛠️ One-Click Installation & Setup (Run once per session)
import os
import sys
import subprocess
import time

# Wall-clock start of the setup cell; read again at the end of the cell to
# report how long the whole installation took.
start_time = time.time()
# The banner's leading emoji was mis-encoded (UTF-8 bytes decoded with the
# wrong charset); restored to the intended hourglass.
print("⏳ Starting Environment Setup... (This takes ~2 minutes)")

def run_cmd(cmd, message):
    """Run one setup command, printing a progress line first.

    Args:
        cmd: The command to execute. A string is handed to the shell (the
            original behaviour, so existing callers are unaffected). A list
            or tuple of arguments is executed directly without a shell, which
            avoids any quoting problems.
        message: Short human-readable description printed before the command
            starts.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. The failing command and its exit code are printed before
            the exception propagates.
    """
    print(f"   - {message}")
    try:
        # Only string commands need shell parsing; argv lists run as-is.
        subprocess.check_call(cmd, shell=isinstance(cmd, str))
    except subprocess.CalledProcessError as e:
        print(f"   ❌ Error executing: {cmd} (exit code {e.returncode})")
        # Bare raise keeps the original traceback intact.
        raise

# --- 1. Install Core Dependencies ---
# Invoke pip through the interpreter that backs this kernel so every package
# lands in the environment the notebook actually imports from.
PIP_CMD = f'"{sys.executable}" -m pip'

# We install these first to leverage pre-built wheels
run_cmd(f"{PIP_CMD} install -q --upgrade pip", "Upgrading pip")
run_cmd(
    f"{PIP_CMD} install -q vllm transformers accelerate datasets bitsandbytes peft",
    "Installing Core ML Libraries (vLLM, Transformers, etc.)"
)

# --- 2. Install Verl (The RL Library) ---
run_cmd(
    f"{PIP_CMD} install -q git+https://github.com/volcengine/verl.git",
    "Installing Verl from Source"
)

# --- 3. Fix Numpy Conflict (Crucial for Colab) ---
# Colab uses Numpy 2.x by default, but Verl requires 1.x
run_cmd(
    f"{PIP_CMD} install -q 'numpy<2.0.0' --force-reinstall",
    "Downgrading Numpy to 1.x (Compatibility Fix)"
)

# --- 4. Auto-Patch for Fast Training (SDPA) ---
# This forces the library to use standard PyTorch attention instead of
# waiting 15 mins for Flash Attention to compile.
print("   - Patching library for SDPA (Fast Attention)...")

import importlib
import importlib.util

# verl was installed after the kernel started, so refresh the import system's
# directory caches before asking where the package lives.
importlib.invalidate_caches()
verl_spec = importlib.util.find_spec("verl")
if verl_spec is not None and verl_spec.submodule_search_locations:
    # Resolve the file from the installed package instead of assuming a
    # specific Python version's dist-packages directory.
    verl_root = list(verl_spec.submodule_search_locations)[0]
    target_file = os.path.join(verl_root, "workers", "fsdp_workers.py")
else:
    # Fall back to the historical Colab location.
    target_file = "/usr/local/lib/python3.12/dist-packages/verl/workers/fsdp_workers.py"

if os.path.exists(target_file):
    with open(target_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # The code we want to modify
    search_str = "actor_module_class.from_pretrained("
    replace_str = "actor_module_class.from_pretrained(attn_implementation='sdpa', "

    # IDEMPOTENT PATCHING: Only apply if not already present
    if replace_str in content:
        print("     ✅ Library is already patched.")
    elif search_str in content:
        new_content = content.replace(search_str, replace_str)
        try:
            # Syntax-check the patched source before touching the file: if the
            # call already passes attn_implementation=..., the injected keyword
            # would be a duplicate and the module could no longer be imported.
            compile(new_content, target_file, "exec")
        except SyntaxError as err:
            print(f"     ⚠️ Warning: Patch would produce invalid code ({err.msg}); file left unchanged.")
        else:
            with open(target_file, 'w', encoding='utf-8') as f:
                f.write(new_content)
            print("     ✅ Patch applied successfully!")
    else:
        print("     ⚠️ Warning: Could not find patch target. Verl version might have changed.")
else:
    print("     ❌ Error: Verl library not found. Installation likely failed.")

# --- 5. Set Environment Variables ---
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "true"

elapsed = int(time.time() - start_time)
print(f"🎉 Setup Complete in {elapsed} seconds! You can run training now.")

In [None]:
# @title 2. Prepare Data (GSM8K)
import datasets
import pandas as pd
import os

# Make sure the output folder for the Parquet files exists
# (a no-op when the cell is re-run).
os.makedirs("data/gsm8k", exist_ok=True)

# 1. Fetch GSM8K ("main" configuration) via the `datasets` library
dataset = datasets.load_dataset("openai/gsm8k", name="main")

# 2. Define the formatting function
# GRPO needs a "prompt" and a "ground_truth"
def process_fn(example, idx, split):
    """Convert one raw GSM8K record into the row layout used for verl training.

    Args:
        example: Mapping with the raw GSM8K fields "question" and "answer".
            The answer text is the worked solution followed by a final line
            of the form "#### <number>".
        idx: Position of the record inside its split.
        split: Name of the split ("train" or "test"); stored for bookkeeping.

    Returns:
        dict with the keys "data_source", "prompt", "ability",
        "reward_model" and "extra_info".

    Raises:
        ValueError: If the answer text contains no "#### <number>" marker.
    """
    import re  # function-scope import keeps this block self-contained

    # Standard GSM8K prompt structure
    instruction = (
        example["question"] +
        "\nAnswer the above math problem. "
        "Think step by step. Output the final answer after ####."
    )

    # The rule-based reward compares the number the model prints after "####"
    # with ground_truth, so ground_truth must be only the final number, not
    # the whole worked solution. Thousands separators are dropped ("1,072" ->
    # "1072"), mirroring verl's own GSM8K preprocessing example.
    match = re.search(r"#### (\-?[0-9\.\,]+)", example["answer"])
    if match is None:
        raise ValueError(
            f"GSM8K answer for {split}[{idx}] has no '#### <number>' marker"
        )
    final_answer = match.group(1).replace(",", "")

    return {
        # verl's default reward dispatch selects the GSM8K scorer by this exact
        # identifier; a bare "gsm8k" is not a recognised data source.
        "data_source": "openai/gsm8k",
        "prompt": [{"role": "user", "content": instruction}],
        "ability": "math",
        "reward_model": {
            "style": "rule",
            "ground_truth": final_answer
        },
        "extra_info": {"split": split, "index": idx}
    }

# 3. Apply formatting
# with_indices=True makes `map` pass each record's position as the second
# argument, which process_fn stores in extra_info.
train_dataset = dataset["train"].map(
    lambda example, idx: process_fn(example, idx, "train"), with_indices=True
)
test_dataset = dataset["test"].map(
    lambda example, idx: process_fn(example, idx, "test"), with_indices=True
)

# 4. Save to Parquet (Verl format)
train_dataset.to_parquet("data/gsm8k/train.parquet")
test_dataset.to_parquet("data/gsm8k/test.parquet")

# The status emoji was mis-encoded in the message; restored to the intended
# check mark.
print(f"✅ Data ready! Train: {len(train_dataset)}, Test: {len(test_dataset)}")
print("First example prompt:", train_dataset[0]['prompt'][0]['content'])

In [None]:
# @title 4. Run Training (Polished & Drive Integrated)
import os
import sys
from google.colab import drive

# --- ‚öôÔ∏è USER SETTINGS ---
SAVE_TO_DRIVE = True  # Set to False if you don't want to mount Drive
EXPERIMENT_NAME = "qwen-grpo-gsm8k"
MODEL_PATH = "Qwen/Qwen2.5-0.5B-Instruct"

# --- 1. Setup Storage Paths ---
if SAVE_TO_DRIVE:
    drive.mount('/content/drive')
    # Save directly to MyDrive so checkpoints survive runtime disconnects
    local_dir = f"/content/drive/MyDrive/verl_checkpoints/{EXPERIMENT_NAME}"
    print(f"üíæ Checkpoints will be saved to: {local_dir}")
else:
    local_dir = f"checkpoints/{EXPERIMENT_NAME}"
    print(f"üíæ Checkpoints will be saved locally: {local_dir}")

# --- 2. Set Environment Optimization ---
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# --- 3. Construct the Training Command ---
print("üöÄ Starting GRPO Training...")
print("Logs will stream below. (Initialize time: ~1 min)")

cmd_parts = [
    sys.executable, "-m", "verl.trainer.main_ppo",
    
    # --- Algorithm & Data ---
    "algorithm.adv_estimator=grpo",
    "data.train_files=data/gsm8k/train.parquet",
    "data.val_files=data/gsm8k/test.parquet",
    "data.train_batch_size=4",
    "data.val_batch_size=4",
    "data.max_prompt_length=512",
    "data.max_response_length=512",
    
    # --- Model Configuration ---
    f"actor_rollout_ref.model.path={MODEL_PATH}",
    "actor_rollout_ref.model.use_remove_padding=True",
    "actor_rollout_ref.model.enable_gradient_checkpointing=True",
    
    # --- Training Hyperparameters ---
    "actor_rollout_ref.rollout.n=4",            # Number of generated responses per prompt
    "actor_rollout_ref.rollout.temperature=0.8",
    "actor_rollout_ref.actor.optim.lr=1e-6",    # Learning rate
    
    # --- Batch Sizes (Critical for Stability) ---
    "actor_rollout_ref.actor.ppo_mini_batch_size=4",        
    "actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4", 
    "actor_rollout_ref.rollout.log_prob_micro_batch_size=4",

    # --- Hardware & Colab Optimizations ---
    "data.dataloader_num_workers=1",       # Low worker count to save CPU
    "+ray_kwargs.ray_init.num_cpus=8",     # Spoof CPU count to prevent Ray blocking
    "trainer.n_gpus_per_node=1",
    "trainer.nnodes=1",
    
    # --- Logging & Saving ---
    "trainer.logger=['console']",
    f"trainer.project_name='{EXPERIMENT_NAME}'",
    f"trainer.experiment_name='{EXPERIMENT_NAME}'",
    f"trainer.default_local_dir='{local_dir}'", # Where to save the model
    "++trainer.val_before_train=False"
]

# --- 4. Execute ---
cmd = " ".join(cmd_parts)
!{cmd}