# Train a Reasoning LLM with GRPO (R1-Zero Replication)

This notebook trains Qwen2.5-1.5B using pure RL (GRPO) to develop reasoning capabilities,
replicating the DeepSeek-R1-Zero experiment at small scale.

**Requirements:** An A100 GPU (40GB or 80GB) is recommended. The config auto-adjusts to the detected GPU memory and falls back to a minimal configuration on smaller GPUs.

## 1. Setup

In [None]:
# Install dependencies
# Runs pip through the notebook's shell escape (`!`); `-q` reduces pip's output.
# No versions are pinned here, so each run installs whatever pip resolves at that time.
!pip install -q torch transformers trl accelerate datasets math-verify wandb tensorboard pyyaml

In [None]:
# Clone the repo (or upload files)
import os
REPO_URL = ""  # Set your repo URL here if using git
PROJECT_DIR = "/content/reson-llm"

if REPO_URL and not os.path.exists(PROJECT_DIR):
    !git clone {REPO_URL} {PROJECT_DIR}
elif not os.path.exists(PROJECT_DIR):
    print("Please upload the reson-llm project files or set REPO_URL")

os.chdir(PROJECT_DIR)
print(f"Working directory: {os.getcwd()}")

In [None]:
# GPU check and memory-based config selection
# Sets GPU_TIER ("80gb", "40gb" or "small") from the total memory of CUDA
# device 0. This cell only chooses the tier; the printed G / max_completion
# values are informational and nothing else is configured here.
import torch

if not torch.cuda.is_available():
    raise RuntimeError("No GPU detected! This notebook requires an A100 GPU.")

gpu_name = torch.cuda.get_device_name(0)
# The device-properties field is `total_memory` (in bytes); `total_mem` does
# not exist and raised AttributeError. Dividing by 1e9 gives decimal GB.
gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
print(f"GPU: {gpu_name}")
print(f"Memory: {gpu_mem_gb:.1f} GB")

# Auto-select config based on GPU memory.
# The cut-offs (70 and 35) sit below the nominal 80 GB / 40 GB card sizes.
if gpu_mem_gb >= 70:
    GPU_TIER = "80gb"
    print("Config: Full (G=16, max_completion=2048)")
elif gpu_mem_gb >= 35:
    GPU_TIER = "40gb"
    print("Config: Reduced (G=8, max_completion=1024)")
else:
    GPU_TIER = "small"
    print("WARNING: Limited GPU memory. Using minimal config.")

In [None]:
# W&B login
# wandb.login() is called with no arguments, so how the credentials are
# obtained is left entirely to wandb.
import wandb
wandb.login()

In [None]:
# Google Drive mount for checkpoint persistence
# Requires the Colab runtime (google.colab) and relies on `os` having been
# imported by an earlier cell.
from google.colab import drive
drive.mount("/content/drive")

# DRIVE_CHECKPOINT_DIR is reused by later cells as the training output_dir,
# the checkpoint search location, the TensorBoard logdir and the eval model path.
DRIVE_CHECKPOINT_DIR = "/content/drive/MyDrive/reson-llm-checkpoints"
# exist_ok=True keeps this cell re-runnable once the folder has been created.
os.makedirs(DRIVE_CHECKPOINT_DIR, exist_ok=True)
print(f"Checkpoints will be saved to: {DRIVE_CHECKPOINT_DIR}")

## 2. Configure Training

In [None]:
import yaml

# Start from the project's base config.
with open("configs/grpo_qwen2.5_1.5b.yaml") as base_file:
    config = yaml.safe_load(base_file)

# Checkpoints and outputs go to Google Drive instead of the path in the YAML.
config["output_dir"] = DRIVE_CHECKPOINT_DIR

# Per-tier overrides layered on top of the base config, paired with the notice
# to print. The "80gb" tier has no entry, so it keeps the base values as-is.
tier_overrides = {
    "40gb": (
        {
            "num_generations": 8,
            "max_completion_length": 1024,
            "per_device_train_batch_size": 1,
        },
        "Adjusted config for 40GB GPU",
    ),
    "small": (
        {
            "num_generations": 4,
            "max_completion_length": 512,
            "per_device_train_batch_size": 1,
            "gradient_accumulation_steps": 4,
        },
        "Adjusted config for small GPU",
    ),
}
if GPU_TIER in tier_overrides:
    overrides, notice = tier_overrides[GPU_TIER]
    config.update(overrides)
    print(notice)

# Persist the adjusted config; the training cell passes this path via --config.
RUNTIME_CONFIG = "/content/runtime_config.yaml"
with open(RUNTIME_CONFIG, "w") as runtime_file:
    yaml.dump(config, runtime_file, default_flow_style=False)

# Echo the effective settings for a quick visual check.
print("\nTraining config:")
for key, value in config.items():
    print(f"  {key}: {value}")

## 3. Train

In [None]:
# Check for existing checkpoint to resume from
# Sets resume_arg to "--resume_from_checkpoint <path>" for the newest
# checkpoint-* directory under DRIVE_CHECKPOINT_DIR, or to "" when none exist.
import glob
import os
import re


def _checkpoint_step(path):
    """Return the integer step of a ``checkpoint-<step>`` path, or -1 if the suffix is not numeric."""
    match = re.search(r"checkpoint-(\d+)$", path.rstrip("/"))
    return int(match.group(1)) if match else -1


# Order by numeric step, not by string: a plain lexicographic sort ranks
# "checkpoint-900" after "checkpoint-1000" and would resume from the older one.
# Non-directories are ignored; the path is a tie-breaker so the order is deterministic.
checkpoints = sorted(
    (p for p in glob.glob(f"{DRIVE_CHECKPOINT_DIR}/checkpoint-*") if os.path.isdir(p)),
    key=lambda p: (_checkpoint_step(p), p),
)
resume_arg = ""
if checkpoints:
    latest = checkpoints[-1]
    print(f"Found checkpoint: {latest}")
    resume_arg = f"--resume_from_checkpoint {latest}"
else:
    print("No checkpoint found, starting fresh.")

In [None]:
# Launch training
# {RUNTIME_CONFIG} and {resume_arg} are filled in from the notebook's Python
# variables. resume_arg is an empty string when no checkpoint was found, in
# which case no resume flag is passed to the script.
!python src/train_grpo.py --config {RUNTIME_CONFIG} {resume_arg}

## 4. Monitor (TensorBoard)

In [None]:
# Load the TensorBoard notebook extension and point it at the checkpoint directory.
# NOTE(review): assumes the training script writes its event files somewhere
# under output_dir (DRIVE_CHECKPOINT_DIR) — confirm against src/train_grpo.py.
%load_ext tensorboard
%tensorboard --logdir {DRIVE_CHECKPOINT_DIR}

## 5. Evaluate

In [None]:
# Quick eval on 200 samples
# Passes DRIVE_CHECKPOINT_DIR (the training output_dir) as the model path and
# eval_results as the output directory; the comparison cell later reads
# eval_results/summary.json.
# NOTE(review): assumes a loadable final model is saved at the root of
# output_dir, not only inside checkpoint-* subfolders — confirm.
!python src/evaluate.py \
    --model_path {DRIVE_CHECKPOINT_DIR} \
    --num_samples 200 \
    --output_dir eval_results

In [None]:
# Baseline comparison (raw Qwen2.5-1.5B)
# Same script and sample count as the trained-model evaluation, but with the
# Qwen/Qwen2.5-1.5B model id and a separate output directory; the comparison
# cell later reads eval_results_baseline/summary.json.
!python src/evaluate.py \
    --model_path Qwen/Qwen2.5-1.5B \
    --num_samples 200 \
    --output_dir eval_results_baseline

In [None]:
# Compare results
# Prints a baseline-vs-trained table from the two summary.json files written
# by the evaluation cells.
import json

with open("eval_results/summary.json") as f:
    trained = json.load(f)
with open("eval_results_baseline/summary.json") as f:
    baseline = json.load(f)

# One width for the label column and one for each value column, shared by the
# header and every row. The previous hand-tuned padding was sized for a single
# dataset name and used different widths than the header, so columns drifted.
label_width = 30
value_width = 12
# (key in the per-dataset summary, row label, format spec for the value)
metrics = [
    ("accuracy", "accuracy", ".1%"),
    ("format_compliance", "format compliance", ".1%"),
    ("avg_think_tokens", "avg think tokens", ".0f"),
]

print("=" * 60)
print(f"{'Metric':<{label_width}} {'Baseline':>{value_width}} {'Trained':>{value_width}}")
print("=" * 60)
for dataset in ["gsm8k", "math"]:
    # Say so when a dataset is absent from either summary instead of dropping
    # it from the table without any trace.
    if dataset not in trained or dataset not in baseline:
        print(f"{dataset.upper()}: no results in one or both summaries, skipped")
        print("-" * 60)
        continue
    b = baseline[dataset]
    t = trained[dataset]
    for key, label, spec in metrics:
        row_label = f"{dataset.upper()} {label}:"
        print(
            f"{row_label:<{label_width}} "
            f"{format(b[key], spec):>{value_width}} "
            f"{format(t[key], spec):>{value_width}}"
        )
    print("-" * 60)

## 6. Push to Hub (Optional)

In [None]:
# Uncomment and set your HF username to push
# from huggingface_hub import login
# login()

# HF_USERNAME = "your-username"
# from transformers import AutoModelForCausalLM, AutoTokenizer
# model = AutoModelForCausalLM.from_pretrained(DRIVE_CHECKPOINT_DIR, torch_dtype=torch.bfloat16)
# tokenizer = AutoTokenizer.from_pretrained(DRIVE_CHECKPOINT_DIR)
# model.push_to_hub(f"{HF_USERNAME}/qwen2.5-1.5b-r1zero-grpo")
# tokenizer.push_to_hub(f"{HF_USERNAME}/qwen2.5-1.5b-r1zero-grpo")