# 03 - DPO Training on Kaggle (Generative Manim)

Self-contained notebook for DPO alignment on a Kaggle T4 GPU.

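**Prerequisites**: an SFT checkpoint and a DPO pairs dataset, both uploaded to Kaggle and attached to this notebook as inputs (the configuration cell expects them under `/kaggle/input/gm-sft-checkpoint/` and `/kaggle/input/gm-training-data/`).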

In [None]:
!pip install -q torch transformers trl peft bitsandbytes accelerate datasets wandb

In [None]:
import os

import torch
import wandb
from datasets import load_dataset
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import DPOConfig, DPOTrainer

In [None]:
# Configuration: every value a reader might tune lives in this cell.

# --- Base model ---------------------------------------------------------
MODEL_NAME = "qwen2.5-coder-7b"                # short tag used in output paths
MODEL_ID = "Qwen/Qwen2.5-Coder-7B-Instruct"    # Hugging Face Hub repository

# --- Kaggle inputs / outputs --------------------------------------------
DPO_DATA = "/kaggle/input/gm-training-data/dpo_train.jsonl"
SFT_CHECKPOINT = "/kaggle/input/gm-sft-checkpoint/sft-qwen2.5-coder-7b"
OUTPUT_DIR = f"/kaggle/working/dpo-{MODEL_NAME}"

# --- Optimisation -------------------------------------------------------
LEARNING_RATE, BETA = 5e-5, 0.1     # optimiser step size, DPO beta
BATCH_SIZE, GRAD_ACCUM = 2, 4       # per-device batch, accumulation steps

# --- LoRA adapter -------------------------------------------------------
LORA_R, LORA_ALPHA = 32, 64         # adapter rank, scaling alpha

In [None]:
# W&B (optional)
# Experiment tracking is best-effort: if the Kaggle secrets helper is not
# available, the secret is missing, or the login fails, the notebook continues
# with logging disabled. Sets USE_WANDB, which the training cell reads.
try:
    # Imported inside the try so that running outside Kaggle (where this
    # module does not exist) falls back to "no W&B" instead of crashing.
    from kaggle_secrets import UserSecretsClient

    secrets = UserSecretsClient()
    wandb.login(key=secrets.get_secret("WANDB_API_KEY"))
    USE_WANDB = True
except Exception as err:
    # `Exception` rather than a bare `except:` so KeyboardInterrupt/SystemExit
    # still propagate. Only the exception type is printed, to avoid echoing
    # anything sensitive from the error message into the notebook output.
    print(f"W&B logging disabled ({type(err).__name__})")
    USE_WANDB = False

In [None]:
# QLoRA quantisation settings: 4-bit NF4 weights with double quantisation,
# computing in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Tokenizer is taken from the base model repository; reuse EOS as the padding
# token when the tokenizer does not define one.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Quantised base model, prepared for k-bit training in the same step.
base_model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )
)

# Attach the SFT adapter on top of the base model and keep it trainable.
model = PeftModel.from_pretrained(base_model, SFT_CHECKPOINT, is_trainable=True)
print("Model loaded with SFT adapter")

In [None]:
# LoRA for DPO: adapter configuration handed to the trainer.
peft_config = LoraConfig(
    r=LORA_R, lora_alpha=LORA_ALPHA, lora_dropout=0.05,
    target_modules="all-linear", bias="none", task_type="CAUSAL_LM",
)

# Load DPO pairs from the JSONL file configured in DPO_DATA.
dataset = load_dataset("json", data_files=DPO_DATA, split="train")

# Fail fast with a clear message instead of an IndexError/KeyError below (or
# an error deep inside the trainer) when the file is empty or malformed.
required_columns = {"prompt", "chosen", "rejected"}
missing_columns = required_columns - set(dataset.column_names)
if missing_columns:
    raise ValueError(
        f"{DPO_DATA} is missing required DPO columns: {sorted(missing_columns)}"
    )
if len(dataset) == 0:
    raise ValueError(f"{DPO_DATA} contains no DPO pairs")

print(f"DPO pairs: {len(dataset)}")
print(f"Sample prompt: {dataset[0]['prompt'][:100]}")

In [None]:
# Training
# DPO-specific settings (beta and the sequence-length limits) are set on
# DPOConfig, which extends TrainingArguments. Current TRL releases no longer
# accept them as DPOTrainer keyword arguments, and the tokenizer is passed as
# `processing_class`.
training_args = DPOConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=1,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    fp16=True,
    logging_steps=10,
    save_steps=50,
    save_total_limit=2,
    optim="paged_adamw_32bit",
    report_to="wandb" if USE_WANDB else "none",
    seed=42,
    beta=BETA,
    max_prompt_length=512,  # token budget for the prompt
    max_length=2048,        # token budget for prompt + completion
)

# ref_model=None: no separate reference model is loaded; with a PEFT setup the
# trainer derives the reference policy from the same weights.
# NOTE(review): `model` is already a PeftModel and a `peft_config` is also
# passed; confirm the installed TRL version handles this combination as
# intended (SFT adapter as reference, fresh LoRA as policy).
trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
    peft_config=peft_config,
)

trainer.train()

# Persist the trained adapter together with the tokenizer so the output
# directory can be loaded on its own.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"DPO model saved to {OUTPUT_DIR}")

if USE_WANDB:
    wandb.finish()