# 05: Reward Model Training with Preference Data
This notebook demonstrates how to train a reward model using human preference pairs.

In [None]:
!pip install torch transformers

In [1]:
import sys
import os
# Make the parent of the current working directory importable so that the
# `rlhf` package imported in the next cell can be resolved.
# The membership check keeps this cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
repo_root = os.path.abspath("..")
if repo_root not in sys.path:
    sys.path.append(repo_root)

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from rlhf.reward_model import RewardModel
from rlhf.preference_dataset import PreferenceDataset, reward_loss

## Prepare preference pairs and tokenizer

In [3]:
# Load the pretrained "gpt2" tokenizer and register "[PAD]" as its padding token.
# NOTE(review): if "[PAD]" is not already in the vocabulary this adds a new
# token; the model cell sizes its vocabulary with len(tokenizer) to match.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Toy preference data: each entry is a pair of sentences.
# NOTE(review): presumably ordered (chosen, rejected), matching the
# "chosen_input_ids" / "rejected_input_ids" keys read in the training loop —
# confirm against PreferenceDataset.
preference_pairs = [
    ("The cat sat on the mat.", "The cat was sat by the mat."),
    ("He won the race easily.", "He lost the race in shame."),
    ("The answer is correct.", "The answer might be wrong."),
    ("Paris is the capital of France.", "France is the capital of Paris."),
]

## Create Dataset and DataLoader

In [4]:
# Wrap the raw text pairs in the project's PreferenceDataset, passing the
# tokenizer and a max_length of 64.
dataset = PreferenceDataset(
    preference_pairs,
    tokenizer,
    max_length=64,
)

# Serve the pairs in shuffled mini-batches of two.
dataloader = DataLoader(dataset, shuffle=True, batch_size=2)

## Initialize Reward Model

In [7]:
# Seed torch's global RNG so a fresh "Restart Kernel -> Run All" repeats the
# same run instead of producing different losses every time.
# NOTE(review): this assumes RewardModel initializes its weights through
# torch's RNG; some GPU kernels may still introduce small nondeterminism.
SEED = 42
torch.manual_seed(SEED)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Size the vocabulary from the tokenizer so it accounts for any special tokens
# added after loading (len(tokenizer) is evaluated after add_special_tokens).
model = RewardModel(vocab_size=len(tokenizer)).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-4)

## Train the Reward Model

In [8]:
# Enable training mode on the model, then run five passes over the dataloader.
model.train()
for epoch in range(5):
    total_loss = 0
    for batch in dataloader:
        # Move both halves of each preference pair onto the training device.
        chosen, rejected = (
            batch[field].to(device)
            for field in ("chosen_input_ids", "rejected_input_ids")
        )

        # Drop gradients left over from the previous step before computing new ones.
        optimizer.zero_grad()

        # Score both sequences with the same model, then feed the two scores to
        # reward_loss (imported from rlhf.preference_dataset).
        # NOTE(review): presumably a pairwise ranking loss — confirm.
        reward_chosen, reward_rejected = model(chosen), model(rejected)
        loss = reward_loss(reward_chosen, reward_rejected)

        loss.backward()
        optimizer.step()
        total_loss = total_loss + loss.item()

    # Report the mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1} Loss: {total_loss / len(dataloader):.4f}")

# Persist only the model's state_dict to the working directory.
torch.save(model.state_dict(), "reward_model.pt")
print("Reward model saved.")

Epoch 1 Loss: 0.6825
Epoch 2 Loss: 0.5232
Epoch 3 Loss: 0.1990
Epoch 4 Loss: 0.0031
Epoch 5 Loss: 0.0001
Reward model saved.
