# RLHF example

In this example, we load a JSONL file of preference records with Hugging Face's `datasets` library and tokenize it so that it can be used to train a model.

This dataset differs from a standard text dataset: each record contains a prompt and two responses — `winner` (the preferred response) and `loser` (the rejected one). We preprocess these records to prepare them for training.


In [10]:
# Debug aid: list the fields each preference record exposes.
for example in raw_data:
    print(example.keys())

# padding="max_length" requires a pad token; reuse EOS when the tokenizer
# defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# The records carry "prompt" / "winner" / "loser" (see the printed keys), not
# "text". Filtering on "text" silently produced an empty list, so tokenize the
# "prompt" field instead and skip any record that lacks it.
train_dataset = [
    tokenizer(
        example["prompt"],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    for example in raw_data
    if "prompt" in example
]

dict_keys(['prompt', 'winner', 'loser'])
dict_keys(['prompt', 'winner', 'loser'])
dict_keys(['prompt', 'winner', 'loser'])
dict_keys(['prompt', 'winner', 'loser'])
dict_keys(['prompt', 'winner', 'loser'])


In [12]:
from datasets import load_dataset

def tokenize_function(example):
    """Tokenize the "prompt" field of a record (or batch of records).

    The prompt text is handed to the module-level ``tokenizer`` and is
    padded/truncated to exactly 128 tokens. Whatever the tokenizer returns is
    returned unchanged.
    """
    prompt_text = example["prompt"]
    encoding = tokenizer(
        prompt_text,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoding

# Read the preference records from the JSONL file into a Hugging Face Dataset.
dataset = load_dataset(
    "json",
    data_files="preference_records.jsonl",
    split="train",
)

# Debug aid: show which fields the first record exposes.
first_record = dataset[0]
print(first_record.keys())

# Padding to a fixed length needs a pad token; fall back to EOS if none is set.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# tokenize_function reads the "prompt" field; adjust it there if your records
# use a different key.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
)

# tokenized_dataset is what gets passed to PPOTrainer as train_dataset.

dict_keys(['prompt', 'winner', 'loser'])


Map: 100%|██████████| 5/5 [00:00<00:00, 274.64 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 274.64 examples/s]


In [None]:
from trl import PPOTrainer, PPOConfig
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
import copy
from tqdm import trange

# PPO hyper-parameters.
config = PPOConfig(batch_size=16, learning_rate=1e-5)

# Policy to be optimised and its tokenizer.
model = AutoModelForCausalLM.from_pretrained("distilgpt2", device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# Padding requires a pad token; reuse EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Reward model that scores generated responses.
# NOTE(review): this checkpoint is DeBERTa-based while the policy is
# distilgpt2 — confirm that scoring the policy's token ids with it is intended,
# since the two models do not share a tokenizer.
reward = AutoModelForSequenceClassification.from_pretrained("OpenAssistant/reward-model-deberta-v3-large")

# Frozen snapshot of the initial policy, used as the PPO reference model.
ref_model = copy.deepcopy(model)

# The value model must be a separate network with a scalar head. Aliasing the
# causal-LM policy (`value_model = model`) gives the trainer a model without a
# `score` head and ties the critic to the policy object. Load a one-label
# sequence-classification model on the same backbone instead.
value_model = AutoModelForSequenceClassification.from_pretrained("distilgpt2", num_labels=1)

# Location of the preference records used for training.
train_dataset_path = "preference_records.jsonl"

def tokenize_function(example):
    """Encode the "prompt" entry of ``example`` with the module-level tokenizer.

    Sequences are truncated and padded so that every encoding is 128 tokens
    long. The tokenizer's return value is passed straight back to the caller.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(example["prompt"], **tokenizer_options)

# Read the preference records into a Hugging Face Dataset.
dataset = load_dataset("json",
                       data_files=train_dataset_path,
                       split="train")

# Tokenize the prompts and drop the raw string columns ("prompt", "winner",
# "loser"): a padding collator can only batch the tokenizer outputs, not
# free-form strings.
tokenized_dataset = dataset.map(tokenize_function,
                                batched=True,
                                remove_columns=dataset.column_names)

# The trainer needs the tokenizer as its processing class (it reads the pad
# token id from it); passing None breaks once training touches the tokenizer.
processing_class = tokenizer

# Configure the stop token through the training arguments before the trainer
# is built, using the tokenizer's EOS id instead of a hard-coded 50256.
config.stop_token_id = tokenizer.eos_token_id

# Initialize PPOTrainer with required arguments
ppo = PPOTrainer(model=model,
                 ref_model=ref_model,
                 reward_model=reward,
                 train_dataset=tokenized_dataset,
                 value_model=value_model,
                 args=config,
                 processing_class=processing_class)

ppo.train()

===training policy===


In [18]:
import torch

# Report where the policy and reward weights live, and whether CUDA is usable.
# The device of a model's first parameter is taken as the model's device.
policy_device = next(model.parameters()).device
print(f"Model device: {policy_device}")

reward_device = next(reward.parameters()).device
print(f"Reward model device: {reward_device}")

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")

Model device: cuda:0
Reward model device: cuda:0
CUDA available: True
