# [Direct Preference Optimization: Your Language Model is Secretly a Reward Model (DPO)](https://arxiv.org/pdf/2305.18290.pdf)

### Reference Code 
- https://huggingface.co/docs/trl/main/en/dpo_trainer
- https://github.com/huggingface/trl/blob/main/examples/scripts/dpo.py

Therefore the final dataset object should contain these 3 entries if you use the default DPODataCollatorWithPadding data collator. 

The entries should be named:
- prompt
- chosen
- rejected

In [1]:
import os
import torch
# Set GPU device
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

os.environ['http_proxy']  = 'http://192.41.170.23:3128'
os.environ['https_proxy'] = 'http://192.41.170.23:3128'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
# pip install trl

In [3]:
dpo_dataset_dict = {
    "prompt": [
        "hello",
        "how are you",
        "What is your name?",
        "What is your name?",
        "Which is the best programming language?",
        "Which is the best programming language?",
        "Which is the best programming language?",
    ],
    "chosen": [
        "hi nice to meet you",
        "I am fine",
        "My name is Mary",
        "My name is Mary",
        "Python",
        "Python",
        "Java",
    ],
    "rejected": [
        "leave me alone",
        "I am not fine",
        "Whats it to you?",
        "I dont have a name",
        "Javascript",
        "C++",
        "C++",
    ],
}

In [4]:
import torch
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    HfArgumentParser, 
    TrainingArguments
)

from typing import Dict, Optional
from trl import DPOTrainer, DPOConfig
from huggingface_hub import login

# 1. load a pretrained model and tokenizer

In [5]:
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name_or_path = "gpt2"
ignore_bias_buffers = False

model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
# if ignore_bias_buffers:
#     # torch distributed hack
#     model._ddp_params_and_buffers_to_ignore = [
#         name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
#     ]

model_ref = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

The DPO trainer expects a model of AutoModelForCausalLM, compared to PPO that expects AutoModelForCausalLMWithValueHead for the value function.

## 2. Load the Dahoas dataset

In [6]:
# Function to extract prompt from response
def extract_anthropic_prompt(prompt_and_response: str) -> str:
    search_term = "\n\nAssistant:"
    search_term_idx = prompt_and_response.rfind(search_term)
    assert search_term_idx != -1, f"Prompt and response does not contain '{search_term}'"
    return prompt_and_response[: search_term_idx + len(search_term)]

# Load dataset
def get_static_hh(split: str, sanity_check: bool = False, cache_dir: str = None):
    dataset = load_dataset("Dahoas/static-hh", split=split, cache_dir=cache_dir)
    if sanity_check:
        dataset = dataset.select(range(min(len(dataset), 5)))  # Use a smaller dataset for testing

    def filter_columns(sample):
        return {
            "prompt": sample["prompt"],
            "chosen": sample["chosen"],
            "rejected": sample["rejected"],
        }

    return dataset.map(filter_columns)

In [7]:
sanity_check = True
train_dataset = get_static_hh("train", sanity_check=sanity_check)
eval_dataset = get_static_hh("test", sanity_check=sanity_check)

In [8]:
train_dataset

Dataset({
    features: ['prompt', 'response', 'chosen', 'rejected'],
    num_rows: 5
})

In [9]:
eval_dataset

Dataset({
    features: ['prompt', 'response', 'chosen', 'rejected'],
    num_rows: 5
})

# 3. initialize training arguments:

In [10]:
# learning_rate = 1e-3
# per_device_train_batch_size = 8
# gradient_accumulation_steps = 1
# max_length= 512 
# max_prompt_length = 128 
# max_target_length =128 
# label_pad_token_id = 100
# max_steps = 1000
# # instrumentation
# sanity_check = True
# report_to = None
# gradient_checkpointing = None
# beta = 0.1

In [11]:
# pip install --upgrade transformers[torch] accelerate>=0.26.0

In [12]:
# training_args = TrainingArguments(
#     per_device_train_batch_size=per_device_train_batch_size,
#     max_steps=max_steps,
#     remove_unused_columns=False,
#     gradient_accumulation_steps=gradient_accumulation_steps,
#     learning_rate=learning_rate,
#     evaluation_strategy="steps",
#     logging_first_step=True,
#     logging_steps=5,  # match results in blog post
#     eval_steps=500,
#     output_dir="./test",
#     optim="rmsprop",
#     warmup_steps=150,
#     report_to=report_to,
#     bf16=True,
#     # gradient_checkpointing=gradient_checkpointing,
#     gradient_checkpointing=True,
#     # TODO: uncomment that on the next transformers release
#     # gradient_checkpointing_kwargs=gradient_checkpointing_kwargs,
# )

# from trl import DPOConfig

# training_args = DPOConfig(
#     # beta=0.1,  # Keep this if needed
#     output_dir="./test",
#     per_device_train_batch_size=8,
#     gradient_accumulation_steps=1,
#     learning_rate=1e-3,
#     evaluation_strategy="steps",
#     logging_steps=5,
#     eval_steps=500,
#     max_steps=1000,
#     optim="rmsprop",
#     warmup_steps=150,
#     bf16=True,
#     gradient_checkpointing=True,
# )


# 4. initialize the DPO trainer

In [13]:
# dpo_trainer = DPOTrainer(
#     model,
#     model_ref,
#     args=training_args,
#     beta=beta,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     tokenizer=tokenizer,
#     max_length=max_length,
#     max_target_length=max_target_length,
#     max_prompt_length=max_prompt_length,
#     generate_during_eval=True,
# )

In [14]:
# from trl import DPOTrainer

# dpo_trainer = DPOTrainer(
#     model=model,
#     ref_model=model_ref,  # Ensure model_ref is correctly defined
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     tokenizer=tokenizer,
# )

In [15]:
# import trl
# print(trl.__version__)

In [16]:
# pip install --upgrade trl

# 5. Train

In [17]:
# dpo_trainer.train()

In [18]:
import itertools
import torch
from trl import DPOConfig, DPOTrainer

# Define hyperparameter search space
learning_rates = [1e-3]
batch_sizes = [8]
num_epochs = [5]
betas = [0.1]

# Generate all hyperparameter combinations
hyperparameter_combinations = list(itertools.product(learning_rates, batch_sizes, num_epochs, betas))

# Track best model
results = []
best_loss = float("inf")  
best_model_path = None

# Loop through hyperparameter combinations
for lr, batch_size, epochs, beta in hyperparameter_combinations:
    print(f"\n Training with lr={lr}, batch_size={batch_size}, epochs={epochs}, beta={beta}")
    
    output_dir = f"./dpo_lr{lr}_bs{batch_size}_ep{epochs}_beta{beta}"

    # Define DPO configuration
    dpo_config = DPOConfig(
        output_dir=output_dir,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        gradient_accumulation_steps=1,
        learning_rate=lr,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        save_total_limit=2,
        warmup_steps=150,
        bf16=torch.cuda.is_available(),  # Use bf16 if GPU supports it
        gradient_checkpointing=True,
        report_to="none",
    )

    # Initialize trainer
    dpo_trainer = DPOTrainer(
        model=model,
        ref_model=model_ref,
        args=dpo_config,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    # Train model
    dpo_trainer.train()

    # Evaluate model
    eval_results = dpo_trainer.evaluate()
    loss = eval_results.get("eval_loss", None)
    
    # Store results
    results.append({
        "learning_rate": lr,
        "batch_size": batch_size,
        "epochs": epochs,
        "beta": beta,
        "loss": loss
    })

    # Save the best model based on eval loss
    if loss is not None and loss < best_loss:
        best_loss = loss
        best_model_path = output_dir
        print(f" New best model found! Saving at: {best_model_path}")

# Print results
print("\n Experiment Results:")
for res in results:
    print(res)

# Save best model path
if best_model_path:
    print(f"\n Best model saved at: {best_model_path}")
else:
    print("\n No model improved from the initial loss.")



 Training with lr=0.001, batch_size=8, epochs=5, beta=0.1


  dpo_trainer = DPOTrainer(


Applying chat template to train dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
1,No log,0.693147,0.0,0.0,0.0,0.0,-267.235687,-185.095245,-108.793655,-151.277023
2,No log,0.620372,-0.078956,-0.251735,0.6,0.172779,-268.025238,-187.612595,-108.78112,-151.336227
3,No log,0.636784,0.06379,-0.054355,0.8,0.118144,-266.597778,-185.638779,-108.687149,-151.747314
4,No log,0.595837,-0.017816,-0.237263,0.8,0.219446,-267.413818,-187.467865,-109.276733,-152.481323
5,No log,0.494677,0.07786,-0.382726,1.0,0.460586,-266.457062,-188.922485,-109.685966,-153.107025


 New best model found! Saving at: ./dpo_lr0.001_bs8_ep5_beta0.1

 Experiment Results:
{'learning_rate': 0.001, 'batch_size': 8, 'epochs': 5, 'beta': 0.1, 'loss': 0.4946766495704651}

 Best model saved at: ./dpo_lr0.001_bs8_ep5_beta0.1


In [19]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Load model and tokenizer from saved directory
save_directory = "./dpo_lr0.001_bs8_ep5_beta0.1/checkpoint-4"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForCausalLM.from_pretrained(save_directory).to(device)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

print("Model loaded successfully!")

Model loaded successfully!


In [41]:
# Ensure tokenizer padding token is set correctly
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Assign pad token

# Move model to the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_response(prompt, max_tokens=100):
    try:
        # Format input as a dialogue
        formatted_prompt = f"Human: {prompt}\n\nAssistant:"

        # Tokenize input and move to the correct device
        input_ids = tokenizer(
            formatted_prompt, return_tensors="pt", padding=True, truncation=True
        ).input_ids.to(device)

        # Generate response with optimized settings
        with torch.no_grad():
            output_ids = model.generate(
                input_ids,
                max_new_tokens=50,  # Response length
                temperature=0.4,  # Lower for more focused responses
                top_p=0.8,  # Nucleus sampling
                top_k=30,  # Limits token selection to top 40 words
                repetition_penalty=1.5,  # Stronger penalty against repetition
                do_sample=False,  # Enables diverse responses
                pad_token_id=tokenizer.pad_token_id,  # Uses tokenizer's pad token
            )

        # Decode and clean response
        full_response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        response = full_response.split("\n\nAssistant:")[-1].strip()  # Remove redundant parts

        return response
    except Exception as e:
        return f"Error generating response: {str(e)}"

# Example test case
sample_prompt = "What is love?"

# Generate and print the response
response = generate_response(sample_prompt)
print(f"Prompt: {sample_prompt}\nResponse: {response}")


Prompt: What is love?
Response: Love. It's a word that comes from the Greek meaning "love" and it means to be loved, not just for yourself but also because of your own actions or feelings about others (e-mail). The term was first used in 1876


In [40]:
import torch

# Ensure the tokenizer has a pad token
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

def generate_response(prompt, max_tokens=100):
    try:
        # ✅ Provide a more explicit instruction
        formatted_prompt = f"Q: {prompt}\nA:"

        # Tokenize input
        input_ids = tokenizer(
            formatted_prompt, return_tensors="pt", padding=True, truncation=True
        ).input_ids.to(device)

        # Generate response with improved settings
        with torch.no_grad():
            output_ids = model.generate(
                input_ids,
                max_new_tokens=max_tokens,
                temperature=0.5,  # Increase randomness
                top_p=0.85,  # Balance nucleus sampling
                top_k=50,  # Increase diversity
                repetition_penalty=1.4,  # Reduce repetitive responses
                do_sample=True,  # Enable sampling for variability
                eos_token_id=tokenizer.eos_token_id,  # Ensure stopping criteria
                pad_token_id=tokenizer.pad_token_id,  # Avoid padding issues
            )

        # Decode and clean response
        response = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

        # ✅ Ensure we don't return the prompt itself
        if response.lower() == prompt.lower():
            return "I am not sure about that, but I can try to help with more information!"

        return response
    except Exception as e:
        return f"Error generating response: {str(e)}"

# Example test case
sample_prompt = "Who is the president of USA?"
response = generate_response(sample_prompt)
print(f"Prompt: {sample_prompt}\nResponse: {response}")


Prompt: Who is the president of USA?
Response: Q: Who is the president of USA?
A: The United States President. He's a guy who has been around for over 20 years, and he knows how to run an organization that runs like it does business in America – which makes him very important because you have people from all walks of life coming up here at this time when there are so many different types of businesses operating out on our soil…I think we're going through some pretty big changes right now with regard not just health care but also education as well."
