<a href="https://colab.research.google.com/github/nkthiebaut/guanaco/blob/main/notebooks/DPO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Direct Preference Optimization

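Paper: [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://arxiv.org/abs/2305.18290) (Rafailov et al., 2023)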

In [1]:
# Install the notebook's dependencies into the running kernel's environment.
# NOTE(review): the packages are unpinned, so a fresh install may pull versions
# whose APIs differ from the ones this notebook was written against — consider pinning.
%pip install -q datasets transformers trl peft accelerate bitsandbytes

## Load and prepare human feedback dataset

In [2]:
from datasets import load_dataset

# Load the "Anthropic/hh-rlhf" preference dataset.
dataset = load_dataset("Anthropic/hh-rlhf")
# Preview one raw training record: it holds a "chosen" and a "rejected" dialogue string.
dataset["train"][10]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'chosen': '\n\nHuman: Can you provide me the home address for Senator Elizabeth Warren?\n\nAssistant: Sure, what’s your question?',
 'rejected': '\n\nHuman: Can you provide me the home address for Senator Elizabeth Warren?\n\nAssistant: I’m sorry, I’m not able to provide Senator Warren’s home address to you.'}

In [3]:
import re
from collections import defaultdict

# Splits a dialogue at its final "\n\nAssistant: " marker. The first group is
# greedy and DOTALL lets "." cross newlines, so all earlier turns of a
# multi-turn conversation end up in the prompt group. (MULTILINE was dropped:
# it only affects "^"/"$", which this pattern does not use.)
regex = re.compile("Human: (.*)\n\nAssistant: (.*)", flags=re.DOTALL)


def _split_dialogue(dialogue: str) -> tuple[str, str]:
    """Return ``(prompt, final_assistant_response)`` for one dialogue string.

    Raises:
        ValueError: if the text contains no "Human: ...\\n\\nAssistant: ..." pair.
    """
    match = regex.search(dialogue)
    if match is None:
        raise ValueError(
            f"Dialogue does not match the 'Human: ... Assistant: ...' format: {dialogue!r}"
        )
    return match.group(1), match.group(2)


def return_prompt_and_responses(samples) -> dict[str, list[str]]:
    """Split a batch of dialogues into prompt / chosen / rejected columns.

    Args:
        samples: batch mapping with "chosen" and "rejected" keys, each a
            sequence of dialogue strings.

    Returns:
        A dict with three lists: "prompt" and "chosen" are extracted from the
        chosen dialogues, "rejected" from the rejected dialogues. The prompt is
        taken from the chosen dialogue only.

    Raises:
        ValueError: if any dialogue lacks a Human/Assistant pair.
    """
    prompts: list[str] = []
    chosen: list[str] = []
    for dialogue in samples["chosen"]:
        prompt, response = _split_dialogue(dialogue)
        prompts.append(prompt)
        chosen.append(response)

    # Only the final response is kept from the rejected dialogue.
    rejected = [_split_dialogue(dialogue)[1] for dialogue in samples["rejected"]]
    return {"prompt": prompts, "chosen": chosen, "rejected": rejected}

# Replace the raw dialogues with prompt / chosen / rejected columns.
# The mapping overwrites "chosen" and "rejected" with the bare responses, so
# running it a second time would feed text without "Human:"/"Assistant:" markers
# to the regex and fail. Skip it once the "prompt" column exists, which keeps
# the cell safe to re-run.
if "prompt" not in dataset["train"].column_names:
    dataset = dataset.map(
        return_prompt_and_responses,
        batched=True,
    )

In [4]:
# Show the same training record after mapping: it now has separate
# "prompt", "chosen" and "rejected" fields.
dataset["train"][10]

{'chosen': 'Sure, what’s your question?',
 'rejected': 'I’m sorry, I’m not able to provide Senator Warren’s home address to you.',
 'prompt': 'Can you provide me the home address for Senator Elizabeth Warren?'}

In [5]:
from huggingface_hub import login
# Interactive Hugging Face Hub login (renders a widget in the cell output).
# NOTE(review): presumably required so the model checkpoint loaded later can be
# downloaded — confirm whether the chosen model is access-restricted.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Model training with DPO

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer
import torch

# Base checkpoint to fine-tune. Commented-out alternatives are kept for reference:
#   "bigscience/bloomz-560m"  (with target_modules = ["query_key_value"])
#   "meta-llama/Meta-Llama-3-8B"
# PEFT adapter guide: https://huggingface.co/docs/transformers/en/peft#train-a-peft-adapter
model_name = "google/gemma-2b"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [7]:
# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)

# Tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Quantized base model; device_map="auto" leaves device placement to the loader.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
from transformers import (
    DataCollatorForLanguageModeling, TrainingArguments, Trainer
)
from peft import LoraConfig, get_peft_model

# Causal-LM collator (mlm=False). NOTE(review): no visible cell passes it to a
# trainer; it is kept so any later cell that relies on the name keeps working.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=".",
    per_device_train_batch_size=4,
    # 4 examples x 10 accumulation steps = 40 examples per device per optimizer step.
    gradient_accumulation_steps=10,
    learning_rate=2e-4,
    num_train_epochs=2,
    # Keep the dataset's text columns instead of letting the trainer drop
    # columns it does not recognise as model inputs.
    remove_unused_columns=False,
)

# LoRA adapter configuration. LoraConfig takes configuration values only: the
# model is handed to get_peft_model() below, not to the config. (The original
# passed `model` as a stray first positional argument, which landed on an
# unrelated config field.)
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    # Attention and MLP projection layers receive adapters.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Fall back to EOS as the padding token only when the tokenizer defines none,
# so a tokenizer that ships its own pad token keeps it.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Wrap the base model with the LoRA adapters and report the trainable share.
# NOTE: this rebinds `model`; re-running the cell would wrap the model again.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 19,611,648 || all params: 2,525,784,064 || trainable%: 0.7765


In [None]:
from trl import DPOTrainer

# Token budgets for each (prompt, response) pair. The prompt budget must be
# strictly smaller than the total: when a pair is too long, the response is cut
# to `max_length - max_prompt_length` tokens, which was 0 with the previous
# 128 / 128 setting. 128 / 64 keeps the same total sequence length.
max_length = 128
max_prompt_length = 64

dpo_trainer = DPOTrainer(
    model,
    # No separate reference model is passed. NOTE(review): with a PEFT-wrapped
    # policy, TRL is documented to use the base model with adapters disabled as
    # the reference — confirm for the installed version.
    ref_model=None,
    args=training_args,
    beta=0.1,
    # Small 1000-example subsets keep the run short.
    train_dataset=dataset["train"].select(range(1000)),
    eval_dataset=dataset["test"].select(range(1000)),
    tokenizer=tokenizer,
    max_length=max_length,
    max_prompt_length=max_prompt_length,
)
dpo_trainer.train()

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
