In [None]:
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Load GPT-2 twice: one copy is the trainable policy, the other the frozen
# reference that the DPO loss compares against.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the pad token so that `padding=True` has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

# Policy model (trainable)
policy_model = AutoModelForCausalLM.from_pretrained(model_name)
policy_model.train()
# Train mode enables dropout, which makes every policy forward pass
# stochastic. Policy and reference start from identical weights, so the DPO
# loss should begin at exactly log(2); with dropout active it is just noise.
# Zero the dropout probability so the log-probabilities are deterministic.
for module in policy_model.modules():
    if isinstance(module, nn.Dropout):
        module.p = 0.0

# Reference model (fixed): freeze its parameters so no gradient can ever be
# accumulated on them, and switch to eval mode.
reference_model = AutoModelForCausalLM.from_pretrained(model_name)
reference_model.requires_grad_(False)
reference_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# One preference pair: a shared prompt with a preferred and a dispreferred
# continuation.
sample = {
    "prompt": "The weather today is",
    "chosen": " sunny and warm.",
    "rejected": " not a banana.",
}
prompt = sample["prompt"]

# Encode prompt + completion for each side of the pair as PyTorch tensors.
chosen, rejected = (
    tokenizer(prompt + sample[side], return_tensors="pt", padding=True)
    for side in ("chosen", "rejected")
)

In [None]:
# Policy forward passes: run with autograd enabled so the loss can
# backpropagate into the policy weights.
policy_chosen_outputs, policy_rejected_outputs = (
    policy_model(**batch) for batch in (chosen, rejected)
)

# Reference forward passes: wrapped in no_grad, so no graph is recorded.
with torch.no_grad():
    ref_chosen_outputs, ref_rejected_outputs = (
        reference_model(**batch) for batch in (chosen, rejected)
    )

In [None]:
# Sequence log-probability of the input tokens under a causal LM
def compute_log_prob(outputs, inputs):
    """Sum the log-probabilities the model assigns to each next token.

    Args:
        outputs: Model output exposing ``.logits`` of shape
            ``(batch, seq_len, vocab)``.
        inputs: Mapping with ``"input_ids"`` of shape ``(batch, seq_len)`` and,
            optionally, ``"attention_mask"`` of the same shape.

    Returns:
        Tensor of shape ``(batch,)``: for each sequence, the sum over
        positions ``t >= 1`` of ``log p(input_ids[t] | input_ids[:t])``.
        Positions whose attention mask is 0 are excluded. Every unmasked
        token is counted, prompt tokens included.
    """
    # Logits at position t predict token t + 1, so drop the last logit and
    # the first label to align them.
    logits = outputs.logits[:, :-1, :]
    labels = inputs["input_ids"][:, 1:]
    token_log_probs = torch.gather(
        logits.log_softmax(dim=-1), dim=2, index=labels.unsqueeze(-1)
    ).squeeze(-1)

    # Padding must not contribute to the sequence score. Label ids cannot be
    # used to detect it when the pad token is aliased to EOS, so rely on the
    # attention mask whenever the tokenizer provided one.
    attention_mask = inputs.get("attention_mask")
    if attention_mask is not None:
        label_mask = attention_mask[:, 1:].to(token_log_probs.dtype)
        token_log_probs = token_log_probs * label_mask

    return token_log_probs.sum(dim=1)


# Sequence log-probabilities for both completions under the policy ...
policy_chosen_logp, policy_rejected_logp = (
    compute_log_prob(policy_chosen_outputs, chosen),
    compute_log_prob(policy_rejected_outputs, rejected),
)
# ... and under the reference model.
ref_chosen_logp, ref_rejected_logp = (
    compute_log_prob(ref_chosen_outputs, chosen),
    compute_log_prob(ref_rejected_outputs, rejected),
)

In [None]:
# DPO loss: -log sigmoid(beta * (chosen log-ratio - rejected log-ratio)),
# where each log-ratio is policy log-prob minus reference log-prob.
beta = 0.1

chosen_logratio = policy_chosen_logp - ref_chosen_logp
rejected_logratio = policy_rejected_logp - ref_rejected_logp
diff = chosen_logratio - rejected_logratio

dpo_loss = -torch.nn.functional.logsigmoid(beta * diff).mean()
dpo_loss

tensor(1.0762, grad_fn=<NegBackward0>)