In [1]:
!pip install transformers
!pip install trl
!pip install datasets
!pip install torch
!pip install peft

Collecting trl
  Downloading trl-0.19.1-py3-none-any.whl.metadata (10 kB)
Collecting datasets>=3.0.0 (from trl)
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets>=3.0.0->trl)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate>=1.4.0->trl)
  Downloading 

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import DPOConfig, DPOTrainer
from datasets import load_dataset
import torch
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [3]:
train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train[:15000]")
test_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train[15000:16000]")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/643 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/131M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/62135 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [4]:
row = train_dataset[8]
print(f"User Prompt: {row['chosen'][0]['content']}\n")
print(f"Chosen assistant answer: {row['chosen'][1]['content']}\n")
print(f"Rejected assistant answer: {row['rejected'][1]['content']}\n")
print(f"Chosen answer score is: {row['score_chosen']}")
print(f"Rejected answer score is: {row['score_rejected']}")

User Prompt: Please describe the structure of the following code:

```XML


```

Chosen assistant answer: The code you provided appears to be an XML document, specifically an XML comment. In programming languages that support XML comments, such as C#, Java, and C++, this type of comment is used to add documentation and annotations to code.

An XML comment has the following basic structure:

1. An opening comment tag: `<!--`
2. The comment text: any text or markup within the comment. In this case, there is no additional comment text.
3. A closing comment tag: `-->`

The comment section you provided only contains the opening and closing tags of an XML comment, without any actual content inside.

Rejected assistant answer: I'm sorry but I can't do that

Chosen answer score is: 5.0
Rejected answer score is: 2.0


In [9]:
model_id = "Qwen/Qwen2.5-0.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [10]:
model = prepare_model_for_kbit_training(model)

In [11]:
lora_config = LoraConfig(
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM")

In [12]:
model = get_peft_model(model, lora_config)

In [13]:
model.print_trainable_parameters()

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


In [14]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [15]:
training_args = DPOConfig(
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 8,
    num_train_epochs=2,
    learning_rate=5e-6,
    warmup_ratio=0.1,
    weight_decay=0.01,
    optim = "adamw_8bit",
    lr_scheduler_type = "linear",
    seed = 42,
    output_dir="Qwen2-0.5B-DPO",
    max_length = 512,
    max_prompt_length = 512,
    gradient_checkpointing = True,
    bf16 = True,
    remove_unused_columns=True,
    eval_steps=100,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    report_to = "none")

In [16]:
trainer = DPOTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset)

Extracting prompt in train dataset:   0%|          | 0/15000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/15000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/15000 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()



Step,Training Loss
10,0.6871
20,0.6893
30,0.6946
40,0.7042
50,0.6948
60,0.6964
70,0.6864
80,0.6951
90,0.6954
100,0.6913


