In [1]:
# ===== Colab 3: RL with Preference Data (DPO with Unsloth) =====

!pip install -q "unsloth>=2025.3.0" "unsloth_zoo" "trl>=0.9.6" "datasets" "accelerate" "transformers>=4.44.0"

from unsloth import FastLanguageModel, PatchDPOTrainer, is_bfloat16_supported
PatchDPOTrainer()

from datasets import Dataset
from trl import DPOTrainer, DPOConfig
import torch

# 1) Tiny preference dataset
# Rows are written as (prompt, chosen, rejected) tuples and expanded into
# one dict per row, keyed by the column names below.
_PREF_COLUMNS = ("prompt", "chosen", "rejected")
_PREF_ROWS = (
    (
        "User: Write a Python function to add two numbers.\nAssistant:",
        "def add(a, b):\n    return a + b",
        "def add(a, b):\n    print(a + b)",
    ),
    (
        "User: Say hello in a warm and friendly way.\nAssistant:",
        "Hello! It's great to see you. How can I help today?",
        "Hello.",
    ),
)
pref_data = [dict(zip(_PREF_COLUMNS, row)) for row in _PREF_ROWS]

# Sequence-length budget passed to both the model loader and the LoRA wrapper.
max_seq_length = 512

# Wrap the in-memory preference records in a `datasets.Dataset`.
dataset = Dataset.from_list(pref_data)

# 2) Load model with QLoRA
# Options for the base checkpoint (loaded with 4-bit weights, not full fine-tuning).
base_model_options = {
    "model_name": "unsloth/SmolLM2-135M-Instruct",
    "max_seq_length": max_seq_length,
    "load_in_4bit": True,
    "full_finetuning": False,
    "dtype": None,
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_options)

# Options for the LoRA adapters attached on top of the quantized base model.
lora_options = {
    "r": 16,
    "lora_alpha": 16,
    "lora_dropout": 0,
    "bias": "none",
    "use_gradient_checkpointing": "unsloth",
    "max_seq_length": max_seq_length,
}
model = FastLanguageModel.get_peft_model(model, **lora_options)

# 3) DPO config
# Query bf16 support once so the two precision flags below can never disagree.
use_bf16 = is_bfloat16_supported()

training_args = DPOConfig(
    output_dir                  = "smollm2_dpo_rl",
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 2,
    num_train_epochs            = 1,
    learning_rate               = 1e-5,
    # 2 examples / (batch 1 x accumulation 2) = a single optimizer step.
    # Any non-zero warmup ratio rounds up to a 1-step warmup, which makes the
    # scheduler's multiplier 0 on that only step, so the adapters would never
    # be updated. Disable warmup for this tiny run; restore it (e.g. 0.1)
    # once the dataset yields many steps.
    warmup_ratio                = 0.0,
    logging_steps               = 1,
    save_strategy               = "no",
    fp16                        = not use_bf16,
    bf16                        = use_bf16,
    report_to                   = "none",
    # DPO-specific hyperparameters are fields of DPOConfig; set them here
    # rather than as DPOTrainer keyword arguments.
    beta                        = 0.1,   # strength of the pull towards the reference policy
    max_length                  = 256,   # cap on prompt + completion tokens
    max_prompt_length           = 128,   # cap on prompt tokens
)

dpo_trainer = DPOTrainer(
    model         = model,
    # No separate reference model is supplied.
    # NOTE(review): with a PEFT model TRL is documented to use the
    # adapter-disabled base weights as the reference - confirm for the
    # installed TRL version.
    ref_model     = None,
    args          = training_args,
    train_dataset = dataset,
    # NOTE(review): newer TRL releases name this argument `processing_class`;
    # `tokenizer` is kept for compatibility with the pinned `trl>=0.9.6`.
    tokenizer     = tokenizer,
)

dpo_trainer.train()

# 4) Inference demo
# Put the Unsloth model into its inference path so generation can use the KV
# cache, then make sure the module is in eval mode.
FastLanguageModel.for_inference(model)
model.eval()

# Same raw "User/Assistant" format as the training prompts.
prompt = "User: Say hello in a warm and friendly way.\nAssistant:"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens = 60,
        do_sample      = True,
        temperature    = 0.7,
        # Reuse cached keys/values; with the cache disabled every new token
        # re-encodes the entire prefix.
        use_cache      = True,
    )

# The decoded text contains the prompt followed by the sampled continuation.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.3/64.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.7/358.7 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.4/284.4 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m134.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Unsloth 2025.11.4 patched 30 layers with 30 QKV layers, 30 O layers and 30 MLP layers.
num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.


Extracting prompt in train dataset (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.


Applying chat template to train dataset (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.


Tokenizing train dataset (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2 | Num Epochs = 1 | Total steps = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 2 x 1) = 2
 "-____-"     Trainable parameters = 4,884,480 of 139,400,064 (3.50% trained)


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
1,0.6931,0.0,0.0,0.0,0.0,-26.608986,-21.202871,3.105672,3.208227,0,0,0


User: Say hello in a warm and friendly way.
Assistant: And go! Hello!

I've added a small amount of commas at the beginning of the sentence to make it sound a bit more natural. Let's see how it sounds:

Hello! Hello! Hello! Hello! Hello! Hello! Hello! Hello! Hello! Hello! Hello
