In [2]:
# !pip install -U trl
# !pip install -U transformers
# !pip install -U accelerate
# !pip install -U bitsandbytes
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig
from trl import DPOTrainer, DPOConfig

In [3]:
model_id = "HuggingFaceTB/smollm2-135M-SFT-Only"

bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
dataset_name = "Intel/orca_dpo_pairs"
dataset = load_dataset(dataset_name, split="train")

In [5]:
def format_for_dpo(example):
    prompt_messages = [
        {"role": "system", "content": example["system"]},
        {"role": "user", "content": example["question"]}
    ]
    prompt = tokenizer.apply_chat_template(prompt_messages, tokenize=False)

    return {
        "prompt": prompt,
        "chosen": example["chosen"],
        "rejected": example["rejected"]
    }

train_dataset = dataset.map(format_for_dpo)

In [6]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules="all-linear"
)

training_args = DPOConfig(
    output_dir="./smollm2-dpo-8bit",
    beta=0.1,  # temperature for DPO loss
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
    num_train_epochs=1,\
    logging_steps=10,
    save_steps=100,
    fp16=True,
    remove_unused_columns=False
)

trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=tokenizer,
    peft_config=peft_config,
    # max_length=1024,
    # max_prompt_length=512,
)

In [10]:
print("Starting DPO training...")
trainer.train(resume_from_checkpoint=True)

trainer.save_model("./smollm2-dpo-8bit-final")
print("Training complete and model saved.")

Starting DPO training...




Step,Training Loss
910,0.0026
920,0.002
930,0.0091
940,0.0018
950,0.0022
960,0.0023
970,0.0032
980,0.0005
990,0.0063
1000,0.0009




Training complete and model saved.


In [11]:
from google.colab import files
import shutil

shutil.make_archive('smollm2-dpo-8bit-final', 'zip', './smollm2-dpo-8bit-final')

files.download('smollm2-dpo-8bit-final.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>