In [None]:
# !pip install -U trl
# !pip install -U transformers
# !pip install -U accelerate
# !pip install -U bitsandbytes
import os

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.trainer_utils import get_last_checkpoint
from trl import DPOTrainer, DPOConfig

In [None]:
# Checkpoint that serves as the starting policy for DPO.
model_id = "HuggingFaceTB/smollm2-135M-SFT-Only"

# Load the weights in half precision; device placement is left to the loader
# via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Fall back to EOS for padding only when the tokenizer ships without a pad
# token, so a dedicated pad token (if one exists) is not overwritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Preference dataset; only the "train" split is requested.
dataset_name = "Intel/orca_dpo_pairs"
dataset = load_dataset(path=dataset_name, split="train")

In [None]:
def format_for_dpo(example):
    """Convert one raw preference record into prompt/chosen/rejected form.

    Args:
        example: Mapping with "system", "question", "chosen" and "rejected"
            entries.

    Returns:
        dict with:
            "prompt":   the system (when non-empty) and user turns rendered
                        through the tokenizer's chat template, ending with the
                        generation prompt so the completion continues as the
                        assistant turn.
            "chosen":   the preferred answer, passed through unchanged.
            "rejected": the dispreferred answer, passed through unchanged.
    """
    prompt_messages = []

    # Skip the system turn when the record has no system text; rendering an
    # empty system message only adds a blank turn to the prompt.
    system_text = example["system"]
    if system_text:
        prompt_messages.append({"role": "system", "content": system_text})
    prompt_messages.append({"role": "user", "content": example["question"]})

    # add_generation_prompt=True appends the assistant header, so the prompt
    # ends exactly where the chosen/rejected completion is expected to begin.
    prompt = tokenizer.apply_chat_template(
        prompt_messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    return {
        "prompt": prompt,
        "chosen": example["chosen"],
        "rejected": example["rejected"]
    }

# Apply the formatter to every record of the loaded split.
train_dataset = dataset.map(function=format_for_dpo)

In [None]:
# LoRA adapter configuration: rank-16 adapters attached to all linear layers.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules="all-linear"
)

# DPO hyperparameters. The sequence-length cap is set here on the config
# rather than passed to the trainer; recent TRL releases no longer accept
# `max_length` as a DPOTrainer keyword.
training_args = DPOConfig(
    output_dir="./smollm2-dpo",
    beta=0.1,
    max_length=1024,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=100,
    fp16=True,
    remove_unused_columns=False
)

# No explicit reference model is passed. NOTE(review): per the TRL docs, with
# a peft_config the adapter-disabled base model acts as the reference —
# confirm against the installed TRL version.
trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=tokenizer,
    peft_config=peft_config
)

In [None]:
print("Starting DPO training...")

# Resume only when an earlier run left a checkpoint in the output directory.
# On a first run there is none, and `resume_from_checkpoint=True` would raise
# instead of starting from scratch.
checkpoint_dir = trainer.args.output_dir
last_checkpoint = (
    get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None
)
if last_checkpoint is not None:
    print(f"Resuming from checkpoint: {last_checkpoint}")
trainer.train(resume_from_checkpoint=last_checkpoint)

trainer.save_model("./smollm2-dpo-final")
print("Training complete and model saved.")

In [None]:
import shutil
from google.colab import files

# Zip the saved model directory into smollm2-dpo-final.zip, then hand the
# archive to the Colab download helper.
shutil.make_archive(
    base_name='smollm2-dpo-final',
    format='zip',
    root_dir='./smollm2-dpo-final',
)

files.download('smollm2-dpo-final.zip')