In [None]:
# !pip install -U trl
# !pip install -U transformers
# !pip install -U accelerate
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import LoraConfig, TaskType
from trl import RewardTrainer, RewardConfig

In [None]:
# --- Run configuration -------------------------------------------------------
# Where training checkpoints are written.
output_dir = "smollm2-reward-model"
# Hub dataset providing the preference pairs.
dataset_name = "Intel/orca_dpo_pairs"
# Hub checkpoint the reward model is initialised from.
model_id = "HuggingFaceTB/smollm2-135M-SFT-Only"

In [None]:
# Load the checkpoint with a sequence-classification head that has a single
# output (num_labels=1); that one logit serves as the scalar reward.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=1,
    device_map="auto"
)

# The generation KV cache is not needed for training, and gradient
# checkpointing is requested in the training arguments of this notebook.
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Fall back to EOS as the padding token only when the checkpoint does not ship
# one. Overwriting an existing pad token would make padding indistinguishable
# from genuine end-of-turn tokens for no benefit.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Keep the model config in sync with whatever the tokenizer pads with; the
# sequence-classification head relies on pad_token_id to find the last real
# token of each row in a padded batch.
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Fetch the "train" split of the preference dataset named in the config cell.
dataset = load_dataset(
    dataset_name,
    split="train",
)

def format_for_reward_model(example):
    """Render one preference pair as two chat-templated strings.

    The system prompt (when non-empty) and the user question form the shared
    prompt; each candidate answer is appended as an assistant turn and the
    whole conversation is rendered with the tokenizer's chat template, so the
    assistant turn gets the same role markers as every other turn instead of
    an ad-hoc "\\nAssistant: " prefix.

    Args:
        example: A mapping with the keys "system", "question", "chosen" and
            "rejected", each holding text.

    Returns:
        A dict with the keys "chosen" and "rejected", each mapped to the full
        rendered conversation ending in the corresponding answer.
    """
    prompt_messages = []

    # An empty/missing system prompt carries no information; leave the turn
    # out rather than rendering an empty system block.
    # NOTE(review): some chat templates insert their own default system prompt
    # when none is supplied — confirm that is acceptable for this model.
    system_text = example["system"]
    if system_text:
        prompt_messages.append({"role": "system", "content": system_text})
    prompt_messages.append({"role": "user", "content": example["question"]})

    formatted = {}
    for key in ("chosen", "rejected"):
        conversation = prompt_messages + [
            {"role": "assistant", "content": example[key]}
        ]
        # tokenize=False returns the rendered string instead of token ids.
        formatted[key] = tokenizer.apply_chat_template(conversation, tokenize=False)
    return formatted

# Apply the formatter to every row, spread over 4 worker processes; the
# returned "chosen"/"rejected" values replace the columns of the same name.
train_dataset = dataset.map(
    format_for_reward_model,
    num_proc=4,
)

In [None]:
# LoRA adapter settings for a sequence-classification task.
peft_config = LoraConfig(
    # Which layers receive adapters, and for what kind of head.
    target_modules="all-linear",
    task_type=TaskType.SEQ_CLS,
    # Adapter rank and scaling.
    r=16,
    lora_alpha=32,
    # Regularisation; bias terms are left without adapters.
    lora_dropout=0.05,
    bias="none",
)

# Hyper-parameters for the reward-model training run.
training_args = RewardConfig(
    # Optimisation schedule.
    num_train_epochs=1,
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    # Reward-specific settings.
    max_length=1024,
    center_rewards_coefficient=0.01,
    # Precision / memory.
    fp16=True,
    gradient_checkpointing=True,
    # Data handling.
    remove_unused_columns=False,
    # Logging and checkpointing cadence (in optimizer steps) and location.
    logging_steps=10,
    save_steps=100,
    output_dir=output_dir,
)

# Wire the model, tokenizer, formatted data, LoRA settings and training
# arguments together into a single trainer object.
trainer = RewardTrainer(
    args=training_args,
    model=model,
    peft_config=peft_config,
    train_dataset=train_dataset,
    processing_class=tokenizer,
)

In [None]:
# Run the training loop, then persist the result next to the checkpoint
# directory under "<output_dir>-final".
print("Starting Reward Model training...")
trainer.train()

trainer.save_model(f"{output_dir}-final")
print("Reward Model training complete.")

In [None]:
import shutil

# google.colab only exists inside a Colab runtime; degrade gracefully elsewhere
# so the notebook still runs top to bottom on a local or cluster kernel.
try:
    from google.colab import files
except ImportError:
    files = None

# Derive the directory from the configured output_dir (the training cell saves
# to output_dir + "-final") instead of repeating the literal name, so changing
# the config does not silently break this cell.
final_model_dir = output_dir + "-final"

# Zip the saved model directory; make_archive returns the path of the archive
# it created, named "<final_model_dir>.zip".
archive_path = shutil.make_archive(final_model_dir, 'zip', final_model_dir)

if files is not None:
    # Trigger a browser download of the archive from the Colab runtime.
    files.download(archive_path)
else:
    print(f"Not running in Colab; archive written to {archive_path}")