In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset, concatenate_datasets



In [2]:
# Load preference dataset
dataset = load_dataset("Dahoas/full-hh-rlhf")

In [3]:
''# Preprocess
def preprocess_data(example):
    return {"text": example["chosen"], "label": float(1)}

def preprocess_rejected(example):
    return {"text": example["rejected"], "label": float(0)}


chosen_dataset = dataset["train"].map(preprocess_data)
rejected_dataset = dataset["train"].map(preprocess_rejected)

# Correctly merge them
reward_dataset = concatenate_datasets([chosen_dataset, rejected_dataset])



In [4]:
reward_dataset

Dataset({
    features: ['prompt', 'response', 'chosen', 'rejected', 'text', 'label'],
    num_rows: 224104
})

In [5]:
len(reward_dataset)

224104

In [6]:
type(reward_dataset)

datasets.arrow_dataset.Dataset

In [7]:
# Load base model and tokenizer
base_model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=1)

# Apply LoRA to model
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "key", "value", "dense", "intermediate.dense", "output.dense"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Tokenize dataset
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

reward_dataset = reward_dataset.map(tokenize_function, batched=True)
reward_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,930,753 || all params: 126,589,442 || trainable%: 1.5252


In [8]:
reward_dataset

Dataset({
    features: ['prompt', 'response', 'chosen', 'rejected', 'text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 224104
})

In [9]:
reward_dataset_small = reward_dataset.shuffle(seed=42).select(range(10000))

In [10]:
reward_dataset_small

Dataset({
    features: ['prompt', 'response', 'chosen', 'rejected', 'text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 10000
})

In [11]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./lora_reward_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_strategy="epoch",
    report_to="none",
    fp16=True
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=reward_dataset_small,
)

# Train model
trainer.train()

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
50,0.298
100,0.2729
150,0.2648
200,0.2636
250,0.2596
300,0.2545
350,0.256
400,0.2561
450,0.2549
500,0.2565


TrainOutput(global_step=1250, training_loss=0.25723645324707034, metrics={'train_runtime': 366.0877, 'train_samples_per_second': 54.632, 'train_steps_per_second': 3.414, 'total_flos': 5381554298880000.0, 'train_loss': 0.25723645324707034, 'epoch': 2.0})

In [21]:
# Correct way
model.save_pretrained("./lora_reward_model-final", save_adapter=True)
tokenizer.save_pretrained("./lora_reward_model-final")

('./lora_reward_model-final/tokenizer_config.json',
 './lora_reward_model-final/special_tokens_map.json',
 './lora_reward_model-final/vocab.json',
 './lora_reward_model-final/merges.txt',
 './lora_reward_model-final/added_tokens.json',
 './lora_reward_model-final/tokenizer.json')

In [22]:
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSdpaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): Mo

In [23]:
prompt = "Explain the theory of relativity."
good_response = "Let's think step-by-step. Relativity consists of two theories by Einstein: special and general relativity..."
bad_response = "It's something about fast cars or something."

In [24]:
model = model.eval()

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [26]:
full_text = prompt + "\n" + good_response
inputs = tokenizer(full_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

In [27]:
model(**inputs)

SequenceClassifierOutput(loss={'logits': tensor([[0.5210]], device='cuda:0', grad_fn=<ToCopyBackward0>)}, logits=tensor([[0.5210]], device='cuda:0', grad_fn=<ToCopyBackward0>), hidden_states=None, attentions=None)

In [28]:
full_text = prompt + "\n" + bad_response
inputs = tokenizer(full_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

In [29]:
model(**inputs)

SequenceClassifierOutput(loss={'logits': tensor([[0.4998]], device='cuda:0', grad_fn=<ToCopyBackward0>)}, logits=tensor([[0.4998]], device='cuda:0', grad_fn=<ToCopyBackward0>), hidden_states=None, attentions=None)

In [35]:
torch.sigmoid(torch.tensor([0.521, 0.4998]))

tensor([0.6274, 0.6224])

In [31]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("OpenAssistant/debertareward-model-deberta-v3-large-v2")
reward_model = AutoModelForSequenceClassification.from_pretrained(
    "OpenAssistant/reward-model-deberta-v3-large-v2",
    num_labels=1
).to("cuda" if torch.cuda.is_available() else "cpu")
reward_model.eval()

# Scoring function
def score_response(prompt, response, model, tokenizer):
    full_text = prompt + "\n" + response
    inputs = tokenizer(full_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        score = outputs.logits.squeeze().item()
    return score



tokenizer_config.json:   0%|          | 0.00/455 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/993 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

In [32]:


good_score = score_response(prompt, good_response, reward_model, tokenizer)
bad_score = score_response(prompt, bad_response, reward_model, tokenizer)

print(f"✅ Good Response Score: {good_score:.4f}")
print(f"❌ Bad Response Score: {bad_score:.4f}")


✅ Good Response Score: -0.5243
❌ Bad Response Score: -4.5390


In [34]:
torch.sigmoid(torch.tensor([good_score, bad_score]))

tensor([0.3719, 0.0106])