In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset, concatenate_datasets



In [None]:
# Load preference dataset
# `dataset` is indexed by split name further down (only "train" is used).
dataset = load_dataset("Dahoas/full-hh-rlhf")

In [None]:
# Preprocess: turn each preference pair into two scored text examples.
def _with_prompt(example, response_key):
    """Return the row's prompt (if it has one) followed by the chosen/rejected text.

    The reward model is later queried with "prompt + response" strings, so the
    training text should carry the prompt as well. This assumes rows follow the
    Dahoas/full-hh-rlhf layout, where `prompt` is a separate column and
    `chosen`/`rejected` hold only the reply -- TODO confirm against the dataset
    card. Rows without a `prompt` key fall back to the bare response.
    """
    prompt = example["prompt"] if "prompt" in example else ""
    return prompt + example[response_key]


def preprocess_data(example):
    """Build the positive example: prompt + chosen response, labelled 1.0."""
    return {"text": _with_prompt(example, "chosen"), "label": float(1)}


def preprocess_rejected(example):
    """Build the negative example: prompt + rejected response, labelled 0.0."""
    return {"text": _with_prompt(example, "rejected"), "label": float(0)}


# Run both preprocessors over the same "train" split: one pass yields the
# label-1.0 examples, the other the label-0.0 examples.
chosen_dataset = dataset["train"].map(preprocess_data)
rejected_dataset = dataset["train"].map(preprocess_rejected)

# Correctly merge them
# Chosen rows come first, then rejected rows; the data is shuffled later,
# when the training subset is selected.
reward_dataset = concatenate_datasets([chosen_dataset, rejected_dataset])



In [None]:
# Inspect the merged dataset (cell output only; no state change).
reward_dataset

In [None]:
# Row count after concatenating the chosen and rejected examples.
len(reward_dataset)

In [None]:
# Check which class concatenate_datasets returned.
type(reward_dataset)

In [None]:
# Load base model and tokenizer
# num_labels=1 requests a single output per sequence, used here as the reward score.
base_model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=1)

# Apply LoRA to model
# NOTE(review): if PEFT matches target_modules by name suffix, the bare "dense"
# entry may already cover "intermediate.dense" and "output.dense" (and possibly
# the classifier head's dense layer) -- confirm and trim the list if so.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "key", "value", "dense", "intermediate.dense", "output.dense"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

# Rebinds `model` to the PEFT-wrapped model; every later cell uses this wrapper.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Tokenize dataset
def tokenize_function(examples):
    """Tokenise the "text" field with the notebook-level `tokenizer`.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    texts = examples["text"]
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    return encoded

# Tokenise in batches; this rebinds `reward_dataset` to the tokenised version.
reward_dataset = reward_dataset.map(tokenize_function, batched=True)
# Expose only the listed columns, as torch tensors.
reward_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Shuffle with a fixed seed and keep the first 10000 rows as the training subset.
reward_dataset_small = reward_dataset.shuffle(seed=42).select(range(10000))

In [None]:
# Inspect the subset that will be passed to the Trainer.
reward_dataset_small

In [None]:
# Define training arguments
# fp16 is enabled only when a CUDA device is present: half-precision training
# needs a GPU, and the rest of this notebook already falls back to CPU.
training_args = TrainingArguments(
    output_dir="./lora_reward_model",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_strategy="epoch",
    report_to="none",
    fp16=torch.cuda.is_available()
)

# Initialize Trainer
# Only a training set is supplied; no evaluation dataset is configured.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=reward_dataset_small,
)

# Train model
trainer.train()

In [None]:
# Persist the trained model and its tokenizer side by side in one directory.
# NOTE(review): `save_adapter` does not look like a documented keyword of
# PEFT's save_pretrained -- confirm whether it has any effect or can be dropped.
model.save_pretrained("./lora_reward_model-final", save_adapter=True)
tokenizer.save_pretrained("./lora_reward_model-final")

In [None]:
# Display the model's module structure (cell output only).
model

In [None]:
# Hand-written sanity check: one prompt with a clearly better and a clearly
# worse answer. These three names are reused by the scoring cells below.
prompt = "Explain the theory of relativity."
good_response = "Let's think step-by-step. Relativity consists of two theories by Einstein: special and general relativity..."
bad_response = "It's something about fast cars or something."

In [None]:
# Put the model in evaluation mode before running inference.
model = model.eval()

In [None]:
# Use the GPU when one is available, otherwise the CPU; later cells move
# tokenised inputs onto this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# Good response: join prompt and response with a newline, tokenise to
# PyTorch tensors (truncated at 512 tokens) and move them onto `device`.
full_text = prompt + "\n" + good_response
inputs = tokenizer(full_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

In [None]:
# Forward pass on the good response; the cell displays the raw model output.
# NOTE(review): not wrapped in torch.no_grad(), so autograd tracking stays on --
# consider disabling it for inference.
model(**inputs)

In [None]:
# Bad response: same formatting and tokenisation as for the good response.
# Overwrites `full_text` and `inputs` from the previous cells.
full_text = prompt + "\n" + bad_response
inputs = tokenizer(full_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

In [None]:
# Forward pass on the bad response; compare the displayed logit with the good one.
# NOTE(review): not wrapped in torch.no_grad() -- consider disabling autograd here.
model(**inputs)

In [None]:
# Squash two scores through a sigmoid for comparison.
# NOTE(review): the numbers are hard-coded -- presumably copied by hand from the
# logits shown by the two forward passes above (TODO confirm); they will not
# update when the notebook is re-run.
torch.sigmoid(torch.tensor([0.521, 0.4998]))

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load tokenizer and model
# NOTE: this rebinds `tokenizer`, which until now held the roberta-base
# tokenizer; re-running earlier cells after this one would pick up the wrong one.
tokenizer = AutoTokenizer.from_pretrained("OpenAssistant/reward-model-deberta-v3-large")
reward_model = AutoModelForSequenceClassification.from_pretrained(
    "OpenAssistant/reward-model-deberta-v3-large",
    num_labels=1
).to("cuda" if torch.cuda.is_available() else "cpu")
# Evaluation mode: this model is only used for scoring, never trained here.
reward_model.eval()

# Scoring function
def score_response(prompt, response, model, tokenizer):
    """Score a response to a prompt with a single-logit reward model.

    Args:
        prompt: The prompt text.
        response: The candidate response text.
        model: Callable model exposing `.device` and returning an object with
            a `.logits` tensor that holds exactly one element.
        tokenizer: Callable returning a mapping of tensors for the given text.

    Returns:
        The model's logit for "prompt\\nresponse" as a Python float.
    """
    full_text = prompt + "\n" + response
    encoded = tokenizer(full_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Place tensors on the model's own device rather than a notebook-level
    # `device` variable, so the function works whichever cells have been run.
    inputs = {k: v.to(model.device) for k, v in encoded.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.logits.squeeze().item()



In [None]:


# Score the same prompt/response pair as above, this time with the
# OpenAssistant reward model and its tokenizer.
good_score = score_response(prompt, good_response, reward_model, tokenizer)
bad_score = score_response(prompt, bad_response, reward_model, tokenizer)

# Report both scores to four decimal places.
print(f"✅ Good Response Score: {good_score:.4f}")
print(f"❌ Bad Response Score: {bad_score:.4f}")


In [None]:
# Map the two raw scores into (0, 1) with a sigmoid for easier comparison.
torch.sigmoid(torch.tensor([good_score, bad_score]))