# Evaluation Pipeline

This notebook demonstrates how to use RM-Gallery's evaluation pipeline for benchmarking reward models.

## Setup

Import the necessary evaluation modules:

In [None]:
import json

import matplotlib.pyplot as plt
import seaborn as sns

from rm_gallery.gallery.evaluation import RewardBench, JudgeBench
from rm_gallery.gallery.rm import RewardModel

## Initialize Reward Model

Load a reward model for evaluation:

In [None]:
# Settings for the reward model under evaluation, gathered in one place so
# they are easy to tweak.
rm_config = {
    "model_name": "Skywork/Skywork-Reward-Llama-3.1-8B",
    "device": "cuda",
    "batch_size": 8,
}

# Instantiate the model; later cells refer to it as `rm`.
rm = RewardModel(**rm_config)

print(f"Model loaded: {rm.model_name}")

## RewardBench Evaluation

Evaluate on the RewardBench benchmark:

In [None]:
# Build the RewardBench evaluator, restricted to the listed subsets.
# NOTE(review): these subset names look like RewardBench v1 categories while
# benchmark_name is "reward-bench-2" — confirm they match the benchmark data.
rewardbench_subsets = ["chat", "chat_hard", "safety", "reasoning"]
rewardbench = RewardBench(
    benchmark_name="reward-bench-2",
    subset=rewardbench_subsets,
)

# Score the reward model; `results` is consumed below as category -> score pairs.
results = rewardbench.evaluate(rm)

# Assemble the report (header, separator, one line per category) and print it once.
report_lines = ["\nRewardBench Results:", "=" * 50]
report_lines.extend(
    f"{category:20s}: {score:.2f}%" for category, score in results.items()
)
print("\n".join(report_lines))

## JudgeBench Evaluation

Evaluate on the JudgeBench benchmark for judge models:

In [None]:
# Build the JudgeBench evaluator.
judgebench = JudgeBench(benchmark_name="judgebench")

# Run the same reward model through it; the result is indexed by metric name below.
judge_results = judgebench.evaluate(rm)

# Report the two metrics read from the result, in a fixed order.
print("\nJudgeBench Results:")
print("=" * 50)
metric_labels = (
    ("Overall Accuracy", "accuracy"),
    ("Consistency Score", "consistency"),
)
for label, key in metric_labels:
    print(f"{label}: {judge_results[key]:.2f}%")

## Visualize Results

Create visualizations of the evaluation results:

In [None]:
# Split the RewardBench results into parallel lists for plotting.
categories = list(results.keys())
scores = list(results.values())

# Draw on an explicit Axes instead of relying on pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(10, 6))

# Colour each bar by its category through `hue`: seaborn >= 0.13 deprecates
# passing `palette` without `hue`. `dodge=False` keeps one full-width bar per
# category instead of reserving a slot for every hue level.
sns.barplot(
    x=categories,
    y=scores,
    hue=categories,
    palette="viridis",
    dodge=False,
    ax=ax,
)

# A hue legend would only repeat the x tick labels, so drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title("RewardBench Performance by Category", fontsize=16)
ax.set_xlabel("Category", fontsize=12)
ax.set_ylabel("Accuracy (%)", fontsize=12)
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

## Custom Evaluation Dataset

Evaluate on your own custom dataset:

In [None]:
# Preference pairs: each entry has a prompt, a "chosen" response and a
# "rejected" response. A pair counts as correct when the reward model scores
# the chosen response strictly higher than the rejected one.
custom_data = [
    {
        "prompt": "Explain quantum entanglement",
        "chosen": "Quantum entanglement is a phenomenon where particles...",
        "rejected": "It's a quantum thing.",
    },
    {
        "prompt": "Write a Python function to sort a list",
        "chosen": "def sort_list(lst):\n    return sorted(lst)",
        "rejected": "use sort()",
    },
]

# Count the correctly ranked pairs (chosen is scored first, then rejected).
correct = sum(
    1
    for pair in custom_data
    if rm.score(pair["prompt"], pair["chosen"])
    > rm.score(pair["prompt"], pair["rejected"])
)

# Convert the fraction of correct pairs into a percentage.
fraction_correct = correct / len(custom_data)
accuracy = fraction_correct * 100
print(f"\nCustom Dataset Accuracy: {accuracy:.2f}%")

## Compare Multiple Models

Compare performance across different reward models:

In [None]:
# Reward models to compare, by model identifier.
model_names = [
    "Skywork/Skywork-Reward-Llama-3.1-8B",
    "Ray2333/GRM-Llama3-8B-rewardmodel-ft",
    "sfairXC/FsfairX-LLaMA3-RM-v0.1"
]

# Evaluate each model with the RewardBench evaluator built earlier and keep its
# "overall" score, keyed by the part of the identifier after the last "/".
comparison_results = {}
for model_name in model_names:
    print(f"\nEvaluating {model_name}...")
    test_rm = RewardModel(model_name=model_name, device="cuda")
    # Use a dedicated name: rebinding `results` here would silently replace the
    # RewardBench scores of `rm` that other cells read (e.g. the export cell).
    model_results = rewardbench.evaluate(test_rm)
    comparison_results[model_name.split("/")[-1]] = model_results["overall"]
    # Drop the reference before the next model is constructed so this one can
    # be garbage-collected instead of staying alive alongside its successor.
    del test_rm

# Display comparison
print("\nModel Comparison:")
print("=" * 50)
for model, score in comparison_results.items():
    print(f"{model:40s}: {score:.2f}%")

## Export Results

Save evaluation results to a file:

In [None]:
# Gather everything measured for `rm` into a single record.
# `results` is expected to hold the RewardBench scores of `rm`; re-run the
# RewardBench cell first if another cell has reassigned it since.
output = {
    "model": rm.model_name,
    "rewardbench": results,
    "judgebench": judge_results,
    "custom_accuracy": accuracy
}

# Write the record as indented JSON; the encoding is explicit so the file does
# not depend on the platform's default text encoding.
output_path = "evaluation_results.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)

print(f"Results saved to {output_path}")