In [None]:
import sys
sys.path.append('..')

from src.evaluation.metrics import JailbreakEvaluator, print_evaluation_summary
from src.data.dataset_loader import DatasetLoader
from configs.models.model_config import MODELS, RECOMMENDED_MODELS
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's 'whitegrid' style to the figures drawn later in this notebook.
sns.set_style('whitegrid')
# IPython line magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Load Test Data

In [None]:
loader = DatasetLoader()

# Load test prompts: each JSONL record contributes its 'prompt' field.
test_path = '../data/processed/advbench/test.jsonl'
test_data = loader.load_jsonl(test_path)
test_prompts = [item['prompt'] for item in test_data]

# Fail fast with a readable message; otherwise the example print below
# would die with a bare IndexError on an empty dataset.
if not test_prompts:
    raise ValueError(f"No test prompts were loaded from {test_path!r}")

print(f"Loaded {len(test_prompts)} test prompts")
print(f"\nExample: {test_prompts[0]}")

## 2. Simulate Baseline Responses

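For demonstration, we simulate model responses by sampling from canned refusal and compliance templates instead of querying real models, so the metrics below only illustrate the evaluation pipeline.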

In [None]:
# Simulate different models with varying safety levels
import random

def simulate_model_responses(prompts, refusal_prob=0.5, seed=None):
    """Simulate model responses with a given refusal probability.

    One canned response is drawn per prompt: a refusal template with
    probability ``refusal_prob``, otherwise a compliance template. The
    prompt text itself is never inspected; only the number of prompts
    determines the output length.

    Args:
        prompts: Iterable of prompts; one response is produced per item.
        refusal_prob: Probability that a given response is a refusal.
            Values <= 0 never refuse; values >= 1 always refuse.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used, so identical arguments always yield identical
            responses and the global ``random`` state is left untouched.
            When ``None`` (the default), the module-level ``random``
            generator is used, exactly as before.

    Returns:
        A list of response strings, in prompt order.
    """
    safe_templates = [
        "I cannot help with that request as it could be harmful.",
        "I'm not able to assist with that.",
        "I must decline that request."
    ]

    unsafe_templates = [
        "Sure, here's what you need to know...",
        "I can help you with that. Here are the steps...",
        "Let me guide you through this process..."
    ]

    # The `random` module and a `random.Random` instance both expose
    # .random() and .choice(), so either can serve as the generator.
    rng = random if seed is None else random.Random(seed)

    responses = []
    for _ in prompts:
        # Draw order (refuse-or-not first, then the template) matches the
        # original so globally seeded runs produce the same sequence.
        templates = safe_templates if rng.random() < refusal_prob else unsafe_templates
        responses.append(rng.choice(templates))

    return responses

# Simulate 3 models with different safety levels.
# Seed the global RNG first so that Restart & Run All reproduces the same
# simulated responses (and therefore the same metrics and plots).
random.seed(42)

# Refusal probability per simulated model; dict order fixes the order in
# which the models consume random numbers.
refusal_probs = {
    'Unsafe Model': 0.2,
    'Baseline Model': 0.5,
    'Safe Model': 0.9,
}

# Every model answers the same first 50 test prompts.
models_simulated = {
    model_label: simulate_model_responses(test_prompts[:50], refusal_prob=prob)
    for model_label, prob in refusal_probs.items()
}

## 3. Evaluate Models

In [None]:
evaluator = JailbreakEvaluator()

# All simulated response lists were generated from the first 50 test
# prompts, so score each of them against that same slice.
eval_prompts = test_prompts[:50]

# Evaluate each model and print its summary right after it is scored.
results = {}
for name, model_responses in models_simulated.items():
    results[name] = evaluator.evaluate(prompts=eval_prompts, responses=model_responses)
    print_evaluation_summary(results[name], name)

## 4. Compare Models

In [None]:
# Table column -> key under each result's 'overall' section.
metric_columns = {
    'Refusal Rate': 'refusal_rate',
    'ASR': 'attack_success_rate',
    'Safety Score': 'safety_score',
}

# One row per model: its name followed by the headline metrics.
comparison_data = [
    {'Model': name, **{column: res['overall'][key] for column, key in metric_columns.items()}}
    for name, res in results.items()
]

comparison_df = pd.DataFrame(comparison_data)
comparison_df

In [None]:
# Visualize comparison: one bar-chart panel per metric, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (data column, y-axis label, title, extra bar() kwargs) for each panel.
# The first panel passes no color, so it keeps matplotlib's default.
panels = [
    ('Refusal Rate', 'Refusal Rate', 'Model Refusal Rates', {}),
    ('ASR', 'Attack Success Rate', 'Model Attack Success Rates', {'color': 'orange'}),
]

for ax, (column, ylabel, title, bar_kwargs) in zip(axes, panels):
    ax.bar(comparison_df['Model'], comparison_df[column], **bar_kwargs)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_ylim(0, 1)  # both panels share a fixed 0-1 y-range
    # Tilt the model names so the labels do not overlap.
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

## 5. Response Analysis

In [None]:
# Analyze response patterns: print a handful of evaluated examples per model.
for model_name, result in results.items():
    print(f"\n{model_name} - Sample Responses:\n")

    # Show examples of refusals and non-refusals (first five recorded examples)
    examples = result['examples'][:5]

    for i, ex in enumerate(examples, 1):
        # Green/red circle emoji written as escapes: the literals had been
        # corrupted into mojibake by a bad re-encoding, and escapes survive
        # any future change of file encoding.
        status = "\U0001F7E2 REFUSED" if ex['refused'] else "\U0001F534 COMPLIED"
        print(f"{i}. {status}")
        # Only the first 60 characters of the prompt are shown.
        print(f"   Prompt: {ex['prompt'][:60]}...")
        print(f"   Response: {ex['response']}")
        print()

## Next Steps

1. Train models with DPO: `python src/training/train_dpo.py`
2. Compare results before and after DPO in the next notebook
3. Run real evaluations with actual models