diff --git a/docs/benchmarking/Jailbreak_roc_curves.png b/docs/benchmarking/Jailbreak_roc_curves.png
new file mode 100644
index 0000000..98f15f3
Binary files /dev/null and b/docs/benchmarking/Jailbreak_roc_curves.png differ
diff --git a/docs/benchmarking/jailbreak_roc_curve.png b/docs/benchmarking/jailbreak_roc_curve.png
deleted file mode 100644
index 82bafd9..0000000
Binary files a/docs/benchmarking/jailbreak_roc_curve.png and /dev/null differ
diff --git a/docs/ref/checks/jailbreak.md b/docs/ref/checks/jailbreak.md
index 027f096..6f4a40c 100644
--- a/docs/ref/checks/jailbreak.md
+++ b/docs/ref/checks/jailbreak.md
@@ -96,37 +96,40 @@ When conversation history is available (e.g., in chat applications or agent work
 
 ### Dataset Description
 
-This benchmark evaluates model performance on a diverse set of prompts:
+This benchmark combines multiple public datasets and synthetic benign conversations:
 
-- **Subset of the open source jailbreak dataset [JailbreakV-28k](https://huggingface.co/datasets/JailbreakV-28K/JailBreakV-28k)** (n=2,000)
-- **Synthetic prompts** covering a diverse range of benign topics (n=1,000)
-- **Open source [Toxicity](https://github.com/surge-ai/toxicity/blob/main/toxicity_en.csv) dataset** containing harmful content that does not involve jailbreak attempts (n=1,000)
+- **Red Queen jailbreak corpus ([GitHub](https://github.com/kriti-hippo/red_queen/blob/main/Data/Red_Queen_Attack.zip))**: 14,000 positive samples collected with gpt-4o attacks.
+- **Tom Gibbs multi-turn jailbreak attacks ([Hugging Face](https://huggingface.co/datasets/tom-gibbs/multi-turn_jailbreak_attack_datasets/tree/main))**: 4,136 positive samples.
+- **Scale MHJ dataset ([Hugging Face](https://huggingface.co/datasets/ScaleAI/mhj))**: 537 positive samples.
+- **Synthetic benign conversations**: 12,433 negative samples generated by seeding prompts from [WildGuardMix](https://huggingface.co/datasets/allenai/wildguardmix) where `adversarial=false` and `prompt_harm_label=false`, then expanding each single-turn input into five-turn dialogues using gpt-4.1.
 
-**Total n = 4,000; positive class prevalence = 2,000 (50.0%)**
+**Total n = 31,106; positives = 18,673; negatives = 12,433**
+
+For benchmarking, we randomly sampled 4,000 conversations from this pool using a 50/50 split between positive and negative samples.
 
 ### Results
 
 #### ROC Curve
 
-![ROC Curve](../../benchmarking/jailbreak_roc_curve.png)
+![ROC Curve](../../benchmarking/Jailbreak_roc_curves.png)
 
 #### Metrics Table
 
 | Model | ROC AUC | Prec@R=0.80 | Prec@R=0.90 | Prec@R=0.95 | Recall@FPR=0.01 |
 |--------------|---------|-------------|-------------|-------------|-----------------|
-| gpt-5 | 0.982 | 0.984 | 0.977 | 0.977 | 0.743 |
-| gpt-5-mini | 0.980 | 0.980 | 0.976 | 0.975 | 0.734 |
-| gpt-4.1 | 0.979 | 0.975 | 0.975 | 0.975 | 0.661 |
-| gpt-4.1-mini (default) | 0.979 | 0.974 | 0.972 | 0.972 | 0.654 |
+| gpt-5 | 0.994 | 0.993 | 0.993 | 0.993 | 0.997 |
+| gpt-5-mini | 0.813 | 0.832 | 0.832 | 0.832 | 0.000 |
+| gpt-4.1 | 0.999 | 0.999 | 0.999 | 0.999 | 1.000 |
+| gpt-4.1-mini (default) | 0.928 | 0.968 | 0.968 | 0.500 | 0.000 |
 
 #### Latency Performance
 
 | Model | TTC P50 (ms) | TTC P95 (ms) |
 |--------------|--------------|--------------|
-| gpt-5 | 4,569 | 7,256 |
-| gpt-5-mini | 5,019 | 9,212 |
-| gpt-4.1 | 841 | 1,861 |
-| gpt-4.1-mini | 749 | 1,291 |
+| gpt-5 | 7,370 | 12,218 |
+| gpt-5-mini | 7,055 | 11,579 |
+| gpt-4.1 | 2,998 | 4,204 |
+| gpt-4.1-mini | 1,538 | 2,089 |
 
 **Notes:**
 
diff --git a/src/guardrails/evals/core/visualizer.py b/src/guardrails/evals/core/visualizer.py
index 07f006a..050c87c 100644
--- a/src/guardrails/evals/core/visualizer.py
+++ b/src/guardrails/evals/core/visualizer.py
@@ -12,6 +12,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import seaborn as sns
+from sklearn.metrics import roc_auc_score, roc_curve
 
 logger = logging.getLogger(__name__)
 
 
@@ -111,10 +112,8 @@ def create_roc_curves(self, results_by_model: dict[str, list[Any]], guardrail_na
                 continue
 
             try:
-                from sklearn.metrics import roc_curve
-
                 fpr, tpr, _ = roc_curve(y_true, y_scores)
-                roc_auc = np.trapz(tpr, fpr)
+                roc_auc = roc_auc_score(y_true, y_scores)
                 ax.plot(fpr, tpr, label=f"{model_name} (AUC = {roc_auc:.3f})", linewidth=2)
             except Exception as e:
                 logger.error("Failed to calculate ROC curve for model %s: %s", model_name, e)
@@ -144,15 +143,25 @@ def _extract_roc_data(self, results: list[Any], guardrail_name: str) -> tuple[li
         y_scores = []
 
         for result in results:
-            if guardrail_name in result.expected_triggers:
-                expected = result.expected_triggers[guardrail_name]
-                actual = result.triggered.get(guardrail_name, False)
+            if guardrail_name not in result.expected_triggers:
+                logger.warning("Guardrail '%s' not found in expected_triggers for sample %s", guardrail_name, result.id)
+                continue
 
-                y_true.append(1 if expected else 0)
-                y_scores.append(1 if actual else 0)
+            expected = result.expected_triggers[guardrail_name]
+            y_true.append(1 if expected else 0)
+            y_scores.append(self._get_confidence_score(result, guardrail_name))
 
         return y_true, y_scores
 
+    def _get_confidence_score(self, result: Any, guardrail_name: str) -> float:
+        """Extract the model-reported confidence score for plotting."""
+        if guardrail_name in result.details:
+            guardrail_details = result.details[guardrail_name]
+            if isinstance(guardrail_details, dict) and "confidence" in guardrail_details:
+                return float(guardrail_details["confidence"])
+
+        return 1.0 if result.triggered.get(guardrail_name, False) else 0.0
+
     def create_latency_comparison_chart(self, latency_results: dict[str, dict[str, Any]]) -> Path:
         """Create a chart comparing latency across models."""
         fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
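
For context on the dataset change above, here is a minimal sketch of how a balanced 4,000-conversation benchmark subset could be drawn from the combined 31,106-sample pool. This is illustrative only: the file path, field names (`label`), and helper function are assumptions, not part of this PR.

```python
# Hypothetical sketch: draw the 4,000-conversation benchmark subset with a
# 50/50 positive/negative split from the merged pool described in the docs.
import json
import random


def sample_balanced(pool: list[dict], n_total: int = 4000, seed: int = 0) -> list[dict]:
    """Randomly sample n_total conversations, half positive and half negative."""
    rng = random.Random(seed)
    positives = [c for c in pool if c["label"] == 1]  # "label" is an assumed field name
    negatives = [c for c in pool if c["label"] == 0]
    half = n_total // 2
    return rng.sample(positives, half) + rng.sample(negatives, half)


if __name__ == "__main__":
    # Assumed path to the merged 31,106-sample pool (one JSON conversation per line).
    with open("combined_pool.jsonl") as f:
        pool = [json.loads(line) for line in f]
    subset = sample_balanced(pool)
    print(len(subset))  # 4000
```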
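And a small, self-contained sketch of the scoring path that the updated `_extract_roc_data` / `_get_confidence_score` implement: per-sample confidence feeds `roc_curve` and `roc_auc_score`, with the binary `triggered` flag used as a 0/1 fallback when no confidence is reported. The sample dicts below are illustrative, not the eval framework's real result schema.

```python
# Confidence-based ROC scoring: use the guardrail's reported confidence when
# available, otherwise fall back to the binary triggered flag (1.0 / 0.0).
from sklearn.metrics import roc_auc_score, roc_curve

samples = [
    {"expected": True, "triggered": True, "details": {"confidence": 0.92}},
    {"expected": True, "triggered": False, "details": {"confidence": 0.41}},
    {"expected": False, "triggered": False, "details": {"confidence": 0.08}},
    {"expected": False, "triggered": True, "details": {}},  # no confidence -> 0/1 fallback
]

y_true = [1 if s["expected"] else 0 for s in samples]
y_scores = [
    float(s["details"]["confidence"]) if "confidence" in s["details"]
    else (1.0 if s["triggered"] else 0.0)
    for s in samples
]

fpr, tpr, _ = roc_curve(y_true, y_scores)  # operating points for the ROC plot
auc = roc_auc_score(y_true, y_scores)      # threshold-free area under the curve
print(f"AUC = {auc:.3f}")
```

Scoring with graded confidences rather than the 0/1 trigger decision is what makes the plotted ROC curves trace out intermediate operating points instead of a single corner.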