docs/benchmarking/NSFW_roc_curve.png (binary file modified)
docs/ref/checks/nsfw.md (10 changes: 6 additions & 4 deletions)

@@ -82,10 +82,12 @@ This benchmark evaluates model performance on a balanced set of social media posts

| Model | ROC AUC | Prec@R=0.80 | Prec@R=0.90 | Prec@R=0.95 | Recall@FPR=0.01 |
|--------------|---------|-------------|-------------|-------------|-----------------|
-| gpt-4.1 | 0.989 | 0.976 | 0.962 | 0.962 | 0.717 |
-| gpt-4.1-mini (default) | 0.984 | 0.977 | 0.977 | 0.943 | 0.653 |
-| gpt-4.1-nano | 0.952 | 0.972 | 0.823 | 0.823 | 0.429 |
-| gpt-4o-mini | 0.965 | 0.977 | 0.955 | 0.945 | 0.842 |
+| gpt-5 | 0.9532 | 0.9195 | 0.9096 | 0.9068 | 0.0339 |
+| gpt-5-mini | 0.9629 | 0.9321 | 0.9168 | 0.9149 | 0.0998 |
+| gpt-5-nano | 0.9600 | 0.9297 | 0.9216 | 0.9175 | 0.1078 |
+| gpt-4.1 | 0.9603 | 0.9312 | 0.9249 | 0.9192 | 0.0439 |
+| gpt-4.1-mini (default) | 0.9520 | 0.9180 | 0.9130 | 0.9049 | 0.0459 |
+| gpt-4.1-nano | 0.9502 | 0.9262 | 0.9094 | 0.9043 | 0.0379 |

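For reference, the threshold metrics in this table are derived from per-example confidence scores: Prec@R=0.80 is the precision at an operating point whose recall reaches at least 0.80, and Recall@FPR=0.01 is the recall achievable while keeping the false-positive rate at or below 1%. The sketch below is illustrative only and not part of this change; `y_true` and `y_score` are placeholder arrays standing in for the benchmark labels and model confidences.

```python
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_auc_score, roc_curve

# Placeholder data: 1 = NSFW, 0 = benign; scores are model confidences.
y_true = np.array([0, 0, 1, 1, 1, 0, 1, 0])
y_score = np.array([0.10, 0.40, 0.35, 0.80, 0.90, 0.20, 0.70, 0.05])

roc_auc = roc_auc_score(y_true, y_score)

# Precision at the best threshold that still reaches recall >= 0.80.
precision, recall, _ = precision_recall_curve(y_true, y_score)
prec_at_r80 = precision[recall >= 0.80].max()

# Recall (TPR) at the best threshold with false-positive rate <= 0.01.
fpr, tpr, _ = roc_curve(y_true, y_score)
recall_at_fpr01 = tpr[fpr <= 0.01].max()

print(f"ROC AUC={roc_auc:.3f}, Prec@R=0.80={prec_at_r80:.3f}, Recall@FPR=0.01={recall_at_fpr01:.3f}")
```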
**Notes:**

mkdocs.yml (3 changes: 2 additions & 1 deletion)

@@ -58,13 +58,14 @@ nav:
- "Streaming vs Blocking": streaming_output.md
- Tripwires: tripwires.md
- Checks:
- Prompt Injection Detection: ref/checks/prompt_injection_detection.md
- Contains PII: ref/checks/pii.md
- Custom Prompt Check: ref/checks/custom_prompt_check.md
- Hallucination Detection: ref/checks/hallucination_detection.md
- Jailbreak Detection: ref/checks/jailbreak.md
- Moderation: ref/checks/moderation.md
- NSFW Text: ref/checks/nsfw.md
- Off Topic Prompts: ref/checks/off_topic_prompts.md
- Prompt Injection Detection: ref/checks/prompt_injection_detection.md
- URL Filter: ref/checks/urls.md
- Evaluation Tool: evals.md
- API Reference:
src/guardrails/checks/text/hallucination_detection.py (52 changes: 31 additions & 21 deletions)

@@ -52,7 +52,13 @@
from guardrails.spec import GuardrailSpecMetadata
from guardrails.types import GuardrailLLMContextProto, GuardrailResult

from .llm_base import LLMConfig, LLMOutput, _invoke_openai_callable
from .llm_base import (
LLMConfig,
LLMErrorOutput,
LLMOutput,
_invoke_openai_callable,
create_error_result,
)

logger = logging.getLogger(__name__)

@@ -232,39 +238,43 @@ async def hallucination_detection(
)

except ValueError as e:
# Log validation errors but return safe default
# Log validation errors and use shared error helper
logger.warning(f"Validation error in hallucination_detection: {e}")
return GuardrailResult(
tripwire_triggered=False,
info={
"guardrail_name": "Hallucination Detection",
"flagged": False,
"confidence": 0.0,
error_output = LLMErrorOutput(
flagged=False,
confidence=0.0,
info={"error_message": f"Validation failed: {str(e)}"},
)
return create_error_result(
guardrail_name="Hallucination Detection",
analysis=error_output,
checked_text=candidate,
additional_info={
"threshold": config.confidence_threshold,
"reasoning": f"Validation failed: {str(e)}",
"hallucination_type": None,
"hallucinated_statements": None,
"verified_statements": None,
"threshold": config.confidence_threshold,
"error": str(e),
"checked_text": candidate, # Hallucination Detection doesn't modify text, pass through unchanged
},
)
except Exception as e:
# Log unexpected errors and return safe default
# Log unexpected errors and use shared error helper
logger.exception("Unexpected error in hallucination_detection")
return GuardrailResult(
tripwire_triggered=False,
info={
"guardrail_name": "Hallucination Detection",
"flagged": False,
"confidence": 0.0,
error_output = LLMErrorOutput(
flagged=False,
confidence=0.0,
info={"error_message": str(e)},
)
return create_error_result(
guardrail_name="Hallucination Detection",
analysis=error_output,
checked_text=candidate,
additional_info={
"threshold": config.confidence_threshold,
"reasoning": f"Analysis failed: {str(e)}",
"hallucination_type": None,
"hallucinated_statements": None,
"verified_statements": None,
"threshold": config.confidence_threshold,
"error": str(e),
"checked_text": candidate, # Hallucination Detection doesn't modify text, pass through unchanged
},
)

src/guardrails/checks/text/llm_base.py (64 changes: 49 additions & 15 deletions)

@@ -60,7 +60,13 @@ class MyLLMOutput(LLMOutput):
logger = logging.getLogger(__name__)


__all__ = ["LLMConfig", "LLMOutput", "LLMErrorOutput", "create_llm_check_fn"]
__all__ = [
"LLMConfig",
"LLMOutput",
"LLMErrorOutput",
"create_llm_check_fn",
"create_error_result",
]


class LLMConfig(BaseModel):
Expand Down Expand Up @@ -115,6 +121,44 @@ class LLMErrorOutput(LLMOutput):
info: dict


def create_error_result(
guardrail_name: str,
analysis: LLMErrorOutput,
checked_text: str,
additional_info: dict[str, Any] | None = None,
) -> GuardrailResult:
"""Create a standardized GuardrailResult from an LLM error output.

Args:
guardrail_name: Name of the guardrail that failed.
analysis: The LLM error output.
checked_text: The text that was being checked.
additional_info: Optional additional fields to include in info dict.

Returns:
GuardrailResult with execution_failed=True.
"""
error_info = getattr(analysis, "info", {})
error_message = error_info.get("error_message", "LLM execution failed")

result_info: dict[str, Any] = {
"guardrail_name": guardrail_name,
"checked_text": checked_text,
"error": error_message,
**analysis.model_dump(),
}

if additional_info:
result_info.update(additional_info)

return GuardrailResult(
tripwire_triggered=False,
execution_failed=True,
original_exception=Exception(error_message),
info=result_info,
)

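# Illustrative usage sketch (not part of this diff): a check that has produced an
# LLMErrorOutput can hand it to create_error_result to get a uniform failure
# result. `exc` and `text` below are placeholder names, not identifiers from this PR.
#
#   error_output = LLMErrorOutput(
#       flagged=False,
#       confidence=0.0,
#       info={"error_message": str(exc)},
#   )
#   result = create_error_result(
#       guardrail_name="My Check",
#       analysis=error_output,
#       checked_text=text,
#   )
#   # result.execution_failed is True and result.tripwire_triggered is False,
#   # so callers can distinguish an execution failure from a clean pass.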

def _build_full_prompt(system_prompt: str) -> str:
"""Assemble a complete LLM prompt with instructions and response schema.

@@ -334,20 +378,10 @@ async def guardrail_func(

# Check if this is an error result
if isinstance(analysis, LLMErrorOutput):
# Extract error information from the LLMErrorOutput
error_info = analysis.info if hasattr(analysis, "info") else {}
error_message = error_info.get("error_message", "LLM execution failed")

return GuardrailResult(
tripwire_triggered=False, # Don't trigger tripwire on execution errors
execution_failed=True,
original_exception=Exception(error_message), # Create exception from error message
info={
"guardrail_name": name,
"checked_text": data,
"error": error_message,
**analysis.model_dump(),
},
return create_error_result(
guardrail_name=name,
analysis=analysis,
checked_text=data,
)

# Compare severity levels