From c5b21fc11789ed97155ec5abb1857cf551aaa314 Mon Sep 17 00:00:00 2001
From: Steven C
Date: Thu, 30 Oct 2025 10:01:25 -0400
Subject: [PATCH 1/2] Support prompt usage

Let GuardrailAgent accept `instructions` as a string, a callable, or
None (the parameter is now optional), and merge caller-supplied
`input_guardrails` / `output_guardrails` taken from agent_kwargs into
the lists built from the pipeline config (config guardrails first,
then user guardrails). An explicit None for either keyword is treated
as "no user guardrails" instead of failing inside list.extend().

Also deletes docs/benchmarking/nsfw.md.
---
 docs/benchmarking/nsfw.md | 31 -------------------------------
 src/guardrails/agents.py  | 20 ++++++++++++++++----
 2 files changed, 16 insertions(+), 35 deletions(-)
 delete mode 100644 docs/benchmarking/nsfw.md

diff --git a/docs/benchmarking/nsfw.md b/docs/benchmarking/nsfw.md
deleted file mode 100644
index df331e3..0000000
--- a/docs/benchmarking/nsfw.md
+++ /dev/null
@@ -1,31 +0,0 @@
-# NSFW Text Check Benchmark Results
-
-## Dataset Description
-
-This benchmark evaluates model performance on a balanced set of social media posts:
-
-- Open Source [Toxicity dataset](https://github.com/surge-ai/toxicity/blob/main/toxicity_en.csv)
-- 500 NSFW (true) and 500 non-NSFW (false) samples
-- All samples are sourced from real social media platforms
-
-**Total n = 1,000; positive class prevalence = 500 (50.0%)**
-
-## Results
-
-### ROC Curve
-
-![ROC Curve](./NSFW_roc_curve.png)
-
-### Metrics Table
-
-| Model        | ROC AUC | Prec@R=0.80 | Prec@R=0.90 | Prec@R=0.95 | Recall@FPR=0.01 |
-|--------------|---------|-------------|-------------|-------------|-----------------|
-| gpt-4.1      | 0.989   | 0.976       | 0.962       | 0.962       | 0.717           |
-| gpt-4.1-mini | 0.984   | 0.977       | 0.977       | 0.943       | 0.653           |
-| gpt-4.1-nano | 0.952   | 0.972       | 0.823       | 0.823       | 0.429           |
-| gpt-4o-mini  | 0.965   | 0.977       | 0.955       | 0.945       | 0.842           |
-
-#### Notes
-- ROC AUC: Area under the ROC curve (higher is better)
-- Prec@R: Precision at the specified recall threshold
-- Recall@FPR=0.01: Recall when the false positive rate is 1%
diff --git a/src/guardrails/agents.py b/src/guardrails/agents.py
index 0645081..0dbe077 100644
--- a/src/guardrails/agents.py
+++ b/src/guardrails/agents.py
@@ -492,7 +492,7 @@ def __new__(
         cls,
         config: str | Path | dict[str, Any],
         name: str,
-        instructions: str,
+        instructions: str | Callable[[Any, Any], Any] | None = None,
         raise_guardrail_errors: bool = False,
         block_on_tool_violations: bool = False,
         **agent_kwargs: Any,
@@ -511,7 +511,9 @@ def __new__(
         Args:
             config: Pipeline configuration (file path, dict, or JSON string)
             name: Agent name
-            instructions: Agent instructions
+            instructions: Agent instructions. Can be a string, a callable that dynamically
+                generates instructions, or None. If a callable, it will receive the context
+                and agent instance and must return a string.
             raise_guardrail_errors: If True, raise exceptions when guardrails fail to execute.
                 If False (default), treat guardrail errors as safe and continue execution.
             block_on_tool_violations: If True, tool guardrail violations raise exceptions (halt execution).
@@ -553,7 +555,11 @@ def __new__(
         input_tool, input_agent = _separate_tool_level_from_agent_level(stage_guardrails.get("input", []))
         output_tool, output_agent = _separate_tool_level_from_agent_level(stage_guardrails.get("output", []))
 
-        # Create agent-level INPUT guardrails
+        # Extract any user-provided guardrails from agent_kwargs (explicit None means "none")
+        user_input_guardrails = agent_kwargs.pop("input_guardrails", None) or []
+        user_output_guardrails = agent_kwargs.pop("output_guardrails", None) or []
+
+        # Create agent-level INPUT guardrails from config
         input_guardrails = []
 
         # Add agent-level guardrails from pre_flight and input stages
@@ -573,7 +579,10 @@ def __new__(
                 )
             )
 
-        # Create agent-level OUTPUT guardrails
+        # Merge with user-provided input guardrails (config ones run first, then user ones)
+        input_guardrails.extend(user_input_guardrails)
+
+        # Create agent-level OUTPUT guardrails from config
         output_guardrails = []
         if output_agent:
             output_guardrails = _create_agents_guardrails_from_config(
@@ -583,6 +592,9 @@ def __new__(
                 raise_guardrail_errors=raise_guardrail_errors,
             )
 
+        # Merge with user-provided output guardrails (config ones run first, then user ones)
+        output_guardrails.extend(user_output_guardrails)
+
         # Apply tool-level guardrails
         tools = agent_kwargs.get("tools", [])
 

From b6997794772dc73f73740c8eaf6b0c03a1fb75eb Mon Sep 17 00:00:00 2001
From: Steven C
Date: Thu, 30 Oct 2025 10:11:11 -0400
Subject: [PATCH 2/2] adding tests

Cover GuardrailAgent created without instructions, with callable
instructions, with user-supplied input/output guardrails merged after
the config-derived ones (count and position of the user guardrail are
both asserted), and with empty user guardrail lists.
---
 tests/unit/test_agents.py | 137 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)

diff --git a/tests/unit/test_agents.py b/tests/unit/test_agents.py
index 0aa1c4d..d9202fe 100644
--- a/tests/unit/test_agents.py
+++ b/tests/unit/test_agents.py
@@ -597,3 +597,140 @@ def test_guardrail_agent_without_tools(monkeypatch: pytest.MonkeyPatch) -> None:
     agent_instance = agents.GuardrailAgent(config={}, name="NoTools", instructions="None")
 
     assert getattr(agent_instance, "input_guardrails", []) == []  # noqa: S101
+
+
+def test_guardrail_agent_without_instructions(monkeypatch: pytest.MonkeyPatch) -> None:
+    """GuardrailAgent should work without instructions parameter."""
+    pipeline = SimpleNamespace(pre_flight=None, input=None, output=None)
+
+    monkeypatch.setattr(runtime_module, "load_pipeline_bundles", lambda config: pipeline, raising=False)
+    monkeypatch.setattr(runtime_module, "instantiate_guardrails", lambda *args, **kwargs: [], raising=False)
+
+    # Should not raise TypeError about missing instructions
+    agent_instance = agents.GuardrailAgent(config={}, name="NoInstructions")
+
+    assert isinstance(agent_instance, agents_module.Agent)  # noqa: S101
+    assert agent_instance.instructions is None  # noqa: S101
+
+
+def test_guardrail_agent_with_callable_instructions(monkeypatch: pytest.MonkeyPatch) -> None:
+    """GuardrailAgent should accept callable instructions."""
+    pipeline = SimpleNamespace(pre_flight=None, input=None, output=None)
+
+    monkeypatch.setattr(runtime_module, "load_pipeline_bundles", lambda config: pipeline, raising=False)
+    monkeypatch.setattr(runtime_module, "instantiate_guardrails", lambda *args, **kwargs: [], raising=False)
+
+    def dynamic_instructions(ctx: Any, agent: Any) -> str:
+        return f"You are {agent.name}"
+
+    agent_instance = agents.GuardrailAgent(
+        config={},
+        name="DynamicAgent",
+        instructions=dynamic_instructions,
+    )
+
+    assert isinstance(agent_instance, agents_module.Agent)  # noqa: S101
+    assert callable(agent_instance.instructions)  # noqa: S101
+    assert agent_instance.instructions == dynamic_instructions  # noqa: S101
+
+
+def test_guardrail_agent_merges_user_input_guardrails(monkeypatch: pytest.MonkeyPatch) -> None:
+    """User input guardrails should be merged with config guardrails."""
+    agent_guard = _make_guardrail("Config Input Guard")
+
+    class FakePipeline:
+        def __init__(self) -> None:
+            self.pre_flight = None
+            self.input = SimpleNamespace()
+            self.output = None
+
+    pipeline = FakePipeline()
+
+    def fake_load_pipeline_bundles(config: Any) -> FakePipeline:
+        return pipeline
+
+    def fake_instantiate_guardrails(stage: Any, registry: Any | None = None) -> list[Any]:
+        if stage is pipeline.input:
+            return [agent_guard]
+        return []
+
+    from guardrails import runtime as runtime_module
+
+    monkeypatch.setattr(runtime_module, "load_pipeline_bundles", fake_load_pipeline_bundles)
+    monkeypatch.setattr(runtime_module, "instantiate_guardrails", fake_instantiate_guardrails)
+
+    # Create a custom user guardrail
+    custom_guardrail = lambda ctx, agent, input: None  # noqa: E731
+
+    agent_instance = agents.GuardrailAgent(
+        config={},
+        name="MergedAgent",
+        instructions="Test",
+        input_guardrails=[custom_guardrail],
+    )
+
+    # Should have both config and user guardrails merged (config first, then user)
+    assert isinstance(agent_instance, agents_module.Agent)  # noqa: S101
+    assert len(agent_instance.input_guardrails) == 2  # noqa: S101
+    assert agent_instance.input_guardrails[-1] is custom_guardrail  # noqa: S101
+
+
+def test_guardrail_agent_merges_user_output_guardrails(monkeypatch: pytest.MonkeyPatch) -> None:
+    """User output guardrails should be merged with config guardrails."""
+    agent_guard = _make_guardrail("Config Output Guard")
+
+    class FakePipeline:
+        def __init__(self) -> None:
+            self.pre_flight = None
+            self.input = None
+            self.output = SimpleNamespace()
+
+    pipeline = FakePipeline()
+
+    def fake_load_pipeline_bundles(config: Any) -> FakePipeline:
+        return pipeline
+
+    def fake_instantiate_guardrails(stage: Any, registry: Any | None = None) -> list[Any]:
+        if stage is pipeline.output:
+            return [agent_guard]
+        return []
+
+    from guardrails import runtime as runtime_module
+
+    monkeypatch.setattr(runtime_module, "load_pipeline_bundles", fake_load_pipeline_bundles)
+    monkeypatch.setattr(runtime_module, "instantiate_guardrails", fake_instantiate_guardrails)
+
+    # Create a custom user guardrail
+    custom_guardrail = lambda ctx, agent, output: None  # noqa: E731
+
+    agent_instance = agents.GuardrailAgent(
+        config={},
+        name="MergedAgent",
+        instructions="Test",
+        output_guardrails=[custom_guardrail],
+    )
+
+    # Should have both config and user guardrails merged (config first, then user)
+    assert isinstance(agent_instance, agents_module.Agent)  # noqa: S101
+    assert len(agent_instance.output_guardrails) == 2  # noqa: S101
+    assert agent_instance.output_guardrails[-1] is custom_guardrail  # noqa: S101
+
+
+def test_guardrail_agent_with_empty_user_guardrails(monkeypatch: pytest.MonkeyPatch) -> None:
+    """GuardrailAgent should handle empty user guardrail lists gracefully."""
+    pipeline = SimpleNamespace(pre_flight=None, input=None, output=None)
+
+    monkeypatch.setattr(runtime_module, "load_pipeline_bundles", lambda config: pipeline, raising=False)
+    monkeypatch.setattr(runtime_module, "instantiate_guardrails", lambda *args, **kwargs: [], raising=False)
+
+    agent_instance = agents.GuardrailAgent(
+        config={},
+        name="EmptyListAgent",
+        instructions="Test",
+        input_guardrails=[],
+        output_guardrails=[],
+    )
+
+    assert isinstance(agent_instance, agents_module.Agent)  # noqa: S101
+    assert agent_instance.input_guardrails == []  # noqa: S101
+    assert agent_instance.output_guardrails == []  # noqa: S101