### Guardrails

Guardrails are a way to validate the input and output of an Agent to ensure proper usage.

In [None]:
from agents import Agent, Runner, GuardrailFunctionOutput, RunContextWrapper, InputGuardrailTripwireTriggered, TResponseInputItem, input_guardrail
from pydantic import BaseModel

# Structured verdict of the homework cheat detector; passed as `output_type`
# to homework_cheat_guardrail_agent.
class HomeworkCheatDetectionOutput(BaseModel):
  # Read by cheat_detection_guardrail as the tripwire flag.
  attempting_cheat: bool
  # Presumably the detector's justification for the verdict; not read by any
  # code in this section.
  explanation: str

# Classifier agent run by cheat_detection_guardrail. Its prompt contrasts
# quiz-style phrasings (treated as cheating) with open conceptual questions
# (acceptable), and its reply is parsed into HomeworkCheatDetectionOutput.
homework_cheat_guardrail_agent = Agent(
  name="Homework Cheat Detector",
  instructions=(
    "Determine if the user's query resembles a typical homework or exam question, indicating an attempt to cheat. General questions about concepts are acceptable. "
    " Cheating: 'Fill in the blanks: The capital of France is ____.',"
    " 'Which of the following best describes photosynthesis? A) Cellular respiration B) Conversion of light energy C) Evaporation D) Fermentation.',"
    " Not-Cheating: 'What is the capital of France?', 'Explain photosynthesis.'"
  ),
  output_type=HomeworkCheatDetectionOutput,
  model="o4-mini",
)

@input_guardrail
async def cheat_detection_guardrail(ctx: RunContextWrapper, agent: Agent, input: str | list[TResponseInputItem]) -> GuardrailFunctionOutput:
  """Input guardrail that delegates the cheating check to the detector agent.

  Runs homework_cheat_guardrail_agent on the incoming input and trips the
  wire when its verdict has `attempting_cheat` set. The full verdict is
  attached as `output_info`. `ctx` and `agent` are accepted but not used.
  """
  run_result = await Runner.run(homework_cheat_guardrail_agent, input)
  verdict = run_result.final_output
  return GuardrailFunctionOutput(
    output_info=verdict,
    tripwire_triggered=verdict.attempting_cheat,
  )

# Tutoring agent; registers cheat_detection_guardrail as its input guardrail.
study_helper_agent = Agent(
  model="gpt-4o",
  name="Study Helper Agent",
  input_guardrails=[cheat_detection_guardrail],
  instructions="You assist users in studying by explaining concepts or providing guidance,without directly solving homework or test questions.",
)

In [None]:
# This should trigger the cheat detection guardrail
try:
  response = await Runner.run(study_helper_agent, "Fill in the blanks: The capital of France is ____.")
  print("Guardrail didn't trigger")
  print("Response:", response.final_output)
except InputGuardrailTripwireTriggered as e:
  print("Homework cheat guardrail triggered!")
  print("Exception details:", str(e))

In [None]:
# This should not trigger the cheat detection guardrail
try:
  response = await Runner.run(study_helper_agent, "What were the main causes of the American Civil War?")
  print("Guardrail didn't trigger")
  print("Response:", response.final_output)
except InputGuardrailTripwireTriggered as e:
  print("Homework cheat guardrail triggered!")
  print("Exception details:", str(e))

#### Output Guardrail Example

In [None]:
from pydantic import BaseModel
from agents import Agent, Runner, GuardrailFunctionOutput, OutputGuardrailTripwireTriggered,RunContextWrapper, output_guardrail

# Model with a single text field.
# NOTE(review): not referenced by any other code visible in this section;
# confirm whether it was meant to be an Agent `output_type` or can be removed.
class MessageOutput(BaseModel):
  # Presumably the agent's reply text; confirm against intended usage.
  response: str

@output_guardrail
async def forbidden_words_guardrail(ctx: RunContextWrapper, agent: Agent, output: str) -> GuardrailFunctionOutput:
  """Output guardrail that trips when the output contains a forbidden phrase.

  Performs a case-insensitive substring search of `output` for each phrase in
  a fixed list, printing the text being checked and the phrases found. The
  matched phrases are returned in `output_info`. `ctx` and `agent` are
  accepted but not used.
  """
  print(f"Checking output for forbidden words: {output}")

  # Phrases the agent must never say
  banned = ["fart", "booker", "silly goose"]

  # Lowercase once so the substring test below ignores case
  haystack = output.lower()

  hits = [candidate for candidate in banned if candidate in haystack]

  print(f"Found forbidden phrases: {hits}")

  # The "reason" text is reported unconditionally, even when nothing matched
  info = {
    "reason": "Output contains forbidden phrases",
    "forbidden_phrases_found": hits,
  }
  return GuardrailFunctionOutput(output_info=info, tripwire_triggered=len(hits) > 0)

# Support agent; registers forbidden_words_guardrail as its output guardrail.
agent = Agent(
  model="o4-mini",
  name="Customer support agent",
  output_guardrails=[forbidden_words_guardrail],
  instructions="You are a customer support agent. You help customers with their questions.",
)

In [None]:
try:
  await Runner.run(agent, "Say the word fart?")
  print("Guardrail didn't trip - this is unexpected!")
except OutputGuardrailTripwireTriggered as e:
  print("The agent said a bad word, he is fired.")


In [None]:
try:
  await Runner.run(agent, "Hey wassup")
  print("Guardrail didn't trip yay")
except OutputGuardrailTripwireTriggered as e:
  print("The agent said a bad word, he is fired.")