openai · gabor-openai · Oct 30, 2025 · Oct 30, 2025
diff --git a/.gitignore b/.gitignore
@@ -101,3 +101,6 @@ site/
 __pycache__/
 *.pyc
 .pytest_cache/
+
+# internal
+internal_examples/
diff --git a/docs/benchmarking/alignment_roc_curves.png b/docs/benchmarking/alignment_roc_curves.png
diff --git a/docs/ref/checks/prompt_injection_detection.md b/docs/ref/checks/prompt_injection_detection.md
@@ -65,6 +65,7 @@ Returns a `GuardrailResult` with the following `info` dictionary:
     "observation": "The assistant is calling get_weather function with location parameter",
     "flagged": false,
     "confidence": 0.1,
+    "evidence": null,
     "threshold": 0.7,
     "user_goal": "What's the weather in Tokyo?",
     "action": [
@@ -81,6 +82,7 @@ Returns a `GuardrailResult` with the following `info` dictionary:
 - **`observation`**: What the AI action is doing
 - **`flagged`**: Whether the action is misaligned (boolean)
 - **`confidence`**: Confidence score (0.0 to 1.0) that the action is misaligned
+- **`evidence`**: Specific evidence from conversation history that supports the decision (null when aligned)
 - **`threshold`**: The confidence threshold that was configured
 - **`user_goal`**: The tracked user intent from conversation
 - **`action`**: The list of function calls or tool outputs analyzed for alignment
@@ -92,10 +94,8 @@ Returns a `GuardrailResult` with the following `info` dictionary:
 
 This benchmark evaluates model performance on agent conversation traces:
 
-- **Synthetic dataset**: 1,000 samples with 500 positive cases (50% prevalence) simulating realistic agent traces
-- **AgentDojo dataset**: 1,046 samples from AgentDojo's workspace, travel, banking, and Slack suite combined with the "important_instructions" attack (949 positive cases, 97 negative samples)
-- **Test scenarios**: Multi-turn conversations with function calls and tool outputs across realistic workplace domains
-- **Misalignment examples**: Unrelated function calls, harmful operations, and data leakage
+- **[AgentDojo dataset](https://github.com/ethz-spylab/agentdojo)**: 1,046 samples generated from running AgentDojo's benchmark script on workspace, travel, banking, and Slack suite combined with the "important_instructions" attack (949 positive cases, 97 negative samples)
+- **Internal synthetic dataset**: 537 positive cases simulating realistic, multi-turn agent conversation traces
 
 **Example of misaligned conversation:**
 
@@ -113,12 +113,12 @@ This benchmark evaluates model performance on agent conversation traces:
 
 | Model         | ROC AUC | Prec@R=0.80 | Prec@R=0.90 | Prec@R=0.95 | Recall@FPR=0.01 |
 |---------------|---------|-------------|-------------|-------------|-----------------|
-| gpt-5         | 0.9604  | 0.998       | 0.995       | 0.963       | 0.431           |
-| gpt-5-mini    | 0.9796  | 0.999       | 0.999       | 0.966       | 0.000           |
-| gpt-5-nano    | 0.8651  | 0.963       | 0.963       | 0.951       | 0.056           |
-| gpt-4.1       | 0.9846  | 0.998       | 0.998       | 0.998       | 0.000           |
-| gpt-4.1-mini (default) | 0.9728  | 0.995       | 0.995       | 0.995       | 0.000           |
-| gpt-4.1-nano  | 0.8677  | 0.974       | 0.974       | 0.974       | 0.000           |
+| gpt-5         | 0.9931  | 0.9992      | 0.9992      | 0.9992      | 0.5845          |
+| gpt-5-mini    | 0.9536  | 0.9951      | 0.9951      | 0.9951      | 0.0000          |
+| gpt-5-nano    | 0.9283  | 0.9913      | 0.9913      | 0.9717      | 0.0350          |
+| gpt-4.1       | 0.9794  | 0.9973      | 0.9973      | 0.9973      | 0.0000          |
+| gpt-4.1-mini (default) | 0.9865  | 0.9986      | 0.9986      | 0.9986      | 0.0000          |
+| gpt-4.1-nano  | 0.9142  | 0.9948      | 0.9948      | 0.9387      | 0.0000          |
 
 **Notes:**
 

diff --git a/src/__tests__/unit/prompt_injection_detection.test.ts b/src/__tests__/unit/prompt_injection_detection.test.ts
@@ -22,6 +22,7 @@ const mockOpenAI = {
                 flagged: false,
                 confidence: 0.2,
                 observation: "The LLM action is aligned with the user's goal",
+                evidence: null,
               }),
             },
           },
@@ -65,8 +66,9 @@ describe('Prompt Injection Detection Check', () => {
     const result = await promptInjectionDetectionCheck(contextWithoutHistory, 'test data', config);
 
     expect(result.tripwireTriggered).toBe(false);
-    expect(result.info.observation).toBe('No conversation history available');
+    expect(result.info.observation).toBe('No actionable tool messages to evaluate');
     expect(result.info.guardrail_name).toBe('Prompt Injection Detection');
+    expect(result.info.evidence).toBeNull();
   });
 
   it('should return skip result when only user messages', async () => {
@@ -120,6 +122,103 @@ describe('Prompt Injection Detection Check', () => {
     const result = await promptInjectionDetectionCheck(contextWithError, 'test data', config);
 
     expect(result.tripwireTriggered).toBe(false);
-    expect(result.info.observation).toBe('No conversation history available');
+    expect(result.info.observation).toBe('No actionable tool messages to evaluate');
+  });
+
+  it('should not flag benign weather check', async () => {
+    const result = await promptInjectionDetectionCheck(mockContext, 'test data', config);
+
+    expect(result.tripwireTriggered).toBe(false);
+    expect(result.info.confidence).toBeLessThan(config.confidence_threshold);
+    expect(result.info.guardrail_name).toBe('Prompt Injection Detection');
+    expect(result.info.evidence).toBeNull();
+  });
+
+  it('should handle context with previous messages', async () => {
+    const contextWithHistory = {
+      ...mockContext,
+      getConversationHistory: () => [
+        { role: 'user', content: 'Can you help me?' },
+        { role: 'assistant', content: 'Of course!' },
+        { role: 'user', content: 'What is the weather in Tokyo?' },
+        { role: 'assistant', content: 'I will check the weather for you.' },
+        { type: 'function_call', name: 'get_weather', arguments: '{"location": "Tokyo"}' },
+      ],
+    };
+
+    const result = await promptInjectionDetectionCheck(contextWithHistory, 'test data', config);
+
+    expect(result.tripwireTriggered).toBe(false);
+    expect(result.info.user_goal).toContain('What is the weather in Tokyo?');
+    expect(result.info.user_goal).toContain('Previous context');
+  });
+
+  it('should process tool outputs correctly', async () => {
+    const contextWithToolOutput = {
+      ...mockContext,
+      getConversationHistory: () => [
+        { role: 'user', content: 'Check the weather in Paris' },
+        { type: 'function_call', name: 'get_weather', arguments: '{"location": "Paris"}' },
+        { type: 'function_call_output', call_id: 'call_456', output: '{"temperature": 18}' },
+      ],
+    };
+
+    const result = await promptInjectionDetectionCheck(contextWithToolOutput, 'test data', config);
+
+    expect(result.info.action).toBeDefined();
+    expect(result.info.action.length).toBeGreaterThan(0);
+  });
+
+  it('should propagate evidence when LLM flags injection', async () => {
+    const flaggedOpenAI = {
+      chat: {
+        completions: {
+          create: async () => ({
+            choices: [
+              {
+                message: {
+                  content: JSON.stringify({
+                    flagged: true,
+                    confidence: 0.9,
+                    observation: 'Detected malicious function call unrelated to user intent',
+                    evidence: 'function call: delete_files with arguments {}',
+                  }),
+                },
+              },
+            ],
+          }),
+        },
+      },
+    };
+
+    const flaggedContext = {
+      ...mockContext,
+      guardrailLlm: flaggedOpenAI as unknown as OpenAI,
+    };
+
+    const result = await promptInjectionDetectionCheck(flaggedContext, 'test data', config);
+
+    expect(result.tripwireTriggered).toBe(true);
+    expect(result.info.evidence).toBe('function call: delete_files with arguments {}');
+  });
+
+  it('should handle empty tool output', async () => {
+    const contextWithEmptyOutput = {
+      ...mockContext,
+      getConversationHistory: () => [
+        { role: 'user', content: 'Test query' },
+        { type: 'function_call', name: 'test_function', arguments: '{}' },
+        { type: 'function_call_output', call_id: 'call_789', output: '' },
+      ],
+    };
+
+    const result = await promptInjectionDetectionCheck(
+      contextWithEmptyOutput,
+      'test data',
+      config
+    );
+
+    expect(result.tripwireTriggered).toBe(false);
+    expect(result.info.action).toBeDefined();
   });
 });
diff --git a/src/checks/hallucination-detection.ts b/src/checks/hallucination-detection.ts
@@ -20,6 +20,7 @@
 import { z } from 'zod';
 import { CheckFn, GuardrailResult, GuardrailLLMContext } from '../types';
 import { defaultSpecRegistry } from '../registry';
+import { createErrorResult, LLMErrorOutput } from './llm-base';
 
 /**
  * Configuration schema for hallucination detection.
@@ -196,22 +197,24 @@ export const hallucination_detection: CheckFn<
       parsedJson = JSON.parse(jsonText);
     } catch (error) {
       console.warn('Failed to parse LLM response as JSON:', jsonText);
-      // Return a safe default if JSON parsing fails
-      return {
-        tripwireTriggered: false,
-        info: {
-          guardrail_name: 'Hallucination Detection',
-          flagged: false,
-          confidence: 0.0,
+      // Return a safe default if JSON parsing fails using shared error helper
+      const errorOutput: LLMErrorOutput = {
+        flagged: false,
+        confidence: 0.0,
+        info: { error_message: `JSON parsing failed: ${error instanceof Error ? error.message : String(error)}` },
+      };
+      return createErrorResult(
+        'Hallucination Detection',
+        errorOutput,
+        candidate,
+        {
+          threshold: config.confidence_threshold,
           reasoning: 'LLM response could not be parsed as JSON',
           hallucination_type: null,
           hallucinated_statements: null,
           verified_statements: null,
-          threshold: config.confidence_threshold,
-          error: `JSON parsing failed: ${error instanceof Error ? error.message : String(error)}`,
-          checked_text: candidate,
-        },
-      };
+        }
+      );
     }
 
     const analysis = HallucinationDetectionOutput.parse(parsedJson);
@@ -234,23 +237,25 @@ export const hallucination_detection: CheckFn<
       },
     };
   } catch (error) {
-    // Log unexpected errors and return safe default
+    // Log unexpected errors and return safe default using shared error helper
     console.error('Unexpected error in hallucination_detection:', error);
-    return {
-      tripwireTriggered: false,
-      info: {
-        guardrail_name: 'Hallucination Detection',
-        flagged: false,
-        confidence: 0.0,
+    const errorOutput: LLMErrorOutput = {
+      flagged: false,
+      confidence: 0.0,
+      info: { error_message: error instanceof Error ? error.message : String(error) },
+    };
+    return createErrorResult(
+      'Hallucination Detection',
+      errorOutput,
+      candidate,
+      {
+        threshold: config.confidence_threshold,
         reasoning: `Analysis failed: ${error instanceof Error ? error.message : String(error)}`,
         hallucination_type: null,
         hallucinated_statements: null,
         verified_statements: null,
-        threshold: config.confidence_threshold,
-        error: error instanceof Error ? error.message : String(error),
-        checked_text: candidate, // Hallucination Detection doesn't modify text, pass through unchanged
-      },
-    };
+      }
+    );
   }
 };
 

diff --git a/src/checks/llm-base.ts b/src/checks/llm-base.ts
@@ -62,6 +62,37 @@ export const LLMErrorOutput = LLMOutput.extend({
 
 export type LLMErrorOutput = z.infer<typeof LLMErrorOutput>;
 
+/**
+ * Create a standardized error result for LLM-based guardrails.
+ *
+ * This helper provides a consistent way to handle errors across all LLM-based checks,
+ * ensuring uniform error reporting and preventing tripwire triggers on execution failures.
+ *
+ * @param guardrailName - Name of the guardrail that encountered the error.
+ * @param analysis - LLMErrorOutput containing error information.
+ * @param checkedText - The original text that was being checked.
+ * @param additionalInfo - Optional additional information to include in the result.
+ * @returns GuardrailResult with tripwireTriggered=false and error information.
+ */
+export function createErrorResult(
+  guardrailName: string,
+  analysis: LLMErrorOutput,
+  checkedText: string,
+  additionalInfo: Record<string, unknown> = {}
+): GuardrailResult {
+  return {
+    tripwireTriggered: false,
+    info: {
+      guardrail_name: guardrailName,
+      flagged: analysis.flagged,
+      confidence: analysis.confidence,
+      checked_text: checkedText,
+      ...analysis.info,
+      ...additionalInfo,
+    },
+  };
+}
+
 /**
  * Assemble a complete LLM prompt with instructions and response schema.
  *