Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,6 @@ site/
__pycache__/
*.pyc
.pytest_cache/

# internal
internal_examples/
Binary file modified docs/benchmarking/alignment_roc_curves.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
20 changes: 10 additions & 10 deletions docs/ref/checks/prompt_injection_detection.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ Returns a `GuardrailResult` with the following `info` dictionary:
"observation": "The assistant is calling get_weather function with location parameter",
"flagged": false,
"confidence": 0.1,
"evidence": null,
"threshold": 0.7,
"user_goal": "What's the weather in Tokyo?",
"action": [
Expand All @@ -81,6 +82,7 @@ Returns a `GuardrailResult` with the following `info` dictionary:
- **`observation`**: What the AI action is doing
- **`flagged`**: Whether the action is misaligned (boolean)
- **`confidence`**: Confidence score (0.0 to 1.0) that the action is misaligned
- **`evidence`**: Specific evidence from conversation history that supports the decision (null when aligned)
- **`threshold`**: The confidence threshold that was configured
- **`user_goal`**: The tracked user intent from conversation
- **`action`**: The list of function calls or tool outputs analyzed for alignment
Expand All @@ -92,10 +94,8 @@ Returns a `GuardrailResult` with the following `info` dictionary:

This benchmark evaluates model performance on agent conversation traces:

- **Synthetic dataset**: 1,000 samples with 500 positive cases (50% prevalence) simulating realistic agent traces
- **AgentDojo dataset**: 1,046 samples from AgentDojo's workspace, travel, banking, and Slack suite combined with the "important_instructions" attack (949 positive cases, 97 negative samples)
- **Test scenarios**: Multi-turn conversations with function calls and tool outputs across realistic workplace domains
- **Misalignment examples**: Unrelated function calls, harmful operations, and data leakage
- **[AgentDojo dataset](https://github.com/ethz-spylab/agentdojo)**: 1,046 samples generated from running AgentDojo's benchmark script on workspace, travel, banking, and Slack suite combined with the "important_instructions" attack (949 positive cases, 97 negative samples)
- **Internal synthetic dataset**: 537 positive cases simulating realistic, multi-turn agent conversation traces

**Example of misaligned conversation:**

Expand All @@ -113,12 +113,12 @@ This benchmark evaluates model performance on agent conversation traces:

| Model | ROC AUC | Prec@R=0.80 | Prec@R=0.90 | Prec@R=0.95 | Recall@FPR=0.01 |
|---------------|---------|-------------|-------------|-------------|-----------------|
| gpt-5 | 0.9604 | 0.998 | 0.995 | 0.963 | 0.431 |
| gpt-5-mini | 0.9796 | 0.999 | 0.999 | 0.966 | 0.000 |
| gpt-5-nano | 0.8651 | 0.963 | 0.963 | 0.951 | 0.056 |
| gpt-4.1 | 0.9846 | 0.998 | 0.998 | 0.998 | 0.000 |
| gpt-4.1-mini (default) | 0.9728 | 0.995 | 0.995 | 0.995 | 0.000 |
| gpt-4.1-nano | 0.8677 | 0.974 | 0.974 | 0.974 | 0.000 |
| gpt-5 | 0.9931 | 0.9992 | 0.9992 | 0.9992 | 0.5845 |
| gpt-5-mini | 0.9536 | 0.9951 | 0.9951 | 0.9951 | 0.0000 |
| gpt-5-nano | 0.9283 | 0.9913 | 0.9913 | 0.9717 | 0.0350 |
| gpt-4.1 | 0.9794 | 0.9973 | 0.9973 | 0.9973 | 0.0000 |
| gpt-4.1-mini (default) | 0.9865 | 0.9986 | 0.9986 | 0.9986 | 0.0000 |
| gpt-4.1-nano | 0.9142 | 0.9948 | 0.9948 | 0.9387 | 0.0000 |

**Notes:**

Expand Down
103 changes: 101 additions & 2 deletions src/__tests__/unit/prompt_injection_detection.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ const mockOpenAI = {
flagged: false,
confidence: 0.2,
observation: "The LLM action is aligned with the user's goal",
evidence: null,
}),
},
},
Expand Down Expand Up @@ -65,8 +66,9 @@ describe('Prompt Injection Detection Check', () => {
const result = await promptInjectionDetectionCheck(contextWithoutHistory, 'test data', config);

expect(result.tripwireTriggered).toBe(false);
expect(result.info.observation).toBe('No conversation history available');
expect(result.info.observation).toBe('No actionable tool messages to evaluate');
expect(result.info.guardrail_name).toBe('Prompt Injection Detection');
expect(result.info.evidence).toBeNull();
});

it('should return skip result when only user messages', async () => {
Expand Down Expand Up @@ -120,6 +122,103 @@ describe('Prompt Injection Detection Check', () => {
const result = await promptInjectionDetectionCheck(contextWithError, 'test data', config);

expect(result.tripwireTriggered).toBe(false);
expect(result.info.observation).toBe('No conversation history available');
expect(result.info.observation).toBe('No actionable tool messages to evaluate');
});

it('should not flag benign weather check', async () => {
const result = await promptInjectionDetectionCheck(mockContext, 'test data', config);

expect(result.tripwireTriggered).toBe(false);
expect(result.info.confidence).toBeLessThan(config.confidence_threshold);
expect(result.info.guardrail_name).toBe('Prompt Injection Detection');
expect(result.info.evidence).toBeNull();
});

it('should handle context with previous messages', async () => {
const contextWithHistory = {
...mockContext,
getConversationHistory: () => [
{ role: 'user', content: 'Can you help me?' },
{ role: 'assistant', content: 'Of course!' },
{ role: 'user', content: 'What is the weather in Tokyo?' },
{ role: 'assistant', content: 'I will check the weather for you.' },
{ type: 'function_call', name: 'get_weather', arguments: '{"location": "Tokyo"}' },
],
};

const result = await promptInjectionDetectionCheck(contextWithHistory, 'test data', config);

expect(result.tripwireTriggered).toBe(false);
expect(result.info.user_goal).toContain('What is the weather in Tokyo?');
expect(result.info.user_goal).toContain('Previous context');
});

it('should process tool outputs correctly', async () => {
const contextWithToolOutput = {
...mockContext,
getConversationHistory: () => [
{ role: 'user', content: 'Check the weather in Paris' },
{ type: 'function_call', name: 'get_weather', arguments: '{"location": "Paris"}' },
{ type: 'function_call_output', call_id: 'call_456', output: '{"temperature": 18}' },
],
};

const result = await promptInjectionDetectionCheck(contextWithToolOutput, 'test data', config);

expect(result.info.action).toBeDefined();
expect(result.info.action.length).toBeGreaterThan(0);
});

it('should propagate evidence when LLM flags injection', async () => {
const flaggedOpenAI = {
chat: {
completions: {
create: async () => ({
choices: [
{
message: {
content: JSON.stringify({
flagged: true,
confidence: 0.9,
observation: 'Detected malicious function call unrelated to user intent',
evidence: 'function call: delete_files with arguments {}',
}),
},
},
],
}),
},
},
};

const flaggedContext = {
...mockContext,
guardrailLlm: flaggedOpenAI as unknown as OpenAI,
};

const result = await promptInjectionDetectionCheck(flaggedContext, 'test data', config);

expect(result.tripwireTriggered).toBe(true);
expect(result.info.evidence).toBe('function call: delete_files with arguments {}');
});

it('should handle empty tool output', async () => {
const contextWithEmptyOutput = {
...mockContext,
getConversationHistory: () => [
{ role: 'user', content: 'Test query' },
{ type: 'function_call', name: 'test_function', arguments: '{}' },
{ type: 'function_call_output', call_id: 'call_789', output: '' },
],
};

const result = await promptInjectionDetectionCheck(
contextWithEmptyOutput,
'test data',
config
);

expect(result.tripwireTriggered).toBe(false);
expect(result.info.action).toBeDefined();
});
});
53 changes: 29 additions & 24 deletions src/checks/hallucination-detection.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import { z } from 'zod';
import { CheckFn, GuardrailResult, GuardrailLLMContext } from '../types';
import { defaultSpecRegistry } from '../registry';
import { createErrorResult, LLMErrorOutput } from './llm-base';

/**
* Configuration schema for hallucination detection.
Expand Down Expand Up @@ -196,22 +197,24 @@ export const hallucination_detection: CheckFn<
parsedJson = JSON.parse(jsonText);
} catch (error) {
console.warn('Failed to parse LLM response as JSON:', jsonText);
// Return a safe default if JSON parsing fails
return {
tripwireTriggered: false,
info: {
guardrail_name: 'Hallucination Detection',
flagged: false,
confidence: 0.0,
// Return a safe default if JSON parsing fails using shared error helper
const errorOutput: LLMErrorOutput = {
flagged: false,
confidence: 0.0,
info: { error_message: `JSON parsing failed: ${error instanceof Error ? error.message : String(error)}` },
};
return createErrorResult(
'Hallucination Detection',
errorOutput,
candidate,
{
threshold: config.confidence_threshold,
reasoning: 'LLM response could not be parsed as JSON',
hallucination_type: null,
hallucinated_statements: null,
verified_statements: null,
threshold: config.confidence_threshold,
error: `JSON parsing failed: ${error instanceof Error ? error.message : String(error)}`,
checked_text: candidate,
},
};
}
);
}

const analysis = HallucinationDetectionOutput.parse(parsedJson);
Expand All @@ -234,23 +237,25 @@ export const hallucination_detection: CheckFn<
},
};
} catch (error) {
// Log unexpected errors and return safe default
// Log unexpected errors and return safe default using shared error helper
console.error('Unexpected error in hallucination_detection:', error);
return {
tripwireTriggered: false,
info: {
guardrail_name: 'Hallucination Detection',
flagged: false,
confidence: 0.0,
const errorOutput: LLMErrorOutput = {
flagged: false,
confidence: 0.0,
info: { error_message: error instanceof Error ? error.message : String(error) },
};
return createErrorResult(
'Hallucination Detection',
errorOutput,
candidate,
{
threshold: config.confidence_threshold,
reasoning: `Analysis failed: ${error instanceof Error ? error.message : String(error)}`,
hallucination_type: null,
hallucinated_statements: null,
verified_statements: null,
threshold: config.confidence_threshold,
error: error instanceof Error ? error.message : String(error),
checked_text: candidate, // Hallucination Detection doesn't modify text, pass through unchanged
},
};
}
);
}
};

Expand Down
31 changes: 31 additions & 0 deletions src/checks/llm-base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,37 @@ export const LLMErrorOutput = LLMOutput.extend({

export type LLMErrorOutput = z.infer<typeof LLMErrorOutput>;

/**
* Create a standardized error result for LLM-based guardrails.
*
* This helper provides a consistent way to handle errors across all LLM-based checks,
* ensuring uniform error reporting and preventing tripwire triggers on execution failures.
*
* @param guardrailName - Name of the guardrail that encountered the error.
* @param analysis - LLMErrorOutput containing error information.
* @param checkedText - The original text that was being checked.
* @param additionalInfo - Optional additional information to include in the result.
* @returns GuardrailResult with tripwireTriggered=false and error information.
*/
export function createErrorResult(
guardrailName: string,
analysis: LLMErrorOutput,
checkedText: string,
additionalInfo: Record<string, unknown> = {}
): GuardrailResult {
return {
tripwireTriggered: false,
info: {
guardrail_name: guardrailName,
flagged: analysis.flagged,
confidence: analysis.confidence,
checked_text: checkedText,
...analysis.info,
...additionalInfo,
},
};
}

/**
* Assemble a complete LLM prompt with instructions and response schema.
*
Expand Down
Loading