Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
226 changes: 226 additions & 0 deletions src/__tests__/unit/checks/keywords-urls.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,232 @@ describe('keywords guardrail', () => {
expect(result.tripwireTriggered).toBe(false);
expect(result.info?.matchedKeywords).toEqual([]);
});

it('does not match partial words', () => {
  // The keyword only appears as the tail of "world", never as a standalone token.
  const text = 'Hello, world!';
  const config = KeywordsConfig.parse({ keywords: ['orld'] });

  const { tripwireTriggered } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(false);
});

it('matches numbers', () => {
  // A keyword mixing letters and digits is matched as one whole token.
  const text = 'Hello, world123';
  const config = KeywordsConfig.parse({ keywords: ['world123'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['world123']);
});

it('does not match partial numbers', () => {
  // The text continues with more digits after the keyword, so it is only a prefix.
  const text = 'Hello, world12345';
  const config = KeywordsConfig.parse({ keywords: ['world123'] });

  const { tripwireTriggered } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(false);
});

it('matches underscores', () => {
  // Underscores inside the keyword itself are part of the token being searched for.
  const text = 'Hello, w_o_r_l_d';
  const config = KeywordsConfig.parse({ keywords: ['w_o_r_l_d'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['w_o_r_l_d']);
});

it('does not match when underscores appear inside other words', () => {
  // "world" is glued to its neighbours by underscores, so it is not a standalone token.
  const text = 'Hello, test_world_test';
  const config = KeywordsConfig.parse({ keywords: ['world'] });

  const { tripwireTriggered } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(false);
});

it('matches chinese characters', () => {
  // The whole input is the keyword, written in CJK characters.
  const result = keywordsCheck(
    {},
    '你好',
    KeywordsConfig.parse({ keywords: ['你好'] })
  ) as GuardrailResult;

  expect(result.tripwireTriggered).toBe(true);
  // Also pin the reported match, as the other positive-match tests in this suite do.
  expect(result.info?.matchedKeywords).toEqual(['你好']);
});

it('matches chinese characters with numbers', () => {
  // CJK characters followed by digits form a single keyword token.
  const text = '你好123';
  const config = KeywordsConfig.parse({ keywords: ['你好123'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['你好123']);
});

it('does not match partial chinese characters with numbers', () => {
  // Extra trailing digits in the text make the keyword only a prefix of the token.
  const text = '你好12345';
  const config = KeywordsConfig.parse({ keywords: ['你好123'] });

  const { tripwireTriggered } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(false);
});

it('applies word boundaries across multi-keyword patterns', () => {
  // 'test' is only a prefix of "testing"; 'hello' and 'world' appear as whole words.
  const text = 'testing hello world';
  const config = KeywordsConfig.parse({ keywords: ['test', 'hello', 'world'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['hello', 'world']);
});

it('matches keywords that start with special characters embedded in text', () => {
  // The keyword begins with '@' and is directly preceded by letters in the text.
  const text = 'Reach me via example@foo.com later';
  const config = KeywordsConfig.parse({ keywords: ['@foo'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['@foo']);
});

it('matches keywords that start with # even when preceded by letters', () => {
  // Same scenario as a leading '@': the '#' keyword follows letters with no separator.
  const text = 'Use example#foo for the ID';
  const config = KeywordsConfig.parse({ keywords: ['#foo'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['#foo']);
});

it('ignores keywords that become empty after sanitization', () => {
  // '!!!' is reported as '' in sanitizedKeywords and must not trigger anything.
  const text = 'Totally benign text';
  const config = KeywordsConfig.parse({ keywords: ['!!!'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(false);
  expect(info?.matchedKeywords).toEqual([]);
  expect(info?.sanitizedKeywords).toEqual(['']);
});

it('still matches other keywords when some sanitize to empty strings', () => {
  // '...' sanitizes away entirely; 'secret!!!' is expected to match as 'secret'.
  const text = 'Please keep this secret!';
  const config = KeywordsConfig.parse({ keywords: ['...', 'secret!!!'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['secret']);
});

it('matches keywords ending with special characters', () => {
  // The keyword ends in '@' and is followed by whitespace in the text.
  const text = 'Use foo@ in the config';
  const config = KeywordsConfig.parse({ keywords: ['foo@'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['foo@']);
});

it('matches keywords ending with punctuation when followed by word characters', () => {
  // The keyword ends in '@' and is immediately followed by letters in the text.
  const text = 'Check foo@example';
  const config = KeywordsConfig.parse({ keywords: ['foo@'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['foo@']);
});

it('matches mixed script keywords', () => {
  // Latin and CJK characters are combined in one keyword, surrounded by spaces in the text.
  const text = 'Welcome to hello你好world section';
  const config = KeywordsConfig.parse({ keywords: ['hello你好world'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['hello你好world']);
});

it('does not match partial mixed script keywords', () => {
  // The text appends "extra" straight after the keyword, so it is only a prefix.
  const text = 'This is hello你好worldextra';
  const config = KeywordsConfig.parse({ keywords: ['hello你好world'] });

  const { tripwireTriggered } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(false);
});

it('matches Arabic characters', () => {
  // The keyword is the first of two space-separated Arabic words.
  const text = 'مرحبا بك';
  const config = KeywordsConfig.parse({ keywords: ['مرحبا'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['مرحبا']);
});

it('matches Cyrillic characters', () => {
  // The keyword is the first of two space-separated Cyrillic words.
  const text = 'Привет мир';
  const config = KeywordsConfig.parse({ keywords: ['Привет'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['Привет']);
});

it('matches keywords with only punctuation', () => {
  // '@@' contains no letters or digits at all, yet is expected to be matched.
  const text = 'Use the @@ symbol';
  const config = KeywordsConfig.parse({ keywords: ['@@'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['@@']);
});

it('matches mixed punctuation and alphanumeric keywords', () => {
  // The keyword both starts and ends with '@' around an alphanumeric core.
  const text = 'Contact via @user123@';
  const config = KeywordsConfig.parse({ keywords: ['@user123@'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['@user123@']);
});
});

describe('urls guardrail', () => {
Expand Down
50 changes: 43 additions & 7 deletions src/checks/keywords.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,15 @@ export type KeywordsContext = z.infer<typeof KeywordsContext>;
* @param config Configuration specifying keywords and behavior
* @returns GuardrailResult indicating if tripwire was triggered
*/
/**
 * Regex character class (as pattern source) describing a "word" character:
 * any Unicode letter, combining mark, or number, plus underscore.
 *
 * Combining marks (`\p{M}`) are included so that a base letter followed by a
 * vowel sign or diacritic (e.g. Devanagari "कि", or a decomposed "é") is not
 * treated as sitting at a word boundary. The class must be used with the `u`
 * flag, because `\p{...}` escapes are only recognised in unicode mode.
 */
const WORD_CHAR_CLASS = '[\\p{L}\\p{M}\\p{N}_]';

/**
 * Reports whether `char` counts as a word character per `WORD_CHAR_CLASS`.
 *
 * @param char A single character (code point), or `undefined`.
 * @returns `false` for `undefined` or the empty string; otherwise whether the
 *   string contains a character from `WORD_CHAR_CLASS`.
 */
const isWordChar = (() => {
  // Compiled once and captured by the closure. The regex has no `g`/`y` flag,
  // so `test` keeps no `lastIndex` state between calls.
  const wordCharRegex = new RegExp(WORD_CHAR_CLASS, 'u');
  return (char: string | undefined): boolean => {
    if (char === undefined || char === '') return false;
    return wordCharRegex.test(char);
  };
})();

export const keywordsCheck: CheckFn<KeywordsContext, string, KeywordsConfig> = (
ctx,
text,
Expand All @@ -52,13 +61,40 @@ export const keywordsCheck: CheckFn<KeywordsContext, string, KeywordsConfig> = (
// Sanitize keywords by stripping trailing punctuation
const sanitizedKeywords = keywords.map((k: string) => k.replace(/[.,!?;:]+$/, ''));

// Create regex pattern with word boundaries
// Escape special regex characters and join with word boundaries
const escapedKeywords = sanitizedKeywords.map((k: string) =>
k.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
);
const patternText = `\\b(?:${escapedKeywords.join('|')})\\b`;
const pattern = new RegExp(patternText, 'gi'); // case-insensitive, global
const keywordEntries = sanitizedKeywords
.map((sanitized) => ({
sanitized,
escaped: sanitized.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'),
}))
.filter(({ sanitized }) => sanitized.length > 0);

if (keywordEntries.length === 0) {
return {
tripwireTriggered: false,
info: {
matchedKeywords: [],
originalKeywords: keywords,
sanitizedKeywords,
totalKeywords: keywords.length,
textLength: text.length,
},
};
}

// Apply unicode-aware word boundaries per keyword so tokens that start/end with punctuation still match.
const keywordPatterns = keywordEntries.map(({ sanitized, escaped }) => {
const keywordChars = Array.from(sanitized);
const firstChar = keywordChars[0];
const lastChar = keywordChars[keywordChars.length - 1];
const needsLeftBoundary = isWordChar(firstChar);
const needsRightBoundary = isWordChar(lastChar);
const leftBoundary = needsLeftBoundary ? `(?<!${WORD_CHAR_CLASS})` : '';
const rightBoundary = needsRightBoundary ? `(?!${WORD_CHAR_CLASS})` : '';
return `${leftBoundary}${escaped}${rightBoundary}`;
});

const patternText = `(?:${keywordPatterns.join('|')})`;
const pattern = new RegExp(patternText, 'giu'); // case-insensitive, global, unicode aware

const matches: string[] = [];
let match;
Expand Down
Loading