From 033f9b1ac93feba157a17102ba746d9f228eb20d Mon Sep 17 00:00:00 2001 From: Steven C Date: Tue, 18 Nov 2025 15:40:52 -0500 Subject: [PATCH 1/3] Support Unicode characters in keywords check --- .../unit/checks/keywords-urls.test.ts | 203 ++++++++++++++++++ src/checks/keywords.ts | 27 ++- 2 files changed, 226 insertions(+), 4 deletions(-) diff --git a/src/__tests__/unit/checks/keywords-urls.test.ts b/src/__tests__/unit/checks/keywords-urls.test.ts index 8f5cfa5..89cc102 100644 --- a/src/__tests__/unit/checks/keywords-urls.test.ts +++ b/src/__tests__/unit/checks/keywords-urls.test.ts @@ -32,6 +32,209 @@ describe('keywords guardrail', () => { expect(result.tripwireTriggered).toBe(false); expect(result.info?.matchedKeywords).toEqual([]); }); + + it('does not match partial words', () => { + const result = keywordsCheck( + {}, + 'Hello, world!', + KeywordsConfig.parse({ keywords: ['orld'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(false); + }); + + it('matches numbers', () => { + const result = keywordsCheck( + {}, + 'Hello, world123', + KeywordsConfig.parse({ keywords: ['world123'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.matchedKeywords).toEqual(['world123']); + }); + + it('does not match partial numbers', () => { + const result = keywordsCheck( + {}, + 'Hello, world12345', + KeywordsConfig.parse({ keywords: ['world123'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(false); + }); + + it('matches underscores', () => { + const result = keywordsCheck( + {}, + 'Hello, w_o_r_l_d', + KeywordsConfig.parse({ keywords: ['w_o_r_l_d'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.matchedKeywords).toEqual(['w_o_r_l_d']); + }); + + it('does not match when underscores appear inside other words', () => { + const result = keywordsCheck( + {}, + 'Hello, test_world_test', + KeywordsConfig.parse({ keywords: ['world'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(false); + }); + + it('matches chinese characters', () => { + const result = keywordsCheck( + {}, + '你好', + KeywordsConfig.parse({ keywords: ['你好'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + }); + + it('matches chinese characters with numbers', () => { + const result = keywordsCheck( + {}, + '你好123', + KeywordsConfig.parse({ keywords: ['你好123'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.matchedKeywords).toEqual(['你好123']); + }); + + it('does not match partial chinese characters with numbers', () => { + const result = keywordsCheck( + {}, + '你好12345', + KeywordsConfig.parse({ keywords: ['你好123'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(false); + }); + + it('applies word boundaries across multi-keyword patterns', () => { + const result = keywordsCheck( + {}, + 'testing hello world', + KeywordsConfig.parse({ keywords: ['test', 'hello', 'world'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.matchedKeywords).toEqual(['hello', 'world']); + }); + + it('matches keywords that start with special characters embedded in text', () => { + const result = keywordsCheck( + {}, + 'Reach me via example@foo.com later', + KeywordsConfig.parse({ keywords: ['@foo'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.matchedKeywords).toEqual(['@foo']); + }); + + it('matches keywords that start with # even when preceded by letters', () => { + const result = keywordsCheck( + {}, + 'Use example#foo for the ID', + KeywordsConfig.parse({ keywords: ['#foo'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.matchedKeywords).toEqual(['#foo']); + }); + + it('matches keywords ending with special characters', () => { + const result = keywordsCheck( + {}, + 'Use foo@ in the config', + KeywordsConfig.parse({ keywords: ['foo@'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.matchedKeywords).toEqual(['foo@']); + }); + + it('matches keywords ending with punctuation when followed by word characters', () => { + const result = keywordsCheck( + {}, + 'Check foo@example', + KeywordsConfig.parse({ keywords: ['foo@'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.matchedKeywords).toEqual(['foo@']); + }); + + it('matches mixed script keywords', () => { + const result = keywordsCheck( + {}, + 'Welcome to hello你好world section', + KeywordsConfig.parse({ keywords: ['hello你好world'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.matchedKeywords).toEqual(['hello你好world']); + }); + + it('does not match partial mixed script keywords', () => { + const result = keywordsCheck( + {}, + 'This is hello你好worldextra', + KeywordsConfig.parse({ keywords: ['hello你好world'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(false); + }); + + it('matches Arabic characters', () => { + const result = keywordsCheck( + {}, + 'مرحبا بك', + KeywordsConfig.parse({ keywords: ['مرحبا'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.matchedKeywords).toEqual(['مرحبا']); + }); + + it('matches Cyrillic characters', () => { + const result = keywordsCheck( + {}, + 'Привет мир', + KeywordsConfig.parse({ keywords: ['Привет'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.matchedKeywords).toEqual(['Привет']); + }); + + it('matches keywords with only punctuation', () => { + const result = keywordsCheck( + {}, + 'Use the @@ symbol', + KeywordsConfig.parse({ keywords: ['@@'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.matchedKeywords).toEqual(['@@']); + }); + + it('matches mixed punctuation and alphanumeric keywords', () => { + const result = keywordsCheck( + {}, + 'Contact via @user123@', + KeywordsConfig.parse({ keywords: ['@user123@'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.matchedKeywords).toEqual(['@user123@']); + }); }); describe('urls guardrail', () => { diff --git a/src/checks/keywords.ts b/src/checks/keywords.ts index fe23ea6..60bb48f 100644 --- a/src/checks/keywords.ts +++ b/src/checks/keywords.ts @@ -52,13 +52,32 @@ export const keywordsCheck: CheckFn = ( // Sanitize keywords by stripping trailing punctuation const sanitizedKeywords = keywords.map((k: string) => k.replace(/[.,!?;:]+$/, '')); - // Create regex pattern with word boundaries - // Escape special regex characters and join with word boundaries + // Escape special regex characters so keywords are treated literally const escapedKeywords = sanitizedKeywords.map((k: string) => k.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') ); - const patternText = `\\b(?:${escapedKeywords.join('|')})\\b`; - const pattern = new RegExp(patternText, 'gi'); // case-insensitive, global + + const isWordChar = (char: string | undefined) => { + if (!char) return false; + if (char === '_') return true; + return /[\p{L}\p{N}]/u.test(char); + }; + + // Apply unicode-aware word boundaries per keyword so tokens that start/end with punctuation still match. + const keywordPatterns = escapedKeywords.map((keyword, index) => { + const originalKeyword = sanitizedKeywords[index]; + const keywordChars = Array.from(originalKeyword); + const firstChar = keywordChars[0]; + const lastChar = keywordChars[keywordChars.length - 1]; + const needsLeftBoundary = isWordChar(firstChar); + const needsRightBoundary = isWordChar(lastChar); + const leftBoundary = needsLeftBoundary ? '(? Date: Tue, 18 Nov 2025 16:50:13 -0500 Subject: [PATCH 2/3] Copilot fixes --- .../unit/checks/keywords-urls.test.ts | 23 ++++++++++ src/checks/keywords.ts | 43 +++++++++++++------ 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/src/__tests__/unit/checks/keywords-urls.test.ts b/src/__tests__/unit/checks/keywords-urls.test.ts index 89cc102..91ed9c5 100644 --- a/src/__tests__/unit/checks/keywords-urls.test.ts +++ b/src/__tests__/unit/checks/keywords-urls.test.ts @@ -149,6 +149,29 @@ describe('keywords guardrail', () => { expect(result.info?.matchedKeywords).toEqual(['#foo']); }); + it('ignores keywords that become empty after sanitization', () => { + const result = keywordsCheck( + {}, + 'Totally benign text', + KeywordsConfig.parse({ keywords: ['!!!'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(false); + expect(result.info?.matchedKeywords).toEqual([]); + expect(result.info?.sanitizedKeywords).toEqual(['']); + }); + + it('still matches other keywords when some sanitize to empty strings', () => { + const result = keywordsCheck( + {}, + 'Please keep this secret!', + KeywordsConfig.parse({ keywords: ['...', 'secret!!!'] }) + ) as GuardrailResult; + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.matchedKeywords).toEqual(['secret']); + }); + it('matches keywords ending with special characters', () => { const result = keywordsCheck( {}, diff --git a/src/checks/keywords.ts b/src/checks/keywords.ts index 60bb48f..b1badf6 100644 --- a/src/checks/keywords.ts +++ b/src/checks/keywords.ts @@ -40,6 +40,13 @@ export type KeywordsContext = z.infer; * @param config Configuration specifying keywords and behavior * @returns GuardrailResult indicating if tripwire was triggered */ +const unicodeWordCharRegex = /[\p{L}\p{N}]/u; +const isWordChar = (char: string | undefined): boolean => { + if (!char) return false; + if (char === '_') return true; + return unicodeWordCharRegex.test(char); +}; + export const keywordsCheck: CheckFn = ( ctx, text, @@ -52,28 +59,36 @@ export const keywordsCheck: CheckFn = ( // Sanitize keywords by stripping trailing punctuation const sanitizedKeywords = keywords.map((k: string) => k.replace(/[.,!?;:]+$/, '')); - // Escape special regex characters so keywords are treated literally - const escapedKeywords = sanitizedKeywords.map((k: string) => - k.replace(/[.*+?^${}()|[\]\\]/g, '\\$&') - ); - - const isWordChar = (char: string | undefined) => { - if (!char) return false; - if (char === '_') return true; - return /[\p{L}\p{N}]/u.test(char); - }; + const keywordEntries = sanitizedKeywords + .map((sanitized) => ({ + sanitized, + escaped: sanitized.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'), + })) + .filter(({ sanitized }) => sanitized.length > 0); + + if (keywordEntries.length === 0) { + return { + tripwireTriggered: false, + info: { + matchedKeywords: [], + originalKeywords: keywords, + sanitizedKeywords, + totalKeywords: keywords.length, + textLength: text.length, + }, + }; + } // Apply unicode-aware word boundaries per keyword so tokens that start/end with punctuation still match. - const keywordPatterns = escapedKeywords.map((keyword, index) => { - const originalKeyword = sanitizedKeywords[index]; - const keywordChars = Array.from(originalKeyword); + const keywordPatterns = keywordEntries.map(({ sanitized, escaped }) => { + const keywordChars = Array.from(sanitized); const firstChar = keywordChars[0]; const lastChar = keywordChars[keywordChars.length - 1]; const needsLeftBoundary = isWordChar(firstChar); const needsRightBoundary = isWordChar(lastChar); const leftBoundary = needsLeftBoundary ? '(? Date: Wed, 19 Nov 2025 10:06:53 -0500 Subject: [PATCH 3/3] Copilot nits --- src/checks/keywords.ts | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/checks/keywords.ts b/src/checks/keywords.ts index b1badf6..6b37abf 100644 --- a/src/checks/keywords.ts +++ b/src/checks/keywords.ts @@ -40,12 +40,14 @@ export type KeywordsContext = z.infer; * @param config Configuration specifying keywords and behavior * @returns GuardrailResult indicating if tripwire was triggered */ -const unicodeWordCharRegex = /[\p{L}\p{N}]/u; -const isWordChar = (char: string | undefined): boolean => { - if (!char) return false; - if (char === '_') return true; - return unicodeWordCharRegex.test(char); -}; +const WORD_CHAR_CLASS = '[\\p{L}\\p{N}_]'; +const isWordChar = (() => { + const wordCharRegex = new RegExp(WORD_CHAR_CLASS, 'u'); + return (char: string | undefined): boolean => { + if (!char) return false; + return wordCharRegex.test(char); + }; +})(); export const keywordsCheck: CheckFn = ( ctx, @@ -86,8 +88,8 @@ export const keywordsCheck: CheckFn = ( const lastChar = keywordChars[keywordChars.length - 1]; const needsLeftBoundary = isWordChar(firstChar); const needsRightBoundary = isWordChar(lastChar); - const leftBoundary = needsLeftBoundary ? '(?