Skip to content

Commit e174356

Browse files
authored
Support Unicode characters in keywords check (#46)
* Support Unicode characters in keywords check. Thank you, @yehorkardash, for finding this and submitting a fix in PR #41.
1 parent 4627e9b commit e174356

File tree

2 files changed

+269
-7
lines changed

2 files changed

+269
-7
lines changed

src/__tests__/unit/checks/keywords-urls.test.ts

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,232 @@ describe('keywords guardrail', () => {
3232
expect(result.tripwireTriggered).toBe(false);
3333
expect(result.info?.matchedKeywords).toEqual([]);
3434
});
35+
36+
// 'orld' occurs only as the tail of 'world', so the guardrail must stay quiet.
it('does not match partial words', () => {
  const text = 'Hello, world!';
  const config = KeywordsConfig.parse({ keywords: ['orld'] });

  const { tripwireTriggered } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(false);
});
45+
46+
// A keyword made of letters followed by digits is matched as one whole token.
it('matches numbers', () => {
  const text = 'Hello, world123';
  const config = KeywordsConfig.parse({ keywords: ['world123'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['world123']);
});
56+
57+
// 'world123' is only a prefix of 'world12345'; trailing digits continue the word.
it('does not match partial numbers', () => {
  const text = 'Hello, world12345';
  const config = KeywordsConfig.parse({ keywords: ['world123'] });

  const { tripwireTriggered } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(false);
});
66+
67+
// Underscores inside a keyword are matched literally as part of the token.
it('matches underscores', () => {
  const text = 'Hello, w_o_r_l_d';
  const config = KeywordsConfig.parse({ keywords: ['w_o_r_l_d'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['w_o_r_l_d']);
});
77+
78+
// 'world' is surrounded by underscores in 'test_world_test'; underscores count
// as word characters, so this is not a standalone occurrence.
it('does not match when underscores appear inside other words', () => {
  const text = 'Hello, test_world_test';
  const config = KeywordsConfig.parse({ keywords: ['world'] });

  const { tripwireTriggered } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(false);
});
87+
88+
// A keyword written entirely in Chinese characters must trip the guardrail.
// The matched keyword list is asserted as well, so this test pins the reported
// match in the same way the other positive-match tests in this suite do.
it('matches chinese characters', () => {
  const result = keywordsCheck(
    {},
    '你好',
    KeywordsConfig.parse({ keywords: ['你好'] })
  ) as GuardrailResult;

  expect(result.tripwireTriggered).toBe(true);
  expect(result.info?.matchedKeywords).toEqual(['你好']);
});
97+
98+
// Chinese characters followed by digits are matched as a single keyword.
it('matches chinese characters with numbers', () => {
  const text = '你好123';
  const config = KeywordsConfig.parse({ keywords: ['你好123'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['你好123']);
});
108+
109+
// '你好123' is only a prefix of '你好12345', so no match is expected.
it('does not match partial chinese characters with numbers', () => {
  const text = '你好12345';
  const config = KeywordsConfig.parse({ keywords: ['你好123'] });

  const { tripwireTriggered } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(false);
});
118+
119+
// With several keywords configured, boundaries apply to each one: 'test' is
// only a prefix of 'testing', so just 'hello' and 'world' are reported.
it('applies word boundaries across multi-keyword patterns', () => {
  const text = 'testing hello world';
  const config = KeywordsConfig.parse({ keywords: ['test', 'hello', 'world'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['hello', 'world']);
});
129+
130+
// '@foo' begins with a non-word character, so it is expected to match even
// though it directly follows the letters of 'example'.
it('matches keywords that start with special characters embedded in text', () => {
  const text = 'Reach me via example@foo.com later';
  const config = KeywordsConfig.parse({ keywords: ['@foo'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['@foo']);
});
140+
141+
// '#foo' starts with '#', so the letters of 'example' right before it must not
// prevent the match.
it('matches keywords that start with # even when preceded by letters', () => {
  const text = 'Use example#foo for the ID';
  const config = KeywordsConfig.parse({ keywords: ['#foo'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['#foo']);
});
151+
152+
// '!!!' is expected to sanitize to an empty string: nothing may match, and the
// empty entry is still reported in sanitizedKeywords.
it('ignores keywords that become empty after sanitization', () => {
  const text = 'Totally benign text';
  const config = KeywordsConfig.parse({ keywords: ['!!!'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(false);
  expect(info?.matchedKeywords).toEqual([]);
  expect(info?.sanitizedKeywords).toEqual(['']);
});
163+
164+
// One keyword ('...') is pure trailing punctuation; the other ('secret!!!') is
// expected to be reported as 'secret' and must still be detected.
it('still matches other keywords when some sanitize to empty strings', () => {
  const text = 'Please keep this secret!';
  const config = KeywordsConfig.parse({ keywords: ['...', 'secret!!!'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['secret']);
});
174+
175+
// A keyword whose last character is '@' is matched when followed by a space.
it('matches keywords ending with special characters', () => {
  const text = 'Use foo@ in the config';
  const config = KeywordsConfig.parse({ keywords: ['foo@'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['foo@']);
});
185+
186+
// 'foo@' ends with '@', so the word characters of 'example' right after it
// must not block the match.
it('matches keywords ending with punctuation when followed by word characters', () => {
  const text = 'Check foo@example';
  const config = KeywordsConfig.parse({ keywords: ['foo@'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['foo@']);
});
196+
197+
// A keyword mixing Latin and Chinese characters is matched when it stands
// alone between spaces.
it('matches mixed script keywords', () => {
  const text = 'Welcome to hello你好world section';
  const config = KeywordsConfig.parse({ keywords: ['hello你好world'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['hello你好world']);
});
207+
208+
// The mixed-script keyword is immediately followed by 'extra', so it is only a
// prefix of a longer word and must not match.
it('does not match partial mixed script keywords', () => {
  const text = 'This is hello你好worldextra';
  const config = KeywordsConfig.parse({ keywords: ['hello你好world'] });

  const { tripwireTriggered } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(false);
});
217+
218+
// An Arabic keyword followed by a space and another word is detected.
it('matches Arabic characters', () => {
  const text = 'مرحبا بك';
  const config = KeywordsConfig.parse({ keywords: ['مرحبا'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['مرحبا']);
});
228+
229+
// A Cyrillic keyword followed by a space and another word is detected.
it('matches Cyrillic characters', () => {
  const text = 'Привет мир';
  const config = KeywordsConfig.parse({ keywords: ['Привет'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['Привет']);
});
239+
240+
// A keyword consisting solely of '@' characters is still searched for and found.
it('matches keywords with only punctuation', () => {
  const text = 'Use the @@ symbol';
  const config = KeywordsConfig.parse({ keywords: ['@@'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['@@']);
});
250+
251+
// A keyword that both starts and ends with '@' around alphanumerics is matched
// in full.
it('matches mixed punctuation and alphanumeric keywords', () => {
  const text = 'Contact via @user123@';
  const config = KeywordsConfig.parse({ keywords: ['@user123@'] });

  const { tripwireTriggered, info } = keywordsCheck({}, text, config) as GuardrailResult;

  expect(tripwireTriggered).toBe(true);
  expect(info?.matchedKeywords).toEqual(['@user123@']);
});
35261
});
36262

37263
describe('urls guardrail', () => {

src/checks/keywords.ts

Lines changed: 43 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,15 @@ export type KeywordsContext = z.infer<typeof KeywordsContext>;
4040
* @param config Configuration specifying keywords and behavior
4141
* @returns GuardrailResult indicating if tripwire was triggered
4242
*/
43+
// NOTE(review): these helpers sit between the JSDoc block that documents
// keywordsCheck and the keywordsCheck declaration itself, so that JSDoc no
// longer immediately precedes the function — consider moving the helpers
// above the JSDoc block.

// Source text of a regex character class matching one "word" character: any
// Unicode letter (\p{L}), any Unicode number (\p{N}), or an underscore. Kept as
// a string so it can be embedded in larger patterns; a RegExp built from it
// needs the `u` flag for the \p{...} escapes.
const WORD_CHAR_CLASS = '[\\p{L}\\p{N}_]';

// Returns true when `char` is a non-empty string containing a character from
// WORD_CHAR_CLASS, and false for undefined or ''. The RegExp is compiled once
// and captured by the returned predicate; it carries no `g` flag, so repeated
// test() calls do not depend on lastIndex.
const isWordChar = ((matcher: RegExp) =>
  (char: string | undefined): boolean =>
    char ? matcher.test(char) : false)(new RegExp(WORD_CHAR_CLASS, 'u'));
51+
4352
export const keywordsCheck: CheckFn<KeywordsContext, string, KeywordsConfig> = (
4453
ctx,
4554
text,
@@ -52,13 +61,40 @@ export const keywordsCheck: CheckFn<KeywordsContext, string, KeywordsConfig> = (
5261
// Sanitize keywords by stripping trailing punctuation
5362
const sanitizedKeywords = keywords.map((k: string) => k.replace(/[.,!?;:]+$/, ''));
5463

55-
// Create regex pattern with word boundaries
56-
// Escape special regex characters and join with word boundaries
57-
const escapedKeywords = sanitizedKeywords.map((k: string) =>
58-
k.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
59-
);
60-
const patternText = `\\b(?:${escapedKeywords.join('|')})\\b`;
61-
const pattern = new RegExp(patternText, 'gi'); // case-insensitive, global
64+
const keywordEntries = sanitizedKeywords
65+
.map((sanitized) => ({
66+
sanitized,
67+
escaped: sanitized.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'),
68+
}))
69+
.filter(({ sanitized }) => sanitized.length > 0);
70+
71+
if (keywordEntries.length === 0) {
72+
return {
73+
tripwireTriggered: false,
74+
info: {
75+
matchedKeywords: [],
76+
originalKeywords: keywords,
77+
sanitizedKeywords,
78+
totalKeywords: keywords.length,
79+
textLength: text.length,
80+
},
81+
};
82+
}
83+
84+
// Apply unicode-aware word boundaries per keyword so tokens that start/end with punctuation still match.
85+
const keywordPatterns = keywordEntries.map(({ sanitized, escaped }) => {
86+
const keywordChars = Array.from(sanitized);
87+
const firstChar = keywordChars[0];
88+
const lastChar = keywordChars[keywordChars.length - 1];
89+
const needsLeftBoundary = isWordChar(firstChar);
90+
const needsRightBoundary = isWordChar(lastChar);
91+
const leftBoundary = needsLeftBoundary ? `(?<!${WORD_CHAR_CLASS})` : '';
92+
const rightBoundary = needsRightBoundary ? `(?!${WORD_CHAR_CLASS})` : '';
93+
return `${leftBoundary}${escaped}${rightBoundary}`;
94+
});
95+
96+
const patternText = `(?:${keywordPatterns.join('|')})`;
97+
const pattern = new RegExp(patternText, 'giu'); // case-insensitive, global, unicode aware
6298

6399
const matches: string[] = [];
64100
let match;

0 commit comments

Comments
 (0)