Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions src/guardrails/checks/text/keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,21 @@ def _compile_pattern(keywords: tuple[str, ...]) -> re.Pattern[str]:
Returns:
re.Pattern[str]: Compiled regex pattern to match any given keyword.
"""
escaped = (re.escape(k) for k in keywords)
pattern_text = r"\b(?:" + "|".join(escaped) + r")\b"
# Build individual patterns with conditional boundary assertions
# Only apply (?<!\w) if keyword starts with word char, (?!\w) if it ends with word char
patterns = []
for keyword in keywords:
escaped = re.escape(keyword)
# Check first and last character of the original keyword for word character status
starts_with_word_char = keyword and (keyword[0].isalnum() or keyword[0] == "_")
ends_with_word_char = keyword and (keyword[-1].isalnum() or keyword[-1] == "_")

prefix = r"(?<!\w)" if starts_with_word_char else ""
suffix = r"(?!\w)" if ends_with_word_char else ""
patterns.append(f"{prefix}{escaped}{suffix}")

# (?<!\w) and (?!\w) use Unicode-aware lookbehind/lookahead to enforce word boundaries.
pattern_text = "(?:" + "|".join(patterns) + ")"

return re.compile(pattern_text, re.IGNORECASE)

Expand Down
129 changes: 129 additions & 0 deletions tests/unit/checks/test_keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,132 @@ async def test_keywords_does_not_trigger_on_benign_text() -> None:
result = await keywords(ctx=None, data="Safe content", config=config)

assert result.tripwire_triggered is False # noqa: S101


def test_match_keywords_does_not_match_partial_words() -> None:
    """A keyword that is only a fragment of a longer word must not trip the guardrail."""
    text = "Hello, world!"
    cfg = KeywordCfg(keywords=["orld"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    assert outcome.tripwire_triggered is False  # noqa: S101


def test_match_keywords_handles_numeric_tokens() -> None:
    """A keyword with trailing digits is reported when the exact token appears."""
    text = "Hello, world123"
    cfg = KeywordCfg(keywords=["world123"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    assert outcome.tripwire_triggered is True  # noqa: S101
    assert outcome.info["matched"] == ["world123"]  # noqa: S101


def test_match_keywords_rejects_partial_numeric_tokens() -> None:
    """A digit-suffixed keyword must not match a token that carries extra digits."""
    text = "Hello, world12345"
    cfg = KeywordCfg(keywords=["world123"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    assert outcome.tripwire_triggered is False  # noqa: S101


def test_match_keywords_handles_underscored_tokens() -> None:
    """A keyword containing underscores is matched and listed a single time."""
    text = "Hello, w_o_r_l_d"
    cfg = KeywordCfg(keywords=["w_o_r_l_d"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    assert outcome.tripwire_triggered is True  # noqa: S101
    assert outcome.info["matched"] == ["w_o_r_l_d"]  # noqa: S101


def test_match_keywords_rejects_words_embedded_in_underscores() -> None:
    """A keyword flanked by underscores inside a larger token must not match."""
    text = "Hello, test_world_test"
    cfg = KeywordCfg(keywords=["world"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    assert outcome.tripwire_triggered is False  # noqa: S101


def test_match_keywords_handles_chinese_characters() -> None:
    """A keyword written in Chinese characters is matched against identical text."""
    text = "你好"
    cfg = KeywordCfg(keywords=["你好"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    assert outcome.tripwire_triggered is True  # noqa: S101
    assert outcome.info["matched"] == ["你好"]  # noqa: S101


def test_match_keywords_handles_chinese_tokens_with_digits() -> None:
    """A keyword mixing Chinese characters and digits matches the exact token."""
    text = "你好123"
    cfg = KeywordCfg(keywords=["你好123"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    assert outcome.tripwire_triggered is True  # noqa: S101
    assert outcome.info["matched"] == ["你好123"]  # noqa: S101


def test_match_keywords_rejects_partial_chinese_tokens_with_digits() -> None:
    """A Chinese-plus-digits keyword must not match when more digits follow it."""
    text = "你好12345"
    cfg = KeywordCfg(keywords=["你好123"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    assert outcome.tripwire_triggered is False  # noqa: S101


def test_match_keywords_applies_boundaries_to_all_keywords() -> None:
    """With several keywords configured, each one is held to whole-token matching.

    "test" appears only as the prefix of "testing", so it is expected to be
    absent from the matches while "hello" and "world" are reported.
    """
    text = "testing hello world"
    cfg = KeywordCfg(keywords=["test", "hello", "world"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    assert outcome.tripwire_triggered is True  # noqa: S101
    assert outcome.info["matched"] == ["hello", "world"]  # noqa: S101


def test_match_keywords_detects_email_like_patterns() -> None:
    """A keyword beginning with "@" matches even when word characters precede it."""
    text = "foo@corp.com"
    cfg = KeywordCfg(keywords=["@corp.com"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    assert outcome.tripwire_triggered is True  # noqa: S101
    assert outcome.info["matched"] == ["@corp.com"]  # noqa: S101


def test_match_keywords_detects_hashtag_patterns() -> None:
    """A keyword beginning with "#" matches even when word characters precede it."""
    text = "abc#leak"
    cfg = KeywordCfg(keywords=["#leak"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    assert outcome.tripwire_triggered is True  # noqa: S101
    assert outcome.info["matched"] == ["#leak"]  # noqa: S101


def test_match_keywords_respects_end_boundary_for_punctuation_prefixed() -> None:
    """A keyword that starts with punctuation but ends in a word char keeps its end boundary."""
    cfg = KeywordCfg(keywords=["@leak"])

    # Word characters continue right after the keyword: no match expected.
    continued = match_keywords("foo@leakmore", cfg, guardrail_name="Test Guardrail")
    assert continued.tripwire_triggered is False  # noqa: S101

    # A non-word character (space) follows the keyword: match expected.
    terminated = match_keywords("foo@leak bar", cfg, guardrail_name="Test Guardrail")
    assert terminated.tripwire_triggered is True  # noqa: S101
    assert terminated.info["matched"] == ["@leak"]  # noqa: S101


def test_match_keywords_handles_full_punctuation_keywords() -> None:
    """A keyword made up solely of punctuation matches even between word characters."""
    text = "test@#$test"
    cfg = KeywordCfg(keywords=["@#$"])

    outcome = match_keywords(text, cfg, guardrail_name="Test Guardrail")

    assert outcome.tripwire_triggered is True  # noqa: S101
    assert outcome.info["matched"] == ["@#$"]  # noqa: S101


def test_match_keywords_mixed_punctuation_and_word_chars() -> None:
    """A keyword that both starts and ends with punctuation matches when embedded in text."""
    cfg = KeywordCfg(keywords=["@user@"])

    # Keyword sits between two runs of word characters.
    embedded = match_keywords("test@user@test", cfg, guardrail_name="Test Guardrail")
    assert embedded.tripwire_triggered is True  # noqa: S101

    # Different trailing text; still expected to match because the keyword's
    # edges are punctuation.
    followed = match_keywords("test@user@more", cfg, guardrail_name="Test Guardrail")
    assert followed.tripwire_triggered is True  # noqa: S101
Loading