diff --git a/src/guardrails/checks/text/keywords.py b/src/guardrails/checks/text/keywords.py index 297bf96..2de4649 100644 --- a/src/guardrails/checks/text/keywords.py +++ b/src/guardrails/checks/text/keywords.py @@ -73,8 +73,21 @@ def _compile_pattern(keywords: tuple[str, ...]) -> re.Pattern[str]: Returns: re.Pattern[str]: Compiled regex pattern to match any given keyword. """ - escaped = (re.escape(k) for k in keywords) - pattern_text = r"\b(?:" + "|".join(escaped) + r")\b" + # Build individual patterns with conditional boundary assertions + # Only apply (? None: result = await keywords(ctx=None, data="Safe content", config=config) assert result.tripwire_triggered is False # noqa: S101 + + +def test_match_keywords_does_not_match_partial_words() -> None: + """Ensure substrings embedded in larger words are ignored.""" + config = KeywordCfg(keywords=["orld"]) + result = match_keywords("Hello, world!", config, guardrail_name="Test Guardrail") + + assert result.tripwire_triggered is False # noqa: S101 + + +def test_match_keywords_handles_numeric_tokens() -> None: + """Keywords containing digits should match exact tokens.""" + config = KeywordCfg(keywords=["world123"]) + result = match_keywords("Hello, world123", config, guardrail_name="Test Guardrail") + + assert result.tripwire_triggered is True # noqa: S101 + assert result.info["matched"] == ["world123"] # noqa: S101 + + +def test_match_keywords_rejects_partial_numeric_tokens() -> None: + """Numeric keywords should not match when extra digits follow.""" + config = KeywordCfg(keywords=["world123"]) + result = match_keywords("Hello, world12345", config, guardrail_name="Test Guardrail") + + assert result.tripwire_triggered is False # noqa: S101 + + +def test_match_keywords_handles_underscored_tokens() -> None: + """Underscored keywords should be detected exactly once.""" + config = KeywordCfg(keywords=["w_o_r_l_d"]) + result = match_keywords("Hello, w_o_r_l_d", config, guardrail_name="Test Guardrail") + + assert result.tripwire_triggered is True # noqa: S101 + assert result.info["matched"] == ["w_o_r_l_d"] # noqa: S101 + + +def test_match_keywords_rejects_words_embedded_in_underscores() -> None: + """Words surrounded by underscores should not trigger partial matches.""" + config = KeywordCfg(keywords=["world"]) + result = match_keywords("Hello, test_world_test", config, guardrail_name="Test Guardrail") + + assert result.tripwire_triggered is False # noqa: S101 + + +def test_match_keywords_handles_chinese_characters() -> None: + """Unicode keywords such as Chinese characters should match.""" + config = KeywordCfg(keywords=["你好"]) + result = match_keywords("你好", config, guardrail_name="Test Guardrail") + + assert result.tripwire_triggered is True # noqa: S101 + assert result.info["matched"] == ["你好"] # noqa: S101 + + +def test_match_keywords_handles_chinese_tokens_with_digits() -> None: + """Unicode keywords that include digits should match whole tokens.""" + config = KeywordCfg(keywords=["你好123"]) + result = match_keywords("你好123", config, guardrail_name="Test Guardrail") + + assert result.tripwire_triggered is True # noqa: S101 + assert result.info["matched"] == ["你好123"] # noqa: S101 + + +def test_match_keywords_rejects_partial_chinese_tokens_with_digits() -> None: + """Unicode keywords with trailing digits should not match supersets.""" + config = KeywordCfg(keywords=["你好123"]) + result = match_keywords("你好12345", config, guardrail_name="Test Guardrail") + + assert result.tripwire_triggered is False # noqa: S101 + + +def test_match_keywords_applies_boundaries_to_all_keywords() -> None: + """Every keyword in a multi-token pattern should respect Unicode boundaries.""" + config = KeywordCfg(keywords=["test", "hello", "world"]) + result = match_keywords("testing hello world", config, guardrail_name="Test Guardrail") + + assert result.tripwire_triggered is True # noqa: S101 + assert result.info["matched"] == ["hello", "world"] # noqa: S101 + + +def test_match_keywords_detects_email_like_patterns() -> None: + """Email-like keywords starting with punctuation should match after word chars.""" + config = KeywordCfg(keywords=["@corp.com"]) + result = match_keywords("foo@corp.com", config, guardrail_name="Test Guardrail") + + assert result.tripwire_triggered is True # noqa: S101 + assert result.info["matched"] == ["@corp.com"] # noqa: S101 + + +def test_match_keywords_detects_hashtag_patterns() -> None: + """Hashtag keywords starting with punctuation should match after word chars.""" + config = KeywordCfg(keywords=["#leak"]) + result = match_keywords("abc#leak", config, guardrail_name="Test Guardrail") + + assert result.tripwire_triggered is True # noqa: S101 + assert result.info["matched"] == ["#leak"] # noqa: S101 + + +def test_match_keywords_respects_end_boundary_for_punctuation_prefixed() -> None: + """Punctuation-prefixed keywords ending with word chars need end boundary.""" + config = KeywordCfg(keywords=["@leak"]) + # Should not match when word chars continue after + result = match_keywords("foo@leakmore", config, guardrail_name="Test Guardrail") + assert result.tripwire_triggered is False # noqa: S101 + + # Should match when followed by non-word char + result = match_keywords("foo@leak bar", config, guardrail_name="Test Guardrail") + assert result.tripwire_triggered is True # noqa: S101 + assert result.info["matched"] == ["@leak"] # noqa: S101 + + +def test_match_keywords_handles_full_punctuation_keywords() -> None: + """Keywords consisting only of punctuation should match anywhere.""" + config = KeywordCfg(keywords=["@#$"]) + result = match_keywords("test@#$test", config, guardrail_name="Test Guardrail") + + assert result.tripwire_triggered is True # noqa: S101 + assert result.info["matched"] == ["@#$"] # noqa: S101 + + +def test_match_keywords_mixed_punctuation_and_word_chars() -> None: + """Keywords with both punctuation prefix and suffix should work correctly.""" + config = KeywordCfg(keywords=["@user@"]) + # Should match when embedded + result = match_keywords("test@user@test", config, guardrail_name="Test Guardrail") + assert result.tripwire_triggered is True # noqa: S101 + + # Should match even when followed by more text (no boundaries applied to punctuation edges) + result = match_keywords("test@user@more", config, guardrail_name="Test Guardrail") + assert result.tripwire_triggered is True # noqa: S101