diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 23fd0dd3c..9e2c0e989 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1650,6 +1650,9 @@ class _RuleFeatureIndex: # Mapping from rule name to list of Bytes features that have to match. # All these features will be evaluated whenever a Bytes feature is encountered. bytes_rules: dict[str, list[Feature]] + # Rules that cannot be safely handled by optimized candidate filtering. + # These are always evaluated via the unoptimized path. + fallback_rules: set[str] # this routine is unstable and may change before the next major release. @staticmethod @@ -1802,13 +1805,18 @@ def and_score_key(item): # Ideally we find a way to get rid of all of these, eventually. string_rules: dict[str, list[Feature]] = {} bytes_rules: dict[str, list[Feature]] = {} + fallback_rules: set[str] = set() for rule in rules: rule_name = rule.meta["name"] root = rule.statement item = rec(rule_name, root) - assert item is not None + if item is None: + logger.debug("indexing: no stable index feature for rule, will use fallback matcher: %s", rule_name) + scores_by_rule[rule_name] = 0 + fallback_rules.add(rule_name) + continue score, features = item string_features = [ @@ -1847,8 +1855,9 @@ def and_score_key(item): logger.debug( "indexing: %d scanning string features, %d scanning bytes features", len(string_rules), len(bytes_rules) ) + logger.debug("indexing: %d rules evaluated via fallback matcher", len(fallback_rules)) - return RuleSet._RuleFeatureIndex(rules_by_feature, string_rules, bytes_rules) + return RuleSet._RuleFeatureIndex(rules_by_feature, string_rules, bytes_rules, fallback_rules) @staticmethod def _get_rules_for_scope(rules, scope) -> list[Rule]: @@ -1976,7 +1985,7 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> tuple[Fea # Find all the rules that could match the given feature set. # Ideally we want this set to be as small and focused as possible, # and we can tune it by tweaking `_index_rules_by_feature`. - candidate_rule_names: set[str] = set() + candidate_rule_names: set[str] = set(feature_index.fallback_rules) for feature in features: candidate_rule_names.update(feature_index.rules_by_feature.get(feature, ())) @@ -2109,13 +2118,9 @@ def match( This wrapper around _match exists so that we can assert it matches precisely the same as `capa.engine.match`, just faster. - This matcher does not handle some edge cases: - - top level NOT statements - - also top level counted features with zero occurances, like: `count(menmonic(mov)): 0` - - nested NOT statements (NOT: NOT: foo) - - We should discourage/forbid these constructs from our rules and add lints for them. - TODO(williballenthin): add lints for logic edge cases + Some edge-case rule constructs cannot be safely optimized for candidate filtering. + In those cases, this matcher transparently falls back to unoptimized evaluation + for the affected rules. Args: paranoid: when true, demonstrate that the naive matcher agrees with this diff --git a/tests/test_match.py b/tests/test_match.py index 8fdd146dc..0c1b451a7 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -198,6 +198,31 @@ def test_match_range_with_zero(): assert "test rule" not in matches +def test_match_top_level_range_exact_zero(): + rule = textwrap.dedent( + """ + rule: + meta: + name: test rule + scopes: + static: function + dynamic: process + features: + - count(number(100)): 0 + """ + ) + r = capa.rules.Rule.from_yaml(rule) + + _, matches = match([r], {}, 0x0) + assert "test rule" in matches + + _, matches = match([r], {capa.features.insn.Number(100): {}}, 0x0) + assert "test rule" in matches + + _, matches = match([r], {capa.features.insn.Number(100): {1}}, 0x0) + assert "test rule" not in matches + + def test_match_adds_matched_rule_feature(): """show that using `match` adds a feature for matched rules.""" rule = textwrap.dedent( @@ -585,7 +610,6 @@ def test_regex_get_value_str(pattern): assert capa.features.common.Regex(pattern).get_value_str() == pattern -@pytest.mark.xfail(reason="can't have top level NOT") def test_match_only_not(): rule = textwrap.dedent( """ @@ -630,7 +654,6 @@ def test_match_not(): assert "test rule" in matches -@pytest.mark.xfail(reason="can't have nested NOT") def test_match_not_not(): rule = textwrap.dedent( """ diff --git a/tests/test_rules.py b/tests/test_rules.py index 9a8b6874f..72f61fb6f 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -1685,3 +1685,62 @@ def test_circular_dependency(): ] with pytest.raises(capa.rules.InvalidRule): list(capa.rules.get_rules_and_dependencies(rules, rules[0].name)) + + +def test_invalid_top_level_not_statement(): + with pytest.raises(capa.rules.InvalidRule, match="top level statement may not be a `not` statement"): + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + scopes: + static: function + dynamic: process + features: + - not: + - number: 1 + """ + ) + ) + + +def test_invalid_nested_not_statement(): + with pytest.raises(capa.rules.InvalidRule, match="nested `not` statements are not supported"): + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + scopes: + static: function + dynamic: process + features: + - and: + - mnemonic: mov + - not: + - not: + - number: 1 + """ + ) + ) + + +def test_invalid_top_level_count_zero_statement(): + with pytest.raises(capa.rules.InvalidRule, match="top level statement may not be `count\\(...\\): 0`"): + capa.rules.Rule.from_yaml( + textwrap.dedent( + """ + rule: + meta: + name: test rule + scopes: + static: function + dynamic: process + features: + - count(number(100)): 0 + """ + ) + )