From 8b0076bfcea16ddfb249b497929e6e426b3adcf5 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Tue, 14 May 2024 21:41:18 +0200
Subject: [PATCH 01/28] features: mark format as a global feature

---
 capa/features/common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/capa/features/common.py b/capa/features/common.py
index 279332e6e..b817fafb9 100644
--- a/capa/features/common.py
+++ b/capa/features/common.py
@@ -490,6 +490,6 @@ def __init__(self, value: str, description=None):
 def is_global_feature(feature):
     """
     is this a feature that is extracted at every scope?
-    today, these are OS and arch features.
+    today, these are OS, arch, and format features.
     """
-    return isinstance(feature, (OS, Arch))
+    return isinstance(feature, (OS, Arch, Format))

From 8858537af8936c836b67a91b5922d360363ddb0d Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Tue, 14 May 2024 21:41:33 +0200
Subject: [PATCH 02/28] pep8

---
 capa/features/extractors/binexport2/insn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py
index 9b027ec88..1a6a99b21 100644
--- a/capa/features/extractors/binexport2/insn.py
+++ b/capa/features/extractors/binexport2/insn.py
@@ -14,7 +14,7 @@
 from capa.features.insn import API, Number, Mnemonic, OperandNumber
 from capa.features.common import Bytes, String, Feature, Characteristic
 from capa.features.address import Address, AbsoluteVirtualAddress
-from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext, AnalysisContext
+from capa.features.extractors.binexport2 import AnalysisContext, FunctionContext, ReadMemoryError, InstructionContext
 from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
 from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2
 

From 9c0c66245e9aaee8f82cbe92869de4fba1549712 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Tue, 14 May 2024 21:41:40 +0200
Subject: [PATCH 03/28] rules: optimize rule pre-filtering, first revision

---
 capa/rules/__init__.py | 528 ++++++++++++++++++++++++++---------------
 1 file changed, 337 insertions(+), 191 deletions(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 67d0b03ea..43dcb7665 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -9,11 +9,13 @@
 import io
 import os
 import re
+import copy
 import uuid
 import codecs
 import logging
 import binascii
 import collections
+import dataclasses
 from enum import Enum
 from pathlib import Path
 
@@ -1365,32 +1367,52 @@ def __init__(
 
         rules = capa.optimizer.optimize_rules(rules)
 
-        self.file_rules = self._get_rules_for_scope(rules, Scope.FILE)
-        self.process_rules = self._get_rules_for_scope(rules, Scope.PROCESS)
-        self.thread_rules = self._get_rules_for_scope(rules, Scope.THREAD)
-        self.call_rules = self._get_rules_for_scope(rules, Scope.CALL)
-        self.function_rules = self._get_rules_for_scope(rules, Scope.FUNCTION)
-        self.basic_block_rules = self._get_rules_for_scope(rules, Scope.BASIC_BLOCK)
-        self.instruction_rules = self._get_rules_for_scope(rules, Scope.INSTRUCTION)
+        scopes = (
+            Scope.FILE,
+            Scope.PROCESS,
+            Scope.THREAD,
+            Scope.CALL,
+            Scope.FUNCTION,
+            Scope.BASIC_BLOCK,
+            Scope.INSTRUCTION,
+        )
+
         self.rules = {rule.name: rule for rule in rules}
         self.rules_by_namespace = index_rules_by_namespace(rules)
+        self.rules_by_scope = {scope: self._get_rules_for_scope(rules, scope) for scope in scopes}
 
         # unstable
-        (self._easy_file_rules_by_feature, self._hard_file_rules) = self._index_rules_by_feature(self.file_rules)
-        (self._easy_process_rules_by_feature, self._hard_process_rules) = self._index_rules_by_feature(
-            self.process_rules
-        )
-        (self._easy_thread_rules_by_feature, self._hard_thread_rules) = self._index_rules_by_feature(self.thread_rules)
-        (self._easy_call_rules_by_feature, self._hard_call_rules) = self._index_rules_by_feature(self.call_rules)
-        (self._easy_function_rules_by_feature, self._hard_function_rules) = self._index_rules_by_feature(
-            self.function_rules
-        )
-        (self._easy_basic_block_rules_by_feature, self._hard_basic_block_rules) = self._index_rules_by_feature(
-            self.basic_block_rules
-        )
-        (self._easy_instruction_rules_by_feature, self._hard_instruction_rules) = self._index_rules_by_feature(
-            self.instruction_rules
-        )
+        self._feature_indexes_by_scopes = {
+            scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope]) for scope in scopes
+        }
+
+    @property
+    def file_rules(self):
+        return self.rules_by_scope[Scope.FILE]
+
+    @property
+    def process_rules(self):
+        return self.rules_by_scope[Scope.PROCESS]
+
+    @property
+    def thread_rules(self):
+        return self.rules_by_scope[Scope.THREAD]
+
+    @property
+    def call_rules(self):
+        return self.rules_by_scope[Scope.CALL]
+
+    @property
+    def function_rules(self):
+        return self.rules_by_scope[Scope.FUNCTION]
+
+    @property
+    def basic_block_rules(self):
+        return self.rules_by_scope[Scope.BASIC_BLOCK]
+
+    @property
+    def instruction_rules(self):
+        return self.rules_by_scope[Scope.INSTRUCTION]
 
     def __len__(self):
         return len(self.rules)
@@ -1402,123 +1424,186 @@ def __contains__(self, rulename):
         return rulename in self.rules
 
     @staticmethod
-    def _index_rules_by_feature(rules) -> Tuple[Dict[Feature, Set[str]], List[str]]:
-        """
-        split the given rules into two structures:
-          - "easy rules" are indexed by feature,
-            such that you can quickly find the rules that contain a given feature.
-          - "hard rules" are those that contain substring/regex/bytes features or match statements.
-            these continue to be ordered topologically.
+    def _score_number(v: int) -> int:
+        if -0x8000 <= v <= 0x8000:
+            return 3
 
-        a rule evaluator can use the "easy rule" index to restrict the
-        candidate rules that might match a given set of features.
+        if 0xFFFF_FF00 <= v <= 0xFFFF_FFFF:
+            return 3
 
-        at this time, a rule evaluator can't do anything special with
-        the "hard rules". it must still do a full top-down match of each
-        rule, in topological order.
+        return 7
 
-        this does not index global features, because these are not selective, and
-        won't be used as the sole feature used to match.
-        """
+    @staticmethod
+    def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Feature) -> int:
+        if isinstance(
+            node,
+            capa.features.common.MatchedRule,
+        ):
+            if node.value in scores_by_rule:
+                # other rule must match before this one, in same scope.
+                # use score from that rule, which will have already been processed.
+                return scores_by_rule[node.value]
+            else:
+                # scores_by_rule only contains rules for the current scope
+                # so the requested rule must be from a smaller scope
+                # and we can assume the feature will exist.
+                #
+                # We don't know what the score should be, unfortunately.
+                # Could try to thread that through. Use "5" in the meantime.
+                return 5
+
+        elif capa.features.common.is_global_feature(node):
+            # we don't want to index global features
+            # because they're not very selective.
+            # they also don't stand on their own - there's always some other logic.
+            return 0
+
+        elif isinstance(node, (capa.features.insn.Number, capa.features.insn.OperandNumber)):
+            v = node.value
+            assert isinstance(v, int)
+            return RuleSet._score_number(v)
+
+        C = node.__class__
+        return {
+            # Scanning features (non-hashable)
+            # These are the non-hashable features.
+            # We can't use these for quick matching.
+            capa.features.common.Substring: 0,
+            capa.features.common.Regex: 0,
+            capa.features.common.Bytes: 0,
+            # hashable features
+            capa.features.common.Characteristic: 4,
+            capa.features.common.String: 9,
+            capa.features.common.Class: 5,
+            capa.features.common.Namespace: 5,
+            capa.features.insn.API: 8,
+            capa.features.insn.Property: 5,
+            capa.features.insn.Offset: 4,
+            capa.features.insn.Mnemonic: 2,
+            capa.features.insn.OperandOffset: 4,
+            capa.features.basicblock.BasicBlock: 1,
+            capa.features.file.Export: 7,
+            capa.features.file.Import: 5,
+            capa.features.file.Section: 5,
+            capa.features.file.FunctionName: 5,
+        }[C]
+
+    @dataclass
+    class RuleFeatureIndex:  # per-scope index used to pre-filter candidate rules
+        rules_by_feature: Dict[Feature, Set[str]] = dataclasses.field(default_factory=dict)  # feature -> rule names
+        string_rules: Dict[str, List[Feature]] = dataclasses.field(default_factory=dict)  # rule -> Substring/Regex
+        bytes_rules: Dict[str, List[Feature]] = dataclasses.field(default_factory=dict)  # rule name -> Bytes features
 
-        # we'll do a couple phases:
-        #
-        #  1. recursively visit all nodes in all rules,
-        #    a. indexing all features
-        #    b. recording the types of features found per rule
-        #  2. compute the easy and hard rule sets
-        #  3. remove hard rules from the rules-by-feature index
-        #  4. construct the topologically ordered list of hard rules
-        rules_with_easy_features: Set[str] = set()
-        rules_with_hard_features: Set[str] = set()
+    @staticmethod
+    def _index_rules_by_feature(scope: Scope, rules: List[Rule]) -> RuleFeatureIndex:
         rules_by_feature: Dict[Feature, Set[str]] = collections.defaultdict(set)
+        scores_by_rule: Dict[str, int] = {}
 
-        def rec(rule_name: str, node: Union[Feature, Statement]):
+        # note closure over scores_by_rule
+        def rec(
+            rule_name: str, node: Union[Feature, Statement]
+        ) -> Union[None, Tuple[int, Feature], Tuple[int, Set[Feature]]]:
             """
-            walk through a rule's logic tree, indexing the easy and hard rules,
-            and the features referenced by easy rules.
+            Walk through a rule's logic tree, picking the features to use for indexing,
+            returning the feature and an associated score.
+            The higher the score, the more selective the feature is expected to be.
+            The score is only used internally, to pick the best feature from within
+            an AND block.
             """
-            if isinstance(
-                node,
-                (
-                    # these are the "hard features"
-                    # substring: scanning feature
-                    capa.features.common.Substring,
-                    # regex: scanning feature
-                    capa.features.common.Regex,
-                    # bytes: scanning feature
-                    capa.features.common.Bytes,
-                    # match: dependency on another rule,
-                    # which we have to evaluate first,
-                    # and is therefore tricky.
-                    capa.features.common.MatchedRule,
-                ),
-            ):
-                # hard feature: requires scan or match lookup
-                rules_with_hard_features.add(rule_name)
-            elif isinstance(node, capa.features.common.Feature):
-                if capa.features.common.is_global_feature(node):
-                    # we don't want to index global features
-                    # because they're not very selective.
-                    #
-                    # they're global, so if they match at one location in a file,
-                    # they'll match at every location in a file.
-                    # so that's not helpful to decide how to downselect.
-                    #
-                    # and, a global rule will never be the sole selector in a rule.
-                    pass
-                else:
-                    # easy feature: hash lookup
-                    rules_with_easy_features.add(rule_name)
-                    rules_by_feature[node].add(rule_name)
-            elif isinstance(node, (ceng.Not)):
-                # `not:` statements are tricky to deal with.
-                #
-                # first, features found under a `not:` should not be indexed,
-                # because they're not wanted to be found.
-                # second, `not:` can be nested under another `not:`, or two, etc.
-                # third, `not:` at the root or directly under an `or:`
-                # means the rule will match against *anything* not specified there,
-                # which is a difficult set of things to compute and index.
-                #
-                # so, if a rule has a `not:` statement, its hard.
-                # as of writing, this is an uncommon statement, with only 6 instances in 740 rules.
-                rules_with_hard_features.add(rule_name)
+
+            if isinstance(node, (ceng.Not)):
+                # we don't index features within NOT blocks
+                return None
+
             elif isinstance(node, (ceng.Some)) and node.count == 0:
-                # `optional:` and `0 or more:` are tricky to deal with.
-                #
                 # when a subtree is optional, it may match, but not matching
                 # doesn't have any impact either.
                 # now, our rule authors *should* not put this under `or:`
                 # and this is checked by the linter,
-                # but this could still happen (e.g. private rule set without linting)
-                # and would be hard to trace down.
-                #
-                # so better to be safe than sorry and consider this a hard case.
-                rules_with_hard_features.add(rule_name)
+                return None
+
             elif isinstance(node, (ceng.Range)) and node.min == 0:
-                # `count(foo): 0 or more` are tricky to deal with.
-                # because the min is 0,
-                # this subtree *can* match just about any feature
-                # (except the given one)
-                # which is a difficult set of things to compute and index.
-                rules_with_hard_features.add(rule_name)
+                # `count(foo): 0 or more` is just like an optional block,
+                # because the min is 0, this subtree *can* match just about any feature.
+                return None
+
+            elif isinstance(node, capa.features.common.Feature) and capa.features.common.is_global_feature(node):
+                # we don't want to index global features
+                # because they're not very selective.
+                # they also don't stand on their own - there's always some other logic.
+                return None
+
+            elif isinstance(node, capa.features.common.Feature):
+                return (RuleSet._score_feature(scores_by_rule, node), node)
+
             elif isinstance(node, (ceng.Range)):
-                rec(rule_name, node.child)
-            elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)):
+                # feature is found N times
+                return rec(rule_name, node.child)
+
+            elif isinstance(node, ceng.And):
+                scores = []
                 for child in node.children:
-                    rec(rule_name, child)
+                    try:
+                        score = rec(rule_name, child)
+                    except AssertionError as e:
+                        # if one branch isn't possible to index,
+                        # that's ok, we can require a different one to match
+                        logger.warning("and: swallowing: %s: %s", e, rule_name)
+                        continue
+
+                    if not score:
+                        continue
+
+                    scores.append(score)
+
+                # otherwise we can't index this rule
+                assert len(scores) > 0
+
+                def and_score_key(item):
+                    # order by score, then fewest number of features.
+                    # TODO(wb): minimize number of features? play with this.
+                    score, features = item
+
+                    if isinstance(features, set):
+                        return (score, -len(features))
+                    else:
+                        return (score, -1)
+
+                scores.sort(key=and_score_key, reverse=True)
+
+                # pick the best feature
+                return scores[0]
+
+            elif isinstance(node, (ceng.Or, ceng.Some)):
+                min_score = 10000000  # assume this is larger than any score
+                features = set()
+
+                for child in node.children:
+                    item = rec(rule_name, child)
+                    assert item is not None, "can't index OR branch"
+
+                    score, feature = item
+
+                    min_score = min(min_score, score)
+
+                    if isinstance(feature, set):
+                        features.update(feature)
+                    else:
+                        features.add(feature)
+
+                return min_score, features
+
             elif isinstance(node, ceng.Statement):
-                # unhandled type of statement.
-                # this should only happen if a new subtype of `Statement`
+                # Unhandled type of statement.
+                # This should only happen if a new subtype of `Statement`
                 # has since been added to capa.
                 #
-                # ideally, we'd like to use mypy for exhaustiveness checking
+                # Ideally, we'd like to use mypy for exhaustiveness checking
                 # for all the subtypes of `Statement`.
-                # but, as far as i can tell, mypy does not support this type
+                # But, as far as I can tell, mypy does not support this type
                 # of checking.
                 #
-                # in a way, this makes some intuitive sense:
+                # In a way, this makes some intuitive sense:
                 # the set of subtypes of type A is unbounded,
                 # because any user might come along and create a new subtype B,
                 # so mypy can't reason about this set of types.
@@ -1527,28 +1612,71 @@ def rec(rule_name: str, node: Union[Feature, Statement]):
                 # programming error
                 assert_never(node)
 
+        string_rules: Dict[str, List[Feature]] = {}
+        bytes_rules: Dict[str, List[Feature]] = {}
+
         for rule in rules:
             rule_name = rule.meta["name"]
+
             root = rule.statement
-            rec(rule_name, root)
+            try:
+                item = rec(rule_name, root)
+            except AssertionError as e:
+                logger.warning("fail: %s: %s", e, rule_name)
+                continue
 
-        # if a rule has a hard feature,
-        # don't consider it easy, and therefore,
-        # don't index any of its features.
-        #
-        # otherwise, its an easy rule, and index its features
-        for rules_with_feature in rules_by_feature.values():
-            rules_with_feature.difference_update(rules_with_hard_features)
-        easy_rules_by_feature = rules_by_feature
-
-        # `rules` is already topologically ordered,
-        # so extract our hard set into the topological ordering.
-        hard_rules = []
-        for rule in rules:
-            if rule.meta["name"] in rules_with_hard_features:
-                hard_rules.append(rule.meta["name"])
+            if item is None:
+                logger.warning("fail: can't index rule: %s", rule_name)
+                continue
+            assert item is not None, "can't index rule"
+
+            score, feature = item
+
+            if isinstance(feature, set):
+                features = feature
+            else:
+                features = {feature}
+
+            string_features = [
+                feature
+                for feature in features
+                if isinstance(feature, (capa.features.common.Substring, capa.features.common.Regex))
+            ]
+            bytes_features = [feature for feature in features if isinstance(feature, capa.features.common.Bytes)]
+            hashable_features = [
+                feature
+                for feature in features
+                if not isinstance(
+                    feature, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes)
+                )
+            ]
+
+            logger.debug("indexing: features: %d, score: %d, rule: %s", len(features), score, rule_name)
+            scores_by_rule[rule_name] = score
+            for feature in features:
+                logger.debug("        : [%d] %s", RuleSet._score_feature(scores_by_rule, feature), feature)
+
+            if string_features:
+                string_rules[rule_name] = string_features
+
+            if bytes_features:
+                bytes_rules[rule_name] = bytes_features
+
+            for feature in hashable_features:
+                rules_by_feature[feature].add(rule_name)
+
+        logger.debug("indexing: %d features indexed for scope %s", len(rules_by_feature), scope)
+        logger.debug("indexing: %d indexed features are shared by more than 3 rules",
+                     len([feature for feature, rules in rules_by_feature.items() if len(rules) > 3]))
+        logger.debug("indexing: %d scanning string features, %d scanning bytes features",
+                     len(string_rules), len(bytes_rules))
 
-        return (easy_rules_by_feature, hard_rules)
+        # TODO(wb): remember, when evaluating candidates, make sure
+        # to do it in topological order, so match statements work.
+
+        # TODO(wb): remember, as rule matches are found,
+        # the candidates must be extended again, to account for match statements.
+        return RuleSet.RuleFeatureIndex(rules_by_feature, string_rules, bytes_rules)
 
     @staticmethod
     def _get_rules_for_scope(rules, scope) -> List[Rule]:
@@ -1625,74 +1753,92 @@ def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Feat
         this routine should act just like `capa.engine.match`,
         except that it may be more performant.
         """
-        easy_rules_by_feature = {}
-        if scope == Scope.FILE:
-            easy_rules_by_feature = self._easy_file_rules_by_feature
-            hard_rule_names = self._hard_file_rules
-        elif scope == Scope.PROCESS:
-            easy_rules_by_feature = self._easy_process_rules_by_feature
-            hard_rule_names = self._hard_process_rules
-        elif scope == Scope.THREAD:
-            easy_rules_by_feature = self._easy_thread_rules_by_feature
-            hard_rule_names = self._hard_thread_rules
-        elif scope == Scope.CALL:
-            easy_rules_by_feature = self._easy_call_rules_by_feature
-            hard_rule_names = self._hard_call_rules
-        elif scope == Scope.FUNCTION:
-            easy_rules_by_feature = self._easy_function_rules_by_feature
-            hard_rule_names = self._hard_function_rules
-        elif scope == Scope.BASIC_BLOCK:
-            easy_rules_by_feature = self._easy_basic_block_rules_by_feature
-            hard_rule_names = self._hard_basic_block_rules
-        elif scope == Scope.INSTRUCTION:
-            easy_rules_by_feature = self._easy_instruction_rules_by_feature
-            hard_rule_names = self._hard_instruction_rules
-        else:
-            assert_never(scope)
+
+        feature_index = self._feature_indexes_by_scopes[scope]
+        rules = self.rules_by_scope[scope]
+        rules_by_name = {rule.name: rule for rule in rules}
+        # topologic location of rule given its name
+        rule_index_by_rule_name = {rule.name: i for i, rule in enumerate(rules)}
+
+        def resort_rules_topologically(rules: List[Rule]):
+            # note closure over `rule_index_by_rule_name`
+            rules.sort(key=lambda r: rule_index_by_rule_name[r.name])
 
         candidate_rule_names = set()
         for feature in features:
-            easy_rule_names = easy_rules_by_feature.get(feature)
-            if easy_rule_names:
-                candidate_rule_names.update(easy_rule_names)
+            candidate_rule_names.update(feature_index.rules_by_feature.get(feature, ()))
+
+        if feature_index.string_rules:
+            string_features = {}
+            for feature, locations in features.items():
+                if isinstance(feature, capa.features.common.String):
+                    string_features[feature] = locations
+
+            if string_features:
+                for rule_name, wanted_strings in feature_index.string_rules.items():
+                    for wanted_string in wanted_strings:
+                        if wanted_string.evaluate(string_features):
+                            candidate_rule_names.add(rule_name)
+
+        if feature_index.bytes_rules:
+            bytes_features = {}
+            for feature, locations in features.items():
+                if isinstance(feature, capa.features.common.Bytes):
+                    bytes_features[feature] = locations
+
+            if bytes_features:
+                for rule_name, wanted_bytess in feature_index.bytes_rules.items():
+                    for wanted_bytes in wanted_bytess:
+                        if wanted_bytes.evaluate(bytes_features):
+                            candidate_rule_names.add(rule_name)
+
+        # logger.debug("perf: match: %s: %s: %d features, %d candidate rules",
+        #              scope, addr, len(features), len(candidate_rule_names))
 
         # first, match against the set of rules that have at least one
         # feature shared with our feature set.
         candidate_rules = [self.rules[name] for name in candidate_rule_names]
-        features2, easy_matches = ceng.match(candidate_rules, features, addr)
+        resort_rules_topologically(candidate_rules)
 
-        # note that we've stored the updated feature set in `features2`.
-        # this contains a superset of the features in `features`;
-        # it contains additional features for any easy rule matches.
-        # we'll pass this feature set to hard rule matching, since one
-        # of those rules might rely on an easy rule match.
         #
-        # the updated feature set from hard matching will go into `features3`.
-        # this is a superset of `features2` is a superset of `features`.
-        # ultimately, this is what we'll return to the caller.
+        # The following is derived from ceng.match
+        # extended to interact with candidate_rules upon rule match.
         #
-        # in each case, we could have assigned the updated feature set back to `features`,
-        # but this is slightly more explicit how we're tracking the data.
-
-        # now, match against (topologically ordered) list of rules
-        # that we can't really make any guesses about.
-        # these are rules with hard features, like substring/regex/bytes and match statements.
-        hard_rules = [self.rules[name] for name in hard_rule_names]
-        features3, hard_matches = ceng.match(hard_rules, features2, addr)
-
-        # note that above, we probably are skipping matching a bunch of
-        # rules that definitely would never hit.
-        # specifically, "easy rules" that don't share any features with
-        # feature set.
-
-        # MatchResults doesn't technically have an .update() method
-        # but a dict does.
-        matches = {}  # type: ignore
-        matches.update(easy_matches)
-        matches.update(hard_matches)
-
-        return (features3, matches)
+        results: MatchResults = collections.defaultdict(list)
 
+        # copy features so that we can modify it
+        # without affecting the caller (keep this function pure)
+        #
+        # note: copy doesn't notice this is a defaultdict, so we'll recreate that manually.
+        features = collections.defaultdict(set, copy.copy(features))
+
+        while candidate_rules:
+            rule = candidate_rules.pop(0)
+            res = rule.evaluate(features, short_circuit=True)
+            if res:
+                # we first matched the rule with short circuiting enabled.
+                # this is much faster than without short circuiting.
+                # however, we want to collect all results thoroughly,
+                # so once we've found a match quickly,
+                # go back and capture results without short circuiting.
+                res = rule.evaluate(features, short_circuit=False)
+
+                # sanity check
+                assert bool(res) is True
+
+                results[rule.name].append((addr, res))
+                # we need to update the current `features`
+                # because subsequent iterations of this loop may use newly added features,
+                # such as rule or namespace matches.
+                ceng.index_rule_matches(features, rule, [addr])
+
+                new_candidates = feature_index.rules_by_feature.get(capa.features.common.MatchedRule(rule.name), ())
+                if new_candidates:
+                    candidate_rules.extend([self.rules[rule_name] for rule_name in new_candidates])
+                    resort_rules_topologically(candidate_rules)
+
+        return (features, results)
+        
 
 def is_nursery_rule_path(path: Path) -> bool:
     """

From 2d9c82fb17b6fb19087deb8e49c23c64ef58cccf Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Wed, 22 May 2024 10:15:04 +0200
Subject: [PATCH 04/28] lints

---
 capa/features/extractors/binexport2/insn.py |  5 ++---
 capa/rules/__init__.py                      | 16 +++++++++-------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py
index 1a6a99b21..e77669b7d 100644
--- a/capa/features/extractors/binexport2/insn.py
+++ b/capa/features/extractors/binexport2/insn.py
@@ -14,7 +14,7 @@
 from capa.features.insn import API, Number, Mnemonic, OperandNumber
 from capa.features.common import Bytes, String, Feature, Characteristic
 from capa.features.address import Address, AbsoluteVirtualAddress
-from capa.features.extractors.binexport2 import AnalysisContext, FunctionContext, ReadMemoryError, InstructionContext
+from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext
 from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
 from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2
 
@@ -31,8 +31,7 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle
     insn = be2.instruction[ii.instruction_index]
 
     for addr in insn.call_target:
-        if addr in be2_analysis.thunks:
-            addr = be2_analysis.thunks[addr]
+        addr = be2_analysis.thunks.get(addr, addr)
 
         if addr not in be2_index.vertex_index_by_address:
             # disassembler did not define function at address
diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 43dcb7665..f525e1ab5 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1666,10 +1666,13 @@ def and_score_key(item):
                 rules_by_feature[feature].add(rule_name)
 
         logger.debug("indexing: %d features indexed for scope %s", len(rules_by_feature), scope)
-        logger.debug("indexing: %d indexed features are shared by more than 3 rules",
-                     len([feature for feature, rules in rules_by_feature.items() if len(rules) > 3]))
-        logger.debug("indexing: %d scanning string features, %d scanning bytes features",
-                     len(string_rules), len(bytes_rules))
+        logger.debug(
+            "indexing: %d indexed features are shared by more than 3 rules",
+            len([feature for feature, rules in rules_by_feature.items() if len(rules) > 3]),
+        )
+        logger.debug(
+            "indexing: %d scanning string features, %d scanning bytes features", len(string_rules), len(bytes_rules)
+        )
 
         # TODO(wb): remember, when evaluating candidates, make sure
         # to do it in topological order, so match statements work.
@@ -1756,7 +1759,6 @@ def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Feat
 
         feature_index = self._feature_indexes_by_scopes[scope]
         rules = self.rules_by_scope[scope]
-        rules_by_name = {rule.name: rule for rule in rules}
         # topologic location of rule given its name
         rule_index_by_rule_name = {rule.name: i for i, rule in enumerate(rules)}
 
@@ -1804,7 +1806,7 @@ def resort_rules_topologically(rules: List[Rule]):
         # The following is derived from ceng.match
         # extended to interact with candidate_rules upon rule match.
         #
-        results: MatchResults = collections.defaultdict(list)
+        results: ceng.MatchResults = collections.defaultdict(list)
 
         # copy features so that we can modify it
         # without affecting the caller (keep this function pure)
@@ -1838,7 +1840,7 @@ def resort_rules_topologically(rules: List[Rule]):
                     resort_rules_topologically(candidate_rules)
 
         return (features, results)
-        
+
 
 def is_nursery_rule_path(path: Path) -> bool:
     """

From 0dc0c515dca1d6bc00f9a2a6176b1dfd764b1d91 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Wed, 22 May 2024 11:25:00 +0200
Subject: [PATCH 05/28] rules: add documentation for optimized match routine

---
 capa/rules/__init__.py | 251 ++++++++++++++++++++++++++---------------
 1 file changed, 162 insertions(+), 89 deletions(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index f525e1ab5..9722ab970 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1423,69 +1423,69 @@ def __getitem__(self, rulename):
     def __contains__(self, rulename):
         return rulename in self.rules
 
-    @staticmethod
-    def _score_number(v: int) -> int:
-        if -0x8000 <= v <= 0x8000:
-            return 3
-
-        if 0xFFFF_FF00 <= v <= 0xFFFF_FFFF:
-            return 3
-
-        return 7
-
+    # unstable
     @staticmethod
     def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Feature) -> int:
-        if isinstance(
+        if capa.features.common.is_global_feature(node):
+            # We don't want to index global features
+            # because they're not very selective.
+            # They also don't stand on their own - there's always some other logic.
+            raise ValueError("don't index global features")
+
+        elif isinstance(
             node,
             capa.features.common.MatchedRule,
         ):
-            if node.value in scores_by_rule:
-                # other rule must match before this one, in same scope.
-                # use score from that rule, which will have already been processed.
-                return scores_by_rule[node.value]
-            else:
-                # scores_by_rule only contains rules for the current scope
-                # so the requested rule must be from a smaller scope
-                # and we can assume the feature will exist.
-                #
-                # We don't know what the score should be, unfortunately.
-                # Could try to thread that through. Use "5" in the meantime.
-                return 5
-
-        elif capa.features.common.is_global_feature(node):
-            # we don't want to index global features
-            # because they're not very selective.
-            # they also don't stand on their own - there's always some other logic.
-            return 0
+            # If present, other rule must match before this one, in same scope.
+            # Use score from that rule, which will have already been processed due to topological sorting.
+            # Otherwise, use a default score of 5.
+            return scores_by_rule.get(node.value, 5)
 
         elif isinstance(node, (capa.features.insn.Number, capa.features.insn.OperandNumber)):
             v = node.value
             assert isinstance(v, int)
-            return RuleSet._score_number(v)
+
+            if -0x8000 <= v <= 0x8000:
+                # Small numbers are probably pretty common, like structure offsets, etc.
+                return 3
+
+            if 0xFFFF_FF00 <= v <= 0xFFFF_FFFF:
+                # Numbers close to u32::max_int are also probably pretty common,
+                # like signed numbers close to 0 that are stored as unsigned ints.
+                return 3
+
+            if 0xFFFF_FFFF_FFFF_FF00 <= v <= 0xFFFF_FFFF_FFFF_FFFF:
+                return 3
+
+            # Other numbers are assumed to be uncommon.
+            return 7
+
+        elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes)):
+            # Scanning features (non-hashable), which we can't use for quick matching/filtering.
+            return 0
 
         C = node.__class__
         return {
-            # Scanning features (non-hashable)
-            # These are the non-hashable features.
-            # We can't use these for quick matching.
-            capa.features.common.Substring: 0,
-            capa.features.common.Regex: 0,
-            capa.features.common.Bytes: 0,
-            # hashable features
-            capa.features.common.Characteristic: 4,
             capa.features.common.String: 9,
+            capa.features.insn.API: 8,
+            capa.features.file.Export: 7,
+            # "uncommon numbers": 7
             capa.features.common.Class: 5,
             capa.features.common.Namespace: 5,
-            capa.features.insn.API: 8,
             capa.features.insn.Property: 5,
-            capa.features.insn.Offset: 4,
-            capa.features.insn.Mnemonic: 2,
-            capa.features.insn.OperandOffset: 4,
-            capa.features.basicblock.BasicBlock: 1,
-            capa.features.file.Export: 7,
             capa.features.file.Import: 5,
             capa.features.file.Section: 5,
             capa.features.file.FunctionName: 5,
+            # default MatchedRule: 5
+            capa.features.common.Characteristic: 4,
+            capa.features.insn.Offset: 4,
+            capa.features.insn.OperandOffset: 4,
+            # "common numbers": 3
+            capa.features.insn.Mnemonic: 2,
+            capa.features.basicblock.BasicBlock: 1,
+            # substring: 0
+            # regex: 0
+            # bytes: 0
         }[C]
 
     @dataclass
@@ -1494,6 +1494,7 @@ class RuleFeatureIndex:
         string_rules: Dict[str, List[Feature]] = dataclasses.field(default=dict)
         bytes_rules: Dict[str, List[Feature]] = dataclasses.field(default=dict)
 
+    # unstable
     @staticmethod
     def _index_rules_by_feature(scope: Scope, rules: List[Rule]) -> RuleFeatureIndex:
         rules_by_feature: Dict[Feature, Set[str]] = collections.defaultdict(set)
@@ -1619,16 +1620,8 @@ def and_score_key(item):
             rule_name = rule.meta["name"]
 
             root = rule.statement
-            try:
-                item = rec(rule_name, root)
-            except AssertionError as e:
-                logger.warning("fail: %s: %s", e, rule_name)
-                continue
-
-            if item is None:
-                logger.warning("fail: can't index rule: %s", rule_name)
-                continue
-            assert item is not None, "can't index rule"
+            item = rec(rule_name, root)
+            assert item is not None
 
             score, feature = item
 
@@ -1674,11 +1667,6 @@ def and_score_key(item):
             "indexing: %d scanning string features, %d scanning bytes features", len(string_rules), len(bytes_rules)
         )
 
-        # TODO(wb): remember, when evaluating candidates, make sure
-        # to do it in topological order, so match statements work.
-
-        # TODO(wb): remember, as rule matches are found,
-        # the candidates must be extended again, to account for match statements.
         return RuleSet.RuleFeatureIndex(rules_by_feature, string_rules, bytes_rules)
 
     @staticmethod
@@ -1749,29 +1737,89 @@ def filter_rules_by_meta(self, tag: str) -> "RuleSet":
                             break
         return RuleSet(list(rules_filtered))
 
+    # unstable
+    @staticmethod
+    def _sort_rules_by_index(rule_index_by_rule_name: Dict[str, int], rules: List[Rule]):
+        """
+        Sort (in place) the given rules by their index provided by the given Dict.
+        This mapping is intended to represent the topologic index of the given rule;
+         that is, rules with a lower index should be evaluated first, since their dependents
+         (the rules that depend on them) will be evaluated later.
+        """
+        rules.sort(key=lambda r: rule_index_by_rule_name[r.name])
+
     def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[FeatureSet, ceng.MatchResults]:
         """
-        match rules from this ruleset at the given scope against the given features.
+        Match rules from this ruleset at the given scope against the given features.
 
-        this routine should act just like `capa.engine.match`,
-        except that it may be more performant.
+        This routine should act just like `capa.engine.match`, except that it may be more performant.
+        It uses its knowledge of all the rules to evaluate a minimal set of candidate rules for the given features.
         """
 
-        feature_index = self._feature_indexes_by_scopes[scope]
-        rules = self.rules_by_scope[scope]
-        # topologic location of rule given its name
+        feature_index: RuleSet.RuleFeatureIndex = self._feature_indexes_by_scopes[scope]
+        rules: List[Rule] = self.rules_by_scope[scope]
+        # Topologic location of rule given its name.
+        # That is, rules with a lower index should be evaluated first, since their dependents
+        # (the rules that depend on them) will be evaluated later.
         rule_index_by_rule_name = {rule.name: i for i, rule in enumerate(rules)}
 
-        def resort_rules_topologically(rules: List[Rule]):
-            # note closure over `rule_index_by_rule_name`
-            rules.sort(key=lambda r: rule_index_by_rule_name[r.name])
-
-        candidate_rule_names = set()
+        # This algorithm is optimized to evaluate as few rules as possible,
+        # because the less work we do, the faster capa can run.
+        #
+        # It relies on the observation that most rules don't match,
+        # and that most rules have an uncommon feature that *must* be present for the rule to match.
+        #
+        # Therefore, we record which uncommon feature(s) is required for each rule to match,
+        # and then only inspect these few candidates when a feature is seen in some scope.
+        # Ultimately, the exact same rules are matched with precisely the same results,
+        # it's just done faster, because we ignore most of the rules that never would have matched anyways.
+        #
+        # In `_index_rules_by_feature`, we do the hard work of computing the minimal set of
+        # uncommon features for each rule. While it's a little expensive, it's a single pass
+        # that gets reused at every scope instance (read: thousands or millions of times).
+        #
+        # In the current routine, we collect all the rules that might match, given the presence
+        # of any uncommon feature. We sort the rules topologically, so that rule dependencies work out,
+        # and then we evaluate the candidate rules. In practice, this saves 20-50x the work!
+        #
+        # Recall that some features cannot be matched quickly via hash lookup: Regex, Bytes, etc.
+        # When these features are the uncommon features used to filter rules, we have to evaluate the
+        # feature frequently whenever a string/bytes feature is encountered. It's slow, but we can't
+        # get around it. Reducing our reliance on regex/bytes features and/or finding a way to
+        # index these can further improve performance.
+
+        # Find all the rules that could match the given feature set.
+        # Ideally we want this set to be as small and focused as possible,
+        # and we can tune it by tweaking `_index_rules_by_feature`.
+        candidate_rule_names: Set[str] = set()
         for feature in features:
             candidate_rule_names.update(feature_index.rules_by_feature.get(feature, ()))
 
+        # Some rules rely totally on regex features, like the HTTP User-Agent rules.
+        # In these cases, when we encounter any string feature, we have to scan those
+        # regexes to find the candidate rules.
+        # As mentioned above, this is not good for performance, but it's required for correctness.
+        #
+        # We may want to try to pre-evaluate these strings, based on their presence in the file,
+        # to reduce the number of evaluations we do here.
+        # See: https://github.com/mandiant/capa/issues/2063#issuecomment-2095639672
+        #
+        # We may also want to specialize case-insensitive strings, which would enable them to
+        # be indexed, and therefore skip the scanning here, improving performance.
+        # This strategy is described here:
+        # https://github.com/mandiant/capa/issues/2063#issuecomment-2107083068
         if feature_index.string_rules:
-            string_features = {}
+
+            # This is a FeatureSet that contains only String features.
+            # Since we'll only be evaluating String/Regex features below, we don't care about
+            # other sorts of features (Mnemonic, Number, etc.) and therefore can save some time
+            # during evaluation.
+            #
+            # Specifically, we can address the issue described here:
+            # https://github.com/mandiant/capa/issues/2063#issuecomment-2095397884
+            # That we spend a lot of time collecting String instances within `Regex.evaluate`.
+            # We don't have to address that issue further as long as we pre-filter the features here.
+            string_features: FeatureSet = {}
             for feature, locations in features.items():
                 if isinstance(feature, capa.features.common.String):
                     string_features[feature] = locations
@@ -1782,8 +1830,14 @@ def resort_rules_topologically(rules: List[Rule]):
                         if wanted_string.evaluate(string_features):
                             candidate_rule_names.add(rule_name)
 
+        # Like with String/Regex features above, we have to scan for Bytes to find candidate rules.
+        #
+        # We may want to index bytes when they have a common length, like 16 or 32.
+        # This would help us avoid the scanning here, which would improve performance.
+        # The strategy is described here:
+        # https://github.com/mandiant/capa/issues/2063#issuecomment-2107052190
         if feature_index.bytes_rules:
-            bytes_features = {}
+            bytes_features: FeatureSet = {}
             for feature, locations in features.items():
                 if isinstance(feature, capa.features.common.Bytes):
                     bytes_features[feature] = locations
@@ -1794,52 +1848,71 @@ def resort_rules_topologically(rules: List[Rule]):
                         if wanted_bytes.evaluate(bytes_features):
                             candidate_rule_names.add(rule_name)
 
-        # logger.debug("perf: match: %s: %s: %d features, %d candidate rules",
-        #              scope, addr, len(features), len(candidate_rule_names))
+        # trace
+        logger.debug(
+            "perf: match: %s: %s: %d features, %d candidate rules",
+            scope,
+            addr,
+            len(features),
+            len(candidate_rule_names),
+        )
 
-        # first, match against the set of rules that have at least one
-        # feature shared with our feature set.
+        # No rules can possibly match, so quickly return.
+        if not candidate_rule_names:
+            return (features, {})
+
+        # Here are the candidate rules (before we just had their names).
         candidate_rules = [self.rules[name] for name in candidate_rule_names]
-        resort_rules_topologically(candidate_rules)
+
+        # Order rules topologically, so that rules with dependencies work correctly.
+        RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules)
 
         #
         # The following is derived from ceng.match
         # extended to interact with candidate_rules upon rule match.
         #
+
         results: ceng.MatchResults = collections.defaultdict(list)
 
-        # copy features so that we can modify it
-        # without affecting the caller (keep this function pure)
-        #
-        # note: copy doesn't notice this is a defaultdict, so we'll recreate that manually.
-        features = collections.defaultdict(set, copy.copy(features))
+        # If we match a rule, then we'll add a MatchedRule to the features that will be returned,
+        # but we want to do that in a copy. We'll lazily create the copy below, once a match has
+        # actually been found.
+        augmented_features = features
 
         while candidate_rules:
             rule = candidate_rules.pop(0)
-            res = rule.evaluate(features, short_circuit=True)
+            res = rule.evaluate(augmented_features, short_circuit=True)
             if res:
                 # we first matched the rule with short circuiting enabled.
                 # this is much faster than without short circuiting.
                 # however, we want to collect all results thoroughly,
                 # so once we've found a match quickly,
                 # go back and capture results without short circuiting.
-                res = rule.evaluate(features, short_circuit=False)
+                res = rule.evaluate(augmented_features, short_circuit=False)
 
                 # sanity check
                 assert bool(res) is True
 
                 results[rule.name].append((addr, res))
-                # we need to update the current `features`
-                # because subsequent iterations of this loop may use newly added features,
+                # We need to update the current features because subsequent iterations may use newly added features,
                 # such as rule or namespace matches.
-                ceng.index_rule_matches(features, rule, [addr])
+                if augmented_features is features:
+                    # lazily create the copy of features only when a rule matches, since it could be expensive.
+                    augmented_features = collections.defaultdict(set, copy.copy(features))
+
+                ceng.index_rule_matches(augmented_features, rule, [addr])
 
+                # It's possible that we're relying on a MatchedRule feature to be the
+                # uncommon feature used to filter other rules. So, extend the candidate
+                # rules with any of these dependencies. If we find any, also ensure they're
+                # evaluated in the correct topologic order, so that further dependencies work.
                 new_candidates = feature_index.rules_by_feature.get(capa.features.common.MatchedRule(rule.name), ())
                 if new_candidates:
+                    candidate_rule_names.update(new_candidates)
                     candidate_rules.extend([self.rules[rule_name] for rule_name in new_candidates])
-                    resort_rules_topologically(candidate_rules)
+                    RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules)
 
-        return (features, results)
+        return (augmented_features, results)
 
 
 def is_nursery_rule_path(path: Path) -> bool:

From f86a60c85e9c7264acb68f8ea639565428a3deb3 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Wed, 22 May 2024 14:40:47 +0200
Subject: [PATCH 06/28] bytes: log length of bytes evaluations

---
 capa/features/common.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/capa/features/common.py b/capa/features/common.py
index b817fafb9..5f0ca7941 100644
--- a/capa/features/common.py
+++ b/capa/features/common.py
@@ -387,6 +387,7 @@ def __init__(self, value: bytes, description=None):
     def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
         capa.perf.counters["evaluate.feature"] += 1
         capa.perf.counters["evaluate.feature.bytes"] += 1
+        capa.perf.counters["evaluate.feature.bytes." + str(len(self.value))] += 1
 
         assert isinstance(self.value, bytes)
         for feature, locations in features.items():

From 6e50f4817e36af7a0adf8f5ad8561115f005c787 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Wed, 22 May 2024 15:20:19 +0200
Subject: [PATCH 07/28] ruleset: document optimized match behavior

---
 capa/rules/__init__.py | 221 ++++++++++++++++++++++++++---------------
 1 file changed, 142 insertions(+), 79 deletions(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 9722ab970..00e99fa03 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1368,13 +1368,13 @@ def __init__(
         rules = capa.optimizer.optimize_rules(rules)
 
         scopes = (
-            Scope.FILE,
-            Scope.PROCESS,
-            Scope.THREAD,
             Scope.CALL,
-            Scope.FUNCTION,
-            Scope.BASIC_BLOCK,
+            Scope.THREAD,
+            Scope.PROCESS,
             Scope.INSTRUCTION,
+            Scope.BASIC_BLOCK,
+            Scope.FUNCTION,
+            Scope.FILE,
         )
 
         self.rules = {rule.name: rule for rule in rules}
@@ -1382,8 +1382,9 @@ def __init__(
         self.rules_by_scope = {scope: self._get_rules_for_scope(rules, scope) for scope in scopes}
 
         # unstable
+        scores_by_rule = {}
         self._feature_indexes_by_scopes = {
-            scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope]) for scope in scopes
+            scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope], scores_by_rule) for scope in scopes
         }
 
     @property
@@ -1426,13 +1427,17 @@ def __contains__(self, rulename):
     # unstable
     @staticmethod
     def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Feature) -> int:
-        if capa.features.common.is_global_feature(node):
-            # We don't want to index global features
-            # because they're not very selective.
-            # They also don't stand on their own - there's always some other logic.
-            raise ValueError("don't index global features")
+        """
+        Score the given feature by how "uncommon" it is, where a higher score is more uncommon.
+        Features that are not good for indexing will have a low score, or 0.
+        """
+
+        #
+        # Today, these scores are manually assigned by intuition/experience/guesswork.
+        # We could do a large-scale feature collection and use the results to assign scores.
+        #
 
-        elif isinstance(
+        if isinstance(
             node,
             capa.features.common.MatchedRule,
         ):
@@ -1455,6 +1460,7 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
                 return 3
 
             if 0xFFFF_FFFF_FFFF_FF00 <= v <= 0xFFFF_FFFF_FFFF_FFFF:
+                # Like signed numbers close to 0 that are stored as unsigned long ints.
                 return 3
 
             # Other numbers are assumed to be uncommon.
@@ -1466,10 +1472,20 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
 
         C = node.__class__
         return {
+            #
+            # Very uncommon features that are probably very selective in capa's domain.
+            # When possible, we want rules to be indexed by these features.
+            #
             capa.features.common.String: 9,
             capa.features.insn.API: 8,
             capa.features.file.Export: 7,
             # "uncommon numbers": 7
+            #
+            # -----------------------------------------------------------------
+            #
+            # Features that are probably somewhat common, and/or rarely used within capa.
+            # It's ok to index rules by these.
+            #
             capa.features.common.Class: 5,
             capa.features.common.Namespace: 5,
             capa.features.insn.Property: 5,
@@ -1477,12 +1493,34 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
             capa.features.file.Section: 5,
             capa.features.file.FunctionName: 5,
             # default MatchedRule: 5
+            #
+            # -----------------------------------------------------------------
+            #
+            # Features that are pretty common and we'd prefer not to index, but can if we have to.
+            #
             capa.features.common.Characteristic: 4,
             capa.features.insn.Offset: 4,
             capa.features.insn.OperandOffset: 4,
             # "common numbers": 3
+            #
+            # -----------------------------------------------------------------
+            #
+            # Very common features, which we'd only prefer to non-hashable features, like Regex/Substring/Bytes.
+            #
             capa.features.insn.Mnemonic: 2,
             capa.features.basicblock.BasicBlock: 1,
+            #
+            #
+            # We don't *want* to index global features because they're not very selective.
+            # They also don't usually stand on their own - there's always some other logic.
+            #
+            capa.features.common.OS: 0,
+            capa.features.common.Arch: 0,
+            capa.features.common.Format: 0,
+            # -----------------------------------------------------------------
+            #
+            # Non-hashable features, which will require a scan to evaluate, and are therefore quite expensive.
+            #
             # substring: 0
             # regex: 0
             # bytes: 0
@@ -1490,36 +1528,52 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
 
     @dataclass
     class RuleFeatureIndex:
+        # Mapping from hashable feature to the set of names of rules that might have this feature.
         rules_by_feature: Dict[Feature, Set[str]] = dataclasses.field(default=dict)
+        # Mapping from rule name to list of Regex/Substring features that have to match.
+        # All these features will be evaluated whenever a String feature is encountered.
         string_rules: Dict[str, List[Feature]] = dataclasses.field(default=dict)
+        # Mapping from rule name to list of Bytes features that have to match.
+        # All these features will be evaluated whenever a Bytes feature is encountered.
         bytes_rules: Dict[str, List[Feature]] = dataclasses.field(default=dict)
 
     # unstable
     @staticmethod
-    def _index_rules_by_feature(scope: Scope, rules: List[Rule]) -> RuleFeatureIndex:
+    def _index_rules_by_feature(scope: Scope, rules: List[Rule], scores_by_rule: Dict[str, int]) -> RuleFeatureIndex:
+        """
+        Index the given rules by their minimal set of most "uncommon" features required to match.
+
+        If absolutely necessary, provide the Regex/Substring/Bytes features 
+        (which are not hashable and require a scan) that have to match, too.
+        """
+
         rules_by_feature: Dict[Feature, Set[str]] = collections.defaultdict(set)
-        scores_by_rule: Dict[str, int] = {}
 
-        # note closure over scores_by_rule
         def rec(
-            rule_name: str, node: Union[Feature, Statement]
-        ) -> Union[None, Tuple[int, Feature], Tuple[int, Set[Feature]]]:
+            rule_name: str, node: Union[Feature, Statement],
+            # closure over: scores_by_rule
+        ) -> Optional[Tuple[int, Set[Feature]]]:
             """
             Walk through a rule's logic tree, picking the features to use for indexing,
             returning the feature and an associated score.
             The higher the score, the more selective the feature is expected to be.
-            The score is only used internally, to pick the best fetaure from within
-             and AND block.
+            The score is only used internally, to pick the best feature from within AND blocks.
+
+            Note closure over `scores_by_rule`.
             """
 
             if isinstance(node, (ceng.Not)):
-                # we don't index features within NOT blocks
+                # We don't index features within NOT blocks, because we're only looking for
+                # features that should be present.
+                #
+                # Technically we could have a rule that does `not: not: foo` and we'd want to
+                # index `foo`. But this is not seen today.
                 return None
 
             elif isinstance(node, (ceng.Some)) and node.count == 0:
-                # when a subtree is optional, it may match, but not matching
+                # When a subtree is optional, it may match, but not matching
                 # doesn't have any impact either.
-                # now, our rule authors *should* not put this under `or:`
+                # Now, our rule authors *should* not put this under `or:`
                 # and this is checked by the linter,
                 return None
 
@@ -1528,31 +1582,45 @@ def rec(
                 # because the min is 0, this subtree *can* match just about any feature.
                 return None
 
-            elif isinstance(node, capa.features.common.Feature) and capa.features.common.is_global_feature(node):
-                # we don't want to index global features
-                # because they're not very selective.
-                # they also don't stand on their own - there's always some other logic.
-                return None
-
             elif isinstance(node, capa.features.common.Feature):
-                return (RuleSet._score_feature(scores_by_rule, node), node)
+                return (RuleSet._score_feature(scores_by_rule, node), {node})
 
             elif isinstance(node, (ceng.Range)):
                 # feature is found N times
                 return rec(rule_name, node.child)
 
             elif isinstance(node, ceng.And):
-                scores = []
+                # When evaluating an AND block, all of the children need to match.
+                #
+                # So when we index rules, we want to pick the most uncommon feature(s)
+                # for each AND block. If the AND block matches, that feature must be there.
+                # We recursively explore children, computing their
+                # score, and pick the child with the greatest score.
+                #
+                # For example, given the rule:
+                #
+                #     and:
+                #       - mnemonic: mov
+                #       - api: CreateFile
+                #
+                # we prefer to pick `api: CreateFile` because we expect it to be more uncommon.
+                #
+                # Note that the children nodes might be complex, like:
+                #
+                #     and:
+                #       - mnemonic: mov
+                #       - or:
+                #         - api: CreateFile
+                #         - api: DeleteFile
+                #
+                # In this case, we prefer to pick the pair of API features since each is expected
+                # to be more uncommon than the mnemonic.
+                scores: List[Tuple[int, Set[Feature]]] = []
                 for child in node.children:
-                    try:
-                        score = rec(rule_name, child)
-                    except AssertionError as e:
-                        # if one branch isn't possible to index,
-                        # thats ok, we can require a different one to match
-                        logger.warning("and: swallowing: %s: %s", e, rule_name)
-                        continue
+                    score = rec(rule_name, child)
 
                     if not score:
+                        # maybe an optional block or similar
                         continue
 
                     scores.append(score)
@@ -1562,13 +1630,8 @@ def rec(
 
                 def and_score_key(item):
                     # order by score, then fewest number of features.
-                    # TODO(wb): minimize number of features? play with this.
                     score, features = item
-
-                    if isinstance(features, set):
-                        return (score, -len(features))
-                    else:
-                        return (score, -1)
+                    return (score, -len(features))
 
                 scores.sort(key=and_score_key, reverse=True)
 
@@ -1576,6 +1639,30 @@ def and_score_key(item):
                 return scores[0]
 
             elif isinstance(node, (ceng.Or, ceng.Some)):
+                # When evaluating an OR block, any of the children need to match.
+                # It could be any of them, so we can't decide to only index some of them.
+                #
+                # For example, given the rule:
+                #
+                #     or:
+                #       - mnemonic: mov
+                #       - api: CreateFile
+                #
+                # we have to pick both `mnemonic` and `api` features.
+                #
+                # Note that the children nodes might be complex, like:
+                #
+                #     or:
+                #       - mnemonic: mov
+                #       - and:
+                #         - api: CreateFile
+                #         - api: DeleteFile
+                #
+                # In this case, we have to pick both the `mnemonic` and one of the `api` features.
+                #
+                # When computing the score of an OR branch, we have to use the min value encountered.
+                # While many of the children might be very specific, there might be a branch that is common
+                # and we need to handle that correctly.
                 min_score = 10000000  # assume this is larger than any score
                 features = set()
 
@@ -1583,36 +1670,18 @@ def and_score_key(item):
                     item = rec(rule_name, child)
                     assert item is not None, "can't index OR branch"
 
-                    score, feature = item
-
+                    score, _features = item
                     min_score = min(min_score, score)
-
-                    if isinstance(feature, set):
-                        features.update(feature)
-                    else:
-                        features.add(feature)
+                    features.update(_features)
 
                 return min_score, features
 
-            elif isinstance(node, ceng.Statement):
-                # Unhandled type of statement.
-                # This should only happen if a new subtype of `Statement`
-                # has since been added to capa.
-                #
-                # Ideally, we'd like to use mypy for exhaustiveness checking
-                # for all the subtypes of `Statement`.
-                # But, as far as I can tell, mypy does not support this type
-                # of checking.
-                #
-                # In a way, this makes some intuitive sense:
-                # the set of subtypes of type A is unbounded,
-                # because any user might come along and create a new subtype B,
-                # so mypy can't reason about this set of types.
-                assert_never(node)
             else:
                 # programming error
                 assert_never(node)
 
+        # These are the Regex/Substring/Bytes features that we have to use for filtering.
+        # Ideally we find a way to get rid of all of these, eventually.
         string_rules: Dict[str, List[Feature]] = {}
         bytes_rules: Dict[str, List[Feature]] = {}
 
@@ -1622,13 +1691,7 @@ def and_score_key(item):
             root = rule.statement
             item = rec(rule_name, root)
             assert item is not None
-
-            score, feature = item
-
-            if isinstance(feature, set):
-                features = feature
-            else:
-                features = {feature}
+            score, features = item
 
             string_features = [
                 feature
@@ -1849,13 +1912,13 @@ def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Feat
                             candidate_rule_names.add(rule_name)
 
         # trace
-        logger.debug(
-            "perf: match: %s: %s: %d features, %d candidate rules",
-            scope,
-            addr,
-            len(features),
-            len(candidate_rule_names),
-        )
+        # logger.debug(
+        #     "perf: match: %s: %s: %d features, %d candidate rules",
+        #     scope,
+        #     addr,
+        #     len(features),
+        #     len(candidate_rule_names),
+        # )
 
         # No rules can possibly match, so quickly return.
         if not candidate_rule_names:

From b7d07346b682a1d8466b1aa53b877e722cef3f56 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Wed, 22 May 2024 15:23:15 +0200
Subject: [PATCH 08/28] changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index fe3ccc821..76142df32 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
 
 - add function in capa/helpers to load plain and compressed JSON reports #1883 @Rohit1123
 - document Antivirus warnings and VirusTotal false positive detections #2028 @RionEV @mr-tz
+- optimize rule matching #2080 @williballenthin
 
 ### Breaking Changes
 

From f853214ca091a506f75ef12daeb35a0f2c63c143 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Wed, 22 May 2024 15:40:06 +0200
Subject: [PATCH 09/28] ruleset: infrastructure to test optimized matcher

---
 capa/rules/__init__.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 00e99fa03..1823ddfec 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1811,7 +1811,7 @@ def _sort_rules_by_index(rule_index_by_rule_name: Dict[str, int], rules: List[Ru
         """
         rules.sort(key=lambda r: rule_index_by_rule_name[r.name])
 
-    def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[FeatureSet, ceng.MatchResults]:
+    def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[FeatureSet, ceng.MatchResults]:
         """
         Match rules from this ruleset at the given scope against the given features.
 
@@ -1977,6 +1977,29 @@ def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Feat
 
         return (augmented_features, results)
 
+    def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[FeatureSet, ceng.MatchResults]:
+        """
+        Match rules from this ruleset at the given scope against the given features.
+
+        This wrapper around _match exists so that we can assert it matches precisely 
+        the same as `capa.engine.match`, just faster.
+        """
+        features1, matches1 = self._match(scope, features, addr)
+
+        # enable this branch to demonstrate that the naive matcher agrees with this optimized matcher.
+        if True:
+            features2, matches2 = capa.engine.match(self.rules.values(), features, addr)
+
+            for feature, locations in features1.items():
+                assert feature in features2
+                assert locations == features2[feature]
+
+            for rulename, results in matches1.items():
+                assert rulename in matches2
+                assert len(results) == len(matches2[rulename])
+
+        return features1, matches1
+
 
 def is_nursery_rule_path(path: Path) -> bool:
     """

From 9b7fb4e24bc0b9d1c994680bbe199d670c0db6a1 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Mon, 3 Jun 2024 09:30:43 +0200
Subject: [PATCH 10/28] pep8

---
 capa/rules/__init__.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 1823ddfec..ed009b5c1 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1543,14 +1543,15 @@ def _index_rules_by_feature(scope: Scope, rules: List[Rule], scores_by_rule: Dic
         """
         Index the given rules by their minimal set of most "uncommon" features required to match.
 
-        If absolutely necessary, provide the Regex/Substring/Bytes features 
+        If absolutely necessary, provide the Regex/Substring/Bytes features
         (which are not hashable and require a scan) that have to match, too.
         """
 
         rules_by_feature: Dict[Feature, Set[str]] = collections.defaultdict(set)
 
         def rec(
-            rule_name: str, node: Union[Feature, Statement],
+            rule_name: str,
+            node: Union[Feature, Statement],
             # closure over: scores_by_rule
         ) -> Optional[Tuple[int, Set[Feature]]]:
             """
@@ -1981,7 +1982,7 @@ def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Feat
         """
         Match rules from this ruleset at the given scope against the given features.
 
-        This wrapper around _match exists so that we can assert it matches precisely 
+        This wrapper around _match exists so that we can assert it matches precisely
         the same as `capa.engine.match`, just faster.
         """
         features1, matches1 = self._match(scope, features, addr)

From e8ef897a1f87d852faff9a471d5c32cf0a8bcf72 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Mon, 3 Jun 2024 09:51:00 +0200
Subject: [PATCH 11/28] linters

---
 capa/features/common.py |  3 ++-
 capa/rules/__init__.py  | 40 +++++++++++++++++++++++-----------------
 2 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/capa/features/common.py b/capa/features/common.py
index 5f0ca7941..cde91d1b9 100644
--- a/capa/features/common.py
+++ b/capa/features/common.py
@@ -385,11 +385,12 @@ def __init__(self, value: bytes, description=None):
         self.value = value
 
     def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
+        assert isinstance(self.value, bytes)
+
         capa.perf.counters["evaluate.feature"] += 1
         capa.perf.counters["evaluate.feature.bytes"] += 1
         capa.perf.counters["evaluate.feature.bytes." + str(len(self.value))] += 1
 
-        assert isinstance(self.value, bytes)
         for feature, locations in features.items():
             if not isinstance(feature, (Bytes,)):
                 continue
diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index ed009b5c1..1718aff5e 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -28,7 +28,7 @@
     # https://github.com/python/mypy/issues/1153
     from backports.functools_lru_cache import lru_cache  # type: ignore
 
-from typing import Any, Set, Dict, List, Tuple, Union, Callable, Iterator, Optional
+from typing import Any, Set, Dict, List, Tuple, Union, Callable, Iterator, Optional, cast
 from dataclasses import asdict, dataclass
 
 import yaml
@@ -1382,7 +1382,7 @@ def __init__(
         self.rules_by_scope = {scope: self._get_rules_for_scope(rules, scope) for scope in scopes}
 
         # unstable
-        scores_by_rule = {}
+        scores_by_rule: Dict[str, int] = {}
         self._feature_indexes_by_scopes = {
             scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope], scores_by_rule) for scope in scopes
         }
@@ -1444,6 +1444,7 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
             # If present, other rule must match before this one, in same scope.
             # Use score from that rule, which will have already been processed due to topological sorting.
             # Otherwise, use a default score of 5.
+            assert isinstance(node.value, str)
             return scores_by_rule.get(node.value, 5)
 
         elif isinstance(node, (capa.features.insn.Number, capa.features.insn.OperandNumber)):
@@ -1526,20 +1527,21 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
             # bytes: 0
         }[C]
 
+    # unstable
     @dataclass
-    class RuleFeatureIndex:
+    class _RuleFeatureIndex:
         # Mapping from hashable feature to a list of rules that might have this feature.
-        rules_by_feature: Dict[Feature, Set[str]] = dataclasses.field(default=dict)
+        rules_by_feature: Dict[Feature, Set[str]]
         # Mapping from rule name to list of Regex/Substring features that have to match.
         # All these features will be evaluated whenever a String feature is encountered.
-        string_rules: Dict[str, List[Feature]] = dataclasses.field(default=dict)
+        string_rules: Dict[str, List[Feature]]
         # Mapping from rule name to list of Bytes features that have to match.
         # All these features will be evaluated whenever a Bytes feature is encountered.
-        bytes_rules: Dict[str, List[Feature]] = dataclasses.field(default=dict)
+        bytes_rules: Dict[str, List[Feature]]
 
     # unstable
     @staticmethod
-    def _index_rules_by_feature(scope: Scope, rules: List[Rule], scores_by_rule: Dict[str, int]) -> RuleFeatureIndex:
+    def _index_rules_by_feature(scope: Scope, rules: List[Rule], scores_by_rule: Dict[str, int]) -> _RuleFeatureIndex:
         """
         Index the given rules by their minimal set of most "uncommon" features required to match.
 
@@ -1671,8 +1673,8 @@ def and_score_key(item):
                     item = rec(rule_name, child)
                     assert item is not None, "can't index OR branch"
 
-                    score, _features = item
-                    min_score = min(min_score, score)
+                    _score, _features = item
+                    min_score = min(min_score, _score)
                     features.update(_features)
 
                 return min_score, features
@@ -1714,10 +1716,10 @@ def and_score_key(item):
                 logger.debug("        : [%d] %s", RuleSet._score_feature(scores_by_rule, feature), feature)
 
             if string_features:
-                string_rules[rule_name] = string_features
+                string_rules[rule_name] = cast(List[Feature], string_features)
 
             if bytes_features:
-                bytes_rules[rule_name] = bytes_features
+                bytes_rules[rule_name] = cast(List[Feature], bytes_features)
 
             for feature in hashable_features:
                 rules_by_feature[feature].add(rule_name)
@@ -1731,7 +1733,7 @@ def and_score_key(item):
             "indexing: %d scanning string features, %d scanning bytes features", len(string_rules), len(bytes_rules)
         )
 
-        return RuleSet.RuleFeatureIndex(rules_by_feature, string_rules, bytes_rules)
+        return RuleSet._RuleFeatureIndex(rules_by_feature, string_rules, bytes_rules)
 
     @staticmethod
     def _get_rules_for_scope(rules, scope) -> List[Rule]:
@@ -1820,7 +1822,7 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Fea
         It uses its knowledge of all the rules to evaluate a minimal set of candidate rules for the given features.
         """
 
-        feature_index: RuleSet.RuleFeatureIndex = self._feature_indexes_by_scopes[scope]
+        feature_index: RuleSet._RuleFeatureIndex = self._feature_indexes_by_scopes[scope]
         rules: List[Rule] = self.rules_by_scope[scope]
         # Topologic location of rule given its name.
         # That is, rules with a lower index should be evaluated first, since their dependencies
@@ -1978,18 +1980,22 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Fea
 
         return (augmented_features, results)
 
-    def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[FeatureSet, ceng.MatchResults]:
+    def match(
+        self, scope: Scope, features: FeatureSet, addr: Address, paranoid=False
+    ) -> Tuple[FeatureSet, ceng.MatchResults]:
         """
         Match rules from this ruleset at the given scope against the given features.
 
         This wrapper around _match exists so that we can assert it matches precisely
         the same as `capa.engine.match`, just faster.
+
+        Args:
+          paranoid: when true, demonstrate that the naive matcher agrees with this optimized matcher (much slower!).
         """
         features1, matches1 = self._match(scope, features, addr)
 
-        # enable this branch to demonstrate that the naive matcher agrees with this optimized matcher.
-        if True:
-            features2, matches2 = capa.engine.match(self.rules.values(), features, addr)
+        if paranoid:
+            features2, matches2 = capa.engine.match(list(self.rules.values()), features, addr)
 
             for feature, locations in features1.items():
                 assert feature in features2

From e49d47de41ad6c0e62b3951f28a0e57ec21a6011 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Mon, 3 Jun 2024 10:00:43 +0200
Subject: [PATCH 12/28] rules: match: handle namespace match statements

---
 capa/engine.py         | 15 ++++++++++-----
 capa/rules/__init__.py | 21 ++++++++++++++-------
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/capa/engine.py b/capa/engine.py
index 649d0367c..25c26cb96 100644
--- a/capa/engine.py
+++ b/capa/engine.py
@@ -270,6 +270,14 @@ def evaluate(self, features: FeatureSet, short_circuit=True):
 MatchResults = Mapping[str, List[Tuple[Address, Result]]]
 
 
+def get_rule_namespaces(rule: "capa.rules.Rule") -> Iterator[str]:
+    namespace = rule.meta.get("namespace")
+    if namespace:
+        while namespace:
+            yield namespace
+            namespace, _, _ = namespace.rpartition("/")
+
+
 def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations: Iterable[Address]):
     """
     record into the given featureset that the given rule matched at the given locations.
@@ -280,11 +288,8 @@ def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations:
     updates `features` in-place. doesn't modify the remaining arguments.
     """
     features[capa.features.common.MatchedRule(rule.name)].update(locations)
-    namespace = rule.meta.get("namespace")
-    if namespace:
-        while namespace:
-            features[capa.features.common.MatchedRule(namespace)].update(locations)
-            namespace, _, _ = namespace.rpartition("/")
+    for namespace in get_rule_namespaces(rule):
+        features[capa.features.common.MatchedRule(namespace)].update(locations)
 
 
 def match(rules: List["capa.rules.Rule"], features: FeatureSet, addr: Address) -> Tuple[FeatureSet, MatchResults]:
diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 1718aff5e..2a5dd8a7a 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -15,7 +15,6 @@
 import logging
 import binascii
 import collections
-import dataclasses
 from enum import Enum
 from pathlib import Path
 
@@ -1968,15 +1967,23 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Fea
 
                 ceng.index_rule_matches(augmented_features, rule, [addr])
 
-                # Its possible that we're relying on a MatchedRule feature to be the
+                # It's possible that we're relying on a MatchedRule (or namespace) feature to be the
                 # uncommon feature used to filter other rules. So, extend the candidate
                 # rules with any of these dependencies. If we find any, also ensure they're
                 # evaluated in the correct topologic order, so that further dependencies work.
-                new_candidates = feature_index.rules_by_feature.get(capa.features.common.MatchedRule(rule.name), ())
-                if new_candidates:
-                    candidate_rule_names.update(new_candidates)
-                    candidate_rules.extend([self.rules[rule_name] for rule_name in new_candidates])
-                    RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules)
+                new_features = [capa.features.common.MatchedRule(rule.name)]
+                for namespace in ceng.get_rule_namespaces(rule):
+                    new_features.append(capa.features.common.MatchedRule(namespace))
+
+                if new_features:
+                    new_candidates: List[str] = []
+                    for new_feature in new_features:
+                        new_candidates.extend(feature_index.rules_by_feature.get(new_feature, ()))
+
+                    if new_candidates:
+                        candidate_rule_names.update(new_candidates)
+                        candidate_rules.extend([self.rules[rule_name] for rule_name in new_candidates])
+                        RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules)
 
         return (augmented_features, results)
 

From a4f4f0bef38aec42ba1108c05496763fc6a42fde Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Mon, 3 Jun 2024 10:15:20 +0200
Subject: [PATCH 13/28] rules: more tests for logic edge cases

---
 capa/rules/__init__.py |  6 ++++-
 tests/test_match.py    | 61 ++++++++++++++++++++++++++++++++++--------
 2 files changed, 55 insertions(+), 12 deletions(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 2a5dd8a7a..072d8eff1 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1579,11 +1579,15 @@ def rec(
                 # and this is checked by the linter,
                 return None
 
-            elif isinstance(node, (ceng.Range)) and node.min == 0:
+            elif isinstance(node, (ceng.Range)) and node.min == 0 and node.max != 0:
                 # `count(foo): 0 or more` is just like an optional block,
                 # because the min is 0, this subtree *can* match just about any feature.
                 return None
 
+            elif isinstance(node, (ceng.Range)) and node.min == 0 and node.max == 0:
+                # `count(foo): 0` is like a not block, which we don't index.
+                return None
+
             elif isinstance(node, capa.features.common.Feature):
                 return (RuleSet._score_feature(scores_by_rule, node), {node})
 
diff --git a/tests/test_match.py b/tests/test_match.py
index 07af33d78..4aa6db305 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -5,9 +5,10 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
-
 import textwrap
 
+import pytest
+
 import capa.rules
 import capa.engine
 import capa.features.insn
@@ -130,22 +131,29 @@ def test_match_range_exact_zero():
                     static: function
                     dynamic: process
             features:
-                - count(number(100)): 0
+                - and:
+                    - count(number(100)): 0
+
+                    # we can't have `count(foo): 0` at the top level,
+                    # since we don't support top level NOT statements.
+                    # so we have this additional trivial feature.
+                    - mnemonic: mov
+
         """
     )
     r = capa.rules.Rule.from_yaml(rule)
 
     # feature isn't indexed - good.
-    _, matches = match([r], {}, 0x0)
+    _, matches = match([r], {capa.features.insn.Mnemonic("mov"): {}}, 0x0)
     assert "test rule" in matches
 
     # feature is indexed, but no matches.
     # i don't think we should ever really have this case, but good to check anyways.
-    _, matches = match([r], {capa.features.insn.Number(100): {}}, 0x0)
+    _, matches = match([r], {capa.features.insn.Number(100): {}, capa.features.insn.Mnemonic("mov"): {}}, 0x0)
     assert "test rule" in matches
 
     # too many matches
-    _, matches = match([r], {capa.features.insn.Number(100): {1}}, 0x0)
+    _, matches = match([r], {capa.features.insn.Number(100): {1}, capa.features.insn.Mnemonic("mov"): {1}}, 0x0)
     assert "test rule" not in matches
 
 
@@ -159,21 +167,27 @@ def test_match_range_with_zero():
                     static: function
                     dynamic: process
              features:
-                 - count(number(100)): (0, 1)
+                - and:
+                    - count(number(100)): (0, 1)
+
+                    # we can't have `count(foo): 0` at the top level,
+                    # since we don't support top level NOT statements.
+                    # so we have this additional trivial feature.
+                    - mnemonic: mov
          """
     )
     r = capa.rules.Rule.from_yaml(rule)
 
     # ok
-    _, matches = match([r], {}, 0x0)
+    _, matches = match([r], {capa.features.insn.Mnemonic("mov"): {}}, 0x0)
     assert "test rule" in matches
-    _, matches = match([r], {capa.features.insn.Number(100): {}}, 0x0)
+    _, matches = match([r], {capa.features.insn.Number(100): {}, capa.features.insn.Mnemonic("mov"): {}}, 0x0)
     assert "test rule" in matches
-    _, matches = match([r], {capa.features.insn.Number(100): {1}}, 0x0)
+    _, matches = match([r], {capa.features.insn.Number(100): {1}, capa.features.insn.Mnemonic("mov"): {1}}, 0x0)
     assert "test rule" in matches
 
     # too many matches
-    _, matches = match([r], {capa.features.insn.Number(100): {1, 2}}, 0x0)
+    _, matches = match([r], {capa.features.insn.Number(100): {1, 2}, capa.features.insn.Mnemonic("mov"): {1, 2}}, 0x0)
     assert "test rule" not in matches
 
 
@@ -551,7 +565,8 @@ def test_match_regex_values_always_string():
     assert capa.features.common.MatchedRule("test rule") in features
 
 
-def test_match_not():
+@pytest.mark.xfail(reason="can't have top level NOT")
+def test_match_only_not():
     rule = textwrap.dedent(
         """
         rule:
@@ -572,6 +587,30 @@ def test_match_not():
     assert "test rule" in matches
 
 
+def test_match_not():
+    rule = textwrap.dedent(
+        """
+        rule:
+            meta:
+                name: test rule
+                scopes:
+                    static: function
+                    dynamic: process
+                namespace: testns1/testns2
+            features:
+                - and:
+                    - mnemonic: mov
+                    - not:
+                        - number: 99
+        """
+    )
+    r = capa.rules.Rule.from_yaml(rule)
+
+    _, matches = match([r], {capa.features.insn.Number(100): {1, 2}, capa.features.insn.Mnemonic("mov"): {1, 2}}, 0x0)
+    assert "test rule" in matches
+
+
+@pytest.mark.xfail(reason="can't have nested NOT")
 def test_match_not_not():
     rule = textwrap.dedent(
         """

From bff7f0a3ec66484c1301431453216886bb983a15 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Mon, 3 Jun 2024 10:25:42 +0200
Subject: [PATCH 14/28] rules: match paranoid true

---
 capa/rules/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 072d8eff1..69741a9d9 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1992,7 +1992,7 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Fea
         return (augmented_features, results)
 
     def match(
-        self, scope: Scope, features: FeatureSet, addr: Address, paranoid=False
+        self, scope: Scope, features: FeatureSet, addr: Address, paranoid=True
     ) -> Tuple[FeatureSet, ceng.MatchResults]:
         """
         Match rules from this ruleset at the given scope against the given features.

From d20f040d97d6fe7df0dad8ef1ed0bc724a04c20e Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Mon, 3 Jun 2024 10:34:51 +0200
Subject: [PATCH 15/28] rules: document logic edge cases

---
 capa/rules/__init__.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 69741a9d9..3937c249d 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -2000,6 +2000,14 @@ def match(
         This wrapper around _match exists so that we can assert it matches precisely
         the same as `capa.engine.match`, just faster.
 
+        This matcher does not handle some edge cases:
+          - top level NOT statements
+              - also top level counted features with zero occurrences, like: `count(mnemonic(mov)): 0`
+          - nested NOT statements (NOT: NOT: foo)
+
+        We should discourage/forbid these constructs from our rules and add lints for them.
+        TODO(williballenthin): add lints for logic edge cases
+
         Args:
           paranoid: when true, demonstrate that the naive matcher agrees with this optimized matcher (much slower!).
         """

From a7e24e6c784d278bdfe366f6226d7d677fe72f35 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Mon, 3 Jun 2024 16:36:25 +0200
Subject: [PATCH 16/28] pep8

---
 capa/features/extractors/binexport2/__init__.py   | 4 +++-
 capa/features/extractors/binexport2/basicblock.py | 1 +
 capa/features/extractors/binexport2/extractor.py  | 4 +++-
 capa/features/extractors/binexport2/insn.py       | 4 +++-
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py
index ba488cd86..a09ca2a52 100644
--- a/capa/features/extractors/binexport2/__init__.py
+++ b/capa/features/extractors/binexport2/__init__.py
@@ -261,7 +261,9 @@ def _find_base_address(self):
     def _compute_thunks(self):
         for addr, idx in self.idx.vertex_index_by_address.items():
             vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[idx]
-            if not capa.features.extractors.binexport2.helpers.is_vertex_type(vertex, BinExport2.CallGraph.Vertex.Type.THUNK):
+            if not capa.features.extractors.binexport2.helpers.is_vertex_type(
+                vertex, BinExport2.CallGraph.Vertex.Type.THUNK
+            ):
                 continue
 
             curr_idx: int = idx
diff --git a/capa/features/extractors/binexport2/basicblock.py b/capa/features/extractors/binexport2/basicblock.py
index 5d7398aa1..bcb7977b4 100644
--- a/capa/features/extractors/binexport2/basicblock.py
+++ b/capa/features/extractors/binexport2/basicblock.py
@@ -13,6 +13,7 @@
 from capa.features.basicblock import BasicBlock
 from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext
 from capa.features.extractors.base_extractor import BBHandle, FunctionHandle
+from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2
 
 
 def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]:
diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py
index 6e1e4c633..1c3c4d393 100644
--- a/capa/features/extractors/binexport2/extractor.py
+++ b/capa/features/extractors/binexport2/extractor.py
@@ -75,7 +75,9 @@ def get_functions(self) -> Iterator[FunctionHandle]:
             be2_vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[vertex_idx]
 
             # skip thunks
-            if capa.features.extractors.binexport2.helpers.is_vertex_type(be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK):
+            if capa.features.extractors.binexport2.helpers.is_vertex_type(
+                be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK
+            ):
                 continue
 
             yield FunctionHandle(
diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py
index 13b067497..d9c758fef 100644
--- a/capa/features/extractors/binexport2/insn.py
+++ b/capa/features/extractors/binexport2/insn.py
@@ -54,7 +54,9 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle
         vertex_idx: int = be2_index.vertex_index_by_address[addr]
         vertex: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[vertex_idx]
 
-        if not capa.features.extractors.binexport2.helpers.is_vertex_type(vertex, BinExport2.CallGraph.Vertex.Type.IMPORTED):
+        if not capa.features.extractors.binexport2.helpers.is_vertex_type(
+            vertex, BinExport2.CallGraph.Vertex.Type.IMPORTED
+        ):
             continue
 
         if not vertex.HasField("mangled_name"):

From a66524ae458a6a15e391d738abc66dcfdb207614 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Mon, 3 Jun 2024 16:36:33 +0200
Subject: [PATCH 17/28] rules: match: better debug paranoid matching

---
 capa/rules/__init__.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 3937c249d..010c5c6fb 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -2011,20 +2011,27 @@ def match(
         Args:
           paranoid: when true, demonstrate that the naive matcher agrees with this optimized matcher (much slower!).
         """
-        features1, matches1 = self._match(scope, features, addr)
+        features, matches = self._match(scope, features, addr)
 
         if paranoid:
-            features2, matches2 = capa.engine.match(list(self.rules.values()), features, addr)
+            rules: List[Rule] = self.rules_by_scope[scope]
+            paranoid_features, paranoid_matches = capa.engine.match(rules, features, addr)
 
-            for feature, locations in features1.items():
-                assert feature in features2
-                assert locations == features2[feature]
+            if features != paranoid_features:
+                logger.warning("paranoid: %s: %s", scope, addr)
+                for feature in sorted(set(features.keys()) & set(paranoid_features.keys())):
+                    logger.warning("paranoid:   %s", feature)
 
-            for rulename, results in matches1.items():
-                assert rulename in matches2
-                assert len(results) == len(matches2[rulename])
+                for feature in sorted(set(features.keys()) - set(paranoid_features.keys())):
+                    logger.warning("paranoid: + %s", feature)
 
-        return features1, matches1
+                for feature in sorted(set(paranoid_features.keys()) - set(features.keys())):
+                    logger.warning("paranoid: - %s", feature)
+
+            assert features == paranoid_features
+            assert set(matches.keys()) == set(paranoid_matches.keys())
+
+        return features, matches
 
 
 def is_nursery_rule_path(path: Path) -> bool:

From 61d01bb0e9c16a9a4d2c5d14ed58090cdcf233dd Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Mon, 3 Jun 2024 16:52:05 +0200
Subject: [PATCH 18/28] rules: matcher: more doc

---
 capa/rules/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 010c5c6fb..6e5b5262a 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -2009,7 +2009,8 @@ def match(
         TODO(williballenthin): add lints for logic edge cases
 
         Args:
-          paranoid: when true, demonstrate that the naive matcher agrees with this optimized matcher (much slower!).
+          paranoid: when true, demonstrate that the naive matcher agrees with this
+           optimized matcher (much slower! around 10x slower).
         """
         features, matches = self._match(scope, features, addr)
 

From 62c44521f0944c6b481025c5066da6ce67a8c1be Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Mon, 3 Jun 2024 17:11:33 +0200
Subject: [PATCH 19/28] rules: match: disable paranoid mode by default

---
 capa/rules/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 6e5b5262a..ae7a30e02 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1992,7 +1992,7 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Fea
         return (augmented_features, results)
 
     def match(
-        self, scope: Scope, features: FeatureSet, addr: Address, paranoid=True
+        self, scope: Scope, features: FeatureSet, addr: Address, paranoid=False
     ) -> Tuple[FeatureSet, ceng.MatchResults]:
         """
         Match rules from this ruleset at the given scope against the given features.

From 8ccae6e0274725489685e4ec37571fc892cd157b Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Mon, 3 Jun 2024 17:34:23 +0200
Subject: [PATCH 20/28] add tests demonstrating optimized matching behavior

---
 capa/rules/__init__.py |  2 +
 tests/test_match.py    | 89 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index ae7a30e02..909a0d708 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1856,6 +1856,8 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Fea
         # feature frequently whenever a string/bytes feature is encountered. Its slow, but we can't
         # get around it. Reducing our reliance on regex/bytes feature and/or finding a way to
         # index these can futher improve performance.
+        #
+        # See the corresponding unstable tests in `test_match.py::test_index_features_*`.
 
         # Find all the rules that could match the given feature set.
         # Ideally we want this set to be as small and focused as possible,
diff --git a/tests/test_match.py b/tests/test_match.py
index 4aa6db305..24dc89a6a 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -788,3 +788,92 @@ def test_match_os_any():
         0x0,
     )
     assert "test rule" in matches
+
+
+def test_index_features_and_unstable():
+    rule = textwrap.dedent(
+        """
+        rule:
+            meta:
+                name: test rule
+                scopes:
+                    static: function
+                    dynamic: process
+            features:
+                - and:
+                    - mnemonic: mov
+                    - api: CreateFileW
+        """
+    )
+    r = capa.rules.Rule.from_yaml(rule)
+    rr = capa.rules.RuleSet([r])
+    index: capa.rules.RuleSet._RuleFeatureIndex = rr._feature_indexes_by_scopes[capa.rules.Scope.FUNCTION]
+
+    # there's a single rule, and it's indexed by a single feature
+    assert len(index.rules_by_feature) == 1
+    # and we index by the more uncommon API feature, not the common mnemonic feature
+    assert capa.features.insn.API("CreateFileW") in index.rules_by_feature
+
+    assert not index.string_rules
+    assert not index.bytes_rules
+
+
+def test_index_features_or_unstable():
+    rule = textwrap.dedent(
+        """
+        rule:
+            meta:
+                name: test rule
+                scopes:
+                    static: function
+                    dynamic: process
+            features:
+                - or:
+                    - mnemonic: mov
+                    - api: CreateFileW
+        """
+    )
+    r = capa.rules.Rule.from_yaml(rule)
+    rr = capa.rules.RuleSet([r])
+    index: capa.rules.RuleSet._RuleFeatureIndex = rr._feature_indexes_by_scopes[capa.rules.Scope.FUNCTION]
+
+    # there's a single rule, and it's indexed by both features,
+    # because they fall under the single root OR node.
+    assert len(index.rules_by_feature) == 2
+    assert capa.features.insn.API("CreateFileW") in index.rules_by_feature
+    assert capa.features.insn.Mnemonic("mov") in index.rules_by_feature
+
+    assert not index.string_rules
+    assert not index.bytes_rules
+
+
+def test_index_features_nested_unstable():
+    rule = textwrap.dedent(
+        """
+        rule:
+            meta:
+                name: test rule
+                scopes:
+                    static: function
+                    dynamic: process
+            features:
+                - and:
+                    - mnemonic: mov
+                    - or:
+                        - api: CreateFileW
+                        - string: foo
+        """
+    )
+    r = capa.rules.Rule.from_yaml(rule)
+    rr = capa.rules.RuleSet([r])
+    index: capa.rules.RuleSet._RuleFeatureIndex = rr._feature_indexes_by_scopes[capa.rules.Scope.FUNCTION]
+
+    # there's a single rule, and it's indexed by the two uncommon features,
+    # not the single common feature.
+    assert len(index.rules_by_feature) == 2
+    assert capa.features.insn.API("CreateFileW") in index.rules_by_feature
+    assert capa.features.common.String("foo") in index.rules_by_feature
+    assert capa.features.insn.Mnemonic("mov") not in index.rules_by_feature
+
+    assert not index.string_rules
+    assert not index.bytes_rules

From 12a78f395ba14717947b981af1beb4da51e19554 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <wballenthin@google.com>
Date: Mon, 3 Jun 2024 21:29:00 +0200
Subject: [PATCH 21/28] rules: match: remove inline closure comment

Co-authored-by: Moritz <mr-tz@users.noreply.github.com>
---
 capa/rules/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 909a0d708..ec88982fe 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1553,7 +1553,6 @@ def _index_rules_by_feature(scope: Scope, rules: List[Rule], scores_by_rule: Dic
         def rec(
             rule_name: str,
             node: Union[Feature, Statement],
-            # closure over: scores_by_rule
         ) -> Optional[Tuple[int, Set[Feature]]]:
             """
             Walk through a rule's logic tree, picking the features to use for indexing,

From 960ee864b9daf116e3040b4bf06ce947158b04aa Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <wballenthin@google.com>
Date: Mon, 3 Jun 2024 21:33:45 +0200
Subject: [PATCH 22/28] typo

Co-authored-by: Mike Hunhoff <mike.hunhoff@gmail.com>
---
 capa/rules/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index ec88982fe..2a2530ca1 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1456,7 +1456,7 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
 
             if 0xFFFF_FF00 <= v <= 0xFFFF_FFFF:
                 # Numbers close to u32::max_int are also probably pretty common,
-                # like signed numbers closed to 0 that are stored as unsigned ints.
+                # like signed numbers close to 0 that are stored as unsigned ints.
                 return 3
 
             if 0xFFFF_FFFF_FFFF_FF00 <= v <= 0xFFFF_FFFF_FFFF_FFFF:

From e21a70f0be569af15a273e71c936bc6efb05d640 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Tue, 4 Jun 2024 12:56:33 +0200
Subject: [PATCH 23/28] rules: clarify the term "unstable" with longer comments

---
 capa/rules/__init__.py | 10 +++++-----
 tests/test_match.py    |  3 +++
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 909a0d708..43b0ab8bc 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1380,7 +1380,7 @@ def __init__(
         self.rules_by_namespace = index_rules_by_namespace(rules)
         self.rules_by_scope = {scope: self._get_rules_for_scope(rules, scope) for scope in scopes}
 
-        # unstable
+        # these structures are unstable and may change before the next major release.
         scores_by_rule: Dict[str, int] = {}
         self._feature_indexes_by_scopes = {
             scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope], scores_by_rule) for scope in scopes
@@ -1423,7 +1423,7 @@ def __getitem__(self, rulename):
     def __contains__(self, rulename):
         return rulename in self.rules
 
-    # unstable
+    # this routine is unstable and may change before the next major release.
     @staticmethod
     def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Feature) -> int:
         """
@@ -1526,7 +1526,7 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
             # bytes: 0
         }[C]
 
-    # unstable
+    # this class is unstable and may change before the next major release.
     @dataclass
     class _RuleFeatureIndex:
         # Mapping from hashable feature to a list of rules that might have this feature.
@@ -1538,7 +1538,7 @@ class _RuleFeatureIndex:
         # All these features will be evaluated whenever a Bytes feature is encountered.
         bytes_rules: Dict[str, List[Feature]]
 
-    # unstable
+    # this routine is unstable and may change before the next major release.
     @staticmethod
     def _index_rules_by_feature(scope: Scope, rules: List[Rule], scores_by_rule: Dict[str, int]) -> _RuleFeatureIndex:
         """
@@ -1806,7 +1806,7 @@ def filter_rules_by_meta(self, tag: str) -> "RuleSet":
                             break
         return RuleSet(list(rules_filtered))
 
-    # unstable
+    # this routine is unstable and may change before the next major release.
     @staticmethod
     def _sort_rules_by_index(rule_index_by_rule_name: Dict[str, int], rules: List[Rule]):
         """
diff --git a/tests/test_match.py b/tests/test_match.py
index 24dc89a6a..d621853c7 100644
--- a/tests/test_match.py
+++ b/tests/test_match.py
@@ -790,6 +790,7 @@ def test_match_os_any():
     assert "test rule" in matches
 
 
+# this test demonstrates the behavior of unstable features that may change before the next major release.
 def test_index_features_and_unstable():
     rule = textwrap.dedent(
         """
@@ -818,6 +819,7 @@ def test_index_features_and_unstable():
     assert not index.bytes_rules
 
 
+# this test demonstrates the behavior of unstable features that may change before the next major release.
 def test_index_features_or_unstable():
     rule = textwrap.dedent(
         """
@@ -847,6 +849,7 @@ def test_index_features_or_unstable():
     assert not index.bytes_rules
 
 
+# this test demonstrates the behavior of unstable features that may change before the next major release.
 def test_index_features_nested_unstable():
     rule = textwrap.dedent(
         """

From 457cfe018053f224af9ad6681e767aee1268fa6c Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Tue, 4 Jun 2024 13:17:48 +0200
Subject: [PATCH 24/28] rules: more comments describing how features are scored

---
 capa/rules/__init__.py | 47 ++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index f7b3a7972..b5671416c 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1427,8 +1427,20 @@ def __contains__(self, rulename):
     @staticmethod
     def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Feature) -> int:
         """
-        Score the given feature by how "uncommon" it is, where a higher score is more uncommon.
+        Score the given feature by how "uncommon" we think it will be.
+        Features that we expect to be very selective (ie. uniquely identify a rule and be required to match),
+         or "uncommon", should get a high score. 
         Features that are not good for indexing will have a low score, or 0.
+
+        The range of values doesn't really matter, but here we use 0-10, where
+          - 10 is very uncommon, very selective, good for indexing a rule, and
+          - 0 is very common, not selective, bad for indexing a rule.
+
+        You shouldn't try to interpret the scores, beyond to compare features to pick one or the other.
+
+        Today, these scores are assigned manually, by the capa devs, who use their intuition and experience.
+        We *could* do a large scale analysis of all features emitted by capa across many samples to
+        make this more data driven. If the current approach doesn't work well, consider that.
         """
 
         #
@@ -1440,11 +1452,14 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
             node,
             capa.features.common.MatchedRule,
         ):
-            # If present, other rule must match before this one, in same scope.
-            # Use score from that rule, which will have already been processed due to topological sorting.
-            # Otherwise, use a default score of 5.
-            assert isinstance(node.value, str)
-            return scores_by_rule.get(node.value, 5)
+            # The other rule must match before this one, in same scope or smaller.
+            # Because we process the rules small->large scope and topologically,
+            # then we can rely on dependencies being processed first.
+            #
+            # If logic changes and you see issues here, ensure that `scores_by_rule` is correctly provided.
+            rule_name = node.value
+            assert isinstance(rule_name, str)
+            return scores_by_rule[rule_name]
 
         elif isinstance(node, (capa.features.insn.Number, capa.features.insn.OperandNumber)):
             v = node.value
@@ -1472,6 +1487,13 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
 
         C = node.__class__
         return {
+            # The range of values doesn't really matter, but here we use 0-10, where
+            #   - 10 is very uncommon, very selective, good for indexing a rule, and
+            #   - 0 is a very common, not selective, bad for indexing a rule.
+            #
+            # You shouldn't try to interpret the scores, beyond to compare features to pick one or the other.
+
+            # -----------------------------------------------------------------
             #
             # Very uncommon features that are probably very selective in capa's domain.
             # When possible, we want rules to be indexed by these features.
@@ -1479,7 +1501,7 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
             capa.features.common.String: 9,
             capa.features.insn.API: 8,
             capa.features.file.Export: 7,
-            # "uncommon numbers": 7
+            # "uncommon numbers": 7 (placeholder for logic above)
             #
             # -----------------------------------------------------------------
             #
@@ -1492,7 +1514,6 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
             capa.features.file.Import: 5,
             capa.features.file.Section: 5,
             capa.features.file.FunctionName: 5,
-            # default MatchedRule: 5
             #
             # -----------------------------------------------------------------
             #
@@ -1501,11 +1522,11 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
             capa.features.common.Characteristic: 4,
             capa.features.insn.Offset: 4,
             capa.features.insn.OperandOffset: 4,
-            # "common numbers": 3
+            # "common numbers": 3 (placeholder for logic above)
             #
             # -----------------------------------------------------------------
             #
-            # Very common features, which we'd only prefer to non-hashable features, like Regex/Substring/Bytes.
+            # Very common features, which we'd only prefer instead of non-hashable features, like Regex/Substring/Bytes.
             #
             capa.features.insn.Mnemonic: 2,
             capa.features.basicblock.BasicBlock: 1,
@@ -1521,9 +1542,9 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
             #
             # Non-hashable features, which will require a scan to evaluate, and are therefore quite expensive.
             #
-            # substring: 0
-            # regex: 0
-            # bytes: 0
+            # substring: 0 (placeholder for logic above)
+            # regex: 0 (placeholder for logic above)
+            # bytes: 0 (placeholder for logic above)
         }[C]
 
     # this class is unstable and may change before the next major release.

From b34667ff5e29d0fae413fa529cddd7488d25da25 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Tue, 4 Jun 2024 16:04:57 +0200
Subject: [PATCH 25/28] black

---
 capa/rules/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index b5671416c..b413f2aab 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1429,7 +1429,7 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
         """
         Score the given feature by how "uncommon" we think it will be.
         Features that we expect to be very selective (ie. uniquely identify a rule and be required to match),
-         or "uncommon", should get a high score. 
+         or "uncommon", should get a high score.
         Features that are not good for indexing will have a low score, or 0.
 
         The range of values doesn't really matter, but here we use 0-10, where
@@ -1492,7 +1492,6 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
             #   - 0 is a very common, not selective, bad for indexing a rule.
             #
             # You shouldn't try to interpret the scores, beyond to compare features to pick one or the other.
-
             # -----------------------------------------------------------------
             #
             # Very uncommon features that are probably very selective in capa's domain.

From 1f1e142102d530add8059f1b76d509ba6a72a43e Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Thu, 6 Jun 2024 09:47:41 +0200
Subject: [PATCH 26/28] rules: match: re-introduce default rule scores

---
 capa/rules/__init__.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index b413f2aab..bdff39464 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1459,6 +1459,25 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
             # If logic changes and you see issues here, ensure that `scores_by_rule` is correctly provided.
             rule_name = node.value
             assert isinstance(rule_name, str)
+
+            if rule_name not in scores_by_rule:
+                # Its possible that we haven't scored the rule that is being requested here.
+                # This means that it won't ever match (because it won't be evaluated before this one).
+                # Still, we need to provide a default value here. 
+                # So we give it 9, because it won't match, so its very selective.
+                #
+                # But how could this dependency not exist?
+                # Consider a rule that supports both static and dynamic analysis, but also has
+                # a `instruction: ` block. This block gets translated into a derived rule that only
+                # matches in static mode. Therefore, when the parent rule is run in dynamic mode, it
+                # won't be able to find the derived rule. This is the case we have to handle here.
+                #
+                # A better solution would be to prune this logic based on static/dynamic mode, but
+                # that takes more work and isn't in scope of this feature.
+                #
+                # See discussion in: https://github.com/mandiant/capa/pull/2080/#discussion_r1624783396
+                return 9
+
             return scores_by_rule[rule_name]
 
         elif isinstance(node, (capa.features.insn.Number, capa.features.insn.OperandNumber)):

From 347619d35a1a4efec4e9bdbcce98949b0e54cc46 Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Thu, 6 Jun 2024 09:48:35 +0200
Subject: [PATCH 27/28] rules: match: remove trace comments

---
 capa/rules/__init__.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index bdff39464..4b161c30c 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1957,15 +1957,6 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Fea
                         if wanted_bytes.evaluate(bytes_features):
                             candidate_rule_names.add(rule_name)
 
-        # trace
-        # logger.debug(
-        #     "perf: match: %s: %s: %d features, %d candidate rules",
-        #     scope,
-        #     addr,
-        #     len(features),
-        #     len(candidate_rule_names),
-        # )
-
         # No rules can possibly match, so quickly return.
         if not candidate_rule_names:
             return (features, {})

From b376a8401c82452d7d6ea658da2c15ab61ba74bf Mon Sep 17 00:00:00 2001
From: Willi Ballenthin <willi.ballenthin@gmail.com>
Date: Thu, 6 Jun 2024 09:49:37 +0200
Subject: [PATCH 28/28] black

---
 capa/rules/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py
index 4b161c30c..0d1bbd3c6 100644
--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -1463,7 +1463,7 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe
             if rule_name not in scores_by_rule:
                 # Its possible that we haven't scored the rule that is being requested here.
                 # This means that it won't ever match (because it won't be evaluated before this one).
-                # Still, we need to provide a default value here. 
+                # Still, we need to provide a default value here.
                 # So we give it 9, because it won't match, so its very selective.
                 #
                 # But how could this dependency not exist?