diff --git a/CHANGELOG.md b/CHANGELOG.md index 40f02a8ee..7086e6397 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - add function in capa/helpers to load plain and compressed JSON reports #1883 @Rohit1123 - document Antivirus warnings and VirusTotal false positive detections #2028 @RionEV @mr-tz +- optimize rule matching #2080 @williballenthin - replace Halo spinner with Rich #2086 @s-ff ### Breaking Changes diff --git a/capa/engine.py b/capa/engine.py index 649d0367c..25c26cb96 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -270,6 +270,14 @@ def evaluate(self, features: FeatureSet, short_circuit=True): MatchResults = Mapping[str, List[Tuple[Address, Result]]] +def get_rule_namespaces(rule: "capa.rules.Rule") -> Iterator[str]: + namespace = rule.meta.get("namespace") + if namespace: + while namespace: + yield namespace + namespace, _, _ = namespace.rpartition("/") + + def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations: Iterable[Address]): """ record into the given featureset that the given rule matched at the given locations. @@ -280,11 +288,8 @@ def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations: updates `features` in-place. doesn't modify the remaining arguments. """ features[capa.features.common.MatchedRule(rule.name)].update(locations) - namespace = rule.meta.get("namespace") - if namespace: - while namespace: - features[capa.features.common.MatchedRule(namespace)].update(locations) - namespace, _, _ = namespace.rpartition("/") + for namespace in get_rule_namespaces(rule): + features[capa.features.common.MatchedRule(namespace)].update(locations) def match(rules: List["capa.rules.Rule"], features: FeatureSet, addr: Address) -> Tuple[FeatureSet, MatchResults]: diff --git a/capa/features/common.py b/capa/features/common.py index 279332e6e..cde91d1b9 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -385,10 +385,12 @@ def __init__(self, value: bytes, description=None): self.value = value def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True): + assert isinstance(self.value, bytes) + capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.bytes"] += 1 + capa.perf.counters["evaluate.feature.bytes." + str(len(self.value))] += 1 - assert isinstance(self.value, bytes) for feature, locations in features.items(): if not isinstance(feature, (Bytes,)): continue @@ -490,6 +492,6 @@ def __init__(self, value: str, description=None): def is_global_feature(feature): """ is this a feature that is extracted at every scope? - today, these are OS and arch features. + today, these are OS, arch, and format features. """ - return isinstance(feature, (OS, Arch)) + return isinstance(feature, (OS, Arch, Format)) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index ba488cd86..a09ca2a52 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -261,7 +261,9 @@ def _find_base_address(self): def _compute_thunks(self): for addr, idx in self.idx.vertex_index_by_address.items(): vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[idx] - if not capa.features.extractors.binexport2.helpers.is_vertex_type(vertex, BinExport2.CallGraph.Vertex.Type.THUNK): + if not capa.features.extractors.binexport2.helpers.is_vertex_type( + vertex, BinExport2.CallGraph.Vertex.Type.THUNK + ): continue curr_idx: int = idx diff --git a/capa/features/extractors/binexport2/basicblock.py b/capa/features/extractors/binexport2/basicblock.py index 5d7398aa1..bcb7977b4 100644 --- a/capa/features/extractors/binexport2/basicblock.py +++ b/capa/features/extractors/binexport2/basicblock.py @@ -13,6 +13,7 @@ from capa.features.basicblock import BasicBlock from capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext from capa.features.extractors.base_extractor import BBHandle, FunctionHandle +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 6e1e4c633..1c3c4d393 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -75,7 +75,9 @@ def get_functions(self) -> Iterator[FunctionHandle]: be2_vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[vertex_idx] # skip thunks - if capa.features.extractors.binexport2.helpers.is_vertex_type(be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK): + if capa.features.extractors.binexport2.helpers.is_vertex_type( + be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK + ): continue yield FunctionHandle( diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 1a907eb7b..92ef311ee 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -54,7 +54,9 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle vertex_idx: int = be2_index.vertex_index_by_address[addr] vertex: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[vertex_idx] - if not capa.features.extractors.binexport2.helpers.is_vertex_type(vertex, BinExport2.CallGraph.Vertex.Type.IMPORTED): + if not capa.features.extractors.binexport2.helpers.is_vertex_type( + vertex, BinExport2.CallGraph.Vertex.Type.IMPORTED + ): continue if not vertex.HasField("mangled_name"): diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 67d0b03ea..0d1bbd3c6 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -9,6 +9,7 @@ import io import os import re +import copy import uuid import codecs import logging @@ -26,7 +27,7 @@ # https://github.com/python/mypy/issues/1153 from backports.functools_lru_cache import lru_cache # type: ignore -from typing import Any, Set, Dict, List, Tuple, Union, Callable, Iterator, Optional +from typing import Any, Set, Dict, List, Tuple, Union, Callable, Iterator, Optional, cast from dataclasses import asdict, dataclass import yaml @@ -1365,32 +1366,53 @@ def __init__( rules = capa.optimizer.optimize_rules(rules) - self.file_rules = self._get_rules_for_scope(rules, Scope.FILE) - self.process_rules = self._get_rules_for_scope(rules, Scope.PROCESS) - self.thread_rules = self._get_rules_for_scope(rules, Scope.THREAD) - self.call_rules = self._get_rules_for_scope(rules, Scope.CALL) - self.function_rules = self._get_rules_for_scope(rules, Scope.FUNCTION) - self.basic_block_rules = self._get_rules_for_scope(rules, Scope.BASIC_BLOCK) - self.instruction_rules = self._get_rules_for_scope(rules, Scope.INSTRUCTION) + scopes = ( + Scope.CALL, + Scope.THREAD, + Scope.PROCESS, + Scope.INSTRUCTION, + Scope.BASIC_BLOCK, + Scope.FUNCTION, + Scope.FILE, + ) + self.rules = {rule.name: rule for rule in rules} self.rules_by_namespace = index_rules_by_namespace(rules) + self.rules_by_scope = {scope: self._get_rules_for_scope(rules, scope) for scope in scopes} - # unstable - (self._easy_file_rules_by_feature, self._hard_file_rules) = self._index_rules_by_feature(self.file_rules) - (self._easy_process_rules_by_feature, self._hard_process_rules) = self._index_rules_by_feature( - self.process_rules - ) - (self._easy_thread_rules_by_feature, self._hard_thread_rules) = self._index_rules_by_feature(self.thread_rules) - (self._easy_call_rules_by_feature, self._hard_call_rules) = self._index_rules_by_feature(self.call_rules) - (self._easy_function_rules_by_feature, self._hard_function_rules) = self._index_rules_by_feature( - self.function_rules - ) - (self._easy_basic_block_rules_by_feature, self._hard_basic_block_rules) = self._index_rules_by_feature( - self.basic_block_rules - ) - (self._easy_instruction_rules_by_feature, self._hard_instruction_rules) = self._index_rules_by_feature( - self.instruction_rules - ) + # these structures are unstable and may change before the next major release. + scores_by_rule: Dict[str, int] = {} + self._feature_indexes_by_scopes = { + scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope], scores_by_rule) for scope in scopes + } + + @property + def file_rules(self): + return self.rules_by_scope[Scope.FILE] + + @property + def process_rules(self): + return self.rules_by_scope[Scope.PROCESS] + + @property + def thread_rules(self): + return self.rules_by_scope[Scope.THREAD] + + @property + def call_rules(self): + return self.rules_by_scope[Scope.CALL] + + @property + def function_rules(self): + return self.rules_by_scope[Scope.FUNCTION] + + @property + def basic_block_rules(self): + return self.rules_by_scope[Scope.BASIC_BLOCK] + + @property + def instruction_rules(self): + return self.rules_by_scope[Scope.INSTRUCTION] def __len__(self): return len(self.rules) @@ -1401,154 +1423,358 @@ def __getitem__(self, rulename): def __contains__(self, rulename): return rulename in self.rules + # this routine is unstable and may change before the next major release. @staticmethod - def _index_rules_by_feature(rules) -> Tuple[Dict[Feature, Set[str]], List[str]]: + def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Feature) -> int: """ - split the given rules into two structures: - - "easy rules" are indexed by feature, - such that you can quickly find the rules that contain a given feature. - - "hard rules" are those that contain substring/regex/bytes features or match statements. - these continue to be ordered topologically. + Score the given feature by how "uncommon" we think it will be. + Features that we expect to be very selective (ie. uniquely identify a rule and be required to match), + or "uncommon", should get a high score. + Features that are not good for indexing will have a low score, or 0. - a rule evaluator can use the "easy rule" index to restrict the - candidate rules that might match a given set of features. + The range of values doesn't really matter, but here we use 0-10, where + - 10 is very uncommon, very selective, good for indexing a rule, and + - 0 is a very common, not selective, bad for indexing a rule. - at this time, a rule evaluator can't do anything special with - the "hard rules". it must still do a full top-down match of each - rule, in topological order. + You shouldn't try to interpret the scores, beyond to compare features to pick one or the other. - this does not index global features, because these are not selective, and - won't be used as the sole feature used to match. + Today, these scores are assigned manually, by the capa devs, who use their intuition and experience. + We *could* do a large scale analysis of all features emitted by capa across many samples to + make this more data driven. If the current approach doesn't work well, consider that. """ - # we'll do a couple phases: # - # 1. recursively visit all nodes in all rules, - # a. indexing all features - # b. recording the types of features found per rule - # 2. compute the easy and hard rule sets - # 3. remove hard rules from the rules-by-feature index - # 4. construct the topologically ordered list of hard rules - rules_with_easy_features: Set[str] = set() - rules_with_hard_features: Set[str] = set() + # Today, these scores are manually assigned by intuition/experience/guesswork. + # We could do a large-scale feature collection and use the results to assign scores. + # + + if isinstance( + node, + capa.features.common.MatchedRule, + ): + # The other rule must match before this one, in same scope or smaller. + # Because we process the rules small->large scope and topologically, + # then we can rely on dependencies being processed first. + # + # If logic changes and you see issues here, ensure that `scores_by_rule` is correctly provided. + rule_name = node.value + assert isinstance(rule_name, str) + + if rule_name not in scores_by_rule: + # Its possible that we haven't scored the rule that is being requested here. + # This means that it won't ever match (because it won't be evaluated before this one). + # Still, we need to provide a default value here. + # So we give it 9, because it won't match, so its very selective. + # + # But how could this dependency not exist? + # Consider a rule that supports both static and dynamic analysis, but also has + # a `instruction: ` block. This block gets translated into a derived rule that only + # matches in static mode. Therefore, when the parent rule is run in dynamic mode, it + # won't be able to find the derived rule. This is the case we have to handle here. + # + # A better solution would be to prune this logic based on static/dynamic mode, but + # that takes more work and isn't in scope of this feature. + # + # See discussion in: https://github.com/mandiant/capa/pull/2080/#discussion_r1624783396 + return 9 + + return scores_by_rule[rule_name] + + elif isinstance(node, (capa.features.insn.Number, capa.features.insn.OperandNumber)): + v = node.value + assert isinstance(v, int) + + if -0x8000 <= v <= 0x8000: + # Small numbers are probably pretty common, like structure offsets, etc. + return 3 + + if 0xFFFF_FF00 <= v <= 0xFFFF_FFFF: + # Numbers close to u32::max_int are also probably pretty common, + # like signed numbers close to 0 that are stored as unsigned ints. + return 3 + + if 0xFFFF_FFFF_FFFF_FF00 <= v <= 0xFFFF_FFFF_FFFF_FFFF: + # Like signed numbers closed to 0 that are stored as unsigned long ints. + return 3 + + # Other numbers are assumed to be uncommon. + return 7 + + elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes)): + # Scanning features (non-hashable), which we can't use for quick matching/filtering. + return 0 + + C = node.__class__ + return { + # The range of values doesn't really matter, but here we use 0-10, where + # - 10 is very uncommon, very selective, good for indexing a rule, and + # - 0 is a very common, not selective, bad for indexing a rule. + # + # You shouldn't try to interpret the scores, beyond to compare features to pick one or the other. + # ----------------------------------------------------------------- + # + # Very uncommon features that are probably very selective in capa's domain. + # When possible, we want rules to be indexed by these features. + # + capa.features.common.String: 9, + capa.features.insn.API: 8, + capa.features.file.Export: 7, + # "uncommon numbers": 7 (placeholder for logic above) + # + # ----------------------------------------------------------------- + # + # Features that are probably somewhat common, and/or rarely used within capa. + # Its ok to index rules by these. + # + capa.features.common.Class: 5, + capa.features.common.Namespace: 5, + capa.features.insn.Property: 5, + capa.features.file.Import: 5, + capa.features.file.Section: 5, + capa.features.file.FunctionName: 5, + # + # ----------------------------------------------------------------- + # + # Features that are pretty common and we'd prefer not to index, but can if we have to. + # + capa.features.common.Characteristic: 4, + capa.features.insn.Offset: 4, + capa.features.insn.OperandOffset: 4, + # "common numbers": 3 (placeholder for logic above) + # + # ----------------------------------------------------------------- + # + # Very common features, which we'd only prefer instead of non-hashable features, like Regex/Substring/Bytes. + # + capa.features.insn.Mnemonic: 2, + capa.features.basicblock.BasicBlock: 1, + # + # + # We don't *want* to index global features because they're not very selective. + # They also don't usually stand on their own - there's always some other logic. + # + capa.features.common.OS: 0, + capa.features.common.Arch: 0, + capa.features.common.Format: 0, + # ----------------------------------------------------------------- + # + # Non-hashable features, which will require a scan to evaluate, and are therefore quite expensive. + # + # substring: 0 (placeholder for logic above) + # regex: 0 (placeholder for logic above) + # bytes: 0 (placeholder for logic above) + }[C] + + # this class is unstable and may change before the next major release. + @dataclass + class _RuleFeatureIndex: + # Mapping from hashable feature to a list of rules that might have this feature. + rules_by_feature: Dict[Feature, Set[str]] + # Mapping from rule name to list of Regex/Substring features that have to match. + # All these features will be evaluated whenever a String feature is encountered. + string_rules: Dict[str, List[Feature]] + # Mapping from rule name to list of Bytes features that have to match. + # All these features will be evaluated whenever a Bytes feature is encountered. + bytes_rules: Dict[str, List[Feature]] + + # this routine is unstable and may change before the next major release. + @staticmethod + def _index_rules_by_feature(scope: Scope, rules: List[Rule], scores_by_rule: Dict[str, int]) -> _RuleFeatureIndex: + """ + Index the given rules by their minimal set of most "uncommon" features required to match. + + If absolutely necessary, provide the Regex/Substring/Bytes features + (which are not hashable and require a scan) that have to match, too. + """ + rules_by_feature: Dict[Feature, Set[str]] = collections.defaultdict(set) - def rec(rule_name: str, node: Union[Feature, Statement]): + def rec( + rule_name: str, + node: Union[Feature, Statement], + ) -> Optional[Tuple[int, Set[Feature]]]: """ - walk through a rule's logic tree, indexing the easy and hard rules, - and the features referenced by easy rules. + Walk through a rule's logic tree, picking the features to use for indexing, + returning the feature and an associated score. + The higher the score, the more selective the feature is expected to be. + The score is only used internally, to pick the best feature from within AND blocks. + + Note closure over `scores_by_rule`. """ - if isinstance( - node, - ( - # these are the "hard features" - # substring: scanning feature - capa.features.common.Substring, - # regex: scanning feature - capa.features.common.Regex, - # bytes: scanning feature - capa.features.common.Bytes, - # match: dependency on another rule, - # which we have to evaluate first, - # and is therefore tricky. - capa.features.common.MatchedRule, - ), - ): - # hard feature: requires scan or match lookup - rules_with_hard_features.add(rule_name) - elif isinstance(node, capa.features.common.Feature): - if capa.features.common.is_global_feature(node): - # we don't want to index global features - # because they're not very selective. - # - # they're global, so if they match at one location in a file, - # they'll match at every location in a file. - # so that's not helpful to decide how to downselect. - # - # and, a global rule will never be the sole selector in a rule. - pass - else: - # easy feature: hash lookup - rules_with_easy_features.add(rule_name) - rules_by_feature[node].add(rule_name) - elif isinstance(node, (ceng.Not)): - # `not:` statements are tricky to deal with. - # - # first, features found under a `not:` should not be indexed, - # because they're not wanted to be found. - # second, `not:` can be nested under another `not:`, or two, etc. - # third, `not:` at the root or directly under an `or:` - # means the rule will match against *anything* not specified there, - # which is a difficult set of things to compute and index. + + if isinstance(node, (ceng.Not)): + # We don't index features within NOT blocks, because we're only looking for + # features that should be present. # - # so, if a rule has a `not:` statement, its hard. - # as of writing, this is an uncommon statement, with only 6 instances in 740 rules. - rules_with_hard_features.add(rule_name) + # Technically we could have a rule that does `not: not: foo` and we'd want to + # index `foo`. But this is not seen today. + return None + elif isinstance(node, (ceng.Some)) and node.count == 0: - # `optional:` and `0 or more:` are tricky to deal with. - # - # when a subtree is optional, it may match, but not matching + # When a subtree is optional, it may match, but not matching # doesn't have any impact either. - # now, our rule authors *should* not put this under `or:` + # Now, our rule authors *should* not put this under `or:` # and this is checked by the linter, - # but this could still happen (e.g. private rule set without linting) - # and would be hard to trace down. - # - # so better to be safe than sorry and consider this a hard case. - rules_with_hard_features.add(rule_name) - elif isinstance(node, (ceng.Range)) and node.min == 0: - # `count(foo): 0 or more` are tricky to deal with. - # because the min is 0, - # this subtree *can* match just about any feature - # (except the given one) - # which is a difficult set of things to compute and index. - rules_with_hard_features.add(rule_name) + return None + + elif isinstance(node, (ceng.Range)) and node.min == 0 and node.max != 0: + # `count(foo): 0 or more` is just like an optional block, + # because the min is 0, this subtree *can* match just about any feature. + return None + + elif isinstance(node, (ceng.Range)) and node.min == 0 and node.max == 0: + # `count(foo): 0` is like a not block, which we don't index. + return None + + elif isinstance(node, capa.features.common.Feature): + return (RuleSet._score_feature(scores_by_rule, node), {node}) + elif isinstance(node, (ceng.Range)): - rec(rule_name, node.child) - elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)): + # feature is found N times + return rec(rule_name, node.child) + + elif isinstance(node, ceng.And): + # When evaluating an AND block, all of the children need to match. + # + # So when we index rules, we want to pick the most uncommon feature(s) + # for each AND block. If the AND block matches, that feature must be there. + # We recursively explore children, computing their + # score, and pick the child with the greatest score. + # + # For example, given the rule: + # + # and: + # - mnemonic: mov + # - api: CreateFile + # + # we prefer to pick `api: CreateFile` because we expect it to be more uncommon. + # + # Note that the children nodes might be complex, like: + # + # and: + # - mnemonic: mov + # - or: + # - api: CreateFile + # - api: DeleteFile + # + # In this case, we prefer to pick the pair of API features since each is expected + # to be more common than the mnemonic. + scores: List[Tuple[int, Set[Feature]]] = [] for child in node.children: - rec(rule_name, child) - elif isinstance(node, ceng.Statement): - # unhandled type of statement. - # this should only happen if a new subtype of `Statement` - # has since been added to capa. + score = rec(rule_name, child) + + if not score: + # maybe an optional block or similar + continue + + scores.append(score) + + # otherwise we can't index this rule + assert len(scores) > 0 + + def and_score_key(item): + # order by score, then fewest number of features. + score, features = item + return (score, -len(features)) + + scores.sort(key=and_score_key, reverse=True) + + # pick the best feature + return scores[0] + + elif isinstance(node, (ceng.Or, ceng.Some)): + # When evaluating an OR block, any of the children need to match. + # It could be any of them, so we can't decide to only index some of them. # - # ideally, we'd like to use mypy for exhaustiveness checking - # for all the subtypes of `Statement`. - # but, as far as i can tell, mypy does not support this type - # of checking. + # For example, given the rule: # - # in a way, this makes some intuitive sense: - # the set of subtypes of type A is unbounded, - # because any user might come along and create a new subtype B, - # so mypy can't reason about this set of types. - assert_never(node) + # or: + # - mnemonic: mov + # - api: CreateFile + # + # we have to pick both `mnemonic` and `api` features. + # + # Note that the children nodes might be complex, like: + # + # or: + # - mnemonic: mov + # - and: + # - api: CreateFile + # - api: DeleteFile + # + # In this case, we have to pick both the `mnemonic` and one of the `api` features. + # + # When computing the score of an OR branch, we have to use the min value encountered. + # While many of the children might be very specific, there might be a branch that is common + # and we need to handle that correctly. + min_score = 10000000 # assume this is larger than any score + features = set() + + for child in node.children: + item = rec(rule_name, child) + assert item is not None, "can't index OR branch" + + _score, _features = item + min_score = min(min_score, _score) + features.update(_features) + + return min_score, features + else: # programming error assert_never(node) + # These are the Regex/Substring/Bytes features that we have to use for filtering. + # Ideally we find a way to get rid of all of these, eventually. + string_rules: Dict[str, List[Feature]] = {} + bytes_rules: Dict[str, List[Feature]] = {} + for rule in rules: rule_name = rule.meta["name"] + root = rule.statement - rec(rule_name, root) + item = rec(rule_name, root) + assert item is not None + score, features = item + + string_features = [ + feature + for feature in features + if isinstance(feature, (capa.features.common.Substring, capa.features.common.Regex)) + ] + bytes_features = [feature for feature in features if isinstance(feature, capa.features.common.Bytes)] + hashable_features = [ + feature + for feature in features + if not isinstance( + feature, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes) + ) + ] - # if a rule has a hard feature, - # don't consider it easy, and therefore, - # don't index any of its features. - # - # otherwise, its an easy rule, and index its features - for rules_with_feature in rules_by_feature.values(): - rules_with_feature.difference_update(rules_with_hard_features) - easy_rules_by_feature = rules_by_feature - - # `rules` is already topologically ordered, - # so extract our hard set into the topological ordering. - hard_rules = [] - for rule in rules: - if rule.meta["name"] in rules_with_hard_features: - hard_rules.append(rule.meta["name"]) + logger.debug("indexing: features: %d, score: %d, rule: %s", len(features), score, rule_name) + scores_by_rule[rule_name] = score + for feature in features: + logger.debug(" : [%d] %s", RuleSet._score_feature(scores_by_rule, feature), feature) + + if string_features: + string_rules[rule_name] = cast(List[Feature], string_features) - return (easy_rules_by_feature, hard_rules) + if bytes_features: + bytes_rules[rule_name] = cast(List[Feature], bytes_features) + + for feature in hashable_features: + rules_by_feature[feature].add(rule_name) + + logger.debug("indexing: %d features indexed for scope %s", len(rules_by_feature), scope) + logger.debug( + "indexing: %d indexed features are shared by more than 3 rules", + len([feature for feature, rules in rules_by_feature.items() if len(rules) > 3]), + ) + logger.debug( + "indexing: %d scanning string features, %d scanning bytes features", len(string_rules), len(bytes_rules) + ) + + return RuleSet._RuleFeatureIndex(rules_by_feature, string_rules, bytes_rules) @staticmethod def _get_rules_for_scope(rules, scope) -> List[Rule]: @@ -1618,80 +1844,226 @@ def filter_rules_by_meta(self, tag: str) -> "RuleSet": break return RuleSet(list(rules_filtered)) - def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[FeatureSet, ceng.MatchResults]: + # this routine is unstable and may change before the next major release. + @staticmethod + def _sort_rules_by_index(rule_index_by_rule_name: Dict[str, int], rules: List[Rule]): + """ + Sort (in place) the given rules by their index provided by the given Dict. + This mapping is intended to represent the topologic index of the given rule; + that is, rules with a lower index should be evaluated first, since their dependencies + will be evaluated later. """ - match rules from this ruleset at the given scope against the given features. + rules.sort(key=lambda r: rule_index_by_rule_name[r.name]) - this routine should act just like `capa.engine.match`, - except that it may be more performant. + def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[FeatureSet, ceng.MatchResults]: """ - easy_rules_by_feature = {} - if scope == Scope.FILE: - easy_rules_by_feature = self._easy_file_rules_by_feature - hard_rule_names = self._hard_file_rules - elif scope == Scope.PROCESS: - easy_rules_by_feature = self._easy_process_rules_by_feature - hard_rule_names = self._hard_process_rules - elif scope == Scope.THREAD: - easy_rules_by_feature = self._easy_thread_rules_by_feature - hard_rule_names = self._hard_thread_rules - elif scope == Scope.CALL: - easy_rules_by_feature = self._easy_call_rules_by_feature - hard_rule_names = self._hard_call_rules - elif scope == Scope.FUNCTION: - easy_rules_by_feature = self._easy_function_rules_by_feature - hard_rule_names = self._hard_function_rules - elif scope == Scope.BASIC_BLOCK: - easy_rules_by_feature = self._easy_basic_block_rules_by_feature - hard_rule_names = self._hard_basic_block_rules - elif scope == Scope.INSTRUCTION: - easy_rules_by_feature = self._easy_instruction_rules_by_feature - hard_rule_names = self._hard_instruction_rules - else: - assert_never(scope) + Match rules from this ruleset at the given scope against the given features. - candidate_rule_names = set() + This routine should act just like `capa.engine.match`, except that it may be more performant. + It uses its knowledge of all the rules to evaluate a minimal set of candidate rules for the given features. + """ + + feature_index: RuleSet._RuleFeatureIndex = self._feature_indexes_by_scopes[scope] + rules: List[Rule] = self.rules_by_scope[scope] + # Topologic location of rule given its name. + # That is, rules with a lower index should be evaluated first, since their dependencies + # will be evaluated later. + rule_index_by_rule_name = {rule.name: i for i, rule in enumerate(rules)} + + # This algorithm is optimized to evaluate as few rules as possible, + # because the less work we do, the faster capa can run. + # + # It relies on the observation that most rules don't match, + # and that most rules have an uncommon feature that *must* be present for the rule to match. + # + # Therefore, we record which uncommon feature(s) is required for each rule to match, + # and then only inspect these few candidates when a feature is seen in some scope. + # Ultimately, the exact same rules are matched with precisely the same results, + # its just done faster, because we ignore most of the rules that never would have matched anyways. + # + # In `_index_rules_by_feature`, we do the hard work of computing the minimal set of + # uncommon features for each rule. While its a little expensive, its a single pass + # that gets reused at every scope instance (read: thousands or millions of times). + # + # In the current routine, we collect all the rules that might match, given the presence + # of any uncommon feature. We sort the rules topographically, so that rule dependencies work out, + # and then we evaluate the candidate rules. In practice, this saves 20-50x the work! + # + # Recall that some features cannot be matched quickly via hash lookup: Regex, Bytes, etc. + # When these features are the uncommon features used to filter rules, we have to evaluate the + # feature frequently whenever a string/bytes feature is encountered. Its slow, but we can't + # get around it. Reducing our reliance on regex/bytes feature and/or finding a way to + # index these can futher improve performance. + # + # See the corresponding unstable tests in `test_match.py::test_index_features_*`. + + # Find all the rules that could match the given feature set. + # Ideally we want this set to be as small and focused as possible, + # and we can tune it by tweaking `_index_rules_by_feature`. + candidate_rule_names: Set[str] = set() for feature in features: - easy_rule_names = easy_rules_by_feature.get(feature) - if easy_rule_names: - candidate_rule_names.update(easy_rule_names) + candidate_rule_names.update(feature_index.rules_by_feature.get(feature, ())) - # first, match against the set of rules that have at least one - # feature shared with our feature set. + # Some rules rely totally on regex features, like the HTTP User-Agent rules. + # In these cases, when we encounter any string feature, we have to scan those + # regexes to find the candidate rules. + # As mentioned above, this is not good for performance, but its required for correctness. + # + # We may want to try to pre-evaluate these strings, based on their presence in the file, + # to reduce the number of evaluations we do here. + # See: https://github.com/mandiant/capa/issues/2063#issuecomment-2095639672 + # + # We may also want to specialize case-insensitive strings, which would enable them to + # be indexed, and therefore skip the scanning here, improving performance. + # This strategy is described here: + # https://github.com/mandiant/capa/issues/2063#issuecomment-2107083068 + if feature_index.string_rules: + + # This is a FeatureSet that contains only String features. + # Since we'll only be evaluating String/Regex features below, we don't care about + # other sorts of features (Mnemonic, Number, etc.) and therefore can save some time + # during evaluation. + # + # Specifically, we can address the issue described here: + # https://github.com/mandiant/capa/issues/2063#issuecomment-2095397884 + # That we spend a lot of time collecting String instances within `Regex.evaluate`. + # We don't have to address that issue further as long as we pre-filter the features here. + string_features: FeatureSet = {} + for feature, locations in features.items(): + if isinstance(feature, capa.features.common.String): + string_features[feature] = locations + + if string_features: + for rule_name, wanted_strings in feature_index.string_rules.items(): + for wanted_string in wanted_strings: + if wanted_string.evaluate(string_features): + candidate_rule_names.add(rule_name) + + # Like with String/Regex features above, we have to scan for Bytes to find candidate rules. + # + # We may want to index bytes when they have a common length, like 16 or 32. + # This would help us avoid the scanning here, which would improve performance. + # The strategy is described here: + # https://github.com/mandiant/capa/issues/2063#issuecomment-2107052190 + if feature_index.bytes_rules: + bytes_features: FeatureSet = {} + for feature, locations in features.items(): + if isinstance(feature, capa.features.common.Bytes): + bytes_features[feature] = locations + + if bytes_features: + for rule_name, wanted_bytess in feature_index.bytes_rules.items(): + for wanted_bytes in wanted_bytess: + if wanted_bytes.evaluate(bytes_features): + candidate_rule_names.add(rule_name) + + # No rules can possibly match, so quickly return. + if not candidate_rule_names: + return (features, {}) + + # Here are the candidate rules (before we just had their names). candidate_rules = [self.rules[name] for name in candidate_rule_names] - features2, easy_matches = ceng.match(candidate_rules, features, addr) - # note that we've stored the updated feature set in `features2`. - # this contains a superset of the features in `features`; - # it contains additional features for any easy rule matches. - # we'll pass this feature set to hard rule matching, since one - # of those rules might rely on an easy rule match. + # Order rules topologically, so that rules with dependencies work correctly. + RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules) + # - # the updated feature set from hard matching will go into `features3`. - # this is a superset of `features2` is a superset of `features`. - # ultimately, this is what we'll return to the caller. + # The following is derived from ceng.match + # extended to interact with candidate_rules upon rule match. # - # in each case, we could have assigned the updated feature set back to `features`, - # but this is slightly more explicit how we're tracking the data. - - # now, match against (topologically ordered) list of rules - # that we can't really make any guesses about. - # these are rules with hard features, like substring/regex/bytes and match statements. - hard_rules = [self.rules[name] for name in hard_rule_names] - features3, hard_matches = ceng.match(hard_rules, features2, addr) - - # note that above, we probably are skipping matching a bunch of - # rules that definitely would never hit. - # specifically, "easy rules" that don't share any features with - # feature set. - - # MatchResults doesn't technically have an .update() method - # but a dict does. - matches = {} # type: ignore - matches.update(easy_matches) - matches.update(hard_matches) - - return (features3, matches) + + results: ceng.MatchResults = collections.defaultdict(list) + + # If we match a rule, then we'll add a MatchedRule to the features that will be returned, + # but we want to do that in a copy. We'll lazily create the copy below, once a match has + # actually been found. + augmented_features = features + + while candidate_rules: + rule = candidate_rules.pop(0) + res = rule.evaluate(augmented_features, short_circuit=True) + if res: + # we first matched the rule with short circuiting enabled. + # this is much faster than without short circuiting. + # however, we want to collect all results thoroughly, + # so once we've found a match quickly, + # go back and capture results without short circuiting. + res = rule.evaluate(augmented_features, short_circuit=False) + + # sanity check + assert bool(res) is True + + results[rule.name].append((addr, res)) + # We need to update the current features because subsequent iterations may use newly added features, + # such as rule or namespace matches. + if augmented_features is features: + # lazily create the copy of features only when a rule matches, since it could be expensive. + augmented_features = collections.defaultdict(set, copy.copy(features)) + + ceng.index_rule_matches(augmented_features, rule, [addr]) + + # Its possible that we're relying on a MatchedRule (or namespace) feature to be the + # uncommon feature used to filter other rules. So, extend the candidate + # rules with any of these dependencies. If we find any, also ensure they're + # evaluated in the correct topologic order, so that further dependencies work. + new_features = [capa.features.common.MatchedRule(rule.name)] + for namespace in ceng.get_rule_namespaces(rule): + new_features.append(capa.features.common.MatchedRule(namespace)) + + if new_features: + new_candidates: List[str] = [] + for new_feature in new_features: + new_candidates.extend(feature_index.rules_by_feature.get(new_feature, ())) + + if new_candidates: + candidate_rule_names.update(new_candidates) + candidate_rules.extend([self.rules[rule_name] for rule_name in new_candidates]) + RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules) + + return (augmented_features, results) + + def match( + self, scope: Scope, features: FeatureSet, addr: Address, paranoid=False + ) -> Tuple[FeatureSet, ceng.MatchResults]: + """ + Match rules from this ruleset at the given scope against the given features. + + This wrapper around _match exists so that we can assert it matches precisely + the same as `capa.engine.match`, just faster. + + This matcher does not handle some edge cases: + - top level NOT statements + - also top level counted features with zero occurances, like: `count(menmonic(mov)): 0` + - nested NOT statements (NOT: NOT: foo) + + We should discourage/forbid these constructs from our rules and add lints for them. + TODO(williballenthin): add lints for logic edge cases + + Args: + paranoid: when true, demonstrate that the naive matcher agrees with this + optimized matcher (much slower! around 10x slower). + """ + features, matches = self._match(scope, features, addr) + + if paranoid: + rules: List[Rule] = self.rules_by_scope[scope] + paranoid_features, paranoid_matches = capa.engine.match(rules, features, addr) + + if features != paranoid_features: + logger.warning("paranoid: %s: %s", scope, addr) + for feature in sorted(set(features.keys()) & set(paranoid_features.keys())): + logger.warning("paranoid: %s", feature) + + for feature in sorted(set(features.keys()) - set(paranoid_features.keys())): + logger.warning("paranoid: + %s", feature) + + for feature in sorted(set(paranoid_features.keys()) - set(features.keys())): + logger.warning("paranoid: - %s", feature) + + assert features == paranoid_features + assert set(matches.keys()) == set(paranoid_matches.keys()) + + return features, matches def is_nursery_rule_path(path: Path) -> bool: diff --git a/tests/test_match.py b/tests/test_match.py index 07af33d78..d621853c7 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -5,9 +5,10 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. - import textwrap +import pytest + import capa.rules import capa.engine import capa.features.insn @@ -130,22 +131,29 @@ def test_match_range_exact_zero(): static: function dynamic: process features: - - count(number(100)): 0 + - and: + - count(number(100)): 0 + + # we can't have `count(foo): 0` at the top level, + # since we don't support top level NOT statements. + # so we have this additional trivial feature. + - mnemonic: mov + """ ) r = capa.rules.Rule.from_yaml(rule) # feature isn't indexed - good. - _, matches = match([r], {}, 0x0) + _, matches = match([r], {capa.features.insn.Mnemonic("mov"): {}}, 0x0) assert "test rule" in matches # feature is indexed, but no matches. # i don't think we should ever really have this case, but good to check anyways. - _, matches = match([r], {capa.features.insn.Number(100): {}}, 0x0) + _, matches = match([r], {capa.features.insn.Number(100): {}, capa.features.insn.Mnemonic("mov"): {}}, 0x0) assert "test rule" in matches # too many matches - _, matches = match([r], {capa.features.insn.Number(100): {1}}, 0x0) + _, matches = match([r], {capa.features.insn.Number(100): {1}, capa.features.insn.Mnemonic("mov"): {1}}, 0x0) assert "test rule" not in matches @@ -159,21 +167,27 @@ def test_match_range_with_zero(): static: function dynamic: process features: - - count(number(100)): (0, 1) + - and: + - count(number(100)): (0, 1) + + # we can't have `count(foo): 0` at the top level, + # since we don't support top level NOT statements. + # so we have this additional trivial feature. + - mnemonic: mov """ ) r = capa.rules.Rule.from_yaml(rule) # ok - _, matches = match([r], {}, 0x0) + _, matches = match([r], {capa.features.insn.Mnemonic("mov"): {}}, 0x0) assert "test rule" in matches - _, matches = match([r], {capa.features.insn.Number(100): {}}, 0x0) + _, matches = match([r], {capa.features.insn.Number(100): {}, capa.features.insn.Mnemonic("mov"): {}}, 0x0) assert "test rule" in matches - _, matches = match([r], {capa.features.insn.Number(100): {1}}, 0x0) + _, matches = match([r], {capa.features.insn.Number(100): {1}, capa.features.insn.Mnemonic("mov"): {1}}, 0x0) assert "test rule" in matches # too many matches - _, matches = match([r], {capa.features.insn.Number(100): {1, 2}}, 0x0) + _, matches = match([r], {capa.features.insn.Number(100): {1, 2}, capa.features.insn.Mnemonic("mov"): {1, 2}}, 0x0) assert "test rule" not in matches @@ -551,7 +565,8 @@ def test_match_regex_values_always_string(): assert capa.features.common.MatchedRule("test rule") in features -def test_match_not(): +@pytest.mark.xfail(reason="can't have top level NOT") +def test_match_only_not(): rule = textwrap.dedent( """ rule: @@ -572,6 +587,30 @@ def test_match_not(): assert "test rule" in matches +def test_match_not(): + rule = textwrap.dedent( + """ + rule: + meta: + name: test rule + scopes: + static: function + dynamic: process + namespace: testns1/testns2 + features: + - and: + - mnemonic: mov + - not: + - number: 99 + """ + ) + r = capa.rules.Rule.from_yaml(rule) + + _, matches = match([r], {capa.features.insn.Number(100): {1, 2}, capa.features.insn.Mnemonic("mov"): {1, 2}}, 0x0) + assert "test rule" in matches + + +@pytest.mark.xfail(reason="can't have nested NOT") def test_match_not_not(): rule = textwrap.dedent( """ @@ -749,3 +788,95 @@ def test_match_os_any(): 0x0, ) assert "test rule" in matches + + +# this test demonstrates the behavior of unstable features that may change before the next major release. +def test_index_features_and_unstable(): + rule = textwrap.dedent( + """ + rule: + meta: + name: test rule + scopes: + static: function + dynamic: process + features: + - and: + - mnemonic: mov + - api: CreateFileW + """ + ) + r = capa.rules.Rule.from_yaml(rule) + rr = capa.rules.RuleSet([r]) + index: capa.rules.RuleSet._RuleFeatureIndex = rr._feature_indexes_by_scopes[capa.rules.Scope.FUNCTION] + + # there's a single rule, and its indexed by a single feature + assert len(index.rules_by_feature) == 1 + # and we index by the more uncommon API feature, not the common mnemonic feature + assert capa.features.insn.API("CreateFileW") in index.rules_by_feature + + assert not index.string_rules + assert not index.bytes_rules + + +# this test demonstrates the behavior of unstable features that may change before the next major release. +def test_index_features_or_unstable(): + rule = textwrap.dedent( + """ + rule: + meta: + name: test rule + scopes: + static: function + dynamic: process + features: + - or: + - mnemonic: mov + - api: CreateFileW + """ + ) + r = capa.rules.Rule.from_yaml(rule) + rr = capa.rules.RuleSet([r]) + index: capa.rules.RuleSet._RuleFeatureIndex = rr._feature_indexes_by_scopes[capa.rules.Scope.FUNCTION] + + # there's a single rule, and its indexed by both features, + # because they fall under the single root OR node. + assert len(index.rules_by_feature) == 2 + assert capa.features.insn.API("CreateFileW") in index.rules_by_feature + assert capa.features.insn.Mnemonic("mov") in index.rules_by_feature + + assert not index.string_rules + assert not index.bytes_rules + + +# this test demonstrates the behavior of unstable features that may change before the next major release. +def test_index_features_nested_unstable(): + rule = textwrap.dedent( + """ + rule: + meta: + name: test rule + scopes: + static: function + dynamic: process + features: + - and: + - mnemonic: mov + - or: + - api: CreateFileW + - string: foo + """ + ) + r = capa.rules.Rule.from_yaml(rule) + rr = capa.rules.RuleSet([r]) + index: capa.rules.RuleSet._RuleFeatureIndex = rr._feature_indexes_by_scopes[capa.rules.Scope.FUNCTION] + + # there's a single rule, and its indexed by the two uncommon features, + # not the single common feature. + assert len(index.rules_by_feature) == 2 + assert capa.features.insn.API("CreateFileW") in index.rules_by_feature + assert capa.features.common.String("foo") in index.rules_by_feature + assert capa.features.insn.Mnemonic("mov") not in index.rules_by_feature + + assert not index.string_rules + assert not index.bytes_rules