From 8b0076bfcea16ddfb249b497929e6e426b3adcf5 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 14 May 2024 21:41:18 +0200 Subject: [PATCH 01/28] features: mark format as a global feature --- capa/features/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/features/common.py b/capa/features/common.py index 279332e6e..b817fafb9 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -490,6 +490,6 @@ def __init__(self, value: str, description=None): def is_global_feature(feature): """ is this a feature that is extracted at every scope? - today, these are OS and arch features. + today, these are OS, arch, and format features. """ - return isinstance(feature, (OS, Arch)) + return isinstance(feature, (OS, Arch, Format)) From 8858537af8936c836b67a91b5922d360363ddb0d Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 14 May 2024 21:41:33 +0200 Subject: [PATCH 02/28] pep8 --- capa/features/extractors/binexport2/insn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 9b027ec88..1a6a99b21 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -14,7 +14,7 @@ from capa.features.insn import API, Number, Mnemonic, OperandNumber from capa.features.common import Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext, AnalysisContext +from capa.features.extractors.binexport2 import AnalysisContext, FunctionContext, ReadMemoryError, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 From 9c0c66245e9aaee8f82cbe92869de4fba1549712 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin 
Date: Tue, 14 May 2024 21:41:40 +0200 Subject: [PATCH 03/28] rules: optimize rule pre-filtering, first revision --- capa/rules/__init__.py | 528 ++++++++++++++++++++++++++--------------- 1 file changed, 337 insertions(+), 191 deletions(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 67d0b03ea..43dcb7665 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -9,11 +9,13 @@ import io import os import re +import copy import uuid import codecs import logging import binascii import collections +import dataclasses from enum import Enum from pathlib import Path @@ -1365,32 +1367,52 @@ def __init__( rules = capa.optimizer.optimize_rules(rules) - self.file_rules = self._get_rules_for_scope(rules, Scope.FILE) - self.process_rules = self._get_rules_for_scope(rules, Scope.PROCESS) - self.thread_rules = self._get_rules_for_scope(rules, Scope.THREAD) - self.call_rules = self._get_rules_for_scope(rules, Scope.CALL) - self.function_rules = self._get_rules_for_scope(rules, Scope.FUNCTION) - self.basic_block_rules = self._get_rules_for_scope(rules, Scope.BASIC_BLOCK) - self.instruction_rules = self._get_rules_for_scope(rules, Scope.INSTRUCTION) + scopes = ( + Scope.FILE, + Scope.PROCESS, + Scope.THREAD, + Scope.CALL, + Scope.FUNCTION, + Scope.BASIC_BLOCK, + Scope.INSTRUCTION, + ) + self.rules = {rule.name: rule for rule in rules} self.rules_by_namespace = index_rules_by_namespace(rules) + self.rules_by_scope = {scope: self._get_rules_for_scope(rules, scope) for scope in scopes} # unstable - (self._easy_file_rules_by_feature, self._hard_file_rules) = self._index_rules_by_feature(self.file_rules) - (self._easy_process_rules_by_feature, self._hard_process_rules) = self._index_rules_by_feature( - self.process_rules - ) - (self._easy_thread_rules_by_feature, self._hard_thread_rules) = self._index_rules_by_feature(self.thread_rules) - (self._easy_call_rules_by_feature, self._hard_call_rules) = self._index_rules_by_feature(self.call_rules) - 
(self._easy_function_rules_by_feature, self._hard_function_rules) = self._index_rules_by_feature( - self.function_rules - ) - (self._easy_basic_block_rules_by_feature, self._hard_basic_block_rules) = self._index_rules_by_feature( - self.basic_block_rules - ) - (self._easy_instruction_rules_by_feature, self._hard_instruction_rules) = self._index_rules_by_feature( - self.instruction_rules - ) + self._feature_indexes_by_scopes = { + scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope]) for scope in scopes + } + + @property + def file_rules(self): + return self.rules_by_scope[Scope.FILE] + + @property + def process_rules(self): + return self.rules_by_scope[Scope.PROCESS] + + @property + def thread_rules(self): + return self.rules_by_scope[Scope.THREAD] + + @property + def call_rules(self): + return self.rules_by_scope[Scope.CALL] + + @property + def function_rules(self): + return self.rules_by_scope[Scope.FUNCTION] + + @property + def basic_block_rules(self): + return self.rules_by_scope[Scope.BASIC_BLOCK] + + @property + def instruction_rules(self): + return self.rules_by_scope[Scope.INSTRUCTION] def __len__(self): return len(self.rules) @@ -1402,123 +1424,186 @@ def __contains__(self, rulename): return rulename in self.rules @staticmethod - def _index_rules_by_feature(rules) -> Tuple[Dict[Feature, Set[str]], List[str]]: - """ - split the given rules into two structures: - - "easy rules" are indexed by feature, - such that you can quickly find the rules that contain a given feature. - - "hard rules" are those that contain substring/regex/bytes features or match statements. - these continue to be ordered topologically. + def _score_number(v: int) -> int: + if -0x8000 <= v <= 0x8000: + return 3 - a rule evaluator can use the "easy rule" index to restrict the - candidate rules that might match a given set of features. + if 0xFFFF_FF00 <= v <= 0xFFFF_FFFF: + return 3 - at this time, a rule evaluator can't do anything special with - the "hard rules". 
it must still do a full top-down match of each - rule, in topological order. + return 7 - this does not index global features, because these are not selective, and - won't be used as the sole feature used to match. - """ + @staticmethod + def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Feature) -> int: + if isinstance( + node, + capa.features.common.MatchedRule, + ): + if node.value in scores_by_rule: + # other rule must match before this one, in same scope. + # use score from that rule, which will have already been processed. + return scores_by_rule[node.value] + else: + # scores_by_rule only contains rules for the current scope + # so the requested rule must be from a smaller scope + # and we can assume the feature will exist. + # + # We don't know what the score should be, unfortunately. + # Could try to thread that through. Use "5" in the meantime. + return 5 + + elif capa.features.common.is_global_feature(node): + # we don't want to index global features + # because they're not very selective. + # they also don't stand on their own - there's always some other logic. + return 0 + + elif isinstance(node, (capa.features.insn.Number, capa.features.insn.OperandNumber)): + v = node.value + assert isinstance(v, int) + return RuleSet._score_number(v) + + C = node.__class__ + return { + # Scanning features (non-hashable) + # These are the non-hashable features. + # We can't use these for quick matching. 
+ capa.features.common.Substring: 0, + capa.features.common.Regex: 0, + capa.features.common.Bytes: 0, + # hashable features + capa.features.common.Characteristic: 4, + capa.features.common.String: 9, + capa.features.common.Class: 5, + capa.features.common.Namespace: 5, + capa.features.insn.API: 8, + capa.features.insn.Property: 5, + capa.features.insn.Offset: 4, + capa.features.insn.Mnemonic: 2, + capa.features.insn.OperandOffset: 4, + capa.features.basicblock.BasicBlock: 1, + capa.features.file.Export: 7, + capa.features.file.Import: 5, + capa.features.file.Section: 5, + capa.features.file.FunctionName: 5, + }[C] + + @dataclass + class RuleFeatureIndex: + rules_by_feature: Dict[Feature, Set[str]] = dataclasses.field(default=dict) + string_rules: Dict[str, List[Feature]] = dataclasses.field(default=dict) + bytes_rules: Dict[str, List[Feature]] = dataclasses.field(default=dict) - # we'll do a couple phases: - # - # 1. recursively visit all nodes in all rules, - # a. indexing all features - # b. recording the types of features found per rule - # 2. compute the easy and hard rule sets - # 3. remove hard rules from the rules-by-feature index - # 4. construct the topologically ordered list of hard rules - rules_with_easy_features: Set[str] = set() - rules_with_hard_features: Set[str] = set() + @staticmethod + def _index_rules_by_feature(scope: Scope, rules: List[Rule]) -> RuleFeatureIndex: rules_by_feature: Dict[Feature, Set[str]] = collections.defaultdict(set) + scores_by_rule: Dict[str, int] = {} - def rec(rule_name: str, node: Union[Feature, Statement]): + # note closure over scores_by_rule + def rec( + rule_name: str, node: Union[Feature, Statement] + ) -> Union[None, Tuple[int, Feature], Tuple[int, Set[Feature]]]: """ - walk through a rule's logic tree, indexing the easy and hard rules, - and the features referenced by easy rules. + Walk through a rule's logic tree, picking the features to use for indexing, + returning the feature and an associated score. 
+ The higher the score, the more selective the feature is expected to be. + The score is only used internally, to pick the best fetaure from within + and AND block. """ - if isinstance( - node, - ( - # these are the "hard features" - # substring: scanning feature - capa.features.common.Substring, - # regex: scanning feature - capa.features.common.Regex, - # bytes: scanning feature - capa.features.common.Bytes, - # match: dependency on another rule, - # which we have to evaluate first, - # and is therefore tricky. - capa.features.common.MatchedRule, - ), - ): - # hard feature: requires scan or match lookup - rules_with_hard_features.add(rule_name) - elif isinstance(node, capa.features.common.Feature): - if capa.features.common.is_global_feature(node): - # we don't want to index global features - # because they're not very selective. - # - # they're global, so if they match at one location in a file, - # they'll match at every location in a file. - # so that's not helpful to decide how to downselect. - # - # and, a global rule will never be the sole selector in a rule. - pass - else: - # easy feature: hash lookup - rules_with_easy_features.add(rule_name) - rules_by_feature[node].add(rule_name) - elif isinstance(node, (ceng.Not)): - # `not:` statements are tricky to deal with. - # - # first, features found under a `not:` should not be indexed, - # because they're not wanted to be found. - # second, `not:` can be nested under another `not:`, or two, etc. - # third, `not:` at the root or directly under an `or:` - # means the rule will match against *anything* not specified there, - # which is a difficult set of things to compute and index. - # - # so, if a rule has a `not:` statement, its hard. - # as of writing, this is an uncommon statement, with only 6 instances in 740 rules. 
- rules_with_hard_features.add(rule_name) + + if isinstance(node, (ceng.Not)): + # we don't index features within NOT blocks + return None + elif isinstance(node, (ceng.Some)) and node.count == 0: - # `optional:` and `0 or more:` are tricky to deal with. - # # when a subtree is optional, it may match, but not matching # doesn't have any impact either. # now, our rule authors *should* not put this under `or:` # and this is checked by the linter, - # but this could still happen (e.g. private rule set without linting) - # and would be hard to trace down. - # - # so better to be safe than sorry and consider this a hard case. - rules_with_hard_features.add(rule_name) + return None + elif isinstance(node, (ceng.Range)) and node.min == 0: - # `count(foo): 0 or more` are tricky to deal with. - # because the min is 0, - # this subtree *can* match just about any feature - # (except the given one) - # which is a difficult set of things to compute and index. - rules_with_hard_features.add(rule_name) + # `count(foo): 0 or more` is just like an optional block, + # because the min is 0, this subtree *can* match just about any feature. + return None + + elif isinstance(node, capa.features.common.Feature) and capa.features.common.is_global_feature(node): + # we don't want to index global features + # because they're not very selective. + # they also don't stand on their own - there's always some other logic. 
+ return None + + elif isinstance(node, capa.features.common.Feature): + return (RuleSet._score_feature(scores_by_rule, node), node) + elif isinstance(node, (ceng.Range)): - rec(rule_name, node.child) - elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)): + # feature is found N times + return rec(rule_name, node.child) + + elif isinstance(node, ceng.And): + scores = [] for child in node.children: - rec(rule_name, child) + try: + score = rec(rule_name, child) + except AssertionError as e: + # if one branch isn't possible to index, + # thats ok, we can require a different one to match + logger.warning("and: swallowing: %s: %s", e, rule_name) + continue + + if not score: + continue + + scores.append(score) + + # otherwise we can't index this rule + assert len(scores) > 0 + + def and_score_key(item): + # order by score, then fewest number of features. + # TODO(wb): minimize number of features? play with this. + score, features = item + + if isinstance(features, set): + return (score, -len(features)) + else: + return (score, -1) + + scores.sort(key=and_score_key, reverse=True) + + # pick the best feature + return scores[0] + + elif isinstance(node, (ceng.Or, ceng.Some)): + min_score = 10000000 # assume this is larger than any score + features = set() + + for child in node.children: + item = rec(rule_name, child) + assert item is not None, "can't index OR branch" + + score, feature = item + + min_score = min(min_score, score) + + if isinstance(feature, set): + features.update(feature) + else: + features.add(feature) + + return min_score, features + elif isinstance(node, ceng.Statement): - # unhandled type of statement. - # this should only happen if a new subtype of `Statement` + # Unhandled type of statement. + # This should only happen if a new subtype of `Statement` # has since been added to capa. # - # ideally, we'd like to use mypy for exhaustiveness checking + # Ideally, we'd like to use mypy for exhaustiveness checking # for all the subtypes of `Statement`. 
- # but, as far as i can tell, mypy does not support this type + # But, as far as I can tell, mypy does not support this type # of checking. # - # in a way, this makes some intuitive sense: + # In a way, this makes some intuitive sense: # the set of subtypes of type A is unbounded, # because any user might come along and create a new subtype B, # so mypy can't reason about this set of types. @@ -1527,28 +1612,71 @@ def rec(rule_name: str, node: Union[Feature, Statement]): # programming error assert_never(node) + string_rules: Dict[str, List[Feature]] = {} + bytes_rules: Dict[str, List[Feature]] = {} + for rule in rules: rule_name = rule.meta["name"] + root = rule.statement - rec(rule_name, root) + try: + item = rec(rule_name, root) + except AssertionError as e: + logger.warning("fail: %s: %s", e, rule_name) + continue - # if a rule has a hard feature, - # don't consider it easy, and therefore, - # don't index any of its features. - # - # otherwise, its an easy rule, and index its features - for rules_with_feature in rules_by_feature.values(): - rules_with_feature.difference_update(rules_with_hard_features) - easy_rules_by_feature = rules_by_feature - - # `rules` is already topologically ordered, - # so extract our hard set into the topological ordering. 
- hard_rules = [] - for rule in rules: - if rule.meta["name"] in rules_with_hard_features: - hard_rules.append(rule.meta["name"]) + if item is None: + logger.warning("fail: can't index rule: %s", rule_name) + continue + assert item is not None, "can't index rule" + + score, feature = item + + if isinstance(feature, set): + features = feature + else: + features = {feature} + + string_features = [ + feature + for feature in features + if isinstance(feature, (capa.features.common.Substring, capa.features.common.Regex)) + ] + bytes_features = [feature for feature in features if isinstance(feature, capa.features.common.Bytes)] + hashable_features = [ + feature + for feature in features + if not isinstance( + feature, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes) + ) + ] + + logger.debug("indexing: features: %d, score: %d, rule: %s", len(features), score, rule_name) + scores_by_rule[rule_name] = score + for feature in features: + logger.debug(" : [%d] %s", RuleSet._score_feature(scores_by_rule, feature), feature) + + if string_features: + string_rules[rule_name] = string_features + + if bytes_features: + bytes_rules[rule_name] = bytes_features + + for feature in hashable_features: + rules_by_feature[feature].add(rule_name) + + logger.debug("indexing: %d features indexed for scope %s", len(rules_by_feature), scope) + logger.debug("indexing: %d indexed features are shared by more than 3 rules", + len([feature for feature, rules in rules_by_feature.items() if len(rules) > 3])) + logger.debug("indexing: %d scanning string features, %d scanning bytes features", + len(string_rules), len(bytes_rules)) - return (easy_rules_by_feature, hard_rules) + # TODO(wb): remember, when evaluating candidates, make sure + # to do it in topological order, so match statements work. + + # TODO(wb): remember, as rule matches are found, + # the candidates must be extended again, to account for match statements. 
+ return RuleSet.RuleFeatureIndex(rules_by_feature, string_rules, bytes_rules) @staticmethod def _get_rules_for_scope(rules, scope) -> List[Rule]: @@ -1625,74 +1753,92 @@ def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Feat this routine should act just like `capa.engine.match`, except that it may be more performant. """ - easy_rules_by_feature = {} - if scope == Scope.FILE: - easy_rules_by_feature = self._easy_file_rules_by_feature - hard_rule_names = self._hard_file_rules - elif scope == Scope.PROCESS: - easy_rules_by_feature = self._easy_process_rules_by_feature - hard_rule_names = self._hard_process_rules - elif scope == Scope.THREAD: - easy_rules_by_feature = self._easy_thread_rules_by_feature - hard_rule_names = self._hard_thread_rules - elif scope == Scope.CALL: - easy_rules_by_feature = self._easy_call_rules_by_feature - hard_rule_names = self._hard_call_rules - elif scope == Scope.FUNCTION: - easy_rules_by_feature = self._easy_function_rules_by_feature - hard_rule_names = self._hard_function_rules - elif scope == Scope.BASIC_BLOCK: - easy_rules_by_feature = self._easy_basic_block_rules_by_feature - hard_rule_names = self._hard_basic_block_rules - elif scope == Scope.INSTRUCTION: - easy_rules_by_feature = self._easy_instruction_rules_by_feature - hard_rule_names = self._hard_instruction_rules - else: - assert_never(scope) + + feature_index = self._feature_indexes_by_scopes[scope] + rules = self.rules_by_scope[scope] + rules_by_name = {rule.name: rule for rule in rules} + # topologic location of rule given its name + rule_index_by_rule_name = {rule.name: i for i, rule in enumerate(rules)} + + def resort_rules_topologically(rules: List[Rule]): + # note closure over `rule_index_by_rule_name` + rules.sort(key=lambda r: rule_index_by_rule_name[r.name]) candidate_rule_names = set() for feature in features: - easy_rule_names = easy_rules_by_feature.get(feature) - if easy_rule_names: - candidate_rule_names.update(easy_rule_names) + 
candidate_rule_names.update(feature_index.rules_by_feature.get(feature, ())) + + if feature_index.string_rules: + string_features = {} + for feature, locations in features.items(): + if isinstance(feature, capa.features.common.String): + string_features[feature] = locations + + if string_features: + for rule_name, wanted_strings in feature_index.string_rules.items(): + for wanted_string in wanted_strings: + if wanted_string.evaluate(string_features): + candidate_rule_names.add(rule_name) + + if feature_index.bytes_rules: + bytes_features = {} + for feature, locations in features.items(): + if isinstance(feature, capa.features.common.Bytes): + bytes_features[feature] = locations + + if bytes_features: + for rule_name, wanted_bytess in feature_index.bytes_rules.items(): + for wanted_bytes in wanted_bytess: + if wanted_bytes.evaluate(bytes_features): + candidate_rule_names.add(rule_name) + + # logger.debug("perf: match: %s: %s: %d features, %d candidate rules", + # scope, addr, len(features), len(candidate_rule_names)) # first, match against the set of rules that have at least one # feature shared with our feature set. candidate_rules = [self.rules[name] for name in candidate_rule_names] - features2, easy_matches = ceng.match(candidate_rules, features, addr) + resort_rules_topologically(candidate_rules) - # note that we've stored the updated feature set in `features2`. - # this contains a superset of the features in `features`; - # it contains additional features for any easy rule matches. - # we'll pass this feature set to hard rule matching, since one - # of those rules might rely on an easy rule match. # - # the updated feature set from hard matching will go into `features3`. - # this is a superset of `features2` is a superset of `features`. - # ultimately, this is what we'll return to the caller. + # The following is derived from ceng.match + # extended to interact with candidate_rules upon rule match. 
# - # in each case, we could have assigned the updated feature set back to `features`, - # but this is slightly more explicit how we're tracking the data. - - # now, match against (topologically ordered) list of rules - # that we can't really make any guesses about. - # these are rules with hard features, like substring/regex/bytes and match statements. - hard_rules = [self.rules[name] for name in hard_rule_names] - features3, hard_matches = ceng.match(hard_rules, features2, addr) - - # note that above, we probably are skipping matching a bunch of - # rules that definitely would never hit. - # specifically, "easy rules" that don't share any features with - # feature set. - - # MatchResults doesn't technically have an .update() method - # but a dict does. - matches = {} # type: ignore - matches.update(easy_matches) - matches.update(hard_matches) - - return (features3, matches) + results: MatchResults = collections.defaultdict(list) + # copy features so that we can modify it + # without affecting the caller (keep this function pure) + # + # note: copy doesn't notice this is a defaultdict, so we'll recreate that manually. + features = collections.defaultdict(set, copy.copy(features)) + + while candidate_rules: + rule = candidate_rules.pop(0) + res = rule.evaluate(features, short_circuit=True) + if res: + # we first matched the rule with short circuiting enabled. + # this is much faster than without short circuiting. + # however, we want to collect all results thoroughly, + # so once we've found a match quickly, + # go back and capture results without short circuiting. + res = rule.evaluate(features, short_circuit=False) + + # sanity check + assert bool(res) is True + + results[rule.name].append((addr, res)) + # we need to update the current `features` + # because subsequent iterations of this loop may use newly added features, + # such as rule or namespace matches. 
+ ceng.index_rule_matches(features, rule, [addr]) + + new_candidates = feature_index.rules_by_feature.get(capa.features.common.MatchedRule(rule.name), ()) + if new_candidates: + candidate_rules.extend([self.rules[rule_name] for rule_name in new_candidates]) + resort_rules_topologically(candidate_rules) + + return (features, results) + def is_nursery_rule_path(path: Path) -> bool: """ From 2d9c82fb17b6fb19087deb8e49c23c64ef58cccf Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 22 May 2024 10:15:04 +0200 Subject: [PATCH 04/28] lints --- capa/features/extractors/binexport2/insn.py | 5 ++--- capa/rules/__init__.py | 16 +++++++++------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 1a6a99b21..e77669b7d 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -14,7 +14,7 @@ from capa.features.insn import API, Number, Mnemonic, OperandNumber from capa.features.common import Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress -from capa.features.extractors.binexport2 import AnalysisContext, FunctionContext, ReadMemoryError, InstructionContext +from capa.features.extractors.binexport2 import FunctionContext, ReadMemoryError, InstructionContext from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 @@ -31,8 +31,7 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle insn = be2.instruction[ii.instruction_index] for addr in insn.call_target: - if addr in be2_analysis.thunks: - addr = be2_analysis.thunks[addr] + addr = be2_analysis.thunks.get(addr, addr) if addr not in be2_index.vertex_index_by_address: # disassembler did not define function at address diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py 
index 43dcb7665..f525e1ab5 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1666,10 +1666,13 @@ def and_score_key(item): rules_by_feature[feature].add(rule_name) logger.debug("indexing: %d features indexed for scope %s", len(rules_by_feature), scope) - logger.debug("indexing: %d indexed features are shared by more than 3 rules", - len([feature for feature, rules in rules_by_feature.items() if len(rules) > 3])) - logger.debug("indexing: %d scanning string features, %d scanning bytes features", - len(string_rules), len(bytes_rules)) + logger.debug( + "indexing: %d indexed features are shared by more than 3 rules", + len([feature for feature, rules in rules_by_feature.items() if len(rules) > 3]), + ) + logger.debug( + "indexing: %d scanning string features, %d scanning bytes features", len(string_rules), len(bytes_rules) + ) # TODO(wb): remember, when evaluating candidates, make sure # to do it in topological order, so match statements work. @@ -1756,7 +1759,6 @@ def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Feat feature_index = self._feature_indexes_by_scopes[scope] rules = self.rules_by_scope[scope] - rules_by_name = {rule.name: rule for rule in rules} # topologic location of rule given its name rule_index_by_rule_name = {rule.name: i for i, rule in enumerate(rules)} @@ -1804,7 +1806,7 @@ def resort_rules_topologically(rules: List[Rule]): # The following is derived from ceng.match # extended to interact with candidate_rules upon rule match. 
# - results: MatchResults = collections.defaultdict(list) + results: ceng.MatchResults = collections.defaultdict(list) # copy features so that we can modify it # without affecting the caller (keep this function pure) @@ -1838,7 +1840,7 @@ def resort_rules_topologically(rules: List[Rule]): resort_rules_topologically(candidate_rules) return (features, results) - + def is_nursery_rule_path(path: Path) -> bool: """ From 0dc0c515dca1d6bc00f9a2a6176b1dfd764b1d91 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 22 May 2024 11:25:00 +0200 Subject: [PATCH 05/28] rules: add documentation for optimized match routine --- capa/rules/__init__.py | 251 ++++++++++++++++++++++++++--------------- 1 file changed, 162 insertions(+), 89 deletions(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index f525e1ab5..9722ab970 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1423,69 +1423,69 @@ def __getitem__(self, rulename): def __contains__(self, rulename): return rulename in self.rules - @staticmethod - def _score_number(v: int) -> int: - if -0x8000 <= v <= 0x8000: - return 3 - - if 0xFFFF_FF00 <= v <= 0xFFFF_FFFF: - return 3 - - return 7 - + # unstable @staticmethod def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Feature) -> int: - if isinstance( + if capa.features.common.is_global_feature(node): + # We don't want to index global features + # because they're not very selective. + # They also don't stand on their own - there's always some other logic. + raise ValueError("don't index global features") + + elif isinstance( node, capa.features.common.MatchedRule, ): - if node.value in scores_by_rule: - # other rule must match before this one, in same scope. - # use score from that rule, which will have already been processed. 
- return scores_by_rule[node.value] - else: - # scores_by_rule only contains rules for the current scope - # so the requested rule must be from a smaller scope - # and we can assume the feature will exist. - # - # We don't know what the score should be, unfortunately. - # Could try to thread that through. Use "5" in the meantime. - return 5 - - elif capa.features.common.is_global_feature(node): - # we don't want to index global features - # because they're not very selective. - # they also don't stand on their own - there's always some other logic. - return 0 + # If present, other rule must match before this one, in same scope. + # Use score from that rule, which will have already been processed due to topological sorting. + # Otherwise, use a default score of 5. + return scores_by_rule.get(node.value, 5) elif isinstance(node, (capa.features.insn.Number, capa.features.insn.OperandNumber)): v = node.value assert isinstance(v, int) - return RuleSet._score_number(v) + + if -0x8000 <= v <= 0x8000: + # Small numbers are probably pretty common, like structure offsets, etc. + return 3 + + if 0xFFFF_FF00 <= v <= 0xFFFF_FFFF: + # Numbers close to u32::max_int are also probably pretty common, + # like signed numbers closed to 0 that are stored as unsigned ints. + return 3 + + if 0xFFFF_FFFF_FFFF_FF00 <= v <= 0xFFFF_FFFF_FFFF_FFFF: + return 3 + + # Other numbers are assumed to be uncommon. + return 7 + + elif isinstance(node, (capa.features.common.Substring, capa.features.common.Regex, capa.features.common.Bytes)): + # Scanning features (non-hashable), which we can't use for quick matching/filtering. + return 0 C = node.__class__ return { - # Scanning features (non-hashable) - # These are the non-hashable features. - # We can't use these for quick matching. 
- capa.features.common.Substring: 0, - capa.features.common.Regex: 0, - capa.features.common.Bytes: 0, - # hashable features - capa.features.common.Characteristic: 4, capa.features.common.String: 9, + capa.features.insn.API: 8, + capa.features.file.Export: 7, + # "uncommon numbers": 7 capa.features.common.Class: 5, capa.features.common.Namespace: 5, - capa.features.insn.API: 8, capa.features.insn.Property: 5, - capa.features.insn.Offset: 4, - capa.features.insn.Mnemonic: 2, - capa.features.insn.OperandOffset: 4, - capa.features.basicblock.BasicBlock: 1, - capa.features.file.Export: 7, capa.features.file.Import: 5, capa.features.file.Section: 5, capa.features.file.FunctionName: 5, + # default MatchedRule: 5 + capa.features.common.Characteristic: 4, + capa.features.insn.Offset: 4, + capa.features.insn.OperandOffset: 4, + # "common numbers": 3 + capa.features.insn.Mnemonic: 2, + capa.features.basicblock.BasicBlock: 1, + # substring: 0 + # regex: 0 + # bytes: 0 }[C] @dataclass @@ -1494,6 +1494,7 @@ class RuleFeatureIndex: string_rules: Dict[str, List[Feature]] = dataclasses.field(default=dict) bytes_rules: Dict[str, List[Feature]] = dataclasses.field(default=dict) + # unstable @staticmethod def _index_rules_by_feature(scope: Scope, rules: List[Rule]) -> RuleFeatureIndex: rules_by_feature: Dict[Feature, Set[str]] = collections.defaultdict(set) @@ -1619,16 +1620,8 @@ def and_score_key(item): rule_name = rule.meta["name"] root = rule.statement - try: - item = rec(rule_name, root) - except AssertionError as e: - logger.warning("fail: %s: %s", e, rule_name) - continue - - if item is None: - logger.warning("fail: can't index rule: %s", rule_name) - continue - assert item is not None, "can't index rule" + item = rec(rule_name, root) + assert item is not None score, feature = item @@ -1674,11 +1667,6 @@ def and_score_key(item): "indexing: %d scanning string features, %d scanning bytes features", len(string_rules), len(bytes_rules) ) - # TODO(wb): remember, when evaluating 
candidates, make sure - # to do it in topological order, so match statements work. - - # TODO(wb): remember, as rule matches are found, - # the candidates must be extended again, to account for match statements. return RuleSet.RuleFeatureIndex(rules_by_feature, string_rules, bytes_rules) @staticmethod @@ -1749,29 +1737,89 @@ def filter_rules_by_meta(self, tag: str) -> "RuleSet": break return RuleSet(list(rules_filtered)) + # unstable + @staticmethod + def _sort_rules_by_index(rule_index_by_rule_name: Dict[str, int], rules: List[Rule]): + """ + Sort (in place) the given rules by their index provided by the given Dict. + This mapping is intended to represent the topologic index of the given rule; + that is, rules with a lower index should be evaluated first, since their dependencies + will be evaluated later. + """ + rules.sort(key=lambda r: rule_index_by_rule_name[r.name]) + def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[FeatureSet, ceng.MatchResults]: """ - match rules from this ruleset at the given scope against the given features. + Match rules from this ruleset at the given scope against the given features. - this routine should act just like `capa.engine.match`, - except that it may be more performant. + This routine should act just like `capa.engine.match`, except that it may be more performant. + It uses its knowledge of all the rules to evaluate a minimal set of candidate rules for the given features. """ - feature_index = self._feature_indexes_by_scopes[scope] - rules = self.rules_by_scope[scope] - # topologic location of rule given its name + feature_index: RuleSet.RuleFeatureIndex = self._feature_indexes_by_scopes[scope] + rules: List[Rule] = self.rules_by_scope[scope] + # Topologic location of rule given its name. + # That is, rules with a lower index should be evaluated first, since their dependencies + # will be evaluated later. 
rule_index_by_rule_name = {rule.name: i for i, rule in enumerate(rules)} - def resort_rules_topologically(rules: List[Rule]): - # note closure over `rule_index_by_rule_name` - rules.sort(key=lambda r: rule_index_by_rule_name[r.name]) - - candidate_rule_names = set() + # This algorithm is optimized to evaluate as few rules as possible, + # because the less work we do, the faster capa can run. + # + # It relies on the observation that most rules don't match, + # and that most rules have an uncommon feature that *must* be present for the rule to match. + # + # Therefore, we record which uncommon feature(s) is required for each rule to match, + # and then only inspect these few candidates when a feature is seen in some scope. + # Ultimately, the exact same rules are matched with precisely the same results, + # its just done faster, because we ignore most of the rules that never would have matched anyways. + # + # In `_index_rules_by_feature`, we do the hard work of computing the minimal set of + # uncommon features for each rule. While its a little expensive, its a single pass + # that gets reused at every scope instance (read: thousands or millions of times). + # + # In the current routine, we collect all the rules that might match, given the presence + # of any uncommon feature. We sort the rules topographically, so that rule dependencies work out, + # and then we evaluate the candidate rules. In practice, this saves 20-50x the work! + # + # Recall that some features cannot be matched quickly via hash lookup: Regex, Bytes, etc. + # When these features are the uncommon features used to filter rules, we have to evaluate the + # feature frequently whenever a string/bytes feature is encountered. Its slow, but we can't + # get around it. Reducing our reliance on regex/bytes feature and/or finding a way to + # index these can futher improve performance. + + # Find all the rules that could match the given feature set. 
+ # Ideally we want this set to be as small and focused as possible, + # and we can tune it by tweaking `_index_rules_by_feature`. + candidate_rule_names: Set[str] = set() for feature in features: candidate_rule_names.update(feature_index.rules_by_feature.get(feature, ())) + # Some rules rely totally on regex features, like the HTTP User-Agent rules. + # In these cases, when we encounter any string feature, we have to scan those + # regexes to find the candidate rules. + # As mentioned above, this is not good for performance, but its required for correctness. + # + # We may want to try to pre-evaluate these strings, based on their presence in the file, + # to reduce the number of evaluations we do here. + # See: https://github.com/mandiant/capa/issues/2063#issuecomment-2095639672 + # + # We may also want to specialize case-insensitive strings, which would enable them to + # be indexed, and therefore skip the scanning here, improving performance. + # This strategy is described here: + # https://github.com/mandiant/capa/issues/2063#issuecomment-2107083068 if feature_index.string_rules: - string_features = {} + + # This is a FeatureSet that contains only String features. + # Since we'll only be evaluating String/Regex features below, we don't care about + # other sorts of features (Mnemonic, Number, etc.) and therefore can save some time + # during evaluation. + # + # Specifically, we can address the issue described here: + # https://github.com/mandiant/capa/issues/2063#issuecomment-2095397884 + # That we spend a lot of time collecting String instances within `Regex.evaluate`. + # We don't have to address that issue further as long as we pre-filter the features here. 
+ string_features: FeatureSet = {} for feature, locations in features.items(): if isinstance(feature, capa.features.common.String): string_features[feature] = locations @@ -1782,8 +1830,14 @@ def resort_rules_topologically(rules: List[Rule]): if wanted_string.evaluate(string_features): candidate_rule_names.add(rule_name) + # Like with String/Regex features above, we have to scan for Bytes to find candidate rules. + # + # We may want to index bytes when they have a common length, like 16 or 32. + # This would help us avoid the scanning here, which would improve performance. + # The strategy is described here: + # https://github.com/mandiant/capa/issues/2063#issuecomment-2107052190 if feature_index.bytes_rules: - bytes_features = {} + bytes_features: FeatureSet = {} for feature, locations in features.items(): if isinstance(feature, capa.features.common.Bytes): bytes_features[feature] = locations @@ -1794,52 +1848,71 @@ def resort_rules_topologically(rules: List[Rule]): if wanted_bytes.evaluate(bytes_features): candidate_rule_names.add(rule_name) - # logger.debug("perf: match: %s: %s: %d features, %d candidate rules", - # scope, addr, len(features), len(candidate_rule_names)) + # trace + logger.debug( + "perf: match: %s: %s: %d features, %d candidate rules", + scope, + addr, + len(features), + len(candidate_rule_names), + ) - # first, match against the set of rules that have at least one - # feature shared with our feature set. + # No rules can possibly match, so quickly return. + if not candidate_rule_names: + return (features, {}) + + # Here are the candidate rules (before we just had their names). candidate_rules = [self.rules[name] for name in candidate_rule_names] - resort_rules_topologically(candidate_rules) + + # Order rules topologically, so that rules with dependencies work correctly. 
+ RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules) # # The following is derived from ceng.match # extended to interact with candidate_rules upon rule match. # + results: ceng.MatchResults = collections.defaultdict(list) - # copy features so that we can modify it - # without affecting the caller (keep this function pure) - # - # note: copy doesn't notice this is a defaultdict, so we'll recreate that manually. - features = collections.defaultdict(set, copy.copy(features)) + # If we match a rule, then we'll add a MatchedRule to the features that will be returned, + # but we want to do that in a copy. We'll lazily create the copy below, once a match has + # actually been found. + augmented_features = features while candidate_rules: rule = candidate_rules.pop(0) - res = rule.evaluate(features, short_circuit=True) + res = rule.evaluate(augmented_features, short_circuit=True) if res: # we first matched the rule with short circuiting enabled. # this is much faster than without short circuiting. # however, we want to collect all results thoroughly, # so once we've found a match quickly, # go back and capture results without short circuiting. - res = rule.evaluate(features, short_circuit=False) + res = rule.evaluate(augmented_features, short_circuit=False) # sanity check assert bool(res) is True results[rule.name].append((addr, res)) - # we need to update the current `features` - # because subsequent iterations of this loop may use newly added features, + # We need to update the current features because subsequent iterations may use newly added features, # such as rule or namespace matches. - ceng.index_rule_matches(features, rule, [addr]) + if augmented_features is features: + # lazily create the copy of features only when a rule matches, since it could be expensive. 
+ augmented_features = collections.defaultdict(set, copy.copy(features)) + + ceng.index_rule_matches(augmented_features, rule, [addr]) + # Its possible that we're relying on a MatchedRule feature to be the + # uncommon feature used to filter other rules. So, extend the candidate + # rules with any of these dependencies. If we find any, also ensure they're + # evaluated in the correct topologic order, so that further dependencies work. new_candidates = feature_index.rules_by_feature.get(capa.features.common.MatchedRule(rule.name), ()) if new_candidates: + candidate_rule_names.update(new_candidates) candidate_rules.extend([self.rules[rule_name] for rule_name in new_candidates]) - resort_rules_topologically(candidate_rules) + RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules) - return (features, results) + return (augmented_features, results) def is_nursery_rule_path(path: Path) -> bool: From f86a60c85e9c7264acb68f8ea639565428a3deb3 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 22 May 2024 14:40:47 +0200 Subject: [PATCH 06/28] bytes: log length of bytes evaluations --- capa/features/common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/capa/features/common.py b/capa/features/common.py index b817fafb9..5f0ca7941 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -387,6 +387,7 @@ def __init__(self, value: bytes, description=None): def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True): capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.bytes"] += 1 + capa.perf.counters["evaluate.feature.bytes." 
+ str(len(self.value))] += 1 assert isinstance(self.value, bytes) for feature, locations in features.items(): From 6e50f4817e36af7a0adf8f5ad8561115f005c787 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 22 May 2024 15:20:19 +0200 Subject: [PATCH 07/28] ruleset: document optimized match behavior --- capa/rules/__init__.py | 221 ++++++++++++++++++++++++++--------------- 1 file changed, 142 insertions(+), 79 deletions(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 9722ab970..00e99fa03 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1368,13 +1368,13 @@ def __init__( rules = capa.optimizer.optimize_rules(rules) scopes = ( - Scope.FILE, - Scope.PROCESS, - Scope.THREAD, Scope.CALL, - Scope.FUNCTION, - Scope.BASIC_BLOCK, + Scope.THREAD, + Scope.PROCESS, Scope.INSTRUCTION, + Scope.BASIC_BLOCK, + Scope.FUNCTION, + Scope.FILE, ) self.rules = {rule.name: rule for rule in rules} @@ -1382,8 +1382,9 @@ def __init__( self.rules_by_scope = {scope: self._get_rules_for_scope(rules, scope) for scope in scopes} # unstable + scores_by_rule = {} self._feature_indexes_by_scopes = { - scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope]) for scope in scopes + scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope], scores_by_rule) for scope in scopes } @property @@ -1426,13 +1427,17 @@ def __contains__(self, rulename): # unstable @staticmethod def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Feature) -> int: - if capa.features.common.is_global_feature(node): - # We don't want to index global features - # because they're not very selective. - # They also don't stand on their own - there's always some other logic. - raise ValueError("don't index global features") + """ + Score the given feature by how "uncommon" it is, where a higher score is more uncommon. + Features that are not good for indexing will have a low score, or 0. 
+ """ + + # + # Today, these scores are manually assigned by intuition/experience/guesswork. + # We could do a large-scale feature collection and use the results to assign scores. + # - elif isinstance( + if isinstance( node, capa.features.common.MatchedRule, ): @@ -1455,6 +1460,7 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe return 3 if 0xFFFF_FFFF_FFFF_FF00 <= v <= 0xFFFF_FFFF_FFFF_FFFF: + # Like signed numbers closed to 0 that are stored as unsigned long ints. return 3 # Other numbers are assumed to be uncommon. @@ -1466,10 +1472,20 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe C = node.__class__ return { + # + # Very uncommon features that are probably very selective in capa's domain. + # When possible, we want rules to be indexed by these features. + # capa.features.common.String: 9, capa.features.insn.API: 8, capa.features.file.Export: 7, # "uncommon numbers": 7 + # + # ----------------------------------------------------------------- + # + # Features that are probably somewhat common, and/or rarely used within capa. + # Its ok to index rules by these. + # capa.features.common.Class: 5, capa.features.common.Namespace: 5, capa.features.insn.Property: 5, @@ -1477,12 +1493,34 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe capa.features.file.Section: 5, capa.features.file.FunctionName: 5, # default MatchedRule: 5 + # + # ----------------------------------------------------------------- + # + # Features that are pretty common and we'd prefer not to index, but can if we have to. + # capa.features.common.Characteristic: 4, capa.features.insn.Offset: 4, capa.features.insn.OperandOffset: 4, # "common numbers": 3 + # + # ----------------------------------------------------------------- + # + # Very common features, which we'd only prefer to non-hashable features, like Regex/Substring/Bytes. 
+ # capa.features.insn.Mnemonic: 2, capa.features.basicblock.BasicBlock: 1, + # + # + # We don't *want* to index global features because they're not very selective. + # They also don't usually stand on their own - there's always some other logic. + # + capa.features.common.OS: 0, + capa.features.common.Arch: 0, + capa.features.common.Format: 0, + # ----------------------------------------------------------------- + # + # Non-hashable features, which will require a scan to evaluate, and are therefore quite expensive. + # # substring: 0 # regex: 0 # bytes: 0 @@ -1490,36 +1528,52 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe @dataclass class RuleFeatureIndex: + # Mapping from hashable feature to a list of rules that might have this feature. rules_by_feature: Dict[Feature, Set[str]] = dataclasses.field(default=dict) + # Mapping from rule name to list of Regex/Substring features that have to match. + # All these features will be evaluated whenever a String feature is encountered. string_rules: Dict[str, List[Feature]] = dataclasses.field(default=dict) + # Mapping from rule name to list of Bytes features that have to match. + # All these features will be evaluated whenever a Bytes feature is encountered. bytes_rules: Dict[str, List[Feature]] = dataclasses.field(default=dict) # unstable @staticmethod - def _index_rules_by_feature(scope: Scope, rules: List[Rule]) -> RuleFeatureIndex: + def _index_rules_by_feature(scope: Scope, rules: List[Rule], scores_by_rule: Dict[str, int]) -> RuleFeatureIndex: + """ + Index the given rules by their minimal set of most "uncommon" features required to match. + + If absolutely necessary, provide the Regex/Substring/Bytes features + (which are not hashable and require a scan) that have to match, too. 
+ """ + rules_by_feature: Dict[Feature, Set[str]] = collections.defaultdict(set) - scores_by_rule: Dict[str, int] = {} - # note closure over scores_by_rule def rec( - rule_name: str, node: Union[Feature, Statement] - ) -> Union[None, Tuple[int, Feature], Tuple[int, Set[Feature]]]: + rule_name: str, node: Union[Feature, Statement], + # closure over: scores_by_rule + ) -> Optional[Tuple[int, Set[Feature]]]: """ Walk through a rule's logic tree, picking the features to use for indexing, returning the feature and an associated score. The higher the score, the more selective the feature is expected to be. - The score is only used internally, to pick the best fetaure from within - and AND block. + The score is only used internally, to pick the best feature from within AND blocks. + + Note closure over `scores_by_rule`. """ if isinstance(node, (ceng.Not)): - # we don't index features within NOT blocks + # We don't index features within NOT blocks, because we're only looking for + # features that should be present. + # + # Technically we could have a rule that does `not: not: foo` and we'd want to + # index `foo`. But this is not seen today. return None elif isinstance(node, (ceng.Some)) and node.count == 0: - # when a subtree is optional, it may match, but not matching + # When a subtree is optional, it may match, but not matching # doesn't have any impact either. - # now, our rule authors *should* not put this under `or:` + # Now, our rule authors *should* not put this under `or:` # and this is checked by the linter, return None @@ -1528,31 +1582,45 @@ def rec( # because the min is 0, this subtree *can* match just about any feature. return None - elif isinstance(node, capa.features.common.Feature) and capa.features.common.is_global_feature(node): - # we don't want to index global features - # because they're not very selective. - # they also don't stand on their own - there's always some other logic. 
- return None - elif isinstance(node, capa.features.common.Feature): - return (RuleSet._score_feature(scores_by_rule, node), node) + return (RuleSet._score_feature(scores_by_rule, node), {node}) elif isinstance(node, (ceng.Range)): # feature is found N times return rec(rule_name, node.child) elif isinstance(node, ceng.And): - scores = [] + # When evaluating an AND block, all of the children need to match. + # + # So when we index rules, we want to pick the most uncommon feature(s) + # for each AND block. If the AND block matches, that feature must be there. + # We recursively explore children, computing their + # score, and pick the child with the greatest score. + # + # For example, given the rule: + # + # and: + # - mnemonic: mov + # - api: CreateFile + # + # we prefer to pick `api: CreateFile` because we expect it to be more uncommon. + # + # Note that the children nodes might be complex, like: + # + # and: + # - mnemonic: mov + # - or: + # - api: CreateFile + # - api: DeleteFile + # + # In this case, we prefer to pick the pair of API features since each is expected + # to be more common than the mnemonic. + scores: List[Tuple[int, Set[Feature]]] = [] for child in node.children: - try: - score = rec(rule_name, child) - except AssertionError as e: - # if one branch isn't possible to index, - # thats ok, we can require a different one to match - logger.warning("and: swallowing: %s: %s", e, rule_name) - continue + score = rec(rule_name, child) if not score: + # maybe an optional block or similar continue scores.append(score) @@ -1562,13 +1630,8 @@ def rec( def and_score_key(item): # order by score, then fewest number of features. - # TODO(wb): minimize number of features? play with this. 
score, features = item - - if isinstance(features, set): - return (score, -len(features)) - else: - return (score, -1) + return (score, -len(features)) scores.sort(key=and_score_key, reverse=True) @@ -1576,6 +1639,30 @@ def and_score_key(item): return scores[0] elif isinstance(node, (ceng.Or, ceng.Some)): + # When evaluating an OR block, any of the children need to match. + # It could be any of them, so we can't decide to only index some of them. + # + # For example, given the rule: + # + # or: + # - mnemonic: mov + # - api: CreateFile + # + # we have to pick both `mnemonic` and `api` features. + # + # Note that the children nodes might be complex, like: + # + # or: + # - mnemonic: mov + # - and: + # - api: CreateFile + # - api: DeleteFile + # + # In this case, we have to pick both the `mnemonic` and one of the `api` features. + # + # When computing the score of an OR branch, we have to use the min value encountered. + # While many of the children might be very specific, there might be a branch that is common + # and we need to handle that correctly. min_score = 10000000 # assume this is larger than any score features = set() @@ -1583,36 +1670,18 @@ def and_score_key(item): item = rec(rule_name, child) assert item is not None, "can't index OR branch" - score, feature = item - + score, _features = item min_score = min(min_score, score) - - if isinstance(feature, set): - features.update(feature) - else: - features.add(feature) + features.update(_features) return min_score, features - elif isinstance(node, ceng.Statement): - # Unhandled type of statement. - # This should only happen if a new subtype of `Statement` - # has since been added to capa. - # - # Ideally, we'd like to use mypy for exhaustiveness checking - # for all the subtypes of `Statement`. - # But, as far as I can tell, mypy does not support this type - # of checking. 
- # - # In a way, this makes some intuitive sense: - # the set of subtypes of type A is unbounded, - # because any user might come along and create a new subtype B, - # so mypy can't reason about this set of types. - assert_never(node) else: # programming error assert_never(node) + # These are the Regex/Substring/Bytes features that we have to use for filtering. + # Ideally we find a way to get rid of all of these, eventually. string_rules: Dict[str, List[Feature]] = {} bytes_rules: Dict[str, List[Feature]] = {} @@ -1622,13 +1691,7 @@ def and_score_key(item): root = rule.statement item = rec(rule_name, root) assert item is not None - - score, feature = item - - if isinstance(feature, set): - features = feature - else: - features = {feature} + score, features = item string_features = [ feature @@ -1849,13 +1912,13 @@ def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Feat candidate_rule_names.add(rule_name) # trace - logger.debug( - "perf: match: %s: %s: %d features, %d candidate rules", - scope, - addr, - len(features), - len(candidate_rule_names), - ) + # logger.debug( + # "perf: match: %s: %s: %d features, %d candidate rules", + # scope, + # addr, + # len(features), + # len(candidate_rule_names), + # ) # No rules can possibly match, so quickly return. 
if not candidate_rule_names: From b7d07346b682a1d8466b1aa53b877e722cef3f56 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 22 May 2024 15:23:15 +0200 Subject: [PATCH 08/28] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fe3ccc821..76142df32 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - add function in capa/helpers to load plain and compressed JSON reports #1883 @Rohit1123 - document Antivirus warnings and VirusTotal false positive detections #2028 @RionEV @mr-tz +- optimize rule matching #2080 @williballenthin ### Breaking Changes From f853214ca091a506f75ef12daeb35a0f2c63c143 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Wed, 22 May 2024 15:40:06 +0200 Subject: [PATCH 09/28] ruleset: infrastructure to test optimized matcher --- capa/rules/__init__.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 00e99fa03..1823ddfec 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1811,7 +1811,7 @@ def _sort_rules_by_index(rule_index_by_rule_name: Dict[str, int], rules: List[Ru """ rules.sort(key=lambda r: rule_index_by_rule_name[r.name]) - def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[FeatureSet, ceng.MatchResults]: + def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[FeatureSet, ceng.MatchResults]: """ Match rules from this ruleset at the given scope against the given features. @@ -1977,6 +1977,29 @@ def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Feat return (augmented_features, results) + def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[FeatureSet, ceng.MatchResults]: + """ + Match rules from this ruleset at the given scope against the given features. 
+ + This wrapper around _match exists so that we can assert it matches precisely + the same as `capa.engine.match`, just faster. + """ + features1, matches1 = self._match(scope, features, addr) + + # enable this branch to demonstrate that the naive matcher agrees with this optimized matcher. + if True: + features2, matches2 = capa.engine.match(self.rules.values(), features, addr) + + for feature, locations in features1.items(): + assert feature in features2 + assert locations == features2[feature] + + for rulename, results in matches1.items(): + assert rulename in matches2 + assert len(results) == len(matches2[rulename]) + + return features1, matches1 + def is_nursery_rule_path(path: Path) -> bool: """ From 9b7fb4e24bc0b9d1c994680bbe199d670c0db6a1 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 3 Jun 2024 09:30:43 +0200 Subject: [PATCH 10/28] pep8 --- capa/rules/__init__.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 1823ddfec..ed009b5c1 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1543,14 +1543,15 @@ def _index_rules_by_feature(scope: Scope, rules: List[Rule], scores_by_rule: Dic """ Index the given rules by their minimal set of most "uncommon" features required to match. - If absolutely necessary, provide the Regex/Substring/Bytes features + If absolutely necessary, provide the Regex/Substring/Bytes features (which are not hashable and require a scan) that have to match, too. """ rules_by_feature: Dict[Feature, Set[str]] = collections.defaultdict(set) def rec( - rule_name: str, node: Union[Feature, Statement], + rule_name: str, + node: Union[Feature, Statement], # closure over: scores_by_rule ) -> Optional[Tuple[int, Set[Feature]]]: """ @@ -1981,7 +1982,7 @@ def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Feat """ Match rules from this ruleset at the given scope against the given features. 
- This wrapper around _match exists so that we can assert it matches precisely + This wrapper around _match exists so that we can assert it matches precisely the same as `capa.engine.match`, just faster. """ features1, matches1 = self._match(scope, features, addr) From e8ef897a1f87d852faff9a471d5c32cf0a8bcf72 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 3 Jun 2024 09:51:00 +0200 Subject: [PATCH 11/28] linters --- capa/features/common.py | 3 ++- capa/rules/__init__.py | 40 +++++++++++++++++++++++----------------- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/capa/features/common.py b/capa/features/common.py index 5f0ca7941..cde91d1b9 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -385,11 +385,12 @@ def __init__(self, value: bytes, description=None): self.value = value def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True): + assert isinstance(self.value, bytes) + capa.perf.counters["evaluate.feature"] += 1 capa.perf.counters["evaluate.feature.bytes"] += 1 capa.perf.counters["evaluate.feature.bytes." 
+ str(len(self.value))] += 1 - assert isinstance(self.value, bytes) for feature, locations in features.items(): if not isinstance(feature, (Bytes,)): continue diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index ed009b5c1..1718aff5e 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -28,7 +28,7 @@ # https://github.com/python/mypy/issues/1153 from backports.functools_lru_cache import lru_cache # type: ignore -from typing import Any, Set, Dict, List, Tuple, Union, Callable, Iterator, Optional +from typing import Any, Set, Dict, List, Tuple, Union, Callable, Iterator, Optional, cast from dataclasses import asdict, dataclass import yaml @@ -1382,7 +1382,7 @@ def __init__( self.rules_by_scope = {scope: self._get_rules_for_scope(rules, scope) for scope in scopes} # unstable - scores_by_rule = {} + scores_by_rule: Dict[str, int] = {} self._feature_indexes_by_scopes = { scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope], scores_by_rule) for scope in scopes } @@ -1444,6 +1444,7 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe # If present, other rule must match before this one, in same scope. # Use score from that rule, which will have already been processed due to topological sorting. # Otherwise, use a default score of 5. + assert isinstance(node.value, str) return scores_by_rule.get(node.value, 5) elif isinstance(node, (capa.features.insn.Number, capa.features.insn.OperandNumber)): @@ -1526,20 +1527,21 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe # bytes: 0 }[C] + # unstable @dataclass - class RuleFeatureIndex: + class _RuleFeatureIndex: # Mapping from hashable feature to a list of rules that might have this feature. - rules_by_feature: Dict[Feature, Set[str]] = dataclasses.field(default=dict) + rules_by_feature: Dict[Feature, Set[str]] # Mapping from rule name to list of Regex/Substring features that have to match. 
# All these features will be evaluated whenever a String feature is encountered. - string_rules: Dict[str, List[Feature]] = dataclasses.field(default=dict) + string_rules: Dict[str, List[Feature]] # Mapping from rule name to list of Bytes features that have to match. # All these features will be evaluated whenever a Bytes feature is encountered. - bytes_rules: Dict[str, List[Feature]] = dataclasses.field(default=dict) + bytes_rules: Dict[str, List[Feature]] # unstable @staticmethod - def _index_rules_by_feature(scope: Scope, rules: List[Rule], scores_by_rule: Dict[str, int]) -> RuleFeatureIndex: + def _index_rules_by_feature(scope: Scope, rules: List[Rule], scores_by_rule: Dict[str, int]) -> _RuleFeatureIndex: """ Index the given rules by their minimal set of most "uncommon" features required to match. @@ -1671,8 +1673,8 @@ def and_score_key(item): item = rec(rule_name, child) assert item is not None, "can't index OR branch" - score, _features = item - min_score = min(min_score, score) + _score, _features = item + min_score = min(min_score, _score) features.update(_features) return min_score, features @@ -1714,10 +1716,10 @@ def and_score_key(item): logger.debug(" : [%d] %s", RuleSet._score_feature(scores_by_rule, feature), feature) if string_features: - string_rules[rule_name] = string_features + string_rules[rule_name] = cast(List[Feature], string_features) if bytes_features: - bytes_rules[rule_name] = bytes_features + bytes_rules[rule_name] = cast(List[Feature], bytes_features) for feature in hashable_features: rules_by_feature[feature].add(rule_name) @@ -1731,7 +1733,7 @@ def and_score_key(item): "indexing: %d scanning string features, %d scanning bytes features", len(string_rules), len(bytes_rules) ) - return RuleSet.RuleFeatureIndex(rules_by_feature, string_rules, bytes_rules) + return RuleSet._RuleFeatureIndex(rules_by_feature, string_rules, bytes_rules) @staticmethod def _get_rules_for_scope(rules, scope) -> List[Rule]: @@ -1820,7 +1822,7 @@ def 
_match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Fea It uses its knowledge of all the rules to evaluate a minimal set of candidate rules for the given features. """ - feature_index: RuleSet.RuleFeatureIndex = self._feature_indexes_by_scopes[scope] + feature_index: RuleSet._RuleFeatureIndex = self._feature_indexes_by_scopes[scope] rules: List[Rule] = self.rules_by_scope[scope] # Topologic location of rule given its name. # That is, rules with a lower index should be evaluated first, since their dependencies @@ -1978,18 +1980,22 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Fea return (augmented_features, results) - def match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[FeatureSet, ceng.MatchResults]: + def match( + self, scope: Scope, features: FeatureSet, addr: Address, paranoid=False + ) -> Tuple[FeatureSet, ceng.MatchResults]: """ Match rules from this ruleset at the given scope against the given features. This wrapper around _match exists so that we can assert it matches precisely the same as `capa.engine.match`, just faster. + + Args: + paranoid: when true, demonstrate that the naive matcher agrees with this optimized matcher (much slower!). """ features1, matches1 = self._match(scope, features, addr) - # enable this branch to demonstrate that the naive matcher agrees with this optimized matcher. 
- if True: - features2, matches2 = capa.engine.match(self.rules.values(), features, addr) + if paranoid: + features2, matches2 = capa.engine.match(list(self.rules.values()), features, addr) for feature, locations in features1.items(): assert feature in features2 From e49d47de41ad6c0e62b3951f28a0e57ec21a6011 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 3 Jun 2024 10:00:43 +0200 Subject: [PATCH 12/28] rules: match: handle namespace match statements --- capa/engine.py | 15 ++++++++++----- capa/rules/__init__.py | 21 ++++++++++++++------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/capa/engine.py b/capa/engine.py index 649d0367c..25c26cb96 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -270,6 +270,14 @@ def evaluate(self, features: FeatureSet, short_circuit=True): MatchResults = Mapping[str, List[Tuple[Address, Result]]] +def get_rule_namespaces(rule: "capa.rules.Rule") -> Iterator[str]: + namespace = rule.meta.get("namespace") + if namespace: + while namespace: + yield namespace + namespace, _, _ = namespace.rpartition("/") + + def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations: Iterable[Address]): """ record into the given featureset that the given rule matched at the given locations. @@ -280,11 +288,8 @@ def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations: updates `features` in-place. doesn't modify the remaining arguments. 
""" features[capa.features.common.MatchedRule(rule.name)].update(locations) - namespace = rule.meta.get("namespace") - if namespace: - while namespace: - features[capa.features.common.MatchedRule(namespace)].update(locations) - namespace, _, _ = namespace.rpartition("/") + for namespace in get_rule_namespaces(rule): + features[capa.features.common.MatchedRule(namespace)].update(locations) def match(rules: List["capa.rules.Rule"], features: FeatureSet, addr: Address) -> Tuple[FeatureSet, MatchResults]: diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 1718aff5e..2a5dd8a7a 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -15,7 +15,6 @@ import logging import binascii import collections -import dataclasses from enum import Enum from pathlib import Path @@ -1968,15 +1967,23 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Fea ceng.index_rule_matches(augmented_features, rule, [addr]) - # Its possible that we're relying on a MatchedRule feature to be the + # Its possible that we're relying on a MatchedRule (or namespace) feature to be the # uncommon feature used to filter other rules. So, extend the candidate # rules with any of these dependencies. If we find any, also ensure they're # evaluated in the correct topologic order, so that further dependencies work. 
- new_candidates = feature_index.rules_by_feature.get(capa.features.common.MatchedRule(rule.name), ()) - if new_candidates: - candidate_rule_names.update(new_candidates) - candidate_rules.extend([self.rules[rule_name] for rule_name in new_candidates]) - RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules) + new_features = [capa.features.common.MatchedRule(rule.name)] + for namespace in ceng.get_rule_namespaces(rule): + new_features.append(capa.features.common.MatchedRule(namespace)) + + if new_features: + new_candidates: List[str] = [] + for new_feature in new_features: + new_candidates.extend(feature_index.rules_by_feature.get(new_feature, ())) + + if new_candidates: + candidate_rule_names.update(new_candidates) + candidate_rules.extend([self.rules[rule_name] for rule_name in new_candidates]) + RuleSet._sort_rules_by_index(rule_index_by_rule_name, candidate_rules) return (augmented_features, results) From a4f4f0bef38aec42ba1108c05496763fc6a42fde Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 3 Jun 2024 10:15:20 +0200 Subject: [PATCH 13/28] rules: more tests for logic edge cases --- capa/rules/__init__.py | 6 ++++- tests/test_match.py | 61 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 2a5dd8a7a..072d8eff1 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1579,11 +1579,15 @@ def rec( # and this is checked by the linter, return None - elif isinstance(node, (ceng.Range)) and node.min == 0: + elif isinstance(node, (ceng.Range)) and node.min == 0 and node.max != 0: # `count(foo): 0 or more` is just like an optional block, # because the min is 0, this subtree *can* match just about any feature. return None + elif isinstance(node, (ceng.Range)) and node.min == 0 and node.max == 0: + # `count(foo): 0` is like a not block, which we don't index. 
+ return None + elif isinstance(node, capa.features.common.Feature): return (RuleSet._score_feature(scores_by_rule, node), {node}) diff --git a/tests/test_match.py b/tests/test_match.py index 07af33d78..4aa6db305 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -5,9 +5,10 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. - import textwrap +import pytest + import capa.rules import capa.engine import capa.features.insn @@ -130,22 +131,29 @@ def test_match_range_exact_zero(): static: function dynamic: process features: - - count(number(100)): 0 + - and: + - count(number(100)): 0 + + # we can't have `count(foo): 0` at the top level, + # since we don't support top level NOT statements. + # so we have this additional trivial feature. + - mnemonic: mov + """ ) r = capa.rules.Rule.from_yaml(rule) # feature isn't indexed - good. - _, matches = match([r], {}, 0x0) + _, matches = match([r], {capa.features.insn.Mnemonic("mov"): {}}, 0x0) assert "test rule" in matches # feature is indexed, but no matches. # i don't think we should ever really have this case, but good to check anyways. 
- _, matches = match([r], {capa.features.insn.Number(100): {}}, 0x0) + _, matches = match([r], {capa.features.insn.Number(100): {}, capa.features.insn.Mnemonic("mov"): {}}, 0x0) assert "test rule" in matches # too many matches - _, matches = match([r], {capa.features.insn.Number(100): {1}}, 0x0) + _, matches = match([r], {capa.features.insn.Number(100): {1}, capa.features.insn.Mnemonic("mov"): {1}}, 0x0) assert "test rule" not in matches @@ -159,21 +167,27 @@ def test_match_range_with_zero(): static: function dynamic: process features: - - count(number(100)): (0, 1) + - and: + - count(number(100)): (0, 1) + + # we can't have `count(foo): 0` at the top level, + # since we don't support top level NOT statements. + # so we have this additional trivial feature. + - mnemonic: mov """ ) r = capa.rules.Rule.from_yaml(rule) # ok - _, matches = match([r], {}, 0x0) + _, matches = match([r], {capa.features.insn.Mnemonic("mov"): {}}, 0x0) assert "test rule" in matches - _, matches = match([r], {capa.features.insn.Number(100): {}}, 0x0) + _, matches = match([r], {capa.features.insn.Number(100): {}, capa.features.insn.Mnemonic("mov"): {}}, 0x0) assert "test rule" in matches - _, matches = match([r], {capa.features.insn.Number(100): {1}}, 0x0) + _, matches = match([r], {capa.features.insn.Number(100): {1}, capa.features.insn.Mnemonic("mov"): {1}}, 0x0) assert "test rule" in matches # too many matches - _, matches = match([r], {capa.features.insn.Number(100): {1, 2}}, 0x0) + _, matches = match([r], {capa.features.insn.Number(100): {1, 2}, capa.features.insn.Mnemonic("mov"): {1, 2}}, 0x0) assert "test rule" not in matches @@ -551,7 +565,8 @@ def test_match_regex_values_always_string(): assert capa.features.common.MatchedRule("test rule") in features -def test_match_not(): +@pytest.mark.xfail(reason="can't have top level NOT") +def test_match_only_not(): rule = textwrap.dedent( """ rule: @@ -572,6 +587,30 @@ def test_match_not(): assert "test rule" in matches +def test_match_not(): 
+ rule = textwrap.dedent( + """ + rule: + meta: + name: test rule + scopes: + static: function + dynamic: process + namespace: testns1/testns2 + features: + - and: + - mnemonic: mov + - not: + - number: 99 + """ + ) + r = capa.rules.Rule.from_yaml(rule) + + _, matches = match([r], {capa.features.insn.Number(100): {1, 2}, capa.features.insn.Mnemonic("mov"): {1, 2}}, 0x0) + assert "test rule" in matches + + +@pytest.mark.xfail(reason="can't have nested NOT") def test_match_not_not(): rule = textwrap.dedent( """ From bff7f0a3ec66484c1301431453216886bb983a15 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 3 Jun 2024 10:25:42 +0200 Subject: [PATCH 14/28] rules: match paranoid true --- capa/rules/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 072d8eff1..69741a9d9 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1992,7 +1992,7 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Fea return (augmented_features, results) def match( - self, scope: Scope, features: FeatureSet, addr: Address, paranoid=False + self, scope: Scope, features: FeatureSet, addr: Address, paranoid=True ) -> Tuple[FeatureSet, ceng.MatchResults]: """ Match rules from this ruleset at the given scope against the given features. From d20f040d97d6fe7df0dad8ef1ed0bc724a04c20e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 3 Jun 2024 10:34:51 +0200 Subject: [PATCH 15/28] rules: document logic edge cases --- capa/rules/__init__.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 69741a9d9..3937c249d 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -2000,6 +2000,14 @@ def match( This wrapper around _match exists so that we can assert it matches precisely the same as `capa.engine.match`, just faster. 
+ This matcher does not handle some edge cases: - top level NOT statements - also top level counted features with zero occurrences, like: `count(mnemonic(mov)): 0` - nested NOT statements (NOT: NOT: foo) + We should discourage/forbid these constructs from our rules and add lints for them. TODO(williballenthin): add lints for logic edge cases + Args: paranoid: when true, demonstrate that the naive matcher agrees with this optimized matcher (much slower!). """ From a7e24e6c784d278bdfe366f6226d7d677fe72f35 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 3 Jun 2024 16:36:25 +0200 Subject: [PATCH 16/28] pep8 --- capa/features/extractors/binexport2/__init__.py | 4 +++- capa/features/extractors/binexport2/basicblock.py | 1 + capa/features/extractors/binexport2/extractor.py | 4 +++- capa/features/extractors/binexport2/insn.py | 4 +++- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/capa/features/extractors/binexport2/__init__.py b/capa/features/extractors/binexport2/__init__.py index ba488cd86..a09ca2a52 100644 --- a/capa/features/extractors/binexport2/__init__.py +++ b/capa/features/extractors/binexport2/__init__.py @@ -261,7 +261,9 @@ def _find_base_address(self): def _compute_thunks(self): for addr, idx in self.idx.vertex_index_by_address.items(): vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[idx] - if not capa.features.extractors.binexport2.helpers.is_vertex_type(vertex, BinExport2.CallGraph.Vertex.Type.THUNK): + if not capa.features.extractors.binexport2.helpers.is_vertex_type( + vertex, BinExport2.CallGraph.Vertex.Type.THUNK + ): continue curr_idx: int = idx diff --git a/capa/features/extractors/binexport2/basicblock.py b/capa/features/extractors/binexport2/basicblock.py index 5d7398aa1..bcb7977b4 100644 --- a/capa/features/extractors/binexport2/basicblock.py +++ b/capa/features/extractors/binexport2/basicblock.py @@ -13,6 +13,7 @@ from capa.features.basicblock import BasicBlock from
capa.features.extractors.binexport2 import FunctionContext, BasicBlockContext from capa.features.extractors.base_extractor import BBHandle, FunctionHandle +from capa.features.extractors.binexport2.binexport2_pb2 import BinExport2 def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/binexport2/extractor.py b/capa/features/extractors/binexport2/extractor.py index 6e1e4c633..1c3c4d393 100644 --- a/capa/features/extractors/binexport2/extractor.py +++ b/capa/features/extractors/binexport2/extractor.py @@ -75,7 +75,9 @@ def get_functions(self) -> Iterator[FunctionHandle]: be2_vertex: BinExport2.CallGraph.Vertex = self.be2.call_graph.vertex[vertex_idx] # skip thunks - if capa.features.extractors.binexport2.helpers.is_vertex_type(be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK): + if capa.features.extractors.binexport2.helpers.is_vertex_type( + be2_vertex, BinExport2.CallGraph.Vertex.Type.THUNK + ): continue yield FunctionHandle( diff --git a/capa/features/extractors/binexport2/insn.py b/capa/features/extractors/binexport2/insn.py index 13b067497..d9c758fef 100644 --- a/capa/features/extractors/binexport2/insn.py +++ b/capa/features/extractors/binexport2/insn.py @@ -54,7 +54,9 @@ def extract_insn_api_features(fh: FunctionHandle, _bbh: BBHandle, ih: InsnHandle vertex_idx: int = be2_index.vertex_index_by_address[addr] vertex: BinExport2.CallGraph.Vertex = be2.call_graph.vertex[vertex_idx] - if not capa.features.extractors.binexport2.helpers.is_vertex_type(vertex, BinExport2.CallGraph.Vertex.Type.IMPORTED): + if not capa.features.extractors.binexport2.helpers.is_vertex_type( + vertex, BinExport2.CallGraph.Vertex.Type.IMPORTED + ): continue if not vertex.HasField("mangled_name"): From a66524ae458a6a15e391d738abc66dcfdb207614 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 3 Jun 2024 16:36:33 +0200 Subject: [PATCH 17/28] rules: match: better debug paranoid matching --- 
capa/rules/__init__.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 3937c249d..010c5c6fb 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -2011,20 +2011,27 @@ def match( Args: paranoid: when true, demonstrate that the naive matcher agrees with this optimized matcher (much slower!). """ - features1, matches1 = self._match(scope, features, addr) + features, matches = self._match(scope, features, addr) if paranoid: - features2, matches2 = capa.engine.match(list(self.rules.values()), features, addr) + rules: List[Rule] = self.rules_by_scope[scope] + paranoid_features, paranoid_matches = capa.engine.match(rules, features, addr) - for feature, locations in features1.items(): - assert feature in features2 - assert locations == features2[feature] + if features != paranoid_features: + logger.warning("paranoid: %s: %s", scope, addr) + for feature in sorted(set(features.keys()) & set(paranoid_features.keys())): + logger.warning("paranoid: %s", feature) - for rulename, results in matches1.items(): - assert rulename in matches2 - assert len(results) == len(matches2[rulename]) + for feature in sorted(set(features.keys()) - set(paranoid_features.keys())): + logger.warning("paranoid: + %s", feature) - return features1, matches1 + for feature in sorted(set(paranoid_features.keys()) - set(features.keys())): + logger.warning("paranoid: - %s", feature) + + assert features == paranoid_features + assert set(matches.keys()) == set(paranoid_matches.keys()) + + return features, matches def is_nursery_rule_path(path: Path) -> bool: From 61d01bb0e9c16a9a4d2c5d14ed58090cdcf233dd Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 3 Jun 2024 16:52:05 +0200 Subject: [PATCH 18/28] rules: matcher: more doc --- capa/rules/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 
010c5c6fb..6e5b5262a 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -2009,7 +2009,8 @@ def match( TODO(williballenthin): add lints for logic edge cases Args: - paranoid: when true, demonstrate that the naive matcher agrees with this optimized matcher (much slower!). + paranoid: when true, demonstrate that the naive matcher agrees with this + optimized matcher (much slower! around 10x slower). """ features, matches = self._match(scope, features, addr) From 62c44521f0944c6b481025c5066da6ce67a8c1be Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 3 Jun 2024 17:11:33 +0200 Subject: [PATCH 19/28] rules: match: disable paranoid mode by default --- capa/rules/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 6e5b5262a..ae7a30e02 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1992,7 +1992,7 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Fea return (augmented_features, results) def match( - self, scope: Scope, features: FeatureSet, addr: Address, paranoid=True + self, scope: Scope, features: FeatureSet, addr: Address, paranoid=False ) -> Tuple[FeatureSet, ceng.MatchResults]: """ Match rules from this ruleset at the given scope against the given features. From 8ccae6e0274725489685e4ec37571fc892cd157b Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 3 Jun 2024 17:34:23 +0200 Subject: [PATCH 20/28] add tests demonstrating optimized matching behavior --- capa/rules/__init__.py | 2 + tests/test_match.py | 89 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index ae7a30e02..909a0d708 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1856,6 +1856,8 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Fea # feature frequently whenever a string/bytes feature is encountered. 
Its slow, but we can't # get around it. Reducing our reliance on regex/bytes feature and/or finding a way to # index these can futher improve performance. + # + # See the corresponding unstable tests in `test_match.py::test_index_features_*`. # Find all the rules that could match the given feature set. # Ideally we want this set to be as small and focused as possible, diff --git a/tests/test_match.py b/tests/test_match.py index 4aa6db305..24dc89a6a 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -788,3 +788,92 @@ def test_match_os_any(): 0x0, ) assert "test rule" in matches + + +def test_index_features_and_unstable(): + rule = textwrap.dedent( + """ + rule: + meta: + name: test rule + scopes: + static: function + dynamic: process + features: + - and: + - mnemonic: mov + - api: CreateFileW + """ + ) + r = capa.rules.Rule.from_yaml(rule) + rr = capa.rules.RuleSet([r]) + index: capa.rules.RuleSet._RuleFeatureIndex = rr._feature_indexes_by_scopes[capa.rules.Scope.FUNCTION] + + # there's a single rule, and its indexed by a single feature + assert len(index.rules_by_feature) == 1 + # and we index by the more uncommon API feature, not the common mnemonic feature + assert capa.features.insn.API("CreateFileW") in index.rules_by_feature + + assert not index.string_rules + assert not index.bytes_rules + + +def test_index_features_or_unstable(): + rule = textwrap.dedent( + """ + rule: + meta: + name: test rule + scopes: + static: function + dynamic: process + features: + - or: + - mnemonic: mov + - api: CreateFileW + """ + ) + r = capa.rules.Rule.from_yaml(rule) + rr = capa.rules.RuleSet([r]) + index: capa.rules.RuleSet._RuleFeatureIndex = rr._feature_indexes_by_scopes[capa.rules.Scope.FUNCTION] + + # there's a single rule, and its indexed by both features, + # because they fall under the single root OR node. 
+ assert len(index.rules_by_feature) == 2 + assert capa.features.insn.API("CreateFileW") in index.rules_by_feature + assert capa.features.insn.Mnemonic("mov") in index.rules_by_feature + + assert not index.string_rules + assert not index.bytes_rules + + +def test_index_features_nested_unstable(): + rule = textwrap.dedent( + """ + rule: + meta: + name: test rule + scopes: + static: function + dynamic: process + features: + - and: + - mnemonic: mov + - or: + - api: CreateFileW + - string: foo + """ + ) + r = capa.rules.Rule.from_yaml(rule) + rr = capa.rules.RuleSet([r]) + index: capa.rules.RuleSet._RuleFeatureIndex = rr._feature_indexes_by_scopes[capa.rules.Scope.FUNCTION] + + # there's a single rule, and its indexed by the two uncommon features, + # not the single common feature. + assert len(index.rules_by_feature) == 2 + assert capa.features.insn.API("CreateFileW") in index.rules_by_feature + assert capa.features.common.String("foo") in index.rules_by_feature + assert capa.features.insn.Mnemonic("mov") not in index.rules_by_feature + + assert not index.string_rules + assert not index.bytes_rules From 12a78f395ba14717947b981af1beb4da51e19554 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 3 Jun 2024 21:29:00 +0200 Subject: [PATCH 21/28] rules: match: remove inline closure comment Co-authored-by: Moritz --- capa/rules/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 909a0d708..ec88982fe 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1553,7 +1553,6 @@ def _index_rules_by_feature(scope: Scope, rules: List[Rule], scores_by_rule: Dic def rec( rule_name: str, node: Union[Feature, Statement], - # closure over: scores_by_rule ) -> Optional[Tuple[int, Set[Feature]]]: """ Walk through a rule's logic tree, picking the features to use for indexing, From 960ee864b9daf116e3040b4bf06ce947158b04aa Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Mon, 3 Jun 2024 21:33:45 +0200 
Subject: [PATCH 22/28] typo Co-authored-by: Mike Hunhoff --- capa/rules/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index ec88982fe..2a2530ca1 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1456,7 +1456,7 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe if 0xFFFF_FF00 <= v <= 0xFFFF_FFFF: # Numbers close to u32::max_int are also probably pretty common, - # like signed numbers closed to 0 that are stored as unsigned ints. + # like signed numbers close to 0 that are stored as unsigned ints. return 3 if 0xFFFF_FFFF_FFFF_FF00 <= v <= 0xFFFF_FFFF_FFFF_FFFF: From e21a70f0be569af15a273e71c936bc6efb05d640 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 4 Jun 2024 12:56:33 +0200 Subject: [PATCH 23/28] rules: clarify the term "unstable" with longer comments --- capa/rules/__init__.py | 10 +++++----- tests/test_match.py | 3 +++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 909a0d708..43b0ab8bc 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1380,7 +1380,7 @@ def __init__( self.rules_by_namespace = index_rules_by_namespace(rules) self.rules_by_scope = {scope: self._get_rules_for_scope(rules, scope) for scope in scopes} - # unstable + # these structures are unstable and may change before the next major release. scores_by_rule: Dict[str, int] = {} self._feature_indexes_by_scopes = { scope: self._index_rules_by_feature(scope, self.rules_by_scope[scope], scores_by_rule) for scope in scopes @@ -1423,7 +1423,7 @@ def __getitem__(self, rulename): def __contains__(self, rulename): return rulename in self.rules - # unstable + # this routine is unstable and may change before the next major release. 
@staticmethod def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Feature) -> int: """ @@ -1526,7 +1526,7 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe # bytes: 0 }[C] - # unstable + # this class is unstable and may change before the next major release. @dataclass class _RuleFeatureIndex: # Mapping from hashable feature to a list of rules that might have this feature. @@ -1538,7 +1538,7 @@ class _RuleFeatureIndex: # All these features will be evaluated whenever a Bytes feature is encountered. bytes_rules: Dict[str, List[Feature]] - # unstable + # this routine is unstable and may change before the next major release. @staticmethod def _index_rules_by_feature(scope: Scope, rules: List[Rule], scores_by_rule: Dict[str, int]) -> _RuleFeatureIndex: """ @@ -1806,7 +1806,7 @@ def filter_rules_by_meta(self, tag: str) -> "RuleSet": break return RuleSet(list(rules_filtered)) - # unstable + # this routine is unstable and may change before the next major release. @staticmethod def _sort_rules_by_index(rule_index_by_rule_name: Dict[str, int], rules: List[Rule]): """ diff --git a/tests/test_match.py b/tests/test_match.py index 24dc89a6a..d621853c7 100644 --- a/tests/test_match.py +++ b/tests/test_match.py @@ -790,6 +790,7 @@ def test_match_os_any(): assert "test rule" in matches +# this test demonstrates the behavior of unstable features that may change before the next major release. def test_index_features_and_unstable(): rule = textwrap.dedent( """ @@ -818,6 +819,7 @@ def test_index_features_and_unstable(): assert not index.bytes_rules +# this test demonstrates the behavior of unstable features that may change before the next major release. def test_index_features_or_unstable(): rule = textwrap.dedent( """ @@ -847,6 +849,7 @@ def test_index_features_or_unstable(): assert not index.bytes_rules +# this test demonstrates the behavior of unstable features that may change before the next major release. 
def test_index_features_nested_unstable(): rule = textwrap.dedent( """ From 457cfe018053f224af9ad6681e767aee1268fa6c Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 4 Jun 2024 13:17:48 +0200 Subject: [PATCH 24/28] rules: more comments describing how features are scored --- capa/rules/__init__.py | 47 ++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index f7b3a7972..b5671416c 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1427,8 +1427,20 @@ def __contains__(self, rulename): @staticmethod def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Feature) -> int: """ - Score the given feature by how "uncommon" it is, where a higher score is more uncommon. + Score the given feature by how "uncommon" we think it will be. + Features that we expect to be very selective (ie. uniquely identify a rule and be required to match), + or "uncommon", should get a high score. Features that are not good for indexing will have a low score, or 0. + + The range of values doesn't really matter, but here we use 0-10, where + - 10 is very uncommon, very selective, good for indexing a rule, and + - 0 is a very common, not selective, bad for indexing a rule. + + You shouldn't try to interpret the scores, beyond to compare features to pick one or the other. + + Today, these scores are assigned manually, by the capa devs, who use their intuition and experience. + We *could* do a large scale analysis of all features emitted by capa across many samples to + make this more data driven. If the current approach doesn't work well, consider that. """ # @@ -1440,11 +1452,14 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe node, capa.features.common.MatchedRule, ): - # If present, other rule must match before this one, in same scope. 
- # Use score from that rule, which will have already been processed due to topological sorting. - # Otherwise, use a default score of 5. - assert isinstance(node.value, str) - return scores_by_rule.get(node.value, 5) + # The other rule must match before this one, in same scope or smaller. + # Because we process the rules small->large scope and topologically, + # then we can rely on dependencies being processed first. + # + # If logic changes and you see issues here, ensure that `scores_by_rule` is correctly provided. + rule_name = node.value + assert isinstance(rule_name, str) + return scores_by_rule[rule_name] elif isinstance(node, (capa.features.insn.Number, capa.features.insn.OperandNumber)): v = node.value @@ -1472,6 +1487,13 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe C = node.__class__ return { + # The range of values doesn't really matter, but here we use 0-10, where + # - 10 is very uncommon, very selective, good for indexing a rule, and + # - 0 is a very common, not selective, bad for indexing a rule. + # + # You shouldn't try to interpret the scores, beyond to compare features to pick one or the other. + + # ----------------------------------------------------------------- # # Very uncommon features that are probably very selective in capa's domain. # When possible, we want rules to be indexed by these features. 
@@ -1479,7 +1501,7 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe capa.features.common.String: 9, capa.features.insn.API: 8, capa.features.file.Export: 7, - # "uncommon numbers": 7 + # "uncommon numbers": 7 (placeholder for logic above) # # ----------------------------------------------------------------- # @@ -1492,7 +1514,6 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe capa.features.file.Import: 5, capa.features.file.Section: 5, capa.features.file.FunctionName: 5, - # default MatchedRule: 5 # # ----------------------------------------------------------------- # @@ -1501,11 +1522,11 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe capa.features.common.Characteristic: 4, capa.features.insn.Offset: 4, capa.features.insn.OperandOffset: 4, - # "common numbers": 3 + # "common numbers": 3 (placeholder for logic above) # # ----------------------------------------------------------------- # - # Very common features, which we'd only prefer to non-hashable features, like Regex/Substring/Bytes. + # Very common features, which we'd only prefer instead of non-hashable features, like Regex/Substring/Bytes. # capa.features.insn.Mnemonic: 2, capa.features.basicblock.BasicBlock: 1, @@ -1521,9 +1542,9 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe # # Non-hashable features, which will require a scan to evaluate, and are therefore quite expensive. # - # substring: 0 - # regex: 0 - # bytes: 0 + # substring: 0 (placeholder for logic above) + # regex: 0 (placeholder for logic above) + # bytes: 0 (placeholder for logic above) }[C] # this class is unstable and may change before the next major release. 
From b34667ff5e29d0fae413fa529cddd7488d25da25 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Tue, 4 Jun 2024 16:04:57 +0200 Subject: [PATCH 25/28] black --- capa/rules/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index b5671416c..b413f2aab 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1429,7 +1429,7 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe """ Score the given feature by how "uncommon" we think it will be. Features that we expect to be very selective (ie. uniquely identify a rule and be required to match), - or "uncommon", should get a high score. + or "uncommon", should get a high score. Features that are not good for indexing will have a low score, or 0. The range of values doesn't really matter, but here we use 0-10, where @@ -1492,7 +1492,6 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe # - 0 is a very common, not selective, bad for indexing a rule. # # You shouldn't try to interpret the scores, beyond to compare features to pick one or the other. - # ----------------------------------------------------------------- # # Very uncommon features that are probably very selective in capa's domain. From 1f1e142102d530add8059f1b76d509ba6a72a43e Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 6 Jun 2024 09:47:41 +0200 Subject: [PATCH 26/28] rules: match: re-introduce default rule scores --- capa/rules/__init__.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index b413f2aab..bdff39464 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1459,6 +1459,25 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe # If logic changes and you see issues here, ensure that `scores_by_rule` is correctly provided. 
rule_name = node.value assert isinstance(rule_name, str) + + if rule_name not in scores_by_rule: + # Its possible that we haven't scored the rule that is being requested here. + # This means that it won't ever match (because it won't be evaluated before this one). + # Still, we need to provide a default value here. + # So we give it 9, because it won't match, so its very selective. + # + # But how could this dependency not exist? + # Consider a rule that supports both static and dynamic analysis, but also has + # a `instruction: ` block. This block gets translated into a derived rule that only + # matches in static mode. Therefore, when the parent rule is run in dynamic mode, it + # won't be able to find the derived rule. This is the case we have to handle here. + # + # A better solution would be to prune this logic based on static/dynamic mode, but + # that takes more work and isn't in scope of this feature. + # + # See discussion in: https://github.com/mandiant/capa/pull/2080/#discussion_r1624783396 + return 9 + return scores_by_rule[rule_name] elif isinstance(node, (capa.features.insn.Number, capa.features.insn.OperandNumber)): From 347619d35a1a4efec4e9bdbcce98949b0e54cc46 Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 6 Jun 2024 09:48:35 +0200 Subject: [PATCH 27/28] rules: match: remove trace comments --- capa/rules/__init__.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index bdff39464..4b161c30c 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1957,15 +1957,6 @@ def _match(self, scope: Scope, features: FeatureSet, addr: Address) -> Tuple[Fea if wanted_bytes.evaluate(bytes_features): candidate_rule_names.add(rule_name) - # trace - # logger.debug( - # "perf: match: %s: %s: %d features, %d candidate rules", - # scope, - # addr, - # len(features), - # len(candidate_rule_names), - # ) - # No rules can possibly match, so quickly return. 
if not candidate_rule_names: return (features, {}) From b376a8401c82452d7d6ea658da2c15ab61ba74bf Mon Sep 17 00:00:00 2001 From: Willi Ballenthin Date: Thu, 6 Jun 2024 09:49:37 +0200 Subject: [PATCH 28/28] black --- capa/rules/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/rules/__init__.py b/capa/rules/__init__.py index 4b161c30c..0d1bbd3c6 100644 --- a/capa/rules/__init__.py +++ b/capa/rules/__init__.py @@ -1463,7 +1463,7 @@ def _score_feature(scores_by_rule: Dict[str, int], node: capa.features.common.Fe if rule_name not in scores_by_rule: # Its possible that we haven't scored the rule that is being requested here. # This means that it won't ever match (because it won't be evaluated before this one). - # Still, we need to provide a default value here. + # Still, we need to provide a default value here. # So we give it 9, because it won't match, so its very selective. # # But how could this dependency not exist?