From dcf43e444df2e402d8b19150c1cba4681621593f Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Fri, 26 Sep 2025 09:57:58 +1000 Subject: [PATCH 1/2] chore: reduce FPs in whitespace PR by considering ; statement Signed-off-by: Carl Flottmann --- .../pypi_malware_rules/obfuscation.yaml | 8 ++- .../obfuscation/excessive_spacing.py | 7 ++- .../obfuscation/expected_results.json | 60 ++++++++++++++----- 3 files changed, 54 insertions(+), 21 deletions(-) diff --git a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml index 68bd7d54b..1250ce409 100644 --- a/src/macaron/resources/pypi_malware_rules/obfuscation.yaml +++ b/src/macaron/resources/pypi_malware_rules/obfuscation.yaml @@ -319,6 +319,8 @@ rules: languages: - python severity: ERROR - patterns: - - pattern-regex: '[\s]{50,}(\S)+' # The 50 here is the threshold for excessive spacing , more than that is considered obfuscation - - pattern-not-regex: '"""[\s\S]*"""' + pattern-either: # The 50 here is the threshold for excessive spacing; more than that is considered obfuscation + # there is excessive spacing after a ";", marking the end of a statement, then additional code. + - pattern-regex: ;[\s]{50,}(\S)+ + # there is excessive spacing before a ";", and any amount of whitespace before additional code. + - pattern-regex: '[\s]{50,};[\s]*(\S)+' diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/excessive_spacing.py b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/excessive_spacing.py index 22ea38a6f..4f9a77616 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/excessive_spacing.py +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/excessive_spacing.py @@ -20,6 +20,7 @@ def test_function(): """ sys.exit() - # excessive spacing obfuscation - def excessive_spacing_flow(): - print("Hello world!") + # excessive spacing obfuscation. 
The second line here will trigger two detections, which is expected since it matches both patterns. + print("hello"); __import__('os') + print("hi") ; __import__('base64') + print("things") ;__import__('zlib') diff --git a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json index 78b1467a2..008bd1eb6 100644 --- a/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json +++ b/tests/malware_analyzer/pypi/resources/sourcecode_samples/obfuscation/expected_results.json @@ -53,6 +53,21 @@ "start": 44, "end": 44 }, + { + "file": "obfuscation/excessive_spacing.py", + "start": 24, + "end": 24 + }, + { + "file": "obfuscation/excessive_spacing.py", + "start": 25, + "end": 25 + }, + { + "file": "obfuscation/excessive_spacing.py", + "start": 26, + "end": 26 + }, { "file": "obfuscation/inline_imports.py", "start": 23, @@ -105,6 +120,36 @@ } ] }, + "src.macaron.resources.pypi_malware_rules.obfuscation_excessive-spacing": { + "message": "Hidden code after excessive spacing", + "detections": [ + { + "file": "obfuscation/excessive_spacing.py", + "start": 24, + "end": 24 + }, + { + "file": "obfuscation/excessive_spacing.py", + "start": 25, + "end": 25 + }, + { + "file": "obfuscation/excessive_spacing.py", + "start": 25, + "end": 25 + }, + { + "file": "obfuscation/excessive_spacing.py", + "start": 26, + "end": 26 + }, + { + "file": "obfuscation/inline_imports.py", + "start": 27, + "end": 27 + } + ] + }, "src.macaron.resources.pypi_malware_rules.obfuscation_obfuscation-tools": { "message": "Found an indicator of the use of a python code obfuscation tool", "detections": [ @@ -229,21 +274,6 @@ "end": 68 } ] - }, - "src.macaron.resources.pypi_malware_rules.obfuscation_excessive-spacing": { - "message": "Hidden code after excessive spacing", - "detections": [ - { - "file": "obfuscation/excessive_spacing.py", - "start": 24, - 
"end": 25 - }, - { - "file": "obfuscation/inline_imports.py", - "start": 27, - "end": 27 - } - ] } }, "disabled_sourcecode_rule_findings": {} From 33711dd34793cf280be7c696e0c9ff5f2f5d343a Mon Sep 17 00:00:00 2001 From: Carl Flottmann Date: Fri, 26 Sep 2025 11:16:05 +1000 Subject: [PATCH 2/2] docs: updated defaults and readme to clarify how to disable rules Signed-off-by: Carl Flottmann --- src/macaron/config/defaults.ini | 11 ++++++----- src/macaron/malware_analyzer/README.md | 9 +++++++++ 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/macaron/config/defaults.ini b/src/macaron/config/defaults.ini index e1746faa3..03270cd41 100644 --- a/src/macaron/config/defaults.ini +++ b/src/macaron/config/defaults.ini @@ -629,12 +629,13 @@ check_deliverability = True # custom rulesets: this is a collection of user-provided rulesets, living inside the path provided to 'custom_semgrep_rules_path'. # disable default semgrep rulesets here (i.e. all rule IDs in a Semgrep .yaml file) using ruleset names, the name -# without the .yaml prefix. Currently, we disable the exfiltration rulesets by default due to a high false positive rate. -# This list may not contain duplicated elements. Macaron's default ruleset names are all unique. +# without the .yaml extension (e.g. "obfuscation" for "obfuscation.yaml"). Currently, we disable the exfiltration rulesets +# by default due to a high false positive rate. This list may not contain duplicated elements. Macaron's default ruleset +# names are all unique. disabled_default_rulesets = exfiltration -# disable individual rules here (i.e. individual rule IDs inside a Semgrep .yaml file) using rule IDs. You may also -# provide the IDs of your custom semgrep rules here too, as all Semgrep rule IDs must be unique. This list may not contain -# duplicated elements. +# disable individual rules here (i.e. 
individual rule IDs inside a Semgrep .yaml file, specified under the "rules" header in the +# .yaml file, with each rule ID under "- id") using rule IDs. You may also provide the IDs of your custom semgrep rules here too, +# as all Semgrep rule IDs must be unique. This list may not contain duplicated elements. disabled_rules = # absolute path to a directory where a custom set of semgrep rules for source code analysis are stored. These will be included # with Macaron's default rules. The path will be normalised to the OS path type. diff --git a/src/macaron/malware_analyzer/README.md b/src/macaron/malware_analyzer/README.md index d2d5517a9..facd7d987 100644 --- a/src/macaron/malware_analyzer/README.md +++ b/src/macaron/malware_analyzer/README.md @@ -101,6 +101,15 @@ This feature is currently a work in progress, and supports detection of code obf - `custom_semgrep_rules`: supply to this an absolute path to a directory containing custom Semgrep `.yaml` files to be run alongside the default ones. - `disabled_custom_rulesets`: supply to this a comma separated list of the names of custom Semgrep rule files (excluding the `.yaml` extension) to disable all rule IDs in that file. +Here, a "semgrep ruleset" refers to the name of a Semgrep `.yaml` file without the extension. For example, the name of one of the default rulesets is `obfuscation`, as the file name is `obfuscation.yaml`. Disabling all rules in that `.yaml` file would look like this: +``` +disabled_default_rulesets = obfuscation +``` +A "semgrep rule", or "rule ID", refers to an `- id` entry under the `rules:` heading in a Semgrep `.yaml` file. For example, the name of a rule in `obfuscation.yaml` would be `obfuscation_excessive-spacing`, which is the name specified under the `- id` entry for that rule. Disabling it would look like this: +``` +disabled_rules = obfuscation_excessive-spacing +``` + ### Contributing When contributing an analyzer, it must meet the following requirements: