Skip to content

Commit

Permalink
Implemented default-include configuration option
Browse files Browse the repository at this point in the history
  • Loading branch information
abyss638 committed Oct 3, 2023
1 parent ac44bb7 commit 221f528
Show file tree
Hide file tree
Showing 5 changed files with 275 additions and 98 deletions.
22 changes: 16 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -624,22 +624,32 @@ The data in each `result` object can then be used for filtering via the `--filte
# Optional description for the filter. If no title is specified, the filter file name is used.
description: Example filter from README.md

# Optional configuration section to override default values.
configuration:
# This option controls whether to include results where a property to check is missing; the default value is true.
default-include: false

# Items in `include` list are interpreted as inclusion filtering rules.
# Items are treated with the OR operator; the filtered results include objects matching any rule.
# Each item can be one rule or a list of rules, in the latter case rules in the list are treated with AND operator - all rules must match.
include:
# The following line includes issues whose author-mail field contains "@microsoft.com" AND found in Java files.
# The following line includes issues whose author-mail property contains "@microsoft.com" AND found in Java files.
# Values with special characters `\:;_()$%^@,` must be enclosed in quotes (single or double):
- author-mail: "@microsoft.com"
locations[*].physicalLocation.artifactLocation.uri: "*.java"
# Instead of a substring, a regular expression can be used, enclosed in "/" characters. Issues whose committer-mail field includes a string matching the regular expression are included. Use ^ and $ to match the whole committer-mail field.
- committer-mail: "/^<myname.*\\.com>$/"
# Instead of a substring, a regular expression can be used, enclosed in "/" characters.
# Issues whose committer-mail property includes a string matching the regular expression are included.
# Use ^ and $ to match the whole committer-mail property.
- committer-mail:
value: "/^<myname.*\\.com>$/"
# Configuration options can be overridden for any rule.
default-include: true

# Lines under `exclude` are interpreted as exclusion filtering rules.
exclude:
# The following line excludes issues whose location is in test Java files with names starting with the "Test" prefix.
- location: "Test*.java"
# The value for the field can be empty, in this case only existence of the field in
# The value for the property can be empty; in this case, only the existence of the property is checked.
- suppression:
```

Expand All @@ -658,7 +668,7 @@ exclude:
Field names must be specified in [JSONPath notation](https://goessner.net/articles/JsonPath/)
accessing data in the [SARIF `result` object](https://docs.oasis-open.org/sarif/sarif/v2.1.0/cs01/sarif-v2.1.0-cs01.html#_Toc16012594).

For commonly used fields the following shortcuts are defined:
For commonly used properties the following shortcuts are defined:
| Shortcut | Full JSONPath |
| -------- | -------- |
| author | properties.blame.author |
Expand All @@ -669,7 +679,7 @@ For commonly used fields the following shortcuts are defined:
| rule | ruleId |
| suppression | suppressions[*].kind |

For the field `uri` (e.g. in `locations[*].physicalLocation.artifactLocation.uri`) file name wildcard characters can be used as it represents a file location:
For the property `uri` (e.g. in `locations[*].physicalLocation.artifactLocation.uri`) file name wildcard characters can be used as it represents a file location:
- `?` - a single occurrence of any character in a directory or file name
- `*` - zero or more occurrences of any character in a directory or file name
- `**` - zero or more occurrences across multiple directory levels
Expand Down
13 changes: 13 additions & 0 deletions sarif/filter/filter_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def __init__(self, filter_description):
self.filter_datetime = None
self.filtered_in_result_count = 0
self.filtered_out_result_count = 0
self.missing_property_count = 0

def reset_counters(self):
"""
Expand All @@ -21,6 +22,7 @@ def reset_counters(self):
self.filter_datetime = datetime.datetime.now()
self.filtered_in_result_count = 0
self.filtered_out_result_count = 0
self.missing_property_count = 0

def add(self, other_filter_stats):
"""
Expand All @@ -35,6 +37,7 @@ def add(self, other_filter_stats):
self.filtered_out_result_count += (
other_filter_stats.filtered_out_result_count
)
self.missing_property_count += other_filter_stats.missing_property_count

def __str__(self):
"""
Expand All @@ -54,6 +57,12 @@ def to_string(self):
f": {self.filtered_out_result_count} filtered out, "
f"{self.filtered_in_result_count} passed the filter"
)
if self.missing_property_count:
ret += (
f", {self.missing_property_count} included by default "
"for lacking data to filter"
)

return ret

def to_json_camel_case(self):
Expand All @@ -65,6 +74,9 @@ def to_json_camel_case(self):
"filter": self.filter_description,
"in": self.filtered_in_result_count,
"out": self.filtered_out_result_count,
"default": {
"noProperty": self.missing_property_count,
},
}


Expand All @@ -79,4 +91,5 @@ def load_filter_stats_from_json(json_data):
ret.rehydrated = True
ret.filtered_in_result_count = json_data.get("in", 0)
ret.filtered_out_result_count = json_data.get("out", 0)
ret.missing_property_count = json_data.get("default", {}).get("noProperty", 0)
return ret
97 changes: 70 additions & 27 deletions sarif/filter/general_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@
import re
from typing import Optional, List

import copy
import jsonpath_ng.ext
import yaml

from sarif.filter.filter_stats import FilterStats, load_filter_stats_from_json

# Commonly used fields can be specified using shortcuts
# Commonly used properties can be specified using shortcuts
# instead of full JSON path
FILTER_SHORTCUTS = {
"author": "properties.blame.author",
Expand All @@ -19,10 +20,15 @@
"suppression": "suppressions[*].kind",
}

# Some fields can have specific shortcuts to make it easier to write filters
# Some properties can have specific shortcuts to make it easier to write filters
# For example a file location can be specified using wildcards
FIELDS_REGEX_SHORTCUTS = {"uri": {"**": ".*", "*": "[^/]*", "?": "."}}

# Default configuration for all filters
DEFAULT_CONFIGURATION = {
"default-include": True,
}


def get_filter_function(filter_spec):
if filter_spec:
Expand All @@ -38,22 +44,22 @@ def get_filter_function(filter_spec):
return lambda value: True


def _convert_glob_to_regex(field_name, field_value_spec):
# skip if field_value_spec is a regex
if field_value_spec and not (
field_value_spec.startswith("/") and field_value_spec.endswith("/")
def _convert_glob_to_regex(property_name, property_value_spec):
# skip if property_value_spec is a regex
if property_value_spec and not (
property_value_spec.startswith("/") and property_value_spec.endswith("/")
):
# get last component of field name
last_component = field_name.split(".")[-1]
# get last component of property name
last_component = property_name.split(".")[-1]
if last_component in FIELDS_REGEX_SHORTCUTS:
shortcuts = FIELDS_REGEX_SHORTCUTS[last_component]
rx = re.compile("|".join(map(re.escape, shortcuts.keys())))
field_value_spec = rx.sub(
lambda match: shortcuts[match.group(0)], field_value_spec
property_value_spec = rx.sub(
lambda match: shortcuts[match.group(0)], property_value_spec
)

return f"/{field_value_spec}/"
return field_value_spec
return f"/{property_value_spec}/"
return property_value_spec


class GeneralFilter:
Expand All @@ -67,8 +73,11 @@ def __init__(self):
self.apply_inclusion_filter = False
self.exclude_filters = {}
self.apply_exclusion_filter = False
self.configuration = DEFAULT_CONFIGURATION

def init_filter(self, filter_description, include_filters, exclude_filters):
def init_filter(
self, filter_description, configuration, include_filters, exclude_filters
):
"""
Initialise the filter with the given filter patterns.
"""
Expand All @@ -77,6 +86,7 @@ def init_filter(self, filter_description, include_filters, exclude_filters):
self.apply_inclusion_filter = len(include_filters) > 0
self.exclude_filters = exclude_filters
self.apply_exclusion_filter = len(exclude_filters) > 0
self.configuration.update(configuration)

def rehydrate_filter_stats(self, dehydrated_filter_stats, filter_datetime):
"""
Expand All @@ -97,29 +107,30 @@ def _filter_append(self, filtered_results: List[dict], result: dict):
# Remove any existing filter log on the result
result.setdefault("properties", {}).pop("filtered", None)

matched_include_filters = []
included_stats = None
if self.apply_inclusion_filter:
matched_include_filters = self._filter_result(result, self.include_filters)
if not matched_include_filters:
included_stats = self._filter_result(result, self.include_filters)
if not included_stats["matchedFilter"]:
return

if self.apply_exclusion_filter:
if self._filter_result(result, self.exclude_filters):
excluded_stats = self._filter_result(result, self.exclude_filters)
if excluded_stats["matchedFilter"]:
self.filter_stats.filtered_out_result_count += 1
return

included = {
"state": "included",
"matchedFilter": matched_include_filters,
}
self.filter_stats.filtered_in_result_count += 1
included["filter"] = self.filter_stats.filter_description
result["properties"]["filtered"] = included
if included_stats["state"] == "included":
self.filter_stats.filtered_in_result_count += 1
else:
self.filter_stats.missing_property_count += 1
included_stats["filter"] = self.filter_stats.filter_description
result["properties"]["filtered"] = included_stats

filtered_results.append(result)

def _filter_result(self, result: dict, filters: List[str]) -> List[dict]:
def _filter_result(self, result: dict, filters: List[str]) -> dict:
matched_filters = []
warnings = []
if filters:
# filters contains rules which treated as OR.
# if any rule matches, the record is selected.
Expand All @@ -130,6 +141,14 @@ def _filter_result(self, result: dict, filters: List[str]) -> List[dict]:
for prop_path, prop_value_spec in filter_spec.items():
resolved_prop_path = FILTER_SHORTCUTS.get(prop_path, prop_path)
jsonpath_expr = jsonpath_ng.ext.parse(resolved_prop_path)
filter_configuration = self.configuration

# if prop_value_spec is a dict, update filter configuration from it
if isinstance(prop_value_spec, dict):
filter_configuration = copy.deepcopy(self.configuration)
filter_configuration.update(prop_value_spec)
# actual value for the filter is in "value" key
prop_value_spec = prop_value_spec.get("value", None)

found_results = jsonpath_expr.find(result)
if found_results:
Expand All @@ -140,11 +159,34 @@ def _filter_result(self, result: dict, filters: List[str]) -> List[dict]:
filter_function = get_filter_function(value_spec)
if filter_function(value):
continue
else:
# property to filter on is not found.
# if "default-include" is true, include the "result" with a warning.
if filter_configuration.get("default-include", True):
warnings.append(
f"Field '{prop_path}' is missing but the result included as default-include is true"
)
continue
matched = False
break
if matched:
matched_filters.append(filter_spec)
return matched_filters
break

stats = {
"state": "included",
"matchedFilter": matched_filters,
}

if warnings:
stats.update(
{
"state": "default",
"warnings": warnings,
}
)

return stats

def filter_results(self, results: List[dict]) -> List[dict]:
"""
Expand Down Expand Up @@ -177,8 +219,9 @@ def load_filter_file(file_path):
with open(file_path, encoding="utf-8") as file_in:
yaml_content = yaml.safe_load(file_in)
filter_description = yaml_content.get("description", file_name)
configuration = yaml_content.get("configuration", {})
include_filters = yaml_content.get("include", {})
exclude_filters = yaml_content.get("exclude", {})
except yaml.YAMLError as error:
raise IOError(f"Cannot read filter file {file_path}") from error
return filter_description, include_filters, exclude_filters
return filter_description, configuration, include_filters, exclude_filters
28 changes: 18 additions & 10 deletions sarif/sarif_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,15 +204,19 @@ def init_default_line_number_1(self):
self._default_line_number = "1"
self._cached_records = None

def init_general_filter(
    self, filter_description, configuration, include_filters, exclude_filters
):
    """
    Set up general filtering. This is applied to all properties in results array in each SARIF file.

    If only inclusion criteria are provided, only issues matching the inclusion criteria are considered.
    If only exclusion criteria are provided, only issues not matching the exclusion criteria are considered.
    If both are provided, only issues matching the inclusion criteria and not matching the
    exclusion criteria are considered.

    :param filter_description: human-readable description of the filter.
    :param configuration: dict of configuration overrides (e.g. "default-include").
    :param include_filters: inclusion rules, treated with OR semantics.
    :param exclude_filters: exclusion rules, treated with OR semantics.
    """
    self._filter.init_filter(
        filter_description, configuration, include_filters, exclude_filters
    )
    # Clear the unfiltered records cached by get_records() above.
    self._cached_records = None

Expand Down Expand Up @@ -420,17 +424,19 @@ def init_default_line_number_1(self):
for run in self.runs:
run.init_default_line_number_1()

def init_general_filter(
    self, filter_description, configuration, include_filters, exclude_filters
):
    """
    Set up general filtering. This is applied to all properties in results array in each SARIF file.

    If only inclusion criteria are provided, only issues matching the inclusion criteria are considered.
    If only exclusion criteria are provided, only issues not matching the exclusion criteria are considered.
    If both are provided, only issues matching the inclusion criteria and not matching the
    exclusion criteria are considered.

    :param filter_description: human-readable description of the filter.
    :param configuration: dict of configuration overrides (e.g. "default-include").
    :param include_filters: inclusion rules, treated with OR semantics.
    :param exclude_filters: exclusion rules, treated with OR semantics.
    """
    # Delegate to every run in this SARIF file.
    for run in self.runs:
        run.init_general_filter(
            filter_description, configuration, include_filters, exclude_filters
        )

def get_abs_file_path(self) -> str:
Expand Down Expand Up @@ -619,21 +625,23 @@ def init_default_line_number_1(self):
for input_file in self.files:
input_file.init_default_line_number_1()

def init_general_filter(
    self, filter_description, configuration, include_filters, exclude_filters
):
    """
    Set up general filtering. This is applied to all properties in results array in each SARIF file.

    If only inclusion criteria are provided, only issues matching the inclusion criteria are considered.
    If only exclusion criteria are provided, only issues not matching the exclusion criteria are considered.
    If both are provided, only issues matching the inclusion criteria and not matching the
    exclusion criteria are considered.

    :param filter_description: human-readable description of the filter.
    :param configuration: dict of configuration overrides (e.g. "default-include").
    :param include_filters: inclusion rules, treated with OR semantics.
    :param exclude_filters: exclusion rules, treated with OR semantics.
    """
    # Propagate to nested directories first, then to the files at this level.
    for subdir in self.subdirs:
        subdir.init_general_filter(
            filter_description, configuration, include_filters, exclude_filters
        )
    for input_file in self.files:
        input_file.init_general_filter(
            filter_description, configuration, include_filters, exclude_filters
        )

def add_dir(self, sarif_file_set):
Expand Down

0 comments on commit 221f528

Please sign in to comment.