Skip to content

Commit

Permalink
bug 1697051: add crash_report_keys field
Browse files Browse the repository at this point in the history
This adds a field that we can search using contains/does-not-contain and
aggregate on to help us answer questions like:

1. Which crash reports have a file named xyz? For example, which have a
   memory_report?
2. Which crash reports have crash annotation XYZ? For example, which
   have a new crash annotation field we're not indexing yet?

This also adds a processor note with "invalidkeys" to indicate when the
crash report has keys that are malformed.
  • Loading branch information
willkg committed Nov 8, 2021
1 parent 2381325 commit 1a7a443
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 0 deletions.
5 changes: 5 additions & 0 deletions socorro/external/es/super_search_fields.py
Expand Up @@ -627,6 +627,11 @@ def number_field(
"query_type": "number",
"storage_mapping": {"type": "long"},
},
"crash_report_keys": keyword_field(
name="crash_report_keys",
description="Crash annotation keys and dump filenames from the crash report.",
is_protected=False,
),
"bug_1541161": {
"data_validation_type": "str",
"description": (
Expand Down
2 changes: 2 additions & 0 deletions socorro/processor/processor_pipeline.py
Expand Up @@ -25,6 +25,7 @@
)
from socorro.processor.rules.general import (
CPUInfoRule,
CrashReportKeysRule,
DeNoneRule,
DeNullRule,
IdentifierRule,
Expand Down Expand Up @@ -184,6 +185,7 @@ def get_rulesets(self, config):
# The default processing pipeline
"default": [
# fix the raw crash removing null characters and Nones
CrashReportKeysRule(),
DeNullRule(),
DeNoneRule(),
# fix ModuleSignatureInfo if it needs fixing
Expand Down
36 changes: 36 additions & 0 deletions socorro/processor/rules/general.py
Expand Up @@ -2,6 +2,8 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import re

from glom import glom
import markus

Expand Down Expand Up @@ -127,3 +129,37 @@ def action(self, raw_crash, dumps, processed_crash, processor_meta):
processed_crash, "json_dump.system_info.os_ver", default=""
).strip()
processed_crash["os_version"] = os_ver


class CrashReportKeysRule(Rule):
    """Extract crash annotation keys and dump names as ``crash_report_keys``.

    The keys of the raw crash and of the dumps are sanitized, sorted, and
    stored in ``processed_crash["crash_report_keys"]``. When sanitizing drops
    or alters any key, an "invalidkeys" processor note is added.
    """

    # One or more ASCII letters, digits, underscores, or dashes. Anchored with
    # \Z rather than $ because $ also matches just before a trailing newline,
    # which would let a key like "abc\n" through.
    VALID_KEY = re.compile(r"^[a-zA-Z0-9_-]+\Z")

    # Maximum number of characters kept for a valid key
    MAX_KEY_LENGTH = 100

    def sanitize(self, key):
        """Return the sanitized form of a key, or None if the key is invalid.

        :arg key: the crash annotation key or dump name to check

        :returns: the key truncated to ``MAX_KEY_LENGTH`` characters, or None
            if it is not a non-empty string consisting only of ASCII letters,
            digits, underscores, and dashes

        """
        # Non-string keys can't be matched against the pattern; treat them as
        # invalid rather than raising a TypeError
        if not isinstance(key, str) or not self.VALID_KEY.fullmatch(key):
            return None

        return key[: self.MAX_KEY_LENGTH]

    def action(self, raw_crash, dumps, processed_crash, processor_meta):
        """Save the sorted, sanitized key list and note any invalid keys.

        :arg raw_crash: mapping whose keys are the crash annotation names
        :arg dumps: mapping whose keys are the dump names
        :arg processed_crash: mapping that receives ``crash_report_keys``
        :arg processor_meta: mapping with a ``processor_notes`` list that
            gets the "invalidkeys" note appended when applicable

        """
        all_keys = set(raw_crash) | set(dumps)

        # Drop the keys that sanitize() rejected
        sanitized_keys = {
            sanitized for sanitized in map(self.sanitize, all_keys) if sanitized
        }

        processed_crash["crash_report_keys"] = sorted(sanitized_keys)

        # The two sets differ exactly when at least one key was dropped as
        # invalid or was changed by truncation
        if all_keys != sanitized_keys:
            processor_meta["processor_notes"].append(
                "invalidkeys: Crash report contains invalid keys"
            )
67 changes: 67 additions & 0 deletions socorro/unittest/processor/rules/test_general.py
Expand Up @@ -7,6 +7,7 @@

from socorro.processor.rules.general import (
CPUInfoRule,
CrashReportKeysRule,
DeNoneRule,
DeNullRule,
IdentifierRule,
Expand Down Expand Up @@ -328,3 +329,69 @@ def test_stuff_missing(self):

# raw crash should be unchanged
assert raw_crash == {}


class TestCrashReportKeysRule:
    """Tests for CrashReportKeysRule."""

    def test_basic(self):
        """Valid annotation keys and dump names are merged and sorted."""
        raw_crash = {
            "Product": "Firefox",
            "Version": "95.0",
            "ReleaseChannel": "nightly",
            "ipc_channel_error": "ouch",
        }
        dumps = {
            "upload_file_minidump": "fake data",
            "upload_file_minidump_flash1": "fake data",
        }
        processed_crash = {}
        processor_meta = get_basic_processor_meta()

        rule = CrashReportKeysRule()

        rule.act(raw_crash, dumps, processed_crash, processor_meta)

        assert processor_meta["processor_notes"] == []
        assert processed_crash == {
            "crash_report_keys": [
                "Product",
                "ReleaseChannel",
                "Version",
                "ipc_channel_error",
                "upload_file_minidump",
                "upload_file_minidump_flash1",
            ],
        }

    def test_invalid_keys(self):
        """Invalid keys are dropped and an invalidkeys note is added."""
        raw_crash = {
            "Product": "Firefox",
            "abc$def": "bad key",
        }
        dumps = {}
        processed_crash = {}
        processor_meta = get_basic_processor_meta()

        rule = CrashReportKeysRule()

        rule.act(raw_crash, dumps, processed_crash, processor_meta)

        assert processor_meta["processor_notes"] == [
            "invalidkeys: Crash report contains invalid keys"
        ]
        assert processed_crash == {"crash_report_keys": ["Product"]}

    @pytest.mark.parametrize(
        "key, expected",
        [
            ("ipc_channel_error", "ipc_channel_error"),
            ("ReleaseProcess", "ReleaseProcess"),
            ("ContentSandboxWin32kState", "ContentSandboxWin32kState"),
            ("Add-ons", "Add-ons"),
            # Long keys can be valid, but are truncated to 100 characters
            ("a" * 150, "a" * 100),
        ],
    )
    def test_sanitize_valid(self, key, expected):
        rule = CrashReportKeysRule()

        assert rule.sanitize(key) == expected

    @pytest.mark.parametrize(
        "key",
        [
            # Keys have to have at least one character
            "",
            # Spaces aren't valid
            " abc ",
            # Dollar sign isn't valid
            "abc$def",
            # Control characters aren't valid
            "\u0001abc",
            # Non-ASCII characters aren't valid; note that code points above
            # U+FFFF need the 8-digit \U escape
            "\U0001F44D",
        ],
    )
    def test_sanitize_invalid(self, key):
        rule = CrashReportKeysRule()

        assert rule.sanitize(key) is None

0 comments on commit 1a7a443

Please sign in to comment.