Skip to content

Commit

Permalink
Merge pull request #1620 from twobraids/redacted
Browse files Browse the repository at this point in the history
fixes Bug 930324 - added processed_crash redaction system
  • Loading branch information
twobraids committed Oct 25, 2013
2 parents 1ca732a + 038a137 commit 7490cd5
Show file tree
Hide file tree
Showing 20 changed files with 346 additions and 139 deletions.
96 changes: 81 additions & 15 deletions socorro/external/crashstorage_base.py
Expand Up @@ -8,12 +8,56 @@

import sys
import collections
import re

from configman import Namespace, RequiredConfig
from configman.converters import classes_in_namespaces_converter, \
class_converter
from configman.dotdict import DotDict

#==============================================================================
class Redactor(RequiredConfig):
    """This class is the implementation of a functor for in situ redacting
    of sensitive keys from a mapping.  Keys that are to be redacted are placed
    in the configuration under the name 'forbidden_keys' as a single comma
    delimited string.  They may take the form of dotted keys with subkeys.
    For example, "a.b.c" means that the key, "c" is to be redacted from the
    mapping found at a_mapping["a"]["b"].

    Redaction is best effort: a forbidden key that is absent, or whose dotted
    path runs through a value that is not a mapping, is silently skipped."""
    required_config = Namespace()
    required_config.add_option(
        name='forbidden_keys',
        doc='a list of keys not allowed in a redacted processed crash',
        default="url, email, user_id, exploitability,"
                "json_dump.sensitive,"
                "upload_file_minidump_flash1.json_dump.sensitive,"
                "upload_file_minidump_flash2.json_dump.sensitive,"
                "upload_file_minidump_browser.json_dump.sensitive"
    )

    #--------------------------------------------------------------------------
    def __init__(self, config):
        """parameters:
            config - a configuration mapping that offers the comma delimited
                     string 'forbidden_keys' as an attribute"""
        self.config = config
        # empty entries (from a trailing or doubled comma) are dropped so
        # that the empty string is never treated as a key to redact
        self.forbidden_keys = [
            a_key.strip()
            for a_key in self.config.forbidden_keys.split(',')
            if a_key.strip()
        ]

    #--------------------------------------------------------------------------
    def redact(self, a_mapping):
        """this is the function that does the redaction.  The mapping is
        altered in place and nothing is returned.
        parameters:
            a_mapping - the mapping from which the forbidden keys are to be
                        deleted"""
        for a_key in self.forbidden_keys:
            # every component is stripped, including the last one, so that
            # "a . b" addresses the same value as "a.b"
            sub_keys = [a_sub_key.strip() for a_sub_key in a_key.split('.')]
            sub_mapping = a_mapping
            try:
                for a_sub_key in sub_keys[:-1]:  # step through the subkeys
                    sub_mapping = sub_mapping[a_sub_key]
                del sub_mapping[sub_keys[-1]]
            except KeyError:
                pass  # this is okay, our key was already deleted by
                      # another pattern that matched at a higher level
            except (TypeError, AttributeError):
                pass  # an intermediate value is not a mapping (a string, a
                      # list, None, ...), so the dotted path does not exist
                      # and there is nothing to delete

    #--------------------------------------------------------------------------
    def __call__(self, a_mapping):
        """allow an instance to be used as a function: same as 'redact'"""
        self.redact(a_mapping)


#==============================================================================
class CrashIDNotFound(Exception):
Expand All @@ -24,6 +68,11 @@ class CrashIDNotFound(Exception):
class CrashStorageBase(RequiredConfig):
"""the base class for all crash storage classes"""
required_config = Namespace()
required_config.add_option(
name="redactor_class",
doc="the name of the class that implements a 'redact' method",
default=Redactor
)

#--------------------------------------------------------------------------
def __init__(self, config, quit_check_callback=None):
Expand Down Expand Up @@ -55,6 +104,7 @@ def __init__(self, config, quit_check_callback=None):
self.quit_check = lambda: False
self.logger = config.logger
self.exceptions_eligible_for_retry = ()
self.redactor = config.redactor_class(config)

#--------------------------------------------------------------------------
def close(self):
Expand Down Expand Up @@ -153,11 +203,25 @@ def get_raw_dumps_as_files(self, crash_id):

#--------------------------------------------------------------------------
def get_processed(self, crash_id):
"""the default implementation of fetching a processed_crash
"""the default implementation of fetching a processed_crash. This
method should not be overridden in subclasses unless the intent is to
alter the redaction process.
parameters:
crash_id - the id of a processed_crash to fetch"""
processed_crash = self.get_unredacted_processed(crash_id)
self.redactor(processed_crash)
return processed_crash

#--------------------------------------------------------------------------
def get_unredacted_processed(self, crash_id):
"""the implementation of fetching a processed_crash with no redaction
parameters:
crash_id - the id of a processed_crash to fetch"""
raise NotImplementedError("get_processed is not implemented")
raise NotImplementedError(
"get_unredacted_processed is not implemented"
)

#--------------------------------------------------------------------------
def remove(self, crash_id):
Expand Down Expand Up @@ -206,7 +270,7 @@ def get_raw_dumps(self, crash_id):
return {}

#--------------------------------------------------------------------------
def get_processed(self, crash_id):
def get_unredacted_processed(self, crash_id):
"""the default implementation of fetching a processed_crash
parameters:
Expand Down Expand Up @@ -543,15 +607,15 @@ def get_raw_dumps_as_files(self, crash_id):
return self.fallback_store.get_raw_dumps_as_files(crash_id)

#--------------------------------------------------------------------------
def get_processed(self, crash_id):
"""the default implementation of fetching a processed_crash
def get_unredacted_processed(self, crash_id):
"""fetch an unredacted processed_crash
parameters:
crash_id - the id of a processed_crash to fetch"""
try:
return self.primary_store.get_processed(crash_id)
return self.primary_store.get_unredacted_processed(crash_id)
except CrashIDNotFound:
return self.fallback_store.get_processed(crash_id)
return self.fallback_store.get_unredacted_processed(crash_id)

#--------------------------------------------------------------------------
def remove(self, crash_id):
Expand Down Expand Up @@ -608,7 +672,7 @@ class PrimaryDeferredStorage(CrashStorageBase):
def __init__(self, config, quit_check_callback=None):
"""instantiate the primary and deferred storage systems"""
super(PrimaryDeferredStorage, self).__init__(
config,
config,
quit_check_callback
)
self.primary_store = config.primary.storage_class(
Expand Down Expand Up @@ -704,15 +768,15 @@ def get_raw_dumps_as_files(self, crash_id):
return self.deferred_store.get_raw_dumps_as_files(crash_id)

#--------------------------------------------------------------------------
def get_processed(self, crash_id):
"""the default implementation of fetching a processed_crash
def get_unredacted_processed(self, crash_id):
"""fetch an unredacted processed_crash
parameters:
crash_id - the id of a processed_crash to fetch"""
try:
return self.primary_store.get_processed(crash_id)
return self.primary_store.get_unredacted_processed(crash_id)
except CrashIDNotFound:
return self.deferred_store.get_processed(crash_id)
return self.deferred_store.get_unredacted_processed(crash_id)

#--------------------------------------------------------------------------
def remove(self, crash_id):
Expand Down Expand Up @@ -753,7 +817,7 @@ class PrimaryDeferredProcessedStorage(PrimaryDeferredStorage):
#--------------------------------------------------------------------------
def __init__(self, config, quit_check_callback=None):
super(PrimaryDeferredProcessedStorage, self).__init__(
config,
config,
quit_check_callback
)
self.processed_store = config.processed.storage_class(
Expand All @@ -766,5 +830,7 @@ def save_processed(self, processed_crash):
self.processed_store.save_processed(processed_crash)

#--------------------------------------------------------------------------
def get_processed(self, crash_id):
return self.processed_store.get_processed(crash_id)
def get_unredacted_processed(self, crash_id):
"""fetch an unredacted processed crash from the underlying
storage implementation"""
return self.processed_store.get_unredacted_processed(crash_id)
33 changes: 1 addition & 32 deletions socorro/external/filesystem/crashstorage.py
Expand Up @@ -382,13 +382,6 @@ class FileSystemCrashStorage(FileSystemThrottledCrashStorage):
doc='the length of branches in the radix storage tree',
default=2
)
required_config.add_option(
'forbidden_keys',
doc='a comma delimited list of keys to not allowed in the processed '
'crash',
default='url, email, user_id, exploitability',
from_string_converter=lambda x: [i.strip() for i in x.split(',')]
)

#--------------------------------------------------------------------------
def __init__(self, config, quit_check_callback=None):
Expand Down Expand Up @@ -418,10 +411,6 @@ def save_processed(self, processed_crash):
except KeyError:
raise CrashIDNotFound("uuid missing from processed_crash")
try:
processed_crash = self.sanitize_processed_crash(
processed_crash,
self.config.forbidden_keys
)
self._stringify_dates_in_dict(processed_crash)
processed_crash_file_handle = \
self.pro_crash_store.newEntry(crash_id)
Expand All @@ -439,7 +428,7 @@ def save_processed(self, processed_crash):
raise

#--------------------------------------------------------------------------
def get_processed(self, crash_id):
def get_unredacted_processed(self, crash_id):
"""fetch a processed json file from the underlying file system"""
try:
return self.pro_crash_store.getDumpFromFile(crash_id)
Expand All @@ -463,26 +452,6 @@ def remove(self, crash_id):
self.logger.warning('processed crash not found for deletion: %s',
crash_id)

#--------------------------------------------------------------------------
@staticmethod
def sanitize_processed_crash(processed_crash, forbidden_keys):
"""returns a copy of a processed_crash with the forbidden keys removed.
parameters:
processed_crash - the processed crash in the form of a mapping
forbidden_keys - a list of strings to be removed from the
processed crash
returns:
a mapping that is a shallow copy of the original processed_crash
minus the forbidden keys and values"""

a_copy = processed_crash.copy()
for a_forbidden_key in forbidden_keys:
if a_forbidden_key in a_copy:
del a_copy[a_forbidden_key]
return a_copy

#--------------------------------------------------------------------------
@staticmethod
def _stringify_dates_in_dict(a_dict):
Expand Down
13 changes: 2 additions & 11 deletions socorro/external/fs/crashstorage.py
Expand Up @@ -89,13 +89,6 @@ class FSRadixTreeStorage(CrashStorageBase):
doc='the default dump field',
default='upload_file_minidump'
)
required_config.add_option(
'forbidden_keys',
doc='a comma delimited list of keys to not allowed in the processed '
'crash',
default='url, email, user_id, exploitability',
from_string_converter=lambda x: [i.strip() for i in x.split(',')]
)
required_config.add_option(
'name_branch_base',
doc='the directory base name to use for the named radix tree storage',
Expand Down Expand Up @@ -181,9 +174,6 @@ def save_processed(self, processed_crash):
crash_id = processed_crash['uuid']
processed_crash = processed_crash.copy()
f = StringIO()
for k in self.config.forbidden_keys:
if k in processed_crash:
del processed_crash[k]
with closing(gzip.GzipFile(mode='wb', fileobj=f)) as fz:
json.dump(processed_crash, fz, default=self.json_default)
self._save_files(crash_id, {
Expand Down Expand Up @@ -249,7 +239,8 @@ def read_with(fn):
for k, v
in self.get_raw_dumps_as_files(crash_id).iteritems())

def get_processed(self, crash_id):
def get_unredacted_processed(self, crash_id):
"""this method returns an unredacted processed crash"""
parent_dir = self._get_radixed_parent_directory(crash_id)
if not os.path.exists(parent_dir):
raise CrashIDNotFound
Expand Down
7 changes: 0 additions & 7 deletions socorro/external/hb/connection_context.py
Expand Up @@ -60,13 +60,6 @@ class HBaseConnectionContext(RequiredConfig):
doc='timeout in milliseconds for an HBase connection',
default=5000,
)
required_config.add_option(
'forbidden_keys',
default='email, url, user_id, exploitability',
doc='a comma delimited list of keys banned from the processed crash '
'in HBase',
from_string_converter=lambda s: [x.strip() for x in s.split(',')]
)
required_config.add_option(
'temporary_file_system_storage_path',
doc='a local filesystem path where dumps temporarily '
Expand Down
10 changes: 3 additions & 7 deletions socorro/external/hb/crashstorage.py
Expand Up @@ -255,10 +255,6 @@ def transaction(client, processed_crash=processed_crash):

crash_id = processed_crash['uuid']

for k in self.config.forbidden_keys:
if k in processed_crash:
del processed_crash[k]

self._stringify_dates_in_dict(processed_crash)

row_id = crash_id_to_row_id(crash_id)
Expand Down Expand Up @@ -426,9 +422,9 @@ def transaction(client):
return name_to_pathname_mapping
return transaction()

def get_processed(self, crash_id):
"""Return the cooked json (jsonz) for a given ooid as a string
If the ooid doesn't exist, return an empty string."""
def get_unredacted_processed(self, crash_id):
"""Return the unredacted processed json (jsonz) for a given ooid as a
Mapping. If not found, raise the NotFound exception."""
@self._wrap_in_transaction
def transaction(client):
row_id = crash_id_to_row_id(crash_id)
Expand Down
29 changes: 27 additions & 2 deletions socorro/external/hb/hbase_client.py
Expand Up @@ -12,6 +12,7 @@
import contextlib
import gzip
import sys
import json


_raises_exception = object()
Expand Down Expand Up @@ -92,9 +93,26 @@ def run(self):

class get_processed(_CommandRequiringCrashID):
"""Usage: get_processed CRASH_ID
Get the processed JSON for a crash"""
Get the redacted processed JSON for a crash"""
def run(self):
pprint.pprint(self.storage.get_processed(self.config.crash_id))
if self.config.json:
print json.dumps(self.storage.get_processed(self.config.crash_id))
else:
pprint.pprint(self.storage.get_processed(self.config.crash_id))


class get_unredacted_processed(_CommandRequiringCrashID):
    """Usage: get_unredacted_processed CRASH_ID
    Get the unredacted processed JSON for a crash"""
    def run(self):
        # read the output switch first, then fetch the crash exactly once
        want_json = self.config.json
        unredacted_crash = self.storage.get_unredacted_processed(
            self.config.crash_id
        )
        if not want_json:
            # default: human readable pretty printed mapping
            pprint.pprint(unredacted_crash)
            return
        # the json option was set: emit a single line JSON document
        print(json.dumps(unredacted_crash))


class get_report_processing_state(_CommandRequiringCrashID):
Expand Down Expand Up @@ -237,6 +255,13 @@ class HBaseClientApp(generic_app.App):
doc='command to use',
from_string_converter=lambda s: class_converter(__name__ + '.' + s)
)
required_config.add_option(
'json',
default=False,
short_form='j',
doc='json output instead of a pretty printed mapping',
)


def main(self):
self.storage = self.config.hbase_crash_storage_class(self.config)
Expand Down
7 changes: 0 additions & 7 deletions socorro/external/hbase/connection_context.py
Expand Up @@ -38,13 +38,6 @@ class HBaseSingleConnectionContext(RequiredConfig):
doc='timeout in milliseconds for an HBase connection',
default=5000,
)
required_config.add_option(
'forbidden_keys',
default='email, url, user_id, exploitability',
doc='a comma delimited list of keys banned from the processed crash '
'in HBase',
from_string_converter=lambda s: [x.strip() for x in s.split(',')]
)
required_config.add_option(
'temporary_file_system_storage_path',
doc='a local filesystem path where dumps temporarily '
Expand Down

0 comments on commit 7490cd5

Please sign in to comment.