Skip to content

Commit

Permalink
fixes bug 1382727 - remove bad keys from ES documents (#3912)
Browse files Browse the repository at this point in the history
* fixes bug 1382727 - remove bad keys from ES documents

This adds a pass to remove bad keys from the raw_crash part of the document
before indexing it into Elasticsearch.

Good keys contain one or more ascii alpha-numeric characters plus underscore and
hyphen. Everything else is invalid and gets removed.

* Fix names and add comments
  • Loading branch information
willkg committed Aug 11, 2017
1 parent d8080a5 commit c3c8395
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 2 deletions.
47 changes: 45 additions & 2 deletions socorro/external/es/crashstorage.py
Expand Up @@ -37,6 +37,22 @@ def __init__(self, config):
]


# Valid Elasticsearch keys contain one or more ascii alphanumeric characters, underscore, and hyphen
# and that's it.
VALID_KEY = re.compile(r'^[a-zA-Z0-9_-]+$')


def is_valid_key(key):
"""Validates an Elasticsearch document key
:arg string key: the key to validate
:returns: True if it's valid and False if not
"""
return bool(VALID_KEY.match(key))


class ESCrashStorage(CrashStorageBase):
"""This sends raw and processed crash reports to Elasticsearch."""

Expand Down Expand Up @@ -168,13 +184,41 @@ def reconstitute_datetimes(processed_crash):
# not there? we don't care
pass

@staticmethod
def remove_bad_keys(data):
"""Removes keys from the top-level of the dict that are bad
Good keys satisfy the following properties:
* have one or more characters
* are composed of a-zA-Z0-9_-
Anything else is a bad key and needs to be removed.
This modifies the data dict in-place and only looks at the top level.
:arg dict data: the data to remove bad keys from
"""
if not data:
return

# Copy the list of things we're iterating over because we're mutating
# the dict in place.
for key in list(data.keys()):
if not is_valid_key(key):
del data[key]

def _submit_crash_to_elasticsearch(self, connection, crash_document):
"""Submit a crash report to elasticsearch.
"""
# Massage the crash such that the date_processed field is formatted
# in the fashion of our established mapping.
self.reconstitute_datetimes(crash_document['processed_crash'])

# Remove bad keys from the raw crash.
self.remove_bad_keys(crash_document['raw_crash'])

# Obtain the index name.
es_index = self.get_index_for_crash(
crash_document['processed_crash']['date_processed']
Expand All @@ -190,8 +234,7 @@ def _submit_crash_to_elasticsearch(self, connection, crash_document):
# Submit the crash for indexing.
# Don't retry more than 5 times. That is to avoid infinite loops in
# case of an unhandled exception.
times = range(5)
while times.pop(-1):
for attempt in range(5):
try:
connection.index(
index=es_index,
Expand Down
52 changes: 52 additions & 0 deletions socorro/unittest/external/es/test_crashstorage.py
Expand Up @@ -10,9 +10,11 @@
from copy import deepcopy

from configman.dotdict import DotDict
import pytest

from socorro.external.crashstorage_base import Redactor
from socorro.external.es.crashstorage import (
is_valid_key,
ESCrashStorage,
ESCrashStorageRedactedSave,
ESCrashStorageRedactedJsonDump,
Expand Down Expand Up @@ -165,6 +167,29 @@ def test_redact_raw_crash(self):
eq_(crash, expected_crash)


class TestIsValidKey(object):
@pytest.mark.parametrize('key', [
'a',
'abc',
'ABC',
'AbcDef',
'Abc_Def',
'Abc-def'
'Abc-123-def'
])
def test_valid_key(self, key):
assert is_valid_key(key) is True

@pytest.mark.parametrize('key', [
'',
'.',
'abc def',
u'na\xefve',
])
def test_invalid_key(self, key):
assert is_valid_key(key) is False


class IntegrationTestESCrashStorage(ElasticsearchTestCase):
"""These tests interact with Elasticsearch (or some other external
resource).
Expand Down Expand Up @@ -235,6 +260,33 @@ def test_index_crash(self):
)
es_storage.close()

def test_index_crash_with_bad_keys(self):
a_raw_crash_with_bad_keys = {
'foo': 'alpha',
'': 'bad key 1',
'.': 'bad key 2',
u'na\xefve': 'bad key 3',
}

es_storage = ESCrashStorage(config=self.config)

es_storage.save_raw_and_processed(
raw_crash=a_raw_crash_with_bad_keys,
dumps=None,
processed_crash=a_processed_crash,
crash_id=a_processed_crash['uuid']
)

# Ensure that the document was indexed by attempting to retreive it.
doc = self.es_client.get(
index=self.config.elasticsearch.elasticsearch_index,
id=a_processed_crash['uuid']
)
# Make sure the invalid keys aren't in the crash.
raw_crash = doc['_source']['raw_crash']
assert raw_crash == {'foo': 'alpha'}
es_storage.close()

def test_bulk_index_crash(self):
"""Test indexing a crash document.
"""
Expand Down

0 comments on commit c3c8395

Please sign in to comment.