Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

Bug 858245 - Added a scrubber app to use in the API. r=peterbe

  • Loading branch information...
commit 983cce83937ceff68f8e5780fae7b0a911daec7b 1 parent e176ec6
@AdrianGaudebert AdrianGaudebert authored
View
78 crashstats/scrubber/__init__.py
@@ -0,0 +1,78 @@
+# coding=utf-8
+
+import re
+
+
# Matches e-mail addresses; source: http://stackp.online.fr/?p=19
# Raw string: '\w'/'\-'/'\.' in a plain literal are invalid string escapes
# (SyntaxWarning on modern Python) even though they reach re unchanged.
EMAIL = re.compile(r'([\w\-\.]+@(\w[\w\-]+\.)+[\w\-]+)')
+
+
# Matches URLs; source: http://stackoverflow.com/questions/520031
# NOTE: the pasted original contained invisible zero-width characters
# (U+200B/U+200C) inside two character classes, and left "[]" unescaped in
# the final negated class (terminating it early); both are fixed below.
URL = re.compile(
    r"((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.]"
    r"[a-z]{2,4}/)(?:[^\s()<>]+|(([^\s()<>]+|(([^\s()<>]+)))*))+(?:(([^\s()<>]"
    r"+|(([^\s()<>]+)))*)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",
    re.DOTALL
)
+
+
def scrub_data(data, **kwargs):
    """Scrub every dictionary in *data* and return the results as a list.

    All keyword arguments are forwarded unchanged to `scrub_dict`; see that
    function for the available scrubbing options.
    """
    # Each entry is replaced by its scrubbed copy; the input list itself is
    # never mutated.
    return [scrub_dict(entry, **kwargs) for entry in data]
+
+
def scrub_dict(
    data,
    remove_fields=None,
    replace_fields=None,
    clean_fields=None
):
    """Return a scrubbed copy of a dictionary.

    Several options are available:
    * remove_fields
        * list or tuple of strings
        * remove those fields from the dictionary
        * example: remove_fields=['email', 'phone']
    * replace_fields
        * list or tuple of 2-uples
        * replace the value of those fields with some content
        * example: replace_fields=[('email', 'scrubbed email'), ('phone', '')]
    * clean_fields
        * list or tuple of 2-uples
        * search for patterns in those fields and remove what matches
        * example: clean_fields=[('comment', EMAIL), ('comment', URL)]

    Any number of those options can be used in the same call. If none is used,
    return the dictionary unchanged.
    """
    scrubbed = data.copy()

    # Drop the fields that must disappear entirely.
    for key in remove_fields or []:
        if key in scrubbed:
            del scrubbed[key]

    # Overwrite whole values.  A direct membership test replaces the
    # original nested loop that scanned every dict key for every pair
    # (O(keys * fields) for no benefit); a key removed above is skipped,
    # exactly as before.
    for key, replacement in replace_fields or []:
        if key in scrubbed:
            scrubbed[key] = replacement

    # Remove pattern matches inside the values that survived so far
    # (including values just replaced, matching the original ordering).
    for key, pattern in clean_fields or []:
        if key in scrubbed:
            scrubbed[key] = scrub_string(scrubbed[key], pattern)

    return scrubbed
+
+
def scrub_string(data, pattern):
    """Return a copy of *data* with everything matching *pattern* removed.

    The whole match (group 0) is what gets removed, so this works with any
    compiled pattern.  The previous ``findall()``-based version indexed
    ``i[0]`` on each result, which only worked when the pattern wrapped
    itself in an outer capture group (as EMAIL and URL do); a group-less
    pattern made ``i`` a string and ``i[0]`` its first character, silently
    corrupting the output instead of scrubbing it.
    """
    # finditer() iterates matches found in the original string; every
    # occurrence of each matched substring is then stripped, mirroring the
    # original replace()-based behaviour.
    for match in pattern.finditer(data):
        data = data.replace(match.group(), '')
    return data
View
0  crashstats/scrubber/tests/__init__.py
No changes.
View
84 crashstats/scrubber/tests/test_scrubber.py
@@ -0,0 +1,84 @@
+from nose.tools import eq_, ok_
+from django.test import TestCase
+
+from crashstats import scrubber
+
+
class TestScrubber(TestCase):
    """Exercise the scrubber helpers: string, dict and list-of-dicts APIs."""

    def test_scrub_string_email(self):
        scrubbed = scrubber.scrub_string(
            'this is my email me@example.org!',
            scrubber.EMAIL,
        )
        eq_(scrubbed, 'this is my email !')

    def test_scrub_string_url(self):
        # (input, expected-after-scrubbing) pairs.
        cases = (
            ('this is my Web site http://example.org/?param=12 !',
             'this is my Web site !'),
            ('link www.example.org/?param=12',
             'link '),
        )
        for original, expected in cases:
            eq_(scrubber.scrub_string(original, scrubber.URL), expected)

    def test_scrub_dict_remove_fields(self):
        original = {
            'email': 'me@example.org',
            'text': 'hello'
        }
        scrubbed = scrubber.scrub_dict(original, remove_fields=['email'])
        eq_(scrubbed, {'text': 'hello'})

    def test_scrub_dict_replace_fields(self):
        original = {
            'email': 'me@example.org',
            'text': 'hello'
        }
        scrubbed = scrubber.scrub_dict(
            original,
            replace_fields=[('email', 'scrubbed')]
        )
        eq_(scrubbed, {'email': 'scrubbed', 'text': 'hello'})

    def test_scrub_dict_clean_fields(self):
        original = {
            'email': 'me@example.org',
            'text': (
                'this is my email address me@example.org and my website '
                'http://www.example.org/ do you like it?'
            )
        }
        scrubbed = scrubber.scrub_dict(
            original,
            clean_fields=[('text', scrubber.EMAIL), ('text', scrubber.URL)]
        )
        # Keys survive; only the sensitive substrings inside 'text' go away.
        ok_('email' in scrubbed)
        ok_('text' in scrubbed)
        ok_('email address' in scrubbed['text'])
        ok_('me@example.org' not in scrubbed['text'])
        ok_('http://www.example.org/' not in scrubbed['text'])

    def test_scrub_data(self):
        records = [
            {
                'email': 'me@example.org',
                'text': 'look at my site www.example.org it is cool',
                'age': 25,
            },
            {
                'email': None,
                'url': 'http://mozilla.org',
                'age': 25,
            }
        ]
        # With no options, scrubbing is the identity.
        eq_(records, scrubber.scrub_data(records))

        scrubbed = scrubber.scrub_data(
            records,
            remove_fields=['age'],
            replace_fields=[('email', 'NO EMAIL'), ('url', 'NO URL')],
            clean_fields=[('text', scrubber.EMAIL), ('text', scrubber.URL)]
        )
        eq_(len(scrubbed), 2)
        eq_(scrubbed[0]['email'], 'NO EMAIL')
        eq_(scrubbed[1]['url'], 'NO URL')
        ok_('age' not in scrubbed[0])
        ok_('age' not in scrubbed[1])
        ok_('www.example.org' not in scrubbed[0]['text'])
Please sign in to comment.
Something went wrong with that request. Please try again.