Merge pull request #328 from AdrianGaudebert/scrubber

Bug 858245 - Added a scrubber app to use in the API.
mozilla · Apr 26, 2013 · 9cbc154 · 9cbc154
2 parents 0ef7ea0 + 983cce8
commit 9cbc154
Show file tree

Hide file tree

Showing 3 changed files with 162 additions and 0 deletions.
diff --git a/crashstats/scrubber/__init__.py b/crashstats/scrubber/__init__.py
@@ -0,0 +1,78 @@
+# coding=utf-8
+
+import re
+
+
+# source: http://stackp.online.fr/?p=19
+# Matches an e-mail address; group 1 is the full address, group 2 the last
+# dot-terminated label of the domain.
+# Written as a raw string so the regex escapes (\w, \-, \.) reach the `re`
+# module verbatim instead of relying on Python passing unknown string
+# escapes through.
+EMAIL = re.compile(r'([\w\-\.]+@(\w[\w\-]+\.)+[\w\-]+)')
+
+
+# source: http://stackoverflow.com/questions/520031
+# Matches URL-like text introduced by a scheme ("xxx:/"), by "www." or by a
+# "domain.tld/" prefix. No IGNORECASE flag is set, so those prefixes are only
+# recognised in lower case. The outermost parentheses wrap the whole pattern,
+# so group 1 of a match is the complete matched text.
+# NOTE(review): only the first fragment is a raw string; the two following
+# fragments rely on Python leaving unrecognised escapes such as \s untouched.
+# NOTE(review): the pattern looks like it lost backslashes when it was copied
+# (the last character class closes at the first "]", leaving the remaining
+# punctuation outside of it) and it appears to contain invisible zero-width
+# characters -- compare against the original answer before editing.
+URL = re.compile(
+    r"((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.‌]"
+    "[a-z]{2,4}/)(?:[^\s()<>]+|(([^\s()<>]+|(([^\s()<>]+)))*))+(?:(([^\s()<>]"
+    "+|(‌([^\s()<>]+)))*)|[^\s`!()[]{};:'\".,<>?«»“”‘’]))",
+    re.DOTALL
+)
+
+
+def scrub_data(data, **kwargs):
+    """Scrub each dictionary of an iterable and return the results in a list.
+
+    The result is always a new list, built in the iteration order of `data`.
+    Every keyword argument is forwarded untouched to `scrub_dict`; refer to
+    that function for their meaning.
+    """
+    return [scrub_dict(entry, **kwargs) for entry in data]
+
+
+def scrub_dict(
+    data,
+    remove_fields=None,
+    replace_fields=None,
+    clean_fields=None
+):
+    """Return a scrubbed copy of a dictionary.
+
+    The input dictionary is never modified: a shallow copy is made, scrubbed
+    and returned. Fields named by an option but absent from the dictionary
+    are silently ignored.
+
+    Several options are available:
+    * remove_fields
+        * list or tuple of strings
+        * remove those fields from the dictionary
+        * example: remove_fields=['email', 'phone']
+    * replace_fields
+        * list or tuple of 2-uples
+        * replace the value of those fields with some content
+        * example: replace_fields=[('email', 'scrubbed email'), ('phone', '')]
+    * clean_fields
+        * list or tuple of 2-uples
+        * search for patterns in those fields and remove what matches
+        * example: clean_fields=[('comment', EMAIL), ('comment', URL)]
+
+    Any number of those options can be used in the same call. For a given
+    field, removal is applied first, then replacement, then cleaning. If no
+    option is used, an unchanged copy of the dictionary is returned.
+    """
+    scrubbed = data.copy()
+
+    for key in remove_fields or ():
+        scrubbed.pop(key, None)
+
+    # Look every rule's field up directly rather than comparing each key of
+    # the dictionary against each rule.
+    for key, value in replace_fields or ():
+        if key in scrubbed:
+            scrubbed[key] = value
+
+    # Cleaning runs after replacement, so a field listed in both options has
+    # its replacement value cleaned.
+    for key, pattern in clean_fields or ():
+        if key in scrubbed:
+            scrubbed[key] = scrub_string(scrubbed[key], pattern)
+
+    return scrubbed
+
+
+def scrub_string(data, pattern):
+    """Return a copy of a string where everything that matches the pattern is
+    removed.
+
+    `data` is the text to clean and `pattern` a compiled regular expression.
+    Every non-overlapping match is replaced by the empty string; when nothing
+    matches, the text is returned as is.
+    """
+    # `sub` always works on the whole match. Indexing the items returned by
+    # `findall` only does so when the pattern has capturing groups: without
+    # any, the items are plain strings and `item[0]` is a single character.
+    return pattern.sub('', data)
diff --git a/crashstats/scrubber/tests/__init__.py b/crashstats/scrubber/tests/__init__.py
diff --git a/crashstats/scrubber/tests/test_scrubber.py b/crashstats/scrubber/tests/test_scrubber.py
@@ -0,0 +1,84 @@
+from nose.tools import eq_, ok_
+from django.test import TestCase
+
+from crashstats import scrubber
+
+
+class TestScrubber(TestCase):
+    # Covers the scrubber helpers: scrub_string, scrub_dict and scrub_data.
+
+    def test_scrub_string_email(self):
+        # The address disappears, the surrounding text stays.
+        text = 'this is my email me@example.org!'
+        eq_(scrubber.scrub_string(text, scrubber.EMAIL), 'this is my email !')
+
+    def test_scrub_string_url(self):
+        # URL introduced by a scheme.
+        with_scheme = 'this is my Web site http://example.org/?param=12 !'
+        cleaned = scrubber.scrub_string(with_scheme, scrubber.URL)
+        eq_(cleaned, 'this is my Web site  !')
+
+        # URL introduced by "www." only.
+        without_scheme = 'link www.example.org/?param=12'
+        cleaned = scrubber.scrub_string(without_scheme, scrubber.URL)
+        eq_(cleaned, 'link ')
+
+    def test_scrub_dict_remove_fields(self):
+        record = {'email': 'me@example.org', 'text': 'hello'}
+        scrubbed = scrubber.scrub_dict(record, remove_fields=['email'])
+        eq_(scrubbed, {'text': 'hello'})
+
+    def test_scrub_dict_replace_fields(self):
+        record = {'email': 'me@example.org', 'text': 'hello'}
+        rules = [('email', 'scrubbed')]
+        scrubbed = scrubber.scrub_dict(record, replace_fields=rules)
+        eq_(scrubbed, {'email': 'scrubbed', 'text': 'hello'})
+
+    def test_scrub_dict_clean_fields(self):
+        record = {
+            'email': 'me@example.org',
+            'text': (
+                'this is my email address me@example.org and my website '
+                'http://www.example.org/ do you like it?'
+            ),
+        }
+        rules = [('text', scrubber.EMAIL), ('text', scrubber.URL)]
+        scrubbed = scrubber.scrub_dict(record, clean_fields=rules)
+
+        # Cleaning never drops a field.
+        for field in ('email', 'text'):
+            ok_(field in scrubbed)
+
+        cleaned_text = scrubbed['text']
+        ok_('email address' in cleaned_text)
+        for secret in ('me@example.org', 'http://www.example.org/'):
+            ok_(secret not in cleaned_text)
+
+    def test_scrub_data(self):
+        records = [
+            {
+                'email': 'me@example.org',
+                'text': 'look at my site www.example.org it is cool',
+                'age': 25,
+            },
+            {
+                'email': None,
+                'url': 'http://mozilla.org',
+                'age': 25,
+            },
+        ]
+
+        # Without any option, the data comes back equal to the input.
+        eq_(records, scrubber.scrub_data(records))
+
+        options = {
+            'remove_fields': ['age'],
+            'replace_fields': [('email', 'NO EMAIL'), ('url', 'NO URL')],
+            'clean_fields': [('text', scrubber.EMAIL), ('text', scrubber.URL)],
+        }
+        scrubbed = scrubber.scrub_data(records, **options)
+        eq_(len(scrubbed), 2)
+        first, second = scrubbed[0], scrubbed[1]
+        eq_(first['email'], 'NO EMAIL')
+        eq_(second['url'], 'NO URL')
+        ok_('age' not in first)
+        ok_('age' not in second)
+        ok_('www.example.org' not in first['text'])