Skip to content
This repository has been archived by the owner on Feb 1, 2018. It is now read-only.

Commit

Permalink
Bug 858245 - Added a scrubber app to use in the API. r=peterbe
Browse files Browse the repository at this point in the history
  • Loading branch information
adngdb committed Apr 26, 2013
1 parent e176ec6 commit 983cce8
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 0 deletions.
78 changes: 78 additions & 0 deletions crashstats/scrubber/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# coding=utf-8

import re


# source: http://stackp.online.fr/?p=19
# The outermost group (group 1) wraps the entire pattern, so it always holds
# the full matched address. Written as a raw string so that the regex escapes
# (\w, \-, \.) are not interpreted as string-literal escapes.
EMAIL = re.compile(r'([\w\-\.]+@(\w[\w\-]+\.)+[\w\-]+)')


# source: http://stackoverflow.com/questions/520031
# The outermost group (group 1) wraps the entire pattern, so it always holds
# the full matched URL.
# Every fragment is a raw string so that regex escapes such as \s are not
# interpreted as string-literal escapes, and the invisible zero-width
# characters that had been pasted into the pattern along with the snippet
# have been removed.
URL = re.compile(
    r"((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.]"
    r"[a-z]{2,4}/)(?:[^\s()<>]+|(([^\s()<>]+|(([^\s()<>]+)))*))+(?:(([^\s()<>]"
    r"+|(([^\s()<>]+)))*)|[^\s`!()[]{};:'\".,<>?«»“”‘’]))",
    re.DOTALL
)


def scrub_data(data, **kwargs):
    """Scrub every dictionary of an iterable and return the results as a list.

    The input is left untouched: each item is passed to `scrub_dict`
    together with the keyword arguments received here, and the scrubbed
    copies are collected in a new list.
    See `scrub_dict` for the accepted keyword arguments.
    """
    return [scrub_dict(entry, **kwargs) for entry in data]


def scrub_dict(
    data,
    remove_fields=None,
    replace_fields=None,
    clean_fields=None
):
    """Return a scrubbed copy of a dictionary.

    The input dictionary is never modified; a shallow copy is scrubbed and
    returned.

    Several options are available:
        * remove_fields
            * list or tuple of strings
            * remove those fields from the dictionary
            * example: remove_fields=['email', 'phone']
        * replace_fields
            * list or tuple of 2-uples
            * replace the value of those fields with some content
            * example: replace_fields=[('email', 'scrubbed email'),
              ('phone', '')]
        * clean_fields
            * list or tuple of 2-uples
            * search for patterns in those fields and remove what matches
            * example: clean_fields=[('comment', EMAIL), ('comment', URL)]

    Fields named in an option but absent from the dictionary are ignored;
    they are never added to the result. Removal is applied first, then
    replacement, then cleaning, so a field that was just replaced is cleaned
    using its new value. Fields whose value is None are left as they are by
    `clean_fields` since there is no text to search.

    Any number of those options can be used in the same call. If none is
    used, return an unchanged copy of the dictionary.
    """
    scrubbed = data.copy()

    for key in remove_fields or []:
        scrubbed.pop(key, None)

    # Look each rule's field up directly rather than scanning every key of
    # the dictionary for every rule.
    for field in replace_fields or []:
        if field[0] in scrubbed:
            scrubbed[field[0]] = field[1]

    for field in clean_fields or []:
        key = field[0]
        # A missing field and a None value are both skipped: neither holds
        # text that could be searched for the pattern.
        if scrubbed.get(key) is not None:
            scrubbed[key] = scrub_string(scrubbed[key], field[1])

    return scrubbed


def scrub_string(data, pattern):
    """Return a copy of a string where everything that matches the pattern is
    removed.

    * data
        * the string to scrub
    * pattern
        * a compiled regular expression; every non-overlapping match is
          replaced with an empty string
    """
    # sub() removes each whole match regardless of how many capturing groups
    # the pattern defines, so it also works for patterns with one group or
    # none at all.
    return pattern.sub('', data)
Empty file.
84 changes: 84 additions & 0 deletions crashstats/scrubber/tests/test_scrubber.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from nose.tools import eq_, ok_
from django.test import TestCase

from crashstats import scrubber


class TestScrubber(TestCase):
    """Tests for the helpers of the `crashstats.scrubber` package."""

    def test_scrub_string_email(self):
        """An email address is removed from the middle of a sentence."""
        data = 'this is my email me@example.org!'
        res = scrubber.scrub_string(data, scrubber.EMAIL)
        eq_(res, 'this is my email !')

    def test_scrub_string_url(self):
        """URLs with a scheme or a leading `www.` are removed."""
        data = 'this is my Web site http://example.org/?param=12 !'
        res = scrubber.scrub_string(data, scrubber.URL)
        # Only the URL itself is removed, so the spaces that surrounded it
        # are both still there.
        eq_(res, 'this is my Web site  !')

        data = 'link www.example.org/?param=12'
        res = scrubber.scrub_string(data, scrubber.URL)
        eq_(res, 'link ')

    def test_scrub_dict_remove_fields(self):
        """Fields listed in `remove_fields` are dropped from the result."""
        data = {
            'email': 'me@example.org',
            'text': 'hello'
        }
        res = scrubber.scrub_dict(data, remove_fields=['email'])
        eq_(res, {'text': 'hello'})

    def test_scrub_dict_replace_fields(self):
        """Fields listed in `replace_fields` get the given new value."""
        data = {
            'email': 'me@example.org',
            'text': 'hello'
        }
        res = scrubber.scrub_dict(data, replace_fields=[('email', 'scrubbed')])
        eq_(res, {'email': 'scrubbed', 'text': 'hello'})

    def test_scrub_dict_clean_fields(self):
        """Several patterns can be cleaned out of the same field."""
        data = {
            'email': 'me@example.org',
            'text': (
                'this is my email address me@example.org and my website '
                'http://www.example.org/ do you like it?'
            )
        }
        res = scrubber.scrub_dict(
            data,
            clean_fields=[('text', scrubber.EMAIL), ('text', scrubber.URL)]
        )
        ok_('email' in res)
        ok_('text' in res)
        ok_('email address' in res['text'])
        ok_('me@example.org' not in res['text'])
        ok_('http://www.example.org/' not in res['text'])

    def test_scrub_data(self):
        """Every dictionary of a list is scrubbed with the same options."""
        data = [
            {
                'email': 'me@example.org',
                'text': 'look at my site www.example.org it is cool',
                'age': 25,
            },
            {
                'email': None,
                'url': 'http://mozilla.org',
                'age': 25,
            }
        ]
        # Without any option the data comes back unchanged.
        res = scrubber.scrub_data(data)
        eq_(data, res)

        res = scrubber.scrub_data(
            data,
            remove_fields=['age'],
            replace_fields=[('email', 'NO EMAIL'), ('url', 'NO URL')],
            clean_fields=[('text', scrubber.EMAIL), ('text', scrubber.URL)]
        )
        eq_(len(res), 2)
        eq_(res[0]['email'], 'NO EMAIL')
        eq_(res[1]['url'], 'NO URL')
        ok_('age' not in res[0])
        ok_('age' not in res[1])
        ok_('www.example.org' not in res[0]['text'])

0 comments on commit 983cce8

Please sign in to comment.