This repository has been archived by the owner on Feb 1, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #328 from AdrianGaudebert/scrubber
Bug 858245 - Added a scrubber app to use in the API.
- Loading branch information
Showing 3 changed files with 162 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# coding=utf-8 | ||
|
||
import re | ||
|
||
|
||
# Matches e-mail addresses. Group 1 spans the whole address; group 2 is only
# the last dotted domain label and is an artifact of the repetition.
# source: http://stackp.online.fr/?p=19
EMAIL = re.compile(r'([\w\-\.]+@(\w[\w\-]+\.)+[\w\-]+)')


# Matches URLs (with a scheme, a leading "www." or a bare "domain.tld/").
# Group 1 spans the whole URL. Balanced parentheses inside the URL are kept,
# and trailing punctuation such as "." or "," is left out of the match.
# Every fragment is a raw string so that regex escapes (\s, \w, \(, \[ ...)
# reach the regex engine untouched.
# source: http://stackoverflow.com/questions/520031
URL = re.compile(
    r"((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.]"
    r"[a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+"
    r"(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)"
    r"|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))",
    re.DOTALL
)
|
||
|
||
def scrub_data(data, **kwargs):
    """Return a new list holding a scrubbed copy of every dictionary in `data`.

    All keyword arguments are forwarded untouched to `scrub_dict`; see that
    function for the supported options.
    """
    return [scrub_dict(entry, **kwargs) for entry in data]
|
||
|
||
def scrub_dict(
    data,
    remove_fields=None,
    replace_fields=None,
    clean_fields=None
):
    """Return a scrubbed (shallow) copy of a dictionary.

    The input dictionary is never modified.

    Several options are available:

    * remove_fields
        * list or tuple of strings
        * remove those fields from the dictionary
        * example: remove_fields=['email', 'phone']
    * replace_fields
        * list or tuple of 2-tuples
        * replace the value of those fields with some content
        * example: replace_fields=[('email', 'scrubbed email'), ('phone', '')]
    * clean_fields
        * list or tuple of 2-tuples
        * search for patterns in those fields and remove what matches
        * example: clean_fields=[('comment', EMAIL), ('comment', URL)]

    Any number of those options can be used in the same call. They are
    applied in the order: remove, replace, clean. Fields that are not present
    in the dictionary are ignored, so a removed field is never re-created by
    a later option. If no option is used, an unchanged copy is returned.
    """
    scrubbed = data.copy()

    for key in remove_fields or ():
        # pop with a default ignores fields that are not present.
        scrubbed.pop(key, None)

    # Look each configured field up directly rather than scanning every key
    # of the dictionary against every configured field.
    for field in replace_fields or ():
        key = field[0]
        if key in scrubbed:
            scrubbed[key] = field[1]

    for field in clean_fields or ():
        key = field[0]
        if key in scrubbed:
            scrubbed[key] = scrub_string(scrubbed[key], field[1])

    return scrubbed
|
||
|
||
def scrub_string(data, pattern): | ||
"""Return a copy of a string where everything that matches the pattern is | ||
removed. | ||
""" | ||
for i in pattern.findall(data): | ||
data = data.replace(i[0], '') | ||
return data |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
from nose.tools import eq_, ok_ | ||
from django.test import TestCase | ||
|
||
from crashstats import scrubber | ||
|
||
|
||
class TestScrubber(TestCase):
    """Tests for the `crashstats.scrubber` helpers."""

    def test_scrub_string_email(self):
        """An e-mail address is removed from the surrounding text."""
        data = 'this is my email me@example.org!'
        res = scrubber.scrub_string(data, scrubber.EMAIL)
        eq_(res, 'this is my email !')

    def test_scrub_string_url(self):
        """URLs with a scheme or a leading "www." are removed."""
        data = 'this is my Web site http://example.org/?param=12 !'
        res = scrubber.scrub_string(data, scrubber.URL)
        # Only the URL is removed: the spaces that surrounded it both stay.
        eq_(res, 'this is my Web site  !')

        data = 'link www.example.org/?param=12'
        res = scrubber.scrub_string(data, scrubber.URL)
        eq_(res, 'link ')

    def test_scrub_dict_remove_fields(self):
        """Fields listed in `remove_fields` are dropped from the copy."""
        data = {
            'email': 'me@example.org',
            'text': 'hello'
        }
        res = scrubber.scrub_dict(data, remove_fields=['email'])
        eq_(res, {'text': 'hello'})

    def test_scrub_dict_replace_fields(self):
        """Fields listed in `replace_fields` get the given replacement."""
        data = {
            'email': 'me@example.org',
            'text': 'hello'
        }
        res = scrubber.scrub_dict(data, replace_fields=[('email', 'scrubbed')])
        eq_(res, {'email': 'scrubbed', 'text': 'hello'})

    def test_scrub_dict_clean_fields(self):
        """Patterns in `clean_fields` are stripped from the named field only."""
        data = {
            'email': 'me@example.org',
            'text': (
                'this is my email address me@example.org and my website '
                'http://www.example.org/ do you like it?'
            )
        }
        res = scrubber.scrub_dict(
            data,
            clean_fields=[('text', scrubber.EMAIL), ('text', scrubber.URL)]
        )
        ok_('email' in res)
        ok_('text' in res)
        ok_('email address' in res['text'])
        ok_('me@example.org' not in res['text'])
        ok_('http://www.example.org/' not in res['text'])

    def test_scrub_data(self):
        """`scrub_data` applies the same options to every dictionary."""
        data = [
            {
                'email': 'me@example.org',
                'text': 'look at my site www.example.org it is cool',
                'age': 25,
            },
            {
                'email': None,
                'url': 'http://mozilla.org',
                'age': 25,
            }
        ]
        # Without any option the result equals the input.
        res = scrubber.scrub_data(data)
        eq_(data, res)

        res = scrubber.scrub_data(
            data,
            remove_fields=['age'],
            replace_fields=[('email', 'NO EMAIL'), ('url', 'NO URL')],
            clean_fields=[('text', scrubber.EMAIL), ('text', scrubber.URL)]
        )
        eq_(len(res), 2)
        eq_(res[0]['email'], 'NO EMAIL')
        eq_(res[1]['url'], 'NO URL')
        ok_('age' not in res[0])
        ok_('age' not in res[1])
        ok_('www.example.org' not in res[0]['text'])