Merge pull request #32 from mihneadb/master

This strips non-ASCII characters from all text that is processed. r=ctalbert
mozilla · Oct 16, 2012 · 44d4852 · 44d4852
2 parents e762908 + 315e9c1
commit 44d4852
Show file tree

Hide file tree

Showing 2 changed files with 5 additions and 0 deletions.
diff --git a/spade/scraper/pipelines.py b/spade/scraper/pipelines.py
@@ -18,6 +18,9 @@ def __init__(self):
     def process_item(self, item, spider):
         """Called whenever an item is yielded by the spider"""
 
+        # strip non ascii chars
+        item['raw_content'] = ''.join(c for c in item['raw_content'] if ord(c) < 128)
+
         # hash the filename to prevent storing too-long file names
         hash_data = item['filename'] + item['user_agent'].ua_string
         filename = sha1(hash_data).hexdigest()

diff --git a/spade/utils/html_diff.py b/spade/utils/html_diff.py
@@ -21,6 +21,8 @@ def strip(self, html):
             style=True, embedded=True)
 
         h = html.read()
+        # strip non ascii chars
+        h = ''.join(c for c in h if ord(c) < 128)
         html.seek(0)  # hack to have the file re-readable for further checking
 
         return cleaner.clean_html(h)