Permalink
Browse files

Merge pull request #32 from mihneadb/master

This strips non-ASCII characters from all text that is processed. r=ctalbert
  • Loading branch information...
2 parents e762908 + 315e9c1 commit 44d48523027c4d60a5e00db57c8a36ef143ac032 ctalbert committed Oct 16, 2012
Showing with 5 additions and 0 deletions.
  1. +3 −0 spade/scraper/pipelines.py
  2. +2 −0 spade/utils/html_diff.py
@@ -18,6 +18,9 @@ def __init__(self):
def process_item(self, item, spider):
"""Called whenever an item is yielded by the spider"""
+ # strip non ascii chars
+ item['raw_content'] = ''.join(c for c in item['raw_content'] if ord(c) < 128)
+
# hash the filename to prevent storing too-long file names
hash_data = item['filename'] + item['user_agent'].ua_string
filename = sha1(hash_data).hexdigest()
View
@@ -21,6 +21,8 @@ def strip(self, html):
style=True, embedded=True)
h = html.read()
+ # strip non ascii chars
+ h = ''.join(c for c in h if ord(c) < 128)
html.seek(0) # hack to have the file re-readable for further checking
return cleaner.clean_html(h)

0 comments on commit 44d4852

Please sign in to comment.