Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

strip non ascii chars

  • Loading branch information...
commit 315e9c175d4fe6077f0cd310e6bf92430348907d 1 parent b50c6b3
@mihneadb mihneadb authored
Showing 2 changed files with 5 additions and 0 deletions.
  1. +3 −0  spade/scraper/pipelines.py
  2. +2 −0  spade/utils/html_diff.py
View
3  spade/scraper/pipelines.py
@@ -18,6 +18,9 @@ def __init__(self):
def process_item(self, item, spider):
"""Called whenever an item is yielded by the spider"""
+ # strip non ascii chars
+ item['raw_content'] = ''.join(c for c in item['raw_content'] if ord(c) < 128)
+
# hash the filename to prevent storing too-long file names
hash_data = item['filename'] + item['user_agent'].ua_string
filename = sha1(hash_data).hexdigest()
View
2  spade/utils/html_diff.py
@@ -21,6 +21,8 @@ def strip(self, html):
style=True, embedded=True)
h = html.read()
+ # strip non ascii chars
+ h = ''.join(c for c in h if ord(c) < 128)
html.seek(0) # hack to have the file re-readable for further checking
return cleaner.clean_html(h)
Please sign in to comment.
Something went wrong with that request. Please try again.