Skip to content
This repository has been archived by the owner on Feb 18, 2019. It is now read-only.

Commit

Permalink
Merge pull request #32 from mihneadb/master
Browse files Browse the repository at this point in the history
This strips the non ascii chars from all text that is processed. r=ctalbert
  • Loading branch information
ctalbert committed Oct 16, 2012
2 parents e762908 + 315e9c1 commit 44d4852
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 0 deletions.
3 changes: 3 additions & 0 deletions spade/scraper/pipelines.py
Expand Up @@ -18,6 +18,9 @@ def __init__(self):
def process_item(self, item, spider):
"""Called whenever an item is yielded by the spider"""


# strip non ascii chars
item['raw_content'] = ''.join(c for c in item['raw_content'] if ord(c) < 128)

# hash the filename to prevent storing too-long file names
hash_data = item['filename'] + item['user_agent'].ua_string
filename = sha1(hash_data).hexdigest()
Expand Down
2 changes: 2 additions & 0 deletions spade/utils/html_diff.py
Expand Up @@ -21,6 +21,8 @@ def strip(self, html):
style=True, embedded=True)


h = html.read()
# strip non ascii chars
h = ''.join(c for c in h if ord(c) < 128)
html.seek(0) # hack to have the file re-readable for further checking


return cleaner.clean_html(h)

0 comments on commit 44d4852

Please sign in to comment.