Skip to content
This repository
Browse code

Merge pull request #32 from mihneadb/master

This strips non-ASCII characters from all text that is processed. r=ctalbert
  • Loading branch information...
commit 44d48523027c4d60a5e00db57c8a36ef143ac032 2 parents e762908 + 315e9c1
ctalbert authored October 16, 2012
3  spade/scraper/pipelines.py
@@ -18,6 +18,9 @@ def __init__(self):
18 18
     def process_item(self, item, spider):
19 19
         """Called whenever an item is yielded by the spider"""
20 20
 
  21
+        # strip non ascii chars
  22
+        item['raw_content'] = ''.join(c for c in item['raw_content'] if ord(c) < 128)
  23
+
21 24
         # hash the filename to prevent storing too-long file names
22 25
         hash_data = item['filename'] + item['user_agent'].ua_string
23 26
         filename = sha1(hash_data).hexdigest()
2  spade/utils/html_diff.py
@@ -21,6 +21,8 @@ def strip(self, html):
21 21
             style=True, embedded=True)
22 22
 
23 23
         h = html.read()
  24
+        # strip non ascii chars
  25
+        h = ''.join(c for c in h if ord(c) < 128)
24 26
         html.seek(0)  # hack to have the file re-readable for further checking
25 27
 
26 28
         return cleaner.clean_html(h)

0 notes on commit 44d4852

Please sign in to comment.
Something went wrong with that request. Please try again.