From 96a40eb99fd70f26fb463abb33d3875ef34ac50b Mon Sep 17 00:00:00 2001
From: Mihnea Dobrescu-Balaur
Date: Wed, 26 Sep 2012 19:24:17 -0700
Subject: [PATCH 1/3] don't follow hyperlinks; process only homepages.

---
 spade/scraper/spiders/general_spider.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/spade/scraper/spiders/general_spider.py b/spade/scraper/spiders/general_spider.py
index e2ba4c5..eb38bf5 100644
--- a/spade/scraper/spiders/general_spider.py
+++ b/spade/scraper/spiders/general_spider.py
@@ -119,7 +119,7 @@ def parse(self, response):
                                 'timestamp': self.get_now_time()})
 
         # Continue crawling
-        # Parse stylesheet links, scripts, and hyperlinks
+        # Parse stylesheet links and scripts
         hxs = HtmlXPathSelector(response)
 
         # Extract other target links
@@ -133,13 +133,8 @@ def parse(self, response):
         except TypeError:
             js_links = []
 
-        try:
-            hyperlinks = hxs.select('//a/@href').extract()
-        except TypeError:
-            hyperlinks = []
-
         # Using a set removes duplicate links.
-        all_links = set(hyperlinks + js_links + css_links)
+        all_links = set(js_links + css_links)
 
         # Examine links, yield requests if they are valid
         for url in all_links:

From b50c6b3a5fdedca58e6e9487536077138f8bf6c2 Mon Sep 17 00:00:00 2001
From: Mihnea Dobrescu-Balaur
Date: Sat, 13 Oct 2012 16:42:03 +0300
Subject: [PATCH 2/3] Revert "don't follow hyperlinks; process only homepages."

This reverts commit 96a40eb99fd70f26fb463abb33d3875ef34ac50b.
---
 spade/scraper/spiders/general_spider.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/spade/scraper/spiders/general_spider.py b/spade/scraper/spiders/general_spider.py
index eb38bf5..e2ba4c5 100644
--- a/spade/scraper/spiders/general_spider.py
+++ b/spade/scraper/spiders/general_spider.py
@@ -119,7 +119,7 @@ def parse(self, response):
                                 'timestamp': self.get_now_time()})
 
         # Continue crawling
-        # Parse stylesheet links and scripts
+        # Parse stylesheet links, scripts, and hyperlinks
         hxs = HtmlXPathSelector(response)
 
         # Extract other target links
@@ -133,8 +133,13 @@ def parse(self, response):
         except TypeError:
             js_links = []
 
+        try:
+            hyperlinks = hxs.select('//a/@href').extract()
+        except TypeError:
+            hyperlinks = []
+
         # Using a set removes duplicate links.
-        all_links = set(js_links + css_links)
+        all_links = set(hyperlinks + js_links + css_links)
 
         # Examine links, yield requests if they are valid
         for url in all_links:

From 315e9c175d4fe6077f0cd310e6bf92430348907d Mon Sep 17 00:00:00 2001
From: Mihnea Dobrescu-Balaur
Date: Sat, 13 Oct 2012 17:24:19 +0300
Subject: [PATCH 3/3] strip non ascii chars

---
 spade/scraper/pipelines.py | 3 +++
 spade/utils/html_diff.py   | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/spade/scraper/pipelines.py b/spade/scraper/pipelines.py
index 6cf8b42..5561ebe 100644
--- a/spade/scraper/pipelines.py
+++ b/spade/scraper/pipelines.py
@@ -18,6 +18,9 @@ def __init__(self):
 
     def process_item(self, item, spider):
         """Called whenever an item is yielded by the spider"""
+        # strip non ascii chars
+        item['raw_content'] = ''.join(c for c in item['raw_content'] if ord(c) < 128)
+
         # hash the filename to prevent storing too-long file names
         hash_data = item['filename'] + item['user_agent'].ua_string
         filename = sha1(hash_data).hexdigest()
diff --git a/spade/utils/html_diff.py b/spade/utils/html_diff.py
index c59c63f..75d5255 100644
--- a/spade/utils/html_diff.py
+++ b/spade/utils/html_diff.py
@@ -21,6 +21,8 @@ def strip(self, html):
                           style=True,
                           embedded=True)
         h = html.read()
+        # strip non ascii chars
+        h = ''.join(c for c in h if ord(c) < 128)
         html.seek(0)  # hack to have the file re-readable for further checking
         return cleaner.clean_html(h)
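
For reference, a minimal standalone sketch of the filter that PATCH 3/3
applies in both pipelines.py and html_diff.py. The strip_non_ascii helper
name below is hypothetical (the patch inlines the expression at each call
site); the behavior shown is the generator-expression filter itself: every
character at or above code point 128 is dropped outright, so accented and
non-Latin text is discarded rather than transliterated, presumably to avoid
Unicode encoding errors further down the pipeline.

    # -*- coding: utf-8 -*-
    # Sketch of the non-ASCII filter added in PATCH 3/3. The helper name
    # strip_non_ascii is hypothetical; the patch inlines this expression.

    def strip_non_ascii(text):
        """Keep only characters below code point 128 (7-bit ASCII)."""
        return ''.join(c for c in text if ord(c) < 128)

    print(strip_non_ascii(u'caf\xe9 \u2192 cafe'))  # prints 'caf  cafe'

Note that the filter is lossy by design: 'café' becomes 'caf', not 'cafe',
so any non-ASCII page content is gone before hashing and diffing.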