From 96a40eb99fd70f26fb463abb33d3875ef34ac50b Mon Sep 17 00:00:00 2001
From: Mihnea Dobrescu-Balaur
Date: Wed, 26 Sep 2012 19:24:17 -0700
Subject: [PATCH 1/3] don't follow hyperlinks; process only homepages.

---
 spade/scraper/spiders/general_spider.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/spade/scraper/spiders/general_spider.py b/spade/scraper/spiders/general_spider.py
index e2ba4c5..eb38bf5 100644
--- a/spade/scraper/spiders/general_spider.py
+++ b/spade/scraper/spiders/general_spider.py
@@ -119,7 +119,7 @@ def parse(self, response):
                                 'timestamp': self.get_now_time()})
 
         # Continue crawling
-        # Parse stylesheet links, scripts, and hyperlinks
+        # Parse stylesheet links and scripts
         hxs = HtmlXPathSelector(response)
 
         # Extract other target links
@@ -133,13 +133,8 @@ def parse(self, response):
         except TypeError:
             js_links = []
 
-        try:
-            hyperlinks = hxs.select('//a/@href').extract()
-        except TypeError:
-            hyperlinks = []
-
         # Using a set removes duplicate links.
-        all_links = set(hyperlinks + js_links + css_links)
+        all_links = set(js_links + css_links)
 
         # Examine links, yield requests if they are valid
         for url in all_links:

From b50c6b3a5fdedca58e6e9487536077138f8bf6c2 Mon Sep 17 00:00:00 2001
From: Mihnea Dobrescu-Balaur
Date: Sat, 13 Oct 2012 16:42:03 +0300
Subject: [PATCH 2/3] Revert "don't follow hyperlinks; process only homepages."

This reverts commit 96a40eb99fd70f26fb463abb33d3875ef34ac50b.
---
 spade/scraper/spiders/general_spider.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/spade/scraper/spiders/general_spider.py b/spade/scraper/spiders/general_spider.py
index eb38bf5..e2ba4c5 100644
--- a/spade/scraper/spiders/general_spider.py
+++ b/spade/scraper/spiders/general_spider.py
@@ -119,7 +119,7 @@ def parse(self, response):
                                 'timestamp': self.get_now_time()})
 
         # Continue crawling
-        # Parse stylesheet links and scripts
+        # Parse stylesheet links, scripts, and hyperlinks
         hxs = HtmlXPathSelector(response)
 
         # Extract other target links
@@ -133,8 +133,13 @@ def parse(self, response):
         except TypeError:
             js_links = []
 
+        try:
+            hyperlinks = hxs.select('//a/@href').extract()
+        except TypeError:
+            hyperlinks = []
+
         # Using a set removes duplicate links.
-        all_links = set(js_links + css_links)
+        all_links = set(hyperlinks + js_links + css_links)
 
         # Examine links, yield requests if they are valid
         for url in all_links:

From 315e9c175d4fe6077f0cd310e6bf92430348907d Mon Sep 17 00:00:00 2001
From: Mihnea Dobrescu-Balaur
Date: Sat, 13 Oct 2012 17:24:19 +0300
Subject: [PATCH 3/3] strip non ascii chars

---
 spade/scraper/pipelines.py | 3 +++
 spade/utils/html_diff.py   | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/spade/scraper/pipelines.py b/spade/scraper/pipelines.py
index 6cf8b42..5561ebe 100644
--- a/spade/scraper/pipelines.py
+++ b/spade/scraper/pipelines.py
@@ -18,6 +18,9 @@ def __init__(self):
 
     def process_item(self, item, spider):
         """Called whenever an item is yielded by the spider"""
+        # strip non ascii chars
+        item['raw_content'] = ''.join(c for c in item['raw_content'] if ord(c) < 128)
+
         # hash the filename to prevent storing too-long file names
         hash_data = item['filename'] + item['user_agent'].ua_string
         filename = sha1(hash_data).hexdigest()
diff --git a/spade/utils/html_diff.py b/spade/utils/html_diff.py
index c59c63f..75d5255 100644
--- a/spade/utils/html_diff.py
+++ b/spade/utils/html_diff.py
@@ -21,6 +21,8 @@ def strip(self, html):
                           style=True,
                           embedded=True)
         h = html.read()
+        # strip non ascii chars
+        h = ''.join(c for c in h if ord(c) < 128)
         html.seek(0)  # hack to have the file re-readable for further checking
         return cleaner.clean_html(h)
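
For reference, a minimal standalone sketch of the filter that PATCH 3/3
applies in both pipelines.py and html_diff.py. The strip_non_ascii helper
name below is hypothetical (the patch inlines the expression at each call
site); the behavior shown is the generator-expression filter itself: every
character at or above code point 128 is dropped outright, so accented and
non-Latin text is discarded rather than transliterated, presumably to avoid
Unicode encoding errors further down the pipeline.

    # -*- coding: utf-8 -*-
    # Sketch of the non-ASCII filter added in PATCH 3/3. The helper name
    # strip_non_ascii is hypothetical; the patch inlines this expression.

    def strip_non_ascii(text):
        """Keep only characters below code point 128 (7-bit ASCII)."""
        return ''.join(c for c in text if ord(c) < 128)

    print(strip_non_ascii(u'caf\xe9 \u2192 cafe'))  # prints 'caf  cafe'

Note that the filter is lossy by design: 'café' becomes 'caf', not 'cafe',
so any non-ASCII page content is gone before hashing and diffing.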