Permalink
Browse files

Revert "don't follow hyperlinks; process only homepages."

This reverts commit 96a40eb.
  • Loading branch information...
1 parent 96a40eb · commit b50c6b3a5fdedca58e6e9487536077138f8bf6c2 · @mihneadb committed Oct 13, 2012
Showing 1 changed file with 7 additions and 2 deletions.
  1. +7 −2 spade/scraper/spiders/general_spider.py
@@ -119,7 +119,7 @@ def parse(self, response):
'timestamp': self.get_now_time()})
# Continue crawling
- # Parse stylesheet links and scripts
+ # Parse stylesheet links, scripts, and hyperlinks
hxs = HtmlXPathSelector(response)
# Extract other target links
@@ -133,8 +133,13 @@ def parse(self, response):
except TypeError:
js_links = []
+ try:
+ hyperlinks = hxs.select('//a/@href').extract()
+ except TypeError:
+ hyperlinks = []
+
# Using a set removes duplicate links.
- all_links = set(js_links + css_links)
+ all_links = set(hyperlinks + js_links + css_links)
# Examine links, yield requests if they are valid
for url in all_links:

0 comments on commit b50c6b3

Please sign in to comment.