Merge branch 'master' into improve-performance
maurodoglio committed Apr 26, 2013
2 parents 87297f0 + 25cbf41 commit 407a19f
Showing 356 changed files with 35 additions and 25,242 deletions.
7 changes: 3 additions & 4 deletions requirements/compiled.txt
@@ -7,10 +7,9 @@
# non-Debian-based system).

MySQL-python
lxml
Twisted==13.0.0
pyOpenSSL
https://github.com/scrapy/scrapy/archive/0.17.0.zip
Celery==3.0.17
django-celery==3.0.17
kombu==2.4.7
numpy==1.7.0
numpy==1.7.0
zope.interface==4.0.1
2 changes: 1 addition & 1 deletion requirements/dev.txt
@@ -11,4 +11,4 @@ factory_boy==1.1.4
#for celery auto-reloading
pyinotify==0.9.4
#celery monitor
flower==0.4.3
flower==0.4.3
3 changes: 0 additions & 3 deletions requirements/pure.txt
@@ -1,9 +1,6 @@
# Pure-python dependencies; pre-installed in vendor/

Django>=1.5.1,<1.6
Scrapy==0.16.4
w3lib==1.2
zope.interface==4.0.1
argparse==1.2.1
cssutils==0.9.10
django-inmemorystorage==0.1.1
21 changes: 12 additions & 9 deletions spade/controller/management/commands/scrape.py
@@ -5,6 +5,10 @@
from __future__ import absolute_import

from django.core.management.base import BaseCommand
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import Settings
from scrapy import log


class Command(BaseCommand):
@@ -18,12 +22,11 @@ def handle(self, *args, **options):
self.stdout.write(u"Usage: {0}\n".format(self.args))
self.stdout.write(self.help)
else:
# Take a filename from command line to crawl
default = [u""]
default.append(u"crawl")
default.append(u"all")
default.append(u"-s")
default.append(u"URLS=" + unicode(args[0]))

from scrapy.cmdline import execute
execute(default)
settings = Settings()
settings.overrides['URLS'] = args[0]
crawler = Crawler(settings)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
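
The updated command drives the crawl in-process through Scrapy 0.17's Crawler API instead of shelling out via scrapy.cmdline.execute. Below is a minimal sketch of that pattern, not the commit's exact code: the spider instantiation sits outside the visible hunk, the args/help strings are placeholders, settings are loaded with get_project_settings() as the updated tests do, and a reactor-stop hook on spider_closed is added here so the command returns when the crawl finishes.

    # Minimal sketch (assumptions noted above) of a Django management command
    # that starts a Scrapy 0.17 crawl inside the Twisted reactor.
    from django.core.management.base import BaseCommand
    from scrapy import log, signals
    from scrapy.crawler import Crawler
    from scrapy.utils.project import get_project_settings
    from twisted.internet import reactor

    from spade.scraper.spiders.general_spider import GeneralSpider


    class Command(BaseCommand):
        args = "<url_list_file>"                       # placeholder usage string
        help = "Crawl the sites listed in the given text file"

        def handle(self, *args, **options):
            if len(args) != 1:
                self.stdout.write(u"Usage: {0}\n".format(self.args))
                self.stdout.write(self.help)
                return

            settings = get_project_settings()
            settings.overrides['URLS'] = args[0]       # file the spider reads URLs from

            spider = GeneralSpider()
            crawler = Crawler(settings)
            crawler.configure()
            # Stop the reactor when the spider closes so handle() can return.
            crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
            crawler.crawl(spider)
            crawler.start()

            log.start()    # route Scrapy log messages
            reactor.run()  # blocks until reactor.stop() fires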
2 changes: 1 addition & 1 deletion spade/scraper/pipelines.py
@@ -119,7 +119,7 @@ def open_spider(self, spider):
spider.batch.save()

# save initial site list
file_content = ContentFile('\n'.join(spider.start_urls))
file_content = ContentFile('\n'.join(spider.get_start_urls()))
filename = str(spider.batch).replace(' ', '')
spider.batch.sitelist.save(filename, file_content)
spider.batch.sitelist.close()
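
The pipeline now pulls the crawl list from spider.get_start_urls() when the spider opens, instead of a start_urls attribute populated in __init__. For context, a hedged sketch of the surrounding Django file-storage call (the class name is illustrative; field and attribute names follow the hunk above):

    # Hedged sketch: persisting the crawl list through Django's file-storage
    # API when the spider opens.
    from django.core.files.base import ContentFile


    class ScraperPipeline(object):

        def open_spider(self, spider):
            spider.batch.save()

            # One URL per line: exactly the sites this batch will crawl.
            file_content = ContentFile('\n'.join(spider.get_start_urls()))
            filename = str(spider.batch).replace(' ', '')

            # FieldFile.save() writes through the configured storage backend
            # and, by default, saves the model instance with the new file name.
            spider.batch.sitelist.save(filename, file_content)
            spider.batch.sitelist.close()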
11 changes: 4 additions & 7 deletions spade/scraper/spiders/general_spider.py
@@ -5,7 +5,6 @@

# Scrapy Imports
from scrapy import log
from scrapy.conf import settings
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
@@ -35,7 +34,6 @@ def __init__(self):
"""
Set URLs to traverse from
"""
self.start_urls = self.get_start_urls()

def get_now_time(self):
"""Gets a datetime"""
@@ -48,10 +46,10 @@ def log(self, msg):

def get_start_urls(self):
"""Extracts urls from a text file into the list of URLs to crawl"""
if not settings.get('URLS'):
if not self.settings.get('URLS'):
raise ValueError('No text file. Use -s URLS=somefile.txt')

with open(settings.get('URLS')) as data:
with open(self.settings.get('URLS')) as data:
return [line.rstrip('\r\n') for line in data]

def get_content_type(self, headers):
@@ -104,11 +102,10 @@ def parse(self, response):
else:
if 'text/html' not in self.get_content_type(response.headers):
# For linked content, find the urlscan it linked from

urlscan = model.URLScan.objects.get(

site_scan=sitescan,
page_url_hash=
sha256(response.meta['referrer']).hexdigest())
page_url_hash=sha256(response.meta['referrer']).hexdigest())
else:
# Only create urlscans for text/html
urlscan, us_created = model.URLScan.objects.get_or_create(
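
The spider stops importing the global scrapy.conf.settings and no longer materialises start_urls in __init__; the URL file named by the URLS setting is read on demand via self.settings, which is expected to resolve through the Crawler the spider is attached to (the updated tests bind it with spider.set_crawler(Crawler(settings))). A short sketch of the resulting pattern:

    # Sketch of the access pattern this diff moves to: the URLS setting names
    # a text file with one URL per line, read on demand through the spider's
    # crawler-bound settings rather than a global settings object.
    from scrapy.spider import BaseSpider


    class GeneralSpider(BaseSpider):
        name = 'all'  # the spider the old command invoked via "crawl all"

        def get_start_urls(self):
            """Extracts urls from a text file into the list of URLs to crawl"""
            if not self.settings.get('URLS'):
                raise ValueError('No text file. Use -s URLS=somefile.txt')
            with open(self.settings.get('URLS')) as data:
                return [line.rstrip('\r\n') for line in data]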
22 changes: 0 additions & 22 deletions spade/tests/scraper/middleware/test_spidermiddleware.py
@@ -1,29 +1,7 @@
from scrapy.conf import settings
from scrapy.http import Response, Request

from spade import model
from spade.scraper.middlewares import DepthMiddleware
from spade.scraper.middlewares import OffsiteMiddleware
from spade.scraper.spiders.general_spider import GeneralSpider


def pytest_funcarg__spider(request):
"""Use scrapy's overrides to start a spider w/ specific settings"""
# This is necessary because the spider errors when a source file is not
# provided.
settings.overrides['URLS'] = u"spade/tests/sitelists/urls.txt"
settings.overrides['LOG_ENABLED'] = True

# Initialize and return spider
spider = GeneralSpider()
now = spider.get_now_time()
spider.batch = model.Batch.objects.create(
kickoff_time=now, finish_time=now)
spider.batch.save()

# Delete created batch from database when test is done
request.addfinalizer(lambda: spider.batch.delete())
return spider


def pytest_funcarg__offsite_middleware(request):
23 changes: 14 additions & 9 deletions spade/tests/scraper/spider/test_spider.py
@@ -1,6 +1,7 @@
from hashlib import sha256
from scrapy.conf import settings
from scrapy.http import Response, Request
from scrapy.utils.project import get_project_settings
from scrapy.http import Response, Request, HtmlResponse
from scrapy.crawler import Crawler
from spade.scraper.spiders.general_spider import GeneralSpider
from spade import model
from spade.scraper.items import MarkupItem
@@ -13,10 +14,14 @@
def pytest_funcarg__spider(request):
"""Use scrapy's overrides to start the spider w/ specific settings"""

settings.overrides['LOG_ENABLED'] = True
settings = get_project_settings()
settings.overrides['URLS'] = u"spade/tests/sitelists/urls.txt"
spider = GeneralSpider()
settings.overrides['LOG_ENABLED'] = True

# Initialize and return spider

spider = GeneralSpider()
spider.set_crawler(Crawler(settings))
# Create initial batch
now = spider.get_now_time()
spider.batch = model.Batch.objects.create(
@@ -145,9 +150,9 @@ def test_spider_name(spider):

def test_spider_read_from_file(spider):
"""Ensure the test list of urls was read correctly"""
if len(spider.start_urls) != 1:
if len(spider.get_start_urls()) != 1:
assert False
elif spider.start_urls[0] == "http://localhost:8181":
elif spider.get_start_urls()[0] == "http://localhost:8181":
assert True
else:
assert False
@@ -193,13 +198,13 @@ def test_spider_crawls_links(spider, scrape_request, html_headers,
spider.batch_user_agents = [ua]

# Generate a mock response based on html containing two links
mock_response = Response('http://test:12345',
body=mock_html_twolinks)
mock_response = HtmlResponse(url='http://test:12345',
body=mock_html_twolinks,
encoding='utf-8')
mock_response.request = scrape_request
mock_response.headers = html_headers
mock_response.meta['user_agent'] = ua
mock_response.status = 200
mock_response.encoding = u'utf-8'
mock_response.flags = []

# Call spider on the mock response
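
The tests likewise stop mutating the global settings object: each test builds settings with get_project_settings(), binds the spider to its own Crawler, and fakes responses with HtmlResponse so the body is parsed as HTML. A condensed sketch of that setup (helper names here are illustrative, not the file's actual pytest funcargs):

    # Condensed sketch of the per-test setup this diff moves to; settings are
    # scoped to one Crawler instead of being applied globally.
    from scrapy.crawler import Crawler
    from scrapy.http import HtmlResponse
    from scrapy.utils.project import get_project_settings

    from spade.scraper.spiders.general_spider import GeneralSpider


    def make_test_spider():
        settings = get_project_settings()
        settings.overrides['URLS'] = u"spade/tests/sitelists/urls.txt"
        settings.overrides['LOG_ENABLED'] = True

        spider = GeneralSpider()
        spider.set_crawler(Crawler(settings))  # gives the spider its settings
        return spider


    def make_html_response(body, request, headers):
        # HtmlResponse (not the base Response) so the body is decoded and
        # parsed as HTML for link extraction.
        response = HtmlResponse(url='http://test:12345', body=body,
                                encoding='utf-8')
        response.request = request
        response.headers = headers
        return response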
42 changes: 0 additions & 42 deletions vendor/scrapy/__init__.py

This file was deleted.
