Merge pull request #413 from open-contracting/dryrun
Add dryrun command and clean up issues it identified
jpmckinney committed Jun 2, 2020
2 parents e32f339 + 25669c0 commit ce11307
Showing 25 changed files with 107 additions and 28 deletions.
2 changes: 1 addition & 1 deletion kingfisher_scrapy/base_spider.py
@@ -308,7 +308,7 @@ def start_requests(self):
@handle_error
def parse(self, response):
if self.zip_file_format:
- self.build_file_from_response(response, data_type='zip', post_to_api=False)
+ yield self.build_file_from_response(response, data_type='zip', post_to_api=False)

zip_file = ZipFile(BytesIO(response.body))
for finfo in zip_file.infolist():
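Context for the change above: parse() is a generator callback, and Scrapy only processes what the callback yields, so the File built by build_file_from_response() was previously discarded instead of reaching the item pipelines. A minimal illustrative sketch of the general rule (not part of this commit):

def parse(self, response):
    item = {'file_name': 'example', 'url': response.url}  # building an item alone has no effect
    yield item  # only yielded objects reach Scrapy's item pipelines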
47 changes: 47 additions & 0 deletions kingfisher_scrapy/commands/dryrun.py
@@ -0,0 +1,47 @@
from scrapy.commands import ScrapyCommand
from scrapy.crawler import CrawlerProcess

from kingfisher_scrapy.base_spider import BaseSpider, ZipSpider


def yield_nothing(*args, **kwargs):
    yield


class DryRun(ScrapyCommand):
    def short_desc(self):
        return 'Run a dry run of all spiders'

    def run(self, args, opts):
        BaseSpider.parse_json_lines = yield_nothing
        ZipSpider.parse = yield_nothing

        # Stop after one item or error.
        self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
        self.settings.set('CLOSESPIDER_ITEMCOUNT', 1)

        # Disable Kingfisher, Telnet, LogStats extensions.
        self.settings.set('EXTENSIONS', {
            'scrapy.extensions.telnet.TelnetConsole': None,
        })
        self.settings.set('LOGSTATS_INTERVAL', None)

        runner = CrawlerProcess(settings=self.settings)

        exceptions = {
            'test_fail',
            # Server unavailable
            'mexico_cdmx',
            # Require authentication
            'openopps',
            'paraguay_dncp_records',
            'paraguay_dncp_releases',
            'paraguay_hacienda',
        }

        for spider_name in runner.spider_loader.list():
            if spider_name not in exceptions:
                spidercls = runner.spider_loader.load(spider_name)
                runner.crawl(spidercls)

        runner.start()
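Scrapy discovers project commands like this one through the COMMANDS_MODULE setting; assuming the project registers its commands package as sketched below (the settings entry is not part of this diff), the dry run is invoked as scrapy dryrun and crawls every non-excluded spider until it scrapes one item or hits one error.

# settings.py (sketch, assumed): expose kingfisher_scrapy/commands/dryrun.py
# on the command line as `scrapy dryrun`.
COMMANDS_MODULE = 'kingfisher_scrapy.commands'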
5 changes: 5 additions & 0 deletions kingfisher_scrapy/extensions.py
@@ -17,8 +17,13 @@ def __init__(self, directory):
@classmethod
def from_crawler(cls, crawler):
directory = crawler.settings['FILES_STORE']

if not directory:
raise NotConfigured('FILES_STORE is not set.')

extension = cls(directory)
crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)

return extension

def item_scraped(self, item, spider):
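The new check follows the usual Scrapy convention: raising NotConfigured from an extension's from_crawler() tells Scrapy to skip loading that extension rather than abort the crawl. A minimal sketch of the pattern (illustrative class, assuming the NotConfigured import that sits outside the lines shown):

from scrapy.exceptions import NotConfigured

class ExampleExtension:
    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings['FILES_STORE']:
            # Scrapy catches NotConfigured and simply disables the extension.
            raise NotConfigured('FILES_STORE is not set.')
        return cls()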
4 changes: 2 additions & 2 deletions kingfisher_scrapy/log_formatter.py
@@ -1,7 +1,7 @@
- from scrapy import logformatter
+ from scrapy.logformatter import LogFormatter


- class KingfisherLogFormatter(logformatter.LogFormatter):
+ class KingfisherLogFormatter(LogFormatter):
# https://docs.scrapy.org/en/latest/_modules/scrapy/logformatter.html#LogFormatter.scraped
def scraped(self, item, response, spider):
"""
3 changes: 3 additions & 0 deletions kingfisher_scrapy/settings.py
@@ -32,6 +32,9 @@

# The maximum response size (in bytes) that downloader will download (default: 1073741824):
DOWNLOAD_MAXSIZE = 4000000000
DOWNLOAD_WARNSIZE = 0
# Many spiders time out when using default of 180.
DOWNLOAD_TIMEOUT = 360

# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 2
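These are project-wide defaults; Scrapy still allows a per-spider override through the download_timeout spider attribute (or custom_settings), which is how the bulk spiders later in this diff keep their much longer timeouts while the per-spider values that merely duplicated the new default are removed. A small illustrative sketch (hypothetical spider, not from this codebase):

import scrapy

class SlowSourceSpider(scrapy.Spider):
    name = 'slow_source'
    download_timeout = 99999  # overrides DOWNLOAD_TIMEOUT = 360 for this spider only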
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/armenia.py
@@ -8,7 +8,7 @@ class Armenia(LinksSpider):
name = 'armenia'
data_type = 'release_package'
next_pointer = '/next_page/uri'
- next_page_formatter = parameters('offset')
+ next_page_formatter = staticmethod(parameters('offset'))

def start_requests(self):
url = 'https://armeps.am/ocds/release'
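The staticmethod() wrapper added here (and in the similar spiders below) is needed because parameters('offset') returns a plain function: assigned directly as a class attribute, Python would bind it as an instance method and silently pass the spider as its first argument when it is called as self.next_page_formatter(url). A minimal sketch of the difference (illustrative names only, not from this codebase):

def formatter(url):
    return url.upper()

class Unwrapped:
    format_url = formatter  # self.format_url('x') calls formatter(self, 'x') -> TypeError

class Wrapped:
    format_url = staticmethod(formatter)  # self.format_url('x') calls formatter('x')

assert Wrapped().format_url('x') == 'X'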
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/australia.py
@@ -9,7 +9,7 @@
class Australia(LinksSpider):
name = 'australia'
data_type = 'release_package'
- next_page_formatter = parameters('cursor')
+ next_page_formatter = staticmethod(parameters('cursor'))

def start_requests(self):
url = f'https://api.tenders.gov.au/ocds/findByDates/contractPublished/' \
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/chile_base.py
@@ -9,7 +9,7 @@ class ChileCompraBaseSpider(SimpleSpider):
custom_settings = {
'DOWNLOAD_FAIL_ON_DATALOSS': False,
}
- download_timeout = 300

limit = 100
base_list_url = 'https://apis.mercadopublico.cl/OCDS/data/listaA%C3%B1oMes/{0.year:d}/{0.month:02d}/{1}/{2}'

1 change: 0 additions & 1 deletion kingfisher_scrapy/spiders/chile_compra_bulk.py
@@ -8,7 +8,6 @@ class ChileCompraBulk(ZipSpider):
name = 'chile_compra_bulk'
data_type = 'record_package'

- download_warnsize = 0
download_timeout = 99999
custom_settings = {
'DOWNLOAD_FAIL_ON_DATALOSS': False,
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/colombia.py
@@ -10,7 +10,7 @@

class Colombia(LinksSpider):
name = 'colombia'
- next_page_formatter = parameters('page')
+ next_page_formatter = staticmethod(parameters('page'))

def start_requests(self):
base_url = 'https://apiocds.colombiacompra.gov.co:8443/apiCCE2.0/rest/releases'
1 change: 0 additions & 1 deletion kingfisher_scrapy/spiders/colombia_bulk.py
@@ -17,7 +17,6 @@ class ColombiaBulk(ZipSpider):
encoding = 'iso-8859-1'
zip_file_format = 'json_lines'

- download_warnsize = 0
download_timeout = 99999
custom_settings = {
'DOWNLOAD_FAIL_ON_DATALOSS': False,
2 changes: 0 additions & 2 deletions kingfisher_scrapy/spiders/dominican_republic.py
@@ -11,8 +11,6 @@
class DominicanRepublic(BaseSpider):
name = 'dominican_republic'

- download_timeout = 360 # 6min

def start_requests(self):
yield scrapy.Request(
'https://www.dgcp.gob.do/estandar-mundial-ocds/',
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/georgia_records.py
@@ -7,7 +7,7 @@
class GeorgiaRecords(LinksSpider):
name = 'georgia_records'
data_type = 'record_package'
- next_page_formatter = parameters('page')
+ next_page_formatter = staticmethod(parameters('page'))

def start_requests(self):
url = 'https://odapi.spa.ge/api/records.json'
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/georgia_releases.py
@@ -7,7 +7,7 @@
class GeorgiaReleases(LinksSpider):
name = 'georgia_releases'
data_type = 'release_package'
- next_page_formatter = parameters('page')
+ next_page_formatter = staticmethod(parameters('page'))

def start_requests(self):
url = 'https://odapi.spa.ge/api/releases.json'
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/honduras_portal_records.py
@@ -9,7 +9,7 @@ class HondurasPortalRecords(LinksSpider):
data_type = 'record_package'
data_pointer = '/recordPackage'
next_pointer = '/next'
- next_page_formatter = parameters('page')
+ next_page_formatter = staticmethod(parameters('page'))

download_delay = 0.9

2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/honduras_portal_releases.py
@@ -9,7 +9,7 @@ class HondurasPortalReleases(LinksSpider):
data_type = 'release_package'
data_pointer = '/releasePackage'
next_pointer = '/next'
- next_page_formatter = parameters('page')
+ next_page_formatter = staticmethod(parameters('page'))

download_delay = 0.9

3 changes: 1 addition & 2 deletions kingfisher_scrapy/spiders/openopps.py
@@ -56,8 +56,7 @@ def from_crawler(cls, crawler, *args, **kwargs):
spider.username = crawler.settings.get('KINGFISHER_OPENOPPS_USERNAME')
spider.password = crawler.settings.get('KINGFISHER_OPENOPPS_PASSWORD')
if spider.username is None or spider.password is None:
- spider.logger.error('Please set the environment variables '
-                     'KINGFISHER_OPENOPPS_USERNAME and KINGFISHER_OPENOPPS_PASSWORD')
+ spider.logger.error('KINGFISHER_OPENOPPS_USERNAME and/or KINGFISHER_OPENOPPS_PASSWORD is not set.')
raise scrapy.exceptions.CloseSpider('authentication_credentials_missing')

return spider
3 changes: 1 addition & 2 deletions kingfisher_scrapy/spiders/paraguay_dncp_base.py
@@ -1,5 +1,4 @@
import json
- import logging
from datetime import datetime

import scrapy
@@ -44,7 +43,7 @@ def from_crawler(cls, crawler, *args, **kwargs):
spider.request_token = crawler.settings.get('KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN')

if spider.request_token is None:
- logging.error('No request token available')
+ spider.logger.error('KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN is not set.')
raise scrapy.exceptions.CloseSpider('authentication_credentials_missing')

return spider
3 changes: 2 additions & 1 deletion kingfisher_scrapy/spiders/paraguay_hacienda.py
@@ -34,7 +34,8 @@ def from_crawler(cls, crawler, *args, **kwargs):
spider.request_token = crawler.settings.get('KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN')
spider.client_secret = crawler.settings.get('KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET')
if spider.request_token is None or spider.client_secret is None:
- spider.logger.error('No request token or client secret available')
+ spider.logger.error('KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN and/or '
+                     'KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET is not set.')
raise scrapy.exceptions.CloseSpider('authentication_credentials_missing')

return spider
1 change: 0 additions & 1 deletion kingfisher_scrapy/spiders/portugal.py
@@ -12,7 +12,6 @@ class Portugal(ZipSpider):
encoding = 'iso-8859-1'
zip_file_format = 'json_lines'

- download_warnsize = 0
download_timeout = 9999

def start_requests(self):
8 changes: 4 additions & 4 deletions kingfisher_scrapy/spiders/uk_contracts_finder.py
@@ -1,10 +1,10 @@
import json

- from kingfisher_scrapy.base_spider import BaseSpider
- from kingfisher_scrapy.util import components, handle_error, parameters, replace_parameter
+ from kingfisher_scrapy.base_spider import SimpleSpider
+ from kingfisher_scrapy.util import handle_error, parameters, replace_parameter


- class UKContractsFinder(BaseSpider):
+ class UKContractsFinder(SimpleSpider):
name = 'uk_contracts_finder'
data_type = 'release_package_list_in_results'
encoding = 'iso-8859-1'
@@ -22,4 +22,4 @@ def parse_list(self, response):
total = data['maxPage']
for page in range(2, total + 1):
url = replace_parameter(response.request.url, 'page', page)
- yield self.build_request(url, formatter=components('page'))
+ yield self.build_request(url, formatter=parameters('page'))
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/uk_fts.py
@@ -7,7 +7,7 @@
class UKContractsFinder(LinksSpider):
name = 'uk_fts'
data_type = 'release_package_in_ocdsReleasePackage_in_list_in_results'
- next_page_formatter = parameters('cursor')
+ next_page_formatter = staticmethod(parameters('cursor'))

def start_requests(self):
# This URL was provided by the publisher and is not the production URL.
2 changes: 1 addition & 1 deletion kingfisher_scrapy/util.py
@@ -96,7 +96,7 @@ def replace_parameter(url, key, value):
parsed = urlsplit(url)
query = parse_qs(parsed.query)
if value is None:
- del query[key]
+ query.pop(key, None)
else:
query[key] = [value]
return parsed._replace(query=urlencode(query, doseq=True)).geturl()
12 changes: 12 additions & 0 deletions tests/test_util.py
@@ -0,0 +1,12 @@
import pytest

from kingfisher_scrapy.util import replace_parameter


@pytest.mark.parametrize('url,value,expected', [
    ('http://example.com/?page=1', 2, 'http://example.com/?page=2'),
    ('http://example.com/?page=1', None, 'http://example.com/'),
    ('http://example.com/', None, 'http://example.com/'),
])
def test_replace_parameter(url, value, expected):
    assert replace_parameter(url, 'page', value) == expected
20 changes: 19 additions & 1 deletion tests/test_zip_spider.py
@@ -51,9 +51,18 @@ def test_parse_json_lines(sample, len_items):

response = response_fixture(body=io.getvalue())
generator = spider.parse(response)
item = next(generator)
items = list(generator)

- # assert len(items) == len_items
assert type(item) is File
assert len(item) == 6
assert item['file_name'] == 'test'
assert item['url'] == 'http://example.com'
assert item['data_type'] == 'zip'
assert item['encoding'] == 'utf-8'
assert item['post_to_api'] is False

assert len(items) == len_items

for i, item in enumerate(items, 1):
assert type(item) is FileItem
@@ -83,8 +92,17 @@ def test_parse_release_package(sample, len_items, len_releases):

response = response_fixture(body=io.getvalue())
generator = spider.parse(response)
item = next(generator)
items = list(generator)

assert type(item) is File
assert len(item) == 6
assert item['file_name'] == 'test'
assert item['url'] == 'http://example.com'
assert item['data_type'] == 'zip'
assert item['encoding'] == 'utf-8'
assert item['post_to_api'] is False

assert len(items) == len_items

for i, item in enumerate(items, 1):
