diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index e2db8936..ddbdeab9 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -308,7 +308,7 @@ def start_requests(self):
     @handle_error
     def parse(self, response):
         if self.zip_file_format:
-            self.build_file_from_response(response, data_type='zip', post_to_api=False)
+            yield self.build_file_from_response(response, data_type='zip', post_to_api=False)
 
         zip_file = ZipFile(BytesIO(response.body))
         for finfo in zip_file.infolist():
diff --git a/kingfisher_scrapy/commands/dryrun.py b/kingfisher_scrapy/commands/dryrun.py
new file mode 100644
index 00000000..f7c35a71
--- /dev/null
+++ b/kingfisher_scrapy/commands/dryrun.py
@@ -0,0 +1,47 @@
+from scrapy.commands import ScrapyCommand
+from scrapy.crawler import CrawlerProcess
+
+from kingfisher_scrapy.base_spider import BaseSpider, ZipSpider
+
+
+def yield_nothing(*args, **kwargs):
+    yield
+
+
+class DryRun(ScrapyCommand):
+    def short_desc(self):
+        return 'Run a dry run of all spiders'
+
+    def run(self, args, opts):
+        BaseSpider.parse_json_lines = yield_nothing
+        ZipSpider.parse = yield_nothing
+
+        # Stop after one item or error.
+        self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
+        self.settings.set('CLOSESPIDER_ITEMCOUNT', 1)
+
+        # Disable Kingfisher, Telnet, LogStats extensions.
+        self.settings.set('EXTENSIONS', {
+            'scrapy.extensions.telnet.TelnetConsole': None,
+        })
+        self.settings.set('LOGSTATS_INTERVAL', None)
+
+        runner = CrawlerProcess(settings=self.settings)
+
+        exceptions = {
+            'test_fail',
+            # Server unavailable
+            'mexico_cdmx',
+            # Require authentication
+            'openopps',
+            'paraguay_dncp_records',
+            'paraguay_dncp_releases',
+            'paraguay_hacienda',
+        }
+
+        for spider_name in runner.spider_loader.list():
+            if spider_name not in exceptions:
+                spidercls = runner.spider_loader.load(spider_name)
+                runner.crawl(spidercls)
+
+        runner.start()
diff --git a/kingfisher_scrapy/extensions.py b/kingfisher_scrapy/extensions.py
index 297dd9af..1cc151f0 100644
--- a/kingfisher_scrapy/extensions.py
+++ b/kingfisher_scrapy/extensions.py
@@ -17,8 +17,13 @@ def __init__(self, directory):
     @classmethod
     def from_crawler(cls, crawler):
         directory = crawler.settings['FILES_STORE']
+
+        if not directory:
+            raise NotConfigured('FILES_STORE is not set.')
+
         extension = cls(directory)
         crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
+
         return extension
 
     def item_scraped(self, item, spider):
diff --git a/kingfisher_scrapy/log_formatter.py b/kingfisher_scrapy/log_formatter.py
index 9e0b42bd..a7298837 100644
--- a/kingfisher_scrapy/log_formatter.py
+++ b/kingfisher_scrapy/log_formatter.py
@@ -1,7 +1,7 @@
-from scrapy import logformatter
+from scrapy.logformatter import LogFormatter
 
 
-class KingfisherLogFormatter(logformatter.LogFormatter):
+class KingfisherLogFormatter(LogFormatter):
     # https://docs.scrapy.org/en/latest/_modules/scrapy/logformatter.html#LogFormatter.scraped
     def scraped(self, item, response, spider):
         """
diff --git a/kingfisher_scrapy/settings.py b/kingfisher_scrapy/settings.py
index 53fc8d38..7db1f5bc 100644
--- a/kingfisher_scrapy/settings.py
+++ b/kingfisher_scrapy/settings.py
@@ -32,6 +32,9 @@
 
 # The maximum response size (in bytes) that downloader will download (default: 1073741824):
 DOWNLOAD_MAXSIZE = 4000000000
+DOWNLOAD_WARNSIZE = 0
+# Many spiders time out when using the default of 180 seconds.
+DOWNLOAD_TIMEOUT = 360
 
 # The download delay setting will honor only one of:
 CONCURRENT_REQUESTS_PER_DOMAIN = 2
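The spider hunks that follow all wrap next_page_formatter in staticmethod(). The reason: a plain function assigned as a class attribute becomes a bound method on attribute lookup, so Python would pass the spider instance as the function's first argument. A minimal sketch of the difference, with illustrative names (format_url and the two classes are not from the codebase):

def format_url(url):  # expects exactly one argument, like parameters('page')
    return url.lower()


class WithoutStaticmethod:
    formatter = format_url  # instance lookup returns a bound method


class WithStaticmethod:
    formatter = staticmethod(format_url)  # instance lookup returns the plain function


print(WithStaticmethod().formatter('http://EXAMPLE.com'))  # http://example.com
try:
    WithoutStaticmethod().formatter('http://EXAMPLE.com')
except TypeError as e:
    print(e)  # format_url() takes 1 positional argument but 2 were given
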
diff --git a/kingfisher_scrapy/spiders/armenia.py b/kingfisher_scrapy/spiders/armenia.py
index 04437cd2..10dc0ada 100644
--- a/kingfisher_scrapy/spiders/armenia.py
+++ b/kingfisher_scrapy/spiders/armenia.py
@@ -8,7 +8,7 @@ class Armenia(LinksSpider):
     name = 'armenia'
     data_type = 'release_package'
     next_pointer = '/next_page/uri'
-    next_page_formatter = parameters('offset')
+    next_page_formatter = staticmethod(parameters('offset'))
 
     def start_requests(self):
         url = 'https://armeps.am/ocds/release'
diff --git a/kingfisher_scrapy/spiders/australia.py b/kingfisher_scrapy/spiders/australia.py
index 60ec5dcf..49bed716 100644
--- a/kingfisher_scrapy/spiders/australia.py
+++ b/kingfisher_scrapy/spiders/australia.py
@@ -9,7 +9,7 @@ class Australia(LinksSpider):
     name = 'australia'
     data_type = 'release_package'
-    next_page_formatter = parameters('cursor')
+    next_page_formatter = staticmethod(parameters('cursor'))
 
     def start_requests(self):
         url = f'https://api.tenders.gov.au/ocds/findByDates/contractPublished/' \
diff --git a/kingfisher_scrapy/spiders/chile_base.py b/kingfisher_scrapy/spiders/chile_base.py
index 1ec7fbe6..26552dab 100644
--- a/kingfisher_scrapy/spiders/chile_base.py
+++ b/kingfisher_scrapy/spiders/chile_base.py
@@ -9,7 +9,7 @@ class ChileCompraBaseSpider(SimpleSpider):
     custom_settings = {
         'DOWNLOAD_FAIL_ON_DATALOSS': False,
     }
-    download_timeout = 300
+    limit = 100
 
     base_list_url = 'https://apis.mercadopublico.cl/OCDS/data/listaA%C3%B1oMes/{0.year:d}/{0.month:02d}/{1}/{2}'
diff --git a/kingfisher_scrapy/spiders/chile_compra_bulk.py b/kingfisher_scrapy/spiders/chile_compra_bulk.py
index 854d627d..b35a9d13 100644
--- a/kingfisher_scrapy/spiders/chile_compra_bulk.py
+++ b/kingfisher_scrapy/spiders/chile_compra_bulk.py
@@ -8,7 +8,6 @@ class ChileCompraBulk(ZipSpider):
     name = 'chile_compra_bulk'
     data_type = 'record_package'
-    download_warnsize = 0
     download_timeout = 99999
     custom_settings = {
         'DOWNLOAD_FAIL_ON_DATALOSS': False,
diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py
index faedc130..92a97df0 100644
--- a/kingfisher_scrapy/spiders/colombia.py
+++ b/kingfisher_scrapy/spiders/colombia.py
@@ -10,7 +10,7 @@ class Colombia(LinksSpider):
     name = 'colombia'
-    next_page_formatter = parameters('page')
+    next_page_formatter = staticmethod(parameters('page'))
 
     def start_requests(self):
         base_url = 'https://apiocds.colombiacompra.gov.co:8443/apiCCE2.0/rest/releases'
diff --git a/kingfisher_scrapy/spiders/colombia_bulk.py b/kingfisher_scrapy/spiders/colombia_bulk.py
index 2f1c4036..ee9e2427 100644
--- a/kingfisher_scrapy/spiders/colombia_bulk.py
+++ b/kingfisher_scrapy/spiders/colombia_bulk.py
@@ -17,7 +17,6 @@ class ColombiaBulk(ZipSpider):
     encoding = 'iso-8859-1'
     zip_file_format = 'json_lines'
-    download_warnsize = 0
     download_timeout = 99999
     custom_settings = {
         'DOWNLOAD_FAIL_ON_DATALOSS': False,
diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py
index 18098a6c..0cde0c99 100644
--- a/kingfisher_scrapy/spiders/dominican_republic.py
+++ b/kingfisher_scrapy/spiders/dominican_republic.py
@@ -11,8 +11,6 @@ class DominicanRepublic(BaseSpider):
     name = 'dominican_republic'
 
-    download_timeout = 360  # 6min
-
    def start_requests(self):
         yield scrapy.Request(
             'https://www.dgcp.gob.do/estandar-mundial-ocds/',
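The hunks above drop per-spider download_warnsize = 0 and download_timeout attributes in favor of the project-wide DOWNLOAD_WARNSIZE = 0 and DOWNLOAD_TIMEOUT = 360 added to settings.py; the bulk spiders keep download_timeout = 99999 because Scrapy's DownloadTimeoutMiddleware lets a spider attribute override the global setting. A minimal sketch of that override mechanism (the spider name is illustrative):

import scrapy


class SlowSourceSpider(scrapy.Spider):
    # Illustrative spider: DownloadTimeoutMiddleware reads this class
    # attribute, which takes precedence over the project-wide
    # DOWNLOAD_TIMEOUT = 360 for this spider's requests only.
    name = 'slow_source'
    download_timeout = 99999
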
diff --git a/kingfisher_scrapy/spiders/georgia_records.py b/kingfisher_scrapy/spiders/georgia_records.py
index f9f7cef3..d2eefad7 100644
--- a/kingfisher_scrapy/spiders/georgia_records.py
+++ b/kingfisher_scrapy/spiders/georgia_records.py
@@ -7,7 +7,7 @@ class GeorgiaRecords(LinksSpider):
     name = 'georgia_records'
     data_type = 'record_package'
-    next_page_formatter = parameters('page')
+    next_page_formatter = staticmethod(parameters('page'))
 
     def start_requests(self):
         url = 'https://odapi.spa.ge/api/records.json'
diff --git a/kingfisher_scrapy/spiders/georgia_releases.py b/kingfisher_scrapy/spiders/georgia_releases.py
index ee3ce55b..25437126 100644
--- a/kingfisher_scrapy/spiders/georgia_releases.py
+++ b/kingfisher_scrapy/spiders/georgia_releases.py
@@ -7,7 +7,7 @@ class GeorgiaReleases(LinksSpider):
     name = 'georgia_releases'
     data_type = 'release_package'
-    next_page_formatter = parameters('page')
+    next_page_formatter = staticmethod(parameters('page'))
 
     def start_requests(self):
         url = 'https://odapi.spa.ge/api/releases.json'
diff --git a/kingfisher_scrapy/spiders/honduras_portal_records.py b/kingfisher_scrapy/spiders/honduras_portal_records.py
index c5f5ec69..542213a1 100644
--- a/kingfisher_scrapy/spiders/honduras_portal_records.py
+++ b/kingfisher_scrapy/spiders/honduras_portal_records.py
@@ -9,7 +9,7 @@ class HondurasPortalRecords(LinksSpider):
     data_type = 'record_package'
     data_pointer = '/recordPackage'
     next_pointer = '/next'
-    next_page_formatter = parameters('page')
+    next_page_formatter = staticmethod(parameters('page'))
 
     download_delay = 0.9
diff --git a/kingfisher_scrapy/spiders/honduras_portal_releases.py b/kingfisher_scrapy/spiders/honduras_portal_releases.py
index ca4c56f1..a32bf9e8 100644
--- a/kingfisher_scrapy/spiders/honduras_portal_releases.py
+++ b/kingfisher_scrapy/spiders/honduras_portal_releases.py
@@ -9,7 +9,7 @@ class HondurasPortalReleases(LinksSpider):
     data_type = 'release_package'
     data_pointer = '/releasePackage'
     next_pointer = '/next'
-    next_page_formatter = parameters('page')
+    next_page_formatter = staticmethod(parameters('page'))
 
     download_delay = 0.9
diff --git a/kingfisher_scrapy/spiders/openopps.py b/kingfisher_scrapy/spiders/openopps.py
index e5d6ee32..1d9e22e9 100644
--- a/kingfisher_scrapy/spiders/openopps.py
+++ b/kingfisher_scrapy/spiders/openopps.py
@@ -56,8 +56,7 @@ def from_crawler(cls, crawler, *args, **kwargs):
         spider.username = crawler.settings.get('KINGFISHER_OPENOPPS_USERNAME')
         spider.password = crawler.settings.get('KINGFISHER_OPENOPPS_PASSWORD')
         if spider.username is None or spider.password is None:
-            spider.logger.error('Please set the environment variables '
-                                'KINGFISHER_OPENOPPS_USERNAME and KINGFISHER_OPENOPPS_PASSWORD')
+            spider.logger.error('KINGFISHER_OPENOPPS_USERNAME and/or KINGFISHER_OPENOPPS_PASSWORD is not set.')
             raise scrapy.exceptions.CloseSpider('authentication_credentials_missing')
 
         return spider
diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py
index 30580bfb..0a1b63c2 100644
--- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py
+++ b/kingfisher_scrapy/spiders/paraguay_dncp_base.py
@@ -1,5 +1,4 @@
 import json
-import logging
 from datetime import datetime
 
 import scrapy
@@ -44,7 +43,7 @@ def from_crawler(cls, crawler, *args, **kwargs):
         spider.request_token = crawler.settings.get('KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN')
 
         if spider.request_token is None:
-            logging.error('No request token available')
+            spider.logger.error('KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN is not set.')
             raise scrapy.exceptions.CloseSpider('authentication_credentials_missing')
 
         return spider
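Besides rewording the message, the paraguay_dncp_base.py hunk swaps the module-level logging.error() for spider.logger.error(). The spider's logger is named after the spider, so the record identifies which spider emitted it. A minimal sketch (the spider is illustrative, not from the codebase):

import logging

import scrapy


class ExampleSpider(scrapy.Spider):
    # Illustrative spider: self.logger is a LoggerAdapter around a logger
    # named after the spider ('example'), so its records are attributable;
    # the bare logging.error() call goes to the root logger instead.
    name = 'example'

    def parse(self, response):
        self.logger.error('attributed to the "example" logger')
        logging.error('attributed to the root logger')
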
diff --git a/kingfisher_scrapy/spiders/paraguay_hacienda.py b/kingfisher_scrapy/spiders/paraguay_hacienda.py
index 522dbde9..d25ae23b 100644
--- a/kingfisher_scrapy/spiders/paraguay_hacienda.py
+++ b/kingfisher_scrapy/spiders/paraguay_hacienda.py
@@ -34,7 +34,8 @@ def from_crawler(cls, crawler, *args, **kwargs):
         spider.request_token = crawler.settings.get('KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN')
         spider.client_secret = crawler.settings.get('KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET')
         if spider.request_token is None or spider.client_secret is None:
-            spider.logger.error('No request token or client secret available')
+            spider.logger.error('KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN and/or '
+                                'KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET is not set.')
             raise scrapy.exceptions.CloseSpider('authentication_credentials_missing')
 
         return spider
diff --git a/kingfisher_scrapy/spiders/portugal.py b/kingfisher_scrapy/spiders/portugal.py
index fdf25ba7..263aef58 100644
--- a/kingfisher_scrapy/spiders/portugal.py
+++ b/kingfisher_scrapy/spiders/portugal.py
@@ -12,7 +12,6 @@ class Portugal(ZipSpider):
     encoding = 'iso-8859-1'
     zip_file_format = 'json_lines'
-    download_warnsize = 0
     download_timeout = 9999
 
     def start_requests(self):
diff --git a/kingfisher_scrapy/spiders/uk_contracts_finder.py b/kingfisher_scrapy/spiders/uk_contracts_finder.py
index baf3dce3..f33acf93 100644
--- a/kingfisher_scrapy/spiders/uk_contracts_finder.py
+++ b/kingfisher_scrapy/spiders/uk_contracts_finder.py
@@ -1,10 +1,10 @@
 import json
 
-from kingfisher_scrapy.base_spider import BaseSpider
-from kingfisher_scrapy.util import components, handle_error, parameters, replace_parameter
+from kingfisher_scrapy.base_spider import SimpleSpider
+from kingfisher_scrapy.util import handle_error, parameters, replace_parameter
 
 
-class UKContractsFinder(BaseSpider):
+class UKContractsFinder(SimpleSpider):
     name = 'uk_contracts_finder'
     data_type = 'release_package_list_in_results'
     encoding = 'iso-8859-1'
@@ -22,4 +22,4 @@ def parse_list(self, response):
         total = data['maxPage']
         for page in range(2, total + 1):
             url = replace_parameter(response.request.url, 'page', page)
-            yield self.build_request(url, formatter=components('page'))
+            yield self.build_request(url, formatter=parameters('page'))
diff --git a/kingfisher_scrapy/spiders/uk_fts.py b/kingfisher_scrapy/spiders/uk_fts.py
index 6c172dbf..ed3fd46b 100644
--- a/kingfisher_scrapy/spiders/uk_fts.py
+++ b/kingfisher_scrapy/spiders/uk_fts.py
@@ -7,7 +7,7 @@ class UKContractsFinder(LinksSpider):
     name = 'uk_fts'
     data_type = 'release_package_in_ocdsReleasePackage_in_list_in_results'
-    next_page_formatter = parameters('cursor')
+    next_page_formatter = staticmethod(parameters('cursor'))
 
     def start_requests(self):
         # This URL was provided by the publisher and is not the production URL.
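The util.py hunk below replaces del query[key] with query.pop(key, None), so removing a query-string parameter that is absent no longer raises KeyError; the new test case for 'http://example.com/' in tests/test_util.py covers exactly this. A minimal sketch of the difference:

query = {}  # parsed query string with no 'page' parameter

print(query.pop('page', None))  # prints None: an absent key is tolerated

try:
    del query['page']
except KeyError as e:
    print('del raises', e)  # del raises 'page'
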
diff --git a/kingfisher_scrapy/util.py b/kingfisher_scrapy/util.py
index f20487f5..cba2376e 100644
--- a/kingfisher_scrapy/util.py
+++ b/kingfisher_scrapy/util.py
@@ -96,7 +96,7 @@ def replace_parameter(url, key, value):
     parsed = urlsplit(url)
     query = parse_qs(parsed.query)
     if value is None:
-        del query[key]
+        query.pop(key, None)
     else:
         query[key] = [value]
     return parsed._replace(query=urlencode(query, doseq=True)).geturl()
diff --git a/tests/test_util.py b/tests/test_util.py
new file mode 100644
index 00000000..4d118cc4
--- /dev/null
+++ b/tests/test_util.py
@@ -0,0 +1,12 @@
+import pytest
+
+from kingfisher_scrapy.util import replace_parameter
+
+
+@pytest.mark.parametrize('url,value,expected', [
+    ('http://example.com/?page=1', 2, 'http://example.com/?page=2'),
+    ('http://example.com/?page=1', None, 'http://example.com/'),
+    ('http://example.com/', None, 'http://example.com/'),
+])
+def test_replace_parameter(url, value, expected):
+    assert replace_parameter(url, 'page', value) == expected
diff --git a/tests/test_zip_spider.py b/tests/test_zip_spider.py
index 2c464f82..cdd936c2 100644
--- a/tests/test_zip_spider.py
+++ b/tests/test_zip_spider.py
@@ -51,9 +51,18 @@ def test_parse_json_lines(sample, len_items):
     response = response_fixture(body=io.getvalue())
 
     generator = spider.parse(response)
+    item = next(generator)
     items = list(generator)
 
-    # assert len(items) == len_items
+    assert type(item) is File
+    assert len(item) == 6
+    assert item['file_name'] == 'test'
+    assert item['url'] == 'http://example.com'
+    assert item['data_type'] == 'zip'
+    assert item['encoding'] == 'utf-8'
+    assert item['post_to_api'] is False
+
+    assert len(items) == len_items
 
     for i, item in enumerate(items, 1):
         assert type(item) is FileItem
@@ -83,8 +92,17 @@ def test_parse_release_package(sample, len_items, len_releases):
     response = response_fixture(body=io.getvalue())
 
     generator = spider.parse(response)
+    item = next(generator)
     items = list(generator)
 
+    assert type(item) is File
+    assert len(item) == 6
+    assert item['file_name'] == 'test'
+    assert item['url'] == 'http://example.com'
+    assert item['data_type'] == 'zip'
+    assert item['encoding'] == 'utf-8'
+    assert item['post_to_api'] is False
+
    assert len(items) == len_items
 
     for i, item in enumerate(items, 1):
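These test changes consume the first yielded item separately because of the base_spider.py fix at the top of this diff: without yield, the File built for the ZIP archive was silently discarded inside the generator. A minimal, self-contained sketch of that failure mode (all names are illustrative):

def build_file(name):
    return {'file_name': name}


def parse_without_yield():
    build_file('archive.zip')  # return value discarded; the File never reaches the pipeline
    yield {'file_name': 'entry.json'}


def parse_with_yield():
    yield build_file('archive.zip')  # the File for the archive is now emitted first
    yield {'file_name': 'entry.json'}


assert len(list(parse_without_yield())) == 1
assert len(list(parse_with_yield())) == 2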