From f30cde63eddeb4468af86dbc02495d2747ec038d Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Mon, 1 Jun 2020 23:54:10 -0400
Subject: [PATCH 01/15] Set DOWNLOAD_WARNSIZE = 0 globally (digiwhist_*,
 georgia_opendata, argentina_buenos_aires, moldova_old)

---
 kingfisher_scrapy/settings.py                  | 1 +
 kingfisher_scrapy/spiders/chile_compra_bulk.py | 1 -
 kingfisher_scrapy/spiders/colombia_bulk.py     | 1 -
 kingfisher_scrapy/spiders/portugal.py          | 1 -
 4 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/kingfisher_scrapy/settings.py b/kingfisher_scrapy/settings.py
index 53fc8d38..7ee095a8 100644
--- a/kingfisher_scrapy/settings.py
+++ b/kingfisher_scrapy/settings.py
@@ -32,6 +32,7 @@
 # The maximum response size (in bytes) that downloader will download (default: 1073741824):
 DOWNLOAD_MAXSIZE = 4000000000
+DOWNLOAD_WARNSIZE = 0

 # The download delay setting will honor only one of:
 CONCURRENT_REQUESTS_PER_DOMAIN = 2
diff --git a/kingfisher_scrapy/spiders/chile_compra_bulk.py b/kingfisher_scrapy/spiders/chile_compra_bulk.py
index 854d627d..b35a9d13 100644
--- a/kingfisher_scrapy/spiders/chile_compra_bulk.py
+++ b/kingfisher_scrapy/spiders/chile_compra_bulk.py
@@ -8,7 +8,6 @@ class ChileCompraBulk(ZipSpider):
     name = 'chile_compra_bulk'
     data_type = 'record_package'

-    download_warnsize = 0
     download_timeout = 99999
     custom_settings = {
         'DOWNLOAD_FAIL_ON_DATALOSS': False,
diff --git a/kingfisher_scrapy/spiders/colombia_bulk.py b/kingfisher_scrapy/spiders/colombia_bulk.py
index 2f1c4036..ee9e2427 100644
--- a/kingfisher_scrapy/spiders/colombia_bulk.py
+++ b/kingfisher_scrapy/spiders/colombia_bulk.py
@@ -17,7 +17,6 @@ class ColombiaBulk(ZipSpider):
     encoding = 'iso-8859-1'
     zip_file_format = 'json_lines'

-    download_warnsize = 0
     download_timeout = 99999
     custom_settings = {
         'DOWNLOAD_FAIL_ON_DATALOSS': False,
diff --git a/kingfisher_scrapy/spiders/portugal.py b/kingfisher_scrapy/spiders/portugal.py
index fdf25ba7..263aef58 100644
--- a/kingfisher_scrapy/spiders/portugal.py
+++ b/kingfisher_scrapy/spiders/portugal.py
@@ -12,7 +12,6 @@ class Portugal(ZipSpider):
     encoding = 'iso-8859-1'
     zip_file_format = 'json_lines'

-    download_warnsize = 0
     download_timeout = 9999

     def start_requests(self):

From 4ccf9e08c69ffb9d5222972777854535e0f3b11e Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Mon, 1 Jun 2020 23:54:42 -0400
Subject: [PATCH 02/15] Don't load KingfisherFilesStore if FILES_STORE is not
 set

---
 kingfisher_scrapy/extensions.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kingfisher_scrapy/extensions.py b/kingfisher_scrapy/extensions.py
index 297dd9af..1cc151f0 100644
--- a/kingfisher_scrapy/extensions.py
+++ b/kingfisher_scrapy/extensions.py
@@ -17,8 +17,13 @@ def __init__(self, directory):
     @classmethod
     def from_crawler(cls, crawler):
         directory = crawler.settings['FILES_STORE']
+
+        if not directory:
+            raise NotConfigured('FILES_STORE is not set.')
+
         extension = cls(directory)
         crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
+
         return extension

     def item_scraped(self, item, spider):
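Note: Scrapy disables, rather than errors on, any extension whose from_crawler()
raises NotConfigured, so the crawl simply proceeds without KingfisherFilesStore
when FILES_STORE is unset. (NotConfigured lives in scrapy.exceptions; the hunk
assumes it is already imported in extensions.py.) A minimal sketch of the idiom,
with a hypothetical extension and setting name:

    from scrapy import signals
    from scrapy.exceptions import NotConfigured

    class ExampleExtension:
        def __init__(self, directory):
            self.directory = directory

        @classmethod
        def from_crawler(cls, crawler):
            directory = crawler.settings['EXAMPLE_DIRECTORY']  # hypothetical setting
            if not directory:
                # Scrapy catches this and skips the extension.
                raise NotConfigured('EXAMPLE_DIRECTORY is not set.')
            extension = cls(directory)
            crawler.signals.connect(extension.item_scraped, signal=signals.item_scraped)
            return extension

        def item_scraped(self, item, spider):
            pass  # e.g. write the item under self.directory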
From 88bce5e68d99e2341c1b54eb2cc4cf8829157632 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Mon, 1 Jun 2020 23:55:11 -0400
Subject: [PATCH 03/15] log_formatter: Use shorter form to import class

---
 kingfisher_scrapy/log_formatter.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kingfisher_scrapy/log_formatter.py b/kingfisher_scrapy/log_formatter.py
index 9e0b42bd..a7298837 100644
--- a/kingfisher_scrapy/log_formatter.py
+++ b/kingfisher_scrapy/log_formatter.py
@@ -1,7 +1,7 @@
-from scrapy import logformatter
+from scrapy.logformatter import LogFormatter


-class KingfisherLogFormatter(logformatter.LogFormatter):
+class KingfisherLogFormatter(LogFormatter):
     # https://docs.scrapy.org/en/latest/_modules/scrapy/logformatter.html#LogFormatter.scraped
     def scraped(self, item, response, spider):
         """

From a520a1f227a2cbb8c3e33cb75f1f0a5234c36bb1 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Mon, 1 Jun 2020 23:56:16 -0400
Subject: [PATCH 04/15] Fix next_page_formatter (needs staticmethod)

---
 kingfisher_scrapy/spiders/armenia.py                  | 2 +-
 kingfisher_scrapy/spiders/australia.py                | 2 +-
 kingfisher_scrapy/spiders/colombia.py                 | 2 +-
 kingfisher_scrapy/spiders/georgia_records.py          | 2 +-
 kingfisher_scrapy/spiders/georgia_releases.py         | 2 +-
 kingfisher_scrapy/spiders/honduras_portal_records.py  | 2 +-
 kingfisher_scrapy/spiders/honduras_portal_releases.py | 2 +-
 kingfisher_scrapy/spiders/uk_fts.py                   | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/kingfisher_scrapy/spiders/armenia.py b/kingfisher_scrapy/spiders/armenia.py
index 04437cd2..10dc0ada 100644
--- a/kingfisher_scrapy/spiders/armenia.py
+++ b/kingfisher_scrapy/spiders/armenia.py
@@ -8,7 +8,7 @@ class Armenia(LinksSpider):
     name = 'armenia'
     data_type = 'release_package'
     next_pointer = '/next_page/uri'
-    next_page_formatter = parameters('offset')
+    next_page_formatter = staticmethod(parameters('offset'))

     def start_requests(self):
         url = 'https://armeps.am/ocds/release'
diff --git a/kingfisher_scrapy/spiders/australia.py b/kingfisher_scrapy/spiders/australia.py
index 60ec5dcf..49bed716 100644
--- a/kingfisher_scrapy/spiders/australia.py
+++ b/kingfisher_scrapy/spiders/australia.py
@@ -9,7 +9,7 @@
 class Australia(LinksSpider):
     name = 'australia'
     data_type = 'release_package'
-    next_page_formatter = parameters('cursor')
+    next_page_formatter = staticmethod(parameters('cursor'))

     def start_requests(self):
         url = f'https://api.tenders.gov.au/ocds/findByDates/contractPublished/' \
diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py
index faedc130..92a97df0 100644
--- a/kingfisher_scrapy/spiders/colombia.py
+++ b/kingfisher_scrapy/spiders/colombia.py
@@ -10,7 +10,7 @@

 class Colombia(LinksSpider):
     name = 'colombia'
-    next_page_formatter = parameters('page')
+    next_page_formatter = staticmethod(parameters('page'))

     def start_requests(self):
         base_url = 'https://apiocds.colombiacompra.gov.co:8443/apiCCE2.0/rest/releases'
diff --git a/kingfisher_scrapy/spiders/georgia_records.py b/kingfisher_scrapy/spiders/georgia_records.py
index f9f7cef3..d2eefad7 100644
--- a/kingfisher_scrapy/spiders/georgia_records.py
+++ b/kingfisher_scrapy/spiders/georgia_records.py
@@ -7,7 +7,7 @@
 class GeorgiaRecords(LinksSpider):
     name = 'georgia_records'
     data_type = 'record_package'
-    next_page_formatter = parameters('page')
+    next_page_formatter = staticmethod(parameters('page'))

     def start_requests(self):
         url = 'https://odapi.spa.ge/api/records.json'
diff --git a/kingfisher_scrapy/spiders/georgia_releases.py b/kingfisher_scrapy/spiders/georgia_releases.py
index ee3ce55b..25437126 100644
--- a/kingfisher_scrapy/spiders/georgia_releases.py
+++ b/kingfisher_scrapy/spiders/georgia_releases.py
@@ -7,7 +7,7 @@
 class GeorgiaReleases(LinksSpider):
     name = 'georgia_releases'
     data_type = 'release_package'
-    next_page_formatter = parameters('page')
+    next_page_formatter = staticmethod(parameters('page'))

     def start_requests(self):
         url = 'https://odapi.spa.ge/api/releases.json'
diff --git a/kingfisher_scrapy/spiders/honduras_portal_records.py b/kingfisher_scrapy/spiders/honduras_portal_records.py
index c5f5ec69..542213a1 100644
--- a/kingfisher_scrapy/spiders/honduras_portal_records.py
+++ b/kingfisher_scrapy/spiders/honduras_portal_records.py
@@ -9,7 +9,7 @@ class HondurasPortalRecords(LinksSpider):
     data_type = 'record_package'
     data_pointer = '/recordPackage'
     next_pointer = '/next'
-    next_page_formatter = parameters('page')
+    next_page_formatter = staticmethod(parameters('page'))

     download_delay = 0.9
diff --git a/kingfisher_scrapy/spiders/honduras_portal_releases.py b/kingfisher_scrapy/spiders/honduras_portal_releases.py
index ca4c56f1..a32bf9e8 100644
--- a/kingfisher_scrapy/spiders/honduras_portal_releases.py
+++ b/kingfisher_scrapy/spiders/honduras_portal_releases.py
@@ -9,7 +9,7 @@ class HondurasPortalReleases(LinksSpider):
     data_type = 'release_package'
     data_pointer = '/releasePackage'
     next_pointer = '/next'
-    next_page_formatter = parameters('page')
+    next_page_formatter = staticmethod(parameters('page'))

     download_delay = 0.9
diff --git a/kingfisher_scrapy/spiders/uk_fts.py b/kingfisher_scrapy/spiders/uk_fts.py
index 6c172dbf..ed3fd46b 100644
--- a/kingfisher_scrapy/spiders/uk_fts.py
+++ b/kingfisher_scrapy/spiders/uk_fts.py
@@ -7,7 +7,7 @@ class UKContractsFinder(LinksSpider):
     name = 'uk_fts'
     data_type = 'release_package_in_ocdsReleasePackage_in_list_in_results'
-    next_page_formatter = parameters('cursor')
+    next_page_formatter = staticmethod(parameters('cursor'))

     def start_requests(self):
         # This URL was provided by the publisher and is not the production URL.
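Note: the staticmethod wrapper matters because parameters('page') returns a plain
function, and in Python 3 a function stored as a class attribute is a descriptor:
looked up through the spider instance, it binds as a method and receives the
spider as its first argument, displacing the URL. A self-contained illustration
(the names are invented for the demo):

    def make_formatter(key):
        def formatter(url):
            return '%s-%s' % (key, url)
        return formatter

    class Broken:
        formatter = make_formatter('page')  # binds; the instance fills the url slot

    class Fixed:
        formatter = staticmethod(make_formatter('page'))  # no binding

    Fixed().formatter('http://example.com/?page=2')   # 'page-http://example.com/?page=2'
    Broken().formatter('http://example.com/?page=2')  # TypeError: formatter() takes 1
                                                      # positional argument but 2 were given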
From c245905be16d63bff50097bf363c70172117b93c Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Mon, 1 Jun 2020 23:56:49 -0400
Subject: [PATCH 05/15] uk_contracts_finder: Fix formatter and inheritance

---
 kingfisher_scrapy/spiders/uk_contracts_finder.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kingfisher_scrapy/spiders/uk_contracts_finder.py b/kingfisher_scrapy/spiders/uk_contracts_finder.py
index baf3dce3..f33acf93 100644
--- a/kingfisher_scrapy/spiders/uk_contracts_finder.py
+++ b/kingfisher_scrapy/spiders/uk_contracts_finder.py
@@ -1,10 +1,10 @@
 import json

-from kingfisher_scrapy.base_spider import BaseSpider
-from kingfisher_scrapy.util import components, handle_error, parameters, replace_parameter
+from kingfisher_scrapy.base_spider import SimpleSpider
+from kingfisher_scrapy.util import handle_error, parameters, replace_parameter


-class UKContractsFinder(BaseSpider):
+class UKContractsFinder(SimpleSpider):
     name = 'uk_contracts_finder'
     data_type = 'release_package_list_in_results'
     encoding = 'iso-8859-1'
@@ -22,4 +22,4 @@ def parse_list(self, response):
         total = data['maxPage']
         for page in range(2, total + 1):
             url = replace_parameter(response.request.url, 'page', page)
-            yield self.build_request(url, formatter=components('page'))
+            yield self.build_request(url, formatter=parameters('page'))

From 0840d4ea531d655de15751cab12bd9e9cfc52ddd Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Mon, 1 Jun 2020 23:57:10 -0400
Subject: [PATCH 06/15] Fix replace_parameter if parameter not set

---
 kingfisher_scrapy/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kingfisher_scrapy/util.py b/kingfisher_scrapy/util.py
index f20487f5..cba2376e 100644
--- a/kingfisher_scrapy/util.py
+++ b/kingfisher_scrapy/util.py
@@ -96,7 +96,7 @@ def replace_parameter(url, key, value):
     parsed = urlsplit(url)
     query = parse_qs(parsed.query)
     if value is None:
-        del query[key]
+        query.pop(key, None)
     else:
         query[key] = [value]
     return parsed._replace(query=urlencode(query, doseq=True)).geturl()
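Note: dict.pop(key, None) makes removal a no-op when the parameter is absent,
where del raised KeyError. The resulting behaviour, mirroring the test added in
PATCH 08 below:

    from kingfisher_scrapy.util import replace_parameter

    replace_parameter('http://example.com/?page=1', 'page', 2)     # 'http://example.com/?page=2'
    replace_parameter('http://example.com/?page=1', 'page', None)  # 'http://example.com/'
    replace_parameter('http://example.com/', 'page', None)         # 'http://example.com/' (previously KeyError)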
From 1ce52efa1b5a6d3af148f19de62bd6f7a76c3a92 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Mon, 1 Jun 2020 23:57:33 -0400
Subject: [PATCH 07/15] Log clearer error message if authentication not
 configured

---
 kingfisher_scrapy/spiders/openopps.py           | 3 +--
 kingfisher_scrapy/spiders/paraguay_dncp_base.py | 2 +-
 kingfisher_scrapy/spiders/paraguay_hacienda.py  | 3 ++-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kingfisher_scrapy/spiders/openopps.py b/kingfisher_scrapy/spiders/openopps.py
index e5d6ee32..1d9e22e9 100644
--- a/kingfisher_scrapy/spiders/openopps.py
+++ b/kingfisher_scrapy/spiders/openopps.py
@@ -56,8 +56,7 @@ def from_crawler(cls, crawler, *args, **kwargs):
         spider.username = crawler.settings.get('KINGFISHER_OPENOPPS_USERNAME')
         spider.password = crawler.settings.get('KINGFISHER_OPENOPPS_PASSWORD')
         if spider.username is None or spider.password is None:
-            spider.logger.error('Please set the environment variables '
-                                'KINGFISHER_OPENOPPS_USERNAME and KINGFISHER_OPENOPPS_PASSWORD')
+            spider.logger.error('KINGFISHER_OPENOPPS_USERNAME and/or KINGFISHER_OPENOPPS_PASSWORD is not set.')
             raise scrapy.exceptions.CloseSpider('authentication_credentials_missing')

         return spider
diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py
index 30580bfb..5d81ea05 100644
--- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py
+++ b/kingfisher_scrapy/spiders/paraguay_dncp_base.py
@@ -44,7 +44,7 @@ def from_crawler(cls, crawler, *args, **kwargs):
         spider.request_token = crawler.settings.get('KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN')

         if spider.request_token is None:
-            logging.error('No request token available')
+            spider.logger.error('KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN is not set.')
             raise scrapy.exceptions.CloseSpider('authentication_credentials_missing')

         return spider
diff --git a/kingfisher_scrapy/spiders/paraguay_hacienda.py b/kingfisher_scrapy/spiders/paraguay_hacienda.py
index 522dbde9..d25ae23b 100644
--- a/kingfisher_scrapy/spiders/paraguay_hacienda.py
+++ b/kingfisher_scrapy/spiders/paraguay_hacienda.py
@@ -34,7 +34,8 @@ def from_crawler(cls, crawler, *args, **kwargs):
         spider.request_token = crawler.settings.get('KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN')
         spider.client_secret = crawler.settings.get('KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET')
         if spider.request_token is None or spider.client_secret is None:
-            spider.logger.error('No request token or client secret available')
+            spider.logger.error('KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN and/or '
+                                'KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET is not set.')
             raise scrapy.exceptions.CloseSpider('authentication_credentials_missing')

         return spider

From 924445635281e1ea1116dbf31541513b9e4277f6 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Tue, 2 Jun 2020 00:11:58 -0400
Subject: [PATCH 08/15] Add test for replace_parameter

---
 tests/test_util.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 tests/test_util.py

diff --git a/tests/test_util.py b/tests/test_util.py
new file mode 100644
index 00000000..4d118cc4
--- /dev/null
+++ b/tests/test_util.py
@@ -0,0 +1,12 @@
+import pytest
+
+from kingfisher_scrapy.util import replace_parameter
+
+
+@pytest.mark.parametrize('url,value,expected', [
+    ('http://example.com/?page=1', 2, 'http://example.com/?page=2'),
+    ('http://example.com/?page=1', None, 'http://example.com/'),
+    ('http://example.com/', None, 'http://example.com/'),
+])
+def test_replace_parameter(url, value, expected):
+    assert replace_parameter(url, 'page', value) == expected
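Note: the credential checks in PATCH 07 run in from_crawler(), before any request
is scheduled, so a missing credential stops the crawl immediately with a clear
message. The settings can be supplied per run on the command line, or
(presumably, as the old OpenOpps message implied) mapped from environment
variables in settings.py. A sketch under that assumption:

    # One-off run, passing credentials as Scrapy settings:
    #   scrapy crawl openopps -s KINGFISHER_OPENOPPS_USERNAME=user -s KINGFISHER_OPENOPPS_PASSWORD=pass

    # settings.py sketch, assuming the project maps environment variables to settings:
    import os

    KINGFISHER_OPENOPPS_USERNAME = os.getenv('KINGFISHER_OPENOPPS_USERNAME')
    KINGFISHER_OPENOPPS_PASSWORD = os.getenv('KINGFISHER_OPENOPPS_PASSWORD')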
From 343ae4633fe5722dfd669ba336762f941ec43e5d Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Tue, 2 Jun 2020 00:17:48 -0400
Subject: [PATCH 09/15] Double the default DOWNLOAD_TIMEOUT setting

---
 kingfisher_scrapy/settings.py                   | 2 ++
 kingfisher_scrapy/spiders/chile_base.py         | 2 +-
 kingfisher_scrapy/spiders/dominican_republic.py | 2 --
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/kingfisher_scrapy/settings.py b/kingfisher_scrapy/settings.py
index 7ee095a8..7db1f5bc 100644
--- a/kingfisher_scrapy/settings.py
+++ b/kingfisher_scrapy/settings.py
@@ -33,6 +33,8 @@
 # The maximum response size (in bytes) that downloader will download (default: 1073741824):
 DOWNLOAD_MAXSIZE = 4000000000
 DOWNLOAD_WARNSIZE = 0
+# Many spiders time out when using default of 180.
+DOWNLOAD_TIMEOUT = 360

 # The download delay setting will honor only one of:
 CONCURRENT_REQUESTS_PER_DOMAIN = 2
diff --git a/kingfisher_scrapy/spiders/chile_base.py b/kingfisher_scrapy/spiders/chile_base.py
index 1ec7fbe6..26552dab 100644
--- a/kingfisher_scrapy/spiders/chile_base.py
+++ b/kingfisher_scrapy/spiders/chile_base.py
@@ -9,7 +9,7 @@ class ChileCompraBaseSpider(SimpleSpider):
     custom_settings = {
         'DOWNLOAD_FAIL_ON_DATALOSS': False,
     }
-    download_timeout = 300
+
     limit = 100
     base_list_url = 'https://apis.mercadopublico.cl/OCDS/data/listaA%C3%B1oMes/{0.year:d}/{0.month:02d}/{1}/{2}'
diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py
index 18098a6c..0cde0c99 100644
--- a/kingfisher_scrapy/spiders/dominican_republic.py
+++ b/kingfisher_scrapy/spiders/dominican_republic.py
@@ -11,8 +11,6 @@ class DominicanRepublic(BaseSpider):
     name = 'dominican_republic'

-    download_timeout = 360  # 6min
-
     def start_requests(self):
         yield scrapy.Request(
             'https://www.dgcp.gob.do/estandar-mundial-ocds/',

From c717c7f2fc4f80da619fd4c83cf65fafd73da31b Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Tue, 2 Jun 2020 00:18:13 -0400
Subject: [PATCH 10/15] Add command to easily run all spiders to check for
 code issues

---
 kingfisher_scrapy/commands/dryrun.py | 54 ++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 kingfisher_scrapy/commands/dryrun.py

diff --git a/kingfisher_scrapy/commands/dryrun.py b/kingfisher_scrapy/commands/dryrun.py
new file mode 100644
index 00000000..44082e36
--- /dev/null
+++ b/kingfisher_scrapy/commands/dryrun.py
@@ -0,0 +1,54 @@
+from scrapy.commands import ScrapyCommand
+from scrapy.crawler import CrawlerProcess
+from scrapy.utils.project import get_project_settings
+
+from kingfisher_scrapy.base_spider import BaseSpider
+
+
+def yield_nothing(*args, **kwargs):
+    yield
+
+
+class DryRun(ScrapyCommand):
+    def short_desc(self):
+        return 'Run a dry run of all spiders'
+
+    def add_options(self, parser):
+        ScrapyCommand.add_options(self, parser)
+        parser.add_option('-l', '--log-level', choices=['debug', 'info', 'warning', 'error', 'critical'],
+                          default='debug', help='The minimum level to log')
+
+    def run(self, args, opts):
+        BaseSpider.parse_json_lines = yield_nothing
+        BaseSpider.parse_json_array = yield_nothing
+
+        settings = get_project_settings()
+        settings.set('LOG_LEVEL', opts.log_level.upper())
+
+        # Stop after one item or error.
+        settings.set('CLOSESPIDER_ERRORCOUNT', 1)
+        settings.set('CLOSESPIDER_ITEMCOUNT', 1)
+
+        # Disable Kingfisher and Telnet extensions.
+        settings.set('EXTENSIONS', {
+            'scrapy.extensions.telnet.TelnetConsole': None,
+        })
+
+        runner = CrawlerProcess(settings=settings)
+
+        exceptions = {
+            # Server unavailable
+            'mexico_cdmx',
+            # Require authentication
+            'openopps',
+            'paraguay_dncp_records',
+            'paraguay_dncp_releases',
+            'paraguay_hacienda',
+        }
+
+        for spider_name in runner.spider_loader.list():
+            if spider_name not in exceptions:
+                spidercls = runner.spider_loader.load(spider_name)
+                runner.crawl(spidercls)
+
+        runner.start()
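Note: Scrapy discovers project commands through the COMMANDS_MODULE setting, and
a command's name comes from its module filename, so dryrun.py surfaces as
`scrapy dryrun`. A sketch, assuming the project registers the commands package
this way:

    # settings.py (assumption: the project sets this, or already does)
    COMMANDS_MODULE = 'kingfisher_scrapy.commands'

    # Then, from the project directory:
    #   scrapy dryrun

Monkeypatching parse_json_lines and parse_json_array with yield_nothing means
each spider still downloads a response but skips the expensive bulk parsing, so
one pass over all spiders mainly exercises request building and callbacks.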
From 3265f688eaabe6aa6cded163d66b5dc742cdad10 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Tue, 2 Jun 2020 00:53:04 -0400
Subject: [PATCH 11/15] Add missing yield in ZipSpider.parse

---
 kingfisher_scrapy/base_spider.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index e2db8936..ddbdeab9 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -308,7 +308,7 @@ def start_requests(self):
     @handle_error
     def parse(self, response):
         if self.zip_file_format:
-            self.build_file_from_response(response, data_type='zip', post_to_api=False)
+            yield self.build_file_from_response(response, data_type='zip', post_to_api=False)

         zip_file = ZipFile(BytesIO(response.body))
         for finfo in zip_file.infolist():
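Note: because parse() contains other yield statements, it is a generator
function; a bare build_file_from_response() call computes the File and then
discards it, since only yielded values reach Scrapy's item pipeline. In
miniature, with invented names:

    def make(x):
        return x

    def parse():
        make('a')        # return value silently discarded
        yield make('b')  # only yielded values reach the consumer

    list(parse())  # ['b']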
From 8267616f8fc479858e800ce5a5dbb0278e57e8df Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Tue, 2 Jun 2020 00:53:46 -0400
Subject: [PATCH 12/15] dryrun: Fix settings

---
 kingfisher_scrapy/commands/dryrun.py | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/kingfisher_scrapy/commands/dryrun.py b/kingfisher_scrapy/commands/dryrun.py
index 44082e36..f4bb07cf 100644
--- a/kingfisher_scrapy/commands/dryrun.py
+++ b/kingfisher_scrapy/commands/dryrun.py
@@ -2,7 +2,7 @@ from scrapy.commands import ScrapyCommand
 from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings

-from kingfisher_scrapy.base_spider import BaseSpider
+from kingfisher_scrapy.base_spider import BaseSpider, ZipSpider


 def yield_nothing(*args, **kwargs):
@@ -13,30 +13,24 @@ class DryRun(ScrapyCommand):
     def short_desc(self):
         return 'Run a dry run of all spiders'

-    def add_options(self, parser):
-        ScrapyCommand.add_options(self, parser)
-        parser.add_option('-l', '--log-level', choices=['debug', 'info', 'warning', 'error', 'critical'],
-                          default='debug', help='The minimum level to log')
-
     def run(self, args, opts):
         BaseSpider.parse_json_lines = yield_nothing
-        BaseSpider.parse_json_array = yield_nothing
-
-        settings = get_project_settings()
-        settings.set('LOG_LEVEL', opts.log_level.upper())
+        ZipSpider.parse = yield_nothing

         # Stop after one item or error.
-        settings.set('CLOSESPIDER_ERRORCOUNT', 1)
-        settings.set('CLOSESPIDER_ITEMCOUNT', 1)
+        self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
+        self.settings.set('CLOSESPIDER_ITEMCOUNT', 1)

-        # Disable Kingfisher and Telnet extensions.
-        settings.set('EXTENSIONS', {
+        # Disable Kingfisher, Telnet, LogStats extensions.
+        self.settings.set('EXTENSIONS', {
             'scrapy.extensions.telnet.TelnetConsole': None,
         })
+        self.settings.set('LOGSTATS_INTERVAL', None)

-        runner = CrawlerProcess(settings=settings)
+        runner = CrawlerProcess(settings=self.settings)

         exceptions = {
+            'test_fail',
             # Server unavailable
             'mexico_cdmx',
             # Require authentication

From 3a79675f790ff9f29a6a6b358b3ce9d3935277e3 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Tue, 2 Jun 2020 01:04:08 -0400
Subject: [PATCH 13/15] flake8

---
 kingfisher_scrapy/commands/dryrun.py            | 1 -
 kingfisher_scrapy/spiders/paraguay_dncp_base.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/kingfisher_scrapy/commands/dryrun.py b/kingfisher_scrapy/commands/dryrun.py
index f4bb07cf..f7c35a71 100644
--- a/kingfisher_scrapy/commands/dryrun.py
+++ b/kingfisher_scrapy/commands/dryrun.py
@@ -1,6 +1,5 @@
 from scrapy.commands import ScrapyCommand
 from scrapy.crawler import CrawlerProcess
-from scrapy.utils.project import get_project_settings

 from kingfisher_scrapy.base_spider import BaseSpider, ZipSpider

diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py
index 5d81ea05..0a1b63c2 100644
--- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py
+++ b/kingfisher_scrapy/spiders/paraguay_dncp_base.py
@@ -1,5 +1,4 @@
 import json
-import logging
 from datetime import datetime

 import scrapy
From ac432f91464b43d130d913b4bbfd39127f03fb67 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Tue, 2 Jun 2020 01:12:45 -0400
Subject: [PATCH 14/15] Fix tests for "Add missing yield in ZipSpider.parse"

3265f688eaabe6aa6cded163d66b5dc742cdad10
---
 tests/test_zip_spider.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/tests/test_zip_spider.py b/tests/test_zip_spider.py
index 2c464f82..7702dd83 100644
--- a/tests/test_zip_spider.py
+++ b/tests/test_zip_spider.py
@@ -51,9 +51,18 @@ def test_parse_json_lines(sample, len_items):
     response = response_fixture(body=io.getvalue())

     generator = spider.parse(response)
+    item = next(generator)
     items = list(generator)

-    # assert len(items) == len_items
+    assert type(item) is File
+    assert len(item) == 6
+    assert item['file_name'] == 'test'
+    assert item['url'] == 'http://example.com'
+    assert item['data_type'] == 'zip'
+    assert item['encoding'] == 'utf-8'
+    assert item['post_to_api'] == False
+
+    assert len(items) == len_items

     for i, item in enumerate(items, 1):
         assert type(item) is FileItem
@@ -83,8 +92,17 @@ def test_parse_release_package(sample, len_items, len_releases):
     response = response_fixture(body=io.getvalue())

     generator = spider.parse(response)
+    item = next(generator)
     items = list(generator)

+    assert type(item) is File
+    assert len(item) == 6
+    assert item['file_name'] == 'test'
+    assert item['url'] == 'http://example.com'
+    assert item['data_type'] == 'zip'
+    assert item['encoding'] == 'utf-8'
+    assert item['post_to_api'] == False
+
     assert len(items) == len_items

     for i, item in enumerate(items, 1):
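Note: with the yield restored in PATCH 11, ZipSpider.parse() produces the
whole-zip File first and the per-entry FileItems after it, which is why the
tests take one item with next() before collecting the rest with list().
Schematically:

    def parse():
        yield 'File'        # the zip itself
        yield 'FileItem 1'  # entries follow
        yield 'FileItem 2'

    generator = parse()
    item = next(generator)   # 'File'
    items = list(generator)  # ['FileItem 1', 'FileItem 2']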
From 25669c0f8a7fbdd23ea442af2fb2211af4ec1dfa Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Tue, 2 Jun 2020 01:14:46 -0400
Subject: [PATCH 15/15] flake8

---
 tests/test_zip_spider.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_zip_spider.py b/tests/test_zip_spider.py
index 7702dd83..cdd936c2 100644
--- a/tests/test_zip_spider.py
+++ b/tests/test_zip_spider.py
@@ -60,7 +60,7 @@ def test_parse_json_lines(sample, len_items):
     assert item['url'] == 'http://example.com'
     assert item['data_type'] == 'zip'
     assert item['encoding'] == 'utf-8'
-    assert item['post_to_api'] == False
+    assert item['post_to_api'] is False

     assert len(items) == len_items
@@ -101,7 +101,7 @@ def test_parse_release_package(sample, len_items, len_releases):
     assert item['url'] == 'http://example.com'
     assert item['data_type'] == 'zip'
     assert item['encoding'] == 'utf-8'
-    assert item['post_to_api'] == False
+    assert item['post_to_api'] is False

     assert len(items) == len_items
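Note: flake8 flags comparisons to True/False with == (rule E712) because
equality also accepts look-alike values, while identity requires the exact
singleton:

    value = 0
    value == False  # True: 0 compares equal to False
    value is False  # False: only the False object itself passes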