From b619514be498e47abddb80dbc8fc3f99f28aec23 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Tue, 20 Oct 2020 16:40:30 -0400 Subject: [PATCH 1/7] Add a --sample option to dryrun command --- kingfisher_scrapy/commands/dryrun.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kingfisher_scrapy/commands/dryrun.py b/kingfisher_scrapy/commands/dryrun.py index 8e9a82bb5..a457b6a7a 100644 --- a/kingfisher_scrapy/commands/dryrun.py +++ b/kingfisher_scrapy/commands/dryrun.py @@ -12,6 +12,10 @@ class DryRun(ScrapyCommand): def short_desc(self): return 'Run a dry run of all spiders' + def add_options(self, parser): + ScrapyCommand.add_options(self, parser) + parser.add_option('--sample', type=int, help='The number of sample files to store (default 0)') + def run(self, args, opts): BaseSpider.parse_json_lines = yield_nothing CompressedFileSpider.parse = yield_nothing @@ -20,8 +24,13 @@ def run(self, args, opts): self.settings.set('CLOSESPIDER_ERRORCOUNT', 1) # Disable LogStats extension. self.settings.set('LOGSTATS_INTERVAL', None) + # Disable custom and Telnet extensions. 
- self.settings.set('EXTENSIONS', {'scrapy.extensions.telnet.TelnetConsole': None}) + extensions = {'scrapy.extensions.telnet.TelnetConsole': None} + if opts.sample: + extensions['kingfisher_scrapy.extensions.KingfisherFilesStore'] = 100 + + self.settings.set('EXTENSIONS', extensions) runner = CrawlerProcess(settings=self.settings) @@ -37,6 +46,6 @@ def run(self, args, opts): for spider_name in runner.spider_loader.list(): if spider_name not in exceptions: spidercls = runner.spider_loader.load(spider_name) - runner.crawl(spidercls, sample=1) + runner.crawl(spidercls, sample=opts.sample or 1) runner.start() From ab4610503d9f88c54b080d71f7d772b98496e1fb Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Tue, 20 Oct 2020 16:52:59 -0400 Subject: [PATCH 2/7] Rename dryrun to crawlall and make appropriate changes --- .../commands/{dryrun.py => crawlall.py} | 51 ++++++++++++------- 1 file changed, 32 insertions(+), 19 deletions(-) rename kingfisher_scrapy/commands/{dryrun.py => crawlall.py} (57%) diff --git a/kingfisher_scrapy/commands/dryrun.py b/kingfisher_scrapy/commands/crawlall.py similarity index 57% rename from kingfisher_scrapy/commands/dryrun.py rename to kingfisher_scrapy/commands/crawlall.py index a457b6a7a..4e550e55f 100644 --- a/kingfisher_scrapy/commands/dryrun.py +++ b/kingfisher_scrapy/commands/crawlall.py @@ -1,22 +1,49 @@ from scrapy.commands import ScrapyCommand from scrapy.crawler import CrawlerProcess +from scrapy.exceptions import UsageError from kingfisher_scrapy.base_spider import BaseSpider, CompressedFileSpider +EXCEPTIONS = { + 'fail', + # Require authentication + 'openopps', + 'paraguay_dncp_records', + 'paraguay_dncp_releases', + 'paraguay_hacienda', +} + def yield_nothing(*args, **kwargs): yield -class DryRun(ScrapyCommand): +class CrawlAll(ScrapyCommand): def short_desc(self): - return 'Run a dry run of all spiders' + return 'Run all spiders' def add_options(self, parser): 
ScrapyCommand.add_options(self, parser) - parser.add_option('--sample', type=int, help='The number of sample files to store (default 0)') + parser.add_option('--dry-run', action='store_true', help='Runs the spiders without writing any files') + parser.add_option('--sample', type=int, help='The number of files to write') def run(self, args, opts): + if opts.sample and opts.dry_run: + raise UsageError('You cannot specify both --dry-run and --sample.') + if opts.sample is not None and opts.sample <= 0: + raise UsageError('--sample must be a positive integer.') + + kwargs = {} + extensions = {'scrapy.extensions.telnet.TelnetConsole': None} + + if opts.sample: + kwargs['sample'] = opts.sample + + if opts.dry_run: + kwargs['sample'] = 1 + else: + extensions['kingfisher_scrapy.extensions.KingfisherFilesStore'] = 100 + BaseSpider.parse_json_lines = yield_nothing CompressedFileSpider.parse = yield_nothing @@ -24,28 +51,14 @@ def run(self, args, opts): self.settings.set('CLOSESPIDER_ERRORCOUNT', 1) # Disable LogStats extension. self.settings.set('LOGSTATS_INTERVAL', None) - # Disable custom and Telnet extensions. 
- extensions = {'scrapy.extensions.telnet.TelnetConsole': None} - if opts.sample: - extensions['kingfisher_scrapy.extensions.KingfisherFilesStore'] = 100 - self.settings.set('EXTENSIONS', extensions) runner = CrawlerProcess(settings=self.settings) - exceptions = { - 'fail', - # Require authentication - 'openopps', - 'paraguay_dncp_records', - 'paraguay_dncp_releases', - 'paraguay_hacienda', - } - for spider_name in runner.spider_loader.list(): - if spider_name not in exceptions: + if spider_name not in EXCEPTIONS: spidercls = runner.spider_loader.load(spider_name) - runner.crawl(spidercls, sample=opts.sample or 1) + runner.crawl(spidercls, **kwargs) runner.start() From 12f40a6499a6a1db8a560df38d02196d35f3e6f5 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Tue, 20 Oct 2020 16:55:46 -0400 Subject: [PATCH 3/7] isort --- kingfisher_scrapy/spiders/costa_rica_poder_judicial_records.py | 2 +- kingfisher_scrapy/spiders/costa_rica_poder_judicial_releases.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kingfisher_scrapy/spiders/costa_rica_poder_judicial_records.py b/kingfisher_scrapy/spiders/costa_rica_poder_judicial_records.py index 56790db59..913f8131a 100644 --- a/kingfisher_scrapy/spiders/costa_rica_poder_judicial_records.py +++ b/kingfisher_scrapy/spiders/costa_rica_poder_judicial_records.py @@ -3,7 +3,7 @@ import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_http_error, components +from kingfisher_scrapy.util import components, handle_http_error class CostaRicaPoderJudicialRecords(SimpleSpider): diff --git a/kingfisher_scrapy/spiders/costa_rica_poder_judicial_releases.py b/kingfisher_scrapy/spiders/costa_rica_poder_judicial_releases.py index 5f547fa2e..65124ad28 100644 --- a/kingfisher_scrapy/spiders/costa_rica_poder_judicial_releases.py +++ b/kingfisher_scrapy/spiders/costa_rica_poder_judicial_releases.py @@ -3,7 +3,7 @@ import scrapy 
from kingfisher_scrapy.base_spider import CompressedFileSpider -from kingfisher_scrapy.util import handle_http_error, components +from kingfisher_scrapy.util import components, handle_http_error class CostaRicaPoderJudicialReleases(CompressedFileSpider): From fd8fc19cc0df9b2f75495cf73d6f105a5fcc9854 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Wed, 21 Oct 2020 16:22:50 -0400 Subject: [PATCH 4/7] test_fail.py was renamed to fail.py --- kingfisher_scrapy/commands/updatedocs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kingfisher_scrapy/commands/updatedocs.py b/kingfisher_scrapy/commands/updatedocs.py index e51c92d8f..0fdea02b4 100644 --- a/kingfisher_scrapy/commands/updatedocs.py +++ b/kingfisher_scrapy/commands/updatedocs.py @@ -31,7 +31,7 @@ def _keyfunc(module): """)) for key, group in groupby(walk_modules('kingfisher_scrapy.spiders'), _keyfunc): - if key in ('spiders', 'test', 'fail'): + if key in ('spiders', 'fail'): continue f.write('\n{}\n{}\n'.format(key.capitalize(), '-' * len(key))) From 491bc1ec4b08bf57f0f5b9996b9c63334e762aaa Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Wed, 21 Oct 2020 16:23:14 -0400 Subject: [PATCH 5/7] The closespider_ prefix is used by the CloseSpider extension, so abbreviate reason to just "sample" --- kingfisher_scrapy/pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py index f026892ca..07666f3c1 100644 --- a/kingfisher_scrapy/pipelines.py +++ b/kingfisher_scrapy/pipelines.py @@ -58,7 +58,7 @@ def process_item(self, item, spider): if not isinstance(item, (File, FileItem)): raise DropItem() if self.item_count >= spider.sample: - spider.crawler.engine.close_spider(spider, 'closespider_sample') + spider.crawler.engine.close_spider(spider, 'sample') raise DropItem self.item_count += 1 return item From 
ecdec626f2d4fa2dfbe2dbffca80ae0abdac0d51 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Wed, 21 Oct 2020 16:24:08 -0400 Subject: [PATCH 6/7] dominican_republic: Restore comment about bulk download --- kingfisher_scrapy/spiders/dominican_republic.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py index 25b06889c..1673cffed 100644 --- a/kingfisher_scrapy/spiders/dominican_republic.py +++ b/kingfisher_scrapy/spiders/dominican_republic.py @@ -5,6 +5,10 @@ class DominicanRepublic(CompressedFileSpider): + """ + Bulk download documentation + https://www.dgcp.gob.do/estandar-mundial-ocds/ + """ name = 'dominican_republic' data_type = 'release_package' compressed_file_format = 'release_package' From 77c2b15b5815fdb6e67226c30d8d3ed060a7a3ff Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Wed, 21 Oct 2020 16:30:15 -0400 Subject: [PATCH 7/7] updatedocs: Fix two-word country names, closes #527 --- docs/spiders.rst | 17 ++++++++--------- kingfisher_scrapy/commands/updatedocs.py | 7 +++++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/spiders.rst b/docs/spiders.rst index acc327b47..f6c1c4af4 100644 --- a/docs/spiders.rst +++ b/docs/spiders.rst @@ -8,7 +8,6 @@ All the spiders have these common arguments: .. Do not edit this file. Instead, run: `scrapy updatedocs` - Afghanistan ----------- @@ -72,13 +71,13 @@ Colombia .. autoclass:: kingfisher_scrapy.spiders.colombia_bulk.ColombiaBulk :no-members: -Costarica ---------- +Costa Rica +---------- -.. autoclass:: kingfisher_scrapy.spiders.costarica_poder_judicial_records.CostaRicaPoderJudicialRecords +.. autoclass:: kingfisher_scrapy.spiders.costa_rica_poder_judicial_records.CostaRicaPoderJudicialRecords :no-members: -.. 
autoclass:: kingfisher_scrapy.spiders.costarica_poder_judicial_releases.CostaRicaPoderJudicialReleases +.. autoclass:: kingfisher_scrapy.spiders.costa_rica_poder_judicial_releases.CostaRicaPoderJudicialReleases :no-members: Digiwhist @@ -186,13 +185,13 @@ Digiwhist .. autoclass:: kingfisher_scrapy.spiders.digiwhist_united_kingdom.DigiwhistUnitedKingdomRepublic :no-members: -Dominicanrepublic ------------------ +Dominican Republic +------------------ -.. autoclass:: kingfisher_scrapy.spiders.dominicanrepublic.DominicanRepublic +.. autoclass:: kingfisher_scrapy.spiders.dominican_republic.DominicanRepublic :no-members: -.. autoclass:: kingfisher_scrapy.spiders.dominicanrepublic_api.DominicanRepublicPortal +.. autoclass:: kingfisher_scrapy.spiders.dominican_republic_api.DominicanRepublicPortal :no-members: Ecuador diff --git a/kingfisher_scrapy/commands/updatedocs.py b/kingfisher_scrapy/commands/updatedocs.py index 0fdea02b4..195a25c07 100644 --- a/kingfisher_scrapy/commands/updatedocs.py +++ b/kingfisher_scrapy/commands/updatedocs.py @@ -15,7 +15,10 @@ def run(self, args, opts): basedir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) def _keyfunc(module): - return module.__name__.rsplit('.', 1)[-1].split('_', 1)[0] + module_name = module.__name__.rsplit('.', 1)[-1] + if module_name.startswith(('costa_rica', 'dominican_republic')): + return '_'.join(module_name.split('_', 2)[:2]) + return module_name.split('_', 1)[0] with open(os.path.join(basedir, 'docs', 'spiders.rst'), 'w') as f: f.write(dedent("""\ @@ -34,7 +37,7 @@ def _keyfunc(module): if key in ('spiders', 'fail'): continue - f.write('\n{}\n{}\n'.format(key.capitalize(), '-' * len(key))) + f.write('\n{}\n{}\n'.format(key.replace('_', ' ').title(), '-' * len(key))) for module in group: for cls in iter_spider_classes(module):