Skip to content

Commit

Permalink
Merge 77c2b15 into 3bf61bf
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed Oct 21, 2020
2 parents 3bf61bf + 77c2b15 commit c8a6f19
Show file tree
Hide file tree
Showing 8 changed files with 85 additions and 57 deletions.
17 changes: 8 additions & 9 deletions docs/spiders.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ All the spiders have these common arguments:
.. Do not edit this file. Instead, run: `scrapy updatedocs`
Afghanistan
-----------

Expand Down Expand Up @@ -72,13 +71,13 @@ Colombia
.. autoclass:: kingfisher_scrapy.spiders.colombia_bulk.ColombiaBulk
:no-members:

Costarica
---------
Costa Rica
----------

.. autoclass:: kingfisher_scrapy.spiders.costarica_poder_judicial_records.CostaRicaPoderJudicialRecords
.. autoclass:: kingfisher_scrapy.spiders.costa_rica_poder_judicial_records.CostaRicaPoderJudicialRecords
:no-members:

.. autoclass:: kingfisher_scrapy.spiders.costarica_poder_judicial_releases.CostaRicaPoderJudicialReleases
.. autoclass:: kingfisher_scrapy.spiders.costa_rica_poder_judicial_releases.CostaRicaPoderJudicialReleases
:no-members:

Digiwhist
Expand Down Expand Up @@ -186,13 +185,13 @@ Digiwhist
.. autoclass:: kingfisher_scrapy.spiders.digiwhist_united_kingdom.DigiwhistUnitedKingdomRepublic
:no-members:

Dominicanrepublic
-----------------
Dominican Republic
------------------

.. autoclass:: kingfisher_scrapy.spiders.dominicanrepublic.DominicanRepublic
.. autoclass:: kingfisher_scrapy.spiders.dominican_republic.DominicanRepublic
:no-members:

.. autoclass:: kingfisher_scrapy.spiders.dominicanrepublic_api.DominicanRepublicPortal
.. autoclass:: kingfisher_scrapy.spiders.dominican_republic_api.DominicanRepublicPortal
:no-members:

Ecuador
Expand Down
64 changes: 64 additions & 0 deletions kingfisher_scrapy/commands/crawlall.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from scrapy.commands import ScrapyCommand
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import UsageError

from kingfisher_scrapy.base_spider import BaseSpider, CompressedFileSpider

# Spider names that `crawlall` must skip when scheduling every spider.
EXCEPTIONS = {
    'fail',  # NOTE(review): presumably a deliberate-failure test spider — confirm
    # Require authentication
    'openopps',
    'paraguay_dncp_records',
    'paraguay_dncp_releases',
    'paraguay_hacienda',
}


def yield_nothing(*_args, **_kwargs):
    """Accept any arguments and yield a single ``None``.

    Used as a stand-in for spider parse methods, so that downloaded file
    contents are never actually processed.
    """
    yield None


class CrawlAll(ScrapyCommand):
    """Scrapy command that runs every spider (except those in ``EXCEPTIONS``) in one process."""

    def short_desc(self):
        """Return the one-line description shown by ``scrapy -h``."""
        return 'Run all spiders'

    def add_options(self, parser):
        """Add ``--dry-run`` and ``--sample`` on top of the standard Scrapy options."""
        ScrapyCommand.add_options(self, parser)
        parser.add_option('--dry-run', action='store_true', help='Runs the spiders without writing any files')
        parser.add_option('--sample', type=int, help='The number of files to write')

    def run(self, args, opts):
        """Validate the options, then schedule and run all non-excepted spiders.

        :raises UsageError: if both ``--dry-run`` and ``--sample`` are given,
            or if ``--sample`` is not a positive integer
        """
        if opts.sample and opts.dry_run:
            raise UsageError('You cannot specify both --dry-run and --sample.')
        if opts.sample is not None and opts.sample <= 0:
            raise UsageError('--sample must be a positive integer.')

        kwargs = {}
        # Always disable the Telnet console; the files-store extension is
        # re-enabled below unless this is a dry run.
        extensions = {'scrapy.extensions.telnet.TelnetConsole': None}

        if opts.sample:
            kwargs['sample'] = opts.sample

        if opts.dry_run:
            # A dry run still requests one file per spider, but the
            # files-store extension stays disabled, so nothing is written.
            kwargs['sample'] = 1
        else:
            extensions['kingfisher_scrapy.extensions.KingfisherFilesStore'] = 100

        # Replace the heavyweight parse methods with a no-op generator that
        # yields only None, so downloaded file contents are never processed.
        BaseSpider.parse_json_lines = yield_nothing
        CompressedFileSpider.parse = yield_nothing

        # Stop after one item or error.
        self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
        # Disable LogStats extension.
        self.settings.set('LOGSTATS_INTERVAL', None)
        # Disable custom and Telnet extensions.
        self.settings.set('EXTENSIONS', extensions)

        # Settings must be finalized before the process is created.
        runner = CrawlerProcess(settings=self.settings)

        for spider_name in runner.spider_loader.list():
            if spider_name not in EXCEPTIONS:
                spidercls = runner.spider_loader.load(spider_name)
                runner.crawl(spidercls, **kwargs)

        # Blocks until all scheduled crawls finish.
        runner.start()
42 changes: 0 additions & 42 deletions kingfisher_scrapy/commands/dryrun.py

This file was deleted.

9 changes: 6 additions & 3 deletions kingfisher_scrapy/commands/updatedocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,10 @@ def run(self, args, opts):
basedir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

def _keyfunc(module):
return module.__name__.rsplit('.', 1)[-1].split('_', 1)[0]
module_name = module.__name__.rsplit('.', 1)[-1]
if module_name.startswith(('costa_rica', 'dominican_republic')):
return '_'.join(module_name.split('_', 2)[:2])
return module_name.split('_', 1)[0]

with open(os.path.join(basedir, 'docs', 'spiders.rst'), 'w') as f:
f.write(dedent("""\
Expand All @@ -31,10 +34,10 @@ def _keyfunc(module):
"""))

for key, group in groupby(walk_modules('kingfisher_scrapy.spiders'), _keyfunc):
if key in ('spiders', 'test', 'fail'):
if key in ('spiders', 'fail'):
continue

f.write('\n{}\n{}\n'.format(key.capitalize(), '-' * len(key)))
f.write('\n{}\n{}\n'.format(key.replace('_', ' ').title(), '-' * len(key)))

for module in group:
for cls in iter_spider_classes(module):
Expand Down
2 changes: 1 addition & 1 deletion kingfisher_scrapy/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def process_item(self, item, spider):
if not isinstance(item, (File, FileItem)):
raise DropItem()
if self.item_count >= spider.sample:
spider.crawler.engine.close_spider(spider, 'closespider_sample')
spider.crawler.engine.close_spider(spider, 'sample')
raise DropItem
self.item_count += 1
return item
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import handle_http_error, components
from kingfisher_scrapy.util import components, handle_http_error


class CostaRicaPoderJudicialRecords(SimpleSpider):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import scrapy

from kingfisher_scrapy.base_spider import CompressedFileSpider
from kingfisher_scrapy.util import handle_http_error, components
from kingfisher_scrapy.util import components, handle_http_error


class CostaRicaPoderJudicialReleases(CompressedFileSpider):
Expand Down
4 changes: 4 additions & 0 deletions kingfisher_scrapy/spiders/dominican_republic.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@


class DominicanRepublic(CompressedFileSpider):
"""
Bulk download documentation
https://www.dgcp.gob.do/estandar-mundial-ocds/
"""
name = 'dominican_republic'
data_type = 'release_package'
compressed_file_format = 'release_package'
Expand Down

0 comments on commit c8a6f19

Please sign in to comment.