Merge branch 'master' into 439-new-portugal
aguilerapy committed Aug 20, 2020
2 parents d095693 + 61a9563 commit 73e0110
Showing 45 changed files with 829 additions and 361 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -9,4 +9,5 @@ venv/
/data
/docs/_build
/htmlcov
/latestreleasedate
/pluck-*.csv
/pluck_skipped.json
23 changes: 19 additions & 4 deletions docs/spiders.rst
@@ -177,6 +177,12 @@ Dominican
.. autoclass:: kingfisher_scrapy.spiders.dominican_republic.DominicanRepublic
:no-members:

Ecuador
-------

.. autoclass:: kingfisher_scrapy.spiders.ecuador_emergency.EcuadorEmergency
:no-members:

France
------

@@ -237,18 +243,24 @@ Mexico
.. autoclass:: kingfisher_scrapy.spiders.mexico_administracion_publica_federal.MexicoAdministracionPublicaFederal
:no-members:

.. autoclass:: kingfisher_scrapy.spiders.mexico_cdmx.MexicoCDMXSource
:no-members:

.. autoclass:: kingfisher_scrapy.spiders.mexico_grupo_aeroporto.MexicoGrupoAeroporto
:no-members:

.. autoclass:: kingfisher_scrapy.spiders.mexico_inai.MexicoINAI
:no-members:

.. autoclass:: kingfisher_scrapy.spiders.mexico_inai_portal.MexicoINAIPortal
:no-members:

.. autoclass:: kingfisher_scrapy.spiders.mexico_jalisco.MexicoJalisco
:no-members:

.. autoclass:: kingfisher_scrapy.spiders.mexico_nuevo_leon_records.MexicoNuevoLeonRecords
:no-members:

.. autoclass:: kingfisher_scrapy.spiders.mexico_nuevo_leon_releases.MexicoNuevoLeonReleases
:no-members:

.. autoclass:: kingfisher_scrapy.spiders.mexico_quien_es_quien.MexicoQuienEsQuien
:no-members:

@@ -306,7 +318,10 @@ Portugal
Scotland
--------

.. autoclass:: kingfisher_scrapy.spiders.scotland.Scotland
.. autoclass:: kingfisher_scrapy.spiders.scotland_proactis.ScotlandProactis
:no-members:

.. autoclass:: kingfisher_scrapy.spiders.scotland_public_contracts.ScotlandPublicContracts
:no-members:

Uganda
62 changes: 52 additions & 10 deletions kingfisher_scrapy/base_spider.py
@@ -40,30 +40,59 @@ class BaseSpider(scrapy.Spider):
.. code:: bash
scrapy crawl spider_name -a note='Started by NAME.'
Each crawl writes data to its own directory. By default, this directory is named according to the time the crawl
started. To override the time (for example, to force a new crawl to write to the same directory as an earlier
crawl), you can set the crawl_time spider argument:
.. code:: bash
scrapy crawl spider_name -a crawl_time=2020-01-01T10:00:00
Don't close the Kingfisher Process collection when the crawl finishes:
.. code:: bash
scrapy crawl spider_name -a keep_collection_open=true
"""

MAX_SAMPLE = 10
MAX_RELEASES_PER_PACKAGE = 100
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S', 'year-month': '%Y-%m'}

def __init__(self, sample=None, note=None, from_date=None, until_date=None,
date_format='date', latest=None, *args, **kwargs):
ocds_version = '1.1'
date_format = 'date'

def __init__(self, sample=None, note=None, from_date=None, until_date=None, crawl_time=None,
keep_collection_open=None, package_pointer=None, release_pointer=None, truncate=None, *args,
**kwargs):
super().__init__(*args, **kwargs)

# https://docs.scrapy.org/en/latest/topics/spiders.html#spider-arguments
self.sample = sample == 'true'
self.note = note
self.from_date = from_date
self.until_date = until_date
self.date_format = self.VALID_DATE_FORMATS[date_format]
self.latest = latest == 'true'
self.crawl_time = crawl_time
self.keep_collection_open = keep_collection_open == 'true'
# Pluck-related arguments.
self.package_pointer = package_pointer
self.release_pointer = release_pointer
self.truncate = int(truncate) if truncate else None

self.date_format = self.VALID_DATE_FORMATS[self.date_format]
self.pluck = bool(package_pointer or release_pointer)

spider_arguments = {
'sample': sample,
'note': note,
'from_date': from_date,
'until_date': until_date,
'latest': latest,
'crawl_time': crawl_time,
'keep_collection_open': keep_collection_open,
'package_pointer': package_pointer,
'release_pointer': release_pointer,
'truncate': truncate,
}
spider_arguments.update(kwargs)
self.logger.info('Spider arguments: {!r}'.format(spider_arguments))
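
``__init__`` normalizes the string-typed spider arguments above: boolean flags are compared against the literal string ``'true'``, and ``truncate`` is cast to ``int``. A minimal sketch of the resulting attributes (``ExampleSpider`` is a hypothetical subclass, for illustration only):

class ExampleSpider(BaseSpider):
    name = 'example'

spider = ExampleSpider(sample='true', package_pointer='/publishedDate', truncate='10')
assert spider.sample is True                  # only the string 'true' enables the flag
assert spider.keep_collection_open is False   # unset flags default to False
assert spider.truncate == 10                  # cast from the string argument
assert spider.pluck is True                   # set when either pointer is given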
@@ -72,6 +101,15 @@ def __init__(self, sample=None, note=None, from_date=None, until_date=None,
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(BaseSpider, cls).from_crawler(crawler, *args, **kwargs)

if spider.package_pointer and spider.release_pointer:
raise SpiderArgumentError('You cannot specify both package_pointer and release_pointer spider arguments.')

if spider.crawl_time:
try:
spider.crawl_time = datetime.strptime(spider.crawl_time, '%Y-%m-%dT%H:%M:%S')
except ValueError as e:
raise SpiderArgumentError('spider argument crawl_time: invalid date value: {}'.format(e))

# Check the spider's date-range arguments
if spider.from_date or spider.until_date:
if not spider.from_date:
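
``from_crawler`` validates ``crawl_time`` with ``datetime.strptime`` before the crawl starts, so a malformed value fails fast as a ``SpiderArgumentError``. A sketch of the check in isolation:

from datetime import datetime

datetime.strptime('2020-01-01T10:00:00', '%Y-%m-%dT%H:%M:%S')  # parses
datetime.strptime('2020-01-01', '%Y-%m-%dT%H:%M:%S')           # raises ValueError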
@@ -103,7 +141,11 @@ def get_start_time(self, format):
"""
Returns the formatted start time of the crawl.
"""
return self.crawler.stats.get_value('start_time').strftime(format)
if self.crawl_time:
date = self.crawl_time
else:
date = self.crawler.stats.get_value('start_time')
return date.strftime(format)
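
With this change, a fixed ``crawl_time`` takes precedence over the crawl's recorded start time, so a re-run can write to the same directory as an earlier crawl. A sketch of the behaviour (hypothetical spider instance):

from datetime import datetime

spider.crawl_time = datetime(2020, 1, 1, 10, 0)
spider.get_start_time('%Y%m%d_%H%M%S')  # '20200101_100000', ignoring the stats start_time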

def build_request(self, url, formatter, **kwargs):
"""
@@ -307,7 +349,7 @@ def start_requests(self):
"""

encoding = 'utf-8'
skip_latest_release_date = "This command doesn't yet support identifying the latest release in an archive file."
skip_pluck = 'Archive files are not supported'
compressed_file_format = None
archive_format = 'zip'
file_name_must_contain = ''
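
``skip_pluck`` replaces the old ``skip_latest_release_date`` message: the new ``pluck`` command (added below) checks for this class attribute and records the spider in ``pluck_skipped.json`` instead of crawling it, as in this excerpt from the command:

if hasattr(spidercls, 'skip_pluck'):
    skipped[spidercls.skip_pluck].append(spider_name)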
@@ -359,8 +401,8 @@ class LinksSpider(SimpleSpider):
1. Set a ``data_type`` class attribute to the data type of the API responses
1. Set a ``next_page_formatter`` class attribute to set the file name as in
:meth:`~kingfisher_scrapy.base_spider.BaseSpider.build_request`
1. Optionally, set a ``next_pointer`` class attribute to the JSON Pointer for the next link (default "/links/next")
1. Write a ``start_requests`` method to request the first page of API results
1. Optionally, set a ``next_pointer`` class attribute to the JSON Pointer for the next link (default "/links/next")
.. code-block:: python
@@ -385,14 +427,14 @@ def parse(self, response):
if not self.sample:
yield self.next_link(response)

def next_link(self, response):
def next_link(self, response, **kwargs):
"""
If the JSON response has a ``links.next`` key, returns a ``scrapy.Request`` for the URL.
"""
data = json.loads(response.text)
url = resolve_pointer(data, self.next_pointer, None)
if url:
return self.build_request(url, formatter=self.next_page_formatter)
return self.build_request(url, formatter=self.next_page_formatter, **kwargs)

if response.meta['depth'] == 0:
raise MissingNextLinkError('next link not found on the first page: {}'.format(response.url))
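
Accepting ``**kwargs`` in ``next_link`` lets subclasses pass extra ``build_request`` options when following pagination. A minimal sketch of a ``LinksSpider`` subclass per the docstring's checklist (the spider name and URL are hypothetical, and ``parameters`` is assumed to be the file-name formatter helper in ``kingfisher_scrapy.util``):

import scrapy

from kingfisher_scrapy.base_spider import LinksSpider
from kingfisher_scrapy.util import parameters  # assumed helper


class ExampleAPI(LinksSpider):
    name = 'example_api'
    data_type = 'release_package'
    next_page_formatter = staticmethod(parameters('page'))

    def start_requests(self):
        # Request the first page; parse() then follows /links/next via next_link().
        yield scrapy.Request('https://example.com/api/packages?page=1', meta={'file_name': 'page1.json'})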
2 changes: 1 addition & 1 deletion kingfisher_scrapy/commands/dryrun.py
@@ -17,8 +17,8 @@ def run(self, args, opts):
CompressedFileSpider.parse = yield_nothing

# Stop after one item or error.
self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
self.settings.set('CLOSESPIDER_ITEMCOUNT', 1)
self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
# Disable LogStats extension.
self.settings.set('LOGSTATS_INTERVAL', None)
# Disable custom and Telnet extensions.
45 changes: 0 additions & 45 deletions kingfisher_scrapy/commands/latestreleasedate.py

This file was deleted.

61 changes: 61 additions & 0 deletions kingfisher_scrapy/commands/pluck.py
@@ -0,0 +1,61 @@
import json
import logging
import os
from collections import defaultdict
from datetime import datetime

from scrapy.commands import ScrapyCommand
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import UsageError

from kingfisher_scrapy.util import _pluck_filename

logger = logging.getLogger(__name__)


class Pluck(ScrapyCommand):
def short_desc(self):
return 'Pluck one data value per publisher'

def add_options(self, parser):
ScrapyCommand.add_options(self, parser)
parser.add_option('-p', '--package-pointer', help='The JSON Pointer to the value in the package')
parser.add_option('-r', '--release-pointer', help='The JSON Pointer to the value in the release')
parser.add_option('-t', '--truncate', type=int, help='Truncate the value to this number of characters')

def run(self, args, opts):
if not (bool(opts.package_pointer) ^ bool(opts.release_pointer)):
raise UsageError('Exactly one of --package-pointer or --release-pointer must be set.')

# Stop after one item or error.
self.settings.set('CLOSESPIDER_ITEMCOUNT', 1)
self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
# Disable LogStats extension.
self.settings.set('LOGSTATS_INTERVAL', None)
# Limit concurrent requests, to download the minimum.
self.settings.set('CONCURRENT_REQUESTS', 1)

filename = _pluck_filename(opts)
if os.path.isfile(filename):
os.unlink(filename)

runner = CrawlerProcess(settings=self.settings)

year = datetime.today().year
skipped = defaultdict(list)
running = []
for spider_name in runner.spider_loader.list():
if spider_name != 'test_fail':
spidercls = runner.spider_loader.load(spider_name)
if hasattr(spidercls, 'skip_pluck'):
skipped[spidercls.skip_pluck].append(spider_name)
else:
running.append(spider_name)
runner.crawl(spidercls, year=year, package_pointer=opts.package_pointer,
release_pointer=opts.release_pointer, truncate=opts.truncate)

with open('pluck_skipped.json', 'w') as f:
json.dump(skipped, f, indent=2)

logger.info(f"Running {len(running)} spiders: {', '.join(sorted(running))}")
runner.start()
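
The XOR check in ``run`` ensures exactly one of the two pointer options is set, e.g. ``scrapy pluck --package-pointer /publishedDate`` or ``scrapy pluck --release-pointer /date --truncate 10``. Spiders that declare ``skip_pluck`` are collected into ``pluck_skipped.json``, which maps each skip reason to the spiders it covers and would look something like this (spider names illustrative):

{
  "Archive files are not supported": ["example_zip_spider", "another_zip_spider"]
}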
