Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add crawl_time and keep_collection_open spider parameters #462

Merged
merged 4 commits into from
Jul 24, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
19 changes: 18 additions & 1 deletion kingfisher_scrapy/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,12 @@ class BaseSpider(scrapy.Spider):
.. code:: bash

scrapy crawl spider_name -a note='Started by NAME.'

Set a custom data version for the collection in Kingfisher Process (instead of the crawl's start time):

.. code:: bash

scrapy crawl spider_name -a custom_collection_data_version=2020-01-01
jpmckinney marked this conversation as resolved.
Show resolved Hide resolved
"""

MAX_SAMPLE = 10
Expand All @@ -49,7 +55,8 @@ class BaseSpider(scrapy.Spider):
ocds_version = '1.1'
date_format = 'date'

def __init__(self, sample=None, note=None, from_date=None, until_date=None, latest=None, *args,
def __init__(self, sample=None, note=None, from_date=None, until_date=None, latest=None,
custom_collection_data_version=None, *args,
**kwargs):
super().__init__(*args, **kwargs)

Expand All @@ -60,13 +67,15 @@ def __init__(self, sample=None, note=None, from_date=None, until_date=None, late
self.until_date = until_date
self.date_format = self.VALID_DATE_FORMATS[self.date_format]
self.latest = latest == 'true'
self.custom_collection_data_version = custom_collection_data_version

spider_arguments = {
'sample': sample,
'note': note,
'from_date': from_date,
'until_date': until_date,
'latest': latest,
'custom_collection_data_version': custom_collection_data_version,
}
spider_arguments.update(kwargs)
self.logger.info('Spider arguments: {!r}'.format(spider_arguments))
Expand All @@ -75,6 +84,14 @@ def __init__(self, sample=None, note=None, from_date=None, until_date=None, late
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(BaseSpider, cls).from_crawler(crawler, *args, **kwargs)

if spider.custom_collection_data_version:
try:
spider.custom_collection_data_version = datetime.strptime(spider.custom_collection_data_version,
'%Y-%m-%d')
jpmckinney marked this conversation as resolved.
Show resolved Hide resolved
except ValueError as e:
raise SpiderArgumentError('spider argument custom_collection_data_version: '
'invalid date value: {}'.format(e))

# Checks Spider date ranges arguments
if spider.from_date or spider.until_date:
if not spider.from_date:
Expand Down
8 changes: 6 additions & 2 deletions kingfisher_scrapy/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def spider_closed(self, spider, reason):
Sends an API request to end the collection's store step.
"""
# https://docs.scrapy.org/en/latest/topics/signals.html#spider-closed
if reason != 'finished' or spider.latest:
if reason != 'finished' or spider.latest or spider.custom_collection_data_version:
jpmckinney marked this conversation as resolved.
Show resolved Hide resolved
return

response = self.client.end_collection_store({
Expand All @@ -154,9 +154,13 @@ def item_scraped(self, item, spider):

if not item.get('post_to_api', True) or isinstance(item, LatestReleaseDateItem):
return
date_format = '%Y-%m-%d %H:%M:%S'
collection_data_version = spider.get_start_time(date_format)
if spider.custom_collection_data_version:
collection_data_version = spider.custom_collection_data_version.strftime(date_format)
jpmckinney marked this conversation as resolved.
Show resolved Hide resolved
data = {
'collection_source': spider.name,
'collection_data_version': spider.get_start_time('%Y-%m-%d %H:%M:%S'),
'collection_data_version': collection_data_version,
'collection_sample': spider.sample,
'file_name': item['file_name'],
'url': item['url'],
Expand Down
10 changes: 7 additions & 3 deletions tests/extensions/test_kingfisher_process_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,11 @@ def test_from_crawler_missing_arguments(api_url, api_key):
@pytest.mark.parametrize('directory', [False, True])
@pytest.mark.parametrize('ok', [True, False])
@pytest.mark.parametrize('post_to_api', [True, False])
def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, directory, ok, post_to_api, tmpdir,
caplog):
spider = spider_with_files_store(tmpdir, sample=sample, note=note)
@pytest.mark.parametrize('custom_collection_data_version', [None, '2020-01-01'])
def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, directory, ok, post_to_api,
custom_collection_data_version, tmpdir, caplog):
spider = spider_with_files_store(tmpdir, sample=sample, note=note,
custom_collection_data_version=custom_collection_data_version)

if directory:
spider.crawler.settings['KINGFISHER_API_LOCAL_DIRECTORY'] = str(tmpdir.join('xxx'))
Expand Down Expand Up @@ -96,6 +98,8 @@ def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, d
expected['collection_note'] = note
if directory:
expected['local_file_name'] = tmpdir.join('xxx', path)
if custom_collection_data_version:
expected['collection_data_version'] = '2020-01-01 00:00:00'
if not post_to_api:
assert mocked.call_count == 0
else:
Expand Down
10 changes: 10 additions & 0 deletions tests/test_base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,13 @@ def test_date_arguments():
with pytest.raises(SpiderArgumentError) as e:
assert spider_with_crawler(until_date='test', default_from_date=test_date)
assert str(e.value) == 'spider argument until_date: invalid date value: {}'.format(error_message)


def test_custom_collection_data_version():
error_message = "time data '2020' does not match format '%Y-%m-%d'"

assert spider_with_crawler(custom_collection_data_version='2020-01-01')
with pytest.raises(SpiderArgumentError) as e:
assert spider_with_crawler(custom_collection_data_version='2020')
assert str(e.value) == 'spider argument custom_collection_data_version: invalid date value: {}'.format(
error_message)