diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst
index 8aa0d255..6bf087ec 100644
--- a/docs/contributing/index.rst
+++ b/docs/contributing/index.rst
@@ -20,6 +20,7 @@ Access methods for OCDS data are very similar. Spiders therefore share a lot of
 - :class:`~kingfisher_scrapy.base_spider.PeriodicSpider`: Use if the bulk downloads or API methods accept a year or a year and month as a query string parameter or URL path component.
 - :class:`~kingfisher_scrapy.base_spider.LinksSpider`: Use if the API implements `pagination `__.
 - :class:`~kingfisher_scrapy.base_spider.CompressedFileSpider`: Use if the bulk downloads are ZIP or RAR files.
+- :class:`~kingfisher_scrapy.base_spider.BigFileSpider`: Use if the downloads include big JSON release packages that cannot be processed by Kingfisher Process in one piece.
 - :class:`~kingfisher_scrapy.base_spider.SimpleSpider`: Use in almost all other cases. ``IndexSpider``, ``PeriodicSpider`` and ``LinksSpider`` are child classes of this class.
 - :class:`~kingfisher_scrapy.base_spider.BaseSpider`: All spiders inherit, directly or indirectly, from this class, which in turn inherits from `scrapy.Spider `__. Use if none of the above can be used.
diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index d67da221..d7960135 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -612,3 +612,33 @@ def _build_url(self, params):
         url_params = params.copy()
         url_params.update(self.additional_params)
         return util.replace_parameters(self.base_url, **url_params)
+
+
+class BigFileSpider(SimpleSpider):
+    """
+    This class makes it easy to collect data from sources that provide big JSON files as release packages.
+    Each big file is split into multiple smaller files that the current version of Kingfisher Process is able to process.
+
+    #. Inherit from ``BigFileSpider``
+    #. Write a ``start_requests`` method to request the big JSON files
+
+    .. code-block:: python
+
+        from kingfisher_scrapy.base_spider import BigFileSpider
+        from kingfisher_scrapy.util import components
+
+        class MySpider(BigFileSpider):
+            name = 'my_spider'
+
+            def start_requests(self):
+                yield self.build_request('https://example.com/api/package.json', formatter=components(-1))
+    """
+
+    resize_package = True
+
+    @handle_http_error
+    def parse(self, response):
+        data = {'data': response.body,
+                'package': response.body}
+        yield self.build_file(file_name=response.request.meta['file_name'], url=response.request.url,
+                              data_type='release_package', data=data)
diff --git a/kingfisher_scrapy/spiders/france.py b/kingfisher_scrapy/spiders/france.py
index 70fa0893..0fdb2d1a 100644
--- a/kingfisher_scrapy/spiders/france.py
+++ b/kingfisher_scrapy/spiders/france.py
@@ -1,10 +1,10 @@
 import scrapy
 
-from kingfisher_scrapy.base_spider import SimpleSpider
+from kingfisher_scrapy.base_spider import BigFileSpider
 from kingfisher_scrapy.util import components, handle_http_error
 
 
-class France(SimpleSpider):
+class France(BigFileSpider):
     """
     Domain
       France
@@ -13,9 +13,6 @@ class France(SimpleSpider):
     """
     name = 'france'
 
-    # SimpleSpider
-    data_type = 'release_package'
-
     def start_requests(self):
         # A CKAN API JSON response.
         # Ministère de l'économie, des finances et de la relance
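Setting ``resize_package = True`` flags each collected file for splitting before it reaches Kingfisher Process; the splitting itself happens outside the spider. The sketch below illustrates the general idea only: the function name, the 100-release chunk size and the sample package are assumptions for this illustration, not the project's actual resizing code. The chunk size of 100 mirrors the ``len_releases`` value parametrized in the new test at the end of this patch.

def split_release_package(package, chunk_size=100):
    # Yield smaller release packages of at most chunk_size releases each,
    # copying the package-level metadata onto every chunk.
    metadata = {key: value for key, value in package.items() if key != 'releases'}
    releases = package.get('releases', [])
    for start in range(0, len(releases), chunk_size):
        chunk = dict(metadata)
        chunk['releases'] = releases[start:start + chunk_size]
        yield chunk


# A 200-release package becomes two 100-release packages.
big_package = {'uri': 'https://example.com/package.json',
               'releases': [{'key': 'value'} for _ in range(200)]}
assert [len(part['releases']) for part in split_release_package(big_package)] == [100, 100]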
diff --git a/kingfisher_scrapy/spiders/indonesia_bandung.py b/kingfisher_scrapy/spiders/indonesia_bandung.py
index 04067bc9..13b8744e 100644
--- a/kingfisher_scrapy/spiders/indonesia_bandung.py
+++ b/kingfisher_scrapy/spiders/indonesia_bandung.py
@@ -59,12 +59,5 @@ def parse_list(self, response):
             yield self.build_request(next_page_url, formatter=join(self.get_formatter(), parameters('page')),
                                      callback=self.parse_list)
 
-    @handle_http_error
-    def parse(self, response):
-        data = response.json()
-        if len(data) == 0:
-            return
-        yield self.build_file_from_response(response, data_type=self.data_type)
-
     def get_formatter(self):
         return components(-1)
diff --git a/kingfisher_scrapy/spiders/mexico_inai.py b/kingfisher_scrapy/spiders/mexico_inai.py
index 7d59d3dc..bd2ccdad 100644
--- a/kingfisher_scrapy/spiders/mexico_inai.py
+++ b/kingfisher_scrapy/spiders/mexico_inai.py
@@ -33,6 +33,7 @@ def parse_list(self, response):
         for result in datas['result']['results']:
             for resource in result['resources']:
                 if resource['format'] == 'JSON':
+                    # http://bit.ly/ConcentradoINAI
                     yield self.build_request(resource['url'], formatter=components(-1),
                                              meta={'dont_redirect': True},
                                              callback=self.parse_redirect)
diff --git a/kingfisher_scrapy/spiders/moldova.py b/kingfisher_scrapy/spiders/moldova.py
index c5e0f6f6..ae5e5c46 100644
--- a/kingfisher_scrapy/spiders/moldova.py
+++ b/kingfisher_scrapy/spiders/moldova.py
@@ -1,3 +1,5 @@
+import scrapy
+
 from kingfisher_scrapy.base_spider import SimpleSpider
 from kingfisher_scrapy.util import components, handle_http_error, join, parameters, replace_parameters
 
@@ -10,31 +12,32 @@ class Moldova(SimpleSpider):
     name = 'moldova'
 
     # SimpleSpider
-    data_type = 'record_package'
+    data_type = 'release_package'
 
     def start_requests(self):
-        endpoints = {
-            'budgets': 'https://public.mtender.gov.md/budgets/',
-            # From https://github.com/open-contracting/kingfisher-collect/issues/192#issuecomment-529928683
-            # The /tenders/plans endpoint appeared to return exactly the same data as the /tenders endpoint except
-            # that when given an OCID parameter it returned an error message. It may be that /tenders/plans just
-            # lists a subset of /tenders but this isn't clear.
-            # 'plans': 'https://public.mtender.gov.md/tenders/plan/',
-            'tenders': 'https://public.mtender.gov.md/tenders/',
-        }
-
-        for endpoint, url in endpoints.items():
-            yield self.build_request(url, formatter=components(-1), callback=self.parse_list)
+        # https://public.mtender.gov.md offers three endpoints: /tenders/, /tenders/plan/ and /budgets/. However, this
+        # service publishes contracting processes under multiple OCIDs.
+        #
+        # The http://public.eprocurement.systems/ocds/ service instead publishes contracting processes under one OCID.
+        # However, it has no endpoint to list OCIDs.
+        #
+        # As such, we retrieve OCIDs from the first, and data from the second.
+        #
+        # Note: The OCIDs from the /budgets/ endpoint have no corresponding data in the second service. The OCIDs from
+        # the /tenders/plan/ endpoint are the same as from the /tenders/ endpoint.
+        url = 'https://public.mtender.gov.md/tenders/'
+        yield scrapy.Request(url, meta={'file_name': 'list.json'}, callback=self.parse_list)
 
     @handle_http_error
     def parse_list(self, response):
+        base_url = 'http://public.eprocurement.systems/ocds/tenders/'
         data = response.json()
         # The last page returns an empty JSON object.
         if not data:
             return
 
         for item in data['data']:
-            url = replace_parameters(response.request.url, offset=None) + item['ocid']
+            url = replace_parameters(base_url, offset=None) + item['ocid']
             yield self.build_request(url, formatter=components(-2))
 
         url = replace_parameters(response.request.url, offset=data['offset'])
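To make the two-service flow in ``moldova.py`` concrete, here is a standalone sketch, not spider code: the list endpoint's JSON is assumed to contain ``data`` (a list of items with an ``ocid``) and an ``offset``, as the spider reads it; the OCID values and offset format shown are hypothetical.

# Hypothetical page returned by https://public.mtender.gov.md/tenders/
page = {
    'data': [
        {'ocid': 'ocds-b3wdp1-MD-0000000000001'},
        {'ocid': 'ocds-b3wdp1-MD-0000000000002'},
    ],
    'offset': '2021-01-01T00:00:00Z',
}

# Data for each OCID is requested from the second service.
base_url = 'http://public.eprocurement.systems/ocds/tenders/'
data_urls = [base_url + item['ocid'] for item in page['data']]
print(data_urls)

# Pagination continues against the list endpoint by passing the returned offset.
next_list_url = 'https://public.mtender.gov.md/tenders/?offset=' + page['offset']
print(next_list_url)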
diff --git a/kingfisher_scrapy/spiders/openopps.py b/kingfisher_scrapy/spiders/openopps.py
index c58c5b26..156f168b 100644
--- a/kingfisher_scrapy/spiders/openopps.py
+++ b/kingfisher_scrapy/spiders/openopps.py
@@ -40,9 +40,9 @@ class OpenOpps(BaseSpider):
 
     # BaseSpider
     default_from_date = '2011-01-01'
-    root_path = 'item'
+    root_path = 'results.item.json'
     dont_truncate = True
-
+    access_token = None
     api_limit = 10000  # OpenOpps API limit for search results
     request_time_limit = 60  # in minutes
 
@@ -108,7 +108,7 @@ def start_requests_pages(self):
             yield from self.request_range_per_day(self.from_date, self.until_date, search_h)
         else:
             # Use larger ranges for filters with less than (api_limit) search results
-            release_date_gte_list = ['', '2009-01-01', '2010-01-01', '2010-07-01']
+            release_date_gte_list = ['1970-01-01', '2009-01-01', '2010-01-01', '2010-07-01']
             release_date_lte_list = ['2008-12-31', '2009-12-31', '2010-06-30', '2010-12-31']
 
             for i in range(len(release_date_gte_list)):
@@ -148,15 +148,7 @@ def parse(self, response):
 
         # Counts response and range hour split control
         if count <= self.api_limit or search_h == 1:
-            # Data type changed to release package list in order to have fewer files
-            all_data = []
-            for data in results['results']:
-                json_data = data['json']
-                if json_data:
-                    all_data.append(json_data)
-
-            if all_data:
-                yield self.build_file_from_response(response, data=all_data, data_type=self.data_type)
+            yield self.build_file_from_response(response, data_type=self.data_type)
 
             next_url = results.get('next')
             if next_url:
diff --git a/tests/test_big_file_spider.py b/tests/test_big_file_spider.py
new file mode 100644
index 00000000..830849d8
--- /dev/null
+++ b/tests/test_big_file_spider.py
@@ -0,0 +1,31 @@
+import json
+
+import pytest
+
+from kingfisher_scrapy.base_spider import BigFileSpider
+from kingfisher_scrapy.items import File
+from tests import response_fixture, spider_with_crawler
+
+
+@pytest.mark.parametrize('sample,len_items,len_releases', [(None, 2, 100), (5, 1, 5)])
+def test_parse_release_package(sample, len_items, len_releases):
+    spider = spider_with_crawler(spider_class=BigFileSpider, sample=sample)
+    package = {'releases': []}
+    for i in range(200):
+        package['releases'].append({'key': 'value'})
+
+    response = response_fixture(body=json.dumps(package).encode(), meta={'file_name': 'test.json'})
+    generator = spider.parse(response)
+    item = next(generator)
+
+    assert type(item) is File
+    assert len(item) == 5
+    assert item['file_name'] == 'test.json'
+    assert item['url'] == 'http://example.com'
+    assert item['data_type'] == 'release_package'
+    assert item['encoding'] == 'utf-8'
+    assert item['data']['package'] is not None
+    assert item['data']['data'] is not None
+
+    with pytest.raises(StopIteration):
+        next(generator)
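For context on the ``root_path`` change in ``openopps.py``: the removed code looped over ``results`` and collected each result's ``json`` field by hand; the new dotted path ``results.item.json`` is assumed to express the same extraction declaratively, with ``item`` standing for "each element of the array" as in ijson-style paths. The snippet below is a standalone sketch of the equivalent extraction against an invented sample response, not the project's extraction code.

# Hypothetical OpenOpps API response shape: each search result wraps OCDS data
# under its 'json' key.
api_response = {
    'count': 2,
    'results': [
        {'json': {'uri': 'https://example.com/1', 'releases': [{'ocid': 'ocds-xxxx-1'}]}},
        {'json': {'uri': 'https://example.com/2', 'releases': [{'ocid': 'ocds-xxxx-2'}]}},
    ],
}

# What a root path of 'results.item.json' is assumed to address:
# for each element ('item') of api_response['results'], take its 'json' value.
packages = [result['json'] for result in api_response['results']]
assert [p['releases'][0]['ocid'] for p in packages] == ['ocds-xxxx-1', 'ocds-xxxx-2']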