Merge branch 'main' into fix-max-bytes
yolile committed Mar 2, 2021
2 parents b142666 + 1d5c855 commit bc6a6f6
Showing 8 changed files with 86 additions and 38 deletions.
1 change: 1 addition & 0 deletions docs/contributing/index.rst
@@ -20,6 +20,7 @@ Access methods for OCDS data are very similar. Spiders therefore share a lot of
- :class:`~kingfisher_scrapy.base_spider.PeriodicSpider`: Use if the bulk downloads or API methods accept a year or a year and month as a query string parameter or URL path component.
- :class:`~kingfisher_scrapy.base_spider.LinksSpider`: Use if the API implements `pagination <https://github.com/open-contracting-extensions/ocds_pagination_extension>`__.
- :class:`~kingfisher_scrapy.base_spider.CompressedFileSpider`: Use if the bulk downloads are ZIP or RAR files.
- :class:`~kingfisher_scrapy.base_spider.BigFileSpider`: Use if a download is a single large JSON release package that Kingfisher Process cannot process in one piece.
- :class:`~kingfisher_scrapy.base_spider.SimpleSpider`: Use in almost all other cases (a minimal example follows this list). ``IndexSpider``, ``PeriodicSpider`` and ``LinksSpider`` are child classes of this class.
- :class:`~kingfisher_scrapy.base_spider.BaseSpider`: All spiders inherit, directly or indirectly, from this class, which in turn inherits from `scrapy.Spider <https://docs.scrapy.org/en/latest/topics/spiders.html>`__. Use if none of the above can be used.

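For orientation, a minimal spider built on SimpleSpider might look like the sketch below. The spider name and URL are placeholders; build_request, components and data_type are the same helpers and attributes used by the spiders changed later in this commit.

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components

class MySpider(SimpleSpider):
    name = 'my_spider'
    data_type = 'release_package'  # the data type reported for each downloaded file

    def start_requests(self):
        # components(-1) names the saved file after the last component of the URL path.
        yield self.build_request('https://example.com/releases.json', formatter=components(-1))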
30 changes: 30 additions & 0 deletions kingfisher_scrapy/base_spider.py
@@ -612,3 +612,33 @@ def _build_url(self, params):
url_params = params.copy()
url_params.update(self.additional_params)
return util.replace_parameters(self.base_url, **url_params)


class BigFileSpider(SimpleSpider):
    """
    This class makes it easy to collect data from sources that publish large JSON files, each containing a single
    release package. Each large file is split into several smaller files that the current version of Kingfisher
    Process is able to process.

    #. Inherit from ``BigFileSpider``
    #. Write a ``start_requests`` method to request the JSON files

    .. code-block:: python

        from kingfisher_scrapy.base_spider import BigFileSpider
        from kingfisher_scrapy.util import components

        class MySpider(BigFileSpider):
            name = 'my_spider'

            def start_requests(self):
                yield self.build_request('https://example.com/api/package.json', formatter=components(-1))
    """

    resize_package = True

    @handle_http_error
    def parse(self, response):
        data = {'data': response.body,
                'package': response.body}
        yield self.build_file(file_name=response.request.meta['file_name'], url=response.request.url,
                              data_type='release_package', data=data)
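The resize_package flag is handled elsewhere in the codebase and is not part of this diff. As a rough sketch of the idea only, assuming each output package carries at most 100 releases (the figure that appears in the new test's parameters below), splitting a large package could look something like this; the function name and chunk size are illustrative, not the project's actual implementation:

import json

def split_package(package_bytes, chunk_size=100):
    """Split one large release package into several smaller packages (illustrative sketch)."""
    package = json.loads(package_bytes)
    releases = package.pop('releases', [])
    for i in range(0, len(releases), chunk_size):
        # Reuse the package metadata, attaching only a slice of the releases.
        yield dict(package, releases=releases[i:i + chunk_size])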
7 changes: 2 additions & 5 deletions kingfisher_scrapy/spiders/france.py
@@ -1,10 +1,10 @@
import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.base_spider import BigFileSpider
from kingfisher_scrapy.util import components, handle_http_error


class France(SimpleSpider):
class France(BigFileSpider):
"""
Domain
France
@@ -13,9 +13,6 @@ class France(SimpleSpider):
"""
name = 'france'

# SimpleSpider
data_type = 'release_package'

def start_requests(self):
# A CKAN API JSON response.
# Ministère de l'économie, des finances et de la relance
7 changes: 0 additions & 7 deletions kingfisher_scrapy/spiders/indonesia_bandung.py
@@ -59,12 +59,5 @@ def parse_list(self, response):
yield self.build_request(next_page_url, formatter=join(self.get_formatter(), parameters('page')),
callback=self.parse_list)

@handle_http_error
def parse(self, response):
data = response.json()
if len(data) == 0:
return
yield self.build_file_from_response(response, data_type=self.data_type)

def get_formatter(self):
return components(-1)
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/mexico_inai.py
@@ -33,6 +33,7 @@ def parse_list(self, response):
for result in datas['result']['results']:
for resource in result['resources']:
if resource['format'] == 'JSON':
# http://bit.ly/ConcentradoINAI
yield self.build_request(resource['url'], formatter=components(-1), meta={'dont_redirect': True},
callback=self.parse_redirect)

31 changes: 17 additions & 14 deletions kingfisher_scrapy/spiders/moldova.py
@@ -1,3 +1,5 @@
import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, handle_http_error, join, parameters, replace_parameters

@@ -10,31 +12,32 @@ class Moldova(SimpleSpider):
name = 'moldova'

# SimpleSpider
data_type = 'record_package'
data_type = 'release_package'

def start_requests(self):
endpoints = {
'budgets': 'https://public.mtender.gov.md/budgets/',
# From https://github.com/open-contracting/kingfisher-collect/issues/192#issuecomment-529928683
# The /tenders/plans endpoint appeared to return exactly the same data as the /tenders endpoint except
# that when given an OCID parameter it returned an error message. It may be that /tenders/plans just
# lists a subset of /tenders but this isn't clear.
# 'plans': 'https://public.mtender.gov.md/tenders/plan/',
'tenders': 'https://public.mtender.gov.md/tenders/',
}

for endpoint, url in endpoints.items():
yield self.build_request(url, formatter=components(-1), callback=self.parse_list)
# https://public.mtender.gov.md offers three endpoints: /tenders/, /tenders/plan/ and /budgets/. However, this
# service publishes contracting processes under multiple OCIDs.
#
# The http://public.eprocurement.systems/ocds/ service instead publishes contracting processes under one OCID.
# However, it has no endpoint to list OCIDs.
#
# As such, we retrieve OCIDs from the first, and data from the second.
#
# Note: The OCIDs from the /budgets/ endpoint have no corresponding data in the second service. The OCIDs from
# the /tenders/plan/ endpoint are the same as from the /tenders/ endpoint.
url = 'https://public.mtender.gov.md/tenders/'
yield scrapy.Request(url, meta={'file_name': 'list.json'}, callback=self.parse_list)

@handle_http_error
def parse_list(self, response):
base_url = 'http://public.eprocurement.systems/ocds/tenders/'
data = response.json()
# The last page returns an empty JSON object.
if not data:
return

for item in data['data']:
url = replace_parameters(response.request.url, offset=None) + item['ocid']
url = replace_parameters(base_url, offset=None) + item['ocid']
yield self.build_request(url, formatter=components(-2))

url = replace_parameters(response.request.url, offset=data['offset'])
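replace_parameters is imported from kingfisher_scrapy.util and its implementation is not shown in this diff; it presumably rewrites query-string parameters, with None removing a parameter (as in offset=None above). A rough, self-contained sketch of that behaviour, for illustration only:

from urllib.parse import parse_qs, urlencode, urlsplit, urlunsplit

def replace_parameters(url, **kwargs):
    """Return the URL with the given query-string parameters replaced; a None value removes the parameter."""
    parts = urlsplit(url)
    query = parse_qs(parts.query)
    for key, value in kwargs.items():
        if value is None:
            query.pop(key, None)
        else:
            query[key] = [value]
    return urlunsplit(parts._replace(query=urlencode(query, doseq=True)))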
16 changes: 4 additions & 12 deletions kingfisher_scrapy/spiders/openopps.py
@@ -40,9 +40,9 @@ class OpenOpps(BaseSpider):

# BaseSpider
default_from_date = '2011-01-01'
root_path = 'item'
root_path = 'results.item.json'
dont_truncate = True

access_token = None
api_limit = 10000 # OpenOpps API limit for search results
request_time_limit = 60 # in minutes
@@ -108,7 +108,7 @@ def start_requests_pages(self):
yield from self.request_range_per_day(self.from_date, self.until_date, search_h)
else:
# Use larger ranges for filters with less than (api_limit) search results
release_date_gte_list = ['', '2009-01-01', '2010-01-01', '2010-07-01']
release_date_gte_list = ['1970-01-01', '2009-01-01', '2010-01-01', '2010-07-01']
release_date_lte_list = ['2008-12-31', '2009-12-31', '2010-06-30', '2010-12-31']

for i in range(len(release_date_gte_list)):
@@ -148,15 +148,7 @@ def parse(self, response):

# Counts response and range hour split control
if count <= self.api_limit or search_h == 1:
# Data type changed to release package list in order to have fewer files
all_data = []
for data in results['results']:
json_data = data['json']
if json_data:
all_data.append(json_data)

if all_data:
yield self.build_file_from_response(response, data=all_data, data_type=self.data_type)
yield self.build_file_from_response(response, data_type=self.data_type)

next_url = results.get('next')
if next_url:
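The per-item loop removed above is superseded by the root_path = 'results.item.json' setting declared earlier in this file; the path handling itself lives in shared code outside this diff. As a rough sketch of what such a dot-separated path could resolve to, where 'item' is assumed to mean "each element of a list":

def resolve_root_path(data, root_path):
    """Walk a dot-separated path into data; the special key 'item' iterates over a list (illustrative sketch)."""
    items = [data]
    for key in root_path.split('.'):
        if key == 'item':
            items = [element for item in items for element in item]
        else:
            items = [item[key] for item in items]
    return items

# resolve_root_path({'results': [{'json': {'ocid': 'abc'}}]}, 'results.item.json') -> [{'ocid': 'abc'}]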
31 changes: 31 additions & 0 deletions tests/test_big_file_spider.py
@@ -0,0 +1,31 @@
import json

import pytest

from kingfisher_scrapy.base_spider import BigFileSpider
from kingfisher_scrapy.items import File
from tests import response_fixture, spider_with_crawler


@pytest.mark.parametrize('sample,len_items,len_releases', [(None, 2, 100), (5, 1, 5)])
def test_parse_release_package(sample, len_items, len_releases):
    spider = spider_with_crawler(spider_class=BigFileSpider, sample=sample)
    package = {'releases': []}
    for i in range(200):
        package['releases'].append({'key': 'value'})

    response = response_fixture(body=json.dumps(package).encode(), meta={'file_name': 'test.json'})
    generator = spider.parse(response)
    item = next(generator)

    assert type(item) is File
    assert len(item) == 5
    assert item['file_name'] == 'test.json'
    assert item['url'] == 'http://example.com'
    assert item['data_type'] == 'release_package'
    assert item['encoding'] == 'utf-8'
    assert item['data']['package'] is not None
    assert item['data']['data'] is not None

    with pytest.raises(StopIteration):
        next(generator)
