diff --git a/docs/api/base_spider.rst b/docs/api/base_spider.rst index 8dca7971..510e9de3 100644 --- a/docs/api/base_spider.rst +++ b/docs/api/base_spider.rst @@ -4,4 +4,3 @@ Base Spider .. automodule:: kingfisher_scrapy.base_spider :members: :undoc-members: - diff --git a/docs/api/exceptions.rst b/docs/api/exceptions.rst index 788097a2..f360575c 100644 --- a/docs/api/exceptions.rst +++ b/docs/api/exceptions.rst @@ -4,4 +4,3 @@ Exceptions .. automodule:: kingfisher_scrapy.exceptions :members: :undoc-members: - diff --git a/docs/api/extensions.rst b/docs/api/extensions.rst new file mode 100644 index 00000000..ab745966 --- /dev/null +++ b/docs/api/extensions.rst @@ -0,0 +1,6 @@ +Extensions +========== + +.. automodule:: kingfisher_scrapy.extensions + :members: + :undoc-members: diff --git a/docs/api/index.rst b/docs/api/index.rst index 42f566e7..e7c0c60a 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -4,4 +4,6 @@ API Reference .. toctree:: base_spider.rst + extensions.rst + util.rst exceptions.rst diff --git a/docs/api/util.rst b/docs/api/util.rst new file mode 100644 index 00000000..c92cae2c --- /dev/null +++ b/docs/api/util.rst @@ -0,0 +1,6 @@ +Utilities +========= + +.. automodule:: kingfisher_scrapy.util + :members: + :undoc-members: diff --git a/docs/writing-spiders.rst b/docs/writing-spiders.rst index 24aeab18..480fec07 100644 --- a/docs/writing-spiders.rst +++ b/docs/writing-spiders.rst @@ -53,18 +53,16 @@ Here is a sample: .. code-block:: python from kingfisher_scrapy.base_spider import SimpleSpider - from kingfisher_scrapy.util import handle_error + from kingfisher_scrapy.util import components, handle_error class VerySimple(SimpleSpider): name = 'very_simple' data_type = 'release_package' def start_requests(self): - # This API only has one URL to get. Make a request for that, and set a filename - yield scrapy.Request( - 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json', - meta={'kf_filename': '13-14.json'} - ) + # Request the source's only URL, and transform the URL to a file name using ``basename``. + url = 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json' + yield self.build_request(url, formatter=components(-1)) Spider properties ----------------- diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 78163e16..e2db8936 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -1,4 +1,3 @@ -import hashlib import json from datetime import datetime from io import BytesIO @@ -102,6 +101,49 @@ def get_start_time(self, format): """ return self.crawler.stats.get_value('start_time').strftime(format) + def build_request(self, url, formatter, **kwargs): + """ + Returns a Scrapy request, with a file name added to the request's ``meta`` attribute. If the file name doesn't + have a ``.json`` or ``.zip`` extension, it adds a ``.json`` extension. + + If the last component of a URL's path is unique, use it as the file name. 
For example: + + >>> from kingfisher_scrapy.base_spider import BaseSpider + >>> from kingfisher_scrapy.util import components + >>> url = 'https://example.com/package.json' + >>> formatter = components(-1) + >>> BaseSpider(name='my_spider').build_request(url, formatter=formatter).meta + {'kf_filename': 'package.json'} + + To use a query string parameter as the file name: + + >>> from kingfisher_scrapy.util import parameters + >>> url = 'https://example.com/packages?page=1&per_page=100' + >>> formatter = parameters('page') + >>> BaseSpider(name='my_spider').build_request(url, formatter=formatter).meta + {'kf_filename': 'page-1.json'} + + To add a query string parameter to the file name: + + >>> from kingfisher_scrapy.util import join + >>> url = 'https://example.com/packages?page=1&per_page=100' + >>> formatter = join(components(-1), parameters('page')) + >>> BaseSpider(name='my_spider').build_request(url, formatter=formatter).meta + {'kf_filename': 'packages-page-1.json'} + + :param str url: the URL to request + :param formatter: a function that accepts a URL and returns a file name + :returns: a Scrapy request + :rtype: scrapy.Request + """ + file_name = formatter(url) + if not file_name.endswith(('.json', '.zip')): + file_name += '.json' + meta = {'kf_filename': file_name} + if 'meta' in kwargs: + meta.update(kwargs.pop('meta')) + return scrapy.Request(url, meta=meta, **kwargs) + def build_file_from_response(self, response, **kwargs): """ Returns an item to yield, based on the response to a request. @@ -266,8 +308,7 @@ def start_requests(self): @handle_error def parse(self, response): if self.zip_file_format: - filename = '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')).hexdigest()) - self.build_file_from_response(response, file_name=filename, post_to_api=False) + self.build_file_from_response(response, data_type='zip', post_to_api=False) zip_file = ZipFile(BytesIO(response.body)) for finfo in zip_file.infolist(): @@ -300,6 +341,8 @@ class LinksSpider(SimpleSpider): 1. Inherit from ``LinksSpider`` 1. Set a ``data_type`` class attribute to the data type of the API responses + 1. Set a ``next_page_formatter`` class attribute to set the file name as in + :meth:`~kingfisher_scrapy.base_spider.BaseSpider.build_request` 1. Optionally, set a ``next_pointer`` class attribute to the JSON Pointer for the next link (default "/links/next") 1. Write a ``start_requests`` method to request the first page of API results @@ -333,4 +376,4 @@ def next_link(self, response): data = json.loads(response.text) url = resolve_pointer(data, self.next_pointer, None) if url: - return scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) + return self.build_request(url, formatter=self.next_page_formatter) diff --git a/kingfisher_scrapy/spiders/__init__.py b/kingfisher_scrapy/spiders/__init__.py index ebd689ac..e69de29b 100644 --- a/kingfisher_scrapy/spiders/__init__.py +++ b/kingfisher_scrapy/spiders/__init__.py @@ -1,4 +0,0 @@ -# This package will contain the spiders of your Scrapy project -# -# Please refer to the documentation for information on how to create and manage -# your spiders. 
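The ``build_request`` docstring above depends on formatter callables from ``kingfisher_scrapy.util``. As a rough guide to how the file names in those doctests are derived, here is a minimal sketch of ``components``, ``parameters`` and ``join`` — inferred from their usage throughout this diff, not the module's actual implementation:

.. code-block:: python

    # Sketch only: behaviour is reverse-engineered from the doctests above and from
    # how spiders in this diff call these helpers. The real kingfisher_scrapy.util
    # implementations may differ in detail.
    from urllib.parse import parse_qs, urlsplit


    def components(start, stop=None):
        """Return a formatter that joins the selected URL path components with dashes."""
        def formatter(url):
            parts = urlsplit(url).path.rstrip('/').split('/')
            return '-'.join(parts[start:stop])
        return formatter


    def parameters(*keys):
        """Return a formatter that joins the selected query string parameters as key-value pairs."""
        def formatter(url):
            query = parse_qs(urlsplit(url).query)
            return '-'.join('{}-{}'.format(key, value) for key in keys for value in query[key])
        return formatter


    def join(*formatters):
        """Return a formatter that concatenates the results of the given formatters with dashes."""
        def formatter(url):
            return '-'.join(f(url) for f in formatters)
        return formatter

With these definitions, ``join(components(-1), parameters('page'))`` applied to ``https://example.com/packages?page=1&per_page=100`` returns ``packages-page-1``, and ``build_request`` then appends ``.json`` to produce the ``packages-page-1.json`` file name shown in the doctest.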
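For the ``LinksSpider`` change documented above — the new ``next_page_formatter`` class attribute consumed by ``next_link`` — a minimal subclass now looks roughly like this (the spider name and URL are illustrative; compare the ``georgia_releases`` and ``armenia`` spiders later in this diff):

.. code-block:: python

    import scrapy

    from kingfisher_scrapy.base_spider import LinksSpider
    from kingfisher_scrapy.util import parameters


    class ExampleReleases(LinksSpider):
        name = 'example_releases'
        data_type = 'release_package'
        # File names for follow-up pages are derived from the "page" query string
        # parameter of each next link, e.g. page-2.json, page-3.json, ...
        next_page_formatter = parameters('page')

        def start_requests(self):
            # The first page's URL has no "page" parameter, so its file name is set explicitly.
            url = 'https://example.com/api/releases.json'
            yield scrapy.Request(url, meta={'kf_filename': 'page-1.json'})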
diff --git a/kingfisher_scrapy/spiders/afghanistan_records.py b/kingfisher_scrapy/spiders/afghanistan_records.py index a662dc01..d009b066 100644 --- a/kingfisher_scrapy/spiders/afghanistan_records.py +++ b/kingfisher_scrapy/spiders/afghanistan_records.py @@ -3,7 +3,7 @@ import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class AfghanistanRecords(SimpleSpider): @@ -13,17 +13,15 @@ class AfghanistanRecords(SimpleSpider): download_delay = 1 def start_requests(self): - yield scrapy.Request( - 'https://ocds.ageops.net/api/ocds/records', - meta={'kf_filename': 'list.json'}, - callback=self.parse_list - ) + # A JSON array of URL strings, in reverse chronological order. + url = 'https://ocds.ageops.net/api/ocds/records' + yield scrapy.Request(url, meta={'kf_filename': 'list.json'}, callback=self.parse_list) @handle_error def parse_list(self, response): - files_urls = json.loads(response.text) + urls = json.loads(response.text) if self.sample: - files_urls = [files_urls[0]] - - for file_url in files_urls: - yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'}) + urls = [urls[0]] + for url in urls: + # URL looks like https://ocds.ageops.net/api/record/5ed2a62c4192f32c8c74a4e5 + yield self.build_request(url, formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/afghanistan_releases.py b/kingfisher_scrapy/spiders/afghanistan_releases.py index 99a49b69..c9755dea 100644 --- a/kingfisher_scrapy/spiders/afghanistan_releases.py +++ b/kingfisher_scrapy/spiders/afghanistan_releases.py @@ -3,7 +3,7 @@ import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class AfghanistanReleases(SimpleSpider): @@ -13,30 +13,25 @@ class AfghanistanReleases(SimpleSpider): download_delay = 1.5 def start_requests(self): - yield scrapy.Request( - 'https://ocds.ageops.net/api/ocds/releases/dates', - meta={'kf_filename': 'list.json'}, - callback=self.parse_list - ) + # A JSON array of URL strings, in reverse chronological order. + url = 'https://ocds.ageops.net/api/ocds/releases/dates' + yield scrapy.Request(url, meta={'kf_filename': 'list.json'}, callback=self.parse_list) @handle_error def parse_list(self, response): - files_urls = json.loads(response.text) + urls = json.loads(response.text) if self.sample: - files_urls = [files_urls[0]] - - for file_url in files_urls: - yield scrapy.Request( - file_url, - meta={'kf_filename': file_url.split('/')[-1] + '.json'}, - callback=self.parse_release_list - ) + urls = [urls[0]] + for url in urls: + # A JSON array of URL strings, in reverse chronological order. 
+ # URL looks like https://ocds.ageops.net/api/ocds/releases/2020-05-30 + yield self.build_request(url, formatter=components(-1), callback=self.parse_release_list) @handle_error def parse_release_list(self, response): - files_urls = json.loads(response.text) + urls = json.loads(response.text) if self.sample: - files_urls = [files_urls[0]] - - for file_url in files_urls: - yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'}) + urls = [urls[0]] + for url in urls: + # URL looks like https://ocds.ageops.net/api/release/5ed2a62c4192f32c8c74a4e3 + yield self.build_request(url, formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/argentina_buenos_aires.py b/kingfisher_scrapy/spiders/argentina_buenos_aires.py index 10dadbf5..8ab14456 100644 --- a/kingfisher_scrapy/spiders/argentina_buenos_aires.py +++ b/kingfisher_scrapy/spiders/argentina_buenos_aires.py @@ -3,7 +3,7 @@ import scrapy from kingfisher_scrapy.base_spider import ZipSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class ArgentinaBuenosAires(ZipSpider): @@ -24,15 +24,14 @@ class ArgentinaBuenosAires(ZipSpider): download_timeout = 1000 def start_requests(self): - yield scrapy.Request( - 'https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras', - meta={'kf_filename': 'list.json'}, - callback=self.parse_list - ) + # A CKAN API JSON response. + url = 'https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras' + yield scrapy.Request(url, meta={'kf_filename': 'list.json'}, callback=self.parse_list) @handle_error def parse_list(self, response): data = json.loads(response.text) for resource in data['result']['resources']: if resource['format'].upper() == 'JSON': - yield scrapy.Request(resource['url'], meta={'kf_filename': resource['url'].rsplit('/', 1)[-1]}) + # Presently, only one URL matches. 
+ yield self.build_request(resource['url'], formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/argentina_vialidad.py b/kingfisher_scrapy/spiders/argentina_vialidad.py index 1c44cfc8..f3abfe4e 100644 --- a/kingfisher_scrapy/spiders/argentina_vialidad.py +++ b/kingfisher_scrapy/spiders/argentina_vialidad.py @@ -8,7 +8,5 @@ class ArgentinaVialidad(SimpleSpider): data_type = 'release_package_list' def start_requests(self): - yield scrapy.Request( - 'https://datosabiertos.vialidad.gob.ar/api/ocds/package/all', - meta={'kf_filename': 'all.json'} - ) + url = 'https://datosabiertos.vialidad.gob.ar/api/ocds/package/all' + yield scrapy.Request(url, meta={'kf_filename': 'all.json'}) diff --git a/kingfisher_scrapy/spiders/armenia.py b/kingfisher_scrapy/spiders/armenia.py index 67c42a6e..04437cd2 100644 --- a/kingfisher_scrapy/spiders/armenia.py +++ b/kingfisher_scrapy/spiders/armenia.py @@ -1,12 +1,15 @@ import scrapy from kingfisher_scrapy.base_spider import LinksSpider +from kingfisher_scrapy.util import parameters class Armenia(LinksSpider): name = 'armenia' data_type = 'release_package' next_pointer = '/next_page/uri' + next_page_formatter = parameters('offset') def start_requests(self): - yield scrapy.Request('https://armeps.am/ocds/release', meta={'kf_filename': 'page1.json'}) + url = 'https://armeps.am/ocds/release' + yield scrapy.Request(url, meta={'kf_filename': 'offset-0.json'}) diff --git a/kingfisher_scrapy/spiders/australia.py b/kingfisher_scrapy/spiders/australia.py index a6f1da85..60ec5dcf 100644 --- a/kingfisher_scrapy/spiders/australia.py +++ b/kingfisher_scrapy/spiders/australia.py @@ -1,26 +1,18 @@ -import datetime +from datetime import date import scrapy from kingfisher_scrapy.base_spider import LinksSpider +from kingfisher_scrapy.util import parameters class Australia(LinksSpider): name = 'australia' data_type = 'release_package' + next_page_formatter = parameters('cursor') def start_requests(self): - url_prefix = 'https://api.tenders.gov.au/ocds/findByDates/contractPublished/' + url = f'https://api.tenders.gov.au/ocds/findByDates/contractPublished/' \ + f'2004-01-01T00:00:00Z/{date.today().year}-12-31T23:59:59Z' - if self.sample: - yield scrapy.Request( - url_prefix + '2018-01-01T00:00:00Z/2018-12-31T23:59:59Z', - meta={'kf_filename': 'year-2018.json'} - ) - else: - current_year = datetime.datetime.now().year + 1 - for year in range(2004, current_year): - yield scrapy.Request( - url_prefix + '{}-01-01T00:00:00Z/{}-12-31T23:59:59Z'.format(year, year), - meta={'kf_filename': 'year-{}.json'.format(year)} - ) + yield scrapy.Request(url, meta={'kf_filename': 'start.json'}) diff --git a/kingfisher_scrapy/spiders/australia_nsw.py b/kingfisher_scrapy/spiders/australia_nsw.py index b09326b4..a32bbbce 100644 --- a/kingfisher_scrapy/spiders/australia_nsw.py +++ b/kingfisher_scrapy/spiders/australia_nsw.py @@ -1,10 +1,7 @@ -import hashlib import json -import scrapy - from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import handle_error, parameters class AustraliaNSW(SimpleSpider): @@ -12,54 +9,45 @@ class AustraliaNSW(SimpleSpider): data_type = 'release_package' def start_requests(self): - release_types = ['planning', 'tender', 'contract'] - page_limit = 10 if self.sample else 1000 - url = 'https://tenders.nsw.gov.au/?event=public.api.{}.search&ResultsPerPage={}' - for release_type in release_types: - yield scrapy.Request( - url.format(release_type, page_limit), - meta={ - 'kf_filename': 
'{}.json'.format(release_type), - 'release_type': release_type, - }, + pattern = 'https://tenders.nsw.gov.au/?event=public.api.{}.search&ResultsPerPage=1000' + for release_type in ('planning', 'tender', 'contract'): + yield self.build_request( + pattern.format(release_type), + formatter=parameters('event'), + meta={'release_type': release_type}, callback=self.parse_list ) @handle_error def parse_list(self, response): - json_data = json.loads(response.text) + data = json.loads(response.text) release_type = response.request.meta['release_type'] - # More Pages? - if 'links' in json_data and isinstance(json_data['links'], dict) and 'next' in json_data['links'] \ - and not self.sample: - yield scrapy.Request( - json_data['links']['next'], - meta={ - 'kf_filename': hashlib.md5(json_data['links']['next'].encode('utf-8')).hexdigest() + '.json', - 'release_type': release_type, - }, + if 'links' in data and isinstance(data['links'], dict) and 'next' in data['links'] and not self.sample: + yield self.build_request( + data['links']['next'], + formatter=parameters('event', 'startRow'), + meta={'release_type': release_type}, callback=self.parse_list ) - # Data? - for release in json_data['releases']: + for release in data['releases']: if release_type == 'planning': uuid = release['tender']['plannedProcurementUUID'] - yield scrapy.Request( - 'https://tenders.nsw.gov.au/?event=public.api.planning.view&PlannedProcurementUUID=%s' % uuid, - meta={'kf_filename': 'plannning-%s.json' % uuid} + yield self.build_request( + 'https://tenders.nsw.gov.au/?event=public.api.planning.view&PlannedProcurementUUID=' + uuid, + formatter=parameters('event', 'PlannedProcurementUUID') ) - if release_type == 'tender': + elif release_type == 'tender': uuid = release['tender']['RFTUUID'] - yield scrapy.Request( - 'https://tenders.nsw.gov.au/?event=public.api.tender.view&RFTUUID=%s' % uuid, - meta={'kf_filename': 'tender-%s.json' % uuid} + yield self.build_request( + 'https://tenders.nsw.gov.au/?event=public.api.tender.view&RFTUUID=' + uuid, + formatter=parameters('event', 'RFTUUID') ) - if release_type == 'contract': + elif release_type == 'contract': for award in release['awards']: uuid = award['CNUUID'] - yield scrapy.Request( - 'https://tenders.nsw.gov.au/?event=public.api.contract.view&CNUUID=%s' % uuid, - meta={'kf_filename': 'contract-%s.json' % uuid} + yield self.build_request( + 'https://tenders.nsw.gov.au/?event=public.api.contract.view&CNUUID=' + uuid, + formatter=parameters('event', 'CNUUID') ) diff --git a/kingfisher_scrapy/spiders/canada_buyandsell.py b/kingfisher_scrapy/spiders/canada_buyandsell.py index e9d311e2..546e7fcb 100644 --- a/kingfisher_scrapy/spiders/canada_buyandsell.py +++ b/kingfisher_scrapy/spiders/canada_buyandsell.py @@ -1,6 +1,5 @@ -import scrapy - from kingfisher_scrapy.base_spider import SimpleSpider +from kingfisher_scrapy.util import components class CanadaBuyAndSell(SimpleSpider): @@ -8,21 +7,14 @@ class CanadaBuyAndSell(SimpleSpider): data_type = 'release_package' def start_requests(self): - yield scrapy.Request( + urls = [ 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json', - meta={'kf_filename': '13-14.json'} - ) - if self.sample: - return - yield scrapy.Request( 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json', - meta={'kf_filename': '14-15.json'} - ) - yield scrapy.Request( 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json', - meta={'kf_filename': '15-16.json'} - ) - yield scrapy.Request( 
'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-16-17.json', - meta={'kf_filename': '16-17.json'} - ) + ] + if self.sample: + urls = [urls[0]] + + for url in urls: + yield self.build_request(url, formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/canada_montreal.py b/kingfisher_scrapy/spiders/canada_montreal.py index 64366213..7c9bc11a 100644 --- a/kingfisher_scrapy/spiders/canada_montreal.py +++ b/kingfisher_scrapy/spiders/canada_montreal.py @@ -2,32 +2,27 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.base_spider import SimpleSpider +from kingfisher_scrapy.util import handle_error, parameters, replace_parameter -class CanadaMontreal(BaseSpider): +class CanadaMontreal(SimpleSpider): name = 'canada_montreal' - page_limit = 10000 + data_type = 'release_package' + step = 10000 def start_requests(self): - yield scrapy.Request( - 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d' % self.page_limit, - meta={'kf_filename': 'page0.json'} - ) + url = 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit={step}'.format(step=self.step) + yield scrapy.Request(url, meta={'kf_filename': 'offset-0.json'}, callback=self.parse_list) @handle_error - def parse(self, response): - # Actual data - yield self.build_file_from_response(response, data_type='release_package') + def parse_list(self, response): + yield from self.parse(response) - # Load more pages? - if not self.sample and response.request.meta['kf_filename'] == 'page0.json': + if not self.sample: data = json.loads(response.text) + offset = data['meta']['pagination']['limit'] total = data['meta']['count'] - offset = self.page_limit - while offset < total: - url = 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d&offset=%d' % \ - (self.page_limit, offset) - yield scrapy.Request(url, meta={'kf_filename': 'page' + str(offset) + '.json'}) - offset += self.page_limit + for offset in range(offset, total, self.step): + url = replace_parameter(response.request.url, 'offset', offset) + yield self.build_request(url, formatter=parameters('offset')) diff --git a/kingfisher_scrapy/spiders/chile_base.py b/kingfisher_scrapy/spiders/chile_base.py index c0f851f6..1ec7fbe6 100644 --- a/kingfisher_scrapy/spiders/chile_base.py +++ b/kingfisher_scrapy/spiders/chile_base.py @@ -1,85 +1,71 @@ -import datetime import json +from datetime import date -import scrapy +from kingfisher_scrapy.base_spider import SimpleSpider +from kingfisher_scrapy.util import components, date_range_by_month, handle_error -from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error - -class ChileCompraBaseSpider(BaseSpider): +class ChileCompraBaseSpider(SimpleSpider): custom_settings = { 'DOWNLOAD_FAIL_ON_DATALOSS': False, } download_timeout = 300 limit = 100 - base_list_url = 'https://apis.mercadopublico.cl/OCDS/data/listaA%C3%B1oMes/{}/{:02d}/{}/{}' - record_url = 'https://apis.mercadopublico.cl/OCDS/data/record/%s' - start_year = 2008 + base_list_url = 'https://apis.mercadopublico.cl/OCDS/data/listaA%C3%B1oMes/{0.year:d}/{0.month:02d}/{1}/{2}' - def get_year_month_until(self): - until_year = datetime.datetime.now().year + 1 - until_month = datetime.datetime.now().month + def start_requests(self): + today = date.today() if hasattr(self, 'year'): - self.start_year = int(self.year) - until_year = self.start_year + 1 - until_month = 12 if self.start_year != 
datetime.datetime.now().year else until_month - return until_year, until_month + year = int(self.year) + start = date(year, 1, 1) + stop = date(year, 12, 1) + if year == today.year: + stop = stop.replace(month=today.month) + else: + start = date(2008, 1, 1) + stop = today - def start_requests(self): if self.sample: - yield scrapy.Request( - self.base_list_url.format(2017, 10, 0, 10), - meta={'kf_filename': 'list-2017-10.json', 'year': 2017, 'month': 10}, - ) - return + start = stop - until_year, until_month = self.get_year_month_until() - for year in range(self.start_year, until_year): - for month in range(1, 13): - # just scrape until the current month when the until year = current year - if (until_year - 1) == year and month > until_month: - break - yield scrapy.Request( - self.base_list_url.format(year, month, 0, self.limit), - meta={'kf_filename': 'list-{}-{:02d}.json'.format(year, month), 'year': year, 'month': month}, - ) + for d in date_range_by_month(start, stop): + yield self.build_request( + self.base_list_url.format(d, 0, self.limit), + formatter=components(-4, -1), + meta={ + 'year': d.year, + 'month': d.month, + }, + callback=self.parse_list + ) @handle_error - def parse(self, response): + def parse_list(self, response): data = json.loads(response.text) - if 'data' in data: - for data_item in data['data']: - if self.data_type == 'record_package': - yield scrapy.Request( - self.record_url % data_item['ocid'].replace('ocds-70d2nz-', ''), - meta={'kf_filename': 'data-%s-%s.json' % (data_item['ocid'], self.data_type)} - ) - else: - # the data comes in this format: - # "data": [ - # { - # "ocid": "", - # "urlTender": "..", - # "urlAward": ".." - # } - # ] - for stage in list(data_item.keys()): - if 'url' in stage: - name = stage.replace('url', '') - yield scrapy.Request( - data_item[stage], - meta={'kf_filename': 'data-%s-%s.json' % (data_item['ocid'], name)} - ) - if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']: - year = response.request.meta['year'] - month = response.request.meta['month'] - offset = data['pagination']['offset'] - yield scrapy.Request( - self.base_list_url.format(year, month, self.limit + offset, self.limit), - meta={'year': year, 'month': month} - ) - elif 'status' in data and data['status'] != 200: + if 'status' in data and data['status'] != 200: yield self.build_file_error_from_response(response, errors={'http_code': data['status']}) - else: - yield self.build_file_from_response(response, data_type=self.data_type) + return + + for item in data['data']: + # An item looks like: + # + # { + # "ocid": "ocds-70d2nz-2359-2-LE19", + # "urlTender": "https://apis.mercadopublico.cl/OCDS/data/tender/2359-2-LE19", + # "urlAward": "https://apis.mercadopublico.cl/OCDS/data/award/2359-2-LE19", + # "urlPlanning": "https://apis.mercadopublico.cl/OCDS/data/planning/2359-2-LE19" + # } + yield from self.handle_item(item) + + if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']: + year = response.request.meta['year'] + month = response.request.meta['month'] + offset = data['pagination']['offset'] + yield self.build_request( + self.base_list_url.format(date(year, month, 1), offset + self.limit, self.limit), + formatter=components(-4, -1), + meta={ + 'year': year, + 'month': month, + } + ) diff --git a/kingfisher_scrapy/spiders/chile_compra_bulk.py b/kingfisher_scrapy/spiders/chile_compra_bulk.py index 9b5c18b1..854d627d 100644 --- a/kingfisher_scrapy/spiders/chile_compra_bulk.py 
+++ b/kingfisher_scrapy/spiders/chile_compra_bulk.py @@ -1,8 +1,7 @@ -import hashlib - -import scrapy +from datetime import date from kingfisher_scrapy.base_spider import ZipSpider +from kingfisher_scrapy.util import components, date_range_by_month class ChileCompraBulk(ZipSpider): @@ -16,17 +15,12 @@ class ChileCompraBulk(ZipSpider): } def start_requests(self): - url = 'https://ocds.blob.core.windows.net/ocds/{}{}.zip' + url = 'https://ocds.blob.core.windows.net/ocds/{0.year:d}{0.month:02d}.zip' + + start = date(2009, 1, 1) + stop = date.today().replace(day=1) if self.sample: - years = ['2017'] - months = ['02'] - else: - years = range(2017, 2020) - months = ['0{}'.format(m) if m < 10 else str(m) for m in range(1, 13)] + start = stop - for year in years: - for month in months: - yield scrapy.Request( - url.format(year, month), - meta={'kf_filename': hashlib.md5((url).encode('utf-8')).hexdigest()} - ) + for d in date_range_by_month(start, stop): + yield self.build_request(url.format(d), formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/chile_compra_records.py b/kingfisher_scrapy/spiders/chile_compra_records.py index f7320964..14c5d99c 100644 --- a/kingfisher_scrapy/spiders/chile_compra_records.py +++ b/kingfisher_scrapy/spiders/chile_compra_records.py @@ -1,6 +1,11 @@ from kingfisher_scrapy.spiders.chile_base import ChileCompraBaseSpider +from kingfisher_scrapy.util import components class ChileCompraRecords(ChileCompraBaseSpider): name = 'chile_compra_records' data_type = 'record_package' + + def handle_item(self, item): + url = 'https://apis.mercadopublico.cl/OCDS/data/record/' + item['ocid'].replace('ocds-70d2nz-', '') + yield self.build_request(url, formatter=components(-2)) diff --git a/kingfisher_scrapy/spiders/chile_compra_releases.py b/kingfisher_scrapy/spiders/chile_compra_releases.py index e1082f83..a3467305 100644 --- a/kingfisher_scrapy/spiders/chile_compra_releases.py +++ b/kingfisher_scrapy/spiders/chile_compra_releases.py @@ -1,6 +1,12 @@ from kingfisher_scrapy.spiders.chile_base import ChileCompraBaseSpider +from kingfisher_scrapy.util import components class ChileCompraReleases(ChileCompraBaseSpider): name = 'chile_compra_releases' data_type = 'release_package' + + def handle_item(self, item): + for key in item: + if key.startswith('url'): + yield self.build_request(item[key], formatter=components(-2)) diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py index 9a76d02d..faedc130 100644 --- a/kingfisher_scrapy/spiders/colombia.py +++ b/kingfisher_scrapy/spiders/colombia.py @@ -1,4 +1,3 @@ -import hashlib import logging import time from json import JSONDecodeError @@ -6,22 +5,29 @@ import scrapy from kingfisher_scrapy.base_spider import LinksSpider +from kingfisher_scrapy.util import parameters class Colombia(LinksSpider): name = 'colombia' - sleep = 120 * 60 + next_page_formatter = parameters('page') def start_requests(self): base_url = 'https://apiocds.colombiacompra.gov.co:8443/apiCCE2.0/rest/releases' if hasattr(self, 'year'): - base_url += '/page/{}'.format(int(self.year)) - base_url += '?page=%d' + base_url += f'/page/{int(self.year)}' + base_url += '?page={}' - start_page = 1 + page = 1 if hasattr(self, 'page'): - start_page = int(self.page) - yield scrapy.Request(base_url % start_page, meta={'kf_filename': 'page{}.json'.format(start_page)}) + page = int(self.page) + yield self.build_request(base_url.format(page), formatter=parameters('page')) + + def retry(self, response, reason): + url = response.request.url + 
logging.info(reason.format(url=url, status=response.status)) + time.sleep(120 * 60) + yield scrapy.Request(url, dont_filter=True, meta=response.request.meta) def parse(self, response): # In Colombia, every day at certain hour they run a process in their system that drops the database and make @@ -30,25 +36,13 @@ def parse(self, response): # so eventually the spider will always face the service problems. For that, when the problem occurs, (503 # status or invalid json) we wait 120 minutes and then continue try: - if response.status == 503 or response.status == 404: - url = response.request.url - logging.info('Sleeping due error {} in url {}'.format(response.status, url)) - time.sleep(self.sleep) - yield scrapy.Request( - url, - dont_filter=True, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) - elif self.is_http_success(response): + if self.is_http_success(response): yield self.build_file_from_response(response, data_type='release_package') if not self.sample: yield self.next_link(response) + elif response.status == 503 or response.status == 404: + self.retry(response, 'Sleeping due to HTTP error {status} from {url}') else: yield self.build_file_error_from_response(response) - except JSONDecodeError: - url = response.request.url - logging.info('Sleeping due json decode error in url {}'.format(url)) - time.sleep(self.sleep) - yield scrapy.Request(url, dont_filter=True, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) + self.retry(response, 'Sleeping due to JSONDecodeError from {url}') diff --git a/kingfisher_scrapy/spiders/colombia_bulk.py b/kingfisher_scrapy/spiders/colombia_bulk.py index 2523f8bc..2f1c4036 100644 --- a/kingfisher_scrapy/spiders/colombia_bulk.py +++ b/kingfisher_scrapy/spiders/colombia_bulk.py @@ -1,9 +1,7 @@ -from urllib.parse import urlparse - import scrapy from kingfisher_scrapy.base_spider import ZipSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class ColombiaBulk(ZipSpider): @@ -34,8 +32,9 @@ def start_requests(self): @handle_error def parse_list(self, response): - urls = response.css('.enlaces_contenido').css('a::attr(href)').getall() - urls = [urls[0]] if self.sample else urls + urls = response.xpath('//a[@class="enlaces_contenido"]/@href').getall() + if self.sample: + urls = [urls[0]] for url in urls: - filename = urlparse(url).path.split('/')[-1] - yield scrapy.Request(url, meta={'kf_filename': filename}) + # URL looks like https://apiocds.colombiacompra.gov.co:8443/ArchivosSECOP/Archivos/SI2011.zip + yield self.build_request(url, formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/digiwhist_base.py b/kingfisher_scrapy/spiders/digiwhist_base.py index f6d117c0..28a903b6 100644 --- a/kingfisher_scrapy/spiders/digiwhist_base.py +++ b/kingfisher_scrapy/spiders/digiwhist_base.py @@ -15,7 +15,7 @@ def start_requests(self): @handle_error def parse(self, response): - yield self.build_file_from_response(response, post_to_api=False) + yield self.build_file_from_response(response, data_type='tar.gz', post_to_api=False) # Load a line at the time, pass it to API with tarfile.open(fileobj=BytesIO(response.body), mode="r:gz") as tar: diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py index 4fa46c4d..18098a6c 100644 --- a/kingfisher_scrapy/spiders/dominican_republic.py +++ b/kingfisher_scrapy/spiders/dominican_republic.py @@ -5,7 +5,7 @@ import scrapy from 
kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class DominicanRepublic(BaseSpider): @@ -30,7 +30,7 @@ def parse_list(self, response): for url in json_urls: if '/JSON_DGCP_' in url: - yield scrapy.Request('https:' + url, meta={'kf_filename': url.rsplit('/', 1)[-1]}) + yield self.build_request('https:' + url, formatter=components(-1)) @handle_error def parse(self, response): diff --git a/kingfisher_scrapy/spiders/france.py b/kingfisher_scrapy/spiders/france.py index a624bffe..21c6fbf6 100644 --- a/kingfisher_scrapy/spiders/france.py +++ b/kingfisher_scrapy/spiders/france.py @@ -1,10 +1,9 @@ -import hashlib import json import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error, parameters class France(SimpleSpider): @@ -12,36 +11,24 @@ class France(SimpleSpider): data_type = 'release_package' def start_requests(self): - yield scrapy.Request( - 'https://www.data.gouv.fr/api/1/datasets/?organization=534fff75a3a7292c64a77de4', - meta={'kf_filename': 'list.json'}, - callback=self.parse_list, - ) + # A CKAN API JSON response. + url = 'https://www.data.gouv.fr/api/1/datasets/?organization=534fff75a3a7292c64a77de4' + yield scrapy.Request(url, meta={'kf_filename': 'page-1.json'}, callback=self.parse_list) @handle_error def parse_list(self, response): - json_data = json.loads(response.text) - data = json_data['data'] - for item in data: - resources = item['resources'] - for resource in resources: + data = json.loads(response.text) + for item in data['data']: + for resource in item['resources']: description = resource['description'] - if description and (description.count("OCDS") or description.count("ocds")): - url = resource['url'] - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}, - ) + if description and 'ocds' in description.lower(): + yield self.build_request(resource['url'], formatter=components(-2)) if self.sample: break else: continue break else: - next_page = json_data.get('next_page') + next_page = data.get('next_page') if next_page: - yield scrapy.Request( - next_page, - meta={'kf_filename': hashlib.md5(next_page.encode('utf-8')).hexdigest() + '.json'}, - callback=self.parse_list - ) + yield self.build_request(next_page, formatter=parameters('page'), callback=self.parse_list) diff --git a/kingfisher_scrapy/spiders/georgia_records.py b/kingfisher_scrapy/spiders/georgia_records.py index 60438bbc..f9f7cef3 100644 --- a/kingfisher_scrapy/spiders/georgia_records.py +++ b/kingfisher_scrapy/spiders/georgia_records.py @@ -1,11 +1,14 @@ import scrapy from kingfisher_scrapy.base_spider import LinksSpider +from kingfisher_scrapy.util import parameters class GeorgiaRecords(LinksSpider): name = 'georgia_records' data_type = 'record_package' + next_page_formatter = parameters('page') def start_requests(self): - yield scrapy.Request('https://odapi.spa.ge/api/records.json', meta={'kf_filename': 'page1.json'}) + url = 'https://odapi.spa.ge/api/records.json' + yield scrapy.Request(url, meta={'kf_filename': 'page-1.json'}) diff --git a/kingfisher_scrapy/spiders/georgia_releases.py b/kingfisher_scrapy/spiders/georgia_releases.py index 6cf0263d..ee3ce55b 100644 --- a/kingfisher_scrapy/spiders/georgia_releases.py +++ b/kingfisher_scrapy/spiders/georgia_releases.py @@ -1,11 +1,14 @@ import scrapy from 
kingfisher_scrapy.base_spider import LinksSpider +from kingfisher_scrapy.util import parameters class GeorgiaReleases(LinksSpider): name = 'georgia_releases' data_type = 'release_package' + next_page_formatter = parameters('page') def start_requests(self): - yield scrapy.Request('https://odapi.spa.ge/api/releases.json', meta={'kf_filename': 'page1.json'}) + url = 'https://odapi.spa.ge/api/releases.json' + yield scrapy.Request(url, meta={'kf_filename': 'page-1.json'}) diff --git a/kingfisher_scrapy/spiders/honduras_cost.py b/kingfisher_scrapy/spiders/honduras_cost.py index a88e3e4b..329f0273 100644 --- a/kingfisher_scrapy/spiders/honduras_cost.py +++ b/kingfisher_scrapy/spiders/honduras_cost.py @@ -1,9 +1,6 @@ -import hashlib - import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error class HondurasCoST(SimpleSpider): @@ -11,22 +8,5 @@ class HondurasCoST(SimpleSpider): data_type = 'record_package' def start_requests(self): - yield scrapy.Request( - 'http://app.sisocs.org/protected/ocdsShow/', - meta={'kf_filename': 'list.html'}, - callback=self.parse_list - ) - - @handle_error - def parse_list(self, response): - btns = response.css('script').xpath('text()').getall() - for btn in btns: - if 'download-all' and 'url:' in btn: - array_url = btn.split() - for url in array_url: - if 'url:' in url and '?' not in url: - url = url.replace('"', '').replace(',', '').lstrip('url:') - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) + # Extracted from http://app.sisocs.org/protected/ocdsShow/ + yield scrapy.Request('http://67.207.88.38:8080/sisocs/records', meta={'kf_filename': 'all.json'}) diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py index 91e2347f..611b0421 100644 --- a/kingfisher_scrapy/spiders/honduras_oncae.py +++ b/kingfisher_scrapy/spiders/honduras_oncae.py @@ -1,9 +1,7 @@ -from urllib.parse import urlparse - import scrapy from kingfisher_scrapy.base_spider import ZipSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class HondurasONCAE(ZipSpider): @@ -22,11 +20,9 @@ def start_requests(self): @handle_error def parse_list(self, response): - urls = response.css(".article-content ul")\ - .xpath(".//a[contains(., '[json]')]/@href")\ - .getall() + urls = response.xpath('//a[contains(., "[json]")]/@href').getall() if self.sample: urls = [urls[0]] for url in urls: - filename = urlparse(url).path.split('/')[-1] - yield scrapy.Request(url, meta={'kf_filename': filename}) + # URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip + yield self.build_request(url, formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py index 55195d82..f0e1f9f3 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py +++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py @@ -3,7 +3,7 @@ import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class HondurasPortalBulkFiles(SimpleSpider): @@ -19,13 +19,10 @@ def start_requests(self): @handle_error def parse_list(self, response): - filelist = json.loads(response.text) - + items = json.loads(response.text) if self.sample: - url = filelist[0]['urls']['json'] - yield 
scrapy.Request(url, meta={'kf_filename': url.rsplit('/', 1)[-1]}) + items = [items[0]] - else: - for item in filelist: - url = item['urls']['json'] - yield scrapy.Request(url, meta={'kf_filename': url.rsplit('/', 1)[-1]}) + for item in items: + url = item['urls']['json'] + yield self.build_request(url, formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/honduras_portal_records.py b/kingfisher_scrapy/spiders/honduras_portal_records.py index 1d3fc5de..c5f5ec69 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_records.py +++ b/kingfisher_scrapy/spiders/honduras_portal_records.py @@ -1,8 +1,7 @@ -import hashlib - import scrapy from kingfisher_scrapy.base_spider import LinksSpider +from kingfisher_scrapy.util import parameters class HondurasPortalRecords(LinksSpider): @@ -10,9 +9,10 @@ class HondurasPortalRecords(LinksSpider): data_type = 'record_package' data_pointer = '/recordPackage' next_pointer = '/next' + next_page_formatter = parameters('page') download_delay = 0.9 def start_requests(self): url = 'http://www.contratacionesabiertas.gob.hn/api/v1/record/?format=json' - yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) + yield scrapy.Request(url, meta={'kf_filename': 'page-1.json'}) diff --git a/kingfisher_scrapy/spiders/honduras_portal_releases.py b/kingfisher_scrapy/spiders/honduras_portal_releases.py index a676383b..ca4c56f1 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_releases.py +++ b/kingfisher_scrapy/spiders/honduras_portal_releases.py @@ -1,8 +1,7 @@ -import hashlib - import scrapy from kingfisher_scrapy.base_spider import LinksSpider +from kingfisher_scrapy.util import parameters class HondurasPortalReleases(LinksSpider): @@ -10,9 +9,10 @@ class HondurasPortalReleases(LinksSpider): data_type = 'release_package' data_pointer = '/releasePackage' next_pointer = '/next' + next_page_formatter = parameters('page') download_delay = 0.9 def start_requests(self): url = 'http://www.contratacionesabiertas.gob.hn/api/v1/release/?format=json' - yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) + yield scrapy.Request(url, meta={'kf_filename': 'page-1.json'}) diff --git a/kingfisher_scrapy/spiders/indonesia_bandung.py b/kingfisher_scrapy/spiders/indonesia_bandung.py index b5af803b..9e1651bd 100644 --- a/kingfisher_scrapy/spiders/indonesia_bandung.py +++ b/kingfisher_scrapy/spiders/indonesia_bandung.py @@ -1,51 +1,40 @@ -import datetime -import hashlib import json - -import scrapy +from datetime import date from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, date_range_by_year, handle_error, join, parameters class IndonesiaBandung(BaseSpider): name = 'indonesia_bandung' def start_requests(self): - url = 'https://birms.bandung.go.id/api/packages/year/{}' - current_year = datetime.datetime.now().year + 1 - for year in range(2013, current_year): - yield scrapy.Request( - url.format(year), - meta={'kf_filename': 'start_requests'}, - callback=self.parse_data - ) + pattern = 'https://birms.bandung.go.id/api/packages/year/{}' + + start = 2013 + stop = date.today().year + + for year in date_range_by_year(start, stop): + yield self.build_request(pattern.format(year), formatter=components(-1), callback=self.parse_list) @handle_error - def parse_data(self, response): - json_data = json.loads(response.text) - items = json_data['data'] - for data in items: - url = data['uri'] + 
def parse_list(self, response): + data = json.loads(response.text) + for item in data['data']: + url = item['uri'] if url: - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}, - ) + yield self.build_request(url, formatter=components(-1)) if self.sample: break else: - next_page_url = json_data.get('next_page_url') + next_page_url = data.get('next_page_url') if next_page_url: - yield scrapy.Request( - next_page_url, - meta={'kf_filename': next_page_url.rsplit('/', 1)[-1] + '.json'}, - callback=self.parse_data - ) + yield self.build_request(next_page_url, formatter=join(components(-1), parameters('page')), + callback=self.parse_list) @handle_error def parse(self, response): - json_data = json.loads(response.text) - if len(json_data) == 0: + data = json.loads(response.text) + if len(data) == 0: return yield self.build_file_from_response(response, data_type='release') diff --git a/kingfisher_scrapy/spiders/kenya_makueni.py b/kingfisher_scrapy/spiders/kenya_makueni.py index 11d4d9ac..036f7db0 100644 --- a/kingfisher_scrapy/spiders/kenya_makueni.py +++ b/kingfisher_scrapy/spiders/kenya_makueni.py @@ -1,39 +1,32 @@ -import hashlib from math import ceil import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import handle_error, parameters class KenyaMakueni(SimpleSpider): name = 'kenya_makueni' data_type = 'release_package_list' - url = 'https://opencontracting.makueni.go.ke/api/ocds/package/all?pageSize={}&pageNumber={}' + step = 10 + + url = 'https://opencontracting.makueni.go.ke/api/ocds/package/all?pageSize={step}&pageNumber={page}' def start_requests(self): if self.sample: - url = self.url.format(10, 0) - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) + url = self.url.format(step=self.step, page=0) + yield self.build_request(url, formatter=parameters('pageNumber')) else: yield scrapy.Request( 'https://opencontracting.makueni.go.ke/api/ocds/release/count', - meta={'kf_filename': 'start_requests'}, + meta={'kf_filename': 'count.json'}, callback=self.parse_count ) @handle_error def parse_count(self, response): total = int(response.text) - page_size = 300 - - for page_number in range((ceil(total / page_size))): - url = self.url.format(page_size, page_number) - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) + for page in range(ceil(total / self.step)): + url = self.url.format(step=self.step, page=page) + yield self.build_request(url, formatter=parameters('pageNumber')) diff --git a/kingfisher_scrapy/spiders/malta.py b/kingfisher_scrapy/spiders/malta.py index 1c3dfe4b..70b32400 100644 --- a/kingfisher_scrapy/spiders/malta.py +++ b/kingfisher_scrapy/spiders/malta.py @@ -1,11 +1,10 @@ -import hashlib import json -from urllib.parse import urlparse +from urllib.parse import urlsplit import scrapy from kingfisher_scrapy.base_spider import ZipSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class Malta(ZipSpider): @@ -15,22 +14,17 @@ class Malta(ZipSpider): def start_requests(self): yield scrapy.Request( 'http://demowww.etenders.gov.mt/ocds/services/recordpackage/getrecordpackagelist', - meta={'kf_filename': 'start_requests'}, + meta={'kf_filename': 'list.json'}, callback=self.parse_list ) @handle_error def parse_list(self, response): - url = 
'http://demowww.etenders.gov.mt{}' - json_data = json.loads(response.text) - packages = json_data['packagesPerMonth'] - for package in packages: - parsed = urlparse(package) - path = parsed.path - if path: - yield scrapy.Request( - url.format(path), - meta={'kf_filename': hashlib.md5(path.encode('utf-8')).hexdigest() + '.json'} - ) - if self.sample: - break + urls = json.loads(response.text)['packagesPerMonth'] + if self.sample: + urls = [urls[0]] + + netloc = urlsplit(response.request.url).netloc + for url in urls: + # URL looks like http://malta-demo-server.eurodyn.com/ocds/services/recordpackage/getrecordpackage/2020/1 + yield self.build_request(urlsplit(url)._replace(netloc=netloc).geturl(), formatter=components(-2)) diff --git a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py index 8d1b98f8..64425413 100644 --- a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py +++ b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py @@ -1,38 +1,32 @@ import json +from math import ceil import scrapy -from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.base_spider import SimpleSpider +from kingfisher_scrapy.util import handle_error, parameters, replace_parameter -class MexicoAdministracionPublicaFederal(BaseSpider): +class MexicoAdministracionPublicaFederal(SimpleSpider): """ Bulk downloads: https://datos.gob.mx/busca/dataset/concentrado-de-contrataciones-abiertas-de-la-apf """ name = 'mexico_administracion_publica_federal' + data_type = 'record_package_list_in_results' def start_requests(self): - yield scrapy.Request( - 'https://api.datos.gob.mx/v1/contratacionesabiertas', - meta={'kf_filename': 'page1.json'} - ) + url = 'https://api.datos.gob.mx/v1/contratacionesabiertas' + yield scrapy.Request(url, meta={'kf_filename': 'page-1.json'}, callback=self.parse_list) @handle_error - def parse(self, response): - data = json.loads(response.text) + def parse_list(self, response): + yield from self.parse(response) - # Actual data - yield self.build_file_from_response(response, data_type='record_package_list_in_results') - - # Load more pages? 
- if data['pagination']['page'] == 1 and not self.sample: + if not self.sample: + data = json.loads(response.text) + page = data['pagination']['page'] total = data['pagination']['total'] - page = 1 limit = data['pagination']['pageSize'] - while ((page - 1) * limit) < total: - yield scrapy.Request( - 'https://api.datos.gob.mx/v1/contratacionesabiertas?page=%d' % page, - meta={'kf_filename': 'page' + str(page) + '.json'} - ) - page += 1 + for page in range(page + 1, ceil(total / limit)): + url = replace_parameter(response.request.url, 'page', page) + yield self.build_request(url, formatter=parameters('page')) diff --git a/kingfisher_scrapy/spiders/mexico_cdmx.py b/kingfisher_scrapy/spiders/mexico_cdmx.py index 7a96ae64..363e6c64 100644 --- a/kingfisher_scrapy/spiders/mexico_cdmx.py +++ b/kingfisher_scrapy/spiders/mexico_cdmx.py @@ -3,7 +3,7 @@ import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class MexicoCDMXSource(SimpleSpider): @@ -19,9 +19,10 @@ def start_requests(self): @handle_error def parse_list(self, response): - data = json.loads(response.text) + items = json.loads(response.text) if self.sample: - data = [data[0]] + items = [items[0]] - for data_item in data: - yield scrapy.Request(data_item['uri'], meta={'kf_filename': 'id%s.json' % data_item['id']}) + for item in items: + # URL looks like http://www.contratosabiertos.cdmx.gob.mx/api/contrato/OCDS-87SD3T-SEDEMA-LP-0027-2017 + yield self.build_request(item['uri'], formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py b/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py index 08288d21..5d4ba879 100644 --- a/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py +++ b/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py @@ -8,7 +8,5 @@ class MexicoGrupoAeroporto(SimpleSpider): data_type = 'release_package' def start_requests(self): - yield scrapy.Request( - 'http://gacmda.gacm.mx:8880/files/opendata/coleccion/concentrado05032019RELEASE.json', - meta={'kf_filename': 'concentrado05032019RELEASE.json'} - ) + url = 'http://gacmda.gacm.mx:8880/files/opendata/coleccion/concentrado05032019RELEASE.json' + yield scrapy.Request(url, meta={'kf_filename': 'all.json'}) diff --git a/kingfisher_scrapy/spiders/mexico_inai.py b/kingfisher_scrapy/spiders/mexico_inai.py index 3cb7bbd5..a58cbb09 100644 --- a/kingfisher_scrapy/spiders/mexico_inai.py +++ b/kingfisher_scrapy/spiders/mexico_inai.py @@ -1,10 +1,9 @@ -import hashlib import json import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error, parameters class MexicoINAI(SimpleSpider): @@ -13,6 +12,7 @@ class MexicoINAI(SimpleSpider): encoding = 'utf-8-sig' def start_requests(self): + # A CKAN API JSON response. 
yield scrapy.Request( 'https://datos.gob.mx/busca/api/3/action/package_search?q=organization:inai&rows=500', meta={'kf_filename': 'list.json'}, @@ -25,22 +25,12 @@ def parse_list(self, response): for result in datas['result']['results']: for resource in result['resources']: if resource['format'] == 'JSON': - kf_filename = 'redirect-' + hashlib.md5(resource['url'].encode('utf-8')).hexdigest() + '.json' - yield scrapy.Request( - resource['url'], - meta={ - 'kf_filename': kf_filename, - 'dont_redirect': True - }, - callback=self.parse_redirect - ) + yield self.build_request(resource['url'], formatter=components(-1), meta={'dont_redirect': True}, + callback=self.parse_redirect) def parse_redirect(self, response): if response.status == 301: - url = response.headers['Location'].decode("utf-8").replace("open?", "uc?export=download&") - yield scrapy.Request( - url, - meta={'kf_filename': 'data-' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) + url = response.headers['Location'].decode('utf-8').replace('open?', 'uc?export=download&') + yield self.build_request(url, formatter=parameters('id')) else: yield self.build_file_error_from_response(response) diff --git a/kingfisher_scrapy/spiders/mexico_jalisco.py b/kingfisher_scrapy/spiders/mexico_jalisco.py index 7aef33eb..07485b43 100644 --- a/kingfisher_scrapy/spiders/mexico_jalisco.py +++ b/kingfisher_scrapy/spiders/mexico_jalisco.py @@ -1,10 +1,9 @@ -import hashlib import json import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class MexicoJalisco(SimpleSpider): @@ -20,23 +19,22 @@ def start_requests(self): @handle_error def parse_list(self, response): - datas = json.loads(response.text) + items = json.loads(response.text) if self.sample: - datas = [datas[0]] - for data in datas: + items = [items[0]] + + for item in items: yield scrapy.Request( - data['URIContract'], - meta={'kf_filename': 'id%s.json' % data['ocid']}, + item['URIContract'], + meta={'kf_filename': f"id{item['ocid']}.json"}, callback=self.parse_record_package ) @handle_error def parse_record_package(self, response): - json_data = json.loads(response.text) - if 'packages' in json_data: - for url in json_data['packages']: - yield scrapy.Request( - url, - meta={'kf_filename': 'packages-%s.json' % hashlib.md5(url.encode('utf-8')).hexdigest()} - ) yield self.build_file_from_response(response, data_type='record_package') + + data = json.loads(response.text) + if 'packages' in data: + for url in data['packages']: + yield self.build_request(url, formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/mexico_quien_es_quien.py b/kingfisher_scrapy/spiders/mexico_quien_es_quien.py index c999d767..920537e2 100644 --- a/kingfisher_scrapy/spiders/mexico_quien_es_quien.py +++ b/kingfisher_scrapy/spiders/mexico_quien_es_quien.py @@ -1,52 +1,40 @@ -import hashlib import json from math import ceil import scrapy from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import handle_error, parameters class MexicoQuienEsQuien(BaseSpider): name = 'mexico_quien_es_quien' download_delay = 0.9 - url = 'https://api.quienesquien.wiki/v2/contracts?limit={}&offset={}' def start_requests(self): - if self.sample: - limit = 10 - offset = 0 - yield scrapy.Request( - self.url.format(limit, offset), - meta={'kf_filename': 'sample.json'} - ) - else: - yield scrapy.Request( - 
'https://api.quienesquien.wiki/v2/sources', - meta={'kf_filename': 'start_requests'}, - callback=self.parse_count - ) + yield scrapy.Request( + 'https://api.quienesquien.wiki/v2/sources', + meta={'kf_filename': 'list.json'}, + callback=self.parse_list + ) @handle_error - def parse_count(self, response): + def parse_list(self, response): + pattern = 'https://api.quienesquien.wiki/v2/contracts?limit={limit}&offset={offset}' limit = 1000 - json_data = json.loads(response.text) - count_list = json_data['data'] - count = int(count_list[0]['collections']['contracts']['count']) + count = json.loads(response.text)['data'][0]['collections']['contracts']['count'] for offset in range(ceil(count / limit)): - yield scrapy.Request( - self.url.format(limit, (offset * limit)), - meta={'kf_filename': hashlib.md5((self.url + - str(offset)).encode('utf-8')).hexdigest() + '.json'} - ) + url = pattern.format(limit=limit, offset=offset * limit) + yield self.build_request(url, formatter=parameters('offset')) + if self.sample: + break @handle_error def parse(self, response): - json_data = json.loads(response.text) + data = json.loads(response.text) yield self.build_file_from_response( response, - data=json.dumps(json_data['data']).encode(), + data=json.dumps(data['data']).encode(), data_type='record_package_list' ) diff --git a/kingfisher_scrapy/spiders/moldova.py b/kingfisher_scrapy/spiders/moldova.py index fb146a61..2883379c 100644 --- a/kingfisher_scrapy/spiders/moldova.py +++ b/kingfisher_scrapy/spiders/moldova.py @@ -1,62 +1,40 @@ import json -import scrapy +from kingfisher_scrapy.base_spider import SimpleSpider +from kingfisher_scrapy.util import components, handle_error, join, parameters, replace_parameter -from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error - -class Moldova(BaseSpider): +class Moldova(SimpleSpider): name = 'moldova' - - endpoints = {"budgets": "https://public.mtender.gov.md/budgets/", - # From https://github.com/open-contracting/kingfisher-collect/issues/192#issuecomment-529928683 - # The /tenders/plans endpoint appeared to return exactly the same data as the /tenders endpoint except - # that when given an OCID parameter it returned an error message. It may be that /tenders/plans just - # lists a subset of /tenders but this isn't clear. - # "plans": "https://public.mtender.gov.md/tenders/plan/", - "tenders": "https://public.mtender.gov.md/tenders/"} + data_type = 'record_package' def start_requests(self): - for endpoint, url in self.endpoints.items(): - yield scrapy.Request( - url, - meta={'kf_filename': 'meta-{}-start.json'.format(endpoint), 'endpoint': endpoint, 'data': False} - ) + endpoints = { + 'budgets': 'https://public.mtender.gov.md/budgets/', + # From https://github.com/open-contracting/kingfisher-collect/issues/192#issuecomment-529928683 + # The /tenders/plans endpoint appeared to return exactly the same data as the /tenders endpoint except + # that when given an OCID parameter it returned an error message. It may be that /tenders/plans just + # lists a subset of /tenders but this isn't clear. 
+ # 'plans': 'https://public.mtender.gov.md/tenders/plan/', + 'tenders': 'https://public.mtender.gov.md/tenders/', + } + + for endpoint, url in endpoints.items(): + yield self.build_request(url, formatter=components(-1), callback=self.parse_list) @handle_error - def parse(self, response): - if response.request.meta['data']: - yield self.build_file_from_response(response, data_type='record_package') - else: - self.build_file_from_response(response) - json_data = json.loads(response.text) - offset = json_data.get('offset') - # not having an offset in the data means the data has come to an end. - if not offset: - return - - endpoint = response.request.meta['endpoint'] - endpoint_url = self.endpoints[endpoint] - - for data in json_data.get('data', []): - yield scrapy.Request( - endpoint_url + data['ocid'], - meta={ - 'kf_filename': 'data-{}-{}.json'.format(endpoint, data['ocid']), - 'endpoint': endpoint, - 'data': True, - } - ) - - if self.sample: - return - - yield scrapy.Request( - endpoint_url + '?offset=' + offset, - meta={ - 'kf_filename': 'meta-{}-{}.json'.format(endpoint, offset), - 'endpoint': endpoint, - 'data': False, - } - ) + def parse_list(self, response): + data = json.loads(response.text) + # The last page returns an empty JSON object. + if not data: + return + + for item in data['data']: + url = replace_parameter(response.request.url, 'offset', None) + item['ocid'] + yield self.build_request(url, formatter=components(-2)) + + if self.sample: + return + + url = replace_parameter(response.request.url, 'offset', data['offset']) + yield self.build_request(url, formatter=join(components(-1), parameters('offset')), callback=self.parse_list) diff --git a/kingfisher_scrapy/spiders/moldova_old.py b/kingfisher_scrapy/spiders/moldova_old.py index a8324c21..db50f1d7 100644 --- a/kingfisher_scrapy/spiders/moldova_old.py +++ b/kingfisher_scrapy/spiders/moldova_old.py @@ -1,6 +1,5 @@ -import scrapy - from kingfisher_scrapy.base_spider import SimpleSpider +from kingfisher_scrapy.util import components, date_range_by_year class MoldovaOld(SimpleSpider): @@ -8,14 +7,12 @@ class MoldovaOld(SimpleSpider): data_type = 'release_package' def start_requests(self): + pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}' + + start = 2012 + stop = 2018 if self.sample: - yield scrapy.Request( - 'http://opencontracting.date.gov.md/ocds-api/year/2017', - meta={'kf_filename': 'sample.json'} - ) - else: - for year in range(2012, 2018): - yield scrapy.Request( - 'http://opencontracting.date.gov.md/ocds-api/year/%d' % year, - meta={'kf_filename': 'year-%d.json' % year} - ) + start = 2018 + + for year in date_range_by_year(start, stop): + yield self.build_request(pattern.format(year), formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/nepal_dhangadhi.py b/kingfisher_scrapy/spiders/nepal_dhangadhi.py index 1b0f635a..2e0fbdf6 100644 --- a/kingfisher_scrapy/spiders/nepal_dhangadhi.py +++ b/kingfisher_scrapy/spiders/nepal_dhangadhi.py @@ -1,10 +1,9 @@ -import hashlib import json import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class NepalDhangadhi(SimpleSpider): @@ -20,14 +19,9 @@ def start_requests(self): @handle_error def parse_list(self, response): - url = 'https://admin.ims.susasan.org/ocds/json/dhangadhi-{}.json' - json_data = json.loads(response.text) - fiscal_years = json_data['data']['fiscal_years'] - for item in fiscal_years: - fy = item['name'] - yield 
scrapy.Request( - url.format(fy), - meta={'kf_filename': hashlib.md5((url + fy).encode('utf-8')).hexdigest() + '.json'}, - ) + pattern = 'https://admin.ims.susasan.org/ocds/json/dhangadhi-{}.json' + data = json.loads(response.text) + for item in data['data']['fiscal_years']: + yield self.build_request(pattern.format(item['name']), formatter=components(-1)) if self.sample: break diff --git a/kingfisher_scrapy/spiders/nepal_portal.py b/kingfisher_scrapy/spiders/nepal_portal.py index 3cc70953..3b91a876 100644 --- a/kingfisher_scrapy/spiders/nepal_portal.py +++ b/kingfisher_scrapy/spiders/nepal_portal.py @@ -1,9 +1,7 @@ -import datetime -import hashlib - -import scrapy +from datetime import date from kingfisher_scrapy.base_spider import SimpleSpider +from kingfisher_scrapy.util import components, date_range_by_year class NepalPortal(SimpleSpider): @@ -11,18 +9,14 @@ class NepalPortal(SimpleSpider): data_type = 'release_package' def start_requests(self): + pattern = 'http://ppip.gov.np/bulk-download/{}' + if self.sample: - current_year = 2018 - end_year = 2018 + start = 2018 + stop = 2018 else: - current_year = 2013 - now = datetime.datetime.now() - end_year = now.year + start = 2012 + stop = date.today().year # HTTP 500 after 2018 - while current_year <= end_year: - url = 'http://ppip.gov.np/bulk-download/{}'.format(current_year) - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) - current_year += 1 + for year in date_range_by_year(start, stop): + yield self.build_request(pattern.format(year), formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/nigeria_portal.py b/kingfisher_scrapy/spiders/nigeria_portal.py index 82eac852..fc8509f0 100644 --- a/kingfisher_scrapy/spiders/nigeria_portal.py +++ b/kingfisher_scrapy/spiders/nigeria_portal.py @@ -1,5 +1,3 @@ -import hashlib - import scrapy from kingfisher_scrapy.base_spider import SimpleSpider @@ -16,7 +14,7 @@ class NigeriaPortal(SimpleSpider): def start_requests(self): yield scrapy.Request( 'http://nocopo.bpp.gov.ng/OpenData.aspx', - meta={'kf_filename': 'list.html'}, + meta={'kf_filename': 'form.html'}, callback=self.parse_list ) @@ -36,8 +34,4 @@ def parse_list(self, response): if self.sample: break - yield scrapy.FormRequest.from_response( - response, - formdata=formdata, - meta={'kf_filename': hashlib.md5(response.url.encode('utf-8')).hexdigest() + '.json'} - ) + yield scrapy.FormRequest.from_response(response, formdata=formdata, meta={'kf_filename': 'all.json'}) diff --git a/kingfisher_scrapy/spiders/openopps.py b/kingfisher_scrapy/spiders/openopps.py index c3399af9..e5d6ee32 100644 --- a/kingfisher_scrapy/spiders/openopps.py +++ b/kingfisher_scrapy/spiders/openopps.py @@ -1,4 +1,3 @@ -import hashlib import json from datetime import datetime, timedelta from math import ceil @@ -7,6 +6,7 @@ from kingfisher_scrapy.base_spider import BaseSpider from kingfisher_scrapy.exceptions import AuthenticationError +from kingfisher_scrapy.util import parameters class OpenOpps(BaseSpider): @@ -40,9 +40,8 @@ class OpenOpps(BaseSpider): reauthenticating = False # flag for request a new token start_time = None - base_page_url = \ - 'https://api.openopps.com/api/ocds/?' 
\ - 'format={}&ordering={}&page_size={}&releasedate__gte={{}}&releasedate__lte={{}}' + base_page_url = 'https://api.openopps.com/api/ocds/?format=json&ordering=releasedate&page_size=1000&' \ + 'releasedate__gte={}&releasedate__lte={}' custom_settings = { 'DOWNLOADER_MIDDLEWARES': { @@ -82,7 +81,7 @@ def parse_access_token(self, response): r = json.loads(response.text) token = r.get('token') if token: - self.logger.info('New access token: {}'.format(token)) + self.logger.info(f'New access token: {token}') self.access_token = 'JWT ' + token self.start_time = datetime.now() # If the request is initial authentication, start requests @@ -92,19 +91,15 @@ def parse_access_token(self, response): self.reauthenticating = False else: self.logger.error( - 'Authentication failed. Status code: {}. {}'.format(response.status, response.text)) + f'Authentication failed. Status code: {response.status}. {response.text}') raise AuthenticationError() else: self.logger.error( - 'Authentication failed. Status code: {}. {}'.format(response.status, response.text)) + f'Authentication failed. Status code: {response.status}. {response.text}') raise AuthenticationError() def start_requests_pages(self): - page_size = 1000 - page_format = 'json' - ordering = 'releasedate' search_h = 24 # start splitting one day search - self.base_page_url = self.base_page_url.format(page_format, ordering, page_size) # Case if we want to download a sample if self.sample: @@ -130,11 +125,10 @@ def start_requests_pages(self): yield from self.request_range_per_day(start_date, end_date, search_h) def request_range(self, start_date, end_date, search_h): - url = self.base_page_url.format(start_date, end_date) - return scrapy.Request( - url, + return self.build_request( + self.base_page_url.format(start_date, end_date), + formatter=parameters('releasedate__gte', 'releasedate__lte'), meta={ - 'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json', 'release_date': start_date, 'search_h': search_h, }, @@ -171,10 +165,10 @@ def parse(self, response): next_url = results.get('next') if next_url: - yield scrapy.Request( + yield self.build_request( next_url, + formatter=parameters('releasedate__gte', 'releasedate__lte', 'page'), meta={ - 'kf_filename': hashlib.md5(next_url.encode('utf-8')).hexdigest() + '.json', 'release_date': release_date, 'search_h': search_h, }, @@ -184,7 +178,7 @@ def parse(self, response): # Tells if we have to re-authenticate before the token expires time_diff = datetime.now() - self.start_time if not self.reauthenticating and time_diff.total_seconds() > self.request_time_limit * 60: - self.logger.info('Time_diff: {}'.format(time_diff.total_seconds())) + self.logger.info(f'Time_diff: {time_diff.total_seconds()}') self.reauthenticating = True yield scrapy.Request( 'https://api.openopps.com/api/api-token-auth/', @@ -220,13 +214,12 @@ def parse(self, response): if len(start_hour_list) != len(end_hour_list): end_hour_list.append(last_hour) - self.logger.info('Changing filters, split in {}: {}.'.format(parts, response.request.url)) + self.logger.info(f'Changing filters, split in {parts}: {response.request.url}.') for i in range(len(start_hour_list)): - url = self.base_page_url.format(start_hour_list[i], end_hour_list[i]) - yield scrapy.Request( - url, + yield self.build_request( + self.base_page_url.format(start_hour_list[i], end_hour_list[i]), + formatter=parameters('releasedate__gte', 'releasedate__lte'), meta={ - 'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json', 'release_date': 
start_hour_list[i], # release_date with star hour 'last_hour': end_hour_list[i], # release_date with last hour 'search_h': split_h, # new search range @@ -237,7 +230,7 @@ def parse(self, response): # Message for pages that exceed the 10,000 search results in the range of one hour # These are pages with status 500 and 'page=11' in the URL request if response.status == 500 and response.request.url.count("page=11"): - self.logger.info('Status: {}. Results exceeded in a range of one hour, we save the ' - 'first 10,000 data for: {}'.format(response.status, response.request.url)) + self.logger.info(f'Status: {response.status}. Results exceeded in a range of one hour, we save the ' + f'first 10,000 data for: {response.request.url}') else: yield self.build_file_error_from_response(response) diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py index 4c6e265f..30580bfb 100644 --- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py +++ b/kingfisher_scrapy/spiders/paraguay_dncp_base.py @@ -6,7 +6,7 @@ from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.exceptions import AuthenticationError -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error, parameters class ParaguayDNCPBaseSpider(SimpleSpider): @@ -22,8 +22,8 @@ class ParaguayDNCPBaseSpider(SimpleSpider): last_request = None request_time_limit = 13 # in minutes base_url = 'https://contrataciones.gov.py/datos/api/v3/doc' - base_page_url = '{}/search/processes?fecha_desde=2010-01-01'.format(base_url) - auth_url = '{}/oauth/token'.format(base_url) + base_page_url = f'{base_url}/search/processes?fecha_desde=2010-01-01' + auth_url = f'{base_url}/oauth/token' request_token = None max_attempts = 10 data_type = None @@ -54,10 +54,10 @@ def start_requests(self): from_date = self.from_date.strftime(self.date_format) self.base_page_url = '{}/search/processes?tipo_fecha=fecha_release&fecha_desde={}'\ .format(self.base_url, from_date) - yield scrapy.Request( + yield self.build_request( self.base_page_url, + formatter=parameters('fecha_desde'), meta={ - 'kf_filename': '{}-1.json'.format(from_date), 'from_date': from_date, }, # send duplicate requests when the token expired and in the continuation of last_request saved. @@ -69,7 +69,7 @@ def request_access_token(self): """ Requests a new access token """ attempt = 0 self.start_time = datetime.now() - self.logger.info('Requesting access token, attempt {} of {}'.format(attempt + 1, self.max_attempts)) + self.logger.info(f'Requesting access token, attempt {attempt + 1} of {self.max_attempts}') return scrapy.Request( self.auth_url, @@ -87,7 +87,7 @@ def parse_access_token(self, response): r = json.loads(response.text) token = r.get('access_token') if token: - self.logger.info('New access token: {}'.format(token)) + self.logger.info(f'New access token: {token}') self.access_token = token # continue scraping where it stopped after getting the token yield self.last_request @@ -98,10 +98,7 @@ def parse_access_token(self, response): self.auth_failed = True raise AuthenticationError() else: - self.logger.info('Requesting access token, attempt {} of {}'.format( - attempt + 1, - self.max_attempts) - ) + self.logger.info(f'Requesting access token, attempt {attempt + 1} of {self.max_attempts}') return scrapy.Request( self.auth_url, method='POST', @@ -113,7 +110,7 @@ def parse_access_token(self, response): priority=1000 ) else: - self.logger.error('Authentication failed. 
Status code: {}'.format(response.status)) + self.logger.error(f'Authentication failed. Status code: {response.status}') self.auth_failed = True raise AuthenticationError() @@ -121,18 +118,14 @@ def parse_access_token(self, response): def parse_pages(self, response): content = json.loads(response.text) for url in self.get_files_to_download(content): - yield scrapy.Request( - url, - dont_filter=True, - meta={'kf_filename': url.split('/')[-1] + '.json'} - ) + yield self.build_request(url, formatter=components(-1), dont_filter=True) pagination = content['pagination'] if pagination['current_page'] < pagination['total_pages'] and not self.sample: page = pagination['current_page'] + 1 - url = '{}&page={}'.format(self.base_page_url, page) - yield scrapy.Request( + url = f'{self.base_page_url}&page={page}' + yield self.build_request( url, - meta={'kf_filename': '{}-{}.json'.format(response.request.meta['from_date'], page)}, + formatter=parameters('fecha_desde', 'page'), dont_filter=True, callback=self.parse_pages ) @@ -148,5 +141,5 @@ def expires_soon(self, time_diff): """ if time_diff.total_seconds() < ParaguayDNCPBaseSpider.request_time_limit * 60: return False - self.logger.info('Time_diff: {}'.format(time_diff.total_seconds())) + self.logger.info(f'Time_diff: {time_diff.total_seconds()}') return True diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_records.py b/kingfisher_scrapy/spiders/paraguay_dncp_records.py index 6325cef6..721e666a 100644 --- a/kingfisher_scrapy/spiders/paraguay_dncp_records.py +++ b/kingfisher_scrapy/spiders/paraguay_dncp_records.py @@ -20,4 +20,4 @@ class ParaguayDNCPRecords(ParaguayDNCPBaseSpider): def get_files_to_download(self, content): for record in content['records']: - yield '{}/ocds/record/{}'.format(self.base_url, record['ocid']) + yield f"{self.base_url}/ocds/record/{record['ocid']}" diff --git a/kingfisher_scrapy/spiders/paraguay_hacienda.py b/kingfisher_scrapy/spiders/paraguay_hacienda.py index d10ebb98..522dbde9 100644 --- a/kingfisher_scrapy/spiders/paraguay_hacienda.py +++ b/kingfisher_scrapy/spiders/paraguay_hacienda.py @@ -5,7 +5,7 @@ from kingfisher_scrapy.base_spider import BaseSpider from kingfisher_scrapy.exceptions import AuthenticationError -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error, parameters class ParaguayHacienda(BaseSpider): @@ -42,10 +42,10 @@ def from_crawler(cls, crawler, *args, **kwargs): def start_requests(self): # Paraguay Hacienda has a service that return all the ids that we need to get the releases packages # so we first iterate over this list that is paginated - yield scrapy.Request( + yield self.build_request( self.base_list_url.format(1), + formatter=parameters('page'), meta={ - 'kf_filename': 'list-1.json', 'meta': True, 'first': True, }, @@ -56,16 +56,16 @@ def start_requests(self): @handle_error def parse(self, response): data = json.loads(response.text) - base_url = 'https://datos.hacienda.gov.py:443/odmh-api-v1/rest/api/v1/ocds/release-package/{}' + pattern = 'https://datos.hacienda.gov.py:443/odmh-api-v1/rest/api/v1/ocds/release-package/{}' # If is the first URL, we need to iterate over all the pages to get all the process ids to query if response.request.meta['first'] and not self.sample: - total_pages = data['meta']['totalPages'] - for page in range(2, total_pages+1): - yield scrapy.Request( + total = data['meta']['totalPages'] + for page in range(2, total + 1): + yield self.build_request( self.base_list_url.format(page), + formatter=parameters('page'), 
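            # Illustrative aside, not part of this patch: parameters('page') takes the value
            # of the 'page' query string parameter, so a listing URL ending in a hypothetical
            # ?page=3 would be saved by build_request() as 'page-3.json'.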
meta={ - 'kf_filename': 'list-{}.json'.format(page), 'meta': True, 'first': False, }, @@ -82,10 +82,10 @@ def parse(self, response): for row in data['results']: if row['idLlamado'] and row['idLlamado'] not in self.release_ids: self.release_ids.append(row['idLlamado']) - yield scrapy.Request( - base_url.format(row['idLlamado']), + yield self.build_request( + pattern.format(row['idLlamado']), + formatter=components(-1), meta={ - 'kf_filename': 'release-{}.json'.format(row['idLlamado']), 'meta': False, 'first': False, }, @@ -98,7 +98,7 @@ def request_access_token(self): """ Requests a new access token """ attempt = 0 self.start_time = datetime.now() - self.logger.info('Requesting access token, attempt {} of {}'.format(attempt + 1, self.max_attempts)) + self.logger.info(f'Requesting access token, attempt {attempt + 1} of {self.max_attempts}') payload = {"clientSecret": self.client_secret} return scrapy.Request( @@ -117,7 +117,7 @@ def parse_access_token(self, response): r = json.loads(response.text) token = r.get('accessToken') if token: - self.logger.info('New access token: {}'.format(token)) + self.logger.info(f'New access token: {token}') self.access_token = 'Bearer ' + token # continue scraping where it stopped after getting the token yield self.last_request @@ -128,10 +128,7 @@ def parse_access_token(self, response): self.auth_failed = True raise AuthenticationError() else: - self.logger.info('Requesting access token, attempt {} of {}'.format( - attempt + 1, - self.max_attempts) - ) + self.logger.info(f'Requesting access token, attempt {attempt + 1} of {self.max_attempts}') return scrapy.Request( "https://datos.hacienda.gov.py:443/odmh-api-v1/rest/api/v1/auth/token", method='POST', @@ -143,7 +140,7 @@ def parse_access_token(self, response): priority=1000 ) else: - self.logger.error('Authentication failed. Status code: {}'.format(response.status)) + self.logger.error(f'Authentication failed. Status code: {response.status}') self.auth_failed = True raise AuthenticationError() @@ -153,5 +150,5 @@ def expires_soon(self, time_diff): """ if time_diff.total_seconds() < ParaguayHacienda.request_time_limit * 60: return False - self.logger.info('Time_diff: {}'.format(time_diff.total_seconds())) + self.logger.info(f'Time_diff: {time_diff.total_seconds()}') return True diff --git a/kingfisher_scrapy/spiders/portugal.py b/kingfisher_scrapy/spiders/portugal.py index 7a3b5f5c..fdf25ba7 100644 --- a/kingfisher_scrapy/spiders/portugal.py +++ b/kingfisher_scrapy/spiders/portugal.py @@ -1,10 +1,9 @@ -import hashlib import json import scrapy from kingfisher_scrapy.base_spider import ZipSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class Portugal(ZipSpider): @@ -17,26 +16,16 @@ class Portugal(ZipSpider): download_timeout = 9999 def start_requests(self): - url = 'https://dados.gov.pt/api/1/datasets/?q=ocds&organization={}&page_size={}' - id = '5ae97fa2c8d8c915d5faa3bf' - page_size = 20 - yield scrapy.Request( - url.format(id, page_size), - meta={'kf_filename': 'list.json'}, - callback=self.parse_list - ) + # A CKAN API JSON response. 
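        # Illustrative aside, not part of this patch: parse_list() below names each matching
        # resource with components(-2), i.e. the last two non-empty path segments joined with
        # a hyphen. For a hypothetical resource URL like
        # https://example.com/datasets/ocds/releases.zip, the saved name would be
        # 'ocds-releases.zip' (build_request() keeps .zip and .json extensions as-is).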
+ url = 'https://dados.gov.pt/api/1/datasets/?q=ocds&organization=5ae97fa2c8d8c915d5faa3bf&page_size=20' + yield scrapy.Request(url, meta={'kf_filename': 'page-1.json'}, callback=self.parse_list) @handle_error def parse_list(self, response): - datas = json.loads(response.text) - for data in datas['data']: - for resource in data['resources']: + data = json.loads(response.text) + for item in data['data']: + for resource in item['resources']: description = resource['description'] - url = resource['url'] - if description.count("OCDS") or description.count("ocds"): - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) - if self.sample: - break + if description and 'ocds' in description.lower(): + # Presently, only one URL matches. + yield self.build_request(resource['url'], formatter=components(-2)) diff --git a/kingfisher_scrapy/spiders/scotland.py b/kingfisher_scrapy/spiders/scotland.py index 76b365dc..e8cb5973 100644 --- a/kingfisher_scrapy/spiders/scotland.py +++ b/kingfisher_scrapy/spiders/scotland.py @@ -1,53 +1,51 @@ -import datetime - -import scrapy +from datetime import date, timedelta from kingfisher_scrapy.base_spider import SimpleSpider +from kingfisher_scrapy.util import parameters class Scotland(SimpleSpider): name = 'scotland' data_type = 'release_package' - notice_types = [ - 1, # OJEU - F1 - Prior Information Notice - 2, # OJEU - F2 - Contract Notice - 3, # OJEU - F3 - Contract Award Notice - 4, # OJEU - F4 - Prior Information Notice(Utilities) - 5, # OJEU - F5 - Contract Notice(Utilities) - 6, # OJEU - F6 - Contract Award Notice(Utilities) - 7, # OJEU - F7 - Qualification Systems(Utilities) - 12, # OJEU - F12 - Design Contest Notice - 13, # OJEU - F13 - Results Of Design Contest - 14, # OJEU - F14 - Corrigendum - 15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice - 20, # OJEU - F20 - Modification Notice - 21, # OJEU - F21 - Social And other Specific Services(Public Contracts) - 22, # OJEU - F22 - Social And other Specific Services(Utilities) - 23, # OJEU - F23 - Social And other Specific Services(Concessions) - 24, # OJEU - F24 - Concession Notice - 25, # OJEU - F25 - Concession Award Notice - 101, # Site Notice - Website Contract Notice - 102, # Site Notice - Website Prior Information Notice - 103, # Site Notice - Website Contract Award Notice - 104, # Site Notice - Quick Quote Award - ] - def start_requests(self): - format_string = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={}&outputType=1¬iceType={}' + pattern = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={}&outputType=1¬iceType={}' + + notice_types = [ + 1, # OJEU - F1 - Prior Information Notice + 2, # OJEU - F2 - Contract Notice + 3, # OJEU - F3 - Contract Award Notice + 4, # OJEU - F4 - Prior Information Notice(Utilities) + 5, # OJEU - F5 - Contract Notice(Utilities) + 6, # OJEU - F6 - Contract Award Notice(Utilities) + 7, # OJEU - F7 - Qualification Systems(Utilities) + 12, # OJEU - F12 - Design Contest Notice + 13, # OJEU - F13 - Results Of Design Contest + 14, # OJEU - F14 - Corrigendum + 15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice + 20, # OJEU - F20 - Modification Notice + 21, # OJEU - F21 - Social And other Specific Services(Public Contracts) + 22, # OJEU - F22 - Social And other Specific Services(Utilities) + 23, # OJEU - F23 - Social And other Specific Services(Concessions) + 24, # OJEU - F24 - Concession Notice + 25, # OJEU - F25 - Concession Award Notice + 101, # Site Notice - Website Contract 
Notice + 102, # Site Notice - Website Prior Information Notice + 103, # Site Notice - Website Contract Award Notice + 104, # Site Notice - Quick Quote Award + ] + + now = date.today() - now = datetime.datetime.today() - if self.sample: - marker = now - datetime.timedelta(days=14) - for notice_type in self.notice_types: - yield scrapy.Request(format_string.format(marker, notice_type), - meta={'kf_filename': 'sample_{}.json'.format(notice_type)}) - else: - # It's meant to go back a year, but in testing it seemed to be year minus one day! - marker = now - datetime.timedelta(days=364) - while marker <= now: - datestring = '{:04d}-{:02d}-{:02d}'.format(marker.year, marker.month, marker.day) - for notice_type in self.notice_types: - yield scrapy.Request(format_string.format(datestring, notice_type), - meta={'kf_filename': '{}_type_{}.json'.format(datestring, notice_type)}) - marker = marker + datetime.timedelta(days=14) + # It's meant to go back a year, but in testing it seemed to be year minus one day! + marker = now - timedelta(days=364) + while marker <= now: + datestring = '{:04d}-{:02d}-{:02d}'.format(marker.year, marker.month, marker.day) + for notice_type in notice_types: + yield self.build_request( + pattern.format(datestring, notice_type), + formatter=parameters('noticeType', 'dateFrom') + ) + marker = marker + timedelta(days=14) + if self.sample: + break diff --git a/kingfisher_scrapy/spiders/uganda_releases.py b/kingfisher_scrapy/spiders/uganda_releases.py index 1e8f5305..3a629c59 100644 --- a/kingfisher_scrapy/spiders/uganda_releases.py +++ b/kingfisher_scrapy/spiders/uganda_releases.py @@ -1,10 +1,9 @@ -import hashlib import json import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error, join, parameters class Uganda(SimpleSpider): @@ -16,54 +15,36 @@ class Uganda(SimpleSpider): def start_requests(self): yield scrapy.Request( 'https://gpp.ppda.go.ug/adminapi/public/api/pdes', - meta={'kf_filename': 'start_requests'}, - callback=self.parse_pages + meta={'kf_filename': 'page-1.json'}, + callback=self.parse_list ) @handle_error - def parse_pages(self, response): - url_pdes = 'https://gpp.ppda.go.ug/adminapi/public/api/pdes?page={}' + def parse_list(self, response): + pattern = 'https://gpp.ppda.go.ug/adminapi/public/api/pdes?page={}' if self.sample: - total_pages = 1 + total = 1 else: - json_data = json.loads(response.text) - total_pages = json_data['data']['last_page'] + data = json.loads(response.text) + total = data['data']['last_page'] - for page_number in range(total_pages): - yield scrapy.Request( - url_pdes.format(page_number + 1), - meta={'kf_filename': 'pages_requests'}, - callback=self.parse_data - ) + for page in range(2, total + 1): + yield self.build_request(pattern.format(page), formatter=parameters('page'), callback=self.parse_data) @handle_error def parse_data(self, response): - url = 'https://gpp.ppda.go.ug/adminapi/public/api/open-data/v1/releases/{}?fy={}&pde={}' - tags = ['planning', 'tender', 'award', 'contract'] - pdes_fdy_checks = [] - - json_data = json.loads(response.text) - list_pdes = json_data['data']['data'] - - for pdes in list_pdes: - pde_plans = pdes['procurement_plans'] - - for plans in pde_plans: - financial_year = plans['financial_year'] - procurement_entity_id = plans['pde_id'] - pdes_fdy = financial_year + '&' + procurement_entity_id - - if pdes_fdy not in pdes_fdy_checks: - pdes_fdy_checks.append(pdes_fdy) - - for tag in tags: - 
yield scrapy.Request( - url.format(tag, financial_year, procurement_entity_id), - meta={'kf_filename': hashlib.md5( - (url + str(pdes_fdy + tag)).encode('utf-8')).hexdigest() + '.json'} - ) - if self.sample: - break + pattern = 'https://gpp.ppda.go.ug/adminapi/public/api/open-data/v1/releases/{}?fy={}&pde={}' + + data = json.loads(response.text) + for pdes in data['data']['data']: + for plans in pdes['procurement_plans']: + for tag in ('planning', 'tender', 'award', 'contract'): + yield self.build_request( + pattern.format(tag, plans['financial_year'], plans['pde_id']), + formatter=join(components(-1), parameters('fy', 'pde')) + ) if self.sample: break + if self.sample: + break diff --git a/kingfisher_scrapy/spiders/uk_contracts_finder.py b/kingfisher_scrapy/spiders/uk_contracts_finder.py index 333cc89c..baf3dce3 100644 --- a/kingfisher_scrapy/spiders/uk_contracts_finder.py +++ b/kingfisher_scrapy/spiders/uk_contracts_finder.py @@ -1,28 +1,25 @@ import json -import scrapy - from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error, parameters, replace_parameter class UKContractsFinder(BaseSpider): name = 'uk_contracts_finder' - base_url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=%d' + data_type = 'release_package_list_in_results' + encoding = 'iso-8859-1' def start_requests(self): - yield scrapy.Request(self.base_url % 1, meta={'kf_filename': 'page1.json'}) + url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=1' + yield self.build_request(url, formatter=parameters('page'), callback=self.parse_list) @handle_error - def parse(self, response): - yield self.build_file_from_response( - response, - data_type='release_package_list_in_results', - encoding='iso-8859-1' - ) + def parse_list(self, response): + yield from self.parse(response) - if not self.sample and response.request.meta['kf_filename'] == 'page1.json': - json_data = json.loads(response.text) - last_page = json_data['maxPage'] - for page in range(1, last_page + 1): - yield scrapy.Request(self.base_url % page, meta={'kf_filename': 'page%d.json' % page}) + if not self.sample: + data = json.loads(response.text) + total = data['maxPage'] + for page in range(2, total + 1): + url = replace_parameter(response.request.url, 'page', page) + yield self.build_request(url, formatter=components('page')) diff --git a/kingfisher_scrapy/spiders/uk_fts.py b/kingfisher_scrapy/spiders/uk_fts.py index a876784e..6c172dbf 100644 --- a/kingfisher_scrapy/spiders/uk_fts.py +++ b/kingfisher_scrapy/spiders/uk_fts.py @@ -1,16 +1,15 @@ import scrapy from kingfisher_scrapy.base_spider import LinksSpider +from kingfisher_scrapy.util import parameters class UKContractsFinder(LinksSpider): name = 'uk_fts' data_type = 'release_package_in_ocdsReleasePackage_in_list_in_results' + next_page_formatter = parameters('cursor') def start_requests(self): - yield scrapy.Request( - # This URL was provided by the publisher and is not the production URL. - url='https://enoticetest.service.xgov.uk/api/1.0/ocdsReleasePackages', - meta={'kf_filename': 'start.json'}, - headers={'Accept': 'application/json'}, - ) + # This URL was provided by the publisher and is not the production URL. 
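        # Illustrative aside, not part of this patch: with next_page_formatter set to
        # parameters('cursor'), each follow-up page of this API is saved under the value of
        # its 'cursor' query string parameter, e.g. a hypothetical ?cursor=abc123 becomes
        # 'cursor-abc123.json'.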
+ url = 'https://enoticetest.service.xgov.uk/api/1.0/ocdsReleasePackages' + yield scrapy.Request(url, meta={'kf_filename': 'start.json'}, headers={'Accept': 'application/json'}) diff --git a/kingfisher_scrapy/spiders/uruguay_base.py b/kingfisher_scrapy/spiders/uruguay_base.py index f1d6152a..d554fe89 100644 --- a/kingfisher_scrapy/spiders/uruguay_base.py +++ b/kingfisher_scrapy/spiders/uruguay_base.py @@ -1,30 +1,19 @@ -import hashlib -from datetime import date, timedelta - -import scrapy +from datetime import date from kingfisher_scrapy.base_spider import SimpleSpider +from kingfisher_scrapy.util import components, date_range_by_month class UruguayBase(SimpleSpider): download_delay = 0.9 def start_requests(self): - base_url = 'http://comprasestatales.gub.uy/ocds/rss/{year:d}/{month:02d}' + url = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}' - current_date = date(2017, 11, 1) + start = date(2017, 11, 1) + stop = date.today().replace(day=1) if self.sample: - end_date = date(2017, 12, 1) - else: - end_date = date.today().replace(day=1) - - while current_date < end_date: - current_date += timedelta(days=32) - current_date.replace(day=1) + start = stop - url = base_url.format(year=current_date.year, month=current_date.month) - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}, - callback=self.parse_list - ) + for d in date_range_by_month(start, stop): + yield self.build_request(url.format(d), formatter=components(-2), callback=self.parse_list) diff --git a/kingfisher_scrapy/spiders/uruguay_historical.py b/kingfisher_scrapy/spiders/uruguay_historical.py index 1d87df7c..057c2358 100644 --- a/kingfisher_scrapy/spiders/uruguay_historical.py +++ b/kingfisher_scrapy/spiders/uruguay_historical.py @@ -1,6 +1,5 @@ -import scrapy - from kingfisher_scrapy.base_spider import ZipSpider +from kingfisher_scrapy.util import components, date_range_by_year class UruguayHistorical(ZipSpider): @@ -17,10 +16,12 @@ class UruguayHistorical(ZipSpider): } def start_requests(self): - base_url = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites/agencia-compras-contrataciones' \ - '-estado/files/2019-04/OCDS-{}.zip' - end_year = 2018 + start = 2002 + stop = 2017 if self.sample: - end_year = 2003 - for year in range(2002, end_year): - yield scrapy.Request(base_url.format(year), meta={'kf_filename': 'OCDS-{}.zip'.format(year)}) + start = stop + + pattern = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites' \ + '/agencia-compras-contrataciones-estado/files/2019-04/OCDS-{}.zip' + for year in date_range_by_year(start, stop): + yield self.build_request(pattern.format(year), formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/uruguay_records.py b/kingfisher_scrapy/spiders/uruguay_records.py index 1e914bd1..90bf601d 100644 --- a/kingfisher_scrapy/spiders/uruguay_records.py +++ b/kingfisher_scrapy/spiders/uruguay_records.py @@ -1,9 +1,5 @@ -import hashlib - -import scrapy - from kingfisher_scrapy.spiders.uruguay_base import UruguayBase -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class UruguayRecords(UruguayBase): @@ -12,12 +8,12 @@ class UruguayRecords(UruguayBase): @handle_error def parse_list(self, response): - base_record_url = 'https://www.comprasestatales.gub.uy/ocds/record/{}' - root = response.xpath('//item/title/text()').getall() + pattern = 'https://www.comprasestatales.gub.uy/ocds/record/{}' + titles = 
response.xpath('//item/title/text()').getall() if self.sample: - root = [root[0]] + titles = [titles[0]] - for id_compra in root: - url = base_record_url.format(id_compra.split(',')[0].replace('id_compra:', '')) - yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) + for title in titles: + identifier = title.split(',')[0].split(':')[1] + yield self.build_request(pattern.format(identifier), formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/uruguay_releases.py b/kingfisher_scrapy/spiders/uruguay_releases.py index 832cc027..3e0a6da6 100644 --- a/kingfisher_scrapy/spiders/uruguay_releases.py +++ b/kingfisher_scrapy/spiders/uruguay_releases.py @@ -1,9 +1,5 @@ -import hashlib - -import scrapy - from kingfisher_scrapy.spiders.uruguay_base import UruguayBase -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class UruguayReleases(UruguayBase): @@ -12,10 +8,9 @@ class UruguayReleases(UruguayBase): @handle_error def parse_list(self, response): - root = response.xpath('//item/link/text()').getall() - + urls = response.xpath('//item/link/text()').getall() if self.sample: - root = [root[0]] + urls = [urls[0]] - for url in root: - yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) + for url in urls: + yield self.build_request(url, formatter=components(-1)) diff --git a/kingfisher_scrapy/spiders/zambia.py b/kingfisher_scrapy/spiders/zambia.py index b82185d3..ee2cf877 100644 --- a/kingfisher_scrapy/spiders/zambia.py +++ b/kingfisher_scrapy/spiders/zambia.py @@ -3,7 +3,7 @@ import scrapy from kingfisher_scrapy.base_spider import ZipSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.util import components, handle_error class Zambia(ZipSpider): @@ -19,14 +19,10 @@ def start_requests(self): @handle_error def parse_list(self, response): - json_data = json.loads(response.text) - files_urls = json_data['packagesPerMonth'] - + urls = json.loads(response.text)['packagesPerMonth'] if self.sample: - files_urls = [files_urls[0]] + urls = [urls[0]] - for file_url in files_urls: - yield scrapy.Request( - file_url, - meta={'kf_filename': '%s.json' % file_url[-16:].replace('/', '-')}, - ) + for url in urls: + # URL looks like https://www.zppa.org.zm/ocds/services/recordpackage/getrecordpackage/2016/7 + yield self.build_request(url, formatter=components(-2)) diff --git a/kingfisher_scrapy/util.py b/kingfisher_scrapy/util.py index d425d81c..f20487f5 100644 --- a/kingfisher_scrapy/util.py +++ b/kingfisher_scrapy/util.py @@ -1,16 +1,64 @@ import itertools import json +from datetime import date from decimal import Decimal from functools import wraps +from urllib.parse import parse_qs, urlencode, urlsplit from ijson import ObjectBuilder, utils +def components(start, stop=None): + """ + Returns a function that returns the selected non-empty path components, excluding the ``.json`` extension. + + >>> components(-1)('http://example.com/api/planning.json') + 'planning' + + >>> components(-2, -1)('http://example.com/api/planning/package.json') + 'planning' + """ + def wrapper(url): + value = '-'.join(list(filter(None, urlsplit(url).path.split('/')))[start:stop]) + if value.endswith('.json'): + return value[:-5] + return value + return wrapper + + +def parameters(*keys): + """ + Returns a function that returns the selected query string parameters. 
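    For a hypothetical URL that repeats a parameter (a case no spider in this patch relies
    on), each value is included in order:

    >>> parameters('page')('http://example.com/api/packages.json?page=1&page=2')
    'page-1-page-2'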
+ + >>> parameters('page')('http://example.com/api/packages.json?page=1') + 'page-1' + + >>> parameters('year', 'page')('http://example.com/api/packages.json?year=2000&page=1') + 'year-2000-page-1' + """ + def wrapper(url): + query = parse_qs(urlsplit(url).query) + return '-'.join(s for key in keys for value in query[key] for s in [key, value]) + return wrapper + + +def join(*functions): + """ + Returns a function that joins the given functions' outputs. + + >>> join(components(-1), parameters('page'))('http://example.com/api/planning.json?page=1') + 'planning-page-1' + """ + def wrapper(url): + return '-'.join(function(url) for function in functions) + return wrapper + + def handle_error(decorated): """ A decorator for spider parse methods. - Yields a :class:`~kingfisher_scrapy.items.FileError` for non-2xx HTTP status codes. + Yields a :class:`~kingfisher_scrapy.items.FileError` for successful HTTP status codes. """ @wraps(decorated) def wrapper(self, response): @@ -21,6 +69,39 @@ def wrapper(self, response): return wrapper +# https://stackoverflow.com/questions/34898525/generate-list-of-months-between-interval-in-python +def date_range_by_month(start, stop): + """ + Yields the first day of the month from the ``start`` to the ``stop`` dates, in reverse chronological order. + """ + def number_of_months(d): + return 12 * d.year + d.month + + for months in reversed(range(number_of_months(start) - 1, number_of_months(stop))): + year, month = divmod(months, 12) + yield date(year, month + 1, 1) + + +def date_range_by_year(start, stop): + """ + Returns the year from the ``start`` to the ``stop`` years, in reverse chronological order. + """ + return reversed(range(start, stop + 1)) + + +def replace_parameter(url, key, value): + """ + Returns a URL after updating the query string parameter. + """ + parsed = urlsplit(url) + query = parse_qs(parsed.query) + if value is None: + del query[key] + else: + query[key] = [value] + return parsed._replace(query=urlencode(query, doseq=True)).geturl() + + @utils.coroutine def items_basecoro(target, prefix, map_type=None, skip_key=None): """ diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 00000000..df3eb518 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +addopts = --doctest-modules diff --git a/tests/test_links_spider.py b/tests/test_links_spider.py index f41a26ee..730cf9e5 100644 --- a/tests/test_links_spider.py +++ b/tests/test_links_spider.py @@ -8,12 +8,13 @@ def test_next_link(): spider = spider_with_crawler(spider_class=LinksSpider) + spider.next_page_formatter = lambda url: 'next.json' request = spider.next_link(response_fixture()) assert type(request) is Request assert request.url == 'http://example.com/next' - assert request.meta == {'kf_filename': '166715ca8e5f3c1531156d8772b922b7.json'} + assert request.meta == {'kf_filename': 'next.json'} def test_parse_404(): @@ -36,6 +37,7 @@ def test_parse_404(): def test_parse_200(): spider = spider_with_crawler(spider_class=LinksSpider) spider.data_type = 'release_package' + spider.next_page_formatter = lambda url: 'next.json' generator = spider.parse(response_fixture()) item = next(generator) @@ -53,7 +55,7 @@ def test_parse_200(): assert type(request) is Request assert request.url == 'http://example.com/next' - assert request.meta == {'kf_filename': '166715ca8e5f3c1531156d8772b922b7.json'} + assert request.meta == {'kf_filename': 'next.json'} with pytest.raises(StopIteration): next(generator)
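# Illustrative sketch, not part of this patch: the date-range helpers added in
# kingfisher_scrapy/util.py yield values in reverse chronological order, which is the order
# in which the rewritten spiders above now request years and months.
from datetime import date

from kingfisher_scrapy.util import date_range_by_month, date_range_by_year

assert list(date_range_by_year(2012, 2014)) == [2014, 2013, 2012]
assert list(date_range_by_month(date(2019, 11, 15), date(2020, 1, 10))) == [
    date(2020, 1, 1), date(2019, 12, 1), date(2019, 11, 1),
]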