diff --git a/docs/writing-spiders.rst b/docs/writing-spiders.rst index 5252ee0d9..24aeab188 100644 --- a/docs/writing-spiders.rst +++ b/docs/writing-spiders.rst @@ -52,22 +52,20 @@ Here is a sample: .. code-block:: python + from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error - class VerySimple(BaseSpider): - name = "very_simple" + class VerySimple(SimpleSpider): + name = 'very_simple' + data_type = 'release_package' def start_requests(self): # This API only has one URL to get. Make a request for that, and set a filename yield scrapy.Request( - url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json', + 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json', meta={'kf_filename': '13-14.json'} ) - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type='release_package') - Spider properties ----------------- diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index b97dbec2e..78163e16e 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -6,6 +6,7 @@ import ijson import scrapy +from jsonpointer import resolve_pointer from kingfisher_scrapy import util from kingfisher_scrapy.exceptions import SpiderArgumentError @@ -91,6 +92,8 @@ def is_http_success(self, response): """ Returns whether the response status is a non-2xx code. """ + # All 2xx codes are successful. + # https://tools.ietf.org/html/rfc7231#section-6.3 return 200 <= response.status < 300 def get_start_time(self, format): @@ -99,18 +102,24 @@ def get_start_time(self, format): """ return self.crawler.stats.get_value('start_time').strftime(format) - def build_file_from_response(self, response, filename, data_type=None, encoding='utf-8', post_to_api=True): + def build_file_from_response(self, response, **kwargs): """ Returns an item to yield, based on the response to a request. """ - return self.build_file(response.body, filename, response.request.url, data_type, encoding, post_to_api) - - def build_file(self, data, filename, url=None, data_type=None, encoding='utf-8', post_to_api=True): + if 'file_name' not in kwargs: + kwargs['file_name'] = response.request.meta['kf_filename'] + if 'url' not in kwargs: + kwargs['url'] = response.request.url + if 'data' not in kwargs: + kwargs['data'] = response.body + return self.build_file(**kwargs) + + def build_file(self, *, file_name=None, url=None, data=None, data_type=None, encoding='utf-8', post_to_api=True): """ Returns an item to yield. 
""" return File({ - 'file_name': filename, + 'file_name': file_name, 'data': data, 'data_type': data_type, 'url': url, @@ -118,7 +127,7 @@ def build_file(self, data, filename, url=None, data_type=None, encoding='utf-8', 'post_to_api': post_to_api, }) - def build_file_item(self, number, data, data_type, url, encoding, file_name): + def build_file_item(self, *, number=None, file_name=None, url=None, data=None, data_type=None, encoding='utf-8'): return FileItem({ 'number': number, 'file_name': file_name, @@ -152,16 +161,17 @@ def _get_package_metadata(self, f, skip_key): package.update(item) return package - def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data.json'): + def parse_json_lines(self, f, *, file_name='data.json', url=None, data_type=None, encoding='utf-8'): for number, line in enumerate(f, 1): if self.sample and number > self.MAX_SAMPLE: break if isinstance(line, bytes): line = line.decode(encoding=encoding) - yield self.build_file_item(number, line, data_type, url, encoding, file_name) + yield self.build_file_item(number=number, file_name=file_name, url=url, data=line, data_type=data_type, + encoding=encoding) - def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases', - file_name='data.json'): + def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None, data_type=None, encoding='utf-8', + array_field_name='releases'): if self.sample: size = self.MAX_SAMPLE else: @@ -172,64 +182,92 @@ def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', for number, items in enumerate(util.grouper(ijson.items(f_list, '{}.item'.format(array_field_name)), size), 1): package[array_field_name] = filter(None, items) data = json.dumps(package, default=util.default) - yield self.build_file_item(number, data, data_type, url, encoding, file_name) + yield self.build_file_item(number=number, file_name=file_name, url=url, data=data, data_type=data_type, + encoding=encoding) if self.sample: break -class ZipSpider(BaseSpider): +class SimpleSpider(BaseSpider): """ - This class makes it easy to collect data from ZIP files: + Most spiders can inherit from this class. It assumes all responses have the same data type. - - Inherit from ``ZipSpider`` - - Set a ``parse_zipfile_kwargs`` class attribute to the keyword arguments for the - :meth:`kingfisher_scrapy.base_spider.ZipSpider.parse_zipfile` method - - Write a ``start_requests`` method to request the ZIP files + 1. Inherit from ``SimpleSpider`` + 1. Set a ``data_type`` class attribute to the data type of the responses + 1. Optionally, set an ``encoding`` class attribute to the encoding of the responses (default UTF-8) + 1. Optionally, set a ``data_pointer`` class attribute to the JSON Pointer for OCDS data (default "") + 1. Write a ``start_requests`` method (and any intermediate callbacks) to send requests .. 
code-block:: python import scrapy - from kingfisher_scrapy.base_spider import ZipSpider + from kingfisher_scrapy.base_spider import SimpleSpider - class MySpider(LinksSpider): + class MySpider(SimpleSpider): name = 'my_spider' - - parse_zipfile_kwargs = {'data_type': 'release_package'} + data_type = 'release_package' def start_requests(self): - yield scrapy.Request( - url='https://example.com/api/packages.zip', - meta={'kf_filename': 'all.json'} - ) + yield scrapy.Request('https://example.com/api/package.json', meta={'kf_filename': 'all.json'}) """ + + encoding = 'utf-8' + data_pointer = '' + @handle_error def parse(self, response): - yield from self.parse_zipfile(response, **self.parse_zipfile_kwargs) + kwargs = {} + if self.data_pointer: + kwargs['data'] = json.dumps(resolve_pointer(json.loads(response.text), self.data_pointer)).encode() - def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8'): - """ - Handles a response that is a ZIP file. - - :param response response: the response - :param str data_type: the compressed files' ``data_type`` - :param str file_format: The compressed files' format - - ``json_lines`` - Yields each line of the compressed files. - The ZIP file is saved to disk. - ``release_package`` - Re-packages the releases in the compressed files in groups of - :const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE`, and yields the packages. - The ZIP file is saved to disk. - ``None`` - Yields each compressed file. - Each compressed file is saved to disk. - :param str encoding: the compressed files' encoding - """ - if file_format: + yield self.build_file_from_response(response, data_type=self.data_type, encoding=self.encoding, **kwargs) + + +class ZipSpider(BaseSpider): + """ + This class makes it easy to collect data from ZIP files. It assumes all files have the same data type. + + 1. Inherit from ``ZipSpider`` + 1. Set a ``data_type`` class attribute to the data type of the compressed files + 1. Optionally, set an ``encoding`` class attribute to the encoding of the compressed files (default UTF-8) + 1. Optionally, set a ``zip_file_format`` class attribute to the format of the compressed files + + ``json_lines`` + Yields each line of the compressed files. + The ZIP file is saved to disk. + ``release_package`` + Re-packages the releases in the compressed files in groups of + :const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE`, and yields the packages. + The ZIP file is saved to disk. + ``None`` + Yields each compressed file. + Each compressed file is saved to disk. + + 1. Write a ``start_requests`` method to request the ZIP files + + .. 
code-block:: python + + import scrapy + + from kingfisher_scrapy.base_spider import ZipSpider + + class MySpider(ZipSpider): + name = 'my_spider' + data_type = 'release_package' + + def start_requests(self): + yield scrapy.Request('https://example.com/api/packages.zip', meta={'kf_filename': 'all.json'}) + """ + + encoding = 'utf-8' + zip_file_format = None + + @handle_error + def parse(self, response): + if self.zip_file_format: filename = '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')).hexdigest()) - self.build_file_from_response(response, filename, post_to_api=False) + self.build_file_from_response(response, file_name=filename, post_to_api=False) zip_file = ZipFile(BytesIO(response.body)) for finfo in zip_file.infolist(): @@ -239,26 +277,31 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8') data = zip_file.open(finfo.filename) - if file_format == 'json_lines': - yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding, - file_name=filename) - elif file_format == 'release_package': + kwargs = { + 'file_name': filename, + 'url': response.request.url, + 'data_type': self.data_type, + 'encoding': self.encoding, + } + + if self.zip_file_format == 'json_lines': + yield from self.parse_json_lines(data, **kwargs) + elif self.zip_file_format == 'release_package': package = zip_file.open(finfo.filename) - yield from self.parse_json_array(package, data, data_type, response.request.url, - encoding=encoding, file_name=filename) + yield from self.parse_json_array(package, data, **kwargs) else: - yield self.build_file(data.read(), filename, data_type=data_type, url=response.request.url, - encoding=encoding) + yield self.build_file(data=data.read(), **kwargs) -class LinksSpider(BaseSpider): +class LinksSpider(SimpleSpider): """ This class makes it easy to collect data from an API that implements the `pagination `__ pattern: - - Inherit from ``LinksSpider`` - - Set a ``data_type`` class attribute to the data type of the API responses - - Write a ``start_requests`` method to request the first page + 1. Inherit from ``LinksSpider`` + 1. Set a ``data_type`` class attribute to the data type of the API responses + 1. Optionally, set a ``next_pointer`` class attribute to the JSON Pointer for the next link (default "/links/next") + 1. Write a ``start_requests`` method to request the first page of API results .. code-block:: python @@ -271,25 +314,23 @@ class MySpider(LinksSpider): data_type = 'release_package' def start_requests(self): - yield scrapy.Request( - url='https://example.com/api/packages.json', - meta={'kf_filename': 'page1.json'} - ) + yield scrapy.Request('https://example.com/api/packages.json', meta={'kf_filename': 'page1.json'}) """ + next_pointer = '/links/next' + @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type=self.data_type) + yield from super().parse(response) if not self.sample: yield self.next_link(response) - @staticmethod - def next_link(response): + def next_link(self, response): """ If the JSON response has a ``links.next`` key, returns a ``scrapy.Request`` for the URL. 
""" data = json.loads(response.text) - if 'links' in data and 'next' in data['links']: - url = data['links']['next'] + url = resolve_pointer(data, self.next_pointer, None) + if url: return scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) diff --git a/kingfisher_scrapy/spiders/afghanistan_records.py b/kingfisher_scrapy/spiders/afghanistan_records.py index d2827f0b5..a662dc01c 100644 --- a/kingfisher_scrapy/spiders/afghanistan_records.py +++ b/kingfisher_scrapy/spiders/afghanistan_records.py @@ -2,17 +2,19 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class AfghanistanRecords(BaseSpider): +class AfghanistanRecords(SimpleSpider): name = 'afghanistan_records' + data_type = 'record' + download_delay = 1 def start_requests(self): yield scrapy.Request( - url='https://ocds.ageops.net/api/ocds/records', + 'https://ocds.ageops.net/api/ocds/records', meta={'kf_filename': 'list.json'}, callback=self.parse_list ) @@ -24,12 +26,4 @@ def parse_list(self, response): files_urls = [files_urls[0]] for file_url in files_urls: - yield scrapy.Request( - url=file_url, - meta={'kf_filename': file_url.split('/')[-1] + '.json'}, - callback=self.parse_record - ) - - @handle_error - def parse_record(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type="record") + yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'}) diff --git a/kingfisher_scrapy/spiders/afghanistan_releases.py b/kingfisher_scrapy/spiders/afghanistan_releases.py index ec197c4fe..99a49b69a 100644 --- a/kingfisher_scrapy/spiders/afghanistan_releases.py +++ b/kingfisher_scrapy/spiders/afghanistan_releases.py @@ -2,17 +2,19 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class AfghanistanReleases(BaseSpider): +class AfghanistanReleases(SimpleSpider): name = 'afghanistan_releases' + data_type = 'release' + download_delay = 1.5 def start_requests(self): yield scrapy.Request( - url='https://ocds.ageops.net/api/ocds/releases/dates', + 'https://ocds.ageops.net/api/ocds/releases/dates', meta={'kf_filename': 'list.json'}, callback=self.parse_list ) @@ -25,7 +27,7 @@ def parse_list(self, response): for file_url in files_urls: yield scrapy.Request( - url=file_url, + file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'}, callback=self.parse_release_list ) @@ -37,12 +39,4 @@ def parse_release_list(self, response): files_urls = [files_urls[0]] for file_url in files_urls: - yield scrapy.Request( - url=file_url, - meta={'kf_filename': file_url.split('/')[-1] + '.json'}, - callback=self.parse_release - ) - - @handle_error - def parse_release(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type="release") + yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'}) diff --git a/kingfisher_scrapy/spiders/argentina_buenos_aires.py b/kingfisher_scrapy/spiders/argentina_buenos_aires.py index d1847e3d6..10dadbf51 100644 --- a/kingfisher_scrapy/spiders/argentina_buenos_aires.py +++ b/kingfisher_scrapy/spiders/argentina_buenos_aires.py @@ -17,14 +17,15 @@ class ArgentinaBuenosAires(ZipSpider): Downloads the zip file and sends 10 releases to kingfisher process. 
""" name = 'argentina_buenos_aires' + data_type = 'release_package' + zip_file_format = 'release_package' + # the data list service takes too long to be downloaded, so we increase the download timeout download_timeout = 1000 - parse_zipfile_kwargs = {'data_type': 'release_package', 'file_format': 'release_package'} - def start_requests(self): yield scrapy.Request( - url='https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras', + 'https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras', meta={'kf_filename': 'list.json'}, callback=self.parse_list ) diff --git a/kingfisher_scrapy/spiders/argentina_vialidad.py b/kingfisher_scrapy/spiders/argentina_vialidad.py index 898b366dc..1c44cfc88 100644 --- a/kingfisher_scrapy/spiders/argentina_vialidad.py +++ b/kingfisher_scrapy/spiders/argentina_vialidad.py @@ -1,18 +1,14 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.base_spider import SimpleSpider -class ArgentinaVialidad(BaseSpider): +class ArgentinaVialidad(SimpleSpider): name = 'argentina_vialidad' + data_type = 'release_package_list' def start_requests(self): yield scrapy.Request( - url='https://datosabiertos.vialidad.gob.ar/api/ocds/package/all', + 'https://datosabiertos.vialidad.gob.ar/api/ocds/package/all', meta={'kf_filename': 'all.json'} ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, 'all.json', data_type='release_package_list') diff --git a/kingfisher_scrapy/spiders/armenia.py b/kingfisher_scrapy/spiders/armenia.py index b00aacea8..67c42a6e4 100644 --- a/kingfisher_scrapy/spiders/armenia.py +++ b/kingfisher_scrapy/spiders/armenia.py @@ -1,31 +1,12 @@ -import hashlib -import json - import scrapy -from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.base_spider import LinksSpider -class Armenia(BaseSpider): +class Armenia(LinksSpider): name = 'armenia' + data_type = 'release_package' + next_pointer = '/next_page/uri' def start_requests(self): - yield scrapy.Request( - url='https://armeps.am/ocds/release', - meta={'kf_filename': 'page1.json'} - ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') - - json_data = json.loads(response.text) - if not (self.sample): - if 'next_page' in json_data and 'uri' in json_data['next_page']: - url = json_data['next_page']['uri'] - yield scrapy.Request( - url=url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest()+'.json'} - ) + yield scrapy.Request('https://armeps.am/ocds/release', meta={'kf_filename': 'page1.json'}) diff --git a/kingfisher_scrapy/spiders/australia.py b/kingfisher_scrapy/spiders/australia.py index d17f023e1..a6f1da850 100644 --- a/kingfisher_scrapy/spiders/australia.py +++ b/kingfisher_scrapy/spiders/australia.py @@ -10,17 +10,17 @@ class Australia(LinksSpider): data_type = 'release_package' def start_requests(self): + url_prefix = 'https://api.tenders.gov.au/ocds/findByDates/contractPublished/' + if self.sample: yield scrapy.Request( - url='https://api.tenders.gov.au/ocds/findByDates/contractPublished/2018-01-01T00:00:00Z/2018-12-31T23' - ':59:59Z', + url_prefix + '2018-01-01T00:00:00Z/2018-12-31T23:59:59Z', meta={'kf_filename': 'year-2018.json'} ) else: current_year = datetime.datetime.now().year + 1 for year in range(2004, current_year): 
yield scrapy.Request( - url='https://api.tenders.gov.au/ocds/findByDates/contractPublished/' - '{}-01-01T00:00:00Z/{}-12-31T23:59:59Z'.format(year, year), + url_prefix + '{}-01-01T00:00:00Z/{}-12-31T23:59:59Z'.format(year, year), meta={'kf_filename': 'year-{}.json'.format(year)} ) diff --git a/kingfisher_scrapy/spiders/australia_nsw.py b/kingfisher_scrapy/spiders/australia_nsw.py index 7a3d53478..b09326b4f 100644 --- a/kingfisher_scrapy/spiders/australia_nsw.py +++ b/kingfisher_scrapy/spiders/australia_nsw.py @@ -3,12 +3,13 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class AustraliaNSW(BaseSpider): +class AustraliaNSW(SimpleSpider): name = 'australia_nsw' + data_type = 'release_package' def start_requests(self): release_types = ['planning', 'tender', 'contract'] @@ -24,54 +25,41 @@ def start_requests(self): callback=self.parse_list ) + @handle_error def parse_list(self, response): - if self.is_http_success(response): + json_data = json.loads(response.text) + release_type = response.request.meta['release_type'] - json_data = json.loads(response.text) - release_type = response.request.meta['release_type'] + # More Pages? + if 'links' in json_data and isinstance(json_data['links'], dict) and 'next' in json_data['links'] \ + and not self.sample: + yield scrapy.Request( + json_data['links']['next'], + meta={ + 'kf_filename': hashlib.md5(json_data['links']['next'].encode('utf-8')).hexdigest() + '.json', + 'release_type': release_type, + }, + callback=self.parse_list + ) - # More Pages? - if 'links' in json_data and isinstance(json_data['links'], dict) and 'next' in json_data['links'] \ - and not self.sample: + # Data? + for release in json_data['releases']: + if release_type == 'planning': + uuid = release['tender']['plannedProcurementUUID'] yield scrapy.Request( - json_data['links']['next'], - meta={ - 'kf_filename': hashlib.md5(json_data['links']['next'].encode('utf-8')).hexdigest() + '.json', - 'release_type': release_type, - }, - callback=self.parse_list + 'https://tenders.nsw.gov.au/?event=public.api.planning.view&PlannedProcurementUUID=%s' % uuid, + meta={'kf_filename': 'plannning-%s.json' % uuid} ) - - # Data? 
- for release in json_data['releases']: - if release_type == 'planning': - uuid = release['tender']['plannedProcurementUUID'] - yield scrapy.Request( - 'https://tenders.nsw.gov.au/?event=public.api.planning.view&PlannedProcurementUUID=%s' % uuid, - meta={'kf_filename': 'plannning-%s.json' % uuid}, - callback=self.parse - ) - if release_type == 'tender': - uuid = release['tender']['RFTUUID'] + if release_type == 'tender': + uuid = release['tender']['RFTUUID'] + yield scrapy.Request( + 'https://tenders.nsw.gov.au/?event=public.api.tender.view&RFTUUID=%s' % uuid, + meta={'kf_filename': 'tender-%s.json' % uuid} + ) + if release_type == 'contract': + for award in release['awards']: + uuid = award['CNUUID'] yield scrapy.Request( - 'https://tenders.nsw.gov.au/?event=public.api.tender.view&RFTUUID=%s' % uuid, - meta={'kf_filename': 'tender-%s.json' % uuid}, - callback=self.parse + 'https://tenders.nsw.gov.au/?event=public.api.contract.view&CNUUID=%s' % uuid, + meta={'kf_filename': 'contract-%s.json' % uuid} ) - if release_type == 'contract': - for award in release['awards']: - uuid = award['CNUUID'] - yield scrapy.Request( - 'https://tenders.nsw.gov.au/?event=public.api.contract.view&CNUUID=%s' % uuid, - meta={'kf_filename': 'contract-%s.json' % uuid}, - callback=self.parse - ) - - else: - yield self.build_file_error_from_response( - response, file_name=hashlib.md5(response.request.url.encode('utf-8')).hexdigest() + '.json') - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') diff --git a/kingfisher_scrapy/spiders/canada_buyandsell.py b/kingfisher_scrapy/spiders/canada_buyandsell.py index e63d2a93e..e9d311e25 100644 --- a/kingfisher_scrapy/spiders/canada_buyandsell.py +++ b/kingfisher_scrapy/spiders/canada_buyandsell.py @@ -1,33 +1,28 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.base_spider import SimpleSpider -class CanadaBuyAndSell(BaseSpider): - name = "canada_buyandsell" +class CanadaBuyAndSell(SimpleSpider): + name = 'canada_buyandsell' + data_type = 'release_package' def start_requests(self): yield scrapy.Request( - url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json', + 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json', meta={'kf_filename': '13-14.json'} ) if self.sample: return yield scrapy.Request( - url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json', + 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json', meta={'kf_filename': '14-15.json'} ) yield scrapy.Request( - url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json', + 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json', meta={'kf_filename': '15-16.json'} ) yield scrapy.Request( - url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-16-17.json', + 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-16-17.json', meta={'kf_filename': '16-17.json'} ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') diff --git a/kingfisher_scrapy/spiders/canada_montreal.py b/kingfisher_scrapy/spiders/canada_montreal.py index 5953656ca..643662138 100644 --- a/kingfisher_scrapy/spiders/canada_montreal.py +++ 
b/kingfisher_scrapy/spiders/canada_montreal.py @@ -12,18 +12,14 @@ class CanadaMontreal(BaseSpider): def start_requests(self): yield scrapy.Request( - url='https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d' % self.page_limit, + 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d' % self.page_limit, meta={'kf_filename': 'page0.json'} ) @handle_error def parse(self, response): # Actual data - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type="release_package" - ) + yield self.build_file_from_response(response, data_type='release_package') # Load more pages? if not self.sample and response.request.meta['kf_filename'] == 'page0.json': @@ -33,8 +29,5 @@ def parse(self, response): while offset < total: url = 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d&offset=%d' % \ (self.page_limit, offset) - yield scrapy.Request( - url=url, - meta={'kf_filename': 'page' + str(offset) + '.json'} - ) + yield scrapy.Request(url, meta={'kf_filename': 'page' + str(offset) + '.json'}) offset += self.page_limit diff --git a/kingfisher_scrapy/spiders/chile_base.py b/kingfisher_scrapy/spiders/chile_base.py index 8c242de0d..c0f851f6b 100644 --- a/kingfisher_scrapy/spiders/chile_base.py +++ b/kingfisher_scrapy/spiders/chile_base.py @@ -4,6 +4,7 @@ import scrapy from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.util import handle_error class ChileCompraBaseSpider(BaseSpider): @@ -28,7 +29,7 @@ def get_year_month_until(self): def start_requests(self): if self.sample: yield scrapy.Request( - url=self.base_list_url.format(2017, 10, 0, 10), + self.base_list_url.format(2017, 10, 0, 10), meta={'kf_filename': 'list-2017-10.json', 'year': 2017, 'month': 10}, ) return @@ -40,20 +41,20 @@ def start_requests(self): if (until_year - 1) == year and month > until_month: break yield scrapy.Request( - url=self.base_list_url.format(year, month, 0, self.limit), + self.base_list_url.format(year, month, 0, self.limit), meta={'kf_filename': 'list-{}-{:02d}.json'.format(year, month), 'year': year, 'month': month}, ) - def base_parse(self, response, package_type): + @handle_error + def parse(self, response): data = json.loads(response.text) if 'data' in data: - yield_list = [] for data_item in data['data']: - if package_type == 'record': - yield_list.append(scrapy.Request( - url=self.record_url % data_item['ocid'].replace('ocds-70d2nz-', ''), - meta={'kf_filename': 'data-%s-%s.json' % (data_item['ocid'], package_type)} - )) + if self.data_type == 'record_package': + yield scrapy.Request( + self.record_url % data_item['ocid'].replace('ocds-70d2nz-', ''), + meta={'kf_filename': 'data-%s-%s.json' % (data_item['ocid'], self.data_type)} + ) else: # the data comes in this format: # "data": [ @@ -66,21 +67,19 @@ def base_parse(self, response, package_type): for stage in list(data_item.keys()): if 'url' in stage: name = stage.replace('url', '') - yield_list.append(scrapy.Request( - url=data_item[stage], + yield scrapy.Request( + data_item[stage], meta={'kf_filename': 'data-%s-%s.json' % (data_item['ocid'], name)} - )) + ) if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']: year = response.request.meta['year'] month = response.request.meta['month'] offset = data['pagination']['offset'] - yield_list.append(scrapy.Request( - url=self.base_list_url.format(year, month, self.limit + offset, self.limit), + yield scrapy.Request( + 
self.base_list_url.format(year, month, self.limit + offset, self.limit), meta={'year': year, 'month': month} - )) - return yield_list + ) elif 'status' in data and data['status'] != 200: - return [self.build_file_error_from_response(response, errors={'http_code': data['status']})] + yield self.build_file_error_from_response(response, errors={'http_code': data['status']}) else: - return [self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='%s_package' % package_type)] + yield self.build_file_from_response(response, data_type=self.data_type) diff --git a/kingfisher_scrapy/spiders/chile_compra_bulk.py b/kingfisher_scrapy/spiders/chile_compra_bulk.py index 8880bf935..9b5c18b18 100644 --- a/kingfisher_scrapy/spiders/chile_compra_bulk.py +++ b/kingfisher_scrapy/spiders/chile_compra_bulk.py @@ -7,14 +7,14 @@ class ChileCompraBulk(ZipSpider): name = 'chile_compra_bulk' + data_type = 'record_package' + download_warnsize = 0 download_timeout = 99999 custom_settings = { 'DOWNLOAD_FAIL_ON_DATALOSS': False, } - parse_zipfile_kwargs = {'data_type': 'record_package'} - def start_requests(self): url = 'https://ocds.blob.core.windows.net/ocds/{}{}.zip' if self.sample: diff --git a/kingfisher_scrapy/spiders/chile_compra_records.py b/kingfisher_scrapy/spiders/chile_compra_records.py index 2565ab84b..f73209647 100644 --- a/kingfisher_scrapy/spiders/chile_compra_records.py +++ b/kingfisher_scrapy/spiders/chile_compra_records.py @@ -1,11 +1,6 @@ from kingfisher_scrapy.spiders.chile_base import ChileCompraBaseSpider -from kingfisher_scrapy.util import handle_error class ChileCompraRecords(ChileCompraBaseSpider): name = 'chile_compra_records' - - @handle_error - def parse(self, response): - for item in self.base_parse(response, 'record'): - yield item + data_type = 'record_package' diff --git a/kingfisher_scrapy/spiders/chile_compra_releases.py b/kingfisher_scrapy/spiders/chile_compra_releases.py index 93b9678d6..e1082f831 100644 --- a/kingfisher_scrapy/spiders/chile_compra_releases.py +++ b/kingfisher_scrapy/spiders/chile_compra_releases.py @@ -1,11 +1,6 @@ from kingfisher_scrapy.spiders.chile_base import ChileCompraBaseSpider -from kingfisher_scrapy.util import handle_error class ChileCompraReleases(ChileCompraBaseSpider): name = 'chile_compra_releases' - - @handle_error - def parse(self, response): - for item in self.base_parse(response, 'release'): - yield item + data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py index 23c5ad94c..9a76d02d2 100644 --- a/kingfisher_scrapy/spiders/colombia.py +++ b/kingfisher_scrapy/spiders/colombia.py @@ -21,10 +21,7 @@ def start_requests(self): start_page = 1 if hasattr(self, 'page'): start_page = int(self.page) - yield scrapy.Request( - url=base_url % start_page, - meta={'kf_filename': 'page{}.json'.format(start_page)} - ) + yield scrapy.Request(base_url % start_page, meta={'kf_filename': 'page{}.json'.format(start_page)}) def parse(self, response): # In Colombia, every day at certain hour they run a process in their system that drops the database and make @@ -37,20 +34,16 @@ def parse(self, response): url = response.request.url logging.info('Sleeping due error {} in url {}'.format(response.status, url)) time.sleep(self.sleep) - yield scrapy.Request(url, - dont_filter=True, - meta={'kf_filename': hashlib.md5( - url.encode('utf-8')).hexdigest() + '.json'}) - + yield scrapy.Request( + url, + dont_filter=True, + meta={'kf_filename': 
hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} + ) elif self.is_http_success(response): - - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') - + yield self.build_file_from_response(response, data_type='release_package') if not self.sample: yield self.next_link(response) else: - yield self.build_file_error_from_response(response) except JSONDecodeError: diff --git a/kingfisher_scrapy/spiders/colombia_bulk.py b/kingfisher_scrapy/spiders/colombia_bulk.py index 874b3c97e..2523f8bcc 100644 --- a/kingfisher_scrapy/spiders/colombia_bulk.py +++ b/kingfisher_scrapy/spiders/colombia_bulk.py @@ -15,17 +15,19 @@ class ColombiaBulk(ZipSpider): Downloads the zip file and sends 10 releases to kingfisher process. """ name = 'colombia_bulk' + data_type = 'release_in_Release' + encoding = 'iso-8859-1' + zip_file_format = 'json_lines' + download_warnsize = 0 download_timeout = 99999 custom_settings = { 'DOWNLOAD_FAIL_ON_DATALOSS': False, } - parse_zipfile_kwargs = {'data_type': 'release_in_Release', 'file_format': 'json_lines', 'encoding': 'iso-8859-1'} - def start_requests(self): yield scrapy.Request( - url='https://www.colombiacompra.gov.co/transparencia/datos-json', + 'https://www.colombiacompra.gov.co/transparencia/datos-json', meta={'kf_filename': 'list.html'}, callback=self.parse_list, ) diff --git a/kingfisher_scrapy/spiders/digiwhist_base.py b/kingfisher_scrapy/spiders/digiwhist_base.py index a35d418c7..f6d117c01 100644 --- a/kingfisher_scrapy/spiders/digiwhist_base.py +++ b/kingfisher_scrapy/spiders/digiwhist_base.py @@ -15,9 +15,9 @@ def start_requests(self): @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], post_to_api=False) + yield self.build_file_from_response(response, post_to_api=False) # Load a line at the time, pass it to API with tarfile.open(fileobj=BytesIO(response.body), mode="r:gz") as tar: with tar.extractfile(tar.getnames()[0]) as readfp: - yield from self.parse_json_lines(readfp, 'release_package', self.start_urls[0]) + yield from self.parse_json_lines(readfp, url=self.start_urls[0], data_type='release_package') diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py index 7962a749d..4fa46c4d1 100644 --- a/kingfisher_scrapy/spiders/dominican_republic.py +++ b/kingfisher_scrapy/spiders/dominican_republic.py @@ -10,9 +10,8 @@ class DominicanRepublic(BaseSpider): name = 'dominican_republic' - custom_settings = { - 'DOWNLOAD_TIMEOUT': 360 - } + + download_timeout = 360 # 6min def start_requests(self): yield scrapy.Request( @@ -33,17 +32,14 @@ def parse_list(self, response): if '/JSON_DGCP_' in url: yield scrapy.Request('https:' + url, meta={'kf_filename': url.rsplit('/', 1)[-1]}) + @handle_error def parse(self, response): - if self.is_http_success(response): - file = tempfile.NamedTemporaryFile(delete=False) - file.write(response.body) - file.close() - with rarfile.RarFile(file.name, charset='utf-8') as tmpfile: - for f in tmpfile.infolist(): - with tmpfile.open(f) as jsonFile: - yield self.build_file(jsonFile.read(), f.filename, data_type='release_package', - url=response.request.url) - os.remove(file.name) - else: - filename = response.request.url.split('/')[-1] - yield self.build_file_error_from_response(response, file_name=filename) + file = tempfile.NamedTemporaryFile(delete=False) + file.write(response.body) + file.close() + with rarfile.RarFile(file.name, charset='utf-8') as 
tmpfile: + for f in tmpfile.infolist(): + with tmpfile.open(f) as jsonFile: + yield self.build_file(file_name=f.filename, url=response.request.url, data=jsonFile.read(), + data_type='release_package') + os.remove(file.name) diff --git a/kingfisher_scrapy/spiders/france.py b/kingfisher_scrapy/spiders/france.py index 203b9ba0e..a624bffea 100644 --- a/kingfisher_scrapy/spiders/france.py +++ b/kingfisher_scrapy/spiders/france.py @@ -3,16 +3,17 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class France(BaseSpider): - name = "france" +class France(SimpleSpider): + name = 'france' + data_type = 'release_package' def start_requests(self): yield scrapy.Request( - url='https://www.data.gouv.fr/api/1/datasets/?organization=534fff75a3a7292c64a77de4', + 'https://www.data.gouv.fr/api/1/datasets/?organization=534fff75a3a7292c64a77de4', meta={'kf_filename': 'list.json'}, callback=self.parse_list, ) @@ -44,11 +45,3 @@ def parse_list(self, response): meta={'kf_filename': hashlib.md5(next_page.encode('utf-8')).hexdigest() + '.json'}, callback=self.parse_list ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type="release_package" - ) diff --git a/kingfisher_scrapy/spiders/georgia_opendata.py b/kingfisher_scrapy/spiders/georgia_opendata.py index c810e9d1a..48306d104 100644 --- a/kingfisher_scrapy/spiders/georgia_opendata.py +++ b/kingfisher_scrapy/spiders/georgia_opendata.py @@ -5,15 +5,11 @@ class GeorgiaOpenData(ZipSpider): name = 'georgia_opendata' - custom_settings = { - # This has to download a 400MB file so ..... - 'DOWNLOAD_TIMEOUT': 60 * 20, - } + data_type = 'release_package' + zip_file_format = 'release_package' - parse_zipfile_kwargs = {'data_type': 'release_package', 'file_format': 'release_package'} + # The file is about 450MB. 
+ download_timeout = 1200 # 20min def start_requests(self): - yield scrapy.Request( - url='http://opendata.spa.ge/json/allTenders.zip', - meta={'kf_filename': 'all.json'} - ) + yield scrapy.Request('http://opendata.spa.ge/json/allTenders.zip', meta={'kf_filename': 'all.json'}) diff --git a/kingfisher_scrapy/spiders/georgia_records.py b/kingfisher_scrapy/spiders/georgia_records.py index fc257370c..60438bbc1 100644 --- a/kingfisher_scrapy/spiders/georgia_records.py +++ b/kingfisher_scrapy/spiders/georgia_records.py @@ -8,7 +8,4 @@ class GeorgiaRecords(LinksSpider): data_type = 'record_package' def start_requests(self): - yield scrapy.Request( - url='https://odapi.spa.ge/api/records.json', - meta={'kf_filename': 'page1.json'} - ) + yield scrapy.Request('https://odapi.spa.ge/api/records.json', meta={'kf_filename': 'page1.json'}) diff --git a/kingfisher_scrapy/spiders/georgia_releases.py b/kingfisher_scrapy/spiders/georgia_releases.py index fd0ed606a..6cf0263da 100644 --- a/kingfisher_scrapy/spiders/georgia_releases.py +++ b/kingfisher_scrapy/spiders/georgia_releases.py @@ -8,7 +8,4 @@ class GeorgiaReleases(LinksSpider): data_type = 'release_package' def start_requests(self): - yield scrapy.Request( - url='https://odapi.spa.ge/api/releases.json', - meta={'kf_filename': 'page1.json'} - ) + yield scrapy.Request('https://odapi.spa.ge/api/releases.json', meta={'kf_filename': 'page1.json'}) diff --git a/kingfisher_scrapy/spiders/honduras_cost.py b/kingfisher_scrapy/spiders/honduras_cost.py index 6d3fb9fda..a88e3e4bc 100644 --- a/kingfisher_scrapy/spiders/honduras_cost.py +++ b/kingfisher_scrapy/spiders/honduras_cost.py @@ -2,21 +2,23 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class HondurasCoST(BaseSpider): +class HondurasCoST(SimpleSpider): name = 'honduras_cost' + data_type = 'record_package' def start_requests(self): yield scrapy.Request( 'http://app.sisocs.org/protected/ocdsShow/', meta={'kf_filename': 'list.html'}, + callback=self.parse_list ) @handle_error - def parse(self, response): + def parse_list(self, response): btns = response.css('script').xpath('text()').getall() for btn in btns: if 'download-all' and 'url:' in btn: @@ -26,14 +28,5 @@ def parse(self, response): url = url.replace('"', '').replace(',', '').lstrip('url:') yield scrapy.Request( url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}, - callback=self.parse_btn + meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} ) - - @handle_error - def parse_btn(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type="record_package" - ) diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py index cd9672006..91e2347f6 100644 --- a/kingfisher_scrapy/spiders/honduras_oncae.py +++ b/kingfisher_scrapy/spiders/honduras_oncae.py @@ -8,6 +8,8 @@ class HondurasONCAE(ZipSpider): name = 'honduras_oncae' + data_type = 'release_package' + # the files take too long to be downloaded, so we increase the download timeout download_timeout = 900 @@ -15,10 +17,11 @@ def start_requests(self): yield scrapy.Request( 'http://oncae.gob.hn/datosabiertos', meta={'kf_filename': 'list.html'}, + callback=self.parse_list ) @handle_error - def parse(self, response): + def parse_list(self, response): urls = response.css(".article-content ul")\ .xpath(".//a[contains(., 
'[json]')]/@href")\ .getall() @@ -26,8 +29,4 @@ def parse(self, response): urls = [urls[0]] for url in urls: filename = urlparse(url).path.split('/')[-1] - yield scrapy.Request(url, meta={'kf_filename': filename}, callback=self.parse_items) - - @handle_error - def parse_items(self, response): - yield from self.parse_zipfile(response, data_type='release_package') + yield scrapy.Request(url, meta={'kf_filename': filename}) diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py index e60582bae..55195d821 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py +++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py @@ -1,14 +1,14 @@ import json -from urllib.parse import urlparse import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class HondurasPortalBulkFiles(BaseSpider): +class HondurasPortalBulkFiles(SimpleSpider): name = 'honduras_portal_bulk_files' + data_type = 'release_package' def start_requests(self): yield scrapy.Request( @@ -29,14 +29,3 @@ def parse_list(self, response): for item in filelist: url = item['urls']['json'] yield scrapy.Request(url, meta={'kf_filename': url.rsplit('/', 1)[-1]}) - - def parse(self, response): - filename = urlparse(response.request.url).path.split('/')[-2] - if self.is_http_success(response): - yield self.build_file_from_response( - response, - filename, - data_type='release_package' - ) - else: - yield self.build_file_error_from_response(response, file_name=filename) diff --git a/kingfisher_scrapy/spiders/honduras_portal_records.py b/kingfisher_scrapy/spiders/honduras_portal_records.py index eb1744542..1d3fc5de2 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_records.py +++ b/kingfisher_scrapy/spiders/honduras_portal_records.py @@ -1,38 +1,18 @@ import hashlib -import json import scrapy -from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.base_spider import LinksSpider -class HondurasPortalRecords(BaseSpider): +class HondurasPortalRecords(LinksSpider): name = 'honduras_portal_records' + data_type = 'record_package' + data_pointer = '/recordPackage' + next_pointer = '/next' + download_delay = 0.9 def start_requests(self): url = 'http://www.contratacionesabiertas.gob.hn/api/v1/record/?format=json' - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) - - @handle_error - def parse(self, response): - json_data = json.loads(response.text) - yield self.build_file( - json.dumps(json_data['releasePackage']).encode(), - response.request.meta['kf_filename'], - data_type='record_package', - url=response.request.url - ) - - url = json_data.get('next') - if not url or self.sample: - return - else: - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) + yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) diff --git a/kingfisher_scrapy/spiders/honduras_portal_releases.py b/kingfisher_scrapy/spiders/honduras_portal_releases.py index 1d759c00b..a676383ba 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_releases.py +++ b/kingfisher_scrapy/spiders/honduras_portal_releases.py @@ -1,38 +1,18 @@ import hashlib -import json import scrapy -from kingfisher_scrapy.base_spider import BaseSpider -from 
kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.base_spider import LinksSpider -class HondurasPortalReleases(BaseSpider): +class HondurasPortalReleases(LinksSpider): name = 'honduras_portal_releases' + data_type = 'release_package' + data_pointer = '/releasePackage' + next_pointer = '/next' + download_delay = 0.9 def start_requests(self): url = 'http://www.contratacionesabiertas.gob.hn/api/v1/release/?format=json' - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) - - @handle_error - def parse(self, response): - json_data = json.loads(response.text) - yield self.build_file( - json.dumps(json_data['releasePackage']).encode(), - response.request.meta['kf_filename'], - data_type='release_package', - url=response.request.url - ) - - url = json_data.get('next') - if not url or self.sample: - return - else: - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) + yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) diff --git a/kingfisher_scrapy/spiders/indonesia_bandung.py b/kingfisher_scrapy/spiders/indonesia_bandung.py index 0b6c7a683..b5af803be 100644 --- a/kingfisher_scrapy/spiders/indonesia_bandung.py +++ b/kingfisher_scrapy/spiders/indonesia_bandung.py @@ -48,8 +48,4 @@ def parse(self, response): json_data = json.loads(response.text) if len(json_data) == 0: return - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type='release' - ) + yield self.build_file_from_response(response, data_type='release') diff --git a/kingfisher_scrapy/spiders/kenya_makueni.py b/kingfisher_scrapy/spiders/kenya_makueni.py index cccda785d..11d4d9acd 100644 --- a/kingfisher_scrapy/spiders/kenya_makueni.py +++ b/kingfisher_scrapy/spiders/kenya_makueni.py @@ -3,22 +3,21 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class KenyaMakueni(BaseSpider): +class KenyaMakueni(SimpleSpider): name = 'kenya_makueni' + data_type = 'release_package_list' url = 'https://opencontracting.makueni.go.ke/api/ocds/package/all?pageSize={}&pageNumber={}' def start_requests(self): if self.sample: - page_number = 0 - page_size = 10 + url = self.url.format(10, 0) yield scrapy.Request( - self.url.format(page_size, page_number), - meta={'kf_filename': hashlib.md5((self.url + - str(page_number)).encode('utf-8')).hexdigest() + '.json'} + url, + meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} ) else: yield scrapy.Request( @@ -33,13 +32,8 @@ def parse_count(self, response): page_size = 300 for page_number in range((ceil(total / page_size))): + url = self.url.format(page_size, page_number) yield scrapy.Request( - self.url.format(page_size, page_number), - meta={'kf_filename': hashlib.md5((self.url + - str(page_number)).encode('utf-8')).hexdigest() + '.json'} + url, + meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package_list') diff --git a/kingfisher_scrapy/spiders/malta.py b/kingfisher_scrapy/spiders/malta.py index 887df95de..1c3dfe4b0 100644 --- a/kingfisher_scrapy/spiders/malta.py +++ b/kingfisher_scrapy/spiders/malta.py @@ -10,8 +10,7 @@ class Malta(ZipSpider): name = 
'malta' - - parse_zipfile_kwargs = {'data_type': 'record_package'} + data_type = 'record_package' def start_requests(self): yield scrapy.Request( diff --git a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py index 00536da39..8d1b98f83 100644 --- a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py +++ b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py @@ -14,7 +14,7 @@ class MexicoAdministracionPublicaFederal(BaseSpider): def start_requests(self): yield scrapy.Request( - url='https://api.datos.gob.mx/v1/contratacionesabiertas', + 'https://api.datos.gob.mx/v1/contratacionesabiertas', meta={'kf_filename': 'page1.json'} ) @@ -23,11 +23,7 @@ def parse(self, response): data = json.loads(response.text) # Actual data - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type="record_package_list_in_results" - ) + yield self.build_file_from_response(response, data_type='record_package_list_in_results') # Load more pages? if data['pagination']['page'] == 1 and not self.sample: @@ -36,7 +32,7 @@ def parse(self, response): limit = data['pagination']['pageSize'] while ((page - 1) * limit) < total: yield scrapy.Request( - url='https://api.datos.gob.mx/v1/contratacionesabiertas?page=%d' % page, + 'https://api.datos.gob.mx/v1/contratacionesabiertas?page=%d' % page, meta={'kf_filename': 'page' + str(page) + '.json'} ) page += 1 diff --git a/kingfisher_scrapy/spiders/mexico_cdmx.py b/kingfisher_scrapy/spiders/mexico_cdmx.py index 33f73da03..7a96ae64a 100644 --- a/kingfisher_scrapy/spiders/mexico_cdmx.py +++ b/kingfisher_scrapy/spiders/mexico_cdmx.py @@ -2,16 +2,17 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class MexicoCDMXSource(BaseSpider): +class MexicoCDMXSource(SimpleSpider): name = 'mexico_cdmx' + data_type = 'release_package' def start_requests(self): yield scrapy.Request( - url='http://www.contratosabiertos.cdmx.gob.mx/api/contratos/todos', + 'http://www.contratosabiertos.cdmx.gob.mx/api/contratos/todos', meta={'kf_filename': 'list.json'}, callback=self.parse_list ) @@ -23,13 +24,4 @@ def parse_list(self, response): data = [data[0]] for data_item in data: - yield scrapy.Request( - url=data_item['uri'], - meta={'kf_filename': 'id%s.json' % data_item['id']}, - callback=self.parse_record - ) - - @handle_error - def parse_record(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield scrapy.Request(data_item['uri'], meta={'kf_filename': 'id%s.json' % data_item['id']}) diff --git a/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py b/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py index 2d72d6aac..08288d218 100644 --- a/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py +++ b/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py @@ -1,19 +1,14 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.base_spider import SimpleSpider -class MexicoGrupoAeroporto(BaseSpider): +class MexicoGrupoAeroporto(SimpleSpider): name = 'mexico_grupo_aeroporto' + data_type = 'release_package' def start_requests(self): yield scrapy.Request( - url='http://gacmda.gacm.mx:8880/files/opendata/coleccion/concentrado05032019RELEASE.json', + 
'http://gacmda.gacm.mx:8880/files/opendata/coleccion/concentrado05032019RELEASE.json', meta={'kf_filename': 'concentrado05032019RELEASE.json'} ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') diff --git a/kingfisher_scrapy/spiders/mexico_inai.py b/kingfisher_scrapy/spiders/mexico_inai.py index 06160b3c4..3cb7bbd56 100644 --- a/kingfisher_scrapy/spiders/mexico_inai.py +++ b/kingfisher_scrapy/spiders/mexico_inai.py @@ -3,16 +3,18 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class MexicoINAI(BaseSpider): +class MexicoINAI(SimpleSpider): name = 'mexico_inai' + data_type = 'release_package' + encoding = 'utf-8-sig' def start_requests(self): yield scrapy.Request( - url='https://datos.gob.mx/busca/api/3/action/package_search?q=organization:inai&rows=500', + 'https://datos.gob.mx/busca/api/3/action/package_search?q=organization:inai&rows=500', meta={'kf_filename': 'list.json'}, callback=self.parse_list ) @@ -25,7 +27,7 @@ def parse_list(self, response): if resource['format'] == 'JSON': kf_filename = 'redirect-' + hashlib.md5(resource['url'].encode('utf-8')).hexdigest() + '.json' yield scrapy.Request( - url=resource['url'], + resource['url'], meta={ 'kf_filename': kf_filename, 'dont_redirect': True @@ -37,18 +39,8 @@ def parse_redirect(self, response): if response.status == 301: url = response.headers['Location'].decode("utf-8").replace("open?", "uc?export=download&") yield scrapy.Request( - url=url, - meta={'kf_filename': 'data-' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}, - callback=self.parse + url, + meta={'kf_filename': 'data-' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} ) else: yield self.build_file_error_from_response(response) - - @handle_error - def parse(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type="release_package", - encoding='utf-8-sig' - ) diff --git a/kingfisher_scrapy/spiders/mexico_jalisco.py b/kingfisher_scrapy/spiders/mexico_jalisco.py index 15809fd5b..7aef33eba 100644 --- a/kingfisher_scrapy/spiders/mexico_jalisco.py +++ b/kingfisher_scrapy/spiders/mexico_jalisco.py @@ -3,16 +3,17 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class MexicoJalisco(BaseSpider): +class MexicoJalisco(SimpleSpider): name = 'mexico_jalisco' + data_type = 'release_package' def start_requests(self): yield scrapy.Request( - url='https://contratacionesabiertas.jalisco.gob.mx/OCApi/2017/contracts', + 'https://contratacionesabiertas.jalisco.gob.mx/OCApi/2017/contracts', meta={'kf_filename': 'list.json'}, callback=self.parse_list ) @@ -24,7 +25,7 @@ def parse_list(self, response): datas = [datas[0]] for data in datas: yield scrapy.Request( - url=data['URIContract'], + data['URIContract'], meta={'kf_filename': 'id%s.json' % data['ocid']}, callback=self.parse_record_package ) @@ -35,14 +36,7 @@ def parse_record_package(self, response): if 'packages' in json_data: for url in json_data['packages']: yield scrapy.Request( - url=url, - meta={'kf_filename': 'packages-%s.json' % hashlib.md5(url.encode('utf-8')).hexdigest()}, - callback=self.parse_release_package + url, + meta={'kf_filename': 'packages-%s.json' % 
hashlib.md5(url.encode('utf-8')).hexdigest()} ) - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='record_package') - - @handle_error - def parse_release_package(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield self.build_file_from_response(response, data_type='record_package') diff --git a/kingfisher_scrapy/spiders/mexico_quien_es_quien.py b/kingfisher_scrapy/spiders/mexico_quien_es_quien.py index dbc8bb161..c999d7670 100644 --- a/kingfisher_scrapy/spiders/mexico_quien_es_quien.py +++ b/kingfisher_scrapy/spiders/mexico_quien_es_quien.py @@ -45,9 +45,8 @@ def parse_count(self, response): @handle_error def parse(self, response): json_data = json.loads(response.text) - yield self.build_file( - json.dumps(json_data['data']).encode(), - response.request.meta['kf_filename'], - data_type='record_package_list', - url=response.request.url + yield self.build_file_from_response( + response, + data=json.dumps(json_data['data']).encode(), + data_type='record_package_list' ) diff --git a/kingfisher_scrapy/spiders/moldova.py b/kingfisher_scrapy/spiders/moldova.py index e87e9a88c..fb146a613 100644 --- a/kingfisher_scrapy/spiders/moldova.py +++ b/kingfisher_scrapy/spiders/moldova.py @@ -20,17 +20,16 @@ class Moldova(BaseSpider): def start_requests(self): for endpoint, url in self.endpoints.items(): yield scrapy.Request( - url=url, + url, meta={'kf_filename': 'meta-{}-start.json'.format(endpoint), 'endpoint': endpoint, 'data': False} ) @handle_error def parse(self, response): if response.request.meta['data']: - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='record_package') + yield self.build_file_from_response(response, data_type='record_package') else: - self.build_file_from_response(response, response.request.meta['kf_filename']) + self.build_file_from_response(response) json_data = json.loads(response.text) offset = json_data.get('offset') # not having an offset in the data means the data has come to an end. 
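The keyword-only ``build_file_from_response(response, **kwargs)`` introduced in the base_spider.py hunk lets spiders such as mexico_quien_es_quien override only ``data`` while ``file_name`` and ``url`` are still taken from the response. The following is an illustrative sketch of that defaulting behaviour, not part of the patch; the FakeRequest/FakeResponse classes are stand-ins for Scrapy objects so the snippet runs on its own:

    class FakeRequest:
        def __init__(self, url, meta):
            self.url = url
            self.meta = meta

    class FakeResponse:
        def __init__(self, url, body, meta):
            self.request = FakeRequest(url, meta)
            self.body = body

    def build_file_kwargs(response, **kwargs):
        # Mirrors BaseSpider.build_file_from_response: only the keys the spider
        # omits are filled in from the response.
        if 'file_name' not in kwargs:
            kwargs['file_name'] = response.request.meta['kf_filename']
        if 'url' not in kwargs:
            kwargs['url'] = response.request.url
        if 'data' not in kwargs:
            kwargs['data'] = response.body
        return kwargs

    response = FakeResponse('https://example.com/api/package.json', b'{"releases": []}', {'kf_filename': 'all.json'})
    print(build_file_kwargs(response, data_type='release_package'))
    # file_name, url and data come from the response; data_type is kept as passed.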
@@ -42,7 +41,7 @@ def parse(self, response): for data in json_data.get('data', []): yield scrapy.Request( - url=endpoint_url + data['ocid'], + endpoint_url + data['ocid'], meta={ 'kf_filename': 'data-{}-{}.json'.format(endpoint, data['ocid']), 'endpoint': endpoint, @@ -54,7 +53,7 @@ def parse(self, response): return yield scrapy.Request( - url=endpoint_url + '?offset=' + offset, + endpoint_url + '?offset=' + offset, meta={ 'kf_filename': 'meta-{}-{}.json'.format(endpoint, offset), 'endpoint': endpoint, diff --git a/kingfisher_scrapy/spiders/moldova_old.py b/kingfisher_scrapy/spiders/moldova_old.py index 267536ed0..a8324c211 100644 --- a/kingfisher_scrapy/spiders/moldova_old.py +++ b/kingfisher_scrapy/spiders/moldova_old.py @@ -1,26 +1,21 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.base_spider import SimpleSpider -class MoldovaOld(BaseSpider): +class MoldovaOld(SimpleSpider): name = 'moldova_old' + data_type = 'release_package' def start_requests(self): if self.sample: yield scrapy.Request( - url='http://opencontracting.date.gov.md/ocds-api/year/2017', + 'http://opencontracting.date.gov.md/ocds-api/year/2017', meta={'kf_filename': 'sample.json'} ) else: for year in range(2012, 2018): yield scrapy.Request( - url='http://opencontracting.date.gov.md/ocds-api/year/%d' % year, + 'http://opencontracting.date.gov.md/ocds-api/year/%d' % year, meta={'kf_filename': 'year-%d.json' % year} ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') diff --git a/kingfisher_scrapy/spiders/moldova_records.py b/kingfisher_scrapy/spiders/moldova_records.py index 669ee79f5..3d6bc5a1d 100644 --- a/kingfisher_scrapy/spiders/moldova_records.py +++ b/kingfisher_scrapy/spiders/moldova_records.py @@ -9,6 +9,6 @@ class MoldovaRecords(LinksSpider): def start_requests(self): yield scrapy.Request( - url='http://ocds.mepps.openprocurement.io/api/records.json', + 'http://ocds.mepps.openprocurement.io/api/records.json', meta={'kf_filename': 'page1.json'} ) diff --git a/kingfisher_scrapy/spiders/moldova_releases.py b/kingfisher_scrapy/spiders/moldova_releases.py index aff804665..81da3a56e 100644 --- a/kingfisher_scrapy/spiders/moldova_releases.py +++ b/kingfisher_scrapy/spiders/moldova_releases.py @@ -9,6 +9,6 @@ class MoldovaReleases(LinksSpider): def start_requests(self): yield scrapy.Request( - url='http://ocds.mepps.openprocurement.io/api/releases.json', + 'http://ocds.mepps.openprocurement.io/api/releases.json', meta={'kf_filename': 'page1.json'} ) diff --git a/kingfisher_scrapy/spiders/nepal_dhangadhi.py b/kingfisher_scrapy/spiders/nepal_dhangadhi.py index 342e9d208..1b0f635a9 100644 --- a/kingfisher_scrapy/spiders/nepal_dhangadhi.py +++ b/kingfisher_scrapy/spiders/nepal_dhangadhi.py @@ -3,12 +3,13 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class NepalDhangadhi(BaseSpider): - name = "nepal_dhangadhi" +class NepalDhangadhi(SimpleSpider): + name = 'nepal_dhangadhi' + data_type = 'release_package' def start_requests(self): yield scrapy.Request( @@ -30,11 +31,3 @@ def parse_list(self, response): ) if self.sample: break - - @handle_error - def parse(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type='release_package' - ) 
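The SimpleSpider and LinksSpider subclasses above differ mainly in their ``data_type``, ``data_pointer`` and ``next_pointer`` class attributes. Below is a small sketch, outside the patch, of how those pointers are resolved with ``jsonpointer.resolve_pointer``, as ``SimpleSpider.parse`` and ``LinksSpider.next_link`` do in base_spider.py; the document and next-page URL are invented for illustration, loosely shaped like the Armenia and Honduras portal responses:

    import json

    from jsonpointer import resolve_pointer

    page = json.loads("""
    {
        "releasePackage": {"releases": []},
        "next_page": {"uri": "https://example.com/releases?page=2"}
    }
    """)

    # data_pointer = '/releasePackage' selects the OCDS data to store.
    print(resolve_pointer(page, '/releasePackage'))

    # next_pointer = '/next_page/uri' (as in the armenia spider) finds the next page.
    print(resolve_pointer(page, '/next_page/uri', None))

    # The default next_pointer, '/links/next', is absent here, so next_link would stop (None).
    print(resolve_pointer(page, '/links/next', None))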
diff --git a/kingfisher_scrapy/spiders/nepal_portal.py b/kingfisher_scrapy/spiders/nepal_portal.py index 18fef550d..3cc709531 100644 --- a/kingfisher_scrapy/spiders/nepal_portal.py +++ b/kingfisher_scrapy/spiders/nepal_portal.py @@ -3,12 +3,12 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.base_spider import SimpleSpider -class NepalPortal(BaseSpider): +class NepalPortal(SimpleSpider): name = 'nepal_portal' + data_type = 'release_package' def start_requests(self): if self.sample: @@ -26,11 +26,3 @@ def start_requests(self): meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} ) current_year += 1 - - @handle_error - def parse(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type='release_package' - ) diff --git a/kingfisher_scrapy/spiders/nigeria_portal.py b/kingfisher_scrapy/spiders/nigeria_portal.py index 8b532503f..82eac8521 100644 --- a/kingfisher_scrapy/spiders/nigeria_portal.py +++ b/kingfisher_scrapy/spiders/nigeria_portal.py @@ -2,12 +2,14 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class NigeriaPortal(BaseSpider): +class NigeriaPortal(SimpleSpider): name = 'nigeria_portal' + data_type = 'release_package' + download_delay = 0.9 user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36' # noqa: E501 @@ -15,10 +17,11 @@ def start_requests(self): yield scrapy.Request( 'http://nocopo.bpp.gov.ng/OpenData.aspx', meta={'kf_filename': 'list.html'}, + callback=self.parse_list ) @handle_error - def parse(self, response): + def parse_list(self, response): formdata = { '__VIEWSTATE': response.css('input#__VIEWSTATE::attr(value)').extract_first(), '__VIEWSTATEGENERATOR': 'CA0B0334', @@ -36,14 +39,5 @@ def parse(self, response): yield scrapy.FormRequest.from_response( response, formdata=formdata, - meta={'kf_filename': hashlib.md5(response.url.encode('utf-8')).hexdigest() + '.json'}, - callback=self.parse_post - ) - - @handle_error - def parse_post(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type='release_package' + meta={'kf_filename': hashlib.md5(response.url.encode('utf-8')).hexdigest() + '.json'} ) diff --git a/kingfisher_scrapy/spiders/openopps.py b/kingfisher_scrapy/spiders/openopps.py index 79bb8bb59..c3399af98 100644 --- a/kingfisher_scrapy/spiders/openopps.py +++ b/kingfisher_scrapy/spiders/openopps.py @@ -66,7 +66,7 @@ def from_crawler(cls, crawler, *args, **kwargs): def start_requests(self): """ Start requesting access token """ yield scrapy.Request( - url="https://api.openopps.com/api/api-token-auth/", + 'https://api.openopps.com/api/api-token-auth/', method='POST', headers={"Accept": "*/*", "Content-Type": "application/json"}, body=json.dumps({"username": self.username, "password": self.password}), @@ -109,11 +109,11 @@ def start_requests_pages(self): # Case if we want to download a sample if self.sample: date = datetime(2011, 1, 1) - yield from self.parse_date_list(date, date, search_h) + yield from self.request_range_per_day(date, date, search_h) else: # Case if we have date range parameters if self.from_date and self.until_date: - yield from self.parse_date_list(self.from_date, self.until_date, search_h) + yield from 
self.request_range_per_day(self.from_date, self.until_date, search_h) else: # Use larger ranges for filters with less than (api_limit) search results release_date_gte_list = ['', '2009-01-01', '2010-01-01', '2010-07-01'] @@ -127,19 +127,21 @@ def start_requests_pages(self): start_date = datetime(year, 1, 1) end_date = datetime(year, datetime.now().month, datetime.now().day) \ if year == datetime.now().year else datetime(year, 12, 31) - yield from self.parse_date_list(start_date, end_date, search_h) + yield from self.request_range_per_day(start_date, end_date, search_h) def request_range(self, start_date, end_date, search_h): + url = self.base_page_url.format(start_date, end_date) return scrapy.Request( - url=self.base_page_url.format( - start_date, - end_date - ), - headers={"Accept": "*/*", "Content-Type": "application/json"}, - meta={"release_date": start_date, "search_h": search_h}, + url, + meta={ + 'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json', + 'release_date': start_date, + 'search_h': search_h, + }, + headers={'Accept': '*/*', 'Content-Type': 'application/json'} ) - def parse_date_list(self, start_date, end_date, search_h): + def request_range_per_day(self, start_date, end_date, search_h): date_list = [(start_date + timedelta(days=d)).strftime("%Y-%m-%d") for d in range((end_date - start_date).days + 1)] @@ -163,12 +165,7 @@ def parse(self, response): all_data.append(json_data) if all_data: - yield self.build_file( - all_data, - filename=hashlib.md5(response.request.url.encode('utf-8')).hexdigest() + '.json', - url=response.request.url, - data_type='release_package_list' - ) + yield self.build_file_from_response(data=all_data, data_type='release_package_list') if self.sample: return @@ -190,7 +187,7 @@ def parse(self, response): self.logger.info('Time_diff: {}'.format(time_diff.total_seconds())) self.reauthenticating = True yield scrapy.Request( - url="https://api.openopps.com/api/api-token-auth/", + 'https://api.openopps.com/api/api-token-auth/', method='POST', headers={"Accept": "*/*", "Content-Type": "application/json"}, body=json.dumps({"username": self.username, "password": self.password}), @@ -243,5 +240,4 @@ def parse(self, response): self.logger.info('Status: {}. Results exceeded in a range of one hour, we save the ' 'first 10,000 data for: {}'.format(response.status, response.request.url)) else: - yield self.build_file_error_from_response( - response, filename=hashlib.md5(response.request.url.encode('utf-8')).hexdigest()) + yield self.build_file_error_from_response(response) diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py index 69b2d4a27..4c6e265fc 100644 --- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py +++ b/kingfisher_scrapy/spiders/paraguay_dncp_base.py @@ -4,12 +4,12 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.exceptions import AuthenticationError from kingfisher_scrapy.util import handle_error -class ParaguayDNCPBaseSpider(BaseSpider): +class ParaguayDNCPBaseSpider(SimpleSpider): """ This base class contains methods used for Paraguay DNCP's authentication protocol. 
""" @@ -137,14 +137,6 @@ def parse_pages(self, response): callback=self.parse_pages ) - @handle_error - def parse(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type=self.data_type - ) - def get_files_to_download(self, content): """ Override this """ diff --git a/kingfisher_scrapy/spiders/paraguay_hacienda.py b/kingfisher_scrapy/spiders/paraguay_hacienda.py index b0db40f4d..d10ebb989 100644 --- a/kingfisher_scrapy/spiders/paraguay_hacienda.py +++ b/kingfisher_scrapy/spiders/paraguay_hacienda.py @@ -63,7 +63,7 @@ def parse(self, response): total_pages = data['meta']['totalPages'] for page in range(2, total_pages+1): yield scrapy.Request( - url=self.base_list_url.format(page), + self.base_list_url.format(page), meta={ 'kf_filename': 'list-{}.json'.format(page), 'meta': True, @@ -83,7 +83,7 @@ def parse(self, response): if row['idLlamado'] and row['idLlamado'] not in self.release_ids: self.release_ids.append(row['idLlamado']) yield scrapy.Request( - url=base_url.format(row['idLlamado']), + base_url.format(row['idLlamado']), meta={ 'kf_filename': 'release-{}.json'.format(row['idLlamado']), 'meta': False, @@ -92,8 +92,7 @@ def parse(self, response): dont_filter=True ) else: - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield self.build_file_from_response(response, data_type='release_package') def request_access_token(self): """ Requests a new access token """ diff --git a/kingfisher_scrapy/spiders/portugal.py b/kingfisher_scrapy/spiders/portugal.py index 049e4b532..7a3b5f5cc 100644 --- a/kingfisher_scrapy/spiders/portugal.py +++ b/kingfisher_scrapy/spiders/portugal.py @@ -9,17 +9,19 @@ class Portugal(ZipSpider): name = 'portugal' + data_type = 'record_package' + encoding = 'iso-8859-1' + zip_file_format = 'json_lines' + download_warnsize = 0 download_timeout = 9999 - parse_zipfile_kwargs = {'data_type': 'record_package', 'file_format': 'json_lines', 'encoding': 'iso-8859-1'} - def start_requests(self): url = 'https://dados.gov.pt/api/1/datasets/?q=ocds&organization={}&page_size={}' id = '5ae97fa2c8d8c915d5faa3bf' page_size = 20 yield scrapy.Request( - url=url.format(id, page_size), + url.format(id, page_size), meta={'kf_filename': 'list.json'}, callback=self.parse_list ) diff --git a/kingfisher_scrapy/spiders/scotland.py b/kingfisher_scrapy/spiders/scotland.py index 49eb7597c..76b365dcd 100644 --- a/kingfisher_scrapy/spiders/scotland.py +++ b/kingfisher_scrapy/spiders/scotland.py @@ -2,12 +2,12 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.base_spider import SimpleSpider -class Scotland(BaseSpider): +class Scotland(SimpleSpider): name = 'scotland' + data_type = 'release_package' notice_types = [ 1, # OJEU - F1 - Prior Information Notice @@ -40,7 +40,7 @@ def start_requests(self): if self.sample: marker = now - datetime.timedelta(days=14) for notice_type in self.notice_types: - yield scrapy.Request(url=format_string.format(marker, notice_type), + yield scrapy.Request(format_string.format(marker, notice_type), meta={'kf_filename': 'sample_{}.json'.format(notice_type)}) else: # It's meant to go back a year, but in testing it seemed to be year minus one day! 
@@ -48,11 +48,6 @@ def start_requests(self): while marker <= now: datestring = '{:04d}-{:02d}-{:02d}'.format(marker.year, marker.month, marker.day) for notice_type in self.notice_types: - yield scrapy.Request(url=format_string.format(datestring, notice_type), + yield scrapy.Request(format_string.format(datestring, notice_type), meta={'kf_filename': '{}_type_{}.json'.format(datestring, notice_type)}) marker = marker + datetime.timedelta(days=14) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') diff --git a/kingfisher_scrapy/spiders/test_fail.py b/kingfisher_scrapy/spiders/test_fail.py index 94162e4df..e27ef2343 100644 --- a/kingfisher_scrapy/spiders/test_fail.py +++ b/kingfisher_scrapy/spiders/test_fail.py @@ -3,36 +3,25 @@ """ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.base_spider import SimpleSpider -class TestFail(BaseSpider): - name = "test_fail" +class TestFail(SimpleSpider): + name = 'test_fail' + data_type = 'release_package' def start_requests(self): # Fine yield scrapy.Request( - url='https://raw.githubusercontent.com/open-contracting/sample-data/master/fictional-example/1.1/ocds-213czf-000-00001-01-planning.json', # noqa: E501 + 'https://raw.githubusercontent.com/open-contracting/sample-data/master/fictional-example/1.1/ocds-213czf-000-00001-01-planning.json', # noqa: E501 meta={'kf_filename': 'fine.json'} ) # A straight 404 yield scrapy.Request( - url='https://www.open-contracting.org/i-want-a-kitten', + 'https://www.open-contracting.org/i-want-a-kitten', meta={'kf_filename': 'http-404.json'} ) # I broke the server .... - yield scrapy.Request( - url='http://httpstat.us/500', - meta={'kf_filename': 'http-500.json'} - ) + yield scrapy.Request('http://httpstat.us/500', meta={'kf_filename': 'http-500.json'}) # .... 
but actually, yes, I also broke the Proxy too - yield scrapy.Request( - url='http://httpstat.us/502', - meta={'kf_filename': 'http-502.json'} - ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield scrapy.Request('http://httpstat.us/502', meta={'kf_filename': 'http-502.json'}) diff --git a/kingfisher_scrapy/spiders/uganda_releases.py b/kingfisher_scrapy/spiders/uganda_releases.py index 73b08bd27..1e8f5305d 100644 --- a/kingfisher_scrapy/spiders/uganda_releases.py +++ b/kingfisher_scrapy/spiders/uganda_releases.py @@ -3,12 +3,14 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class Uganda(BaseSpider): +class Uganda(SimpleSpider): name = 'uganda_releases' + data_type = 'release_package' + download_delay = 0.9 def start_requests(self): @@ -65,11 +67,3 @@ def parse_data(self, response): break if self.sample: break - - @handle_error - def parse(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type='release_package' - ) diff --git a/kingfisher_scrapy/spiders/uk_contracts_finder.py b/kingfisher_scrapy/spiders/uk_contracts_finder.py index d1629a754..333cc89c9 100644 --- a/kingfisher_scrapy/spiders/uk_contracts_finder.py +++ b/kingfisher_scrapy/spiders/uk_contracts_finder.py @@ -11,25 +11,18 @@ class UKContractsFinder(BaseSpider): base_url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=%d' def start_requests(self): - yield scrapy.Request( - url=self.base_url % 1, - meta={'kf_filename': 'page1.json'} - ) + yield scrapy.Request(self.base_url % 1, meta={'kf_filename': 'page1.json'}) @handle_error def parse(self, response): yield self.build_file_from_response( response, - response.request.meta['kf_filename'], data_type='release_package_list_in_results', - encoding='ISO-8859-1' + encoding='iso-8859-1' ) if not self.sample and response.request.meta['kf_filename'] == 'page1.json': json_data = json.loads(response.text) last_page = json_data['maxPage'] for page in range(1, last_page + 1): - yield scrapy.Request( - url=self.base_url % page, - meta={'kf_filename': 'page%d.json' % page} - ) + yield scrapy.Request(self.base_url % page, meta={'kf_filename': 'page%d.json' % page}) diff --git a/kingfisher_scrapy/spiders/uruguay_base.py b/kingfisher_scrapy/spiders/uruguay_base.py index f4486eeb5..f1d6152a8 100644 --- a/kingfisher_scrapy/spiders/uruguay_base.py +++ b/kingfisher_scrapy/spiders/uruguay_base.py @@ -3,15 +3,15 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider -from kingfisher_scrapy.util import handle_error +from kingfisher_scrapy.base_spider import SimpleSpider -class UruguayBase(BaseSpider): - base_url = 'http://comprasestatales.gub.uy/ocds/rss/{year:d}/{month:02d}' +class UruguayBase(SimpleSpider): download_delay = 0.9 def start_requests(self): + base_url = 'http://comprasestatales.gub.uy/ocds/rss/{year:d}/{month:02d}' + current_date = date(2017, 11, 1) if self.sample: end_date = date(2017, 12, 1) @@ -22,17 +22,9 @@ def start_requests(self): current_date += timedelta(days=32) current_date.replace(day=1) - url = self.base_url.format(year=current_date.year, month=current_date.month) + url = base_url.format(year=current_date.year, month=current_date.month) yield scrapy.Request( url, meta={'kf_filename': 
hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}, callback=self.parse_list ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type=response.request.meta['data_type'] - ) diff --git a/kingfisher_scrapy/spiders/uruguay_historical.py b/kingfisher_scrapy/spiders/uruguay_historical.py index e7c0b96d0..1d87df7c1 100644 --- a/kingfisher_scrapy/spiders/uruguay_historical.py +++ b/kingfisher_scrapy/spiders/uruguay_historical.py @@ -5,6 +5,8 @@ class UruguayHistorical(ZipSpider): name = 'uruguay_historical' + data_type = 'release_package' + # the files take too long to download, so we increase the download timeout download_timeout = 1000 custom_settings = { @@ -14,8 +16,6 @@ class UruguayHistorical(ZipSpider): 'Chrome/37.0.2049.0 Safari/537.36', } - - parse_zipfile_kwargs = {'data_type': 'release_package'} - def start_requests(self): base_url = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites/agencia-compras-contrataciones' \ '-estado/files/2019-04/OCDS-{}.zip' diff --git a/kingfisher_scrapy/spiders/uruguay_records.py b/kingfisher_scrapy/spiders/uruguay_records.py index 1941d89ae..1e914bd1c 100644 --- a/kingfisher_scrapy/spiders/uruguay_records.py +++ b/kingfisher_scrapy/spiders/uruguay_records.py @@ -8,22 +8,16 @@ class UruguayRecords(UruguayBase): name = 'uruguay_records' - base_record_url = 'https://www.comprasestatales.gub.uy/ocds/record/{}' + data_type = 'record_package' @handle_error def parse_list(self, response): + base_record_url = 'https://www.comprasestatales.gub.uy/ocds/record/{}' root = response.xpath('//item/title/text()').getall() if self.sample: root = [root[0]] for id_compra in root: - url = self.get_url_compra(id_compra) - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json', - 'data_type': 'record_package'} - ) - - def get_url_compra(self, text): - return self.base_record_url.format(text.split(',')[0].replace('id_compra:', '')) + url = base_record_url.format(id_compra.split(',')[0].replace('id_compra:', '')) + yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) diff --git a/kingfisher_scrapy/spiders/uruguay_releases.py b/kingfisher_scrapy/spiders/uruguay_releases.py index 81e6b4f2c..832cc0274 100644 --- a/kingfisher_scrapy/spiders/uruguay_releases.py +++ b/kingfisher_scrapy/spiders/uruguay_releases.py @@ -8,6 +8,7 @@ class UruguayReleases(UruguayBase): name = 'uruguay_releases' + data_type = 'release_package' @handle_error def parse_list(self, response): @@ -17,8 +18,4 @@ def parse_list(self, response): root = [root[0]] for url in root: - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json', - 'data_type': 'release_package'} - ) + yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) diff --git a/kingfisher_scrapy/spiders/zambia.py b/kingfisher_scrapy/spiders/zambia.py index 0302d80a3..b82185d3d 100644 --- a/kingfisher_scrapy/spiders/zambia.py +++ b/kingfisher_scrapy/spiders/zambia.py @@ -8,8 +8,7 @@ class Zambia(ZipSpider): name = 'zambia' - - parse_zipfile_kwargs = {'data_type': 'record_package'} + data_type = 'record_package' def start_requests(self): yield scrapy.Request( diff --git a/kingfisher_scrapy/util.py b/kingfisher_scrapy/util.py index ef2f3a361..d425d81ca 100644 --- a/kingfisher_scrapy/util.py +++ b/kingfisher_scrapy/util.py @@ -14,8 +14,6 @@ 
def handle_error(decorated): """ @wraps(decorated) def wrapper(self, response): - # All 2xx codes are successful. - # https://tools.ietf.org/html/rfc7231#section-6.3 if self.is_http_success(response): yield from decorated(self, response) else: diff --git a/requirements.in b/requirements.in index 34c50ac3d..df5a116ec 100644 --- a/requirements.in +++ b/requirements.in @@ -1,6 +1,7 @@ # Any change to this file MUST be replicated in: # https://github.com/open-contracting/deploy/blob/master/salt/ocdskingfishercollect/scrapyd-requirements.txt +jsonpointer rarfile requests Scrapy diff --git a/requirements.txt b/requirements.txt index 16b7f6ad4..c126ac982 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,6 @@ automat==0.8.0 # via twisted certifi==2019.11.28 # via requests cffi==1.13.2 # via cryptography chardet==3.0.4 # via requests -click==7.1.2 # via pip-tools constantly==15.1.0 # via twisted cryptography==2.8 # via pyopenssl, scrapy, service-identity cssselect==1.1.0 # via parsel, scrapy @@ -17,9 +16,9 @@ hyperlink==19.0.0 # via twisted idna==2.8 # via hyperlink, requests ijson==3.0.3 incremental==17.5.0 # via twisted +jsonpointer==2.0 lxml==4.4.2 # via parsel, scrapy parsel==1.5.2 # via scrapy -pip-tools==5.1.0 protego==0.1.16 # via scrapy pyasn1-modules==0.2.7 # via service-identity pyasn1==0.4.8 # via pyasn1-modules, service-identity @@ -33,12 +32,11 @@ requests==2.22.0 scrapy==1.8.0 scrapyd-client==1.1.0 service-identity==18.1.0 # via scrapy -six==1.13.0 # via automat, cryptography, parsel, pip-tools, protego, pyhamcrest, pyopenssl, scrapy, scrapyd-client, w3lib +six==1.13.0 # via automat, cryptography, parsel, protego, pyhamcrest, pyopenssl, scrapy, scrapyd-client, w3lib twisted==20.3.0 # via scrapy urllib3==1.25.7 # via requests w3lib==1.21.0 # via parsel, scrapy zope.interface==4.7.1 # via scrapy, twisted # The following packages are considered to be unsafe in a requirements file: -# pip # setuptools diff --git a/requirements_dev.txt b/requirements_dev.txt index 1be8d3f0c..9ef3704a8 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -9,7 +9,7 @@ automat==0.8.0 certifi==2019.11.28 cffi==1.13.2 chardet==3.0.4 -click==7.1.2 +click==7.1.2 # via pip-tools constantly==15.1.0 coverage==5.0.3 # via coveralls, pytest-cov coveralls==2.0.0 @@ -24,6 +24,7 @@ ijson==3.0.3 importlib-metadata==1.3.0 # via pluggy, pytest incremental==17.5.0 isort==4.3.21 +jsonpointer==2.0 lxml==4.4.2 mccabe==0.6.1 # via flake8 more-itertools==8.0.2 # via pytest, zipp diff --git a/tests/test_base_spider.py b/tests/test_base_spider.py index e80295db0..61db44389 100644 --- a/tests/test_base_spider.py +++ b/tests/test_base_spider.py @@ -45,7 +45,8 @@ def test_build_file_from_response(): response.request = Mock() response.request.url = 'https://example.com/remote.json' - actual = spider.build_file_from_response(response, 'file.json', data_type='release_package', encoding='iso-8859-1') + actual = spider.build_file_from_response(response, file_name='file.json', data_type='release_package', + encoding='iso-8859-1') assert actual == File({ 'file_name': 'file.json', @@ -63,7 +64,8 @@ def test_build_file(): data = b'{"key": "value"}' url = 'https://example.com/remote.json' - actual = spider.build_file(data, 'file.json', url=url, data_type='release_package', encoding='iso-8859-1') + actual = spider.build_file(file_name='file.json', url=url, data=data, data_type='release_package', + encoding='iso-8859-1') assert actual == File({ 'file_name': 'file.json', diff --git a/tests/test_extensions.py 
b/tests/test_extensions.py index 332001430..cb51cdfcd 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -69,7 +69,7 @@ def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, d kwargs = {} if encoding: kwargs['encoding'] = encoding - item = spider.build_file(b'{"key": "value"}', 'file.json', url='https://example.com/remote.json', + item = spider.build_file(file_name='file.json', url='https://example.com/remote.json', data=b'{"key": "value"}', data_type='release_package', post_to_api=post_to_api, **kwargs) store_extension.item_scraped(item, spider) @@ -146,12 +146,12 @@ def test_item_scraped_file_item(sample, is_sample, note, encoding, encoding2, ok if encoding: kwargs['encoding'] = encoding item = spider.build_file_item( - 1, - b'{"key": "value"}', - data_type='release_package', + number=1, + file_name='data.json', url='https://example.com/remote.json', + data=b'{"key": "value"}', + data_type='release_package', encoding=encoding2, - file_name='data.json', ) api_extension.item_scraped(item, spider) @@ -294,7 +294,8 @@ def test_item_scraped_with_build_file_from_response(sample, path, tmpdir): response.request = Mock() response.request.url = 'https://example.com/remote.json' - item = spider.build_file_from_response(response, 'file.json', data_type='release_package', encoding='iso-8859-1') + item = spider.build_file_from_response(response, file_name='file.json', data_type='release_package', + encoding='iso-8859-1') store_extension.item_scraped(item, spider) with open(tmpdir.join(path)) as f: @@ -322,7 +323,8 @@ def test_item_scraped_with_build_file(sample, path, tmpdir): data = b'{"key": "value"}' url = 'https://example.com/remote.json' - item = spider.build_file(data, 'file.json', url=url, data_type='release_package', encoding='iso-8859-1') + item = spider.build_file(file_name='file.json', url=url, data=data, data_type='release_package', + encoding='iso-8859-1') store_extension.item_scraped(item, spider) with open(tmpdir.join(path)) as f: @@ -349,4 +351,4 @@ def test_build_file_with_existing_directory(): os.makedirs(os.path.join(files_store, 'test', '20010203_040506')) # No FileExistsError exception. 
- store_extension.item_scraped(spider.build_file(b'{"key": "value"}', 'file.json'), spider) + store_extension.item_scraped(spider.build_file(file_name='file.json', data=b'{"key": "value"}'), spider) diff --git a/tests/test_links_spider.py b/tests/test_links_spider.py index 19703e749..f41a26ee9 100644 --- a/tests/test_links_spider.py +++ b/tests/test_links_spider.py @@ -11,7 +11,7 @@ def test_next_link(): request = spider.next_link(response_fixture()) - assert isinstance(request, Request) + assert type(request) is Request assert request.url == 'http://example.com/next' assert request.meta == {'kf_filename': '166715ca8e5f3c1531156d8772b922b7.json'} @@ -22,7 +22,7 @@ def test_parse_404(): generator = spider.parse(response_fixture(status=404)) item = next(generator) - assert isinstance(item, FileError) + assert type(item) is FileError assert item == { 'file_name': 'test', 'url': 'http://example.com', @@ -41,7 +41,7 @@ def test_parse_200(): item = next(generator) request = next(generator) - assert isinstance(item, File) + assert type(item) is File assert item == { 'file_name': 'test', 'url': 'http://example.com', @@ -51,7 +51,7 @@ def test_parse_200(): 'post_to_api': True, } - assert isinstance(request, Request) + assert type(request) is Request assert request.url == 'http://example.com/next' assert request.meta == {'kf_filename': '166715ca8e5f3c1531156d8772b922b7.json'} diff --git a/tests/test_spiders.py b/tests/test_spiders.py index 1a4879765..be7a6c201 100644 --- a/tests/test_spiders.py +++ b/tests/test_spiders.py @@ -32,7 +32,7 @@ def test_start_requests_http_error(spider_name): assert len(items) == 1 for item in items: - assert isinstance(item, FileError) + assert type(item) is FileError assert len(item) == 3 assert item['errors'] == {'http_code': 555} assert item['file_name'] diff --git a/tests/test_validate.py b/tests/test_validate.py index 641c73e9d..9ebac699c 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,4 +1,5 @@ import pytest + from kingfisher_scrapy.exceptions import MissingRequiredFieldError from kingfisher_scrapy.items import File from kingfisher_scrapy.pipelines import Validate diff --git a/tests/test_zip_spider.py b/tests/test_zip_spider.py index 6fd2058c8..2c464f82c 100644 --- a/tests/test_zip_spider.py +++ b/tests/test_zip_spider.py @@ -9,18 +9,19 @@ from tests import response_fixture, spider_with_crawler -def test_parse_zipfile(): +def test_parse(): spider = spider_with_crawler(spider_class=ZipSpider) + spider.data_type = 'release_package' io = BytesIO() with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile: zipfile.writestr('test.json', '{}') response = response_fixture(body=io.getvalue()) - generator = spider.parse_zipfile(response, 'release_package') + generator = spider.parse(response) item = next(generator) - assert isinstance(item, File) + assert type(item) is File assert item == { 'file_name': 'test.json', 'url': 'http://example.com', @@ -35,8 +36,10 @@ def test_parse_zipfile(): @pytest.mark.parametrize('sample,len_items', [(None, 20), ('true', 10)]) -def test_parse_zipfile_json_lines(sample, len_items): +def test_parse_json_lines(sample, len_items): spider = spider_with_crawler(spider_class=ZipSpider, sample=sample) + spider.data_type = 'release_package' + spider.zip_file_format = 'json_lines' content = [] for i in range(1, 21): @@ -47,13 +50,13 @@ def test_parse_zipfile_json_lines(sample, len_items): zipfile.writestr('test.json', ''.join(content)) response = response_fixture(body=io.getvalue()) - generator = 
spider.parse_zipfile(response, 'release_package', file_format='json_lines') + generator = spider.parse(response) items = list(generator) assert len(items) == len_items for i, item in enumerate(items, 1): - assert isinstance(item, FileItem) + assert type(item) is FileItem assert item == { 'file_name': 'test.json', 'url': 'http://example.com', @@ -65,8 +68,10 @@ @pytest.mark.parametrize('sample,len_items,len_releases', [(None, 2, 100), ('true', 1, 10)]) -def test_parse_zipfile_release_package(sample, len_items, len_releases): +def test_parse_release_package(sample, len_items, len_releases): spider = spider_with_crawler(spider_class=ZipSpider, sample=sample) + spider.data_type = 'release_package' + spider.zip_file_format = 'release_package' package = {'releases': []} for i in range(200): @@ -77,13 +82,13 @@ zipfile.writestr('test.json', json.dumps(package)) response = response_fixture(body=io.getvalue()) - generator = spider.parse_zipfile(response, 'release_package', file_format='release_package') + generator = spider.parse(response) items = list(generator) assert len(items) == len_items for i, item in enumerate(items, 1): - assert isinstance(item, FileItem) + assert type(item) is FileItem assert len(item) == 6 assert item['file_name'] == 'test.json' assert item['url'] == 'http://example.com'