From 036bb9a45beb21f8af0590dd8d6daaf9fe6065f6 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 29 May 2020 19:00:30 -0400 Subject: [PATCH 01/19] build_file_from_response now takes file_name from request.meta if not provided. Update relevant spiders. Use `file_name` consistently, instead of `filename`. --- kingfisher_scrapy/base_spider.py | 17 ++++++++++---- .../spiders/afghanistan_records.py | 5 ++-- .../spiders/afghanistan_releases.py | 5 ++-- .../spiders/argentina_vialidad.py | 2 +- kingfisher_scrapy/spiders/armenia.py | 3 +-- kingfisher_scrapy/spiders/australia_nsw.py | 3 +-- .../spiders/canada_buyandsell.py | 3 +-- kingfisher_scrapy/spiders/canada_montreal.py | 6 +---- kingfisher_scrapy/spiders/chile_base.py | 3 +-- kingfisher_scrapy/spiders/colombia.py | 3 +-- kingfisher_scrapy/spiders/digiwhist_base.py | 2 +- kingfisher_scrapy/spiders/france.py | 6 +---- kingfisher_scrapy/spiders/honduras_cost.py | 6 +---- .../spiders/honduras_portal_bulk_files.py | 6 +---- .../spiders/honduras_portal_records.py | 23 ++++++------------- .../spiders/honduras_portal_releases.py | 23 ++++++------------- .../spiders/indonesia_bandung.py | 6 +---- kingfisher_scrapy/spiders/kenya_makueni.py | 3 +-- .../mexico_administracion_publica_federal.py | 6 +---- kingfisher_scrapy/spiders/mexico_cdmx.py | 3 +-- .../spiders/mexico_grupo_aeroporto.py | 3 +-- kingfisher_scrapy/spiders/mexico_inai.py | 7 +----- kingfisher_scrapy/spiders/mexico_jalisco.py | 6 ++--- .../spiders/mexico_quien_es_quien.py | 9 ++++---- kingfisher_scrapy/spiders/moldova.py | 5 ++-- kingfisher_scrapy/spiders/moldova_old.py | 3 +-- kingfisher_scrapy/spiders/nepal_dhangadhi.py | 6 +---- kingfisher_scrapy/spiders/nepal_portal.py | 6 +---- kingfisher_scrapy/spiders/nigeria_portal.py | 6 +---- kingfisher_scrapy/spiders/openopps.py | 4 ++-- .../spiders/paraguay_dncp_base.py | 6 +---- .../spiders/paraguay_hacienda.py | 3 +-- kingfisher_scrapy/spiders/scotland.py | 3 +-- kingfisher_scrapy/spiders/test_fail.py | 3 +-- kingfisher_scrapy/spiders/uganda_releases.py | 6 +---- .../spiders/uk_contracts_finder.py | 3 +-- kingfisher_scrapy/spiders/uruguay_base.py | 6 +---- kingfisher_scrapy/spiders/uruguay_records.py | 5 +--- 38 files changed, 69 insertions(+), 155 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index b97dbec2..87f7703c 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -99,18 +99,25 @@ def get_start_time(self, format): """ return self.crawler.stats.get_value('start_time').strftime(format) - def build_file_from_response(self, response, filename, data_type=None, encoding='utf-8', post_to_api=True): + def build_file_from_response(self, response, file_name=None, url=None, data=None, data_type=None, encoding='utf-8', + post_to_api=True): """ Returns an item to yield, based on the response to a request. """ - return self.build_file(response.body, filename, response.request.url, data_type, encoding, post_to_api) - - def build_file(self, data, filename, url=None, data_type=None, encoding='utf-8', post_to_api=True): + if not file_name: + file_name = response.request.meta['kf_filename'] + if not url: + url = response.request.url + if not data: + data = response.body + return self.build_file(data, file_name, url, data_type, encoding, post_to_api) + + def build_file(self, data, file_name, url=None, data_type=None, encoding='utf-8', post_to_api=True): """ Returns an item to yield. 
""" return File({ - 'file_name': filename, + 'file_name': file_name, 'data': data, 'data_type': data_type, 'url': url, diff --git a/kingfisher_scrapy/spiders/afghanistan_records.py b/kingfisher_scrapy/spiders/afghanistan_records.py index d2827f0b..c25524be 100644 --- a/kingfisher_scrapy/spiders/afghanistan_records.py +++ b/kingfisher_scrapy/spiders/afghanistan_records.py @@ -27,9 +27,8 @@ def parse_list(self, response): yield scrapy.Request( url=file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'}, - callback=self.parse_record ) @handle_error - def parse_record(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type="record") + def parse(self, response): + yield self.build_file_from_response(response, data_type='record') diff --git a/kingfisher_scrapy/spiders/afghanistan_releases.py b/kingfisher_scrapy/spiders/afghanistan_releases.py index ec197c4f..67e999d9 100644 --- a/kingfisher_scrapy/spiders/afghanistan_releases.py +++ b/kingfisher_scrapy/spiders/afghanistan_releases.py @@ -40,9 +40,8 @@ def parse_release_list(self, response): yield scrapy.Request( url=file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'}, - callback=self.parse_release ) @handle_error - def parse_release(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type="release") + def parse(self, response): + yield self.build_file_from_response(response, data_type='release') diff --git a/kingfisher_scrapy/spiders/argentina_vialidad.py b/kingfisher_scrapy/spiders/argentina_vialidad.py index 898b366d..e16a83c5 100644 --- a/kingfisher_scrapy/spiders/argentina_vialidad.py +++ b/kingfisher_scrapy/spiders/argentina_vialidad.py @@ -15,4 +15,4 @@ def start_requests(self): @handle_error def parse(self, response): - yield self.build_file_from_response(response, 'all.json', data_type='release_package_list') + yield self.build_file_from_response(response, file_name='all.json', data_type='release_package_list') diff --git a/kingfisher_scrapy/spiders/armenia.py b/kingfisher_scrapy/spiders/armenia.py index b00aacea..790a9b82 100644 --- a/kingfisher_scrapy/spiders/armenia.py +++ b/kingfisher_scrapy/spiders/armenia.py @@ -18,8 +18,7 @@ def start_requests(self): @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield self.build_file_from_response(response, data_type='release_package') json_data = json.loads(response.text) if not (self.sample): diff --git a/kingfisher_scrapy/spiders/australia_nsw.py b/kingfisher_scrapy/spiders/australia_nsw.py index 7a3d5347..495f4f27 100644 --- a/kingfisher_scrapy/spiders/australia_nsw.py +++ b/kingfisher_scrapy/spiders/australia_nsw.py @@ -73,5 +73,4 @@ def parse_list(self, response): @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/canada_buyandsell.py b/kingfisher_scrapy/spiders/canada_buyandsell.py index e63d2a93..102630fa 100644 --- a/kingfisher_scrapy/spiders/canada_buyandsell.py +++ b/kingfisher_scrapy/spiders/canada_buyandsell.py @@ -29,5 +29,4 @@ def start_requests(self): @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield 
self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/canada_montreal.py b/kingfisher_scrapy/spiders/canada_montreal.py index 5953656c..6a431ee3 100644 --- a/kingfisher_scrapy/spiders/canada_montreal.py +++ b/kingfisher_scrapy/spiders/canada_montreal.py @@ -19,11 +19,7 @@ def start_requests(self): @handle_error def parse(self, response): # Actual data - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type="release_package" - ) + yield self.build_file_from_response(response, data_type='release_package') # Load more pages? if not self.sample and response.request.meta['kf_filename'] == 'page0.json': diff --git a/kingfisher_scrapy/spiders/chile_base.py b/kingfisher_scrapy/spiders/chile_base.py index 8c242de0..abf62278 100644 --- a/kingfisher_scrapy/spiders/chile_base.py +++ b/kingfisher_scrapy/spiders/chile_base.py @@ -82,5 +82,4 @@ def base_parse(self, response, package_type): elif 'status' in data and data['status'] != 200: return [self.build_file_error_from_response(response, errors={'http_code': data['status']})] else: - return [self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='%s_package' % package_type)] + return [self.build_file_from_response(response, data_type='{}_package'.format(package_type))] diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py index 23c5ad94..e4d4b075 100644 --- a/kingfisher_scrapy/spiders/colombia.py +++ b/kingfisher_scrapy/spiders/colombia.py @@ -44,8 +44,7 @@ def parse(self, response): elif self.is_http_success(response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield self.build_file_from_response(response, data_type='release_package') if not self.sample: yield self.next_link(response) diff --git a/kingfisher_scrapy/spiders/digiwhist_base.py b/kingfisher_scrapy/spiders/digiwhist_base.py index a35d418c..73dcd77c 100644 --- a/kingfisher_scrapy/spiders/digiwhist_base.py +++ b/kingfisher_scrapy/spiders/digiwhist_base.py @@ -15,7 +15,7 @@ def start_requests(self): @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], post_to_api=False) + yield self.build_file_from_response(response, post_to_api=False) # Load a line at the time, pass it to API with tarfile.open(fileobj=BytesIO(response.body), mode="r:gz") as tar: diff --git a/kingfisher_scrapy/spiders/france.py b/kingfisher_scrapy/spiders/france.py index 203b9ba0..bb2f702c 100644 --- a/kingfisher_scrapy/spiders/france.py +++ b/kingfisher_scrapy/spiders/france.py @@ -47,8 +47,4 @@ def parse_list(self, response): @handle_error def parse(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type="release_package" - ) + yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/honduras_cost.py b/kingfisher_scrapy/spiders/honduras_cost.py index 6d3fb9fd..9e27a2d3 100644 --- a/kingfisher_scrapy/spiders/honduras_cost.py +++ b/kingfisher_scrapy/spiders/honduras_cost.py @@ -32,8 +32,4 @@ def parse(self, response): @handle_error def parse_btn(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type="record_package" - ) + yield self.build_file_from_response(response, data_type='record_package') diff --git 
a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py index e60582ba..a34b96a9 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py +++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py @@ -33,10 +33,6 @@ def parse_list(self, response): def parse(self, response): filename = urlparse(response.request.url).path.split('/')[-2] if self.is_http_success(response): - yield self.build_file_from_response( - response, - filename, - data_type='release_package' - ) + yield self.build_file_from_response(response, file_name=filename, data_type='release_package') else: yield self.build_file_error_from_response(response, file_name=filename) diff --git a/kingfisher_scrapy/spiders/honduras_portal_records.py b/kingfisher_scrapy/spiders/honduras_portal_records.py index eb174454..2ab2cc2d 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_records.py +++ b/kingfisher_scrapy/spiders/honduras_portal_records.py @@ -13,26 +13,17 @@ class HondurasPortalRecords(BaseSpider): def start_requests(self): url = 'http://www.contratacionesabiertas.gob.hn/api/v1/record/?format=json' - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) + yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) @handle_error def parse(self, response): json_data = json.loads(response.text) - yield self.build_file( - json.dumps(json_data['releasePackage']).encode(), - response.request.meta['kf_filename'], - data_type='record_package', - url=response.request.url + yield self.build_file_from_response( + response, + data=json.dumps(json_data['releasePackage']).encode(), + data_type='record_package' ) url = json_data.get('next') - if not url or self.sample: - return - else: - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) + if url and not self.sample: + yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) diff --git a/kingfisher_scrapy/spiders/honduras_portal_releases.py b/kingfisher_scrapy/spiders/honduras_portal_releases.py index 1d759c00..82635f04 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_releases.py +++ b/kingfisher_scrapy/spiders/honduras_portal_releases.py @@ -13,26 +13,17 @@ class HondurasPortalReleases(BaseSpider): def start_requests(self): url = 'http://www.contratacionesabiertas.gob.hn/api/v1/release/?format=json' - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) + yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) @handle_error def parse(self, response): json_data = json.loads(response.text) - yield self.build_file( - json.dumps(json_data['releasePackage']).encode(), - response.request.meta['kf_filename'], - data_type='release_package', - url=response.request.url + yield self.build_file_from_response( + response, + data=json.dumps(json_data['releasePackage']).encode(), + data_type='release_package' ) url = json_data.get('next') - if not url or self.sample: - return - else: - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} - ) + if url and not self.sample: + yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) diff --git a/kingfisher_scrapy/spiders/indonesia_bandung.py 
b/kingfisher_scrapy/spiders/indonesia_bandung.py index 0b6c7a68..b5af803b 100644 --- a/kingfisher_scrapy/spiders/indonesia_bandung.py +++ b/kingfisher_scrapy/spiders/indonesia_bandung.py @@ -48,8 +48,4 @@ def parse(self, response): json_data = json.loads(response.text) if len(json_data) == 0: return - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type='release' - ) + yield self.build_file_from_response(response, data_type='release') diff --git a/kingfisher_scrapy/spiders/kenya_makueni.py b/kingfisher_scrapy/spiders/kenya_makueni.py index cccda785..e8dddcb6 100644 --- a/kingfisher_scrapy/spiders/kenya_makueni.py +++ b/kingfisher_scrapy/spiders/kenya_makueni.py @@ -41,5 +41,4 @@ def parse_count(self, response): @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package_list') + yield self.build_file_from_response(response, data_type='release_package_list') diff --git a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py index 00536da3..ea75d13b 100644 --- a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py +++ b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py @@ -23,11 +23,7 @@ def parse(self, response): data = json.loads(response.text) # Actual data - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type="record_package_list_in_results" - ) + yield self.build_file_from_response(response, data_type='record_package_list_in_results') # Load more pages? if data['pagination']['page'] == 1 and not self.sample: diff --git a/kingfisher_scrapy/spiders/mexico_cdmx.py b/kingfisher_scrapy/spiders/mexico_cdmx.py index 33f73da0..bc4d35a0 100644 --- a/kingfisher_scrapy/spiders/mexico_cdmx.py +++ b/kingfisher_scrapy/spiders/mexico_cdmx.py @@ -31,5 +31,4 @@ def parse_list(self, response): @handle_error def parse_record(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py b/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py index 2d72d6aa..f08ff741 100644 --- a/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py +++ b/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py @@ -15,5 +15,4 @@ def start_requests(self): @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/mexico_inai.py b/kingfisher_scrapy/spiders/mexico_inai.py index 06160b3c..efb52d12 100644 --- a/kingfisher_scrapy/spiders/mexico_inai.py +++ b/kingfisher_scrapy/spiders/mexico_inai.py @@ -46,9 +46,4 @@ def parse_redirect(self, response): @handle_error def parse(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type="release_package", - encoding='utf-8-sig' - ) + yield self.build_file_from_response(response, data_type='release_package', encoding='utf-8-sig') diff --git a/kingfisher_scrapy/spiders/mexico_jalisco.py b/kingfisher_scrapy/spiders/mexico_jalisco.py index 15809fd5..c34d56bc 100644 --- a/kingfisher_scrapy/spiders/mexico_jalisco.py 
+++ b/kingfisher_scrapy/spiders/mexico_jalisco.py @@ -39,10 +39,8 @@ def parse_record_package(self, response): meta={'kf_filename': 'packages-%s.json' % hashlib.md5(url.encode('utf-8')).hexdigest()}, callback=self.parse_release_package ) - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='record_package') + yield self.build_file_from_response(response, data_type='record_package') @handle_error def parse_release_package(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/mexico_quien_es_quien.py b/kingfisher_scrapy/spiders/mexico_quien_es_quien.py index dbc8bb16..c999d767 100644 --- a/kingfisher_scrapy/spiders/mexico_quien_es_quien.py +++ b/kingfisher_scrapy/spiders/mexico_quien_es_quien.py @@ -45,9 +45,8 @@ def parse_count(self, response): @handle_error def parse(self, response): json_data = json.loads(response.text) - yield self.build_file( - json.dumps(json_data['data']).encode(), - response.request.meta['kf_filename'], - data_type='record_package_list', - url=response.request.url + yield self.build_file_from_response( + response, + data=json.dumps(json_data['data']).encode(), + data_type='record_package_list' ) diff --git a/kingfisher_scrapy/spiders/moldova.py b/kingfisher_scrapy/spiders/moldova.py index e87e9a88..db962e72 100644 --- a/kingfisher_scrapy/spiders/moldova.py +++ b/kingfisher_scrapy/spiders/moldova.py @@ -27,10 +27,9 @@ def start_requests(self): @handle_error def parse(self, response): if response.request.meta['data']: - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='record_package') + yield self.build_file_from_response(response, data_type='record_package') else: - self.build_file_from_response(response, response.request.meta['kf_filename']) + self.build_file_from_response(response) json_data = json.loads(response.text) offset = json_data.get('offset') # not having an offset in the data means the data has come to an end. 
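The moldova change above illustrates the convention this whole patch relies on: every request stores its file name in meta['kf_filename'], and parse() no longer repeats it, because build_file_from_response() now falls back to response.request.meta['kf_filename'], response.request.url and response.body when file_name, url or data are not given. A minimal sketch of the resulting spider shape (the spider name and URL are illustrative only, and the handle_error import path is assumed from this repository's conventions rather than shown in this patch):

    import scrapy

    from kingfisher_scrapy.base_spider import BaseSpider
    from kingfisher_scrapy.util import handle_error  # assumed import path


    class MySpider(BaseSpider):
        name = 'my_spider'

        def start_requests(self):
            # The file name travels with the request in meta['kf_filename'].
            yield scrapy.Request(
                'https://example.com/release_package.json',
                meta={'kf_filename': 'all.json'}
            )

        @handle_error
        def parse(self, response):
            # file_name defaults to response.request.meta['kf_filename'],
            # url to response.request.url, and data to response.body.
            yield self.build_file_from_response(response, data_type='release_package')
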
diff --git a/kingfisher_scrapy/spiders/moldova_old.py b/kingfisher_scrapy/spiders/moldova_old.py index 267536ed..7282e2fe 100644 --- a/kingfisher_scrapy/spiders/moldova_old.py +++ b/kingfisher_scrapy/spiders/moldova_old.py @@ -22,5 +22,4 @@ def start_requests(self): @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/nepal_dhangadhi.py b/kingfisher_scrapy/spiders/nepal_dhangadhi.py index 342e9d20..fbd40622 100644 --- a/kingfisher_scrapy/spiders/nepal_dhangadhi.py +++ b/kingfisher_scrapy/spiders/nepal_dhangadhi.py @@ -33,8 +33,4 @@ def parse_list(self, response): @handle_error def parse(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type='release_package' - ) + yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/nepal_portal.py b/kingfisher_scrapy/spiders/nepal_portal.py index 18fef550..943ad292 100644 --- a/kingfisher_scrapy/spiders/nepal_portal.py +++ b/kingfisher_scrapy/spiders/nepal_portal.py @@ -29,8 +29,4 @@ def start_requests(self): @handle_error def parse(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type='release_package' - ) + yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/nigeria_portal.py b/kingfisher_scrapy/spiders/nigeria_portal.py index 8b532503..e8c3e52e 100644 --- a/kingfisher_scrapy/spiders/nigeria_portal.py +++ b/kingfisher_scrapy/spiders/nigeria_portal.py @@ -42,8 +42,4 @@ def parse(self, response): @handle_error def parse_post(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type='release_package' - ) + yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/openopps.py b/kingfisher_scrapy/spiders/openopps.py index 79bb8bb5..1ee7dc69 100644 --- a/kingfisher_scrapy/spiders/openopps.py +++ b/kingfisher_scrapy/spiders/openopps.py @@ -165,7 +165,7 @@ def parse(self, response): if all_data: yield self.build_file( all_data, - filename=hashlib.md5(response.request.url.encode('utf-8')).hexdigest() + '.json', + file_name=hashlib.md5(response.request.url.encode('utf-8')).hexdigest() + '.json', url=response.request.url, data_type='release_package_list' ) @@ -244,4 +244,4 @@ def parse(self, response): 'first 10,000 data for: {}'.format(response.status, response.request.url)) else: yield self.build_file_error_from_response( - response, filename=hashlib.md5(response.request.url.encode('utf-8')).hexdigest()) + response, file_name=hashlib.md5(response.request.url.encode('utf-8')).hexdigest()) diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py index 69b2d4a2..1728964b 100644 --- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py +++ b/kingfisher_scrapy/spiders/paraguay_dncp_base.py @@ -139,11 +139,7 @@ def parse_pages(self, response): @handle_error def parse(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type=self.data_type - ) + yield self.build_file_from_response(response, data_type=self.data_type) def get_files_to_download(self, content): """ Override this 
diff --git a/kingfisher_scrapy/spiders/paraguay_hacienda.py b/kingfisher_scrapy/spiders/paraguay_hacienda.py index b0db40f4..af28946a 100644 --- a/kingfisher_scrapy/spiders/paraguay_hacienda.py +++ b/kingfisher_scrapy/spiders/paraguay_hacienda.py @@ -92,8 +92,7 @@ def parse(self, response): dont_filter=True ) else: - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield self.build_file_from_response(response, data_type='release_package') def request_access_token(self): """ Requests a new access token """ diff --git a/kingfisher_scrapy/spiders/scotland.py b/kingfisher_scrapy/spiders/scotland.py index 49eb7597..4ec9c978 100644 --- a/kingfisher_scrapy/spiders/scotland.py +++ b/kingfisher_scrapy/spiders/scotland.py @@ -54,5 +54,4 @@ def start_requests(self): @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/test_fail.py b/kingfisher_scrapy/spiders/test_fail.py index 94162e4d..7de61060 100644 --- a/kingfisher_scrapy/spiders/test_fail.py +++ b/kingfisher_scrapy/spiders/test_fail.py @@ -34,5 +34,4 @@ def start_requests(self): @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], - data_type='release_package') + yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/uganda_releases.py b/kingfisher_scrapy/spiders/uganda_releases.py index 73b08bd2..888afc43 100644 --- a/kingfisher_scrapy/spiders/uganda_releases.py +++ b/kingfisher_scrapy/spiders/uganda_releases.py @@ -68,8 +68,4 @@ def parse_data(self, response): @handle_error def parse(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type='release_package' - ) + yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/uk_contracts_finder.py b/kingfisher_scrapy/spiders/uk_contracts_finder.py index d1629a75..7bd194f2 100644 --- a/kingfisher_scrapy/spiders/uk_contracts_finder.py +++ b/kingfisher_scrapy/spiders/uk_contracts_finder.py @@ -20,9 +20,8 @@ def start_requests(self): def parse(self, response): yield self.build_file_from_response( response, - response.request.meta['kf_filename'], data_type='release_package_list_in_results', - encoding='ISO-8859-1' + encoding='iso-8859-1' ) if not self.sample and response.request.meta['kf_filename'] == 'page1.json': diff --git a/kingfisher_scrapy/spiders/uruguay_base.py b/kingfisher_scrapy/spiders/uruguay_base.py index f4486eeb..6b138a3a 100644 --- a/kingfisher_scrapy/spiders/uruguay_base.py +++ b/kingfisher_scrapy/spiders/uruguay_base.py @@ -31,8 +31,4 @@ def start_requests(self): @handle_error def parse(self, response): - yield self.build_file_from_response( - response, - response.request.meta['kf_filename'], - data_type=response.request.meta['data_type'] - ) + yield self.build_file_from_response(response, data_type=response.request.meta['data_type']) diff --git a/kingfisher_scrapy/spiders/uruguay_records.py b/kingfisher_scrapy/spiders/uruguay_records.py index 1941d89a..e0a671ef 100644 --- a/kingfisher_scrapy/spiders/uruguay_records.py +++ b/kingfisher_scrapy/spiders/uruguay_records.py @@ -18,12 +18,9 @@ def parse_list(self, response): root = [root[0]] for id_compra in 
root: - url = self.get_url_compra(id_compra) + url = self.base_record_url.format(id_compra.split(',')[0].replace('id_compra:', '')) yield scrapy.Request( url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json', 'data_type': 'record_package'} ) - - def get_url_compra(self, text): - return self.base_record_url.format(text.split(',')[0].replace('id_compra:', '')) From e59db78731ddff5173726007a2cffd18296a7356 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 11:43:46 -0400 Subject: [PATCH 02/19] Use keyword-only arguments for build_file and build_file_item. Update docs. --- docs/writing-spiders.rst | 2 +- kingfisher_scrapy/base_spider.py | 37 ++++++++++--------- .../spiders/dominican_republic.py | 4 +- kingfisher_scrapy/spiders/openopps.py | 2 +- tests/test_base_spider.py | 6 ++- tests/test_extensions.py | 18 +++++---- 6 files changed, 37 insertions(+), 32 deletions(-) diff --git a/docs/writing-spiders.rst b/docs/writing-spiders.rst index 5252ee0d..a41487df 100644 --- a/docs/writing-spiders.rst +++ b/docs/writing-spiders.rst @@ -66,7 +66,7 @@ Here is a sample: @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type='release_package') + yield self.build_file_from_response(response, data_type='release_package') Spider properties ----------------- diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 87f7703c..98cc704d 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -99,20 +99,19 @@ def get_start_time(self, format): """ return self.crawler.stats.get_value('start_time').strftime(format) - def build_file_from_response(self, response, file_name=None, url=None, data=None, data_type=None, encoding='utf-8', - post_to_api=True): + def build_file_from_response(self, response, **kwargs): """ Returns an item to yield, based on the response to a request. """ - if not file_name: - file_name = response.request.meta['kf_filename'] - if not url: - url = response.request.url - if not data: - data = response.body - return self.build_file(data, file_name, url, data_type, encoding, post_to_api) - - def build_file(self, data, file_name, url=None, data_type=None, encoding='utf-8', post_to_api=True): + if 'file_name' not in kwargs: + kwargs['file_name'] = response.request.meta['kf_filename'] + if 'url' not in kwargs: + kwargs['url'] = response.request.url + if 'data' not in kwargs: + kwargs['data'] = response.body + return self.build_file(**kwargs) + + def build_file(self, *, file_name=None, url=None, data=None, data_type=None, encoding='utf-8', post_to_api=True): """ Returns an item to yield. """ @@ -125,7 +124,7 @@ def build_file(self, data, file_name, url=None, data_type=None, encoding='utf-8' 'post_to_api': post_to_api, }) - def build_file_item(self, number, data, data_type, url, encoding, file_name): + def build_file_item(self, *, number=None, file_name=None, url=None, data=None, data_type=None, encoding='utf-8'): return FileItem({ 'number': number, 'file_name': file_name, @@ -165,7 +164,8 @@ def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data. 
break if isinstance(line, bytes): line = line.decode(encoding=encoding) - yield self.build_file_item(number, line, data_type, url, encoding, file_name) + yield self.build_file_item(number=number, file_name=file_name, url=url, data=line, data_type=data_type, + encoding=encoding) def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases', file_name='data.json'): @@ -179,7 +179,8 @@ def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', for number, items in enumerate(util.grouper(ijson.items(f_list, '{}.item'.format(array_field_name)), size), 1): package[array_field_name] = filter(None, items) data = json.dumps(package, default=util.default) - yield self.build_file_item(number, data, data_type, url, encoding, file_name) + yield self.build_file_item(number=number, file_name=file_name, url=url, data=data, data_type=data_type, + encoding=encoding) if self.sample: break @@ -236,7 +237,7 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8') """ if file_format: filename = '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')).hexdigest()) - self.build_file_from_response(response, filename, post_to_api=False) + self.build_file_from_response(response, file_name=filename, post_to_api=False) zip_file = ZipFile(BytesIO(response.body)) for finfo in zip_file.infolist(): @@ -254,8 +255,8 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8') yield from self.parse_json_array(package, data, data_type, response.request.url, encoding=encoding, file_name=filename) else: - yield self.build_file(data.read(), filename, data_type=data_type, url=response.request.url, - encoding=encoding) + yield self.build_file(file_name=filename, data=data.read(), url=response.request.url, + data_type=data_type, encoding=encoding) class LinksSpider(BaseSpider): @@ -286,7 +287,7 @@ def start_requests(self): @handle_error def parse(self, response): - yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type=self.data_type) + yield self.build_file_from_response(response, data_type=self.data_type) if not self.sample: yield self.next_link(response) diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py index 7962a749..e57e8de7 100644 --- a/kingfisher_scrapy/spiders/dominican_republic.py +++ b/kingfisher_scrapy/spiders/dominican_republic.py @@ -41,8 +41,8 @@ def parse(self, response): with rarfile.RarFile(file.name, charset='utf-8') as tmpfile: for f in tmpfile.infolist(): with tmpfile.open(f) as jsonFile: - yield self.build_file(jsonFile.read(), f.filename, data_type='release_package', - url=response.request.url) + yield self.build_file(file_name=f.filename, url=response.request.url, data=jsonFile.read(), + data_type='release_package') os.remove(file.name) else: filename = response.request.url.split('/')[-1] diff --git a/kingfisher_scrapy/spiders/openopps.py b/kingfisher_scrapy/spiders/openopps.py index 1ee7dc69..3179f006 100644 --- a/kingfisher_scrapy/spiders/openopps.py +++ b/kingfisher_scrapy/spiders/openopps.py @@ -164,9 +164,9 @@ def parse(self, response): if all_data: yield self.build_file( - all_data, file_name=hashlib.md5(response.request.url.encode('utf-8')).hexdigest() + '.json', url=response.request.url, + data=all_data, data_type='release_package_list' ) if self.sample: diff --git a/tests/test_base_spider.py b/tests/test_base_spider.py index e80295db..61db4438 100644 --- a/tests/test_base_spider.py 
+++ b/tests/test_base_spider.py @@ -45,7 +45,8 @@ def test_build_file_from_response(): response.request = Mock() response.request.url = 'https://example.com/remote.json' - actual = spider.build_file_from_response(response, 'file.json', data_type='release_package', encoding='iso-8859-1') + actual = spider.build_file_from_response(response, file_name='file.json', data_type='release_package', + encoding='iso-8859-1') assert actual == File({ 'file_name': 'file.json', @@ -63,7 +64,8 @@ def test_build_file(): data = b'{"key": "value"}' url = 'https://example.com/remote.json' - actual = spider.build_file(data, 'file.json', url=url, data_type='release_package', encoding='iso-8859-1') + actual = spider.build_file(file_name='file.json', url=url, data=data, data_type='release_package', + encoding='iso-8859-1') assert actual == File({ 'file_name': 'file.json', diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 33200143..cb51cdfc 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -69,7 +69,7 @@ def test_item_scraped_file(sample, is_sample, path, note, encoding, encoding2, d kwargs = {} if encoding: kwargs['encoding'] = encoding - item = spider.build_file(b'{"key": "value"}', 'file.json', url='https://example.com/remote.json', + item = spider.build_file(file_name='file.json', url='https://example.com/remote.json', data=b'{"key": "value"}', data_type='release_package', post_to_api=post_to_api, **kwargs) store_extension.item_scraped(item, spider) @@ -146,12 +146,12 @@ def test_item_scraped_file_item(sample, is_sample, note, encoding, encoding2, ok if encoding: kwargs['encoding'] = encoding item = spider.build_file_item( - 1, - b'{"key": "value"}', - data_type='release_package', + number=1, + file_name='data.json', url='https://example.com/remote.json', + data=b'{"key": "value"}', + data_type='release_package', encoding=encoding2, - file_name='data.json', ) api_extension.item_scraped(item, spider) @@ -294,7 +294,8 @@ def test_item_scraped_with_build_file_from_response(sample, path, tmpdir): response.request = Mock() response.request.url = 'https://example.com/remote.json' - item = spider.build_file_from_response(response, 'file.json', data_type='release_package', encoding='iso-8859-1') + item = spider.build_file_from_response(response, file_name='file.json', data_type='release_package', + encoding='iso-8859-1') store_extension.item_scraped(item, spider) with open(tmpdir.join(path)) as f: @@ -322,7 +323,8 @@ def test_item_scraped_with_build_file(sample, path, tmpdir): data = b'{"key": "value"}' url = 'https://example.com/remote.json' - item = spider.build_file(data, 'file.json', url=url, data_type='release_package', encoding='iso-8859-1') + item = spider.build_file(file_name='file.json', url=url, data=data, data_type='release_package', + encoding='iso-8859-1') store_extension.item_scraped(item, spider) with open(tmpdir.join(path)) as f: @@ -349,4 +351,4 @@ def test_build_file_with_existing_directory(): os.makedirs(os.path.join(files_store, 'test', '20010203_040506')) # No FileExistsError exception. - store_extension.item_scraped(spider.build_file(b'{"key": "value"}', 'file.json'), spider) + store_extension.item_scraped(spider.build_file(file_name='file.json', data=b'{"key": "value"}'), spider) From 5e9ad859090a05eb9ddbbe75e5bd1f0f9d4984f3 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 12:55:24 -0400 Subject: [PATCH 03/19] Change ZipSpider API. 
Use keyword-only arguments for parse_json_lines and parse_json_array.
Update relevant spiders.
---
 kingfisher_scrapy/base_spider.py              | 93 +++++++++----------
 .../spiders/argentina_buenos_aires.py         |  5 +-
 .../spiders/chile_compra_bulk.py              |  4 +-
 kingfisher_scrapy/spiders/colombia_bulk.py    |  6 +-
 kingfisher_scrapy/spiders/digiwhist_base.py   |  2 +-
 kingfisher_scrapy/spiders/georgia_opendata.py |  9 +-
 kingfisher_scrapy/spiders/honduras_oncae.py   | 11 +--
 kingfisher_scrapy/spiders/malta.py            |  3 +-
 kingfisher_scrapy/spiders/portugal.py         |  6 +-
 .../spiders/uruguay_historical.py             |  4 +-
 kingfisher_scrapy/spiders/zambia.py           |  3 +-
 tests/test_zip_spider.py                      | 25 +++--
 12 files changed, 86 insertions(+), 85 deletions(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index 98cc704d..a1845d6f 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -158,7 +158,7 @@ def _get_package_metadata(self, f, skip_key):
                 package.update(item)
         return package
 
-    def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data.json'):
+    def parse_json_lines(self, f, *, file_name='data.json', url=None, data_type=None, encoding='utf-8'):
         for number, line in enumerate(f, 1):
             if self.sample and number > self.MAX_SAMPLE:
                 break
@@ -167,8 +167,8 @@ def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data.json'):
             yield self.build_file_item(number=number, file_name=file_name, url=url, data=line, data_type=data_type,
                                        encoding=encoding)
 
-    def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases',
-                         file_name='data.json'):
+    def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None, data_type=None, encoding='utf-8',
+                         array_field_name='releases'):
         if self.sample:
             size = self.MAX_SAMPLE
         else:
@@ -187,12 +187,25 @@ class ZipSpider(BaseSpider):
     """
-    This class makes it easy to collect data from ZIP files:
-
-    - Inherit from ``ZipSpider``
-    - Set a ``parse_zipfile_kwargs`` class attribute to the keyword arguments for the
-      :meth:`kingfisher_scrapy.base_spider.ZipSpider.parse_zipfile` method
-    - Write a ``start_requests`` method to request the ZIP files
+    This class makes it easy to collect data from ZIP files. It assumes all files have the same format.
+
+    1. Inherit from ``ZipSpider``
+    1. Set a ``data_type`` class attribute to the data type of the compressed files
+    1. Optionally, set an ``encoding`` class attribute to the encoding of the compressed files (default UTF-8)
+    1. Optionally, set a ``zip_file_format`` class attribute to the format of the compressed files
+
+       ``json_lines``
+         Yields each line of the compressed files.
+         The ZIP file is saved to disk.
+       ``release_package``
+         Re-packages the releases in the compressed files in groups of
+         :const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE`, and yields the packages.
+         The ZIP file is saved to disk.
+       ``None``
+         Yields each compressed file.
+         Each compressed file is saved to disk.
+
+    1. Write a ``start_requests`` method to request the ZIP files
 
     ..
code-block:: python @@ -202,40 +215,18 @@ class ZipSpider(BaseSpider): class MySpider(LinksSpider): name = 'my_spider' - - parse_zipfile_kwargs = {'data_type': 'release_package'} + data_type = 'release_package' def start_requests(self): - yield scrapy.Request( - url='https://example.com/api/packages.zip', - meta={'kf_filename': 'all.json'} - ) + yield scrapy.Request('https://example.com/api/packages.zip', meta={'kf_filename': 'all.json'}) """ + + encoding = 'utf-8' + zip_file_format = None + @handle_error def parse(self, response): - yield from self.parse_zipfile(response, **self.parse_zipfile_kwargs) - - def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8'): - """ - Handles a response that is a ZIP file. - - :param response response: the response - :param str data_type: the compressed files' ``data_type`` - :param str file_format: The compressed files' format - - ``json_lines`` - Yields each line of the compressed files. - The ZIP file is saved to disk. - ``release_package`` - Re-packages the releases in the compressed files in groups of - :const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE`, and yields the packages. - The ZIP file is saved to disk. - ``None`` - Yields each compressed file. - Each compressed file is saved to disk. - :param str encoding: the compressed files' encoding - """ - if file_format: + if self.zip_file_format: filename = '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')).hexdigest()) self.build_file_from_response(response, file_name=filename, post_to_api=False) @@ -247,16 +238,20 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8') data = zip_file.open(finfo.filename) - if file_format == 'json_lines': - yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding, - file_name=filename) - elif file_format == 'release_package': + kwargs = { + 'file_name': filename, + 'url': response.request.url, + 'data_type': self.data_type, + 'encoding': self.encoding, + } + + if self.zip_file_format == 'json_lines': + yield from self.parse_json_lines(data, **kwargs) + elif self.zip_file_format == 'release_package': package = zip_file.open(finfo.filename) - yield from self.parse_json_array(package, data, data_type, response.request.url, - encoding=encoding, file_name=filename) + yield from self.parse_json_array(package, data, **kwargs) else: - yield self.build_file(file_name=filename, data=data.read(), url=response.request.url, - data_type=data_type, encoding=encoding) + yield self.build_file(data=data.read(), **kwargs) class LinksSpider(BaseSpider): @@ -264,9 +259,9 @@ class LinksSpider(BaseSpider): This class makes it easy to collect data from an API that implements the `pagination `__ pattern: - - Inherit from ``LinksSpider`` - - Set a ``data_type`` class attribute to the data type of the API responses - - Write a ``start_requests`` method to request the first page + 1. Inherit from ``LinksSpider`` + 1. Set a ``data_type`` class attribute to the data type of the API responses + 1. Write a ``start_requests`` method to request the first page of API results .. code-block:: python diff --git a/kingfisher_scrapy/spiders/argentina_buenos_aires.py b/kingfisher_scrapy/spiders/argentina_buenos_aires.py index d1847e3d..f25dc11f 100644 --- a/kingfisher_scrapy/spiders/argentina_buenos_aires.py +++ b/kingfisher_scrapy/spiders/argentina_buenos_aires.py @@ -17,11 +17,12 @@ class ArgentinaBuenosAires(ZipSpider): Downloads the zip file and sends 10 releases to kingfisher process. 
""" name = 'argentina_buenos_aires' + data_type = 'release_package' + zip_file_format = 'release_package' + # the data list service takes too long to be downloaded, so we increase the download timeout download_timeout = 1000 - parse_zipfile_kwargs = {'data_type': 'release_package', 'file_format': 'release_package'} - def start_requests(self): yield scrapy.Request( url='https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras', diff --git a/kingfisher_scrapy/spiders/chile_compra_bulk.py b/kingfisher_scrapy/spiders/chile_compra_bulk.py index 8880bf93..9b5c18b1 100644 --- a/kingfisher_scrapy/spiders/chile_compra_bulk.py +++ b/kingfisher_scrapy/spiders/chile_compra_bulk.py @@ -7,14 +7,14 @@ class ChileCompraBulk(ZipSpider): name = 'chile_compra_bulk' + data_type = 'record_package' + download_warnsize = 0 download_timeout = 99999 custom_settings = { 'DOWNLOAD_FAIL_ON_DATALOSS': False, } - parse_zipfile_kwargs = {'data_type': 'record_package'} - def start_requests(self): url = 'https://ocds.blob.core.windows.net/ocds/{}{}.zip' if self.sample: diff --git a/kingfisher_scrapy/spiders/colombia_bulk.py b/kingfisher_scrapy/spiders/colombia_bulk.py index 874b3c97..f0e571c0 100644 --- a/kingfisher_scrapy/spiders/colombia_bulk.py +++ b/kingfisher_scrapy/spiders/colombia_bulk.py @@ -15,14 +15,16 @@ class ColombiaBulk(ZipSpider): Downloads the zip file and sends 10 releases to kingfisher process. """ name = 'colombia_bulk' + data_type = 'release_in_Release' + encoding = 'iso-8859-1' + zip_file_format = 'json_lines' + download_warnsize = 0 download_timeout = 99999 custom_settings = { 'DOWNLOAD_FAIL_ON_DATALOSS': False, } - parse_zipfile_kwargs = {'data_type': 'release_in_Release', 'file_format': 'json_lines', 'encoding': 'iso-8859-1'} - def start_requests(self): yield scrapy.Request( url='https://www.colombiacompra.gov.co/transparencia/datos-json', diff --git a/kingfisher_scrapy/spiders/digiwhist_base.py b/kingfisher_scrapy/spiders/digiwhist_base.py index 73dcd77c..f6d117c0 100644 --- a/kingfisher_scrapy/spiders/digiwhist_base.py +++ b/kingfisher_scrapy/spiders/digiwhist_base.py @@ -20,4 +20,4 @@ def parse(self, response): # Load a line at the time, pass it to API with tarfile.open(fileobj=BytesIO(response.body), mode="r:gz") as tar: with tar.extractfile(tar.getnames()[0]) as readfp: - yield from self.parse_json_lines(readfp, 'release_package', self.start_urls[0]) + yield from self.parse_json_lines(readfp, url=self.start_urls[0], data_type='release_package') diff --git a/kingfisher_scrapy/spiders/georgia_opendata.py b/kingfisher_scrapy/spiders/georgia_opendata.py index c810e9d1..de800b5a 100644 --- a/kingfisher_scrapy/spiders/georgia_opendata.py +++ b/kingfisher_scrapy/spiders/georgia_opendata.py @@ -5,12 +5,11 @@ class GeorgiaOpenData(ZipSpider): name = 'georgia_opendata' - custom_settings = { - # This has to download a 400MB file so ..... - 'DOWNLOAD_TIMEOUT': 60 * 20, - } + data_type = 'release_package' + zip_file_format = 'release_package' - parse_zipfile_kwargs = {'data_type': 'release_package', 'file_format': 'release_package'} + # The file is about 450MB. 
+ download_timeout = 1200 # 20min def start_requests(self): yield scrapy.Request( diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py index cd967200..91e2347f 100644 --- a/kingfisher_scrapy/spiders/honduras_oncae.py +++ b/kingfisher_scrapy/spiders/honduras_oncae.py @@ -8,6 +8,8 @@ class HondurasONCAE(ZipSpider): name = 'honduras_oncae' + data_type = 'release_package' + # the files take too long to be downloaded, so we increase the download timeout download_timeout = 900 @@ -15,10 +17,11 @@ def start_requests(self): yield scrapy.Request( 'http://oncae.gob.hn/datosabiertos', meta={'kf_filename': 'list.html'}, + callback=self.parse_list ) @handle_error - def parse(self, response): + def parse_list(self, response): urls = response.css(".article-content ul")\ .xpath(".//a[contains(., '[json]')]/@href")\ .getall() @@ -26,8 +29,4 @@ def parse(self, response): urls = [urls[0]] for url in urls: filename = urlparse(url).path.split('/')[-1] - yield scrapy.Request(url, meta={'kf_filename': filename}, callback=self.parse_items) - - @handle_error - def parse_items(self, response): - yield from self.parse_zipfile(response, data_type='release_package') + yield scrapy.Request(url, meta={'kf_filename': filename}) diff --git a/kingfisher_scrapy/spiders/malta.py b/kingfisher_scrapy/spiders/malta.py index 887df95d..1c3dfe4b 100644 --- a/kingfisher_scrapy/spiders/malta.py +++ b/kingfisher_scrapy/spiders/malta.py @@ -10,8 +10,7 @@ class Malta(ZipSpider): name = 'malta' - - parse_zipfile_kwargs = {'data_type': 'record_package'} + data_type = 'record_package' def start_requests(self): yield scrapy.Request( diff --git a/kingfisher_scrapy/spiders/portugal.py b/kingfisher_scrapy/spiders/portugal.py index 049e4b53..1e1679c9 100644 --- a/kingfisher_scrapy/spiders/portugal.py +++ b/kingfisher_scrapy/spiders/portugal.py @@ -9,11 +9,13 @@ class Portugal(ZipSpider): name = 'portugal' + data_type = 'record_package' + encoding = 'iso-8859-1' + zip_file_format = 'json_lines' + download_warnsize = 0 download_timeout = 9999 - parse_zipfile_kwargs = {'data_type': 'record_package', 'file_format': 'json_lines', 'encoding': 'iso-8859-1'} - def start_requests(self): url = 'https://dados.gov.pt/api/1/datasets/?q=ocds&organization={}&page_size={}' id = '5ae97fa2c8d8c915d5faa3bf' diff --git a/kingfisher_scrapy/spiders/uruguay_historical.py b/kingfisher_scrapy/spiders/uruguay_historical.py index e7c0b96d..1d87df7c 100644 --- a/kingfisher_scrapy/spiders/uruguay_historical.py +++ b/kingfisher_scrapy/spiders/uruguay_historical.py @@ -5,6 +5,8 @@ class UruguayHistorical(ZipSpider): name = 'uruguay_historical' + data_type = 'release_package' + # the files takes too long to be downloaded, so we increase the download timeout download_timeout = 1000 custom_settings = { @@ -14,8 +16,6 @@ class UruguayHistorical(ZipSpider): 'Chrome/37.0.2049.0 Safari/537.36', } - parse_zipfile_kwargs = {'data_type': 'release_package'} - def start_requests(self): base_url = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites/agencia-compras-contrataciones' \ '-estado/files/2019-04/OCDS-{}.zip' diff --git a/kingfisher_scrapy/spiders/zambia.py b/kingfisher_scrapy/spiders/zambia.py index 0302d80a..b82185d3 100644 --- a/kingfisher_scrapy/spiders/zambia.py +++ b/kingfisher_scrapy/spiders/zambia.py @@ -8,8 +8,7 @@ class Zambia(ZipSpider): name = 'zambia' - - parse_zipfile_kwargs = {'data_type': 'record_package'} + data_type = 'record_package' def start_requests(self): yield scrapy.Request( diff --git 
a/tests/test_zip_spider.py b/tests/test_zip_spider.py
index 6fd2058c..2c464f82 100644
--- a/tests/test_zip_spider.py
+++ b/tests/test_zip_spider.py
@@ -9,18 +9,19 @@ from tests import response_fixture, spider_with_crawler
 
 
-def test_parse_zipfile():
+def test_parse():
     spider = spider_with_crawler(spider_class=ZipSpider)
+    spider.data_type = 'release_package'
 
     io = BytesIO()
     with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
         zipfile.writestr('test.json', '{}')
 
     response = response_fixture(body=io.getvalue())
-    generator = spider.parse_zipfile(response, 'release_package')
+    generator = spider.parse(response)
     item = next(generator)
 
-    assert isinstance(item, File)
+    assert type(item) is File
     assert item == {
         'file_name': 'test.json',
         'url': 'http://example.com',
@@ -35,8 +36,10 @@
 
 
 @pytest.mark.parametrize('sample,len_items', [(None, 20), ('true', 10)])
-def test_parse_zipfile_json_lines(sample, len_items):
+def test_parse_json_lines(sample, len_items):
    spider = spider_with_crawler(spider_class=ZipSpider, sample=sample)
+    spider.data_type = 'release_package'
+    spider.zip_file_format = 'json_lines'
 
     content = []
     for i in range(1, 21):
@@ -47,13 +50,13 @@
         zipfile.writestr('test.json', ''.join(content))
 
     response = response_fixture(body=io.getvalue())
-    generator = spider.parse_zipfile(response, 'release_package', file_format='json_lines')
+    generator = spider.parse(response)
 
     items = list(generator)
 
     assert len(items) == len_items
 
     for i, item in enumerate(items, 1):
-        assert isinstance(item, FileItem)
+        assert type(item) is FileItem
         assert item == {
             'file_name': 'test.json',
             'url': 'http://example.com',
@@ -65,8 +68,10 @@
 
 
 @pytest.mark.parametrize('sample,len_items,len_releases', [(None, 2, 100), ('true', 1, 10)])
-def test_parse_zipfile_release_package(sample, len_items, len_releases):
+def test_parse_release_package(sample, len_items, len_releases):
     spider = spider_with_crawler(spider_class=ZipSpider, sample=sample)
+    spider.data_type = 'release_package'
+    spider.zip_file_format = 'release_package'
 
     package = {'releases': []}
     for i in range(200):
@@ -77,13 +82,13 @@
         zipfile.writestr('test.json', json.dumps(package))
 
     response = response_fixture(body=io.getvalue())
-    generator = spider.parse_zipfile(response, 'release_package', file_format='release_package')
+    generator = spider.parse(response)
 
     items = list(generator)
 
     assert len(items) == len_items
 
     for i, item in enumerate(items, 1):
-        assert isinstance(item, FileItem)
+        assert type(item) is FileItem
         assert len(item) == 6
         assert item['file_name'] == 'test.json'
         assert item['url'] == 'http://example.com'

From a90415cf1fd86abd52ab265dfd6d0ea92220a67f Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Sat, 30 May 2020 12:56:25 -0400
Subject: [PATCH 04/19] Use "assert type(item) is x" instead of "assert isinstance(item, x)" to improve pytest output

---
 tests/test_links_spider.py | 8 ++++----
 tests/test_spiders.py      | 2 +-
 tests/test_validate.py     | 1 +
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/test_links_spider.py b/tests/test_links_spider.py
index 19703e74..f41a26ee 100644
--- a/tests/test_links_spider.py
+++ b/tests/test_links_spider.py
@@ -11,7 +11,7 @@ def test_next_link():
     request = spider.next_link(response_fixture())
 
-    assert
isinstance(request, Request) + assert type(request) is Request assert request.url == 'http://example.com/next' assert request.meta == {'kf_filename': '166715ca8e5f3c1531156d8772b922b7.json'} @@ -22,7 +22,7 @@ def test_parse_404(): generator = spider.parse(response_fixture(status=404)) item = next(generator) - assert isinstance(item, FileError) + assert type(item) is FileError assert item == { 'file_name': 'test', 'url': 'http://example.com', @@ -41,7 +41,7 @@ def test_parse_200(): item = next(generator) request = next(generator) - assert isinstance(item, File) + assert type(item) is File assert item == { 'file_name': 'test', 'url': 'http://example.com', @@ -51,7 +51,7 @@ def test_parse_200(): 'post_to_api': True, } - assert isinstance(request, Request) + assert type(request) is Request assert request.url == 'http://example.com/next' assert request.meta == {'kf_filename': '166715ca8e5f3c1531156d8772b922b7.json'} diff --git a/tests/test_spiders.py b/tests/test_spiders.py index 1a487976..be7a6c20 100644 --- a/tests/test_spiders.py +++ b/tests/test_spiders.py @@ -32,7 +32,7 @@ def test_start_requests_http_error(spider_name): assert len(items) == 1 for item in items: - assert isinstance(item, FileError) + assert type(item) is FileError assert len(item) == 3 assert item['errors'] == {'http_code': 555} assert item['file_name'] diff --git a/tests/test_validate.py b/tests/test_validate.py index 641c73e9..9ebac699 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -1,4 +1,5 @@ import pytest + from kingfisher_scrapy.exceptions import MissingRequiredFieldError from kingfisher_scrapy.items import File from kingfisher_scrapy.pipelines import Validate From d3810c0ca035b2336e915ec736a04ec1e0d41f01 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 12:56:38 -0400 Subject: [PATCH 05/19] Use download_timeout class attribute instead of custom_settings --- kingfisher_scrapy/spiders/dominican_republic.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py index e57e8de7..4daa6c19 100644 --- a/kingfisher_scrapy/spiders/dominican_republic.py +++ b/kingfisher_scrapy/spiders/dominican_republic.py @@ -10,9 +10,8 @@ class DominicanRepublic(BaseSpider): name = 'dominican_republic' - custom_settings = { - 'DOWNLOAD_TIMEOUT': 360 - } + + download_timeout = 360 # 6min def start_requests(self): yield scrapy.Request( From b27b86880d797031c06542f7ae9816b6ff8bbaaa Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 13:04:52 -0400 Subject: [PATCH 06/19] Remove keyword for positional "url" argument in scrapy.Request --- docs/writing-spiders.rst | 2 +- kingfisher_scrapy/base_spider.py | 5 +---- kingfisher_scrapy/spiders/afghanistan_records.py | 7 ++----- kingfisher_scrapy/spiders/afghanistan_releases.py | 9 +++------ .../spiders/argentina_buenos_aires.py | 2 +- kingfisher_scrapy/spiders/argentina_vialidad.py | 2 +- kingfisher_scrapy/spiders/armenia.py | 10 ++-------- kingfisher_scrapy/spiders/australia.py | 8 ++++---- kingfisher_scrapy/spiders/canada_buyandsell.py | 8 ++++---- kingfisher_scrapy/spiders/canada_montreal.py | 4 ++-- kingfisher_scrapy/spiders/chile_base.py | 10 +++++----- kingfisher_scrapy/spiders/colombia.py | 5 +---- kingfisher_scrapy/spiders/colombia_bulk.py | 2 +- kingfisher_scrapy/spiders/france.py | 2 +- kingfisher_scrapy/spiders/georgia_opendata.py | 5 
+---- kingfisher_scrapy/spiders/georgia_records.py | 5 +---- kingfisher_scrapy/spiders/georgia_releases.py | 5 +---- .../mexico_administracion_publica_federal.py | 4 ++-- kingfisher_scrapy/spiders/mexico_cdmx.py | 4 ++-- .../spiders/mexico_grupo_aeroporto.py | 2 +- kingfisher_scrapy/spiders/mexico_inai.py | 6 +++--- kingfisher_scrapy/spiders/mexico_jalisco.py | 6 +++--- kingfisher_scrapy/spiders/moldova.py | 6 +++--- kingfisher_scrapy/spiders/moldova_old.py | 4 ++-- kingfisher_scrapy/spiders/moldova_records.py | 2 +- kingfisher_scrapy/spiders/moldova_releases.py | 2 +- kingfisher_scrapy/spiders/openopps.py | 9 +++------ kingfisher_scrapy/spiders/paraguay_hacienda.py | 4 ++-- kingfisher_scrapy/spiders/portugal.py | 2 +- kingfisher_scrapy/spiders/scotland.py | 4 ++-- kingfisher_scrapy/spiders/test_fail.py | 14 ++++---------- kingfisher_scrapy/spiders/uk_contracts_finder.py | 10 ++-------- 32 files changed, 64 insertions(+), 106 deletions(-) diff --git a/docs/writing-spiders.rst b/docs/writing-spiders.rst index a41487df..aeb9b91c 100644 --- a/docs/writing-spiders.rst +++ b/docs/writing-spiders.rst @@ -60,7 +60,7 @@ Here is a sample: def start_requests(self): # This API only has one URL to get. Make a request for that, and set a filename yield scrapy.Request( - url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json', + 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json', meta={'kf_filename': '13-14.json'} ) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index a1845d6f..2c775c66 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -274,10 +274,7 @@ class MySpider(LinksSpider): data_type = 'release_package' def start_requests(self): - yield scrapy.Request( - url='https://example.com/api/packages.json', - meta={'kf_filename': 'page1.json'} - ) + yield scrapy.Request('https://example.com/api/packages.json', meta={'kf_filename': 'page1.json'}) """ @handle_error diff --git a/kingfisher_scrapy/spiders/afghanistan_records.py b/kingfisher_scrapy/spiders/afghanistan_records.py index c25524be..90ad82cf 100644 --- a/kingfisher_scrapy/spiders/afghanistan_records.py +++ b/kingfisher_scrapy/spiders/afghanistan_records.py @@ -12,7 +12,7 @@ class AfghanistanRecords(BaseSpider): def start_requests(self): yield scrapy.Request( - url='https://ocds.ageops.net/api/ocds/records', + 'https://ocds.ageops.net/api/ocds/records', meta={'kf_filename': 'list.json'}, callback=self.parse_list ) @@ -24,10 +24,7 @@ def parse_list(self, response): files_urls = [files_urls[0]] for file_url in files_urls: - yield scrapy.Request( - url=file_url, - meta={'kf_filename': file_url.split('/')[-1] + '.json'}, - ) + yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'}) @handle_error def parse(self, response): diff --git a/kingfisher_scrapy/spiders/afghanistan_releases.py b/kingfisher_scrapy/spiders/afghanistan_releases.py index 67e999d9..60f8b34a 100644 --- a/kingfisher_scrapy/spiders/afghanistan_releases.py +++ b/kingfisher_scrapy/spiders/afghanistan_releases.py @@ -12,7 +12,7 @@ class AfghanistanReleases(BaseSpider): def start_requests(self): yield scrapy.Request( - url='https://ocds.ageops.net/api/ocds/releases/dates', + 'https://ocds.ageops.net/api/ocds/releases/dates', meta={'kf_filename': 'list.json'}, callback=self.parse_list ) @@ -25,7 +25,7 @@ def parse_list(self, response): for file_url in files_urls: yield scrapy.Request( - url=file_url, + file_url, meta={'kf_filename': 
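Since ``url`` is the first positional parameter of ``scrapy.Request``, the keyword removed throughout this patch added noise without adding clarity; the two calls below are equivalent (sketch only):

.. code-block:: python

    import scrapy

    # Before: redundant keyword for the first positional parameter.
    scrapy.Request(url='https://example.com/api/packages.json', meta={'kf_filename': 'page1.json'})

    # After: positional, as used throughout this patch.
    scrapy.Request('https://example.com/api/packages.json', meta={'kf_filename': 'page1.json'})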
file_url.split('/')[-1] + '.json'}, callback=self.parse_release_list ) @@ -37,10 +37,7 @@ def parse_release_list(self, response): files_urls = [files_urls[0]] for file_url in files_urls: - yield scrapy.Request( - url=file_url, - meta={'kf_filename': file_url.split('/')[-1] + '.json'}, - ) + yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'}) @handle_error def parse(self, response): diff --git a/kingfisher_scrapy/spiders/argentina_buenos_aires.py b/kingfisher_scrapy/spiders/argentina_buenos_aires.py index f25dc11f..10dadbf5 100644 --- a/kingfisher_scrapy/spiders/argentina_buenos_aires.py +++ b/kingfisher_scrapy/spiders/argentina_buenos_aires.py @@ -25,7 +25,7 @@ class ArgentinaBuenosAires(ZipSpider): def start_requests(self): yield scrapy.Request( - url='https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras', + 'https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras', meta={'kf_filename': 'list.json'}, callback=self.parse_list ) diff --git a/kingfisher_scrapy/spiders/argentina_vialidad.py b/kingfisher_scrapy/spiders/argentina_vialidad.py index e16a83c5..7f199d0a 100644 --- a/kingfisher_scrapy/spiders/argentina_vialidad.py +++ b/kingfisher_scrapy/spiders/argentina_vialidad.py @@ -9,7 +9,7 @@ class ArgentinaVialidad(BaseSpider): def start_requests(self): yield scrapy.Request( - url='https://datosabiertos.vialidad.gob.ar/api/ocds/package/all', + 'https://datosabiertos.vialidad.gob.ar/api/ocds/package/all', meta={'kf_filename': 'all.json'} ) diff --git a/kingfisher_scrapy/spiders/armenia.py b/kingfisher_scrapy/spiders/armenia.py index 790a9b82..6fcad702 100644 --- a/kingfisher_scrapy/spiders/armenia.py +++ b/kingfisher_scrapy/spiders/armenia.py @@ -11,10 +11,7 @@ class Armenia(BaseSpider): name = 'armenia' def start_requests(self): - yield scrapy.Request( - url='https://armeps.am/ocds/release', - meta={'kf_filename': 'page1.json'} - ) + yield scrapy.Request('https://armeps.am/ocds/release', meta={'kf_filename': 'page1.json'}) @handle_error def parse(self, response): @@ -24,7 +21,4 @@ def parse(self, response): if not (self.sample): if 'next_page' in json_data and 'uri' in json_data['next_page']: url = json_data['next_page']['uri'] - yield scrapy.Request( - url=url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest()+'.json'} - ) + yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest()+'.json'}) diff --git a/kingfisher_scrapy/spiders/australia.py b/kingfisher_scrapy/spiders/australia.py index d17f023e..a6f1da85 100644 --- a/kingfisher_scrapy/spiders/australia.py +++ b/kingfisher_scrapy/spiders/australia.py @@ -10,17 +10,17 @@ class Australia(LinksSpider): data_type = 'release_package' def start_requests(self): + url_prefix = 'https://api.tenders.gov.au/ocds/findByDates/contractPublished/' + if self.sample: yield scrapy.Request( - url='https://api.tenders.gov.au/ocds/findByDates/contractPublished/2018-01-01T00:00:00Z/2018-12-31T23' - ':59:59Z', + url_prefix + '2018-01-01T00:00:00Z/2018-12-31T23:59:59Z', meta={'kf_filename': 'year-2018.json'} ) else: current_year = datetime.datetime.now().year + 1 for year in range(2004, current_year): yield scrapy.Request( - url='https://api.tenders.gov.au/ocds/findByDates/contractPublished/' - '{}-01-01T00:00:00Z/{}-12-31T23:59:59Z'.format(year, year), + url_prefix + '{}-01-01T00:00:00Z/{}-12-31T23:59:59Z'.format(year, year), meta={'kf_filename': 'year-{}.json'.format(year)} ) diff --git 
a/kingfisher_scrapy/spiders/canada_buyandsell.py b/kingfisher_scrapy/spiders/canada_buyandsell.py index 102630fa..cdb9ff15 100644 --- a/kingfisher_scrapy/spiders/canada_buyandsell.py +++ b/kingfisher_scrapy/spiders/canada_buyandsell.py @@ -9,21 +9,21 @@ class CanadaBuyAndSell(BaseSpider): def start_requests(self): yield scrapy.Request( - url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json', + 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json', meta={'kf_filename': '13-14.json'} ) if self.sample: return yield scrapy.Request( - url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json', + 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-14-15.json', meta={'kf_filename': '14-15.json'} ) yield scrapy.Request( - url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json', + 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-15-16.json', meta={'kf_filename': '15-16.json'} ) yield scrapy.Request( - url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-16-17.json', + 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-16-17.json', meta={'kf_filename': '16-17.json'} ) diff --git a/kingfisher_scrapy/spiders/canada_montreal.py b/kingfisher_scrapy/spiders/canada_montreal.py index 6a431ee3..126a12b8 100644 --- a/kingfisher_scrapy/spiders/canada_montreal.py +++ b/kingfisher_scrapy/spiders/canada_montreal.py @@ -12,7 +12,7 @@ class CanadaMontreal(BaseSpider): def start_requests(self): yield scrapy.Request( - url='https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d' % self.page_limit, + 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d' % self.page_limit, meta={'kf_filename': 'page0.json'} ) @@ -30,7 +30,7 @@ def parse(self, response): url = 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d&offset=%d' % \ (self.page_limit, offset) yield scrapy.Request( - url=url, + url, meta={'kf_filename': 'page' + str(offset) + '.json'} ) offset += self.page_limit diff --git a/kingfisher_scrapy/spiders/chile_base.py b/kingfisher_scrapy/spiders/chile_base.py index abf62278..aca03da6 100644 --- a/kingfisher_scrapy/spiders/chile_base.py +++ b/kingfisher_scrapy/spiders/chile_base.py @@ -28,7 +28,7 @@ def get_year_month_until(self): def start_requests(self): if self.sample: yield scrapy.Request( - url=self.base_list_url.format(2017, 10, 0, 10), + self.base_list_url.format(2017, 10, 0, 10), meta={'kf_filename': 'list-2017-10.json', 'year': 2017, 'month': 10}, ) return @@ -40,7 +40,7 @@ def start_requests(self): if (until_year - 1) == year and month > until_month: break yield scrapy.Request( - url=self.base_list_url.format(year, month, 0, self.limit), + self.base_list_url.format(year, month, 0, self.limit), meta={'kf_filename': 'list-{}-{:02d}.json'.format(year, month), 'year': year, 'month': month}, ) @@ -51,7 +51,7 @@ def base_parse(self, response, package_type): for data_item in data['data']: if package_type == 'record': yield_list.append(scrapy.Request( - url=self.record_url % data_item['ocid'].replace('ocds-70d2nz-', ''), + self.record_url % data_item['ocid'].replace('ocds-70d2nz-', ''), meta={'kf_filename': 'data-%s-%s.json' % (data_item['ocid'], package_type)} )) else: @@ -67,7 +67,7 @@ def base_parse(self, response, package_type): if 'url' in stage: name = stage.replace('url', '') yield_list.append(scrapy.Request( - url=data_item[stage], + data_item[stage], meta={'kf_filename': 'data-%s-%s.json' 
% (data_item['ocid'], name)} )) if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']: @@ -75,7 +75,7 @@ def base_parse(self, response, package_type): month = response.request.meta['month'] offset = data['pagination']['offset'] yield_list.append(scrapy.Request( - url=self.base_list_url.format(year, month, self.limit + offset, self.limit), + self.base_list_url.format(year, month, self.limit + offset, self.limit), meta={'year': year, 'month': month} )) return yield_list diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py index e4d4b075..855c95d9 100644 --- a/kingfisher_scrapy/spiders/colombia.py +++ b/kingfisher_scrapy/spiders/colombia.py @@ -21,10 +21,7 @@ def start_requests(self): start_page = 1 if hasattr(self, 'page'): start_page = int(self.page) - yield scrapy.Request( - url=base_url % start_page, - meta={'kf_filename': 'page{}.json'.format(start_page)} - ) + yield scrapy.Request(base_url % start_page, meta={'kf_filename': 'page{}.json'.format(start_page)}) def parse(self, response): # In Colombia, every day at certain hour they run a process in their system that drops the database and make diff --git a/kingfisher_scrapy/spiders/colombia_bulk.py b/kingfisher_scrapy/spiders/colombia_bulk.py index f0e571c0..2523f8bc 100644 --- a/kingfisher_scrapy/spiders/colombia_bulk.py +++ b/kingfisher_scrapy/spiders/colombia_bulk.py @@ -27,7 +27,7 @@ class ColombiaBulk(ZipSpider): def start_requests(self): yield scrapy.Request( - url='https://www.colombiacompra.gov.co/transparencia/datos-json', + 'https://www.colombiacompra.gov.co/transparencia/datos-json', meta={'kf_filename': 'list.html'}, callback=self.parse_list, ) diff --git a/kingfisher_scrapy/spiders/france.py b/kingfisher_scrapy/spiders/france.py index bb2f702c..e71c7805 100644 --- a/kingfisher_scrapy/spiders/france.py +++ b/kingfisher_scrapy/spiders/france.py @@ -12,7 +12,7 @@ class France(BaseSpider): def start_requests(self): yield scrapy.Request( - url='https://www.data.gouv.fr/api/1/datasets/?organization=534fff75a3a7292c64a77de4', + 'https://www.data.gouv.fr/api/1/datasets/?organization=534fff75a3a7292c64a77de4', meta={'kf_filename': 'list.json'}, callback=self.parse_list, ) diff --git a/kingfisher_scrapy/spiders/georgia_opendata.py b/kingfisher_scrapy/spiders/georgia_opendata.py index de800b5a..48306d10 100644 --- a/kingfisher_scrapy/spiders/georgia_opendata.py +++ b/kingfisher_scrapy/spiders/georgia_opendata.py @@ -12,7 +12,4 @@ class GeorgiaOpenData(ZipSpider): download_timeout = 1200 # 20min def start_requests(self): - yield scrapy.Request( - url='http://opendata.spa.ge/json/allTenders.zip', - meta={'kf_filename': 'all.json'} - ) + yield scrapy.Request('http://opendata.spa.ge/json/allTenders.zip', meta={'kf_filename': 'all.json'}) diff --git a/kingfisher_scrapy/spiders/georgia_records.py b/kingfisher_scrapy/spiders/georgia_records.py index fc257370..60438bbc 100644 --- a/kingfisher_scrapy/spiders/georgia_records.py +++ b/kingfisher_scrapy/spiders/georgia_records.py @@ -8,7 +8,4 @@ class GeorgiaRecords(LinksSpider): data_type = 'record_package' def start_requests(self): - yield scrapy.Request( - url='https://odapi.spa.ge/api/records.json', - meta={'kf_filename': 'page1.json'} - ) + yield scrapy.Request('https://odapi.spa.ge/api/records.json', meta={'kf_filename': 'page1.json'}) diff --git a/kingfisher_scrapy/spiders/georgia_releases.py b/kingfisher_scrapy/spiders/georgia_releases.py index fd0ed606..6cf0263d 100644 --- 
a/kingfisher_scrapy/spiders/georgia_releases.py +++ b/kingfisher_scrapy/spiders/georgia_releases.py @@ -8,7 +8,4 @@ class GeorgiaReleases(LinksSpider): data_type = 'release_package' def start_requests(self): - yield scrapy.Request( - url='https://odapi.spa.ge/api/releases.json', - meta={'kf_filename': 'page1.json'} - ) + yield scrapy.Request('https://odapi.spa.ge/api/releases.json', meta={'kf_filename': 'page1.json'}) diff --git a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py index ea75d13b..8d1b98f8 100644 --- a/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py +++ b/kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py @@ -14,7 +14,7 @@ class MexicoAdministracionPublicaFederal(BaseSpider): def start_requests(self): yield scrapy.Request( - url='https://api.datos.gob.mx/v1/contratacionesabiertas', + 'https://api.datos.gob.mx/v1/contratacionesabiertas', meta={'kf_filename': 'page1.json'} ) @@ -32,7 +32,7 @@ def parse(self, response): limit = data['pagination']['pageSize'] while ((page - 1) * limit) < total: yield scrapy.Request( - url='https://api.datos.gob.mx/v1/contratacionesabiertas?page=%d' % page, + 'https://api.datos.gob.mx/v1/contratacionesabiertas?page=%d' % page, meta={'kf_filename': 'page' + str(page) + '.json'} ) page += 1 diff --git a/kingfisher_scrapy/spiders/mexico_cdmx.py b/kingfisher_scrapy/spiders/mexico_cdmx.py index bc4d35a0..326a7d2b 100644 --- a/kingfisher_scrapy/spiders/mexico_cdmx.py +++ b/kingfisher_scrapy/spiders/mexico_cdmx.py @@ -11,7 +11,7 @@ class MexicoCDMXSource(BaseSpider): def start_requests(self): yield scrapy.Request( - url='http://www.contratosabiertos.cdmx.gob.mx/api/contratos/todos', + 'http://www.contratosabiertos.cdmx.gob.mx/api/contratos/todos', meta={'kf_filename': 'list.json'}, callback=self.parse_list ) @@ -24,7 +24,7 @@ def parse_list(self, response): for data_item in data: yield scrapy.Request( - url=data_item['uri'], + data_item['uri'], meta={'kf_filename': 'id%s.json' % data_item['id']}, callback=self.parse_record ) diff --git a/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py b/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py index f08ff741..6c68226a 100644 --- a/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py +++ b/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py @@ -9,7 +9,7 @@ class MexicoGrupoAeroporto(BaseSpider): def start_requests(self): yield scrapy.Request( - url='http://gacmda.gacm.mx:8880/files/opendata/coleccion/concentrado05032019RELEASE.json', + 'http://gacmda.gacm.mx:8880/files/opendata/coleccion/concentrado05032019RELEASE.json', meta={'kf_filename': 'concentrado05032019RELEASE.json'} ) diff --git a/kingfisher_scrapy/spiders/mexico_inai.py b/kingfisher_scrapy/spiders/mexico_inai.py index efb52d12..6289634d 100644 --- a/kingfisher_scrapy/spiders/mexico_inai.py +++ b/kingfisher_scrapy/spiders/mexico_inai.py @@ -12,7 +12,7 @@ class MexicoINAI(BaseSpider): def start_requests(self): yield scrapy.Request( - url='https://datos.gob.mx/busca/api/3/action/package_search?q=organization:inai&rows=500', + 'https://datos.gob.mx/busca/api/3/action/package_search?q=organization:inai&rows=500', meta={'kf_filename': 'list.json'}, callback=self.parse_list ) @@ -25,7 +25,7 @@ def parse_list(self, response): if resource['format'] == 'JSON': kf_filename = 'redirect-' + hashlib.md5(resource['url'].encode('utf-8')).hexdigest() + '.json' yield scrapy.Request( - url=resource['url'], + resource['url'], meta={ 'kf_filename': 
kf_filename, 'dont_redirect': True @@ -37,7 +37,7 @@ def parse_redirect(self, response): if response.status == 301: url = response.headers['Location'].decode("utf-8").replace("open?", "uc?export=download&") yield scrapy.Request( - url=url, + url, meta={'kf_filename': 'data-' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}, callback=self.parse ) diff --git a/kingfisher_scrapy/spiders/mexico_jalisco.py b/kingfisher_scrapy/spiders/mexico_jalisco.py index c34d56bc..4076a0c8 100644 --- a/kingfisher_scrapy/spiders/mexico_jalisco.py +++ b/kingfisher_scrapy/spiders/mexico_jalisco.py @@ -12,7 +12,7 @@ class MexicoJalisco(BaseSpider): def start_requests(self): yield scrapy.Request( - url='https://contratacionesabiertas.jalisco.gob.mx/OCApi/2017/contracts', + 'https://contratacionesabiertas.jalisco.gob.mx/OCApi/2017/contracts', meta={'kf_filename': 'list.json'}, callback=self.parse_list ) @@ -24,7 +24,7 @@ def parse_list(self, response): datas = [datas[0]] for data in datas: yield scrapy.Request( - url=data['URIContract'], + data['URIContract'], meta={'kf_filename': 'id%s.json' % data['ocid']}, callback=self.parse_record_package ) @@ -35,7 +35,7 @@ def parse_record_package(self, response): if 'packages' in json_data: for url in json_data['packages']: yield scrapy.Request( - url=url, + url, meta={'kf_filename': 'packages-%s.json' % hashlib.md5(url.encode('utf-8')).hexdigest()}, callback=self.parse_release_package ) diff --git a/kingfisher_scrapy/spiders/moldova.py b/kingfisher_scrapy/spiders/moldova.py index db962e72..fb146a61 100644 --- a/kingfisher_scrapy/spiders/moldova.py +++ b/kingfisher_scrapy/spiders/moldova.py @@ -20,7 +20,7 @@ class Moldova(BaseSpider): def start_requests(self): for endpoint, url in self.endpoints.items(): yield scrapy.Request( - url=url, + url, meta={'kf_filename': 'meta-{}-start.json'.format(endpoint), 'endpoint': endpoint, 'data': False} ) @@ -41,7 +41,7 @@ def parse(self, response): for data in json_data.get('data', []): yield scrapy.Request( - url=endpoint_url + data['ocid'], + endpoint_url + data['ocid'], meta={ 'kf_filename': 'data-{}-{}.json'.format(endpoint, data['ocid']), 'endpoint': endpoint, @@ -53,7 +53,7 @@ def parse(self, response): return yield scrapy.Request( - url=endpoint_url + '?offset=' + offset, + endpoint_url + '?offset=' + offset, meta={ 'kf_filename': 'meta-{}-{}.json'.format(endpoint, offset), 'endpoint': endpoint, diff --git a/kingfisher_scrapy/spiders/moldova_old.py b/kingfisher_scrapy/spiders/moldova_old.py index 7282e2fe..6b5f31f9 100644 --- a/kingfisher_scrapy/spiders/moldova_old.py +++ b/kingfisher_scrapy/spiders/moldova_old.py @@ -10,13 +10,13 @@ class MoldovaOld(BaseSpider): def start_requests(self): if self.sample: yield scrapy.Request( - url='http://opencontracting.date.gov.md/ocds-api/year/2017', + 'http://opencontracting.date.gov.md/ocds-api/year/2017', meta={'kf_filename': 'sample.json'} ) else: for year in range(2012, 2018): yield scrapy.Request( - url='http://opencontracting.date.gov.md/ocds-api/year/%d' % year, + 'http://opencontracting.date.gov.md/ocds-api/year/%d' % year, meta={'kf_filename': 'year-%d.json' % year} ) diff --git a/kingfisher_scrapy/spiders/moldova_records.py b/kingfisher_scrapy/spiders/moldova_records.py index 669ee79f..3d6bc5a1 100644 --- a/kingfisher_scrapy/spiders/moldova_records.py +++ b/kingfisher_scrapy/spiders/moldova_records.py @@ -9,6 +9,6 @@ class MoldovaRecords(LinksSpider): def start_requests(self): yield scrapy.Request( - url='http://ocds.mepps.openprocurement.io/api/records.json', + 
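The redirect handling above sets ``dont_redirect`` so that the 301 reaches ``parse_redirect``, which rewrites the Google Drive viewer link in the ``Location`` header into a direct-download link. The rewrite itself, shown with a hypothetical file id:

.. code-block:: python

    # 'open?' viewer URLs become 'uc?export=download&' direct-download URLs.
    location = 'https://drive.google.com/open?id=FILE_ID'
    download_url = location.replace('open?', 'uc?export=download&')
    assert download_url == 'https://drive.google.com/uc?export=download&id=FILE_ID'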
'http://ocds.mepps.openprocurement.io/api/records.json', meta={'kf_filename': 'page1.json'} ) diff --git a/kingfisher_scrapy/spiders/moldova_releases.py b/kingfisher_scrapy/spiders/moldova_releases.py index aff80466..81da3a56 100644 --- a/kingfisher_scrapy/spiders/moldova_releases.py +++ b/kingfisher_scrapy/spiders/moldova_releases.py @@ -9,6 +9,6 @@ class MoldovaReleases(LinksSpider): def start_requests(self): yield scrapy.Request( - url='http://ocds.mepps.openprocurement.io/api/releases.json', + 'http://ocds.mepps.openprocurement.io/api/releases.json', meta={'kf_filename': 'page1.json'} ) diff --git a/kingfisher_scrapy/spiders/openopps.py b/kingfisher_scrapy/spiders/openopps.py index 3179f006..a4e88378 100644 --- a/kingfisher_scrapy/spiders/openopps.py +++ b/kingfisher_scrapy/spiders/openopps.py @@ -66,7 +66,7 @@ def from_crawler(cls, crawler, *args, **kwargs): def start_requests(self): """ Start requesting access token """ yield scrapy.Request( - url="https://api.openopps.com/api/api-token-auth/", + 'https://api.openopps.com/api/api-token-auth/', method='POST', headers={"Accept": "*/*", "Content-Type": "application/json"}, body=json.dumps({"username": self.username, "password": self.password}), @@ -131,10 +131,7 @@ def start_requests_pages(self): def request_range(self, start_date, end_date, search_h): return scrapy.Request( - url=self.base_page_url.format( - start_date, - end_date - ), + self.base_page_url.format(start_date, end_date), headers={"Accept": "*/*", "Content-Type": "application/json"}, meta={"release_date": start_date, "search_h": search_h}, ) @@ -190,7 +187,7 @@ def parse(self, response): self.logger.info('Time_diff: {}'.format(time_diff.total_seconds())) self.reauthenticating = True yield scrapy.Request( - url="https://api.openopps.com/api/api-token-auth/", + 'https://api.openopps.com/api/api-token-auth/', method='POST', headers={"Accept": "*/*", "Content-Type": "application/json"}, body=json.dumps({"username": self.username, "password": self.password}), diff --git a/kingfisher_scrapy/spiders/paraguay_hacienda.py b/kingfisher_scrapy/spiders/paraguay_hacienda.py index af28946a..d10ebb98 100644 --- a/kingfisher_scrapy/spiders/paraguay_hacienda.py +++ b/kingfisher_scrapy/spiders/paraguay_hacienda.py @@ -63,7 +63,7 @@ def parse(self, response): total_pages = data['meta']['totalPages'] for page in range(2, total_pages+1): yield scrapy.Request( - url=self.base_list_url.format(page), + self.base_list_url.format(page), meta={ 'kf_filename': 'list-{}.json'.format(page), 'meta': True, @@ -83,7 +83,7 @@ def parse(self, response): if row['idLlamado'] and row['idLlamado'] not in self.release_ids: self.release_ids.append(row['idLlamado']) yield scrapy.Request( - url=base_url.format(row['idLlamado']), + base_url.format(row['idLlamado']), meta={ 'kf_filename': 'release-{}.json'.format(row['idLlamado']), 'meta': False, diff --git a/kingfisher_scrapy/spiders/portugal.py b/kingfisher_scrapy/spiders/portugal.py index 1e1679c9..7a3b5f5c 100644 --- a/kingfisher_scrapy/spiders/portugal.py +++ b/kingfisher_scrapy/spiders/portugal.py @@ -21,7 +21,7 @@ def start_requests(self): id = '5ae97fa2c8d8c915d5faa3bf' page_size = 20 yield scrapy.Request( - url=url.format(id, page_size), + url.format(id, page_size), meta={'kf_filename': 'list.json'}, callback=self.parse_list ) diff --git a/kingfisher_scrapy/spiders/scotland.py b/kingfisher_scrapy/spiders/scotland.py index 4ec9c978..18151046 100644 --- a/kingfisher_scrapy/spiders/scotland.py +++ b/kingfisher_scrapy/spiders/scotland.py @@ -40,7 +40,7 @@ def 
start_requests(self): if self.sample: marker = now - datetime.timedelta(days=14) for notice_type in self.notice_types: - yield scrapy.Request(url=format_string.format(marker, notice_type), + yield scrapy.Request(format_string.format(marker, notice_type), meta={'kf_filename': 'sample_{}.json'.format(notice_type)}) else: # It's meant to go back a year, but in testing it seemed to be year minus one day! @@ -48,7 +48,7 @@ def start_requests(self): while marker <= now: datestring = '{:04d}-{:02d}-{:02d}'.format(marker.year, marker.month, marker.day) for notice_type in self.notice_types: - yield scrapy.Request(url=format_string.format(datestring, notice_type), + yield scrapy.Request(format_string.format(datestring, notice_type), meta={'kf_filename': '{}_type_{}.json'.format(datestring, notice_type)}) marker = marker + datetime.timedelta(days=14) diff --git a/kingfisher_scrapy/spiders/test_fail.py b/kingfisher_scrapy/spiders/test_fail.py index 7de61060..be553c5d 100644 --- a/kingfisher_scrapy/spiders/test_fail.py +++ b/kingfisher_scrapy/spiders/test_fail.py @@ -13,24 +13,18 @@ class TestFail(BaseSpider): def start_requests(self): # Fine yield scrapy.Request( - url='https://raw.githubusercontent.com/open-contracting/sample-data/master/fictional-example/1.1/ocds-213czf-000-00001-01-planning.json', # noqa: E501 + 'https://raw.githubusercontent.com/open-contracting/sample-data/master/fictional-example/1.1/ocds-213czf-000-00001-01-planning.json', # noqa: E501 meta={'kf_filename': 'fine.json'} ) # A straight 404 yield scrapy.Request( - url='https://www.open-contracting.org/i-want-a-kitten', + 'https://www.open-contracting.org/i-want-a-kitten', meta={'kf_filename': 'http-404.json'} ) # I broke the server .... - yield scrapy.Request( - url='http://httpstat.us/500', - meta={'kf_filename': 'http-500.json'} - ) + yield scrapy.Request('http://httpstat.us/500', meta={'kf_filename': 'http-500.json'}) # .... 
but actually, yes, I also broke the Proxy too - yield scrapy.Request( - url='http://httpstat.us/502', - meta={'kf_filename': 'http-502.json'} - ) + yield scrapy.Request('http://httpstat.us/502', meta={'kf_filename': 'http-502.json'}) @handle_error def parse(self, response): diff --git a/kingfisher_scrapy/spiders/uk_contracts_finder.py b/kingfisher_scrapy/spiders/uk_contracts_finder.py index 7bd194f2..333cc89c 100644 --- a/kingfisher_scrapy/spiders/uk_contracts_finder.py +++ b/kingfisher_scrapy/spiders/uk_contracts_finder.py @@ -11,10 +11,7 @@ class UKContractsFinder(BaseSpider): base_url = 'https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?order=asc&page=%d' def start_requests(self): - yield scrapy.Request( - url=self.base_url % 1, - meta={'kf_filename': 'page1.json'} - ) + yield scrapy.Request(self.base_url % 1, meta={'kf_filename': 'page1.json'}) @handle_error def parse(self, response): @@ -28,7 +25,4 @@ def parse(self, response): json_data = json.loads(response.text) last_page = json_data['maxPage'] for page in range(1, last_page + 1): - yield scrapy.Request( - url=self.base_url % page, - meta={'kf_filename': 'page%d.json' % page} - ) + yield scrapy.Request(self.base_url % page, meta={'kf_filename': 'page%d.json' % page}) From cc37af25e40ec25975754acbbc7fefd4b04429fd Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 13:22:24 -0400 Subject: [PATCH 07/19] Move comment about 2xx codes --- kingfisher_scrapy/base_spider.py | 2 ++ kingfisher_scrapy/util.py | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 2c775c66..8e8adb26 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -91,6 +91,8 @@ def is_http_success(self, response): """ Returns whether the response status is a 2xx code. """ + # All 2xx codes are successful. + # https://tools.ietf.org/html/rfc7231#section-6.3 return 200 <= response.status < 300 def get_start_time(self, format): """
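For reference, the check this comment documents, written as a standalone sketch (not the module itself):

.. code-block:: python

    def is_http_success(status):
        # All 2xx codes are successful.
        # https://tools.ietf.org/html/rfc7231#section-6.3
        return 200 <= status < 300

    assert is_http_success(200)
    assert is_http_success(204)
    assert not is_http_success(301)
    assert not is_http_success(404)

diff --git a/kingfisher_scrapy/util.py b/kingfisher_scrapy/util.py index ef2f3a36..d425d81c 100644 --- a/kingfisher_scrapy/util.py +++ b/kingfisher_scrapy/util.py @@ -14,8 +14,6 @@ def handle_error(decorated): """ @wraps(decorated) def wrapper(self, response): - # All 2xx codes are successful. 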
- # https://tools.ietf.org/html/rfc7231#section-6.3 if self.is_http_success(response): yield from decorated(self, response) else: From 72a6566f6918a0955f7f7a9ee41f7e1751cafa5a Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 13:22:58 -0400 Subject: [PATCH 08/19] Get more spiders to use handle_error decorator --- kingfisher_scrapy/spiders/australia_nsw.py | 71 +++++++++---------- .../spiders/dominican_republic.py | 23 +++--- .../spiders/honduras_portal_bulk_files.py | 7 +- 3 files changed, 45 insertions(+), 56 deletions(-) diff --git a/kingfisher_scrapy/spiders/australia_nsw.py b/kingfisher_scrapy/spiders/australia_nsw.py index 495f4f27..2a4adc31 100644 --- a/kingfisher_scrapy/spiders/australia_nsw.py +++ b/kingfisher_scrapy/spiders/australia_nsw.py @@ -24,52 +24,47 @@ def start_requests(self): callback=self.parse_list ) + @handle_error def parse_list(self, response): - if self.is_http_success(response): + json_data = json.loads(response.text) + release_type = response.request.meta['release_type'] - json_data = json.loads(response.text) - release_type = response.request.meta['release_type'] + # More Pages? + if 'links' in json_data and isinstance(json_data['links'], dict) and 'next' in json_data['links'] \ + and not self.sample: + yield scrapy.Request( + json_data['links']['next'], + meta={ + 'kf_filename': hashlib.md5(json_data['links']['next'].encode('utf-8')).hexdigest() + '.json', + 'release_type': release_type, + }, + callback=self.parse_list + ) - # More Pages? - if 'links' in json_data and isinstance(json_data['links'], dict) and 'next' in json_data['links'] \ - and not self.sample: + # Data? + for release in json_data['releases']: + if release_type == 'planning': + uuid = release['tender']['plannedProcurementUUID'] yield scrapy.Request( - json_data['links']['next'], - meta={ - 'kf_filename': hashlib.md5(json_data['links']['next'].encode('utf-8')).hexdigest() + '.json', - 'release_type': release_type, - }, - callback=self.parse_list + 'https://tenders.nsw.gov.au/?event=public.api.planning.view&PlannedProcurementUUID=%s' % uuid, + meta={'kf_filename': 'plannning-%s.json' % uuid}, + callback=self.parse ) - - # Data? 
- for release in json_data['releases']: - if release_type == 'planning': - uuid = release['tender']['plannedProcurementUUID'] - yield scrapy.Request( - 'https://tenders.nsw.gov.au/?event=public.api.planning.view&PlannedProcurementUUID=%s' % uuid, - meta={'kf_filename': 'plannning-%s.json' % uuid}, - callback=self.parse - ) - if release_type == 'tender': - uuid = release['tender']['RFTUUID'] + if release_type == 'tender': + uuid = release['tender']['RFTUUID'] + yield scrapy.Request( + 'https://tenders.nsw.gov.au/?event=public.api.tender.view&RFTUUID=%s' % uuid, + meta={'kf_filename': 'tender-%s.json' % uuid}, + callback=self.parse + ) + if release_type == 'contract': + for award in release['awards']: + uuid = award['CNUUID'] yield scrapy.Request( - 'https://tenders.nsw.gov.au/?event=public.api.tender.view&RFTUUID=%s' % uuid, - meta={'kf_filename': 'tender-%s.json' % uuid}, + 'https://tenders.nsw.gov.au/?event=public.api.contract.view&CNUUID=%s' % uuid, + meta={'kf_filename': 'contract-%s.json' % uuid}, callback=self.parse ) - if release_type == 'contract': - for award in release['awards']: - uuid = award['CNUUID'] - yield scrapy.Request( - 'https://tenders.nsw.gov.au/?event=public.api.contract.view&CNUUID=%s' % uuid, - meta={'kf_filename': 'contract-%s.json' % uuid}, - callback=self.parse - ) - - else: - yield self.build_file_error_from_response( - response, file_name=hashlib.md5(response.request.url.encode('utf-8')).hexdigest() + '.json') @handle_error def parse(self, response): diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py index 4daa6c19..4fa46c4d 100644 --- a/kingfisher_scrapy/spiders/dominican_republic.py +++ b/kingfisher_scrapy/spiders/dominican_republic.py @@ -32,17 +32,14 @@ def parse_list(self, response): if '/JSON_DGCP_' in url: yield scrapy.Request('https:' + url, meta={'kf_filename': url.rsplit('/', 1)[-1]}) + @handle_error def parse(self, response): - if self.is_http_success(response): - file = tempfile.NamedTemporaryFile(delete=False) - file.write(response.body) - file.close() - with rarfile.RarFile(file.name, charset='utf-8') as tmpfile: - for f in tmpfile.infolist(): - with tmpfile.open(f) as jsonFile: - yield self.build_file(file_name=f.filename, url=response.request.url, data=jsonFile.read(), - data_type='release_package') - os.remove(file.name) - else: - filename = response.request.url.split('/')[-1] - yield self.build_file_error_from_response(response, file_name=filename) + file = tempfile.NamedTemporaryFile(delete=False) + file.write(response.body) + file.close() + with rarfile.RarFile(file.name, charset='utf-8') as tmpfile: + for f in tmpfile.infolist(): + with tmpfile.open(f) as jsonFile: + yield self.build_file(file_name=f.filename, url=response.request.url, data=jsonFile.read(), + data_type='release_package') + os.remove(file.name) diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py index a34b96a9..0f302fcd 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py +++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py @@ -30,9 +30,6 @@ def parse_list(self, response): url = item['urls']['json'] yield scrapy.Request(url, meta={'kf_filename': url.rsplit('/', 1)[-1]}) + @handle_error def parse(self, response): - filename = urlparse(response.request.url).path.split('/')[-2] - if self.is_http_success(response): - yield self.build_file_from_response(response, file_name=filename, data_type='release_package') - else: - 
yield self.build_file_error_from_response(response, file_name=filename) + yield self.build_file_from_response(response, data_type='release_package') From 40cdf95cea729ffaf7c2f186d3db7ed1de151998 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 13:24:46 -0400 Subject: [PATCH 09/19] openopps: Rename parse_date_list to request_range_per_day, as it doesn't parse responses. Add kf_filename. Use build_file_from_response. --- kingfisher_scrapy/spiders/openopps.py | 29 +++++++++++++-------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/kingfisher_scrapy/spiders/openopps.py b/kingfisher_scrapy/spiders/openopps.py index a4e88378..c3399af9 100644 --- a/kingfisher_scrapy/spiders/openopps.py +++ b/kingfisher_scrapy/spiders/openopps.py @@ -109,11 +109,11 @@ def start_requests_pages(self): # Case if we want to download a sample if self.sample: date = datetime(2011, 1, 1) - yield from self.parse_date_list(date, date, search_h) + yield from self.request_range_per_day(date, date, search_h) else: # Case if we have date range parameters if self.from_date and self.until_date: - yield from self.parse_date_list(self.from_date, self.until_date, search_h) + yield from self.request_range_per_day(self.from_date, self.until_date, search_h) else: # Use larger ranges for filters with less than (api_limit) search results release_date_gte_list = ['', '2009-01-01', '2010-01-01', '2010-07-01'] @@ -127,16 +127,21 @@ def start_requests_pages(self): start_date = datetime(year, 1, 1) end_date = datetime(year, datetime.now().month, datetime.now().day) \ if year == datetime.now().year else datetime(year, 12, 31) - yield from self.parse_date_list(start_date, end_date, search_h) + yield from self.request_range_per_day(start_date, end_date, search_h) def request_range(self, start_date, end_date, search_h): + url = self.base_page_url.format(start_date, end_date) return scrapy.Request( - self.base_page_url.format(start_date, end_date), - headers={"Accept": "*/*", "Content-Type": "application/json"}, - meta={"release_date": start_date, "search_h": search_h}, + url, + meta={ + 'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json', + 'release_date': start_date, + 'search_h': search_h, + }, + headers={'Accept': '*/*', 'Content-Type': 'application/json'} ) - def parse_date_list(self, start_date, end_date, search_h): + def request_range_per_day(self, start_date, end_date, search_h): date_list = [(start_date + timedelta(days=d)).strftime("%Y-%m-%d") for d in range((end_date - start_date).days + 1)] @@ -160,12 +165,7 @@ def parse(self, response): all_data.append(json_data) if all_data: - yield self.build_file( - file_name=hashlib.md5(response.request.url.encode('utf-8')).hexdigest() + '.json', - url=response.request.url, - data=all_data, - data_type='release_package_list' - ) + yield self.build_file_from_response(response, data=all_data, data_type='release_package_list') if self.sample: return @@ -240,5 +240,4 @@ def parse(self, response): self.logger.info('Status: {}. 
Results exceeded in a range of one hour, we save the ' 'first 10,000 data for: {}'.format(response.status, response.request.url)) else: - yield self.build_file_error_from_response( - response, file_name=hashlib.md5(response.request.url.encode('utf-8')).hexdigest()) + yield self.build_file_error_from_response(response) From cb27d96b0554a0cfb1c814ff6c64c419eb0af993 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 13:24:58 -0400 Subject: [PATCH 10/19] colombia: Fix code style --- kingfisher_scrapy/spiders/colombia.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py index 855c95d9..9a76d02d 100644 --- a/kingfisher_scrapy/spiders/colombia.py +++ b/kingfisher_scrapy/spiders/colombia.py @@ -34,19 +34,16 @@ def parse(self, response): url = response.request.url logging.info('Sleeping due error {} in url {}'.format(response.status, url)) time.sleep(self.sleep) - yield scrapy.Request(url, - dont_filter=True, - meta={'kf_filename': hashlib.md5( - url.encode('utf-8')).hexdigest() + '.json'}) - + yield scrapy.Request( + url, + dont_filter=True, + meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} + ) elif self.is_http_success(response): - yield self.build_file_from_response(response, data_type='release_package') - if not self.sample: yield self.next_link(response) else: - yield self.build_file_error_from_response(response) except JSONDecodeError: From a72978e400452c85dcb5e134a60368b167ca633a Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 13:25:21 -0400 Subject: [PATCH 11/19] flake8 --- kingfisher_scrapy/spiders/honduras_portal_bulk_files.py | 1 - 1 file changed, 1 deletion(-) diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py index 0f302fcd..246ad037 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py +++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py @@ -1,5 +1,4 @@ import json -from urllib.parse import urlparse import scrapy From d3c47df7f205bb2ca8d3af52736db6584b957bfb Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 13:42:17 -0400 Subject: [PATCH 12/19] Add SimpleSpider class and update relevant spiders --- docs/writing-spiders.rst | 10 +++--- kingfisher_scrapy/base_spider.py | 31 +++++++++++++++++-- .../spiders/afghanistan_records.py | 10 +++--- .../spiders/afghanistan_releases.py | 10 +++--- kingfisher_scrapy/spiders/australia_nsw.py | 18 ++++------- .../spiders/canada_buyandsell.py | 11 +++---- kingfisher_scrapy/spiders/france.py | 11 +++---- kingfisher_scrapy/spiders/honduras_cost.py | 15 ++++----- .../spiders/honduras_portal_bulk_files.py | 9 ++---- kingfisher_scrapy/spiders/kenya_makueni.py | 23 ++++++-------- kingfisher_scrapy/spiders/mexico_cdmx.py | 15 +++------ .../spiders/mexico_grupo_aeroporto.py | 9 ++---- kingfisher_scrapy/spiders/moldova_old.py | 9 ++---- kingfisher_scrapy/spiders/nepal_dhangadhi.py | 11 +++---- kingfisher_scrapy/spiders/nepal_portal.py | 9 ++---- kingfisher_scrapy/spiders/nigeria_portal.py | 16 +++++----- kingfisher_scrapy/spiders/scotland.py | 9 ++---- kingfisher_scrapy/spiders/test_fail.py | 11 +++---- kingfisher_scrapy/spiders/uganda_releases.py | 10 +++--- 19 files changed, 108 insertions(+), 139 deletions(-) diff --git 
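The ``request_range_per_day`` helper renamed above yields one request per day in a closed date interval; the core date arithmetic, extracted as a standalone sketch:

.. code-block:: python

    from datetime import datetime, timedelta

    def days_between(start_date, end_date):
        # Inclusive list of YYYY-MM-DD strings, mirroring the spider's list comprehension.
        return [(start_date + timedelta(days=d)).strftime('%Y-%m-%d')
                for d in range((end_date - start_date).days + 1)]

    assert days_between(datetime(2011, 1, 1), datetime(2011, 1, 3)) == [
        '2011-01-01', '2011-01-02', '2011-01-03',
    ]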
a/docs/writing-spiders.rst b/docs/writing-spiders.rst index aeb9b91c..24aeab18 100644 --- a/docs/writing-spiders.rst +++ b/docs/writing-spiders.rst @@ -52,10 +52,12 @@ Here is a sample: .. code-block:: python + from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error - class VerySimple(BaseSpider): - name = "very_simple" + class VerySimple(SimpleSpider): + name = 'very_simple' + data_type = 'release_package' def start_requests(self): # This API only has one URL to get. Make a request for that, and set a filename @@ -64,10 +66,6 @@ Here is a sample: meta={'kf_filename': '13-14.json'} ) - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package') - Spider properties ----------------- diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 8e8adb26..1cbc7bc2 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -187,9 +187,36 @@ def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None break +class SimpleSpider(BaseSpider): + """ + Most spiders can inherit from this class. It assumes all responses have the same data type. + + 1. Inherit from ``SimpleSpider`` + 1. Set a ``data_type`` class attribute to the data type of the responses + 1. Write a ``start_requests`` method (and any intermediate callbacks) to send requests + + .. code-block:: python + + import scrapy + + from kingfisher_scrapy.base_spider import SimpleSpider + + class MySpider(SimpleSpider): + name = 'my_spider' + data_type = 'release_package' + + def start_requests(self): + yield scrapy.Request('https://example.com/api/package.json', meta={'kf_filename': 'all.json'}) + """ + + @handle_error + def parse(self, response): + yield self.build_file_from_response(response, data_type=self.data_type) + + class ZipSpider(BaseSpider): """ - This class makes it easy to collect data from ZIP files. It assumes all files have the same format. + This class makes it easy to collect data from ZIP files. It assumes all files have the same data type. 1. Inherit from ``ZipSpider`` 1. 
Set a ``data_type`` class attribute to the data type of the compressed files @@ -215,7 +242,7 @@ class ZipSpider(BaseSpider): from kingfisher_scrapy.base_spider import ZipSpider - class MySpider(LinksSpider): + class MySpider(ZipSpider): name = 'my_spider' data_type = 'release_package' diff --git a/kingfisher_scrapy/spiders/afghanistan_records.py b/kingfisher_scrapy/spiders/afghanistan_records.py index 90ad82cf..a662dc01 100644 --- a/kingfisher_scrapy/spiders/afghanistan_records.py +++ b/kingfisher_scrapy/spiders/afghanistan_records.py @@ -2,12 +2,14 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class AfghanistanRecords(BaseSpider): +class AfghanistanRecords(SimpleSpider): name = 'afghanistan_records' + data_type = 'record' + download_delay = 1 def start_requests(self): @@ -25,7 +27,3 @@ def parse_list(self, response): for file_url in files_urls: yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'}) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='record') diff --git a/kingfisher_scrapy/spiders/afghanistan_releases.py b/kingfisher_scrapy/spiders/afghanistan_releases.py index 60f8b34a..99a49b69 100644 --- a/kingfisher_scrapy/spiders/afghanistan_releases.py +++ b/kingfisher_scrapy/spiders/afghanistan_releases.py @@ -2,12 +2,14 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class AfghanistanReleases(BaseSpider): +class AfghanistanReleases(SimpleSpider): name = 'afghanistan_releases' + data_type = 'release' + download_delay = 1.5 def start_requests(self): @@ -38,7 +40,3 @@ def parse_release_list(self, response): for file_url in files_urls: yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'}) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release') diff --git a/kingfisher_scrapy/spiders/australia_nsw.py b/kingfisher_scrapy/spiders/australia_nsw.py index 2a4adc31..b09326b4 100644 --- a/kingfisher_scrapy/spiders/australia_nsw.py +++ b/kingfisher_scrapy/spiders/australia_nsw.py @@ -3,12 +3,13 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class AustraliaNSW(BaseSpider): +class AustraliaNSW(SimpleSpider): name = 'australia_nsw' + data_type = 'release_package' def start_requests(self): release_types = ['planning', 'tender', 'contract'] @@ -47,25 +48,18 @@ def parse_list(self, response): uuid = release['tender']['plannedProcurementUUID'] yield scrapy.Request( 'https://tenders.nsw.gov.au/?event=public.api.planning.view&PlannedProcurementUUID=%s' % uuid, - meta={'kf_filename': 'plannning-%s.json' % uuid}, - callback=self.parse + meta={'kf_filename': 'plannning-%s.json' % uuid} ) if release_type == 'tender': uuid = release['tender']['RFTUUID'] yield scrapy.Request( 'https://tenders.nsw.gov.au/?event=public.api.tender.view&RFTUUID=%s' % uuid, - meta={'kf_filename': 'tender-%s.json' % uuid}, - callback=self.parse + meta={'kf_filename': 'tender-%s.json' % uuid} ) if release_type == 'contract': for award in release['awards']: uuid = award['CNUUID'] yield scrapy.Request( 'https://tenders.nsw.gov.au/?event=public.api.contract.view&CNUUID=%s' % 
uuid, - meta={'kf_filename': 'contract-%s.json' % uuid}, - callback=self.parse + meta={'kf_filename': 'contract-%s.json' % uuid} ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/canada_buyandsell.py b/kingfisher_scrapy/spiders/canada_buyandsell.py index cdb9ff15..a932135f 100644 --- a/kingfisher_scrapy/spiders/canada_buyandsell.py +++ b/kingfisher_scrapy/spiders/canada_buyandsell.py @@ -1,11 +1,12 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class CanadaBuyAndSell(BaseSpider): - name = "canada_buyandsell" +class CanadaBuyAndSell(SimpleSpider): + name = 'canada_buyandsell' + data_type = 'release_package' def start_requests(self): yield scrapy.Request( @@ -26,7 +27,3 @@ def start_requests(self): 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-16-17.json', meta={'kf_filename': '16-17.json'} ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/france.py b/kingfisher_scrapy/spiders/france.py index e71c7805..a624bffe 100644 --- a/kingfisher_scrapy/spiders/france.py +++ b/kingfisher_scrapy/spiders/france.py @@ -3,12 +3,13 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class France(BaseSpider): - name = "france" +class France(SimpleSpider): + name = 'france' + data_type = 'release_package' def start_requests(self): yield scrapy.Request( @@ -44,7 +45,3 @@ def parse_list(self, response): meta={'kf_filename': hashlib.md5(next_page.encode('utf-8')).hexdigest() + '.json'}, callback=self.parse_list ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/honduras_cost.py b/kingfisher_scrapy/spiders/honduras_cost.py index 9e27a2d3..a88e3e4b 100644 --- a/kingfisher_scrapy/spiders/honduras_cost.py +++ b/kingfisher_scrapy/spiders/honduras_cost.py @@ -2,21 +2,23 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class HondurasCoST(BaseSpider): +class HondurasCoST(SimpleSpider): name = 'honduras_cost' + data_type = 'record_package' def start_requests(self): yield scrapy.Request( 'http://app.sisocs.org/protected/ocdsShow/', meta={'kf_filename': 'list.html'}, + callback=self.parse_list ) @handle_error - def parse(self, response): + def parse_list(self, response): btns = response.css('script').xpath('text()').getall() for btn in btns: if 'download-all' and 'url:' in btn: @@ -26,10 +28,5 @@ def parse(self, response): url = url.replace('"', '').replace(',', '').lstrip('url:') yield scrapy.Request( url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}, - callback=self.parse_btn + meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} ) - - @handle_error - def parse_btn(self, response): - yield self.build_file_from_response(response, data_type='record_package') diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py index 246ad037..55195d82 100644 --- 
a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py +++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py @@ -2,12 +2,13 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class HondurasPortalBulkFiles(BaseSpider): +class HondurasPortalBulkFiles(SimpleSpider): name = 'honduras_portal_bulk_files' + data_type = 'release_package' def start_requests(self): yield scrapy.Request( @@ -28,7 +29,3 @@ def parse_list(self, response): for item in filelist: url = item['urls']['json'] yield scrapy.Request(url, meta={'kf_filename': url.rsplit('/', 1)[-1]}) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/kenya_makueni.py b/kingfisher_scrapy/spiders/kenya_makueni.py index e8dddcb6..11d4d9ac 100644 --- a/kingfisher_scrapy/spiders/kenya_makueni.py +++ b/kingfisher_scrapy/spiders/kenya_makueni.py @@ -3,22 +3,21 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class KenyaMakueni(BaseSpider): +class KenyaMakueni(SimpleSpider): name = 'kenya_makueni' + data_type = 'release_package_list' url = 'https://opencontracting.makueni.go.ke/api/ocds/package/all?pageSize={}&pageNumber={}' def start_requests(self): if self.sample: - page_number = 0 - page_size = 10 + url = self.url.format(10, 0) yield scrapy.Request( - self.url.format(page_size, page_number), - meta={'kf_filename': hashlib.md5((self.url + - str(page_number)).encode('utf-8')).hexdigest() + '.json'} + url, + meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} ) else: yield scrapy.Request( @@ -33,12 +32,8 @@ def parse_count(self, response): page_size = 300 for page_number in range((ceil(total / page_size))): + url = self.url.format(page_size, page_number) yield scrapy.Request( - self.url.format(page_size, page_number), - meta={'kf_filename': hashlib.md5((self.url + - str(page_number)).encode('utf-8')).hexdigest() + '.json'} + url, + meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package_list') diff --git a/kingfisher_scrapy/spiders/mexico_cdmx.py b/kingfisher_scrapy/spiders/mexico_cdmx.py index 326a7d2b..7a96ae64 100644 --- a/kingfisher_scrapy/spiders/mexico_cdmx.py +++ b/kingfisher_scrapy/spiders/mexico_cdmx.py @@ -2,12 +2,13 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class MexicoCDMXSource(BaseSpider): +class MexicoCDMXSource(SimpleSpider): name = 'mexico_cdmx' + data_type = 'release_package' def start_requests(self): yield scrapy.Request( @@ -23,12 +24,4 @@ def parse_list(self, response): data = [data[0]] for data_item in data: - yield scrapy.Request( - data_item['uri'], - meta={'kf_filename': 'id%s.json' % data_item['id']}, - callback=self.parse_record - ) - - @handle_error - def parse_record(self, response): - yield self.build_file_from_response(response, data_type='release_package') + yield scrapy.Request(data_item['uri'], meta={'kf_filename': 'id%s.json' % data_item['id']}) diff --git a/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py 
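The page arithmetic in ``parse_count`` above requests ``ceil(total / page_size)`` pages, numbered from zero; a worked sketch with a hypothetical total:

.. code-block:: python

    from math import ceil

    total = 650      # hypothetical count returned by the API
    page_size = 300

    pages = list(range(ceil(total / page_size)))
    assert pages == [0, 1, 2]  # 300 + 300 + 50 packages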
b/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py index 6c68226a..bb0c2701 100644 --- a/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py +++ b/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py @@ -1,18 +1,15 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class MexicoGrupoAeroporto(BaseSpider): +class MexicoGrupoAeroporto(SimpleSpider): name = 'mexico_grupo_aeroporto' + data_type = 'release_package' def start_requests(self): yield scrapy.Request( 'http://gacmda.gacm.mx:8880/files/opendata/coleccion/concentrado05032019RELEASE.json', meta={'kf_filename': 'concentrado05032019RELEASE.json'} ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/moldova_old.py b/kingfisher_scrapy/spiders/moldova_old.py index 6b5f31f9..fd01a9b6 100644 --- a/kingfisher_scrapy/spiders/moldova_old.py +++ b/kingfisher_scrapy/spiders/moldova_old.py @@ -1,11 +1,12 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class MoldovaOld(BaseSpider): +class MoldovaOld(SimpleSpider): name = 'moldova_old' + data_type = 'release_package' def start_requests(self): if self.sample: @@ -19,7 +20,3 @@ def start_requests(self): 'http://opencontracting.date.gov.md/ocds-api/year/%d' % year, meta={'kf_filename': 'year-%d.json' % year} ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/nepal_dhangadhi.py b/kingfisher_scrapy/spiders/nepal_dhangadhi.py index fbd40622..1b0f635a 100644 --- a/kingfisher_scrapy/spiders/nepal_dhangadhi.py +++ b/kingfisher_scrapy/spiders/nepal_dhangadhi.py @@ -3,12 +3,13 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class NepalDhangadhi(BaseSpider): - name = "nepal_dhangadhi" +class NepalDhangadhi(SimpleSpider): + name = 'nepal_dhangadhi' + data_type = 'release_package' def start_requests(self): yield scrapy.Request( @@ -30,7 +31,3 @@ def parse_list(self, response): ) if self.sample: break - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/nepal_portal.py b/kingfisher_scrapy/spiders/nepal_portal.py index 943ad292..b17d20f8 100644 --- a/kingfisher_scrapy/spiders/nepal_portal.py +++ b/kingfisher_scrapy/spiders/nepal_portal.py @@ -3,12 +3,13 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class NepalPortal(BaseSpider): +class NepalPortal(SimpleSpider): name = 'nepal_portal' + data_type = 'release_package' def start_requests(self): if self.sample: @@ -26,7 +27,3 @@ def start_requests(self): meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} ) current_year += 1 - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/nigeria_portal.py b/kingfisher_scrapy/spiders/nigeria_portal.py index e8c3e52e..82eac852 100644 --- 
a/kingfisher_scrapy/spiders/nigeria_portal.py +++ b/kingfisher_scrapy/spiders/nigeria_portal.py @@ -2,12 +2,14 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class NigeriaPortal(BaseSpider): +class NigeriaPortal(SimpleSpider): name = 'nigeria_portal' + data_type = 'release_package' + download_delay = 0.9 user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36' # noqa: E501 @@ -15,10 +17,11 @@ def start_requests(self): yield scrapy.Request( 'http://nocopo.bpp.gov.ng/OpenData.aspx', meta={'kf_filename': 'list.html'}, + callback=self.parse_list ) @handle_error - def parse(self, response): + def parse_list(self, response): formdata = { '__VIEWSTATE': response.css('input#__VIEWSTATE::attr(value)').extract_first(), '__VIEWSTATEGENERATOR': 'CA0B0334', @@ -36,10 +39,5 @@ def parse(self, response): yield scrapy.FormRequest.from_response( response, formdata=formdata, - meta={'kf_filename': hashlib.md5(response.url.encode('utf-8')).hexdigest() + '.json'}, - callback=self.parse_post + meta={'kf_filename': hashlib.md5(response.url.encode('utf-8')).hexdigest() + '.json'} ) - - @handle_error - def parse_post(self, response): - yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/scotland.py b/kingfisher_scrapy/spiders/scotland.py index 18151046..024dbf18 100644 --- a/kingfisher_scrapy/spiders/scotland.py +++ b/kingfisher_scrapy/spiders/scotland.py @@ -2,12 +2,13 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class Scotland(BaseSpider): +class Scotland(SimpleSpider): name = 'scotland' + data_type = 'release_package' notice_types = [ 1, # OJEU - F1 - Prior Information Notice @@ -51,7 +52,3 @@ def start_requests(self): yield scrapy.Request(format_string.format(datestring, notice_type), meta={'kf_filename': '{}_type_{}.json'.format(datestring, notice_type)}) marker = marker + datetime.timedelta(days=14) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/test_fail.py b/kingfisher_scrapy/spiders/test_fail.py index be553c5d..0074000e 100644 --- a/kingfisher_scrapy/spiders/test_fail.py +++ b/kingfisher_scrapy/spiders/test_fail.py @@ -3,12 +3,13 @@ """ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class TestFail(BaseSpider): - name = "test_fail" +class TestFail(SimpleSpider): + name = 'test_fail' + data_type = 'release_package' def start_requests(self): # Fine @@ -25,7 +26,3 @@ def start_requests(self): yield scrapy.Request('http://httpstat.us/500', meta={'kf_filename': 'http-500.json'}) # .... 
but actually, yes, I also broke the Proxy too yield scrapy.Request('http://httpstat.us/502', meta={'kf_filename': 'http-502.json'}) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/uganda_releases.py b/kingfisher_scrapy/spiders/uganda_releases.py index 888afc43..1e8f5305 100644 --- a/kingfisher_scrapy/spiders/uganda_releases.py +++ b/kingfisher_scrapy/spiders/uganda_releases.py @@ -3,12 +3,14 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class Uganda(BaseSpider): +class Uganda(SimpleSpider): name = 'uganda_releases' + data_type = 'release_package' + download_delay = 0.9 def start_requests(self): @@ -65,7 +67,3 @@ def parse_data(self, response): break if self.sample: break - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package') From 7c775fb0b4faa878f7bfbd21d1bfe8448c69ce8b Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 13:42:35 -0400 Subject: [PATCH 13/19] mexico_inai: Remove explicit default callback --- kingfisher_scrapy/spiders/mexico_inai.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kingfisher_scrapy/spiders/mexico_inai.py b/kingfisher_scrapy/spiders/mexico_inai.py index 6289634d..69592c34 100644 --- a/kingfisher_scrapy/spiders/mexico_inai.py +++ b/kingfisher_scrapy/spiders/mexico_inai.py @@ -38,8 +38,7 @@ def parse_redirect(self, response): url = response.headers['Location'].decode("utf-8").replace("open?", "uc?export=download&") yield scrapy.Request( url, - meta={'kf_filename': 'data-' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}, - callback=self.parse + meta={'kf_filename': 'data-' + hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'} ) else: yield self.build_file_error_from_response(response) From 9e57059ff085a14f1c7f4735d750deb68639f63c Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 13:57:26 -0400 Subject: [PATCH 14/19] Simplify ChileCompraBaseSpider and descendants --- kingfisher_scrapy/spiders/chile_base.py | 20 +++++++++---------- .../spiders/chile_compra_records.py | 6 +----- .../spiders/chile_compra_releases.py | 6 +----- 3 files changed, 11 insertions(+), 21 deletions(-) diff --git a/kingfisher_scrapy/spiders/chile_base.py b/kingfisher_scrapy/spiders/chile_base.py index aca03da6..3d8cdcc4 100644 --- a/kingfisher_scrapy/spiders/chile_base.py +++ b/kingfisher_scrapy/spiders/chile_base.py @@ -44,16 +44,15 @@ def start_requests(self): meta={'kf_filename': 'list-{}-{:02d}.json'.format(year, month), 'year': year, 'month': month}, ) - def base_parse(self, response, package_type): + def parse(self, response): data = json.loads(response.text) if 'data' in data: - yield_list = [] for data_item in data['data']: - if package_type == 'record': - yield_list.append(scrapy.Request( + if self.data_type == 'record_package': + yield scrapy.Request( self.record_url % data_item['ocid'].replace('ocds-70d2nz-', ''), - meta={'kf_filename': 'data-%s-%s.json' % (data_item['ocid'], package_type)} - )) + meta={'kf_filename': 'data-%s-%s.json' % (data_item['ocid'], self.data_type)} + ) else: # the data comes in this format: # "data": [ @@ -74,12 +73,11 @@ def base_parse(self, response, package_type): year = 
response.request.meta['year'] month = response.request.meta['month'] offset = data['pagination']['offset'] - yield_list.append(scrapy.Request( + yield scrapy.Request( self.base_list_url.format(year, month, self.limit + offset, self.limit), meta={'year': year, 'month': month} - )) - return yield_list + ) elif 'status' in data and data['status'] != 200: - return [self.build_file_error_from_response(response, errors={'http_code': data['status']})] + yield self.build_file_error_from_response(response, errors={'http_code': data['status']}) else: - return [self.build_file_from_response(response, data_type='{}_package'.format(package_type))] + yield self.build_file_from_response(response, data_type=self.data_type) diff --git a/kingfisher_scrapy/spiders/chile_compra_records.py b/kingfisher_scrapy/spiders/chile_compra_records.py index 2565ab84..e9e9c1ea 100644 --- a/kingfisher_scrapy/spiders/chile_compra_records.py +++ b/kingfisher_scrapy/spiders/chile_compra_records.py @@ -4,8 +4,4 @@ class ChileCompraRecords(ChileCompraBaseSpider): name = 'chile_compra_records' - - @handle_error - def parse(self, response): - for item in self.base_parse(response, 'record'): - yield item + data_type = 'record_package' diff --git a/kingfisher_scrapy/spiders/chile_compra_releases.py b/kingfisher_scrapy/spiders/chile_compra_releases.py index 93b9678d..8acf6b09 100644 --- a/kingfisher_scrapy/spiders/chile_compra_releases.py +++ b/kingfisher_scrapy/spiders/chile_compra_releases.py @@ -4,8 +4,4 @@ class ChileCompraReleases(ChileCompraBaseSpider): name = 'chile_compra_releases' - - @handle_error - def parse(self, response): - for item in self.base_parse(response, 'release'): - yield item + data_type = 'release_package' From 78449692fbcc9f3d25aa05f64641a5f86a83ea0b Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 13:58:57 -0400 Subject: [PATCH 15/19] Convert some more spiders to use SimpleSpider --- kingfisher_scrapy/base_spider.py | 5 ++++- kingfisher_scrapy/spiders/argentina_vialidad.py | 9 +++------ kingfisher_scrapy/spiders/mexico_inai.py | 10 ++++------ kingfisher_scrapy/spiders/mexico_jalisco.py | 12 ++++-------- kingfisher_scrapy/spiders/paraguay_dncp_base.py | 8 ++------ kingfisher_scrapy/spiders/uruguay_base.py | 13 +++++-------- kingfisher_scrapy/spiders/uruguay_records.py | 11 ++++------- kingfisher_scrapy/spiders/uruguay_releases.py | 7 ++----- 8 files changed, 28 insertions(+), 47 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 1cbc7bc2..7da33938 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -193,6 +193,7 @@ class SimpleSpider(BaseSpider): 1. Inherit from ``SimpleSpider`` 1. Set a ``data_type`` class attribute to the data type of the responses + 1. Optionally, set an ``encoding`` class attribute to the encoding of the responses (default UTF-8) 1. Write a ``start_requests`` method (and any intermediate callbacks) to send requests .. 
code-block:: python @@ -209,9 +210,11 @@ def start_requests(self): yield scrapy.Request('https://example.com/api/package.json', meta={'kf_filename': 'all.json'}) """ + encoding = 'utf-8' + @handle_error def parse(self, response): - yield self.build_file_from_response(response, data_type=self.data_type) + yield self.build_file_from_response(response, data_type=self.data_type, encoding=self.encoding) class ZipSpider(BaseSpider): diff --git a/kingfisher_scrapy/spiders/argentina_vialidad.py b/kingfisher_scrapy/spiders/argentina_vialidad.py index 7f199d0a..559222ac 100644 --- a/kingfisher_scrapy/spiders/argentina_vialidad.py +++ b/kingfisher_scrapy/spiders/argentina_vialidad.py @@ -1,18 +1,15 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class ArgentinaVialidad(BaseSpider): +class ArgentinaVialidad(SimpleSpider): name = 'argentina_vialidad' + data_type = 'release_package_list' def start_requests(self): yield scrapy.Request( 'https://datosabiertos.vialidad.gob.ar/api/ocds/package/all', meta={'kf_filename': 'all.json'} ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, file_name='all.json', data_type='release_package_list') diff --git a/kingfisher_scrapy/spiders/mexico_inai.py b/kingfisher_scrapy/spiders/mexico_inai.py index 69592c34..3cb7bbd5 100644 --- a/kingfisher_scrapy/spiders/mexico_inai.py +++ b/kingfisher_scrapy/spiders/mexico_inai.py @@ -3,12 +3,14 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class MexicoINAI(BaseSpider): +class MexicoINAI(SimpleSpider): name = 'mexico_inai' + data_type = 'release_package' + encoding = 'utf-8-sig' def start_requests(self): yield scrapy.Request( @@ -42,7 +44,3 @@ def parse_redirect(self, response): ) else: yield self.build_file_error_from_response(response) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package', encoding='utf-8-sig') diff --git a/kingfisher_scrapy/spiders/mexico_jalisco.py b/kingfisher_scrapy/spiders/mexico_jalisco.py index 4076a0c8..7aef33eb 100644 --- a/kingfisher_scrapy/spiders/mexico_jalisco.py +++ b/kingfisher_scrapy/spiders/mexico_jalisco.py @@ -3,12 +3,13 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class MexicoJalisco(BaseSpider): +class MexicoJalisco(SimpleSpider): name = 'mexico_jalisco' + data_type = 'release_package' def start_requests(self): yield scrapy.Request( @@ -36,11 +37,6 @@ def parse_record_package(self, response): for url in json_data['packages']: yield scrapy.Request( url, - meta={'kf_filename': 'packages-%s.json' % hashlib.md5(url.encode('utf-8')).hexdigest()}, - callback=self.parse_release_package + meta={'kf_filename': 'packages-%s.json' % hashlib.md5(url.encode('utf-8')).hexdigest()} ) yield self.build_file_from_response(response, data_type='record_package') - - @handle_error - def parse_release_package(self, response): - yield self.build_file_from_response(response, data_type='release_package') diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py index 1728964b..4c6e265f 100644 --- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py +++ 
b/kingfisher_scrapy/spiders/paraguay_dncp_base.py @@ -4,12 +4,12 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.exceptions import AuthenticationError from kingfisher_scrapy.util import handle_error -class ParaguayDNCPBaseSpider(BaseSpider): +class ParaguayDNCPBaseSpider(SimpleSpider): """ This base class contains methods used for Paraguay DNCP's authentication protocol. """ @@ -137,10 +137,6 @@ def parse_pages(self, response): callback=self.parse_pages ) - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type=self.data_type) - def get_files_to_download(self, content): """ Override this """ diff --git a/kingfisher_scrapy/spiders/uruguay_base.py b/kingfisher_scrapy/spiders/uruguay_base.py index 6b138a3a..01c24545 100644 --- a/kingfisher_scrapy/spiders/uruguay_base.py +++ b/kingfisher_scrapy/spiders/uruguay_base.py @@ -3,15 +3,16 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.util import handle_error -class UruguayBase(BaseSpider): - base_url = 'http://comprasestatales.gub.uy/ocds/rss/{year:d}/{month:02d}' +class UruguayBase(SimpleSpider): download_delay = 0.9 def start_requests(self): + base_url = 'http://comprasestatales.gub.uy/ocds/rss/{year:d}/{month:02d}' + current_date = date(2017, 11, 1) if self.sample: end_date = date(2017, 12, 1) @@ -22,13 +23,9 @@ def start_requests(self): current_date += timedelta(days=32) current_date.replace(day=1) - url = self.base_url.format(year=current_date.year, month=current_date.month) + url = base_url.format(year=current_date.year, month=current_date.month) yield scrapy.Request( url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}, callback=self.parse_list ) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type=response.request.meta['data_type']) diff --git a/kingfisher_scrapy/spiders/uruguay_records.py b/kingfisher_scrapy/spiders/uruguay_records.py index e0a671ef..1e914bd1 100644 --- a/kingfisher_scrapy/spiders/uruguay_records.py +++ b/kingfisher_scrapy/spiders/uruguay_records.py @@ -8,19 +8,16 @@ class UruguayRecords(UruguayBase): name = 'uruguay_records' - base_record_url = 'https://www.comprasestatales.gub.uy/ocds/record/{}' + data_type = 'record_package' @handle_error def parse_list(self, response): + base_record_url = 'https://www.comprasestatales.gub.uy/ocds/record/{}' root = response.xpath('//item/title/text()').getall() if self.sample: root = [root[0]] for id_compra in root: - url = self.base_record_url.format(id_compra.split(',')[0].replace('id_compra:', '')) - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json', - 'data_type': 'record_package'} - ) + url = base_record_url.format(id_compra.split(',')[0].replace('id_compra:', '')) + yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) diff --git a/kingfisher_scrapy/spiders/uruguay_releases.py b/kingfisher_scrapy/spiders/uruguay_releases.py index 81e6b4f2..832cc027 100644 --- a/kingfisher_scrapy/spiders/uruguay_releases.py +++ b/kingfisher_scrapy/spiders/uruguay_releases.py @@ -8,6 +8,7 @@ class UruguayReleases(UruguayBase): name = 'uruguay_releases' + data_type = 'release_package' @handle_error def parse_list(self, response): @@ -17,8 +18,4 @@ def parse_list(self, 
response): root = [root[0]] for url in root: - yield scrapy.Request( - url, - meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json', - 'data_type': 'release_package'} - ) + yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) From a71ee1a6f9d893de7a396d75db9c6dcdfddfa579 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 13:59:11 -0400 Subject: [PATCH 16/19] canada_montreal: Fix code style --- kingfisher_scrapy/spiders/canada_montreal.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kingfisher_scrapy/spiders/canada_montreal.py b/kingfisher_scrapy/spiders/canada_montreal.py index 126a12b8..64366213 100644 --- a/kingfisher_scrapy/spiders/canada_montreal.py +++ b/kingfisher_scrapy/spiders/canada_montreal.py @@ -29,8 +29,5 @@ def parse(self, response): while offset < total: url = 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit=%d&offset=%d' % \ (self.page_limit, offset) - yield scrapy.Request( - url, - meta={'kf_filename': 'page' + str(offset) + '.json'} - ) + yield scrapy.Request(url, meta={'kf_filename': 'page' + str(offset) + '.json'}) offset += self.page_limit From 025344f0daf5d34c4d0c91bdaf8d76eee031e5cc Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 14:23:39 -0400 Subject: [PATCH 17/19] chile_base: Fix "Simplify ChileCompraBaseSpider and descendants" 9e57059ff085a14f1c7f4735d750deb68639f63c --- kingfisher_scrapy/spiders/chile_base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kingfisher_scrapy/spiders/chile_base.py b/kingfisher_scrapy/spiders/chile_base.py index 3d8cdcc4..c0f851f6 100644 --- a/kingfisher_scrapy/spiders/chile_base.py +++ b/kingfisher_scrapy/spiders/chile_base.py @@ -4,6 +4,7 @@ import scrapy from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.util import handle_error class ChileCompraBaseSpider(BaseSpider): @@ -44,6 +45,7 @@ def start_requests(self): meta={'kf_filename': 'list-{}-{:02d}.json'.format(year, month), 'year': year, 'month': month}, ) + @handle_error def parse(self, response): data = json.loads(response.text) if 'data' in data: @@ -65,10 +67,10 @@ def parse(self, response): for stage in list(data_item.keys()): if 'url' in stage: name = stage.replace('url', '') - yield_list.append(scrapy.Request( + yield scrapy.Request( data_item[stage], meta={'kf_filename': 'data-%s-%s.json' % (data_item['ocid'], name)} - )) + ) if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']: year = response.request.meta['year'] month = response.request.meta['month'] From 6b67a3113f2f18a8a9777082fc2b7e3c2ac174b1 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 14:26:08 -0400 Subject: [PATCH 18/19] Add data_pointer to SimpleSpider. Add next_pointer to LinksSpider. Update relevant spiders. 
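Both pointers are resolved with jsonpointer's resolve_pointer(). For
instance, honduras_portal_releases sets data_pointer to '/releasePackage'
and next_pointer to '/next'. A rough sketch of the lookup behaviour, using
an illustrative response body rather than a real API response:

    import json

    from jsonpointer import resolve_pointer

    data = json.loads('{"releasePackage": {"releases": []}, '
                      '"next": "https://example.com/api/release/?page=2"}')

    # data_pointer selects the OCDS data to store from the response.
    package = resolve_pointer(data, '/releasePackage')  # {'releases': []}

    # next_pointer selects the next page's URL. Passing None as a default
    # suppresses the exception when the pointer doesn't resolve, which is
    # how next_link() detects the last page.
    next_url = resolve_pointer(data, '/next', None)       # the next page's URL
    missing = resolve_pointer(data, '/links/next', None)  # None: no such key

A subclass can then change either pointer with a one-line attribute, instead
of overriding parse() or next_link().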
--- kingfisher_scrapy/base_spider.py | 23 +++++++++++++------ kingfisher_scrapy/spiders/armenia.py | 16 ++++--------- .../spiders/honduras_portal_records.py | 21 +++++------------ .../spiders/honduras_portal_releases.py | 21 +++++------------ requirements.in | 1 + requirements.txt | 6 ++--- requirements_dev.txt | 3 ++- 7 files changed, 37 insertions(+), 54 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 7da33938..78163e16 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -6,6 +6,7 @@ import ijson import scrapy +from jsonpointer import resolve_pointer from kingfisher_scrapy import util from kingfisher_scrapy.exceptions import SpiderArgumentError @@ -194,6 +195,7 @@ class SimpleSpider(BaseSpider): 1. Inherit from ``SimpleSpider`` 1. Set a ``data_type`` class attribute to the data type of the responses 1. Optionally, set an ``encoding`` class attribute to the encoding of the responses (default UTF-8) + 1. Optionally, set a ``data_pointer`` class attribute to the JSON Pointer for OCDS data (default "") 1. Write a ``start_requests`` method (and any intermediate callbacks) to send requests .. code-block:: python @@ -211,10 +213,15 @@ def start_requests(self): """ encoding = 'utf-8' + data_pointer = '' @handle_error def parse(self, response): - yield self.build_file_from_response(response, data_type=self.data_type, encoding=self.encoding) + kwargs = {} + if self.data_pointer: + kwargs['data'] = json.dumps(resolve_pointer(json.loads(response.text), self.data_pointer)).encode() + + yield self.build_file_from_response(response, data_type=self.data_type, encoding=self.encoding, **kwargs) class ZipSpider(BaseSpider): @@ -286,13 +293,14 @@ def parse(self, response): yield self.build_file(data=data.read(), **kwargs) -class LinksSpider(BaseSpider): +class LinksSpider(SimpleSpider): """ This class makes it easy to collect data from an API that implements the `pagination `__ pattern: 1. Inherit from ``LinksSpider`` 1. Set a ``data_type`` class attribute to the data type of the API responses + 1. Optionally, set a ``next_pointer`` class attribute to the JSON Pointer for the next link (default "/links/next") 1. Write a ``start_requests`` method to request the first page of API results .. code-block:: python @@ -309,19 +317,20 @@ def start_requests(self): yield scrapy.Request('https://example.com/api/packages.json', meta={'kf_filename': 'page1.json'}) """ + next_pointer = '/links/next' + @handle_error def parse(self, response): - yield self.build_file_from_response(response, data_type=self.data_type) + yield from super().parse(response) if not self.sample: yield self.next_link(response) - @staticmethod - def next_link(response): + def next_link(self, response): """ If the JSON response has a ``links.next`` key, returns a ``scrapy.Request`` for the URL. 
""" data = json.loads(response.text) - if 'links' in data and 'next' in data['links']: - url = data['links']['next'] + url = resolve_pointer(data, self.next_pointer, None) + if url: return scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) diff --git a/kingfisher_scrapy/spiders/armenia.py b/kingfisher_scrapy/spiders/armenia.py index 6fcad702..bb52cf2b 100644 --- a/kingfisher_scrapy/spiders/armenia.py +++ b/kingfisher_scrapy/spiders/armenia.py @@ -3,22 +3,14 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import LinksSpider from kingfisher_scrapy.util import handle_error -class Armenia(BaseSpider): +class Armenia(LinksSpider): name = 'armenia' + data_type = 'release_package' + next_pointer = '/next_page/uri' def start_requests(self): yield scrapy.Request('https://armeps.am/ocds/release', meta={'kf_filename': 'page1.json'}) - - @handle_error - def parse(self, response): - yield self.build_file_from_response(response, data_type='release_package') - - json_data = json.loads(response.text) - if not (self.sample): - if 'next_page' in json_data and 'uri' in json_data['next_page']: - url = json_data['next_page']['uri'] - yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest()+'.json'}) diff --git a/kingfisher_scrapy/spiders/honduras_portal_records.py b/kingfisher_scrapy/spiders/honduras_portal_records.py index 2ab2cc2d..2a7d2e8d 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_records.py +++ b/kingfisher_scrapy/spiders/honduras_portal_records.py @@ -3,27 +3,18 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import LinksSpider from kingfisher_scrapy.util import handle_error -class HondurasPortalRecords(BaseSpider): +class HondurasPortalRecords(LinksSpider): name = 'honduras_portal_records' + data_type = 'record_package' + data_pointer = '/recordPackage' + next_pointer = '/next' + download_delay = 0.9 def start_requests(self): url = 'http://www.contratacionesabiertas.gob.hn/api/v1/record/?format=json' yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) - - @handle_error - def parse(self, response): - json_data = json.loads(response.text) - yield self.build_file_from_response( - response, - data=json.dumps(json_data['releasePackage']).encode(), - data_type='record_package' - ) - - url = json_data.get('next') - if url and not self.sample: - yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) diff --git a/kingfisher_scrapy/spiders/honduras_portal_releases.py b/kingfisher_scrapy/spiders/honduras_portal_releases.py index 82635f04..ce409192 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_releases.py +++ b/kingfisher_scrapy/spiders/honduras_portal_releases.py @@ -3,27 +3,18 @@ import scrapy -from kingfisher_scrapy.base_spider import BaseSpider +from kingfisher_scrapy.base_spider import LinksSpider from kingfisher_scrapy.util import handle_error -class HondurasPortalReleases(BaseSpider): +class HondurasPortalReleases(LinksSpider): name = 'honduras_portal_releases' + data_type = 'release_package' + data_pointer = '/releasePackage' + next_pointer = '/next' + download_delay = 0.9 def start_requests(self): url = 'http://www.contratacionesabiertas.gob.hn/api/v1/release/?format=json' yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) - - 
@handle_error - def parse(self, response): - json_data = json.loads(response.text) - yield self.build_file_from_response( - response, - data=json.dumps(json_data['releasePackage']).encode(), - data_type='release_package' - ) - - url = json_data.get('next') - if url and not self.sample: - yield scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'}) diff --git a/requirements.in b/requirements.in index 34c50ac3..df5a116e 100644 --- a/requirements.in +++ b/requirements.in @@ -1,6 +1,7 @@ # Any change to this file MUST be replicated in: # https://github.com/open-contracting/deploy/blob/master/salt/ocdskingfishercollect/scrapyd-requirements.txt +jsonpointer rarfile requests Scrapy diff --git a/requirements.txt b/requirements.txt index 16b7f6ad..c126ac98 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,6 @@ automat==0.8.0 # via twisted certifi==2019.11.28 # via requests cffi==1.13.2 # via cryptography chardet==3.0.4 # via requests -click==7.1.2 # via pip-tools constantly==15.1.0 # via twisted cryptography==2.8 # via pyopenssl, scrapy, service-identity cssselect==1.1.0 # via parsel, scrapy @@ -17,9 +16,9 @@ hyperlink==19.0.0 # via twisted idna==2.8 # via hyperlink, requests ijson==3.0.3 incremental==17.5.0 # via twisted +jsonpointer==2.0 lxml==4.4.2 # via parsel, scrapy parsel==1.5.2 # via scrapy -pip-tools==5.1.0 protego==0.1.16 # via scrapy pyasn1-modules==0.2.7 # via service-identity pyasn1==0.4.8 # via pyasn1-modules, service-identity @@ -33,12 +32,11 @@ requests==2.22.0 scrapy==1.8.0 scrapyd-client==1.1.0 service-identity==18.1.0 # via scrapy -six==1.13.0 # via automat, cryptography, parsel, pip-tools, protego, pyhamcrest, pyopenssl, scrapy, scrapyd-client, w3lib +six==1.13.0 # via automat, cryptography, parsel, protego, pyhamcrest, pyopenssl, scrapy, scrapyd-client, w3lib twisted==20.3.0 # via scrapy urllib3==1.25.7 # via requests w3lib==1.21.0 # via parsel, scrapy zope.interface==4.7.1 # via scrapy, twisted # The following packages are considered to be unsafe in a requirements file: -# pip # setuptools diff --git a/requirements_dev.txt b/requirements_dev.txt index 1be8d3f0..9ef3704a 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -9,7 +9,7 @@ automat==0.8.0 certifi==2019.11.28 cffi==1.13.2 chardet==3.0.4 -click==7.1.2 +click==7.1.2 # via pip-tools constantly==15.1.0 coverage==5.0.3 # via coveralls, pytest-cov coveralls==2.0.0 @@ -24,6 +24,7 @@ ijson==3.0.3 importlib-metadata==1.3.0 # via pluggy, pytest incremental==17.5.0 isort==4.3.21 +jsonpointer==2.0 lxml==4.4.2 mccabe==0.6.1 # via flake8 more-itertools==8.0.2 # via pytest, zipp From 415f4515be9f4cfb9eedff756ad9d79e05c7a10b Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Sat, 30 May 2020 14:28:10 -0400 Subject: [PATCH 19/19] flake8 --- kingfisher_scrapy/spiders/argentina_vialidad.py | 1 - kingfisher_scrapy/spiders/armenia.py | 4 ---- kingfisher_scrapy/spiders/canada_buyandsell.py | 1 - kingfisher_scrapy/spiders/chile_compra_records.py | 1 - kingfisher_scrapy/spiders/chile_compra_releases.py | 1 - kingfisher_scrapy/spiders/honduras_portal_records.py | 2 -- kingfisher_scrapy/spiders/honduras_portal_releases.py | 2 -- kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py | 1 - kingfisher_scrapy/spiders/moldova_old.py | 1 - kingfisher_scrapy/spiders/nepal_portal.py | 1 - kingfisher_scrapy/spiders/scotland.py | 1 - kingfisher_scrapy/spiders/test_fail.py | 1 - kingfisher_scrapy/spiders/uruguay_base.py | 1 - 13 files changed, 
18 deletions(-) diff --git a/kingfisher_scrapy/spiders/argentina_vialidad.py b/kingfisher_scrapy/spiders/argentina_vialidad.py index 559222ac..1c44cfc8 100644 --- a/kingfisher_scrapy/spiders/argentina_vialidad.py +++ b/kingfisher_scrapy/spiders/argentina_vialidad.py @@ -1,7 +1,6 @@ import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error class ArgentinaVialidad(SimpleSpider): diff --git a/kingfisher_scrapy/spiders/armenia.py b/kingfisher_scrapy/spiders/armenia.py index bb52cf2b..67c42a6e 100644 --- a/kingfisher_scrapy/spiders/armenia.py +++ b/kingfisher_scrapy/spiders/armenia.py @@ -1,10 +1,6 @@ -import hashlib -import json - import scrapy from kingfisher_scrapy.base_spider import LinksSpider -from kingfisher_scrapy.util import handle_error class Armenia(LinksSpider): diff --git a/kingfisher_scrapy/spiders/canada_buyandsell.py b/kingfisher_scrapy/spiders/canada_buyandsell.py index a932135f..e9d311e2 100644 --- a/kingfisher_scrapy/spiders/canada_buyandsell.py +++ b/kingfisher_scrapy/spiders/canada_buyandsell.py @@ -1,7 +1,6 @@ import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error class CanadaBuyAndSell(SimpleSpider): diff --git a/kingfisher_scrapy/spiders/chile_compra_records.py b/kingfisher_scrapy/spiders/chile_compra_records.py index e9e9c1ea..f7320964 100644 --- a/kingfisher_scrapy/spiders/chile_compra_records.py +++ b/kingfisher_scrapy/spiders/chile_compra_records.py @@ -1,5 +1,4 @@ from kingfisher_scrapy.spiders.chile_base import ChileCompraBaseSpider -from kingfisher_scrapy.util import handle_error class ChileCompraRecords(ChileCompraBaseSpider): diff --git a/kingfisher_scrapy/spiders/chile_compra_releases.py b/kingfisher_scrapy/spiders/chile_compra_releases.py index 8acf6b09..e1082f83 100644 --- a/kingfisher_scrapy/spiders/chile_compra_releases.py +++ b/kingfisher_scrapy/spiders/chile_compra_releases.py @@ -1,5 +1,4 @@ from kingfisher_scrapy.spiders.chile_base import ChileCompraBaseSpider -from kingfisher_scrapy.util import handle_error class ChileCompraReleases(ChileCompraBaseSpider): diff --git a/kingfisher_scrapy/spiders/honduras_portal_records.py b/kingfisher_scrapy/spiders/honduras_portal_records.py index 2a7d2e8d..1d3fc5de 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_records.py +++ b/kingfisher_scrapy/spiders/honduras_portal_records.py @@ -1,10 +1,8 @@ import hashlib -import json import scrapy from kingfisher_scrapy.base_spider import LinksSpider -from kingfisher_scrapy.util import handle_error class HondurasPortalRecords(LinksSpider): diff --git a/kingfisher_scrapy/spiders/honduras_portal_releases.py b/kingfisher_scrapy/spiders/honduras_portal_releases.py index ce409192..a676383b 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_releases.py +++ b/kingfisher_scrapy/spiders/honduras_portal_releases.py @@ -1,10 +1,8 @@ import hashlib -import json import scrapy from kingfisher_scrapy.base_spider import LinksSpider -from kingfisher_scrapy.util import handle_error class HondurasPortalReleases(LinksSpider): diff --git a/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py b/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py index bb0c2701..08288d21 100644 --- a/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py +++ b/kingfisher_scrapy/spiders/mexico_grupo_aeroporto.py @@ -1,7 +1,6 @@ import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error class MexicoGrupoAeroporto(SimpleSpider): diff 
--git a/kingfisher_scrapy/spiders/moldova_old.py b/kingfisher_scrapy/spiders/moldova_old.py index fd01a9b6..a8324c21 100644 --- a/kingfisher_scrapy/spiders/moldova_old.py +++ b/kingfisher_scrapy/spiders/moldova_old.py @@ -1,7 +1,6 @@ import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error class MoldovaOld(SimpleSpider): diff --git a/kingfisher_scrapy/spiders/nepal_portal.py b/kingfisher_scrapy/spiders/nepal_portal.py index b17d20f8..3cc70953 100644 --- a/kingfisher_scrapy/spiders/nepal_portal.py +++ b/kingfisher_scrapy/spiders/nepal_portal.py @@ -4,7 +4,6 @@ import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error class NepalPortal(SimpleSpider): diff --git a/kingfisher_scrapy/spiders/scotland.py b/kingfisher_scrapy/spiders/scotland.py index 024dbf18..76b365dc 100644 --- a/kingfisher_scrapy/spiders/scotland.py +++ b/kingfisher_scrapy/spiders/scotland.py @@ -3,7 +3,6 @@ import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error class Scotland(SimpleSpider): diff --git a/kingfisher_scrapy/spiders/test_fail.py b/kingfisher_scrapy/spiders/test_fail.py index 0074000e..e27ef234 100644 --- a/kingfisher_scrapy/spiders/test_fail.py +++ b/kingfisher_scrapy/spiders/test_fail.py @@ -4,7 +4,6 @@ import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error class TestFail(SimpleSpider): diff --git a/kingfisher_scrapy/spiders/uruguay_base.py b/kingfisher_scrapy/spiders/uruguay_base.py index 01c24545..f1d6152a 100644 --- a/kingfisher_scrapy/spiders/uruguay_base.py +++ b/kingfisher_scrapy/spiders/uruguay_base.py @@ -4,7 +4,6 @@ import scrapy from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import handle_error class UruguayBase(SimpleSpider):
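After this series, a converted spider only declares its metadata and its
initial requests; parsing is inherited. A minimal sketch of a spider written
against the new base class (the spider name and URL are placeholders, not a
real source):

    import scrapy

    from kingfisher_scrapy.base_spider import SimpleSpider


    class ExampleSpider(SimpleSpider):
        name = 'example'
        data_type = 'release_package'

        def start_requests(self):
            # SimpleSpider's default parse() callback builds a File item
            # from the response, taking file_name from the request's
            # 'kf_filename' meta key.
            yield scrapy.Request(
                'https://example.com/api/package.json',
                meta={'kf_filename': 'all.json'},
            )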