From 24a6720de4807808d2820da615e9e8e9df71f3cc Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Fri, 29 May 2020 17:29:07 -0400
Subject: [PATCH 1/6] Add kf_filename meta to all initial requests

---
 kingfisher_scrapy/spiders/chile_base.py         | 14 ++++++--------
 kingfisher_scrapy/spiders/colombia_bulk.py      |  3 ++-
 kingfisher_scrapy/spiders/digiwhist_base.py     |  9 ++++++++-
 kingfisher_scrapy/spiders/dominican_republic.py |  9 ++++++---
 kingfisher_scrapy/spiders/france.py             |  5 +++--
 kingfisher_scrapy/spiders/honduras_cost.py      |  7 ++++++-
 kingfisher_scrapy/spiders/honduras_oncae.py     |  8 ++++++--
 .../spiders/honduras_portal_bulk_files.py       |  5 +++--
 kingfisher_scrapy/spiders/nepal_dhangadhi.py    |  5 +++--
 kingfisher_scrapy/spiders/nigeria_portal.py     |  7 ++++++-
 kingfisher_scrapy/spiders/uruguay_historical.py |  4 +---
 11 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/kingfisher_scrapy/spiders/chile_base.py b/kingfisher_scrapy/spiders/chile_base.py
index b04d113a4..8c242de0d 100644
--- a/kingfisher_scrapy/spiders/chile_base.py
+++ b/kingfisher_scrapy/spiders/chile_base.py
@@ -25,16 +25,14 @@ def get_year_month_until(self):
         until_month = 12 if self.start_year != datetime.datetime.now().year else until_month
         return until_year, until_month
 
-    def get_sample_request(self):
-        return scrapy.Request(
-            url=self.base_list_url.format(2017, 10, 0, 10),
-            meta={'year': 2017, 'month': 10}
-        )
-
     def start_requests(self):
         if self.sample:
-            yield self.get_sample_request()
+            yield scrapy.Request(
+                url=self.base_list_url.format(2017, 10, 0, 10),
+                meta={'kf_filename': 'list-2017-10.json', 'year': 2017, 'month': 10},
+            )
             return
+
         until_year, until_month = self.get_year_month_until()
         for year in range(self.start_year, until_year):
             for month in range(1, 13):
@@ -43,7 +41,7 @@ def start_requests(self):
                     break
                 yield scrapy.Request(
                     url=self.base_list_url.format(year, month, 0, self.limit),
-                    meta={'year': year, 'month': month}
+                    meta={'kf_filename': 'list-{}-{:02d}.json'.format(year, month), 'year': year, 'month': month},
                 )
 
     def base_parse(self, response, package_type):
diff --git a/kingfisher_scrapy/spiders/colombia_bulk.py b/kingfisher_scrapy/spiders/colombia_bulk.py
index 8b5dcc755..874b3c97e 100644
--- a/kingfisher_scrapy/spiders/colombia_bulk.py
+++ b/kingfisher_scrapy/spiders/colombia_bulk.py
@@ -26,7 +26,8 @@ class ColombiaBulk(ZipSpider):
     def start_requests(self):
         yield scrapy.Request(
             url='https://www.colombiacompra.gov.co/transparencia/datos-json',
-            callback=self.parse_list
+            meta={'kf_filename': 'list.html'},
+            callback=self.parse_list,
         )
 
     @handle_error
diff --git a/kingfisher_scrapy/spiders/digiwhist_base.py b/kingfisher_scrapy/spiders/digiwhist_base.py
index c587e8186..a35d418c7 100644
--- a/kingfisher_scrapy/spiders/digiwhist_base.py
+++ b/kingfisher_scrapy/spiders/digiwhist_base.py
@@ -1,14 +1,21 @@
 import tarfile
 from io import BytesIO
 
+import scrapy
+
 from kingfisher_scrapy.base_spider import BaseSpider
 from kingfisher_scrapy.util import handle_error
 
 
 class DigiwhistBase(BaseSpider):
+    def start_requests(self):
+        # See scrapy.spiders.Spider.start_requests
+        for url in self.start_urls:
+            yield scrapy.Request(url, dont_filter=True, meta={'kf_filename': 'file.tar.gz'})
+
     @handle_error
     def parse(self, response):
-        yield self.build_file_from_response(response, 'file.tar.gz', post_to_api=False)
+        yield self.build_file_from_response(response, response.request.meta['kf_filename'], post_to_api=False)
 
         # Load a line at the time, pass it to API
         with tarfile.open(fileobj=BytesIO(response.body), mode="r:gz") as tar:
diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py
index ac4a14e80..fe732e0e5 100644
--- a/kingfisher_scrapy/spiders/dominican_republic.py
+++ b/kingfisher_scrapy/spiders/dominican_republic.py
@@ -15,11 +15,14 @@ class DominicanRepublic(BaseSpider):
     }
 
     def start_requests(self):
-        yield scrapy.Request('https://www.dgcp.gob.do/estandar-mundial-ocds/',
-                             callback=self.parse_main_page)
+        yield scrapy.Request(
+            'https://www.dgcp.gob.do/estandar-mundial-ocds/',
+            meta={'kf_filename': 'list.html'},
+            callback=self.parse_list,
+        )
 
     @handle_error
-    def parse_main_page(self, response):
+    def parse_list(self, response):
         urls = response.css('.fileLink::attr(href)').getall()
         json_urls = list(filter(lambda x: '/JSON_DGCP_' in x, urls))
 
diff --git a/kingfisher_scrapy/spiders/france.py b/kingfisher_scrapy/spiders/france.py
index 4d7fcc13f..bf2989ddb 100644
--- a/kingfisher_scrapy/spiders/france.py
+++ b/kingfisher_scrapy/spiders/france.py
@@ -13,11 +13,12 @@ class France(BaseSpider):
     def start_requests(self):
         yield scrapy.Request(
             url='https://www.data.gouv.fr/api/1/datasets/?organization=534fff75a3a7292c64a77de4',
-            callback=self.parse_item
+            meta={'kf_filename': 'list.json'},
+            callback=self.parse_list,
         )
 
     @handle_error
-    def parse_item(self, response):
+    def parse_list(self, response):
         json_data = json.loads(response.text)
         data = json_data['data']
         for item in data:
diff --git a/kingfisher_scrapy/spiders/honduras_cost.py b/kingfisher_scrapy/spiders/honduras_cost.py
index 950c2e141..6d3fb9fda 100644
--- a/kingfisher_scrapy/spiders/honduras_cost.py
+++ b/kingfisher_scrapy/spiders/honduras_cost.py
@@ -8,7 +8,12 @@ class HondurasCoST(BaseSpider):
     name = 'honduras_cost'
-    start_urls = ['http://app.sisocs.org/protected/ocdsShow/']
+
+    def start_requests(self):
+        yield scrapy.Request(
+            'http://app.sisocs.org/protected/ocdsShow/',
+            meta={'kf_filename': 'list.html'},
+        )
 
     @handle_error
     def parse(self, response):
diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
index 7912b3728..cd9672006 100644
--- a/kingfisher_scrapy/spiders/honduras_oncae.py
+++ b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -8,11 +8,15 @@ class HondurasONCAE(ZipSpider):
     name = 'honduras_oncae'
-    start_urls = ['http://oncae.gob.hn/datosabiertos']
-
     # the files take too long to be downloaded, so we increase the download timeout
     download_timeout = 900
 
+    def start_requests(self):
+        yield scrapy.Request(
+            'http://oncae.gob.hn/datosabiertos',
+            meta={'kf_filename': 'list.html'},
+        )
+
     @handle_error
     def parse(self, response):
         urls = response.css(".article-content ul")\
diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
index 01efbed89..889624f5c 100644
--- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
+++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -13,11 +13,12 @@ class HondurasPortalBulkFiles(BaseSpider):
     def start_requests(self):
         yield scrapy.Request(
             'http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json',
-            callback=self.parse_json_list
+            meta={'kf_filename': 'list.json'},
+            callback=self.parse_list,
         )
 
     @handle_error
-    def parse_json_list(self, response):
+    def parse_list(self, response):
         filelist = json.loads(response.text)
 
         if self.sample:
diff --git a/kingfisher_scrapy/spiders/nepal_dhangadhi.py b/kingfisher_scrapy/spiders/nepal_dhangadhi.py
index 9cb5536bf..342e9d208 100644
--- a/kingfisher_scrapy/spiders/nepal_dhangadhi.py
+++ b/kingfisher_scrapy/spiders/nepal_dhangadhi.py
@@ -13,11 +13,12 @@ class NepalDhangadhi(BaseSpider):
     def start_requests(self):
         yield scrapy.Request(
             'https://admin.ims.susasan.org/api/static-data/dhangadhi',
-            callback=self.parse_item,
+            meta={'kf_filename': 'list.json'},
+            callback=self.parse_list,
         )
 
     @handle_error
-    def parse_item(self, response):
+    def parse_list(self, response):
         url = 'https://admin.ims.susasan.org/ocds/json/dhangadhi-{}.json'
         json_data = json.loads(response.text)
         fiscal_years = json_data['data']['fiscal_years']
diff --git a/kingfisher_scrapy/spiders/nigeria_portal.py b/kingfisher_scrapy/spiders/nigeria_portal.py
index f5a2b5db7..8b532503f 100644
--- a/kingfisher_scrapy/spiders/nigeria_portal.py
+++ b/kingfisher_scrapy/spiders/nigeria_portal.py
@@ -8,10 +8,15 @@ class NigeriaPortal(BaseSpider):
     name = 'nigeria_portal'
-    start_urls = ['http://nocopo.bpp.gov.ng/OpenData.aspx']
     download_delay = 0.9
     user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'  # noqa: E501
 
+    def start_requests(self):
+        yield scrapy.Request(
+            'http://nocopo.bpp.gov.ng/OpenData.aspx',
+            meta={'kf_filename': 'list.html'},
+        )
+
     @handle_error
     def parse(self, response):
         formdata = {
diff --git a/kingfisher_scrapy/spiders/uruguay_historical.py b/kingfisher_scrapy/spiders/uruguay_historical.py
index cb7b77901..e7c0b96d0 100644
--- a/kingfisher_scrapy/spiders/uruguay_historical.py
+++ b/kingfisher_scrapy/spiders/uruguay_historical.py
@@ -23,6 +23,4 @@ def start_requests(self):
         if self.sample:
             end_year = 2003
         for year in range(2002, end_year):
-            yield scrapy.Request(
-                url=base_url.format(year)
-            )
+            yield scrapy.Request(base_url.format(year), meta={'kf_filename': 'OCDS-{}.zip'.format(year)})

From 9dbc0ea1459daa7a0b91b25b53b3ac5b5c17d43e Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Fri, 29 May 2020 17:30:01 -0400
Subject: [PATCH 2/6] Test that, if an initial request errors, it returns a FileError item with a file_name key

---
 tests/test_spiders.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_spiders.py b/tests/test_spiders.py
index d6c68a88a..4cb4c5478 100644
--- a/tests/test_spiders.py
+++ b/tests/test_spiders.py
@@ -35,8 +35,9 @@ def test_start_requests_http_error(spider_name):
         assert len(items) == 1
         for item in items:
             assert isinstance(item, FileError)
-            assert len(item) <= 3
+            assert len(item) == 3
             assert item['errors'] == {'http_code': 555}
+            assert item['file_name']
             assert item['url']
     except CloseSpider as e:
         warnings.warn('{}: {}'.format(spidercls.name, e.reason))
@@ -46,4 +47,4 @@
 def test_start_urls_start_requests(spider_name):
     spidercls = runner.spider_loader.load(spider_name)
 
-    assert hasattr(spidercls, 'start_urls') ^ method_is_overridden(spidercls, scrapy.Spider, 'start_requests')
+    assert 'start_urls' not in spidercls.__dict__ or 'start_requests' not in spidercls.__dict__

From e9441b5623f7caee6d90dcdf7ed938ad093419b5 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Fri, 29 May 2020 17:32:03 -0400
Subject: [PATCH 3/6] Add validation for required fields in items

---
 kingfisher_scrapy/base_spider.py | 10 +++----
 kingfisher_scrapy/exceptions.py  |  6 +++-
 kingfisher_scrapy/extensions.py  |  2 +-
 kingfisher_scrapy/items.py       | 48 +++++++++++++++++++++++++++-----
 kingfisher_scrapy/middlewares.py |  5 ----
 kingfisher_scrapy/pipelines.py   |  7 +++--
 kingfisher_scrapy/settings.py    |  6 ++--
 7 files changed, 60 insertions(+), 24 deletions(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index 7cc90d4d0..b97dbec2e 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -129,14 +129,14 @@ def build_file_item(self, number, data, data_type, url, encoding, file_name):
         })
 
     def build_file_error_from_response(self, response, **kwargs):
-        file_error = {
+        item = FileError({
             'url': response.request.url,
             'errors': {'http_code': response.status},
-        }
+        })
         if 'kf_filename' in response.request.meta:
-            file_error['file_name'] = response.request.meta['kf_filename']
-        file_error.update(kwargs)
-        return FileError(file_error)
+            item['file_name'] = response.request.meta['kf_filename']
+        item.update(kwargs)
+        return item
 
     def _get_package_metadata(self, f, skip_key):
         """
diff --git a/kingfisher_scrapy/exceptions.py b/kingfisher_scrapy/exceptions.py
index 04f1a6c08..da83a3a5f 100644
--- a/kingfisher_scrapy/exceptions.py
+++ b/kingfisher_scrapy/exceptions.py
@@ -7,4 +7,8 @@ class AuthenticationError(KingfisherScrapyError):
 
 
 class SpiderArgumentError(KingfisherScrapyError):
-    """Raises when a spider argument's value is invalid"""
+    """Raised when a spider argument's value is invalid"""
+
+
+class MissingRequiredFieldError(KingfisherScrapyError, KeyError):
+    """Raised when an item is missing a required field"""
diff --git a/kingfisher_scrapy/extensions.py b/kingfisher_scrapy/extensions.py
index 10d77e243..297dd9afe 100644
--- a/kingfisher_scrapy/extensions.py
+++ b/kingfisher_scrapy/extensions.py
@@ -40,7 +40,7 @@ def item_scraped(self, item, spider):
         metadata = {
             'url': item['url'],
             'data_type': item['data_type'],
-            'encoding': item['encoding'],
+            'encoding': item.get('encoding', 'utf-8'),
         }
 
         self._write_file(path + '.fileinfo', metadata, spider)
diff --git a/kingfisher_scrapy/items.py b/kingfisher_scrapy/items.py
index 29c2fef17..a05d5c2bf 100644
--- a/kingfisher_scrapy/items.py
+++ b/kingfisher_scrapy/items.py
@@ -1,9 +1,26 @@
+# https://docs.scrapy.org/en/latest/topics/items.html
 import scrapy
 
+from kingfisher_scrapy.exceptions import MissingRequiredFieldError
 
-class File(scrapy.Item):
+
+class KingfisherItem(scrapy.Item):
     file_name = scrapy.Field()
     url = scrapy.Field()
+
+    def validate(self):
+        """
+        Raises an error if any required field is missing.
+
+        :raises kingfisher_scrapy.extensions.MissingRequiredFieldError: if any required field is missing
+        """
+        if hasattr(self, 'required'):
+            for field in self.required:
+                if field not in self:
+                    raise MissingRequiredFieldError(field)
+
+
+class File(KingfisherItem):
     data = scrapy.Field()
     data_type = scrapy.Field()
     encoding = scrapy.Field()
@@ -15,17 +32,34 @@ class File(scrapy.Item):
     path = scrapy.Field()
     files_store = scrapy.Field()
 
+    required = [
+        'file_name',
+        'url',
+        'data',
+        'data_type',
+    ]
+
 
-class FileItem(scrapy.Item):
+class FileItem(KingfisherItem):
     number = scrapy.Field()
-    file_name = scrapy.Field()
-    url = scrapy.Field()
     data = scrapy.Field()
     data_type = scrapy.Field()
     encoding = scrapy.Field()
 
+    required = [
+        'number',
+        'file_name',
+        'url',
+        'data',
+        'data_type',
+    ]
+
 
-class FileError(scrapy.Item):
-    file_name = scrapy.Field()
-    url = scrapy.Field()
+class FileError(KingfisherItem):
     errors = scrapy.Field()
+
+    required = [
+        'file_name',
+        'url',
+        'errors',
+    ]
diff --git a/kingfisher_scrapy/middlewares.py b/kingfisher_scrapy/middlewares.py
index 1d8aba0c2..b0e7cc5e5 100644
--- a/kingfisher_scrapy/middlewares.py
+++ b/kingfisher_scrapy/middlewares.py
@@ -1,8 +1,3 @@
-# -*- coding: utf-8 -*-
-
-# Define here the models for your spider middleware
-#
-# See documentation in:
 # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 
 import logging
diff --git a/kingfisher_scrapy/pipelines.py b/kingfisher_scrapy/pipelines.py
index 5ab05e299..2fc034159 100644
--- a/kingfisher_scrapy/pipelines.py
+++ b/kingfisher_scrapy/pipelines.py
@@ -2,8 +2,11 @@
 # https://docs.scrapy.org/en/latest/topics/signals.html#item-signals
 
 
-class KingfisherScrapyPipeline:
+class Validate:
     def process_item(self, item, spider):
-        item.validate()
+        if hasattr(item, 'validate'):
+            # We call this in the item pipeline to guarantee that all items are validated. However, its backtrace isn't
+            # as helpful for debugging, so we could also call it in ``BaseSpider`` if this becomes an issue.
+            item.validate()
 
         return item
diff --git a/kingfisher_scrapy/settings.py b/kingfisher_scrapy/settings.py
index a37532416..53fc8d389 100644
--- a/kingfisher_scrapy/settings.py
+++ b/kingfisher_scrapy/settings.py
@@ -75,9 +75,9 @@
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'kingfisher_scrapy.pipelines.KingfisherScrapyPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    'kingfisher_scrapy.pipelines.Validate': 300,
+}
 
 # To send items to Kingfishet Process, set this to, for example, "http://kingfisher.example.com" (no trailing slash).
 KINGFISHER_API_URI = os.getenv('KINGFISHER_API_URI')

From 78c0aa405b1c8713d1715215159d6f30e081dd1d Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Fri, 29 May 2020 17:34:35 -0400
Subject: [PATCH 4/6] flake8

---
 tests/test_spiders.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test_spiders.py b/tests/test_spiders.py
index 4cb4c5478..1a4879765 100644
--- a/tests/test_spiders.py
+++ b/tests/test_spiders.py
@@ -1,11 +1,9 @@
 import warnings
 
 import pytest
-import scrapy
 from scrapy.crawler import Crawler, CrawlerRunner
 from scrapy.exceptions import CloseSpider
 from scrapy.http import Response
-from scrapy.utils.deprecate import method_is_overridden
 from scrapy.utils.project import get_project_settings
 
 from kingfisher_scrapy.items import FileError

From a6327e504d43a895b3b9e56fa503a12afd7272d8 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Fri, 29 May 2020 18:20:12 -0400
Subject: [PATCH 5/6] Add kf_filename meta to all requests that can yield file errors

---
 .../spiders/argentina_buenos_aires.py      |  2 +-
 kingfisher_scrapy/spiders/australia_nsw.py | 17 ++++++----
 .../spiders/dominican_republic.py          |  2 +-
 kingfisher_scrapy/spiders/france.py        |  3 ++-
 .../spiders/honduras_portal_bulk_files.py  |  6 +++--
 .../spiders/indonesia_bandung.py           |  1 +
 kingfisher_scrapy/spiders/openopps.py      | 27 +++++++++++--------
 .../spiders/paraguay_dncp_base.py          | 11 ++++++--
 .../spiders/paraguay_hacienda.py           | 19 ++++++++++---
 9 files changed, 61 insertions(+), 27 deletions(-)

diff --git a/kingfisher_scrapy/spiders/argentina_buenos_aires.py b/kingfisher_scrapy/spiders/argentina_buenos_aires.py
index a434a0164..d1847e3d6 100644
--- a/kingfisher_scrapy/spiders/argentina_buenos_aires.py
+++ b/kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -34,4 +34,4 @@ def parse_list(self, response):
         data = json.loads(response.text)
         for resource in data['result']['resources']:
             if resource['format'].upper() == 'JSON':
-                yield scrapy.Request(url=resource['url'])
+                yield scrapy.Request(resource['url'], meta={'kf_filename': resource['url'].rsplit('/', 1)[-1]})
diff --git a/kingfisher_scrapy/spiders/australia_nsw.py b/kingfisher_scrapy/spiders/australia_nsw.py
index d2313d454..7a3d53478 100644
--- a/kingfisher_scrapy/spiders/australia_nsw.py
+++ b/kingfisher_scrapy/spiders/australia_nsw.py
@@ -17,7 +17,10 @@ def start_requests(self):
         for release_type in release_types:
             yield scrapy.Request(
                 url.format(release_type, page_limit),
-                meta={'release_type': release_type},
+                meta={
+                    'kf_filename': '{}.json'.format(release_type),
+                    'release_type': release_type,
+                },
                 callback=self.parse_list
             )
 
@@ -25,33 +28,37 @@ def parse_list(self, response):
         if self.is_http_success(response):
             json_data = json.loads(response.text)
+            release_type = response.request.meta['release_type']
+
             # More Pages?
             if 'links' in json_data and isinstance(json_data['links'], dict) and 'next' in json_data['links'] \
                     and not self.sample:
                 yield scrapy.Request(
                     json_data['links']['next'],
-                    meta={'release_type': response.request.meta['release_type']},
+                    meta={
+                        'kf_filename': hashlib.md5(json_data['links']['next'].encode('utf-8')).hexdigest() + '.json',
+                        'release_type': release_type,
+                    },
                     callback=self.parse_list
                 )
 
             # Data?
             for release in json_data['releases']:
-                if response.request.meta['release_type'] == 'planning':
+                if release_type == 'planning':
                     uuid = release['tender']['plannedProcurementUUID']
                     yield scrapy.Request(
                         'https://tenders.nsw.gov.au/?event=public.api.planning.view&PlannedProcurementUUID=%s' % uuid,
                         meta={'kf_filename': 'plannning-%s.json' % uuid},
                         callback=self.parse
                     )
-                if response.request.meta['release_type'] == 'tender':
+                if release_type == 'tender':
                     uuid = release['tender']['RFTUUID']
                     yield scrapy.Request(
                         'https://tenders.nsw.gov.au/?event=public.api.tender.view&RFTUUID=%s' % uuid,
                         meta={'kf_filename': 'tender-%s.json' % uuid},
                         callback=self.parse
                     )
-                if response.request.meta['release_type'] == 'contract':
+                if release_type == 'contract':
                     for award in release['awards']:
                         uuid = award['CNUUID']
                         yield scrapy.Request(
diff --git a/kingfisher_scrapy/spiders/dominican_republic.py b/kingfisher_scrapy/spiders/dominican_republic.py
index fe732e0e5..7962a749d 100644
--- a/kingfisher_scrapy/spiders/dominican_republic.py
+++ b/kingfisher_scrapy/spiders/dominican_republic.py
@@ -31,7 +31,7 @@ def parse_list(self, response):
         for url in json_urls:
             if '/JSON_DGCP_' in url:
-                yield scrapy.Request('https:' + url)
+                yield scrapy.Request('https:' + url, meta={'kf_filename': url.rsplit('/', 1)[-1]})
 
     def parse(self, response):
         if self.is_http_success(response):
diff --git a/kingfisher_scrapy/spiders/france.py b/kingfisher_scrapy/spiders/france.py
index bf2989ddb..203b9ba0e 100644
--- a/kingfisher_scrapy/spiders/france.py
+++ b/kingfisher_scrapy/spiders/france.py
@@ -41,7 +41,8 @@ def parse_list(self, response):
         if next_page:
             yield scrapy.Request(
                 next_page,
-                callback=self.parse_item
+                meta={'kf_filename': hashlib.md5(next_page.encode('utf-8')).hexdigest() + '.json'},
+                callback=self.parse_list
             )
 
     @handle_error
diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
index 889624f5c..e60582bae 100644
--- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
+++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -22,11 +22,13 @@ def parse_list(self, response):
         filelist = json.loads(response.text)
 
         if self.sample:
-            yield scrapy.Request(filelist[0]['urls']['json'])
+            url = filelist[0]['urls']['json']
+            yield scrapy.Request(url, meta={'kf_filename': url.rsplit('/', 1)[-1]})
 
         else:
             for item in filelist:
-                yield scrapy.Request(item['urls']['json'])
+                url = item['urls']['json']
+                yield scrapy.Request(url, meta={'kf_filename': url.rsplit('/', 1)[-1]})
 
     def parse(self, response):
         filename = urlparse(response.request.url).path.split('/')[-2]
diff --git a/kingfisher_scrapy/spiders/indonesia_bandung.py b/kingfisher_scrapy/spiders/indonesia_bandung.py
index cec93b1e1..0b6c7a683 100644
--- a/kingfisher_scrapy/spiders/indonesia_bandung.py
+++ b/kingfisher_scrapy/spiders/indonesia_bandung.py
@@ -39,6 +39,7 @@ def parse_data(self, response):
         if next_page_url:
             yield scrapy.Request(
                 next_page_url,
+                meta={'kf_filename': next_page_url.rsplit('/', 1)[-1] + '.json'},
                 callback=self.parse_data
             )
diff --git a/kingfisher_scrapy/spiders/openopps.py b/kingfisher_scrapy/spiders/openopps.py
index 930edc130..79bb8bb59 100644
--- a/kingfisher_scrapy/spiders/openopps.py
+++ b/kingfisher_scrapy/spiders/openopps.py
@@ -175,9 +175,13 @@ def parse(self, response):
                 next_url = results.get('next')
                 if next_url:
                     yield scrapy.Request(
-                        url=next_url,
-                        headers={"Accept": "*/*", "Content-Type": "application/json"},
-                        meta={"release_date": release_date, "search_h": search_h},
+                        next_url,
+                        meta={
+                            'kf_filename': hashlib.md5(next_url.encode('utf-8')).hexdigest() + '.json',
+                            'release_date': release_date,
+                            'search_h': search_h,
+                        },
+                        headers={'Accept': '*/*', 'Content-Type': 'application/json'}
                     )
 
                 # Tells if we have to re-authenticate before the token expires
@@ -221,15 +225,16 @@ def parse(self, response):
                     self.logger.info('Changing filters, split in {}: {}.'.format(parts, response.request.url))
                     for i in range(len(start_hour_list)):
+                        url = self.base_page_url.format(start_hour_list[i], end_hour_list[i])
                         yield scrapy.Request(
-                            url=self.base_page_url.format(
-                                start_hour_list[i],
-                                end_hour_list[i]
-                            ),
-                            headers={"Accept": "*/*", "Content-Type": "application/json"},
-                            meta={"release_date": start_hour_list[i],  # release_date with star hour
-                                  "last_hour": end_hour_list[i],  # release_date with last hour
-                                  "search_h": split_h},  # new search range
+                            url,
+                            meta={
+                                'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json',
+                                'release_date': start_hour_list[i],  # release_date with star hour
+                                'last_hour': end_hour_list[i],  # release_date with last hour
+                                'search_h': split_h,  # new search range
+                            },
+                            headers={'Accept': '*/*', 'Content-Type': 'application/json'}
                         )
                     else:
                         # Message for pages that exceed the 10,000 search results in the range of one hour
diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py
index 1725e0504..69b2d4a27 100644
--- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py
+++ b/kingfisher_scrapy/spiders/paraguay_dncp_base.py
@@ -51,10 +51,15 @@ def from_crawler(cls, crawler, *args, **kwargs):
     def start_requests(self):
         if self.from_date:
+            from_date = self.from_date.strftime(self.date_format)
             self.base_page_url = '{}/search/processes?tipo_fecha=fecha_release&fecha_desde={}'\
-                .format(self.base_url, self.from_date.strftime(self.date_format))
+                .format(self.base_url, from_date)
         yield scrapy.Request(
             self.base_page_url,
+            meta={
+                'kf_filename': '{}-1.json'.format(from_date),
+                'from_date': from_date,
+            },
             # send duplicate requests when the token expired and in the continuation of last_request saved.
             dont_filter=True,
             callback=self.parse_pages
         )
@@ -123,9 +128,11 @@ def parse_pages(self, response):
             )
         pagination = content['pagination']
         if pagination['current_page'] < pagination['total_pages'] and not self.sample:
-            url = '{}&page={}'.format(self.base_page_url, pagination['current_page'] + 1)
+            page = pagination['current_page'] + 1
+            url = '{}&page={}'.format(self.base_page_url, page)
             yield scrapy.Request(
                 url,
+                meta={'kf_filename': '{}-{}.json'.format(response.request.meta['from_date'], page)},
                 dont_filter=True,
                 callback=self.parse_pages
             )
diff --git a/kingfisher_scrapy/spiders/paraguay_hacienda.py b/kingfisher_scrapy/spiders/paraguay_hacienda.py
index cbc98cf4e..b0db40f4d 100644
--- a/kingfisher_scrapy/spiders/paraguay_hacienda.py
+++ b/kingfisher_scrapy/spiders/paraguay_hacienda.py
@@ -44,9 +44,13 @@ def start_requests(self):
         # so we first iterate over this list that is paginated
         yield scrapy.Request(
             self.base_list_url.format(1),
+            meta={
+                'kf_filename': 'list-1.json',
+                'meta': True,
+                'first': True,
+            },
             # send duplicate requests when the token expired and in the continuation of last_request saved.
             dont_filter=True,
-            meta={'meta': True, 'first': True}
         )
 
     @handle_error
@@ -60,7 +64,11 @@ def parse(self, response):
             for page in range(2, total_pages+1):
                 yield scrapy.Request(
                     url=self.base_list_url.format(page),
-                    meta={'meta': True, 'first': False},
+                    meta={
+                        'kf_filename': 'list-{}.json'.format(page),
+                        'meta': True,
+                        'first': False,
+                    },
                     dont_filter=True
                 )
 
@@ -76,8 +84,11 @@ def parse(self, response):
                 self.release_ids.append(row['idLlamado'])
                 yield scrapy.Request(
                     url=base_url.format(row['idLlamado']),
-                    meta={'meta': False, 'first': False,
-                          'kf_filename': 'release-{}.json'.format(row['idLlamado'])},
+                    meta={
+                        'kf_filename': 'release-{}.json'.format(row['idLlamado']),
+                        'meta': False,
+                        'first': False,
+                    },
                     dont_filter=True
                 )
             else:

From b2d0b88ca49540b552d93241d1d4605956f78a6c Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Fri, 29 May 2020 18:29:27 -0400
Subject: [PATCH 6/6] Add test for Validate item pipeline

---
 tests/test_validate.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 tests/test_validate.py

diff --git a/tests/test_validate.py b/tests/test_validate.py
new file mode 100644
index 000000000..641c73e9d
--- /dev/null
+++ b/tests/test_validate.py
@@ -0,0 +1,24 @@
+import pytest
+from kingfisher_scrapy.exceptions import MissingRequiredFieldError
+from kingfisher_scrapy.items import File
+from kingfisher_scrapy.pipelines import Validate
+
+
+def test_process_item():
+    pipeline = Validate()
+    item = File({
+        'file_name': '',
+        'data': '',
+        'data_type': '',
+        'url': '',
+    })
+
+    assert pipeline.process_item(item, None) == item
+
+
+def test_process_item_error():
+    pipeline = Validate()
+    item = File()
+
+    with pytest.raises(MissingRequiredFieldError):
+        pipeline.process_item(item, None)
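
As a rough, hypothetical sketch (not part of the patch series above), the pieces combine like this: a spider sets a 'kf_filename' key on every request it yields; if the request errors, build_file_error_from_response() copies that value into the FileError item's required 'file_name' field, and the Validate pipeline raises MissingRequiredFieldError whenever a required field is absent. The spider name, URL and data_type below are invented for illustration, and build_file_from_response() is assumed to accept a data_type keyword as it is used elsewhere in this repository.

    import scrapy

    from kingfisher_scrapy.base_spider import BaseSpider
    from kingfisher_scrapy.util import handle_error


    class ExampleSpider(BaseSpider):
        # Hypothetical spider, for illustration only.
        name = 'example'

        def start_requests(self):
            # 'kf_filename' lets build_file_error_from_response() populate the
            # required FileError 'file_name' field if this request fails.
            yield scrapy.Request(
                'https://example.com/releases.json',
                meta={'kf_filename': 'list.json'},
            )

        @handle_error
        def parse(self, response):
            # On success, emit a File item named after the request's kf_filename.
            yield self.build_file_from_response(
                response,
                response.request.meta['kf_filename'],
                data_type='release_package',
            )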