From ade41c423bcb2496dd4975af8529c6612f00fad0 Mon Sep 17 00:00:00 2001
From: Andres Aguilera
Date: Tue, 1 Sep 2020 12:36:59 -0400
Subject: [PATCH 1/2] Add new dominican_republic scraper

---
 .../spiders/dominican_republic_portal.py | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 kingfisher_scrapy/spiders/dominican_republic_portal.py

diff --git a/kingfisher_scrapy/spiders/dominican_republic_portal.py b/kingfisher_scrapy/spiders/dominican_republic_portal.py
new file mode 100644
index 000000000..0f7adfb76
--- /dev/null
+++ b/kingfisher_scrapy/spiders/dominican_republic_portal.py
@@ -0,0 +1,40 @@
+import scrapy
+
+from kingfisher_scrapy.base_spider import LinksSpider
+from kingfisher_scrapy.util import parameters
+
+
+class DominicanRepublicPortal(LinksSpider):
+    """
+    API documentation
+      http://148.101.176.123:48080/ocdsdr/docs
+    Spider arguments
+      sample
+        Downloads the first release package returned by the main endpoint.
+      from_date
+        Download only data from this date onward (YYYY-MM-DD format).
+        If ``until_date`` is provided, defaults to '2018-01-01'.
+      until_date
+        Download only data until this date (YYYY-MM-DD format).
+        If ``from_date`` is provided, defaults to today.
+    """
+    name = 'dominican_republic_portal'
+    data_type = 'release_package'
+    default_from_date = '2018-01-01'
+    next_page_formatter = staticmethod(parameters('page'))
+
+    def start_requests(self):
+        url = 'http://148.101.176.123:48080/ocdsdr/api/v1/releases'
+        if self.from_date and self.until_date:
+            url = url + '/byDatesBetween/{}/{}'.format(
+                self.from_date.strftime('%Y-%m-%d'),
+                self.until_date.strftime('%Y-%m-%d')
+            )
+        yield scrapy.Request(url, meta={'file_name': 'page-1.json'}, callback=self.parse_response)
+
+    def parse_response(self, response):
+        if not self.is_http_success(response) and response.body == b'{"detail":"Not Found"}':
+            self.logger.info(
+                f'No data found for url: {response.request.url}. '
+                f'Date range: {self.from_date.strftime("%Y-%m-%d")} to {self.until_date.strftime("%Y-%m-%d")}.'
+            )

From 149fcdccaa74676600d1027bf16a9fde2d005677 Mon Sep 17 00:00:00 2001
From: Andres Aguilera
Date: Wed, 2 Sep 2020 09:58:12 -0400
Subject: [PATCH 2/2] Changes

---
 ...n_republic_portal.py => dominican_republic_api.py} | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)
 rename kingfisher_scrapy/spiders/{dominican_republic_portal.py => dominican_republic_api.py} (72%)

diff --git a/kingfisher_scrapy/spiders/dominican_republic_portal.py b/kingfisher_scrapy/spiders/dominican_republic_api.py
similarity index 72%
rename from kingfisher_scrapy/spiders/dominican_republic_portal.py
rename to kingfisher_scrapy/spiders/dominican_republic_api.py
index 0f7adfb76..9484cf363 100644
--- a/kingfisher_scrapy/spiders/dominican_republic_portal.py
+++ b/kingfisher_scrapy/spiders/dominican_republic_api.py
@@ -18,7 +18,7 @@ class DominicanRepublicPortal(LinksSpider):
         Download only data until this date (YYYY-MM-DD format).
         If ``from_date`` is provided, defaults to today.
     """
-    name = 'dominican_republic_portal'
+    name = 'dominican_republic_api'
     data_type = 'release_package'
     default_from_date = '2018-01-01'
     next_page_formatter = staticmethod(parameters('page'))
@@ -30,11 +30,4 @@ def start_requests(self):
                 self.from_date.strftime('%Y-%m-%d'),
                 self.until_date.strftime('%Y-%m-%d')
             )
-        yield scrapy.Request(url, meta={'file_name': 'page-1.json'}, callback=self.parse_response)
-
-    def parse_response(self, response):
-        if not self.is_http_success(response) and response.body == b'{"detail":"Not Found"}':
-            self.logger.info(
-                f'No data found for url: {response.request.url}. '
-                f'Date range: {self.from_date.strftime("%Y-%m-%d")} to {self.until_date.strftime("%Y-%m-%d")}.'
-            )
+        yield scrapy.Request(url, meta={'file_name': 'page-1.json'})