From b0f2c40d2543daf247576c6e02472a390b6d5f61 Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Mon, 5 Oct 2020 10:33:56 -0300 Subject: [PATCH 1/5] Add Nicaragua Solid Waste spider --- .../spiders/nicaragua_solid_waste.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 kingfisher_scrapy/spiders/nicaragua_solid_waste.py diff --git a/kingfisher_scrapy/spiders/nicaragua_solid_waste.py b/kingfisher_scrapy/spiders/nicaragua_solid_waste.py new file mode 100644 index 000000000..ac8d753b5 --- /dev/null +++ b/kingfisher_scrapy/spiders/nicaragua_solid_waste.py @@ -0,0 +1,35 @@ +from kingfisher_scrapy.base_spider import SimpleSpider +from kingfisher_scrapy.util import components + + +class NicaraguaSolidWaste(SimpleSpider): + """ + Spider arguments + sample + Download only data released on 2013-01-23 + from_date + Download only data from this date onward (YYYY-MM-DD format). + If ``until_date`` is provided, defaults to '2000-01-01'. + until_date + Download only data until this date (YYYY-MM-DD format). + If ``from_date`` is provided, defaults to today. + """ + name = 'nicaragua_solid_waste' + data_type = 'release_package' + default_from_date = '2000-01-01' + url = 'http://www.gekoware.com/swmp/api/ocds/' + + def start_requests(self): + if self.sample: + # date parameter setting to get a one release from 2013 + url = self.url + '20130123/20130123' + else: + if self.from_date and self.until_date: + # date parameter obtained + url = self.url + self.from_date.strftime("%Y%m%d") + '/' + self.until_date.strftime("%Y%m%d") + else: + # date parameter setting to get all releases + url = self.url + '20000101/20201231' + + # url looks like http://www.gekoware.com/swmp/api/ocds/20190101/20201005 + yield self.build_request(url, formatter=components(-2)) From 91b7ff2a38e5303655f50ef100bcb18a609680db Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Mon, 5 Oct 2020 12:36:45 -0300 Subject: [PATCH 2/5] Update changes --- kingfisher_scrapy/spiders/nicaragua_solid_waste.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kingfisher_scrapy/spiders/nicaragua_solid_waste.py b/kingfisher_scrapy/spiders/nicaragua_solid_waste.py index ac8d753b5..9f88d026f 100644 --- a/kingfisher_scrapy/spiders/nicaragua_solid_waste.py +++ b/kingfisher_scrapy/spiders/nicaragua_solid_waste.py @@ -17,19 +17,17 @@ class NicaraguaSolidWaste(SimpleSpider): name = 'nicaragua_solid_waste' data_type = 'release_package' default_from_date = '2000-01-01' - url = 'http://www.gekoware.com/swmp/api/ocds/' + date_required = True + url = 'http://www.gekoware.com/swmp/api/ocds/{}/{}' def start_requests(self): if self.sample: - # date parameter setting to get a one release from 2013 - url = self.url + '20130123/20130123' + # date parameter setting to get one release from 2013 + url = self.url.format('20130123', '20130123') else: if self.from_date and self.until_date: # date parameter obtained - url = self.url + self.from_date.strftime("%Y%m%d") + '/' + self.until_date.strftime("%Y%m%d") - else: - # date parameter setting to get all releases - url = self.url + '20000101/20201231' + url = self.url.format(self.from_date.strftime("%Y%m%d"), self.until_date.strftime("%Y%m%d")) # url looks like http://www.gekoware.com/swmp/api/ocds/20190101/20201005 yield self.build_request(url, formatter=components(-2)) From d6e5f8709f1b99bf472657a7ab2723c3d4e7530c Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Mon, 5 Oct 2020 12:37:27 -0300 Subject: [PATCH 3/5] Add date_required spider attribute --- kingfisher_scrapy/base_spider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index e966864d3..cb3841592 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -130,7 +130,7 @@ def from_crawler(cls, crawler, *args, **kwargs): raise SpiderArgumentError('spider argument crawl_time: invalid date value: {}'.format(e)) # Checks Spider date ranges arguments - if spider.from_date or spider.until_date: + if spider.from_date or spider.until_date or spider.date_required: if not spider.from_date: # Default to `default_from_date` class attribute. spider.from_date = spider.default_from_date From fccbd4a70f617863974234916f802269ea4035bd Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Mon, 5 Oct 2020 12:46:03 -0300 Subject: [PATCH 4/5] Fix warnings --- kingfisher_scrapy/base_spider.py | 2 ++ kingfisher_scrapy/spiders/nicaragua_solid_waste.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index cb3841592..2d754506f 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -76,6 +76,7 @@ class BaseSpider(scrapy.Spider): ocds_version = '1.1' date_format = 'date' + date_required = None def __init__(self, sample=None, note=None, from_date=None, until_date=None, crawl_time=None, keep_collection_open=None, package_pointer=None, release_pointer=None, truncate=None, qs=None, *args, @@ -96,6 +97,7 @@ def __init__(self, sample=None, note=None, from_date=None, until_date=None, craw self.qs = qs self.date_format = self.VALID_DATE_FORMATS[self.date_format] + self.date_required = self.date_required self.pluck = bool(package_pointer or release_pointer) if self.qs and hasattr(self, 'start_requests'): diff --git a/kingfisher_scrapy/spiders/nicaragua_solid_waste.py b/kingfisher_scrapy/spiders/nicaragua_solid_waste.py index 9f88d026f..e410a622e 100644 --- a/kingfisher_scrapy/spiders/nicaragua_solid_waste.py +++ b/kingfisher_scrapy/spiders/nicaragua_solid_waste.py @@ -21,13 +21,14 @@ class NicaraguaSolidWaste(SimpleSpider): url = 'http://www.gekoware.com/swmp/api/ocds/{}/{}' def start_requests(self): + url = self.url if self.sample: # date parameter setting to get one release from 2013 - url = self.url.format('20130123', '20130123') + url = url.format('20130123', '20130123') else: if self.from_date and self.until_date: # date parameter obtained - url = self.url.format(self.from_date.strftime("%Y%m%d"), self.until_date.strftime("%Y%m%d")) + url = url.format(self.from_date.strftime("%Y%m%d"), self.until_date.strftime("%Y%m%d")) # url looks like http://www.gekoware.com/swmp/api/ocds/20190101/20201005 yield self.build_request(url, formatter=components(-2)) From 37d4763b9853c8f55c5ba780f01a7c0efac74de5 Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Mon, 5 Oct 2020 14:05:02 -0300 Subject: [PATCH 5/5] Update changes --- kingfisher_scrapy/base_spider.py | 7 ++++--- kingfisher_scrapy/spiders/nicaragua_solid_waste.py | 6 ++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 2d754506f..0ee87526e 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -76,7 +76,8 @@ class BaseSpider(scrapy.Spider): ocds_version = '1.1' date_format = 'date' - date_required = None + # Set `date_required` to True in class attribute to always set the `from` and `until` date parameters. + date_required = False def __init__(self, sample=None, note=None, from_date=None, until_date=None, crawl_time=None, keep_collection_open=None, package_pointer=None, release_pointer=None, truncate=None, qs=None, *args, @@ -131,14 +132,14 @@ def from_crawler(cls, crawler, *args, **kwargs): except ValueError as e: raise SpiderArgumentError('spider argument crawl_time: invalid date value: {}'.format(e)) - # Checks Spider date ranges arguments + # Checks Spider date ranges arguments and `date_required` class attribute. if spider.from_date or spider.until_date or spider.date_required: if not spider.from_date: # Default to `default_from_date` class attribute. spider.from_date = spider.default_from_date try: if isinstance(spider.from_date, str): - # convert to date format, if needed + # Convert to date format, if needed. spider.from_date = datetime.strptime(spider.from_date, spider.date_format) except ValueError as e: raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e)) diff --git a/kingfisher_scrapy/spiders/nicaragua_solid_waste.py b/kingfisher_scrapy/spiders/nicaragua_solid_waste.py index e410a622e..e2bd97fc9 100644 --- a/kingfisher_scrapy/spiders/nicaragua_solid_waste.py +++ b/kingfisher_scrapy/spiders/nicaragua_solid_waste.py @@ -26,9 +26,7 @@ def start_requests(self): # date parameter setting to get one release from 2013 url = url.format('20130123', '20130123') else: - if self.from_date and self.until_date: - # date parameter obtained - url = url.format(self.from_date.strftime("%Y%m%d"), self.until_date.strftime("%Y%m%d")) - + # date parameter obtained + url = url.format(self.from_date.strftime("%Y%m%d"), self.until_date.strftime("%Y%m%d")) # url looks like http://www.gekoware.com/swmp/api/ocds/20190101/20201005 yield self.build_request(url, formatter=components(-2))