From ff6180eebf94affce0ccd740a81faafe87d07dbd Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Mon, 13 Jul 2020 13:26:09 -0400 Subject: [PATCH 1/5] Add until_date to paraguay_dncp spider Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/spiders/paraguay_dncp_base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py index f54175b3..0fa1d035 100644 --- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py +++ b/kingfisher_scrapy/spiders/paraguay_dncp_base.py @@ -21,7 +21,7 @@ class ParaguayDNCPBaseSpider(SimpleSpider): last_request = None request_time_limit = 13 # in minutes base_url = 'https://contrataciones.gov.py/datos/api/v3/doc' - base_page_url = f'{base_url}/search/processes?fecha_desde=2010-01-01' + base_page_url = '{}/search/processes?tipo_fecha=fecha_release&fecha_desde={}&fecha_hasta={}' auth_url = f'{base_url}/oauth/token' request_token = None max_attempts = 10 @@ -49,10 +49,13 @@ def from_crawler(cls, crawler, *args, **kwargs): return spider def start_requests(self): - if self.from_date: + if self.from_date or self.until_date: self.from_date = self.from_date.strftime(self.date_format) - self.base_page_url = '{}/search/processes?tipo_fecha=fecha_release&fecha_desde={}'\ - .format(self.base_url, self.from_date) + self.until_date = self.until_date.strftime(self.date_format) + self.base_page_url = self.base_page_url.format(self.base_url, self.from_date, self.until_date) + else: + self.base_page_url = self.base_page_url.format(self.base_url, self.default_from_date, + datetime.now().strftime(self.date_format)) yield self.build_request( self.base_page_url, formatter=parameters('fecha_desde'), From 1e7235b27c2178145727ebf5bd12b3ea1713fc19 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 13 Jul 2020 15:17:38 -0400 Subject: [PATCH 2/5] Re-use logic in BaseSpider.from_crawler instead of repeating in start_requests --- .../spiders/paraguay_dncp_base.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py index 0fa1d035..2db36665 100644 --- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py +++ b/kingfisher_scrapy/spiders/paraguay_dncp_base.py @@ -36,9 +36,12 @@ class ParaguayDNCPBaseSpider(SimpleSpider): } @classmethod - def from_crawler(cls, crawler, *args, **kwargs): - spider = super(ParaguayDNCPBaseSpider, cls).from_crawler(crawler, date_format='datetime', - *args, **kwargs) + def from_crawler(cls, crawler, from_date=None, until_date=None, *args, **kwargs): + if not from_date: + from_date = cls.default_from_date + + spider = super().from_crawler(crawler, date_format='datetime', from_date=from_date, until_date=until_date, + *args, **kwargs) spider.request_token = crawler.settings.get('KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN') @@ -49,13 +52,9 @@ def from_crawler(cls, crawler, *args, **kwargs): return spider def start_requests(self): - if self.from_date or self.until_date: - self.from_date = self.from_date.strftime(self.date_format) - self.until_date = self.until_date.strftime(self.date_format) - self.base_page_url = self.base_page_url.format(self.base_url, self.from_date, self.until_date) - else: - self.base_page_url = self.base_page_url.format(self.base_url, self.default_from_date, - datetime.now().strftime(self.date_format)) + self.base_page_url = self.base_page_url.format(self.base_url, self.from_date.strftime(self.date_format), + self.until_date.strftime(self.date_format)) + yield self.build_request( self.base_page_url, formatter=parameters('fecha_desde'), From edba9cb40d645f485ed41fcf6847c298afec652d Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 13 Jul 2020 15:23:47 -0400 Subject: [PATCH 3/5] paraguay_dncp_base: Remove unused keyword arguments --- kingfisher_scrapy/spiders/paraguay_dncp_base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py index 2db36665..5c06041b 100644 --- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py +++ b/kingfisher_scrapy/spiders/paraguay_dncp_base.py @@ -36,12 +36,11 @@ class ParaguayDNCPBaseSpider(SimpleSpider): } @classmethod - def from_crawler(cls, crawler, from_date=None, until_date=None, *args, **kwargs): + def from_crawler(cls, crawler, from_date=None, *args, **kwargs): if not from_date: from_date = cls.default_from_date - spider = super().from_crawler(crawler, date_format='datetime', from_date=from_date, until_date=until_date, - *args, **kwargs) + spider = super().from_crawler(crawler, date_format='datetime', from_date=from_date, *args, **kwargs) spider.request_token = crawler.settings.get('KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN') From 36e19b60275767986574128355b9fa6ce9ddb6c5 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Mon, 13 Jul 2020 16:06:47 -0400 Subject: [PATCH 4/5] Remove unused from_date meta Signed-off-by: Yohanna Lisnichuk --- kingfisher_scrapy/spiders/paraguay_dncp_base.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py index 5c06041b..ce751424 100644 --- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py +++ b/kingfisher_scrapy/spiders/paraguay_dncp_base.py @@ -57,9 +57,6 @@ def start_requests(self): yield self.build_request( self.base_page_url, formatter=parameters('fecha_desde'), - meta={ - 'from_date': self.from_date, - }, # send duplicate requests when the token expired and in the continuation of last_request saved. dont_filter=True, callback=self.parse_pages From 8fdbd6dda97e85b9dd664d8e88d246804d0a3e27 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Mon, 13 Jul 2020 16:24:36 -0400 Subject: [PATCH 5/5] Avoid modification of class variable by instance method --- kingfisher_scrapy/spiders/paraguay_dncp_base.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py index ce751424..cd3e9524 100644 --- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py +++ b/kingfisher_scrapy/spiders/paraguay_dncp_base.py @@ -5,7 +5,7 @@ from kingfisher_scrapy.base_spider import SimpleSpider from kingfisher_scrapy.exceptions import AuthenticationError -from kingfisher_scrapy.util import components, handle_http_error, parameters +from kingfisher_scrapy.util import components, handle_http_error, parameters, replace_parameter class ParaguayDNCPBaseSpider(SimpleSpider): @@ -21,7 +21,6 @@ class ParaguayDNCPBaseSpider(SimpleSpider): last_request = None request_time_limit = 13 # in minutes base_url = 'https://contrataciones.gov.py/datos/api/v3/doc' - base_page_url = '{}/search/processes?tipo_fecha=fecha_release&fecha_desde={}&fecha_hasta={}' auth_url = f'{base_url}/oauth/token' request_token = None max_attempts = 10 @@ -51,11 +50,12 @@ def from_crawler(cls, crawler, from_date=None, *args, **kwargs): return spider def start_requests(self): - self.base_page_url = self.base_page_url.format(self.base_url, self.from_date.strftime(self.date_format), - self.until_date.strftime(self.date_format)) + url = f'{self.base_url}/search/processes?tipo_fecha=fecha_release&' \ + f'fecha_desde={self.from_date.strftime(self.date_format)}&' \ + f'fecha_hasta={self.until_date.strftime(self.date_format)}' yield self.build_request( - self.base_page_url, + url, formatter=parameters('fecha_desde'), # send duplicate requests when the token expired and in the continuation of last_request saved. dont_filter=True, @@ -119,7 +119,7 @@ def parse_pages(self, response): pagination = content['pagination'] if pagination['current_page'] < pagination['total_pages'] and not self.sample: page = pagination['current_page'] + 1 - url = f'{self.base_page_url}&page={page}' + url = replace_parameter(response.request.url, 'page', page) yield self.build_request( url, formatter=parameters('fecha_desde', 'page'),