diff --git a/kingfisher_scrapy/spiders/panama_dgcp_bulk.py b/kingfisher_scrapy/spiders/panama_dgcp_bulk.py index 6a828370..78f16636 100644 --- a/kingfisher_scrapy/spiders/panama_dgcp_bulk.py +++ b/kingfisher_scrapy/spiders/panama_dgcp_bulk.py @@ -1,5 +1,5 @@ from kingfisher_scrapy.base_spiders import SimpleSpider -from kingfisher_scrapy.util import parameters +from kingfisher_scrapy.util import parameters, date_range_by_interval class PanamaDGCPBulk(SimpleSpider): @@ -27,5 +27,10 @@ class PanamaDGCPBulk(SimpleSpider): data_type = 'record_package' def start_requests(self): - yield self.build_request(f'https://ocds.panamacompraencifras.gob.pa/Descarga?DateFrom={self.from_date}&DateTo=' - f'{self.until_date}&FileType=json', formatter=parameters('DateFrom', 'DateTo')) + # The API returns error 400 for intervals longer than a month, and times out for a full month. + for start_date, end_date in date_range_by_interval(self.from_date, self.until_date, 15): + yield self.build_request( + f'https://ocds.panamacompraencifras.gob.pa/Descarga?DateFrom={start_date.strftime(self.date_format)}' + f'&DateTo={end_date.strftime(self.date_format)}&FileType=json', + formatter=parameters('DateFrom', 'DateTo') + ) diff --git a/kingfisher_scrapy/spiders/paraguay_dncp_base.py b/kingfisher_scrapy/spiders/paraguay_dncp_base.py index 1495896b..68368474 100644 --- a/kingfisher_scrapy/spiders/paraguay_dncp_base.py +++ b/kingfisher_scrapy/spiders/paraguay_dncp_base.py @@ -1,12 +1,12 @@ import json from abc import abstractmethod -from datetime import datetime, timedelta +from datetime import datetime import scrapy from kingfisher_scrapy.base_spiders import SimpleSpider from kingfisher_scrapy.exceptions import AccessTokenError, MissingEnvVarError -from kingfisher_scrapy.util import components, handle_http_error, parameters, replace_parameters +from kingfisher_scrapy.util import components, handle_http_error, parameters, replace_parameters, date_range_by_interval class ParaguayDNCPBase(SimpleSpider): @@ 
-55,16 +55,8 @@ def start_requests(self): ) def urls_builder(self): - # ElasticSearch doesn't allow search sizes greater than 10000, so we request half-month at the time. - interval = timedelta(days=30) - end_date = self.until_date - # In reverse chronological order - while end_date > self.from_date: - # If there is less than or equal to one interval left, start from the `from_date`. - if end_date - self.from_date <= interval: - start_date = self.from_date - else: - start_date = end_date - interval + # ElasticSearch doesn't allow search sizes greater than 10000, so we request a month at a time. + for start_date, end_date in date_range_by_interval(self.from_date, self.until_date, 30): # We request active/complete tenders and planned ones separately to ensure we don't exceed the 10000 # results per request limit. url_base = f'{self.url_prefix}search/processes?fecha_desde={start_date.strftime(self.date_format)}' \ @@ -73,7 +65,6 @@ def urls_builder(self): url_tender = f'{url_base}&tipo_fecha=publicacion_llamado' # And the planned ones with the "fecha_release" and tender.id=planned filters. url_planning = f'{url_base}&tender.id=planned&tipo_fecha=fecha_release' - end_date = start_date - timedelta(seconds=1) yield from [url_tender, url_planning] def build_access_token_request(self, attempt=0):