Skip to content

Commit

Permalink
spiders: update panama_dgcp_bulk and paraguay_dncp_base to use date b…
Browse files Browse the repository at this point in the history
…y interval
  • Loading branch information
yolile committed Jan 31, 2023
1 parent 66ba974 commit c6bfd2a
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 16 deletions.
11 changes: 8 additions & 3 deletions kingfisher_scrapy/spiders/panama_dgcp_bulk.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from kingfisher_scrapy.base_spiders import SimpleSpider
from kingfisher_scrapy.util import parameters
from kingfisher_scrapy.util import parameters, date_range_by_interval


class PanamaDGCPBulk(SimpleSpider):
Expand Down Expand Up @@ -27,5 +27,10 @@ class PanamaDGCPBulk(SimpleSpider):
data_type = 'record_package'

def start_requests(self):
yield self.build_request(f'https://ocds.panamacompraencifras.gob.pa/Descarga?DateFrom={self.from_date}&DateTo='
f'{self.until_date}&FileType=json', formatter=parameters('DateFrom', 'DateTo'))
# The API returns error 400 for intervals longer than a month, and times out for a one-month interval.
for start_date, end_date in date_range_by_interval(self.from_date, self.until_date, 15):
yield self.build_request(
f'https://ocds.panamacompraencifras.gob.pa/Descarga?DateFrom={start_date.strftime(self.date_format)}'
f'&DateTo={end_date.strftime(self.date_format)}&FileType=json',
formatter=parameters('DateFrom', 'DateTo')
)
17 changes: 4 additions & 13 deletions kingfisher_scrapy/spiders/paraguay_dncp_base.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import json
from abc import abstractmethod
from datetime import datetime, timedelta
from datetime import datetime

import scrapy

from kingfisher_scrapy.base_spiders import SimpleSpider
from kingfisher_scrapy.exceptions import AccessTokenError, MissingEnvVarError
from kingfisher_scrapy.util import components, handle_http_error, parameters, replace_parameters
from kingfisher_scrapy.util import components, handle_http_error, parameters, replace_parameters, date_range_by_interval


class ParaguayDNCPBase(SimpleSpider):
Expand Down Expand Up @@ -55,16 +55,8 @@ def start_requests(self):
)

def urls_builder(self):
# ElasticSearch doesn't allow search sizes greater than 10000, so we request half-month at the time.
interval = timedelta(days=30)
end_date = self.until_date
# In reverse chronological order
while end_date > self.from_date:
# If there is less than or equal to one interval left, start from the `from_date`.
if end_date - self.from_date <= interval:
start_date = self.from_date
else:
start_date = end_date - interval
# ElasticSearch doesn't allow search sizes greater than 10000, so we request a month at a time.
for start_date, end_date in date_range_by_interval(self.from_date, self.until_date, 30):
# We request active/complete tenders and planned ones separately to ensure we don't exceed the 10000
# results per request limit.
url_base = f'{self.url_prefix}search/processes?fecha_desde={start_date.strftime(self.date_format)}' \
Expand All @@ -73,7 +65,6 @@ def urls_builder(self):
url_tender = f'{url_base}&tipo_fecha=publicacion_llamado'
# And the planned ones with the "fecha_release" and tender.id=planned filters.
url_planning = f'{url_base}&tender.id=planned&tipo_fecha=fecha_release'
end_date = start_date - timedelta(seconds=1)
yield from [url_tender, url_planning]

def build_access_token_request(self, attempt=0):
Expand Down

0 comments on commit c6bfd2a

Please sign in to comment.