fix(spiders): use date filters as part of file names #1026

Merged · 11 commits · Sep 20, 2023
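
The change applies the same pattern to each spider, sketched below with hypothetical names (build_start_request and the from/until query parameters are illustrative only, not the spiders' actual code): format the date filters once, put them in the request URL, and derive the first file name from the formatted date instead of a fixed name such as start.json, so crawls over different date windows do not overwrite each other's files.

import scrapy


def build_start_request(base_url, from_date, until_date, date_format='%Y-%m-%d'):
    # Format the date filters once and reuse them for the URL and the file name.
    from_string = from_date.strftime(date_format)
    until_string = until_date.strftime(date_format)
    url = f'{base_url}?from={from_string}&until={until_string}'  # hypothetical query parameters
    # Naming the file after the date filter keeps separate date windows in separate files.
    return scrapy.Request(url, meta={'file_name': f'{from_string}.json'})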
7 changes: 4 additions & 3 deletions kingfisher_scrapy/spiders/australia.py
@@ -32,7 +32,8 @@ class Australia(LinksSpider):
     formatter = staticmethod(parameters('cursor'))
 
     def start_requests(self):
-        url = f'https://api.tenders.gov.au/ocds/findByDates/contractPublished/' \
-              f'{self.from_date.strftime(self.date_format)}Z/{self.until_date.strftime(self.date_format)}Z'
+        from_date = self.from_date.strftime(self.date_format)
+        until_date = self.until_date.strftime(self.date_format)
+        url = f'https://api.tenders.gov.au/ocds/findByDates/contractPublished/{from_date}Z/{until_date}Z'
 
-        yield scrapy.Request(url, meta={'file_name': 'start.json'})
+        yield scrapy.Request(url, meta={'file_name': f'{until_date}.json'})  # reverse chronological order
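
As the inline comment notes, this API returns results in reverse chronological order, so the first file is named after the upper bound of the date window rather than the lower one. A hypothetical helper (not part of the spiders) illustrating the rule:

def first_page_file_name(from_date, until_date, newest_first):
    # The first page holds the newest records when the feed is newest-first,
    # so name it after the bound those records fall under.
    date = until_date if newest_first else from_date
    return f'{date}.json'


# first_page_file_name('2023-01-01', '2023-09-20', newest_first=True) -> '2023-09-20.json'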
7 changes: 4 additions & 3 deletions kingfisher_scrapy/spiders/colombia_api.py
@@ -32,7 +32,8 @@ class ColombiaAPI(LinksSpider):
     formatter = staticmethod(parameters('_id'))
 
     def start_requests(self):
-        url = 'https://apiocds.colombiacompra.gov.co/apiCCE2.0/rest/releases/dates/' \
-              f'{self.from_date.strftime(self.date_format)}/{self.until_date.strftime(self.date_format)}'
+        from_date = self.from_date.strftime(self.date_format)
+        until_date = self.until_date.strftime(self.date_format)
+        url = f'https://apiocds.colombiacompra.gov.co/apiCCE2.0/rest/releases/dates/{from_date}/{until_date}'
 
-        yield scrapy.Request(url, meta={'file_name': 'page-1.json'})
+        yield scrapy.Request(url, meta={'file_name': f'{from_date}.json'})
8 changes: 6 additions & 2 deletions kingfisher_scrapy/spiders/kyrgyzstan.py
@@ -33,6 +33,10 @@ class Kyrgyzstan(LinksSpider):
     def start_requests(self):
         url = 'http://ocds.zakupki.gov.kg/api/tendering'
         if self.from_date:
+            from_date = self.from_date.strftime(self.date_format)
             # The API requires the timezone and seconds in the since parameter.
-            url = f'{url}?since={self.from_date.strftime(self.date_format)}.00%2B06:00'
-        yield scrapy.Request(url, meta={'file_name': 'start.json'})
+            url = f'{url}?since={from_date}.00%2B06:00'
+            self.formatter = staticmethod(parameters('offset', 'since'))
+        else:
+            from_date = '1970-01-01T00:00:00'
+        yield scrapy.Request(url, meta={'file_name': f'{from_date}.json'})
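
Besides the file name of the first request, the spider now also adds the date parameter to its formatter ('since' here; 'contractStartDate' and 'dateFrom' in the spiders below), so the files written for follow-up pages are distinguished by the date filter as well. A rough sketch of what a query-parameter formatter of this kind might do (assumed behaviour for illustration; the project's own parameters helper may differ in detail):

from urllib.parse import parse_qs, urlsplit


def parameters(*keys):
    # Return a callable that builds a file-name fragment from the named
    # query-string parameters of a request URL.
    def formatter(url):
        query = parse_qs(urlsplit(url).query)
        return '-'.join(f'{key}-{query[key][0]}' for key in keys if key in query)
    return formatter


# parameters('offset', 'since')('http://ocds.zakupki.gov.kg/api/tendering?since=2020-05-01T00:00:00.00%2B06:00')
# might produce 'since-2020-05-01T00:00:00.00+06:00' (the exact output format is an assumption).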
11 changes: 7 additions & 4 deletions kingfisher_scrapy/spiders/portugal_base.py
@@ -20,10 +20,13 @@ class PortugalBase(LinksSpider):
     def start_requests(self):
         url = self.start_url
         if self.from_date and self.until_date:
-            url = f'{url}?contractStartDate={self.from_date.strftime(self.date_format)}' \
-                  f'&contractEndDate={self.until_date.strftime(self.date_format)}'
-
-        yield scrapy.Request(url, meta={'file_name': 'offset-1.json'})
+            from_date = self.from_date.strftime(self.date_format)
+            until_date = self.until_date.strftime(self.date_format)
+            url = f'{url}?contractStartDate={from_date}&contractEndDate={until_date}'
+            self.formatter = staticmethod(parameters('offset', 'contractStartDate'))
+        else:
+            from_date = self.default_from_date
+        yield scrapy.Request(url, meta={'file_name': f'{from_date}.json'})
 
     def is_http_retryable(self, response):
         return response.status != 404
@@ -27,8 +27,10 @@ class SouthAfricaNationalTreasuryAPI(LinksSpider):
     data_type = 'release_package'
 
     # LinksSpider
-    formatter = staticmethod(parameters('PageNumber'))
+    formatter = staticmethod(parameters('PageNumber', 'dateFrom'))
 
     def start_requests(self):
 
         yield scrapy.Request('https://ocds-api.etenders.gov.za/api/OCDSReleases?PageNumber=1&PageSize=50&'
-                             f'dateFrom={self.from_date}&dateTo={self.until_date}', meta={'file_name': 'start.json'})
+                             f'dateFrom={self.from_date}&dateTo={self.until_date}',
+                             meta={'file_name': f'{self.from_date}.json'})
@@ -1,3 +1,5 @@
+from datetime import datetime
+
 import scrapy
 
 from kingfisher_scrapy.base_spiders import LinksSpider
@@ -12,7 +14,6 @@ class UnitedKingdomContractsFinderBase(LinksSpider):
 
     # BaseSpider
     date_format = 'datetime'
-    date_required = True
     default_from_date = '2014-01-01T00:00:00'
     encoding = 'iso-8859-1'
     max_attempts = 5
@@ -32,8 +33,10 @@ def start_requests(self):
             from_date = self.from_date.strftime(self.date_format)
             until_date = self.until_date.strftime(self.date_format)
             url = f'{url}&publishedFrom={from_date}&publishedTo={until_date}'
-
-        yield scrapy.Request(url, meta={'file_name': 'page-1.json'}, callback=self.parse_page)
+        else:
+            until_date = datetime.utcnow().strftime(self.date_format)
+        yield scrapy.Request(url, meta={'file_name': f'{until_date}.json'},  # reverse chronological order
+                             callback=self.parse_page)
 
     @handle_http_error
     def parse(self, response):
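
Because date_required is dropped, this spider (and united_kingdom_fts below) can now be crawled without date filters; in that case the first file is named after the current UTC time, which matches the newest-first ordering of the feed. A minimal sketch of that fallback (the concrete strftime pattern is an assumption; the spiders' date_format is the 'datetime' keyword, which the framework maps to a real pattern):

from datetime import datetime

date_format = '%Y-%m-%dT%H:%M:%S'  # assumed pattern standing in for the 'datetime' keyword
until_date = datetime.utcnow().strftime(date_format)
file_name = f'{until_date}.json'  # e.g. '2023-09-20T12:34:56.json'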
8 changes: 6 additions & 2 deletions kingfisher_scrapy/spiders/united_kingdom_fts.py
@@ -1,3 +1,5 @@
+from datetime import datetime
+
 import scrapy
 
 from kingfisher_scrapy.base_spiders import LinksSpider
@@ -36,8 +38,10 @@ def start_requests(self):
             from_date = self.from_date.strftime(self.date_format)
             until_date = self.until_date.strftime(self.date_format)
             url = f'{url}?updatedFrom={from_date}&updatedTo={until_date}'
-
-        yield scrapy.Request(url, meta={'file_name': 'start.json'}, headers={'Accept': 'application/json'})
+        else:
+            until_date = datetime.utcnow().strftime(self.date_format)
+        yield scrapy.Request(url, meta={'file_name': f'{until_date}.json'},  # reverse chronological order
+                             headers={'Accept': 'application/json'})
 
     @handle_http_error
     def parse(self, response):