-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
spiders: update uganda_releases with new endpoints
- Loading branch information
Showing
1 changed file
with
40 additions
and
40 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,62 +1,62 @@ | ||
import scrapy | ||
from urllib.parse import parse_qs, urlparse | ||
|
||
from kingfisher_scrapy.base_spiders import IndexSpider | ||
from kingfisher_scrapy.util import components, handle_http_error, join, parameters | ||
from scrapy.settings.default_settings import RETRY_HTTP_CODES | ||
|
||
from kingfisher_scrapy.base_spiders import PeriodicSpider | ||
from kingfisher_scrapy.util import parameters | ||
|
||
class UgandaReleases(IndexSpider): | ||
|
||
class UgandaReleases(PeriodicSpider): | ||
""" | ||
Domain | ||
Government Procurement Portal (GPP) of Public Procurement and Disposal of Public Assets Authority (PPDA) | ||
Caveats | ||
The domains described in the API documentation must be replaced by https://gpppapi.com | ||
Spider arguments | ||
from_date | ||
Download only data from this year onward (YYYY format). | ||
If ``until_date`` is provided, defaults to '2017'. | ||
The year refers to the start of the fiscal year range, e.g. if ``from_date`` = '2017' then the fiscal year is | ||
'2017-2018' | ||
If ``until_date`` is provided, defaults to '2019'. | ||
The year refers to the start of the fiscal year range, e.g. if ``from_date`` = '2019' then the fiscal year is | ||
'2019-2020' | ||
until_date | ||
Download only data until this year (YYYY format). | ||
If ``from_date`` is provided, defaults to the current year. | ||
The year refers to the start of the fiscal year range, e.g. if ``until_date`` = '2017' then the fiscal year is | ||
'2017-2018' | ||
API documentation | ||
https://docs.google.com/spreadsheets/d/10tVioy-VOQa1FwWoRl5e1pMbGpiymA0iycNcoDFkvks/edit#gid=365266172 | ||
The year refers to the start of the fiscal year range, e.g. if ``until_date`` = '2019' then the fiscal year is | ||
'2019-2020' | ||
Bulk download documentation | ||
https://gpp.ppda.go.ug/public/open-data/ocds/ocds-datasets | ||
""" | ||
name = 'uganda_releases'
download_delay = 30  # to avoid the API's HTTP 429 "Too Many Requests" error
custom_settings = {
    'CONCURRENT_REQUESTS': 1,
    # We cannot get the list of all the files from https://gpp.ppda.go.ug/public/open-data/ocds/ocds-datasets
    # because the list is generated in the browser.
    # To get all the files, we follow the pattern download?fy={0}-{1}&code=1, incrementing the 'code' value until
    # it returns an HTTP 500 error (FileNotFoundException). Therefore, we retry all codes except 500.
    #
    # This must be a list, not a ``filter`` object: ``filter`` returns a one-shot iterator, and this class
    # attribute can be read more than once. After the first read, an iterator would be empty, which would
    # silently disable retries for every status code.
    'RETRY_HTTP_CODES': [status for status in RETRY_HTTP_CODES if status != 500],
}
|
||
# BaseSpider | ||
date_format = 'year' | ||
default_from_date = '2017' | ||
default_from_date = '2019' | ||
|
||
# SimpleSpider | ||
data_type = 'release_package' | ||
|
||
# IndexSpider | ||
page_count_pointer = '/data/last_page' | ||
parse_list_callback = 'parse_page' | ||
|
||
def start_requests(self):
    """
    Yield the request for the first page of the ``pdes`` endpoint, to be handled by ``parse_list``.
    """
    yield scrapy.Request(
        'https://gpppapi.com/adminapi/public/api/pdes',
        meta={'file_name': 'page-1.json'},
        callback=self.parse_list,
    )
|
||
@handle_http_error
def parse_page(self, response):
    """
    Yield one request per release tag for each procurement plan listed in the response.

    The response's JSON is read at ``data.data``; each entry there has a ``procurement_plans`` list, and each plan
    has ``financial_year`` and ``pde_id`` values, which are substituted into the releases URL.

    If both ``from_date`` and ``until_date`` are set, a plan is skipped when the start year of its financial year
    (the part of ``financial_year`` before the first hyphen) is outside the range of those dates' years.
    """
    pattern = 'https://gpppapi.com/adminapi/public/api/open-data/v1/releases/{tag}?fy={fy}&pde={pde}'
    tags = ('planning', 'tender', 'award', 'contract')

    for pde in response.json()['data']['data']:
        for plan in pde['procurement_plans']:
            # The date filter depends only on the plan, so evaluate it once per plan, not once per tag.
            if self.from_date and self.until_date:
                start_year = int(plan['financial_year'].split('-')[0])
                if not (self.from_date.year <= start_year <= self.until_date.year):
                    continue

            for tag in tags:
                yield self.build_request(
                    pattern.format(tag=tag, fy=plan['financial_year'], pde=plan['pde_id']),
                    formatter=join(components(-1), parameters('fy', 'pde')),
                )
# PeriodicSpider | ||
formatter = staticmethod(parameters('fy', 'code')) | ||
pattern = 'https://gpp.ppda.go.ug/adminapi/public/api/open-data/v2/ocds/download?fy={0}-{1}&code=1' | ||
start_requests_callback = 'build_next' | ||
|
||
def build_next(self, response):
    """
    Handle the response for one file, then request the file with the next ``code`` value.

    - On HTTP 500, yield nothing, which ends the chain of requests for this URL's ``fy`` value.
    - On any other unsuccessful response, yield a file error and stop.
    - On success, yield whatever the parent class' ``parse`` yields, then yield a request for the same URL with
      its ``code`` query string parameter incremented by one, using this method as the callback.
    """
    # An HTTP 500 error is how the server reports that there is no file for this 'code' value.
    if response.status == 500:
        return

    if not self.is_http_success(response):
        yield self.build_file_error_from_response(response)
        return

    yield from super().parse(response)

    current_url = response.request.url
    query = parse_qs(urlparse(current_url).query)
    following_code = int(query['code'][0]) + 1
    # Keep everything before 'code=' and append the incremented value.
    url_prefix = current_url.split('code=')[0]
    yield self.build_request(
        f"{url_prefix}code={following_code}",
        formatter=self.formatter,
        callback=self.build_next,
    )
|
||
def build_urls(self, date):
    """
    Yield the URL for the given date.

    ``self.pattern`` is formatted with ``date`` and ``date + 1`` as its two positional fields.

    :param date: a value that supports ``+ 1`` (presumably the start year of a fiscal year, as an integer)
    """
    following_year = date + 1
    yield self.pattern.format(date, following_year)