Skip to content

Commit

Permalink
fix(uganda_releases): Add support for XLSX for 2023-2024. Rate limit …
Browse files Browse the repository at this point in the history
…to avoid 403 error.
  • Loading branch information
jpmckinney committed May 7, 2024
1 parent 3111971 commit 2afb8e6
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 2 deletions.
8 changes: 8 additions & 0 deletions kingfisher_scrapy/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,14 @@ def process_item(self, item, spider):
return item

input_name = item.file_name

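# uganda_releases yields JSON for years before 2023-2024 and XLSX for 2023-2024, using the same URL pattern,
# so an XLSX file (a ZIP archive, starting with the bytes PK\x03\x04) can arrive under a .json file name.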
if input_name.endswith('.json'):
if item.data.startswith(b'PK\x03\x04'):
input_name = f'{os.path.splitext(input_name)[0]}.xlsx'
else:
return item

if input_name.endswith('.csv'):
item.file_name = f'{item.file_name[:-4]}.json'
input_format = 'csv'
Expand Down
7 changes: 5 additions & 2 deletions kingfisher_scrapy/spiders/uganda_releases.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,13 @@ class UgandaReleases(PeriodicSpider):
# returns HTTP 500 error FileNotFoundException. Therefore, we retry all codes in RETRY_HTTP_CODES except 500.
'RETRY_HTTP_CODES': [status for status in RETRY_HTTP_CODES if status != 500],
}
# The server returns HTTP 403 if it receives too many requests. (A delay of 0.5 seconds is too short.)
download_delay = 1

# BaseSpider
date_format = 'year'
default_from_date = '2019'
unflatten = True

# SimpleSpider
data_type = 'release_package'
Expand All @@ -49,8 +53,7 @@ def parse(self, response):
else:
yield from super().parse(response)
code = int(get_parameter_value(response.request.url, 'code')) + 1
yield self.build_request(replace_parameters(response.request.url, code=code),
formatter=self.formatter)
yield self.build_request(replace_parameters(response.request.url, code=code), formatter=self.formatter)

# Yield a single URL, built by filling `self.pattern` with the given value and the value plus one.
# NOTE(review): `date` is presumably an integer year (date_format = 'year'), giving a span such as
# 2023-2024 — confirm against `pattern`, which is not visible in this hunk.
def build_urls(self, date):
yield self.pattern.format(date, date + 1)

0 comments on commit 2afb8e6

Please sign in to comment.