-
Notifications
You must be signed in to change notification settings - Fork 12
/
portugal_base.py
52 lines (40 loc) · 2.21 KB
/
portugal_base.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import scrapy
from kingfisher_scrapy.base_spider import LinksSpider
from kingfisher_scrapy.util import parameters
class PortugalBase(LinksSpider):
    """
    Shared base class for spiders that start from ``self.url`` and retry unsuccessful responses.

    The first request is built from ``self.url``, optionally narrowed by a date range. Responses that are not
    HTTP successes are re-issued up to ``max_retries`` times, with the ``wait_time`` stored in the request's meta
    doubling on every attempt.

    NOTE(review): nothing in this class sleeps; ``meta['wait_time']`` is presumably honoured by a downloader
    middleware elsewhere in the project - confirm.
    """

    # BaseSpider
    default_from_date = '2010-01-01'

    # LinksSpider
    # Presumably derives the next page's file name from the ``offset`` query parameter - see ``parameters``.
    next_page_formatter = staticmethod(parameters('offset'))

    # We will wait 1, 2, 4, 8, 16 minutes (31 minutes total).
    max_retries = 5
    # In seconds: ``parse`` doubles this before the first retry, so the first wait is 60 seconds.
    half_initial_wait_time = 30

    def start_requests(self):
        """
        Yield the initial request, named ``offset-1.json``.

        When both ``from_date`` and ``until_date`` are set, they are appended to the URL as the
        ``contractStartDate`` and ``contractEndDate`` query string parameters.
        """
        first_page = self.url
        has_date_range = bool(self.from_date and self.until_date)
        if has_date_range:
            first_page = f'{first_page}&contractStartDate={self.from_date}&contractEndDate={self.until_date}'

        yield scrapy.Request(first_page, meta={'file_name': 'offset-1.json'})

    # https://github.com/scrapy/scrapy/blob/master/scrapy/downloadermiddlewares/retry.py
    def parse(self, response):
        """
        Parse a successful response, or retry an unsuccessful one with a doubled wait time.

        - On HTTP success, delegate to the parent class' ``parse``.
        - Otherwise, while the attempt count is at most ``max_retries``, yield a copy of the request that carries
          the updated ``retries`` and ``wait_time`` in its meta and bypasses the duplicate filter.
        - Once the retries are exhausted, log an error and yield a file error built from the response.
        """
        request_meta = response.request.meta
        # Both values describe the attempt that would come next, not the one that just finished.
        attempt = request_meta.get('retries', 0) + 1
        delay = request_meta.get('wait_time', self.half_initial_wait_time) * 2

        # Every ~36,000 requests, the API returns HTTP errors. After a few minutes, it starts working again.
        # The number of failed attempts in the log messages includes the original request.
        # https://github.com/open-contracting/kingfisher-collect/issues/545#issuecomment-762768460
        if self.is_http_success(response):
            yield from super().parse(response)
            return

        if attempt > self.max_retries:
            self.logger.error(
                'Gave up retrying %(request)s (failed %(failures)d times): HTTP %(status)d',
                {'request': response.request, 'failures': attempt, 'status': response.status},
                extra={'spider': self},
            )
            yield self.build_file_error_from_response(response)
            return

        retry_request = response.request.copy()
        retry_request.meta.update({'retries': attempt, 'wait_time': delay})
        # The URL has already been seen, so the duplicate filter must be skipped for the retry to be scheduled.
        retry_request.dont_filter = True

        self.logger.debug(
            'Retrying %(request)s in %(wait_time)ds (failed %(failures)d times): HTTP %(status)d',
            {'request': response.request, 'failures': attempt, 'status': response.status, 'wait_time': delay},
            extra={'spider': self},
        )
        yield retry_request