
Commit

Merge e2acb31 into 20674ca
yolile committed Feb 9, 2021
2 parents 20674ca + e2acb31 commit 7ce40a2
Showing 6 changed files with 93 additions and 11 deletions.
2 changes: 1 addition & 1 deletion kingfisher_scrapy/commands/updatedocs.py
@@ -50,5 +50,5 @@ def _keyfunc(module):
environment_variables = re.findall(r'^(\S.+)\n ', dedent(section[1]), re.MULTILINE)
infix = f"env {' '.join([f'{variable}=...' for variable in environment_variables])} "

f.write(f'\n.. code-block:: bash\n')
f.write('\n.. code-block:: bash\n')
f.write(f"\n {infix}scrapy crawl {module.__name__.rsplit('.')[-1]}\n")
18 changes: 18 additions & 0 deletions kingfisher_scrapy/middlewares.py
@@ -4,6 +4,8 @@

import ijson
import scrapy
from twisted.internet import reactor
from twisted.internet.defer import Deferred

from kingfisher_scrapy import util
from kingfisher_scrapy.items import File, FileItem
@@ -80,6 +82,22 @@ def process_request(request, spider):
request.headers['Authorization'] = spider.access_token


# https://github.com/ArturGaspar/scrapy-delayed-requests/blob/master/scrapy_delayed_requests.py
class DelayedRequestMiddleware:
"""
Downloader middleware that delays a request by the number of seconds set in its 'wait_time' request meta key.
Delaying a request is useful when an API fails and starts working again after a few minutes.
"""

def process_request(self, request, spider):
delay = request.meta.get('wait_time', None)
if delay:
d = Deferred()
reactor.callLater(delay, d.callback, None)
return d


class LineDelimitedMiddleware:
"""
If the spider's ``line_delimited`` class attribute is ``True``, yields each line of the File as a FileItem.
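For context, a minimal usage sketch (the spider class and URL below are hypothetical, not part of this commit): any request carrying a 'wait_time' value in its meta is held back by the middleware, because the Deferred returned from process_request only fires after reactor.callLater's delay, at which point Scrapy continues downloading the request. The middleware also has to be enabled in DOWNLOADER_MIDDLEWARES, as the settings.py change below does.

# Hypothetical spider, for illustration only: delays a single request by 300 seconds.
import scrapy


class ExampleDelayedSpider(scrapy.Spider):
    name = 'example_delayed'

    def start_requests(self):
        # DelayedRequestMiddleware reads 'wait_time' from the request meta and
        # postpones the download of this request by roughly five minutes.
        yield scrapy.Request('http://example.com/api/releases.json',
                             meta={'wait_time': 300})

    def parse(self, response):
        self.logger.info('Downloaded %s after the delay', response.url)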
6 changes: 3 additions & 3 deletions kingfisher_scrapy/settings.py
@@ -68,9 +68,9 @@

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'kingfisher_scrapy.middlewares.MyCustomDownloaderMiddleware': 543,
#}
DOWNLOADER_MIDDLEWARES = {
'kingfisher_scrapy.middlewares.DelayedRequestMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
41 changes: 34 additions & 7 deletions kingfisher_scrapy/spiders/portugal_base.py
@@ -5,21 +5,48 @@


class PortugalBase(LinksSpider):
# The API returns a 429 error after a certain number of requests.
download_delay = 1
# The API sometimes returns a 503 error.
custom_settings = {
'RETRY_TIMES': 10
}

# BaseSpider
default_from_date = '2010-01-01'

# LinksSpider
next_page_formatter = staticmethod(parameters('offset'))

# We will wait 1, 2, 4, 8, 16 minutes (31 minutes total).
max_retries = 5
half_initial_wait_time = 30

def start_requests(self):
url = self.url
if self.from_date and self.until_date:
url = f'{url}&contractStartDate={self.from_date}&contractEndDate={self.until_date}'

yield scrapy.Request(url, meta={'file_name': 'offset-1.json'})

# https://github.com/scrapy/scrapy/blob/master/scrapy/downloadermiddlewares/retry.py
def parse(self, response):
retries = response.request.meta.get('retries', 0) + 1
wait_time = response.request.meta.get('wait_time', self.half_initial_wait_time) * 2

# Every ~36,000 requests, the API returns HTTP errors. After a few minutes, it starts working again.
# The number of failed attempts in the log messages includes the original request.
# https://github.com/open-contracting/kingfisher-collect/issues/545#issuecomment-762768460
if self.is_http_success(response):
yield from super().parse(response)
elif retries <= self.max_retries:
request = response.request.copy()
request.meta['retries'] = retries
request.meta['wait_time'] = wait_time
request.dont_filter = True

self.logger.debug('Retrying %(request)s in %(wait_time)ds (failed %(failures)d times): HTTP %(status)d',
{'request': response.request, 'failures': retries, 'status': response.status,
'wait_time': wait_time},
extra={'spider': self})

yield request
else:
self.logger.error('Gave up retrying %(request)s (failed %(failures)d times): HTTP %(status)d',
{'request': response.request, 'failures': retries, 'status': response.status},
extra={'spider': self})

yield self.build_file_error_from_response(response)
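The comment "1, 2, 4, 8, 16 minutes (31 minutes total)" follows from parse() doubling the previous wait_time on every retry, starting from half_initial_wait_time = 30 seconds. A standalone sketch of that schedule, using the same constants as the committed code:

# Standalone sketch of the retry schedule implemented in PortugalBase.parse
# (same constants as the committed code: max_retries = 5, half_initial_wait_time = 30).
max_retries = 5
half_initial_wait_time = 30

wait_time = half_initial_wait_time
schedule = []
for retry in range(1, max_retries + 1):
    wait_time *= 2  # parse() doubles the previous wait_time on every retry
    schedule.append(wait_time)

print([seconds // 60 for seconds in schedule])  # [1, 2, 4, 8, 16] minutes
print(sum(schedule) // 60)  # 31 minutes in total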
33 changes: 33 additions & 0 deletions tests/middlewares/test_delayed_request_middleware.py
@@ -0,0 +1,33 @@
from scrapy import Request
from scrapy.core.downloader import DownloaderMiddlewareManager
from twisted.internet.defer import Deferred
from twisted.trial.unittest import TestCase

from kingfisher_scrapy.middlewares import DelayedRequestMiddleware
from tests import spider_with_crawler


def mock_download_func(spider, request):
return request


def test_delayed_middleware():
spider = spider_with_crawler()
delay_middleware = DelayedRequestMiddleware()
request = Request('http://example.com', meta=None)
returned_request = delay_middleware.process_request(request, spider)
assert returned_request is None
downloader_manager = DownloaderMiddlewareManager.from_crawler(spider.crawler)
request = Request('http://example.com', meta={'wait_time': 1})
returned_request = delay_middleware.process_request(request, spider)
assert isinstance(returned_request, Deferred)
# Send the request through all the downloader middlewares, including the delayed one.
downloaded = downloader_manager.download(mock_download_func, request, spider)
assert isinstance(downloaded, Deferred)
# https://github.com/scrapy/scrapy/blob/28262d4b241744aa7c090702db9a89411e3bbf9a/tests/test_downloadermiddleware.py#L36
results = []
downloaded.addBoth(results.append)
test = TestCase()
test._wait(downloaded)
returned_request = results[0]
assert returned_request.url == request.url
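To run just this new test locally, assuming pytest is the project's test runner (the commit itself does not state this), a short driver like the following works; invoking pytest on the file path from the repository root is equivalent.

# Hypothetical local test run (assumes pytest is installed and the working
# directory is the repository root).
import sys

import pytest

if __name__ == '__main__':
    sys.exit(pytest.main(['-q', 'tests/middlewares/test_delayed_request_middleware.py']))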
4 changes: 4 additions & 0 deletions tests/test_spiders.py
@@ -26,11 +26,15 @@ def test_start_requests_http_error(spider_name):
try:
# See scrapy.crawler.Crawler._create_spider
spider = crawler.spidercls.from_crawler(crawler)

for request in spider.start_requests():
# See scrapy.core.scraper.Scraper.call_spider
callback = request.callback or spider.parse

response = Response('http://example.com', status=555, request=request)
# If `max_retries` is set, the spider handles (and retries) error responses itself. Pre-setting `retries` to
# `max_retries` makes the spider give up immediately, so it yields a single error item instead of retrying.
if hasattr(spider, 'max_retries'):
response.request.meta['retries'] = spider.max_retries
items = list(callback(response))

assert len(items) == 1
