Commit

Merge d48c1d6 into 46e145b

aguilerapy committed Jan 14, 2021
2 parents 46e145b + d48c1d6, commit 7fcdda8

Showing 2 changed files with 124 additions and 14 deletions.
49 changes: 35 additions & 14 deletions kingfisher_scrapy/extensions.py
@@ -148,29 +148,35 @@ def spider_closed(self, spider, reason):
         if reason not in ('finished', 'sample') or spider.pluck or spider.keep_collection_open:
             return

-        data = {
-            'collection_source': spider.name,
-            'collection_data_version': spider.get_start_time('%Y-%m-%d %H:%M:%S'),
-            'collection_sample': str(bool(spider.sample)),
-        }
-
+        data = self._build_data_to_send(spider)
         return self._request(spider, 'end_collection_store', data['collection_source'], data)
+
+    def spider_error(self, failure, response, spider):
+        """
+        Sends an API request to store a file error in Kingfisher Process when a spider callback generates an error.
+        """
+        # https://docs.scrapy.org/en/latest/topics/signals.html#scrapy.signals.spider_error
+        file_name = response.request.meta.get('file_name', 'spider_error.json')
+        data = self._build_data_to_send(spider, file_name, response.request.url, failure)
+        return self._request(spider, 'create_file_error', response.request.url, data)
+
+    def item_error(self, item, response, spider, failure):
+        """
+        Sends an API request to store a file error in Kingfisher Process when an item pipeline generates an error.
+        """
+        # https://docs.scrapy.org/en/latest/topics/signals.html#scrapy.signals.item_error
+        data = self._build_data_to_send(spider, item['file_name'], item['url'], failure)
+        return self._request(spider, 'create_file_error', item['file_name'], data)

     def item_scraped(self, item, spider):
         """
         Sends an API request to store the file, file item or file error in Kingfisher Process.
         """
-
+        # https://docs.scrapy.org/en/latest/topics/signals.html#scrapy.signals.item_scraped
         if not item.get('post_to_api', True) or isinstance(item, PluckedItem):
             return

-        data = {
-            'collection_source': spider.name,
-            'collection_data_version': spider.get_start_time('%Y-%m-%d %H:%M:%S'),
-            'collection_sample': str(bool(spider.sample)),
-            'file_name': item['file_name'],
-            'url': item['url'],
-        }
+        data = self._build_data_to_send(spider, item['file_name'], item['url'])

         if isinstance(item, FileError):
             data['errors'] = json.dumps(item['errors'])
@@ -213,6 +219,21 @@ def log_for_status(response):
         d.addCallback(log_for_status)
         return d

+    @staticmethod
+    def _build_data_to_send(spider, file_name=None, url=None, errors=None):
+        data = {
+            'collection_source': spider.name,
+            'collection_data_version': spider.get_start_time('%Y-%m-%d %H:%M:%S'),
+            'collection_sample': str(bool(spider.sample))
+        }
+        if file_name:
+            data['file_name'] = file_name
+        if url:
+            data['url'] = url
+        if errors:
+            data['errors'] = json.dumps(errors)
+        return data
+

 # https://stackoverflow.com/questions/25262765/handle-all-exception-in-scrapy-with-sentry
 class SentryLogging:
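The new `_build_data_to_send` helper consolidates the payload dict that was previously duplicated across `spider_closed` and `item_scraped`, and that the new error handlers also need. Below is a minimal standalone sketch of what it produces for each handler; `StubSpider` and the module-level `build_data_to_send` copy are hypothetical stand-ins for illustration, not part of this commit, with values mirroring the tests below.

import json


class StubSpider:
    # Hypothetical stand-in for a Kingfisher spider (not part of the diff);
    # real spiders expose `name`, `sample` and `get_start_time` with these meanings.
    name = 'test'
    sample = None  # 'true' when crawling a sample, as in the tests

    def get_start_time(self, fmt):
        # The real spider formats its crawl start time; fixed here for the sketch.
        return '2001-02-03 04:05:06'


def build_data_to_send(spider, file_name=None, url=None, errors=None):
    # Mirrors KingfisherProcessAPI._build_data_to_send from the diff above.
    data = {
        'collection_source': spider.name,
        'collection_data_version': spider.get_start_time('%Y-%m-%d %H:%M:%S'),
        'collection_sample': str(bool(spider.sample)),
    }
    if file_name:
        data['file_name'] = file_name
    if url:
        data['url'] = url
    if errors:
        data['errors'] = json.dumps(errors)
    return data


spider = StubSpider()
# spider_closed sends only the collection fields:
print(build_data_to_send(spider))
# spider_error / item_error additionally send file_name, url and a JSON-encoded error:
print(build_data_to_send(spider, 'file.json', 'https://example.com/remote.json', 'ExceptionRaised'))

The second call produces the `form` dict asserted in `test_item_error` below, with `errors` JSON-encoded to `'"ExceptionRaised"'`.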
89 changes: 89 additions & 0 deletions tests/extensions/test_kingfisher_process_api.py
@@ -4,6 +4,7 @@
 import pytest
 import pytest_twisted
 from scrapy.exceptions import NotConfigured
+from scrapy.http import Request, Response

 from kingfisher_scrapy.extensions import KingfisherFilesStore, KingfisherProcessAPI
 from kingfisher_scrapy.items import FileError
@@ -234,6 +235,52 @@ def test_item_scraped_file_error(sample, is_sample, ok, tmpdir, caplog):
             assert caplog.records[0].message == message


+@pytest_twisted.inlineCallbacks
+@pytest.mark.parametrize('sample,is_sample', [(None, False), ('true', True)])
+@pytest.mark.parametrize('ok', [True, False])
+def test_item_error(sample, is_sample, ok, tmpdir, caplog):
+    with patch('treq.response._Response.code', new_callable=PropertyMock) as mocked:
+        mocked.return_value = 200 if ok else 400
+
+        spider = spider_with_files_store(tmpdir, sample=sample)
+        extension = KingfisherProcessAPI.from_crawler(spider.crawler)
+
+        item = FileError({
+            'file_name': 'file.json',
+            'url': 'https://example.com/remote.json',
+            'errors': 'ExceptionRaised',
+        })
+
+        response = yield extension.item_error(item, 'ResponseObject', spider, 'ExceptionRaised')
+        data = yield response.json()
+
+        form = {
+            'collection_source': 'test',
+            'collection_data_version': '2001-02-03 04:05:06',
+            'collection_sample': str(is_sample),
+            'file_name': 'file.json',
+            'url': 'https://example.com/remote.json',
+            # Specific to FileError.
+            'errors': '"ExceptionRaised"',
+        }
+
+        assert data['method'] == 'POST'
+        assert data['url'] == 'http://httpbin.org/anything/api/v1/submit/file_errors/'
+        assert data['headers']['Authorization'] == 'ApiKey xxx'
+        assert data['form'] == form
+        assert data['args'] == {}
+        assert data['data'] == ''
+        assert data['files'] == {}
+
+        if not ok:
+            message = 'create_file_error failed (file.json) with status code: 400'
+
+            assert len(caplog.records) == 1
+            assert caplog.records[0].name == 'test'
+            assert caplog.records[0].levelname == 'WARNING'
+            assert caplog.records[0].message == message
+

 @pytest_twisted.inlineCallbacks
 @pytest.mark.parametrize('sample,is_sample', [(None, False), ('true', True)])
 @pytest.mark.parametrize('ok', [True, False])
@@ -290,3 +337,45 @@ def test_spider_closed_other_reason(tmpdir):
     response = yield extension.spider_closed(spider, 'xxx')

     assert response is None


+@pytest_twisted.inlineCallbacks
+@pytest.mark.parametrize('sample,is_sample', [(None, False), ('true', True)])
+@pytest.mark.parametrize('ok', [True, False])
+def test_spider_error(sample, is_sample, ok, tmpdir, caplog):
+    with patch('treq.response._Response.code', new_callable=PropertyMock) as mocked:
+        mocked.return_value = 200 if ok else 400
+
+        spider = spider_with_files_store(tmpdir, sample=sample)
+        extension = KingfisherProcessAPI.from_crawler(spider.crawler)
+
+        scrapy_request = yield Request('https://example.com/remote.json')
+        scrapy_response = Response('https://example.com/remote.json', request=scrapy_request)
+        response = yield extension.spider_error('ExceptionRaised', scrapy_response, spider)
+        data = yield response.json()
+
+        form = {
+            'collection_source': 'test',
+            'collection_data_version': '2001-02-03 04:05:06',
+            'collection_sample': str(is_sample),
+            'file_name': 'spider_error.json',
+            'url': 'https://example.com/remote.json',
+            # Specific to FileError.
+            'errors': '"ExceptionRaised"',
+        }
+
+        assert data['method'] == 'POST'
+        assert data['url'] == 'http://httpbin.org/anything/api/v1/submit/file_errors/'
+        assert data['headers']['Authorization'] == 'ApiKey xxx'
+        assert data['form'] == form
+        assert data['args'] == {}
+        assert data['data'] == ''
+        assert data['files'] == {}
+
+        if not ok:
+            message = 'create_file_error failed (https://example.com/remote.json) with status code: 400'
+
+            assert len(caplog.records) == 1
+            assert caplog.records[0].name == 'test'
+            assert caplog.records[0].levelname == 'WARNING'
+            assert caplog.records[0].message == message
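A note on the test technique used throughout this file: treq response objects expose the HTTP status as a read-only `code` property, so the tests patch `treq.response._Response.code` with a `PropertyMock` to force a 200 or 400 status. A minimal sketch of the same pattern on a hypothetical class of our own (`FakeResponse` is illustrative, not treq's actual class):

from unittest.mock import PropertyMock, patch


class FakeResponse:
    # A hypothetical stand-in with a read-only property, like treq's _Response.code.
    @property
    def code(self):
        return 200


# patch.object replaces the property on the class itself, so every
# instance accessed inside the block reports the mocked status code.
with patch.object(FakeResponse, 'code', new_callable=PropertyMock) as mocked:
    mocked.return_value = 400
    assert FakeResponse().code == 400

# On exit the original property is restored.
assert FakeResponse().code == 200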
