Skip to content

Commit

Permalink
fix: Always yield an item, even if the root_path points to an empty o…
Browse files Browse the repository at this point in the history
…bject.

chore: Move some calculations outside of for-loops.
test: Test RootPathMiddleware with FileItem as input.
docs: Clarify what spider middlewares modify. Add comments to test cases.
  • Loading branch information
jpmckinney committed Jun 7, 2022
1 parent 51d2fea commit 2eec0fd
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 46 deletions.
46 changes: 25 additions & 21 deletions kingfisher_scrapy/spidermiddlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,24 +64,23 @@ def process_spider_output(self, response, result, spider):

class RootPathMiddleware:
"""
If the spider's ``root_path`` class attribute is non-empty, yields an item for the object(s) at the ``root_path``.
Otherwise, yields the original item.
If the objects (releases, records or packages) are within an array, they are combined into a single package.
If the spider's ``root_path`` class attribute is non-empty, replaces the item's ``data`` with the objects at that
prefix; if there are multiple releases, records or packages at that prefix, combines them into a single package,
and updates the item's ``data_type`` if needed. Otherwise, yields the original item.
"""

def process_spider_output(self, response, result, spider):
"""
:returns: a generator of FileItem objects, in which the ``data`` field is parsed JSON
:returns: a generator of File or FileItem objects, in which the ``data`` field is parsed JSON
"""
for item in result:
if not isinstance(item, (File, FileItem)) or not spider.root_path:
yield item
continue

data = item['data']

package = None
is_multiple = 'item' in spider.root_path.split('.')
is_package = 'package' in item['data_type']

if isinstance(data, (dict, list)):
data = util.json_dumps(data).encode()
Expand All @@ -93,6 +92,8 @@ def process_spider_output(self, response, result, spider):
key = 'records'
data_type = 'record_package'

package = {key: [], 'version': spider.ocds_version}

for number, obj in enumerate(util.transcode(spider, ijson.items, data, spider.root_path), 1):
# Avoid reading the rest of a large file, since the rest of the items will be dropped.
if spider.sample and number > spider.sample:
Expand All @@ -107,32 +108,30 @@ def process_spider_output(self, response, result, spider):
# written, the number of messages in RabbitMQ and the number of rows in PostgreSQL.
#
# We fix the packaging to reduce the overhead.
if 'item' in spider.root_path.split('.'):
if is_multiple:
# Assume that the `extensions` are the same for all packages.
if not package:
if 'package' in item['data_type']:
package = obj.copy()
else:
package = {'version': spider.ocds_version}
if number == 1 and is_package:
package = obj.copy()
package[key] = []

if 'package' in item['data_type']:
if is_package:
package[key].extend(obj[key])
else:
package[key].append(obj)
else:
item['data'] = obj
yield item
if package:
item['data_type'] = data_type

if is_multiple:
item['data'] = package
item['data_type'] = data_type
yield item


class AddPackageMiddleware:
"""
If the spider's ``data_type`` class attribute is "release" or "record", wraps the data in a package.
Otherwise, yields the original item.
If the spider's ``data_type`` class attribute is "release" or "record", wraps the item's ``data`` in an appropriate
package, and updates the item's ``data_type``. Otherwise, yields the original item.
"""

def process_spider_output(self, response, result, spider):
Expand All @@ -156,6 +155,7 @@ def process_spider_output(self, response, result, spider):
key = 'releases'
else:
key = 'records'

item['data'] = {key: [data], 'version': spider.ocds_version}
item['data_type'] += '_package'

Expand Down Expand Up @@ -217,12 +217,16 @@ def _get_package_metadata(self, spider, data, skip_key, data_type):

class ReadDataMiddleware:
"""
If the item's ``data`` value is a file pointer, as with ``CompressedFileSpider``, reads it and closes it.
Otherwise, yields the original item.
If the item's ``data`` is a file descriptor, replaces the item's ``data`` with the file's contents and closes the
file descriptor. Otherwise, yields the original item.
.. seealso::
:class:`~kingfisher_scrapy.base_spiders.compressed_file_spider.CompressedFileSpider`
"""
def process_spider_output(self, response, result, spider):
"""
:returns: a generator of FileItem objects, in which the ``data`` field is bytes
:returns: a generator of File objects, in which the ``data`` field is bytes
"""
for item in result:
if not isinstance(item, File) or not hasattr(item['data'], 'read'):
Expand Down
79 changes: 54 additions & 25 deletions tests/test_spidermiddlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,15 @@ def test_bytes_or_file(middleware_class, attribute, value, override, tmpdir):
generator = middleware.process_spider_output(None, [bytes_item, file_item], spider)
transformed_items = list(generator)

expected = {
'file_name': 'test.json',
'data_type': 'release',
'url': 'http://test.com',
}
expected.update(override)

assert len(transformed_items) == 2
for item in transformed_items:
expected = {
'file_name': 'test.json',
'data_type': 'release',
'url': 'http://test.com',
}
expected.update(override)
assert item == expected


Expand Down Expand Up @@ -130,13 +131,14 @@ def test_encoding(middleware_class, attribute, value, override, tmpdir):
generator = middleware.process_spider_output(None, [bytes_item, file_item], spider)
transformed_items = list(generator)

assert len(transformed_items) == 2
expected = {
'file_name': 'test.json',
'data_type': 'release',
'url': 'http://test.com',
}
'file_name': 'test.json',
'data_type': 'release',
'url': 'http://test.com',
}
expected.update(override)

assert len(transformed_items) == 2
for item in transformed_items:
assert item == expected

Expand Down Expand Up @@ -391,25 +393,52 @@ def test_retry_data_error_middleware(exception):
list(generator)


@pytest.mark.parametrize('root_path,data_type,data,expected,expected_data_type,sample', [
('item', 'release', [{'a': 'b'}], {'releases': [{'a': 'b'}], 'version': '1.1'}, 'release_package', None),
('Release', 'release', {'Release': {'a': 'b'}}, {'a': 'b'}, 'release', None),
('item', 'release_package', [{'releases': [{'a': 'b'}, {'c': 'd'}], 'uri': 'test'},
{'releases': [{'e': 'f'}, {'g': 'h'}], 'uri': 'test'}],
{'releases': [{'a': 'b'}, {'c': 'd'}, {'e': 'f'}, {'g': 'h'}], 'uri': 'test'}, 'release_package', None),
('a.item', 'record', {'a': [{'a': 'b'}, {'a': 'b'}]}, {'records': [{'a': 'b'}, {'a': 'b'}], 'version': '1.1'},
'record_package', None),
('a.item', 'record', {'a': [{'a': 'b'}, {'a': 'b'}]}, {'records': [{'a': 'b'}], 'version': '1.1'},
'record_package', 1)
@pytest.mark.parametrize('root_path,data_type,sample,data,expected_data,expected_data_type', [
# Empty root path.
('', 'my_data_type', None,
{'a': 'b'},
{'a': 'b'}, 'my_data_type'),
# Root path without "item".
('x', 'my_data_type', None,
{'x': {'a': 'b'}},
{'a': 'b'}, 'my_data_type'),
# Root paths with "item" ...
# ... for data_type = "release".
('item', 'release', None,
[{'a': 'b'}, {'c': 'd'}],
{'releases': [{'a': 'b'}, {'c': 'd'}], 'version': '1.1'}, 'release_package'),
# ... and another prefix, for data_type = "record".
('x.item', 'record', None,
{'x': [{'a': 'b'}, {'c': 'd'}]},
{'records': [{'a': 'b'}, {'c': 'd'}], 'version': '1.1'}, 'record_package'),
# ... and "sample".
('x.item', 'record', 1,
{'x': [{'a': 'b'}, {'c': 'd'}]},
{'records': [{'a': 'b'}], 'version': '1.1'}, 'record_package'),
# ... without package metadata, for data_type = "record_package".
('item', 'record_package', None,
[{'records': [{'a': 'b'}, {'c': 'd'}]}, {'records': [{'e': 'f'}, {'g': 'h'}]}],
{'records': [{'a': 'b'}, {'c': 'd'}, {'e': 'f'}, {'g': 'h'}]}, 'record_package'),
# ... with inconsistent package metadata, for data_type = "release_package".
('item', 'release_package', None,
[{'releases': [{'a': 'b'}, {'c': 'd'}], 'x': 'y'}, {'releases': [{'e': 'f'}, {'g': 'h'}]}],
{'releases': [{'a': 'b'}, {'c': 'd'}, {'e': 'f'}, {'g': 'h'}], 'x': 'y'}, 'release_package'),
# ... with an empty object.
('item', 'release', None,
[],
{'releases': [], 'version': '1.1'}, 'release_package'),
])
def test_root_path_middleware(root_path, data_type, data, expected, expected_data_type, sample):
@pytest.mark.parametrize('klass', [File, FileItem])
def test_root_path_middleware(root_path, data_type, sample, data, expected_data, expected_data_type, klass):
spider = spider_with_crawler()
middleware = RootPathMiddleware()
spider.data_type = data_type
spider.root_path = root_path
spider.sample = sample

item = File({
item = klass({
'file_name': 'test.json',
'data': data,
'data_type': data_type,
Expand All @@ -421,6 +450,6 @@ def test_root_path_middleware(root_path, data_type, data, expected, expected_dat

assert len(transformed_items) == 1
for transformed_item in transformed_items:
assert transformed_item['data'] == expected
assert type(item) == type(transformed_item)
assert isinstance(transformed_item, klass)
assert transformed_item['data'] == expected_data
assert transformed_item['data_type'] == expected_data_type

0 comments on commit 2eec0fd

Please sign in to comment.