Skip to content

Commit

Permalink
test: Update RootPathMiddleware and ResizePackageMiddleware tests. Preserve existing behavior when the root_path points to an empty object.
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed Oct 5, 2023
1 parent 918209b commit c393bea
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 33 deletions.
38 changes: 24 additions & 14 deletions kingfisher_scrapy/spidermiddlewares.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import copy
import itertools
import json
from zipfile import BadZipFile

Expand Down Expand Up @@ -116,20 +117,29 @@ def process_spider_output(self, response, result, spider):
key = 'records'
item['data_type'] = 'record_package'

for number, items in enumerate(util.grouper(iterable, group_size(spider)), 1):
if sample_filled(spider, number):
return

if is_package:
# Assume that the `extensions` are the same for all packages.
package = items[0]
for other in items[1:]:
package[key].extend(other[key])
else:
# Omit the None values returned by `grouper(*, fillvalue=None)`.
package = {'version': spider.ocds_version, key: list(filter(None, items))}

yield spider.build_file_item(number, package, item)
try:
head = next(iterable)
except StopIteration:
# Always yield an item, even if the root_path points to an empty object.
# https://github.com/open-contracting/kingfisher-collect/pull/944#issuecomment-1149156552
item['data'] = {'version': spider.ocds_version, key: []}
yield item
else:
iterable = itertools.chain([head], iterable)
for number, items in enumerate(util.grouper(iterable, group_size(spider)), 1):
if sample_filled(spider, number):
return

if is_package:
# Assume that the `extensions` are the same for all packages.
package = items[0]
for other in filter(None, items[1:]):
package[key].extend(other[key])
else:
# Omit the None values returned by `grouper(*, fillvalue=None)`.
package = {'version': spider.ocds_version, key: list(filter(None, items))}

yield spider.build_file_item(number, package, item)
else:
# Iterates at most once.
for number, obj in enumerate(iterable, 1):
Expand Down
71 changes: 52 additions & 19 deletions tests/test_spidermiddlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,12 @@ def test_passthrough(middleware_class, item):


@pytest.mark.parametrize('middleware_class,attribute,value,override', [
(ConcatenatedJSONMiddleware, 'concatenated_json', True, {'data': {'a': [{'b': 'c'}]}, 'number': 1}),
(LineDelimitedMiddleware, 'line_delimited', True, {'data': b'{"a":[{"b": "c"}]}', 'number': 1}),
(ConcatenatedJSONMiddleware, 'concatenated_json', True,
{'data': {'a': [{'b': 'c'}]}, 'number': 1}),
(LineDelimitedMiddleware, 'line_delimited', True,
{'data': b'{"a":[{"b": "c"}]}', 'number': 1}),
(RootPathMiddleware, 'root_path', 'a.item',
{'data': {'releases': [{'b': 'c'}], 'version': '1.1'}, 'data_type': 'release_package'}),
{'data': {'releases': [{'b': 'c'}], 'version': '1.1'}, 'data_type': 'release_package', 'number': 1}),
(AddPackageMiddleware, 'data_type', 'release',
{'data': {'releases': [{'a': [{'b': 'c'}]}], 'version': '1.1'}, 'data_type': 'release_package'}),
# ResizePackageMiddleware is only used with CompressedFileSpider and BigFileSpider.
Expand Down Expand Up @@ -178,6 +180,8 @@ def test_add_package_middleware(data_type, data, root_path):
'file_name': 'test.json',
'url': 'http://test.com',
}
if 'item' in root_path:
expected['number'] = 1

if 'package' in data_type:
expected['data'] = {f"{data_type[:-8]}s": [{"ocid": "abc"}], "uri": "test"}
Expand All @@ -189,7 +193,7 @@ def test_add_package_middleware(data_type, data, root_path):
assert item == expected


@pytest.mark.parametrize('sample,len_items,len_releases', [(None, 2, 100), (5, 5, 5)])
@pytest.mark.parametrize('sample,len_items,len_releases', [(None, 2, 100), (5, 5, 5), (200, 2, 100)])
@pytest.mark.parametrize('encoding,character', [('utf-8', b'\xc3\x9a'), ('iso-8859-1', b'\xda')])
@pytest.mark.parametrize('data_type, key', [('record_package', 'records'), ('release_package', 'releases')])
def test_resize_package_middleware(sample, len_items, len_releases, encoding, character, data_type, key):
Expand Down Expand Up @@ -395,18 +399,52 @@ def test_retry_data_error_middleware(exception):
list(generator)


@pytest.mark.parametrize('root_path,data_type,sample,data,expected_data,expected_data_type', [
@pytest.mark.parametrize('root_path,data_type,data,expected_data,expected_data_type', [
# Empty root path.
('', 'my_data_type', None,
('', 'my_data_type',
{'a': 'b'},
{'a': 'b'}, 'my_data_type'),
# Root path without "item".
('x', 'my_data_type', None,
('x', 'my_data_type',
{'x': {'a': 'b'}},
{'a': 'b'}, 'my_data_type'),
# Root paths with "item" ...
# ... with an empty array, for data_type = "release".
('item', 'release',
[],
{'releases': [], 'version': '1.1'}, 'release_package'),
# ... with an empty array, for data_type = "record_package".
('item', 'record_package',
[],
{'records': [], 'version': '1.1'}, 'record_package'),
])
@pytest.mark.parametrize('klass', [File, FileItem])
def test_root_path_middleware(root_path, data_type, data, expected_data, expected_data_type, klass):
spider = spider_with_crawler()
middleware = RootPathMiddleware()
spider.data_type = data_type
spider.root_path = root_path

item = klass({
'file_name': 'test.json',
'data': data,
'data_type': data_type,
'url': 'http://test.com',
})

generator = middleware.process_spider_output(None, [item], spider)
transformed_items = list(generator)

assert len(transformed_items) == 1
for transformed_item in transformed_items:
assert isinstance(transformed_item, klass)
assert transformed_item['file_name'] == 'test.json'
assert transformed_item['data'] == expected_data
assert transformed_item['data_type'] == expected_data_type
assert transformed_item['url'] == 'http://test.com'


@pytest.mark.parametrize('root_path,data_type,sample,data,expected_data,expected_data_type', [
# ... for data_type = "release".
('item', 'release', None,
[{'a': 'b'}, {'c': 'd'}],
Expand All @@ -427,17 +465,9 @@ def test_retry_data_error_middleware(exception):
('item', 'release_package', None,
[{'releases': [{'a': 'b'}, {'c': 'd'}], 'x': 'y'}, {'releases': [{'e': 'f'}, {'g': 'h'}]}],
{'releases': [{'a': 'b'}, {'c': 'd'}, {'e': 'f'}, {'g': 'h'}], 'x': 'y'}, 'release_package'),
# ... with an empty object, for data_type = "release".
('item', 'release', None,
[],
{'releases': [], 'version': '1.1'}, 'release_package'),
# ... with an empty object, for data_type = "record_package".
('item', 'record_package', None,
[],
{'records': [], 'version': '1.1'}, 'record_package'),
])
@pytest.mark.parametrize('klass', [File, FileItem])
def test_root_path_middleware(root_path, data_type, sample, data, expected_data, expected_data_type, klass):
def test_root_path_middleware_item(root_path, data_type, sample, data, expected_data, expected_data_type, klass):
spider = spider_with_crawler()
middleware = RootPathMiddleware()
spider.data_type = data_type
Expand All @@ -456,6 +486,9 @@ def test_root_path_middleware(root_path, data_type, sample, data, expected_data,

assert len(transformed_items) == 1
for transformed_item in transformed_items:
assert isinstance(transformed_item, klass)
assert isinstance(transformed_item, FileItem)
assert transformed_item['number'] == 1
assert transformed_item['file_name'] == 'test.json'
assert transformed_item['data'] == expected_data
assert transformed_item['data_type'] == expected_data_type
assert transformed_item['url'] == 'http://test.com'

0 comments on commit c393bea

Please sign in to comment.