Skip to content

Commit

Permalink
spidermiddlewares: support resizing record packages
Browse files: browse the repository at this point in the history
  • Loading branch information
yolile committed Jun 1, 2022
1 parent 000d68e commit a8174fd
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 10 deletions.
11 changes: 7 additions & 4 deletions kingfisher_scrapy/spidermiddlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,16 +161,19 @@ def process_spider_output(self, response, result, spider):
size = spider.sample
else:
size = 100

package = self._get_package_metadata(spider, data['package'], 'releases', item['data_type'])
iterable = util.transcode(spider, ijson.items, data['data'], 'releases.item')
if spider.data_type == 'release_package':
package_type = 'releases'
else:
package_type = 'records'
package = self._get_package_metadata(spider, data['package'], package_type, item['data_type'])
iterable = util.transcode(spider, ijson.items, data['data'], f'{package_type}.item')
# We yield release or record packages containing a maximum of `size` items (100 by default).
for number, items in enumerate(util.grouper(iterable, size), 1):
# Avoid reading the rest of a large file, since the rest of the items will be dropped.
if spider.sample and number > spider.sample:
return

package['releases'] = filter(None, items)
package[package_type] = filter(None, items)
data = util.json_dumps(package).encode()

yield spider.build_file_item(number, data, item)
Expand Down
13 changes: 7 additions & 6 deletions tests/test_spidermiddlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,17 +188,18 @@ def test_add_package_middleware(data_type, data, root_path):

@pytest.mark.parametrize('sample,len_items,len_releases', [(None, 2, 100), (5, 5, 5)])
@pytest.mark.parametrize('encoding,character', [('utf-8', b'\xc3\x9a'), ('iso-8859-1', b'\xda')])
def test_resize_package_middleware(sample, len_items, len_releases, encoding, character):
@pytest.mark.parametrize('package_type', ['record', 'release'])
def test_resize_package_middleware(sample, len_items, len_releases, encoding, character, package_type):
spider = spider_with_crawler(spider_class=CompressedFileSpider, sample=sample)
spider.data_type = 'release_package'
spider.data_type = f'{package_type}_package'
spider.resize_package = True
spider.encoding = encoding

middleware = ResizePackageMiddleware()

package = {'publisher': {'name': 'TIBÚ'}, 'releases': []}
package = {'publisher': {'name': 'TIBÚ'}, f'{package_type}s': []}
for i in range(200):
package['releases'].append({'key': 'TIBÚ'})
package[f'{package_type}s'].append({'key': 'TIBÚ'})

io = BytesIO()
with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
Expand All @@ -220,8 +221,8 @@ def test_resize_package_middleware(sample, len_items, len_releases, encoding, ch
assert item['url'] == 'http://example.com'
assert item['number'] == i
assert isinstance(item['data'], bytes)
assert len(json.loads(item['data'])['releases']) == len_releases
assert item['data_type'] == 'release_package'
assert len(json.loads(item['data'])[f'{package_type}s']) == len_releases
assert item['data_type'] == f'{package_type}_package'


@pytest.mark.parametrize('middleware_class,attribute,separator', [
Expand Down

0 comments on commit a8174fd

Please sign in to comment.