Commit

chore(spidermiddlewares): Reduce repetition
jpmckinney committed Oct 4, 2023
1 parent 34ffc2e commit 024bb80
Showing 1 changed file with 15 additions and 12 deletions.
27 changes: 15 additions & 12 deletions kingfisher_scrapy/spidermiddlewares.py
@@ -10,6 +10,17 @@
 MAX_GROUP_SIZE = 100
 
 
+# Avoid reading the rest of a large file, since the rest of the items will be dropped.
+def sample_filled(spider, number):
+    return spider.sample and number > spider.sample
+
+
+def group_size(spider):
+    if spider.sample:
+        return min(spider.sample, MAX_GROUP_SIZE)
+    return MAX_GROUP_SIZE
+
+
 class ConcatenatedJSONMiddleware:
     """
     If the spider's ``concatenated_json`` class attribute is ``True``, yields each object of the File as a FileItem.
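
For illustration, a minimal sketch of how the two extracted helpers behave, using types.SimpleNamespace as a stand-in for a spider (real spiders carry an optional integer sample attribute; the stand-in exists only for this example):

from types import SimpleNamespace

MAX_GROUP_SIZE = 100

# Copied from the diff above.
def sample_filled(spider, number):
    return spider.sample and number > spider.sample

def group_size(spider):
    if spider.sample:
        return min(spider.sample, MAX_GROUP_SIZE)
    return MAX_GROUP_SIZE

spider = SimpleNamespace(sample=10)     # stand-in spider sampling 10 items
print(bool(sample_filled(spider, 10)))  # False: item 10 is still within the sample
print(bool(sample_filled(spider, 11)))  # True: stop, later items would be dropped
print(group_size(spider))               # 10: groups are capped at the sample size

spider = SimpleNamespace(sample=None)   # stand-in spider without sampling
print(bool(sample_filled(spider, 1)))   # False: never stop early
print(group_size(spider))               # 100: fall back to MAX_GROUP_SIZE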
@@ -29,8 +40,7 @@ def process_spider_output(self, response, result, spider):

         # ijson can read from bytes or a file-like object.
         for number, obj in enumerate(util.transcode(spider, ijson.items, data, '', multiple_values=True), 1):
-            # Avoid reading the rest of a large file, since the rest of the items will be dropped.
-            if spider.sample and number > spider.sample:
+            if sample_filled(spider, number):
                 return
 
             yield spider.build_file_item(number, obj, item)
@@ -57,8 +67,7 @@ def process_spider_output(self, response, result, spider):
         data = data.splitlines(True)
 
         for number, line in enumerate(data, 1):
-            # Avoid reading the rest of a large file, since the rest of the items will be dropped.
-            if spider.sample and number > spider.sample:
+            if sample_filled(spider, number):
                 return
 
             yield spider.build_file_item(number, line, item)
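
Both loops above stream items instead of parsing the whole response at once. In ConcatenatedJSONMiddleware, the streaming comes from ijson's multiple_values option; util.transcode (defined elsewhere in the repository) wraps the call, so the standalone sketch below uses ijson directly:

import io

import ijson

# Two top-level JSON objects, concatenated with no delimiter between them.
data = io.BytesIO(b'{"ocid": "ocds-1"}{"ocid": "ocds-2"}')

# An empty prefix plus multiple_values=True yields each top-level value in turn.
for number, obj in enumerate(ijson.items(data, '', multiple_values=True), 1):
    print(number, obj)
# 1 {'ocid': 'ocds-1'}
# 2 {'ocid': 'ocds-2'}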
@@ -183,21 +192,15 @@ def process_spider_output(self, response, result, spider):

         data = item['data']
 
-        if spider.sample:
-            size = min(spider.sample, MAX_GROUP_SIZE)
-        else:
-            size = MAX_GROUP_SIZE
-
         if item['data_type'] == 'release_package':
             key = 'releases'
         else:
             key = 'records'
 
         package = self._get_package_metadata(spider, data['package'], key, item['data_type'])
         iterable = util.transcode(spider, ijson.items, data['data'], f'{key}.item')
         # We yield packages containing a maximum of 100 releases or records.
-        for number, items in enumerate(util.grouper(iterable, size), 1):
-            # Avoid reading the rest of a large file, since the rest of the items will be dropped.
-            if spider.sample and number > spider.sample:
+        for number, items in enumerate(util.grouper(iterable, group_size(spider)), 1):
+            if sample_filled(spider, number):
                 return
 
             data = copy.deepcopy(package)
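The last hunk covers the package middleware, which streams releases or records out of one large package and re-emits them in packages of at most group_size(spider) items. A rough sketch of that grouping step, assuming util.grouper behaves like the usual itertools grouper recipe (the stand-in grouper below is not the repository's implementation):

import copy
from itertools import islice

def grouper(iterable, n):
    # Stand-in for util.grouper: yield tuples of up to n items.
    iterator = iter(iterable)
    while chunk := tuple(islice(iterator, n)):
        yield chunk

package = {'uri': 'https://example.com/package.json', 'releases': []}
releases = ({'ocid': f'ocds-213czf-{i}'} for i in range(250))

for number, items in enumerate(grouper(releases, 100), 1):
    data = copy.deepcopy(package)         # fresh copy of the package metadata
    data['releases'] = list(items)        # at most 100 releases per package
    print(number, len(data['releases']))  # 1 100, then 2 100, then 3 50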
