Skip to content

Commit

Permalink
fix: Re-use common logic to ensure files are closed
Browse files Browse the repository at this point in the history
  • Loading branch information
jpmckinney committed Apr 11, 2024
1 parent 4f2b503 commit 169cc74
Showing 1 changed file with 17 additions and 11 deletions.
28 changes: 17 additions & 11 deletions kingfisher_scrapy/spidermiddlewares.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ def group_size(spider):
return MAX_GROUP_SIZE


def read_data_from_file_if_any(item):
    """
    If ``item.data`` is a file-like object, replace it with the file's content and close the file.

    An object is treated as file-like if it has a ``read`` attribute. Otherwise, the item is left unchanged.

    The file is closed even if reading raises an exception, in which case ``item.data`` is left as-is and the
    exception propagates to the caller.

    :param item: an object with a ``data`` attribute (presumably a File or FileItem -- confirm against callers)
    """
    file = item.data
    if not hasattr(file, 'read'):
        return

    try:
        content = file.read()
    finally:
        # Close the file whether or not the read succeeded, so that the handle is never leaked.
        file.close()

    item.data = content


class ConcatenatedJSONMiddleware:
"""
If the spider's ``concatenated_json`` class attribute is ``True``, yields each object of the File as a FileItem.
Expand Down Expand Up @@ -65,9 +72,9 @@ async def process_spider_output(self, response, result, spider):
continue

data = item.data
# Data can be bytes or a file-like object.
# Data can be bytes or a file-like object. If bytes, split into an iterable.
if isinstance(data, bytes):
data = data.splitlines(True)
data = data.splitlines(keepends=True)

for number, line in enumerate(data, 1):
if sample_filled(spider, number):
Expand All @@ -94,11 +101,11 @@ async def process_spider_output(self, response, result, spider):
yield item
continue

if hasattr(item.data, 'read'):
item.data = item.data.read()
read_data_from_file_if_any(item)

try:
json.loads(item.data)

yield item
except json.JSONDecodeError:
spider.crawler.stats.inc_value('invalid_json_count')
Expand Down Expand Up @@ -178,6 +185,7 @@ async def process_spider_output(self, response, result, spider):
return

item.data = obj

yield item


Expand All @@ -196,12 +204,11 @@ async def process_spider_output(self, response, result, spider):
yield item
continue

data = item.data
if hasattr(data, 'read'):
data = data.read()
read_data_from_file_if_any(item)

data = item.data
# If the spider's ``root_path`` class attribute is non-empty, then the JSON data is already parsed.
if not isinstance(data, dict):
if isinstance(data, bytes):
data = json.loads(data)

if item.data_type == 'release':
Expand Down Expand Up @@ -286,9 +293,8 @@ async def process_spider_output(self, response, result, spider):
yield item
continue

data = item.data.read()
item.data.close()
item.data = data
read_data_from_file_if_any(item)

yield item


Expand Down

0 comments on commit 169cc74

Please sign in to comment.