From 9e7ee8b1b04836699088ce4ce23d85db773b8950 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Wed, 10 Feb 2021 18:22:51 -0300
Subject: [PATCH 01/15] base_spider: add root_path_max_length doc and read
 compressed file

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/base_spider.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index a56b4c580..c2f3ec216 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -38,6 +38,8 @@ class BaseSpider(scrapy.Spider):
       If you need to set more arguments for the unflatten command, set a ``unflatten_args`` dict with them.
     - If the data is not formatted as OCDS (record, release, record package or release package), set a ``root_path``
       class attribute to the path to the OCDS data.
+    - If the JSON file is line-delimited and the root path is to a JSON array root_path_max_length, set a
+      ``root_path_max_length`` class attribute to the maximum length of the JSON array at the root path.
     - If the data is line-delimited JSON, add a ``line_delimited = True`` class attribute.

     If ``date_required`` is ``True``, or if either the ``from_date`` or ``until_date`` spider arguments are set, then
@@ -343,7 +345,7 @@ def parse(self, response):
             if self.resize_package:
                 data = {'data': compressed_file, 'package': archive_file.open(filename)}
             else:
-                data = compressed_file
+                data = compressed_file.read()

             yield File({
                 'file_name': basename,

From c46fe0fb0562eeeb4463fc1233058f5da4767abc Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Wed, 10 Feb 2021 18:23:25 -0300
Subject: [PATCH 02/15] doc: add root_path_max_length to new spider
 contribution guide

Signed-off-by: Yohanna Lisnichuk
---
 docs/contributing/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst
index f59367cc3..65dfb7d8e 100644
--- a/docs/contributing/index.rst
+++ b/docs/contributing/index.rst
@@ -55,6 +55,7 @@ Since many class attributes that control a spider's behavior, please put the cla
         unflatten_args = {}
         line_delimited = True
         root_path = 'item'
+        root_path_max_length = 1
         skip_pluck = 'A reason'

         # SimpleSpider

From 5cc8c6b9b5ea06378b36c39a8af4c17961d91ce5 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Wed, 10 Feb 2021 18:24:04 -0300
Subject: [PATCH 03/15] store extension: change fileitem json names to number
 + name

Otherwise the output is e.g. name.json-1

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/extensions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kingfisher_scrapy/extensions.py b/kingfisher_scrapy/extensions.py
index 68733258e..8a22f1252 100644
--- a/kingfisher_scrapy/extensions.py
+++ b/kingfisher_scrapy/extensions.py
@@ -80,7 +80,7 @@ def item_scraped(self, item, spider):
         file_name = item['file_name']

         if isinstance(item, FileItem):
-            file_name += f"-{item['number']}"
+            file_name = f"{item['number']}-{file_name}"

         path = os.path.join(name, spider.get_start_time('%Y%m%d_%H%M%S'), file_name)

From cf949f411bf9779c6baac59e957601826b90b345 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Wed, 10 Feb 2021 18:25:15 -0300
Subject: [PATCH 04/15] colombia_bulk: add root_path_max_length

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/spiders/colombia_bulk.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kingfisher_scrapy/spiders/colombia_bulk.py b/kingfisher_scrapy/spiders/colombia_bulk.py
index 2df590c58..b844fe7f4 100644
--- a/kingfisher_scrapy/spiders/colombia_bulk.py
+++ b/kingfisher_scrapy/spiders/colombia_bulk.py
@@ -20,6 +20,7 @@ class ColombiaBulk(CompressedFileSpider):
     # BaseSpider
     line_delimited = True
    root_path = 'Release'
+    root_path_max_length = 1

     # SimpleSpider
     data_type = 'release'

From 520227da3416c47dd0ec49d704447ff4768a6881 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Wed, 10 Feb 2021 18:25:40 -0300
Subject: [PATCH 05/15] test: update compressed file spider tests

Signed-off-by: Yohanna Lisnichuk
---
 tests/test_compressed_file_spider.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/test_compressed_file_spider.py b/tests/test_compressed_file_spider.py
index 72f0fc9c4..a336cac95 100644
--- a/tests/test_compressed_file_spider.py
+++ b/tests/test_compressed_file_spider.py
@@ -28,7 +28,6 @@ def test_parse():
     assert item['url'] == 'http://example.com'
     assert item['data_type'] == 'release_package'
     assert item['encoding'] == 'utf-8'
-    assert 'package' not in item['data']
     assert item['data'] is not None

     with pytest.raises(StopIteration):
@@ -59,7 +58,6 @@ def test_parse_line_delimited(sample, len_items):
     assert item['url'] == 'http://example.com'
     assert item['data_type'] == 'release_package'
     assert item['encoding'] == 'utf-8'
-    assert 'package' not in item['data']
     assert item['data'] is not None

     with pytest.raises(StopIteration):
@@ -130,7 +128,6 @@ def test_parse_rar_file():
     assert item['url'] == 'http://example.com'
     assert item['data_type'] == 'release_package'
     assert item['encoding'] == 'utf-8'
-    assert 'package' not in item['data']
     assert item['data'] is not None

     with pytest.raises(StopIteration):
         next(generator)

From cae12683326a1cf588c319ae134b945a9984478f Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Thu, 11 Feb 2021 08:30:21 -0300
Subject: [PATCH 06/15] Update kingfisher_scrapy/base_spider.py

Co-authored-by: James McKinney <26463+jpmckinney@users.noreply.github.com>
---
 kingfisher_scrapy/base_spider.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index c2f3ec216..a0c435a83 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -38,8 +38,8 @@ class BaseSpider(scrapy.Spider):
       If you need to set more arguments for the unflatten command, set a ``unflatten_args`` dict with them.
     - If the data is not formatted as OCDS (record, release, record package or release package), set a ``root_path``
       class attribute to the path to the OCDS data.
-    - If the JSON file is line-delimited and the root path is to a JSON array root_path_max_length, set a
-      ``root_path_max_length`` class attribute to the maximum length of the JSON array at the root path.
+    - If the JSON file is line-delimited and the root path is to a JSON array, set a ``root_path_max_length`` class
+      attribute to the maximum length of the JSON array at the root path.
     - If the data is line-delimited JSON, add a ``line_delimited = True`` class attribute.

     If ``date_required`` is ``True``, or if either the ``from_date`` or ``until_date`` spider arguments are set, then
From bd89a3d7dd82b4caf8c744e8fcf76e24be69e736 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Thu, 11 Feb 2021 09:38:42 -0300
Subject: [PATCH 07/15] compressed_spider: use method from util and don't read
 decompressed file

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/base_spider.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index c2f3ec216..d986d18f9 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -12,7 +12,7 @@
 from kingfisher_scrapy import util
 from kingfisher_scrapy.exceptions import MissingNextLinkError, SpiderArgumentError, UnknownArchiveFormatError
 from kingfisher_scrapy.items import File, FileError
-from kingfisher_scrapy.util import add_query_string, handle_http_error
+from kingfisher_scrapy.util import add_query_string, handle_http_error, get_file_name_and_extension

 browser_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'  # noqa: E501
@@ -316,8 +316,7 @@ def start_requests(self):

     @handle_http_error
     def parse(self, response):
-        archive_name, archive_format = os.path.splitext(response.request.meta['file_name'])
-        archive_format = archive_format[1:].lower()
+        archive_name, archive_format = get_file_name_and_extension(response.request.meta['file_name'])

         if archive_format == 'zip':
             cls = ZipFile
@@ -345,7 +344,7 @@ def parse(self, response):
             if self.resize_package:
                 data = {'data': compressed_file, 'package': archive_file.open(filename)}
             else:
-                data = compressed_file.read()
+                data = compressed_file

             yield File({
                 'file_name': basename,

From 7e36eebaa45cdab16dddd8e317d07b24785ef397 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Thu, 11 Feb 2021 09:39:33 -0300
Subject: [PATCH 08/15] util: add get file and extension method

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/util.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/kingfisher_scrapy/util.py b/kingfisher_scrapy/util.py
index 3923fee5a..f479655c2 100644
--- a/kingfisher_scrapy/util.py
+++ b/kingfisher_scrapy/util.py
@@ -1,5 +1,6 @@
 import itertools
 import json
+import os
 from datetime import date
 from decimal import Decimal
 from functools import wraps
@@ -189,3 +190,14 @@ def default(obj):
 def grouper(iterable, n, fillvalue=None):
     args = [iter(iterable)] * n
     return itertools.zip_longest(*args, fillvalue=fillvalue)
+
+
+def get_file_name_and_extension(filename):
+    """
+    Given a ``filename``, returns its name and extension as two separate strings.
+    >>> get_file_name_and_extension('test.json')
+    ('test', 'json')
+    """
+    archive_name, archive_format = os.path.splitext(filename)
+    archive_format = archive_format[1:].lower()
+    return archive_name, archive_format
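Note: the helper added in patch 08 preserves the behavior that parse() previously had inline (os.path.splitext, then dropping the dot and lowercasing), so an archive's extension still compares cleanly against lowercase strings such as 'zip'. A standalone sanity check of that behavior, with made-up file names:

    import os

    def get_file_name_and_extension(filename):
        # Same logic as the kingfisher_scrapy/util.py addition above.
        archive_name, archive_format = os.path.splitext(filename)
        return archive_name, archive_format[1:].lower()

    assert get_file_name_and_extension('test.json') == ('test', 'json')
    assert get_file_name_and_extension('DATA.ZIP') == ('DATA', 'zip')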
From 2d9b58ef449eafa4e90dc6bee55c02dc0edeab78 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Thu, 11 Feb 2021 09:40:17 -0300
Subject: [PATCH 09/15] kingfisher_file_store: change fileitem file name

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/extensions.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kingfisher_scrapy/extensions.py b/kingfisher_scrapy/extensions.py
index 8a22f1252..abf2cfadf 100644
--- a/kingfisher_scrapy/extensions.py
+++ b/kingfisher_scrapy/extensions.py
@@ -10,7 +10,7 @@
 from kingfisher_scrapy import util
 from kingfisher_scrapy.items import File, FileError, FileItem, PluckedItem
 from kingfisher_scrapy.kingfisher_process import Client
-from kingfisher_scrapy.util import _pluck_filename
+from kingfisher_scrapy.util import _pluck_filename, get_file_name_and_extension

 # https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
@@ -66,7 +66,7 @@ def from_crawler(cls, crawler):

     def item_scraped(self, item, spider):
         """
-        If the item is a file, writes its data to the filename in the crawl's directory.
+        If the item is a File or FileItem, writes its data to the filename in the crawl's directory.

         Returns a dict with the metadata.
         """
@@ -80,7 +80,8 @@ def item_scraped(self, item, spider):
         file_name = item['file_name']

         if isinstance(item, FileItem):
-            file_name = f"{item['number']}-{file_name}"
+            basename, extension = get_file_name_and_extension(file_name)
+            file_name = f"{basename}-{item['number']}.{extension}"

         path = os.path.join(name, spider.get_start_time('%Y%m%d_%H%M%S'), file_name)

@@ -88,6 +89,7 @@ def item_scraped(self, item, spider):

         item['path'] = path
         item['files_store'] = self.directory
+        item['file_name'] = file_name

     def _write_file(self, path, data):
         path = os.path.join(self.directory, path)
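Note: taken together, patches 03, 08 and 09 change how a FileItem's file is named: originally name.json-1, then 1-name.json after patch 03, and name-1.json from patch 09 onward, so the extension stays last. An illustrative snippet, not part of the series (the file name and number are made up):

    from kingfisher_scrapy.util import get_file_name_and_extension

    basename, extension = get_file_name_and_extension('file.json')
    number = 1
    assert f"{basename}-{number}.{extension}" == 'file-1.json'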
+ """ + def process_spider_output(self, response, result, spider): + for item in result: + if not isinstance(item, File) or not isinstance(spider, CompressedFileSpider): + yield item + continue + item['data'] = item['data'].read() + yield item diff --git a/kingfisher_scrapy/settings.py b/kingfisher_scrapy/settings.py index 62f6cdbdc..a7efc5b03 100644 --- a/kingfisher_scrapy/settings.py +++ b/kingfisher_scrapy/settings.py @@ -63,7 +63,8 @@ 'kingfisher_scrapy.middlewares.LineDelimitedMiddleware': 500, 'kingfisher_scrapy.middlewares.RootPathMiddleware': 400, 'kingfisher_scrapy.middlewares.AddPackageMiddleware': 300, - 'kingfisher_scrapy.middlewares.ResizePackageMiddleware': 200 + 'kingfisher_scrapy.middlewares.ResizePackageMiddleware': 200, + 'kingfisher_scrapy.middlewares.ReadDecompressedMiddleware': 100 } # Enable or disable downloader middlewares diff --git a/tests/middlewares/test_kingfisher_transform_middleware.py b/tests/middlewares/test_kingfisher_transform_middleware.py index ce21be8a6..c8dbade05 100644 --- a/tests/middlewares/test_kingfisher_transform_middleware.py +++ b/tests/middlewares/test_kingfisher_transform_middleware.py @@ -7,7 +7,7 @@ from kingfisher_scrapy.base_spider import CompressedFileSpider, SimpleSpider from kingfisher_scrapy.items import File, FileError, FileItem from kingfisher_scrapy.middlewares import (AddPackageMiddleware, LineDelimitedMiddleware, ResizePackageMiddleware, - RootPathMiddleware) + RootPathMiddleware, ReadDecompressedMiddleware) from tests import response_fixture, spider_with_crawler @@ -16,6 +16,7 @@ LineDelimitedMiddleware, ResizePackageMiddleware, RootPathMiddleware, + ReadDecompressedMiddleware ]) @pytest.mark.parametrize('item', [ File({ @@ -187,3 +188,21 @@ def test_line_delimited_json_middleware_compressed(sample): 'data_type': 'release_package', 'encoding': 'utf-8' } + + +def test_read_decompressed_middleware(): + spider = spider_with_crawler(spider_class=CompressedFileSpider) + spider.data_type = 'release_package' + io = BytesIO() + with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile: + zipfile.writestr('test.json', '{}') + + middleware = ReadDecompressedMiddleware() + response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.zip'}) + generator = spider.parse(response) + item = next(generator) + + generator = middleware.process_spider_output(response, [item], spider) + transformed_item = list(generator) + assert len(transformed_item) == 1 + assert transformed_item[0]['data'] == b'{}' From 0ca5dee16b1cab4d179dd009e93d8990cf784bd4 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 11 Feb 2021 09:42:04 -0300 Subject: [PATCH 11/15] kingfisher store: add test for file items name Signed-off-by: Yohanna Lisnichuk --- tests/extensions/test_kingfisher_files_store.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/extensions/test_kingfisher_files_store.py b/tests/extensions/test_kingfisher_files_store.py index aa22b78f4..38ad3a7b8 100644 --- a/tests/extensions/test_kingfisher_files_store.py +++ b/tests/extensions/test_kingfisher_files_store.py @@ -84,4 +84,5 @@ def test_item_scraped_with_file_item(): extension = KingfisherFilesStore.from_crawler(spider.crawler) item = FileItem({'number': 1, 'file_name': 'file.json', 'data': 'data'}) - assert extension.item_scraped(item, spider) is None + extension.item_scraped(item, spider) + assert item['file_name'] == 'file-1.json' From 90120e5204aa9ca8d532e11761f978ef97344e58 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 11 Feb 2021 09:49:46 
From 90120e5204aa9ca8d532e11761f978ef97344e58 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Thu, 11 Feb 2021 09:49:46 -0300
Subject: [PATCH 12/15] isort

Signed-off-by: Yohanna Lisnichuk
---
 kingfisher_scrapy/base_spider.py | 2 +-
 tests/middlewares/test_kingfisher_transform_middleware.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index ed864fe08..254554a71 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -12,7 +12,7 @@
 from kingfisher_scrapy import util
 from kingfisher_scrapy.exceptions import MissingNextLinkError, SpiderArgumentError, UnknownArchiveFormatError
 from kingfisher_scrapy.items import File, FileError
-from kingfisher_scrapy.util import add_query_string, handle_http_error, get_file_name_and_extension
+from kingfisher_scrapy.util import add_query_string, get_file_name_and_extension, handle_http_error

 browser_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'  # noqa: E501

diff --git a/tests/middlewares/test_kingfisher_transform_middleware.py b/tests/middlewares/test_kingfisher_transform_middleware.py
index c8dbade05..7db870492 100644
--- a/tests/middlewares/test_kingfisher_transform_middleware.py
+++ b/tests/middlewares/test_kingfisher_transform_middleware.py
@@ -6,8 +6,8 @@

 from kingfisher_scrapy.base_spider import CompressedFileSpider, SimpleSpider
 from kingfisher_scrapy.items import File, FileError, FileItem
-from kingfisher_scrapy.middlewares import (AddPackageMiddleware, LineDelimitedMiddleware, ResizePackageMiddleware,
-                                           RootPathMiddleware, ReadDecompressedMiddleware)
+from kingfisher_scrapy.middlewares import (AddPackageMiddleware, LineDelimitedMiddleware, ReadDecompressedMiddleware,
+                                           ResizePackageMiddleware, RootPathMiddleware)
 from tests import response_fixture, spider_with_crawler

From e352c0303b084caeba95a128466356ae28a23de4 Mon Sep 17 00:00:00 2001
From: Yohanna Lisnichuk
Date: Thu, 11 Feb 2021 10:55:53 -0300
Subject: [PATCH 13/15] test compressed file spider: add assert for 'package'
 in data

Signed-off-by: Yohanna Lisnichuk
---
 tests/test_compressed_file_spider.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/test_compressed_file_spider.py b/tests/test_compressed_file_spider.py
index a336cac95..7f2e71c10 100644
--- a/tests/test_compressed_file_spider.py
+++ b/tests/test_compressed_file_spider.py
@@ -29,6 +29,7 @@ def test_parse():
     assert item['data_type'] == 'release_package'
     assert item['encoding'] == 'utf-8'
     assert item['data'] is not None
+    assert 'package' not in item['data']

     with pytest.raises(StopIteration):
@@ -60,6 +60,7 @@ def test_parse_line_delimited(sample, len_items):
     assert item['data_type'] == 'release_package'
     assert item['encoding'] == 'utf-8'
     assert item['data'] is not None
+    assert 'package' not in item['data']

     with pytest.raises(StopIteration):
@@ -129,6 +131,7 @@ def test_parse_rar_file():
     assert item['data_type'] == 'release_package'
     assert item['encoding'] == 'utf-8'
     assert item['data'] is not None
+    assert 'package' not in item['data']

     with pytest.raises(StopIteration):
         next(generator)

From 3208d8ec49dcac2b85f3e6d86f85559a66b6aeb9 Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Thu, 11 Feb 2021 09:53:58 -0500
Subject: [PATCH 14/15] tests: Use consistent code order and variable names
 with other tests

---
 .../test_kingfisher_transform_middleware.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tests/middlewares/test_kingfisher_transform_middleware.py b/tests/middlewares/test_kingfisher_transform_middleware.py
index 7db870492..9815006ca 100644
--- a/tests/middlewares/test_kingfisher_transform_middleware.py
+++ b/tests/middlewares/test_kingfisher_transform_middleware.py
@@ -193,16 +193,19 @@ def test_read_decompressed_middleware():
     spider = spider_with_crawler(spider_class=CompressedFileSpider)
     spider.data_type = 'release_package'
+
+    middleware = ReadDecompressedMiddleware()
+
     io = BytesIO()
     with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
         zipfile.writestr('test.json', '{}')

-    middleware = ReadDecompressedMiddleware()
     response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.zip'})
     generator = spider.parse(response)
     item = next(generator)

     generator = middleware.process_spider_output(response, [item], spider)
-    transformed_item = list(generator)
-    assert len(transformed_item) == 1
-    assert transformed_item[0]['data'] == b'{}'
+    transformed_items = list(generator)
+
+    assert len(transformed_items) == 1
+    assert transformed_items[0]['data'] == b'{}'

From 83fad6a318e88302bf2f7262ee1a9427ff1d1d5c Mon Sep 17 00:00:00 2001
From: James McKinney <26463+jpmckinney@users.noreply.github.com>
Date: Thu, 11 Feb 2021 10:47:22 -0500
Subject: [PATCH 15/15] middlewares: Change ReadDecompressedMiddleware to more
 generic ReadDataMiddleware

---
 kingfisher_scrapy/middlewares.py | 9 ++++-----
 kingfisher_scrapy/settings.py | 2 +-
 .../middlewares/test_kingfisher_transform_middleware.py | 6 +++---
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/kingfisher_scrapy/middlewares.py b/kingfisher_scrapy/middlewares.py
index 058deeb6f..add9f9c52 100644
--- a/kingfisher_scrapy/middlewares.py
+++ b/kingfisher_scrapy/middlewares.py
@@ -6,7 +6,6 @@
 import scrapy

 from kingfisher_scrapy import util
-from kingfisher_scrapy.base_spider import CompressedFileSpider
 from kingfisher_scrapy.items import File, FileItem
@@ -178,6 +177,7 @@ def process_spider_output(self, response, result, spider):
                 continue

             data = item['data']
+            # If the spider's ``root_path`` class attribute is non-empty, then the JSON data is already parsed.
             if not isinstance(data, dict):
                 data = json.loads(data, encoding=item['encoding'])
@@ -244,15 +244,14 @@ def _get_package_metadata(self, data, skip_key, data_type):
         return package


-class ReadDecompressedMiddleware:
+class ReadDataMiddleware:
     """
-    If the spider is a CompressedFileSpider that wasn't processed for other transform middlewares, reads the
-    decompressed file pointer.
+    If the item's ``data`` value is a file pointer, reads it.
     Otherwise, yields the original item.
""" def process_spider_output(self, response, result, spider): for item in result: - if not isinstance(item, File) or not isinstance(spider, CompressedFileSpider): + if not isinstance(item, File) or not hasattr(item['data'], 'read'): yield item continue item['data'] = item['data'].read() diff --git a/kingfisher_scrapy/settings.py b/kingfisher_scrapy/settings.py index a7efc5b03..edabd587b 100644 --- a/kingfisher_scrapy/settings.py +++ b/kingfisher_scrapy/settings.py @@ -64,7 +64,7 @@ 'kingfisher_scrapy.middlewares.RootPathMiddleware': 400, 'kingfisher_scrapy.middlewares.AddPackageMiddleware': 300, 'kingfisher_scrapy.middlewares.ResizePackageMiddleware': 200, - 'kingfisher_scrapy.middlewares.ReadDecompressedMiddleware': 100 + 'kingfisher_scrapy.middlewares.ReadDataMiddleware': 100 } # Enable or disable downloader middlewares diff --git a/tests/middlewares/test_kingfisher_transform_middleware.py b/tests/middlewares/test_kingfisher_transform_middleware.py index 9815006ca..7f19863a9 100644 --- a/tests/middlewares/test_kingfisher_transform_middleware.py +++ b/tests/middlewares/test_kingfisher_transform_middleware.py @@ -6,7 +6,7 @@ from kingfisher_scrapy.base_spider import CompressedFileSpider, SimpleSpider from kingfisher_scrapy.items import File, FileError, FileItem -from kingfisher_scrapy.middlewares import (AddPackageMiddleware, LineDelimitedMiddleware, ReadDecompressedMiddleware, +from kingfisher_scrapy.middlewares import (AddPackageMiddleware, LineDelimitedMiddleware, ReadDataMiddleware, ResizePackageMiddleware, RootPathMiddleware) from tests import response_fixture, spider_with_crawler @@ -16,7 +16,7 @@ LineDelimitedMiddleware, ResizePackageMiddleware, RootPathMiddleware, - ReadDecompressedMiddleware + ReadDataMiddleware, ]) @pytest.mark.parametrize('item', [ File({ @@ -194,7 +194,7 @@ def test_read_decompressed_middleware(): spider = spider_with_crawler(spider_class=CompressedFileSpider) spider.data_type = 'release_package' - middleware = ReadDecompressedMiddleware() + middleware = ReadDataMiddleware() io = BytesIO() with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile: