diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst
index f59367cc3..65dfb7d8e 100644
--- a/docs/contributing/index.rst
+++ b/docs/contributing/index.rst
@@ -55,6 +55,7 @@ Since many class attributes that control a spider's behavior, please put the cla
     unflatten_args = {}
     line_delimited = True
     root_path = 'item'
+    root_path_max_length = 1
     skip_pluck = 'A reason'
 
     # SimpleSpider
diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index a56b4c580..254554a71 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -12,7 +12,7 @@ from kingfisher_scrapy import util
 from kingfisher_scrapy.exceptions import MissingNextLinkError, SpiderArgumentError, UnknownArchiveFormatError
 from kingfisher_scrapy.items import File, FileError
-from kingfisher_scrapy.util import add_query_string, handle_http_error
+from kingfisher_scrapy.util import add_query_string, get_file_name_and_extension, handle_http_error
 
 browser_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'  # noqa: E501
 
@@ -38,6 +38,8 @@ class BaseSpider(scrapy.Spider):
       If you need to set more arguments for the unflatten command, set a ``unflatten_args`` dict with them.
     - If the data is not formatted as OCDS (record, release, record package or release package), set a ``root_path``
       class attribute to the path to the OCDS data.
+    - If the JSON file is line-delimited and the root path points to a JSON array, set a ``root_path_max_length``
+      class attribute to the maximum length of that array.
     - If the data is line-delimited JSON, add a ``line_delimited = True`` class attribute.
 
     If ``date_required`` is ``True``, or if either the ``from_date`` or ``until_date`` spider arguments are set, then
@@ -314,8 +316,7 @@ def start_requests(self):
 
     @handle_http_error
     def parse(self, response):
-        archive_name, archive_format = os.path.splitext(response.request.meta['file_name'])
-        archive_format = archive_format[1:].lower()
+        archive_name, archive_format = get_file_name_and_extension(response.request.meta['file_name'])
 
         if archive_format == 'zip':
             cls = ZipFile
diff --git a/kingfisher_scrapy/extensions.py b/kingfisher_scrapy/extensions.py
index 68733258e..abf2cfadf 100644
--- a/kingfisher_scrapy/extensions.py
+++ b/kingfisher_scrapy/extensions.py
@@ -10,7 +10,7 @@ from kingfisher_scrapy import util
 from kingfisher_scrapy.items import File, FileError, FileItem, PluckedItem
 from kingfisher_scrapy.kingfisher_process import Client
-from kingfisher_scrapy.util import _pluck_filename
+from kingfisher_scrapy.util import _pluck_filename, get_file_name_and_extension
 
 
 # https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
@@ -66,7 +66,7 @@ def from_crawler(cls, crawler):
 
     def item_scraped(self, item, spider):
         """
-        If the item is a file, writes its data to the filename in the crawl's directory.
+        If the item is a File or FileItem, writes its data to the filename in the crawl's directory.
 
         Returns a dict with the metadata.
""" @@ -80,7 +80,8 @@ def item_scraped(self, item, spider): file_name = item['file_name'] if isinstance(item, FileItem): - file_name += f"-{item['number']}" + name, extension = get_file_name_and_extension(file_name) + file_name = f"{name}-{item['number']}.{extension}" path = os.path.join(name, spider.get_start_time('%Y%m%d_%H%M%S'), file_name) @@ -88,6 +89,7 @@ def item_scraped(self, item, spider): item['path'] = path item['files_store'] = self.directory + item['file_name'] = file_name def _write_file(self, path, data): path = os.path.join(self.directory, path) diff --git a/kingfisher_scrapy/middlewares.py b/kingfisher_scrapy/middlewares.py index bb200ea6f..ea93ca9b9 100644 --- a/kingfisher_scrapy/middlewares.py +++ b/kingfisher_scrapy/middlewares.py @@ -111,6 +111,7 @@ def process_spider_output(self, response, result, spider): continue data = item['data'] + # Data can be bytes or a file-like object. if isinstance(data, bytes): data = data.decode(encoding=item['encoding']).splitlines(True) @@ -194,6 +195,7 @@ def process_spider_output(self, response, result, spider): continue data = item['data'] + # If the spider's ``root_path`` class attribute is non-empty, then the JSON data is already parsed. if not isinstance(data, dict): data = json.loads(data, encoding=item['encoding']) @@ -258,3 +260,17 @@ def _get_package_metadata(self, data, skip_key, data_type): for item in util.items(ijson.parse(data), '', skip_key=skip_key): package.update(item) return package + + +class ReadDataMiddleware: + """ + If the item's ``data`` value is a file pointer, reads it. + Otherwise, yields the original item. + """ + def process_spider_output(self, response, result, spider): + for item in result: + if not isinstance(item, File) or not hasattr(item['data'], 'read'): + yield item + continue + item['data'] = item['data'].read() + yield item diff --git a/kingfisher_scrapy/settings.py b/kingfisher_scrapy/settings.py index 9d8cc2812..027c546be 100644 --- a/kingfisher_scrapy/settings.py +++ b/kingfisher_scrapy/settings.py @@ -63,7 +63,8 @@ 'kingfisher_scrapy.middlewares.LineDelimitedMiddleware': 500, 'kingfisher_scrapy.middlewares.RootPathMiddleware': 400, 'kingfisher_scrapy.middlewares.AddPackageMiddleware': 300, - 'kingfisher_scrapy.middlewares.ResizePackageMiddleware': 200 + 'kingfisher_scrapy.middlewares.ResizePackageMiddleware': 200, + 'kingfisher_scrapy.middlewares.ReadDataMiddleware': 100 } # Enable or disable downloader middlewares diff --git a/kingfisher_scrapy/spiders/colombia_bulk.py b/kingfisher_scrapy/spiders/colombia_bulk.py index 2df590c58..b844fe7f4 100644 --- a/kingfisher_scrapy/spiders/colombia_bulk.py +++ b/kingfisher_scrapy/spiders/colombia_bulk.py @@ -20,6 +20,7 @@ class ColombiaBulk(CompressedFileSpider): # BaseSpider line_delimited = True root_path = 'Release' + root_path_max_length = 1 # SimpleSpider data_type = 'release' diff --git a/kingfisher_scrapy/util.py b/kingfisher_scrapy/util.py index 3923fee5a..f479655c2 100644 --- a/kingfisher_scrapy/util.py +++ b/kingfisher_scrapy/util.py @@ -1,5 +1,6 @@ import itertools import json +import os from datetime import date from decimal import Decimal from functools import wraps @@ -189,3 +190,14 @@ def default(obj): def grouper(iterable, n, fillvalue=None): args = [iter(iterable)] * n return itertools.zip_longest(*args, fillvalue=fillvalue) + + +def get_file_name_and_extension(filename): + """ + Given a ``filename`` returns its name and extension in two separate strings + >>> get_file_name_and_extension('test.json') + 'test', 'json' + """ + 
+    name, extension = os.path.splitext(filename)
+    extension = extension[1:].lower()
+    return name, extension
diff --git a/tests/extensions/test_kingfisher_files_store.py b/tests/extensions/test_kingfisher_files_store.py
index aa22b78f4..38ad3a7b8 100644
--- a/tests/extensions/test_kingfisher_files_store.py
+++ b/tests/extensions/test_kingfisher_files_store.py
@@ -84,4 +84,5 @@ def test_item_scraped_with_file_item():
     extension = KingfisherFilesStore.from_crawler(spider.crawler)
     item = FileItem({'number': 1, 'file_name': 'file.json', 'data': 'data'})
 
-    assert extension.item_scraped(item, spider) is None
+    extension.item_scraped(item, spider)
+    assert item['file_name'] == 'file-1.json'
diff --git a/tests/middlewares/test_kingfisher_transform_middleware.py b/tests/middlewares/test_kingfisher_transform_middleware.py
index ce21be8a6..7f19863a9 100644
--- a/tests/middlewares/test_kingfisher_transform_middleware.py
+++ b/tests/middlewares/test_kingfisher_transform_middleware.py
@@ -6,8 +6,8 @@
 from kingfisher_scrapy.base_spider import CompressedFileSpider, SimpleSpider
 from kingfisher_scrapy.items import File, FileError, FileItem
-from kingfisher_scrapy.middlewares import (AddPackageMiddleware, LineDelimitedMiddleware, ResizePackageMiddleware,
-                                           RootPathMiddleware)
+from kingfisher_scrapy.middlewares import (AddPackageMiddleware, LineDelimitedMiddleware, ReadDataMiddleware,
+                                           ResizePackageMiddleware, RootPathMiddleware)
 from tests import response_fixture, spider_with_crawler
 
 
@@ -16,6 +16,7 @@
     LineDelimitedMiddleware,
     ResizePackageMiddleware,
     RootPathMiddleware,
+    ReadDataMiddleware,
 ])
 @pytest.mark.parametrize('item', [
     File({
@@ -187,3 +188,24 @@ def test_line_delimited_json_middleware_compressed(sample):
         'data_type': 'release_package',
         'encoding': 'utf-8'
     }
+
+
+def test_read_decompressed_middleware():
+    spider = spider_with_crawler(spider_class=CompressedFileSpider)
+    spider.data_type = 'release_package'
+
+    middleware = ReadDataMiddleware()
+
+    io = BytesIO()
+    with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
+        zipfile.writestr('test.json', '{}')
+
+    response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.zip'})
+    generator = spider.parse(response)
+    item = next(generator)
+
+    generator = middleware.process_spider_output(response, [item], spider)
+    transformed_items = list(generator)
+
+    assert len(transformed_items) == 1
+    assert transformed_items[0]['data'] == b'{}'
diff --git a/tests/test_compressed_file_spider.py b/tests/test_compressed_file_spider.py
index 72f0fc9c4..7f2e71c10 100644
--- a/tests/test_compressed_file_spider.py
+++ b/tests/test_compressed_file_spider.py
@@ -28,8 +28,8 @@ def test_parse():
     assert item['url'] == 'http://example.com'
     assert item['data_type'] == 'release_package'
     assert item['encoding'] == 'utf-8'
-    assert 'package' not in item['data']
     assert item['data'] is not None
+    assert 'package' not in item['data']
 
     with pytest.raises(StopIteration):
         next(generator)
@@ -59,8 +59,8 @@ def test_parse_line_delimited(sample, len_items):
     assert item['url'] == 'http://example.com'
     assert item['data_type'] == 'release_package'
     assert item['encoding'] == 'utf-8'
-    assert 'package' not in item['data']
     assert item['data'] is not None
+    assert 'package' not in item['data']
 
     with pytest.raises(StopIteration):
         next(generator)
@@ -130,8 +130,8 @@ def test_parse_rar_file():
     assert item['url'] == 'http://example.com'
     assert item['data_type'] == 'release_package'
     assert item['encoding'] == 'utf-8'
-    assert 'package' not in item['data']
     assert item['data'] is not None
+    assert 'package' not in item['data']
 
     with pytest.raises(StopIteration):
         next(generator)
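
Reviewer note: the sketch below is not part of the patch. It restates, with standard-library stand-ins (the names ``file_item_name`` and ``read_data`` are hypothetical), the two behaviors the diff introduces: a FileItem's number is now spliced in before the file extension instead of appended after it, and ReadDataMiddleware drains file-like ``data`` values into bytes while passing bytes through untouched.

import os
from io import BytesIO


def get_file_name_and_extension(filename):
    # Mirrors the new kingfisher_scrapy.util helper.
    name, extension = os.path.splitext(filename)
    return name, extension[1:].lower()


def file_item_name(file_name, number):
    # Hypothetical helper: the old code appended the number after the
    # extension ('file.json-1'); the new code yields 'file-1.json'.
    name, extension = get_file_name_and_extension(file_name)
    return f'{name}-{number}.{extension}'


def read_data(items):
    # Same idea as ReadDataMiddleware.process_spider_output, minus Scrapy:
    # file-like values are read into bytes, everything else passes through.
    # (The real middleware also checks that the item is a File.)
    for item in items:
        if hasattr(item['data'], 'read'):
            item['data'] = item['data'].read()
        yield item


assert get_file_name_and_extension('test.json') == ('test', 'json')
assert file_item_name('file.json', 1) == 'file-1.json'
assert [i['data'] for i in read_data([{'data': BytesIO(b'{}')}, {'data': b'{}'}])] == [b'{}', b'{}']

Giving ReadDataMiddleware a priority of 100 places it closest to the engine, so it appears intended to run last on spider output, after ResizePackageMiddleware and the other middlewares have had a chance to consume the open file object as a stream.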