Commit 7999b75
Merge f7e2cc7 into 00959ea
yolile committed Feb 11, 2021
2 parents 00959ea + f7e2cc7
Showing 10 changed files with 83 additions and 34 deletions.
1 change: 1 addition & 0 deletions docs/contributing/index.rst
@@ -55,6 +55,7 @@ Since many class attributes control a spider's behavior, please put the cla
unflatten_args = {}
line_delimited = True
root_path = 'item'
root_path_max_length = 1
skip_pluck = 'A reason'
# SimpleSpider
7 changes: 4 additions & 3 deletions kingfisher_scrapy/base_spider.py
@@ -12,7 +12,7 @@
from kingfisher_scrapy import util
from kingfisher_scrapy.exceptions import MissingNextLinkError, SpiderArgumentError, UnknownArchiveFormatError
from kingfisher_scrapy.items import File, FileError
from kingfisher_scrapy.util import add_query_string, handle_http_error
from kingfisher_scrapy.util import add_query_string, get_file_name_and_extension, handle_http_error

browser_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36' # noqa: E501

@@ -38,6 +38,8 @@ class BaseSpider(scrapy.Spider):
If you need to set more arguments for the unflatten command, set an ``unflatten_args`` dict with them.
- If the data is not formatted as OCDS (record, release, record package or release package), set a ``root_path``
class attribute to the path to the OCDS data.
- If the JSON file is line-delimited and the root path is to a JSON array, set a ``root_path_max_length`` class
attribute to the maximum length of the JSON array at the root path.
- If the data is line-delimited JSON, add a ``line_delimited = True`` class attribute.
If ``date_required`` is ``True``, or if either the ``from_date`` or ``until_date`` spider arguments are set, then
@@ -314,8 +316,7 @@ def start_requests(self):

    @handle_http_error
    def parse(self, response):
        archive_name, archive_format = os.path.splitext(response.request.meta['file_name'])
        archive_format = archive_format[1:].lower()
        archive_name, archive_format = get_file_name_and_extension(response.request.meta['file_name'])

        if archive_format == 'zip':
            cls = ZipFile
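To make the attribute documentation above concrete, here is a minimal sketch of a spider that combines these class attributes, modeled on the docs/contributing example at the top of this diff; the spider name and data type are illustrative, not part of this commit:

from kingfisher_scrapy.base_spider import SimpleSpider


class ExampleLineDelimitedSpider(SimpleSpider):
    name = 'example_line_delimited'

    # BaseSpider
    line_delimited = True
    root_path = 'item'
    root_path_max_length = 1

    # SimpleSpider
    data_type = 'release'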
13 changes: 7 additions & 6 deletions kingfisher_scrapy/extensions.py
@@ -10,7 +10,7 @@
from kingfisher_scrapy import util
from kingfisher_scrapy.items import File, FileError, FileItem, PluckedItem
from kingfisher_scrapy.kingfisher_process import Client
from kingfisher_scrapy.util import _pluck_filename
from kingfisher_scrapy.util import _pluck_filename, get_file_name_and_extension


# https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
@@ -66,23 +66,24 @@ def from_crawler(cls, crawler):

    def item_scraped(self, item, spider):
        """
        If the item is a file, writes its data to the filename in the crawl's directory.
        If the item is a File or FileItem, writes its data to the filename in the crawl's directory.
        Returns a dict with the metadata.
        """
        if not isinstance(item, (File, FileItem)):
            return

        # The crawl's relative directory, in the format `<spider_name>[_sample]/<YYYYMMDD_HHMMSS>`.
        name = spider.name
        directory = spider.name
        if spider.sample:
            name += '_sample'
            directory += '_sample'

        file_name = item['file_name']
        if isinstance(item, FileItem):
            file_name += f"-{item['number']}"
            name, extension = get_file_name_and_extension(file_name)
            file_name = f"{name}-{item['number']}.{extension}"

        path = os.path.join(name, spider.get_start_time('%Y%m%d_%H%M%S'), file_name)
        path = os.path.join(directory, spider.get_start_time('%Y%m%d_%H%M%S'), file_name)

        self._write_file(path, item['data'])

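For illustration, a minimal sketch of the path that item_scraped now builds, using the same values as the tests further down (a spider named 'test' in sample mode, start time 2001-02-03 04:05:06, and a FileItem numbered 1):

import os

directory = 'test' + '_sample'
name, extension = 'file', 'json'  # get_file_name_and_extension('file.json')
file_name = f"{name}-{1}.{extension}"
path = os.path.join(directory, '20010203_040506', file_name)
assert path == os.path.join('test_sample', '20010203_040506', 'file-1.json')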
16 changes: 16 additions & 0 deletions kingfisher_scrapy/middlewares.py
@@ -111,6 +111,7 @@ def process_spider_output(self, response, result, spider):
                continue

            data = item['data']

            # Data can be bytes or a file-like object.
            if isinstance(data, bytes):
                data = data.decode(encoding=item['encoding']).splitlines(True)
@@ -194,6 +195,7 @@ def process_spider_output(self, response, result, spider):
                continue

            data = item['data']

            # If the spider's ``root_path`` class attribute is non-empty, then the JSON data is already parsed.
            if not isinstance(data, dict):
                data = json.loads(data, encoding=item['encoding'])
@@ -258,3 +260,17 @@ def _get_package_metadata(self, data, skip_key, data_type):
        for item in util.items(ijson.parse(data), '', skip_key=skip_key):
            package.update(item)
        return package
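For context, a rough, self-contained sketch of what this metadata pass accomplishes — collecting package-level fields while skipping the bulky array under ``skip_key`` — written directly against ijson rather than the ``util.items`` helper; the sample document is illustrative:

from io import BytesIO

import ijson

data = BytesIO(b'{"uri": "http://example.com", "releases": [{"ocid": "x"}]}')
package = {}
for prefix, event, value in ijson.parse(data):
    if prefix.startswith('releases'):  # the skip_key
        continue
    if prefix and event in ('string', 'number', 'boolean', 'null'):
        package[prefix] = value
assert package == {'uri': 'http://example.com'}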


class ReadDataMiddleware:
    """
    If the item's ``data`` value is a file-like object, reads it into bytes.
    Otherwise, yields the original item unchanged.
    """
    def process_spider_output(self, response, result, spider):
        for item in result:
            if not isinstance(item, File) or not hasattr(item['data'], 'read'):
                yield item
                continue
            item['data'] = item['data'].read()
            yield item
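A minimal usage sketch for the new middleware, assuming an item whose ``data`` is a file-like object; since the method uses neither the response nor the spider argument, ``None`` placeholders suffice here:

from io import BytesIO

from kingfisher_scrapy.items import File
from kingfisher_scrapy.middlewares import ReadDataMiddleware

middleware = ReadDataMiddleware()
item = File({'file_name': 'test.json', 'data': BytesIO(b'{}')})
transformed = next(middleware.process_spider_output(None, [item], None))
assert transformed['data'] == b'{}'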
3 changes: 2 additions & 1 deletion kingfisher_scrapy/settings.py
@@ -63,7 +63,8 @@
    'kingfisher_scrapy.middlewares.LineDelimitedMiddleware': 500,
    'kingfisher_scrapy.middlewares.RootPathMiddleware': 400,
    'kingfisher_scrapy.middlewares.AddPackageMiddleware': 300,
    'kingfisher_scrapy.middlewares.ResizePackageMiddleware': 200
    'kingfisher_scrapy.middlewares.ResizePackageMiddleware': 200,
    'kingfisher_scrapy.middlewares.ReadDataMiddleware': 100
}
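Note that Scrapy calls process_spider_output() in decreasing order of these values, so ReadDataMiddleware (100) runs last, converting any file-like ``data`` left by the earlier middlewares into bytes before items reach the engine and the item pipelines.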

# Enable or disable downloader middlewares
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/colombia_bulk.py
@@ -20,6 +20,7 @@ class ColombiaBulk(CompressedFileSpider):
    # BaseSpider
    line_delimited = True
    root_path = 'Release'
    root_path_max_length = 1

    # SimpleSpider
    data_type = 'release'
12 changes: 12 additions & 0 deletions kingfisher_scrapy/util.py
@@ -3,6 +3,7 @@
from datetime import date
from decimal import Decimal
from functools import wraps
from os.path import splitext
from urllib.parse import parse_qs, urlencode, urlsplit

from ijson import ObjectBuilder, utils
@@ -189,3 +190,14 @@ def default(obj):
def grouper(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)


def get_file_name_and_extension(filename):
    """
    Given a ``filename``, returns its name and extension as two separate strings.

    >>> get_file_name_and_extension('test.json')
    ('test', 'json')
    """
    name, extension = splitext(filename)
    extension = extension[1:].lower()
    return name, extension
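For comparison with the standard library, which keeps the leading dot and the original case, an illustrative session (the file name is hypothetical):

>>> from os.path import splitext
>>> splitext('ARCHIVE.ZIP')
('ARCHIVE', '.ZIP')
>>> get_file_name_and_extension('ARCHIVE.ZIP')
('ARCHIVE', 'zip')

This is what lets parse() in base_spider.py compare the extension directly against 'zip'.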
32 changes: 13 additions & 19 deletions tests/extensions/test_kingfisher_files_store.py
@@ -6,7 +6,7 @@
from scrapy.exceptions import NotConfigured

from kingfisher_scrapy.extensions import KingfisherFilesStore
from kingfisher_scrapy.items import FileItem
from kingfisher_scrapy.items import File, FileItem
from tests import spider_with_crawler, spider_with_files_store


@@ -44,24 +44,28 @@ def test_item_scraped_with_build_file_from_response(sample, path, tmpdir):
    assert item['files_store'] == tmpdir


@pytest.mark.parametrize('sample,path', [
    (None, os.path.join('test', '20010203_040506', 'file.json')),
    ('true', os.path.join('test_sample', '20010203_040506', 'file.json')),
@pytest.mark.parametrize('sample,directory', [
    (None, os.path.join('test', '20010203_040506')),
    ('true', os.path.join('test_sample', '20010203_040506')),
])
@pytest.mark.parametrize('data', [b'{"key": "value"}', {"key": "value"}])
def test_item_scraped_with_build_file(sample, path, data, tmpdir):
@pytest.mark.parametrize('item,expected_file_name', [
    (File({'file_name': 'file.json', 'encoding': 'iso-8859-1'}), 'file.json'),
    (FileItem({'number': 1, 'file_name': 'file.json'}), 'file-1.json')
])
def test_item_scraped_with_file_and_file_item(sample, directory, data, item, expected_file_name, tmpdir):
    spider = spider_with_files_store(tmpdir, sample=sample)
    extension = KingfisherFilesStore.from_crawler(spider.crawler)

    item = spider.build_file(file_name='file.json', url='https://example.com/remote.json', data=data,
                             data_type='release_package', encoding='iso-8859-1')
    path = os.path.join(directory, expected_file_name)
    original_file_name = item['file_name']
    item['data'] = data
    extension.item_scraped(item, spider)

    with open(tmpdir.join(path)) as f:
        assert f.read() == '{"key": "value"}'

    assert item['path'] == path
    assert item['files_store'] == tmpdir
    assert item['file_name'] == original_file_name


@@ -75,13 +79,3 @@ def test_item_scraped_with_build_file_and_existing_directory():

    # No FileExistsError exception.
    extension.item_scraped(item, spider)


def test_item_scraped_with_file_item():
    with TemporaryDirectory() as tmpdirname:
        files_store = os.path.join(tmpdirname, 'data')
        spider = spider_with_crawler(settings={'FILES_STORE': files_store})
        extension = KingfisherFilesStore.from_crawler(spider.crawler)
        item = FileItem({'number': 1, 'file_name': 'file.json', 'data': 'data'})

        assert extension.item_scraped(item, spider) is None
26 changes: 24 additions & 2 deletions tests/middlewares/test_kingfisher_transform_middleware.py
@@ -6,8 +6,8 @@

from kingfisher_scrapy.base_spider import CompressedFileSpider, SimpleSpider
from kingfisher_scrapy.items import File, FileError, FileItem
from kingfisher_scrapy.middlewares import (AddPackageMiddleware, LineDelimitedMiddleware, ResizePackageMiddleware,
                                           RootPathMiddleware)
from kingfisher_scrapy.middlewares import (AddPackageMiddleware, LineDelimitedMiddleware, ReadDataMiddleware,
                                           ResizePackageMiddleware, RootPathMiddleware)
from tests import response_fixture, spider_with_crawler


@@ -16,6 +16,7 @@
    LineDelimitedMiddleware,
    ResizePackageMiddleware,
    RootPathMiddleware,
    ReadDataMiddleware,
])
@pytest.mark.parametrize('item', [
    File({
@@ -187,3 +188,24 @@ def test_line_delimited_json_middleware_compressed(sample):
        'data_type': 'release_package',
        'encoding': 'utf-8'
    }


def test_read_decompressed_middleware():
    spider = spider_with_crawler(spider_class=CompressedFileSpider)
    spider.data_type = 'release_package'

    middleware = ReadDataMiddleware()

    io = BytesIO()
    with ZipFile(io, 'w', compression=ZIP_DEFLATED) as zipfile:
        zipfile.writestr('test.json', '{}')

    response = response_fixture(body=io.getvalue(), meta={'file_name': 'test.zip'})
    generator = spider.parse(response)
    item = next(generator)

    generator = middleware.process_spider_output(response, [item], spider)
    transformed_items = list(generator)

    assert len(transformed_items) == 1
    assert transformed_items[0]['data'] == b'{}'
6 changes: 3 additions & 3 deletions tests/test_compressed_file_spider.py
@@ -28,8 +28,8 @@ def test_parse():
    assert item['url'] == 'http://example.com'
    assert item['data_type'] == 'release_package'
    assert item['encoding'] == 'utf-8'
    assert 'package' not in item['data']
    assert item['data'] is not None
    assert 'package' not in item['data']

    with pytest.raises(StopIteration):
        next(generator)
@@ -59,8 +59,8 @@ def test_parse_line_delimited(sample, len_items):
    assert item['url'] == 'http://example.com'
    assert item['data_type'] == 'release_package'
    assert item['encoding'] == 'utf-8'
    assert 'package' not in item['data']
    assert item['data'] is not None
    assert 'package' not in item['data']

    with pytest.raises(StopIteration):
        next(generator)
@@ -130,8 +130,8 @@ def test_parse_rar_file():
    assert item['url'] == 'http://example.com'
    assert item['data_type'] == 'release_package'
    assert item['encoding'] == 'utf-8'
    assert 'package' not in item['data']
    assert item['data'] is not None
    assert 'package' not in item['data']

    with pytest.raises(StopIteration):
        next(generator)
