
Commit 50fe148

Merge 33547e6 into acd4210
jpmckinney committed Feb 27, 2021
2 parents acd4210 + 33547e6 commit 50fe148
Showing 5 changed files with 49 additions and 13 deletions.
7 changes: 7 additions & 0 deletions kingfisher_scrapy/base_spider.py
@@ -46,6 +46,8 @@ class attribute to the path to the OCDS data.
     If ``date_required`` is ``True``, or if either the ``from_date`` or ``until_date`` spider arguments are set, then
     ``from_date`` defaults to the ``default_from_date`` class attribute, and ``until_date`` defaults to the
     ``get_default_until_date()`` return value (which is the current time, by default).
+
+    If the spider needs to parse the JSON response in its ``parse`` method, set ``dont_truncate = True``.
     """
     VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S'}
 
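To make the docstring concrete: a hypothetical spider (not part of this commit; names below are illustrative) might combine the date handling described above with the new flag like so.

    from kingfisher_scrapy.base_spider import BaseSpider

    class ExampleSpider(BaseSpider):
        name = 'example'
        data_type = 'release_package'

        # With date_required set, from_date falls back to default_from_date,
        # and until_date to get_default_until_date() (the current time).
        date_required = True
        default_from_date = '2015-01-01'

        # Opt out of truncation because parse() reads the JSON body itself.
        dont_truncate = True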
@@ -56,6 +58,7 @@ class attribute to the path to the OCDS data.
     unflatten_args = {}
     line_delimited = False
     root_path = ''
+    dont_truncate = False
 
     def __init__(self, sample=None, note=None, from_date=None, until_date=None, crawl_time=None,
                  keep_collection_open=None, package_pointer=None, release_pointer=None, truncate=None, *args,
@@ -415,6 +418,10 @@ def next_link(self, response, **kwargs):
         """
         If the JSON response has a ``links.next`` key, returns a ``scrapy.Request`` for the URL.
         """
+        # If the sample size is 1, we don't want to parse the response, especially if --max-bytes is used.
+        if self.sample and self.sample == 1:
+            return
+
         data = response.json()
         url = resolve_pointer(data, self.next_pointer, None)
         if url:
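The early return above avoids calling `response.json()` when a crawl is run with `-a sample=1`, since under `--max-bytes` the body may be cut off mid-document and fail to parse. A standard-library sketch of that failure mode, with illustrative data:

    import json

    truncated_body = '{"links": {"next": "https://example.com/packages?page='  # cut off
    try:
        json.loads(truncated_body)
    except json.JSONDecodeError as e:
        print(e)  # e.g. Unterminated string starting at: line 1 column 20 (char 19)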
16 changes: 14 additions & 2 deletions kingfisher_scrapy/extensions.py
@@ -9,6 +9,7 @@
 from scrapy.exceptions import NotConfigured, StopDownload
 
 from kingfisher_scrapy import util
+from kingfisher_scrapy.base_spider import CompressedFileSpider
 from kingfisher_scrapy.items import File, FileError, FileItem, PluckedItem
 from kingfisher_scrapy.kingfisher_process import Client
 from kingfisher_scrapy.util import _pluck_filename, get_file_name_and_extension
@@ -39,8 +40,19 @@ def from_crawler(cls, crawler):
         return extension
 
     def bytes_received(self, data, request, spider):
-        # We only limit the bytes received for final requests (i.e. where the callback is the default `parse` method).
-        if not spider.pluck or request.callback or request.meta['file_name'].endswith(('.rar', '.zip')):
+        if (
+            not spider.pluck
+            or spider.dont_truncate
+            # We only limit bytes received for final requests (i.e. where the callback is the default `parse` method).
+            or request.callback
+            # ijson will parse the value at `root_path`, which can go to the end of the file.
+            # https://github.com/ICRAR/ijson/issues/43
+            or spider.root_path
+            # XLSX files must be read in full.
+            or spider.unflatten
+            # ZIP and RAR files must be read in full.
+            or isinstance(spider, CompressedFileSpider)
+        ):
             return
 
         self.bytes_received_counts[spider.name] += len(data)
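The rewritten guard returns early for any request whose body must be read in full; for everything else, the handler counts bytes and stops the download at the limit. That tail end of the method is unchanged and not shown in this diff, but judging from the `StopDownload` import and the tests below, it plausibly follows the standard Scrapy `bytes_received` pattern sketched here (an approximation, not the verbatim source):

    from collections import defaultdict

    from scrapy.exceptions import StopDownload

    class PluckTruncationSketch:
        """Illustration only: the shape suggested by the StopDownload import and the tests."""

        def __init__(self, max_bytes):
            self.max_bytes = max_bytes
            self.bytes_received_counts = defaultdict(int)

        def bytes_received(self, data, request, spider):
            # ... the guard shown in the diff above would return early here ...
            self.bytes_received_counts[spider.name] += len(data)
            if self.bytes_received_counts[spider.name] >= self.max_bytes:
                # fail=False stops the download cleanly, so the truncated body
                # still reaches the spider callback for plucking.
                raise StopDownload(fail=False)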
17 changes: 13 additions & 4 deletions kingfisher_scrapy/pipelines.py
@@ -89,12 +89,21 @@ def process_item(self, item, spider):
                 value = _resolve_pointer(item['data'], pointer)
             else:
                 try:
-                    value = next(ijson.items(item['data'], pointer.replace('/', '.')[1:]))
+                    value = next(ijson.items(item['data'], pointer[1:].replace('/', '.')))
                 except StopIteration:
                     value = f'error: {pointer} not found'
-                # The JSON text can be truncated by a `bytes_received` handler.
-                except ijson.common.IncompleteJSONError:
-                    value = f'error: {pointer} not found within initial bytes'
+                except ijson.common.IncompleteJSONError as e:
+                    message = str(e).split('\n', 1)[0]
+                    if message.endswith((
+                        # The JSON text can be truncated by a `bytes_received` handler.
+                        'premature EOF',
+                        # These messages occur if the JSON text is truncated at `"\\u` or `"\\`.
+                        r"lexical error: invalid (non-hex) character occurs after '\u' inside string.",
+                        r"lexical error: inside a string, '\' occurs before a character which it may not.",
+                    )):
+                        value = f'error: {pointer} not found within initial bytes'
+                    else:
+                        raise
         else:  # spider.release_pointer
             if isinstance(item['data'], dict):
                 data = item['data']
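On the first changed line, the two expressions are equivalent for pointers that start with `/`; the new order simply reads as the intent: drop the leading slash, then turn JSON Pointer segments into ijson's dotted prefix. A small sketch with a hypothetical pointer and payload:

    import ijson

    pointer = '/publisher/name'             # hypothetical package pointer
    prefix = pointer[1:].replace('/', '.')  # -> 'publisher.name'

    value = next(ijson.items(b'{"publisher": {"name": "Acme"}}', prefix))
    assert value == 'Acme'

    # If a bytes_received handler truncated the body, ijson raises
    # IncompleteJSONError instead of yielding a value.
    try:
        next(ijson.items(b'{"publisher": {"name": "Ac', prefix))
    except ijson.common.IncompleteJSONError as e:
        print(str(e).split('\n', 1)[0])  # e.g. parse error: premature EOF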
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/paraguay_hacienda.py
@@ -37,6 +37,7 @@ class ParaguayHacienda(BaseSpider):
     release_ids = []
     request_time_limit = 14.0
     data_type = 'release_package'
+    dont_truncate = True
 
     @classmethod
     def from_crawler(cls, crawler, *args, **kwargs):
21 changes: 14 additions & 7 deletions tests/extensions/test_kingfisher_pluck.py
@@ -6,6 +6,7 @@
 from scrapy import Request
 from scrapy.exceptions import StopDownload
 
+from kingfisher_scrapy.base_spider import BaseSpider, CompressedFileSpider
 from kingfisher_scrapy.extensions import KingfisherPluck
 from kingfisher_scrapy.items import PluckedItem
 from tests import spider_with_crawler
@@ -99,15 +100,21 @@ def test_bytes_received_dont_stop_download():
     assert extension.max_bytes == 10
 
 
-@pytest.mark.parametrize('test_request', [
-    Request('http://example.com', callback=lambda item: item, meta={'file_name': 'test.json'}),
-    Request('http://example.com', meta={'file_name': 'test.rar'}),
-    Request('http://example.com', meta={'file_name': 'test.zip'}),
+@pytest.mark.parametrize('test_request,spider_class,attributes', [
+    (Request('http://example.com', callback=lambda item: item, meta={'file_name': 'test.json'}), BaseSpider, {}),
+    (Request('http://example.com', meta={'file_name': 'test.rar'}), CompressedFileSpider, {}),
+    (Request('http://example.com', meta={'file_name': 'test.zip'}), CompressedFileSpider, {}),
+    (Request('http://example.com', meta={'file_name': 'test.xlsx'}), BaseSpider, {'unflatten': True}),
+    (Request('http://example.com', meta={'file_name': 'test.json'}), BaseSpider, {'root_path': 'item'}),
+    (Request('http://example.com', meta={'file_name': 'test.json'}), BaseSpider, {'dont_truncate': True}),
 ])
-def test_bytes_received_ignored_requests(test_request):
+def test_bytes_received_ignored_requests(test_request, spider_class, attributes):
     with TemporaryDirectory() as tmpdirname:
-        spider = spider_with_crawler(settings={'KINGFISHER_PLUCK_PATH': tmpdirname,
-                                               'KINGFISHER_PLUCK_MAX_BYTES': 10}, release_pointer='/date')
+        spider = spider_with_crawler(spider_class=spider_class, release_pointer='/date',
+                                     settings={'KINGFISHER_PLUCK_PATH': tmpdirname, 'KINGFISHER_PLUCK_MAX_BYTES': 10})
+        for attr, value in attributes.items():
+            setattr(spider, attr, value)
 
         extension = KingfisherPluck.from_crawler(spider.crawler)
 
         extension.bytes_received(data=b'12345', spider=spider, request=test_request)
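Each parametrized case above exercises one branch of the new guard in extensions.py. The hunk is cut off here; the test presumably goes on to assert that nothing was counted for these ignored requests, along the lines of (assumed, not shown in the diff):

    assert extension.bytes_received_counts[spider.name] == 0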
