
Commit 50fe148

Merge 33547e6 into acd4210
jpmckinney committed Feb 27, 2021
2 parents acd4210 + 33547e6 commit 50fe148
Showing 5 changed files with 49 additions and 13 deletions.
7 changes: 7 additions & 0 deletions kingfisher_scrapy/base_spider.py
@@ -46,6 +46,8 @@ class attribute to the path to the OCDS data.
     If ``date_required`` is ``True``, or if either the ``from_date`` or ``until_date`` spider arguments are set, then
     ``from_date`` defaults to the ``default_from_date`` class attribute, and ``until_date`` defaults to the
     ``get_default_until_date()`` return value (which is the current time, by default).
+
+    If the spider needs to parse the JSON response in its ``parse`` method, set ``dont_truncate = True``.
     """
     VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S'}
 
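To make the docstring concrete: a hypothetical spider (not part of this commit; names below are illustrative) might combine the date handling described above with the new flag like so.

    from kingfisher_scrapy.base_spider import BaseSpider

    class ExampleSpider(BaseSpider):
        name = 'example'
        data_type = 'release_package'

        # With date_required set, from_date falls back to default_from_date,
        # and until_date to get_default_until_date() (the current time).
        date_required = True
        default_from_date = '2015-01-01'

        # Opt out of truncation because parse() reads the JSON body itself.
        dont_truncate = True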
@@ -56,6 +58,7 @@ class attribute to the path to the OCDS data.
     unflatten_args = {}
     line_delimited = False
     root_path = ''
+    dont_truncate = False
 
     def __init__(self, sample=None, note=None, from_date=None, until_date=None, crawl_time=None,
                  keep_collection_open=None, package_pointer=None, release_pointer=None, truncate=None, *args,
@@ -415,6 +418,10 @@ def next_link(self, response, **kwargs):
         """
         If the JSON response has a ``links.next`` key, returns a ``scrapy.Request`` for the URL.
         """
+        # If the sample size is 1, we don't want to parse the response, especially if --max-bytes is used.
+        if self.sample and self.sample == 1:
+            return
+
         data = response.json()
         url = resolve_pointer(data, self.next_pointer, None)
         if url:
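The early return above avoids calling `response.json()` when a crawl is run with `-a sample=1`, since under `--max-bytes` the body may be cut off mid-document and fail to parse. A standard-library sketch of that failure mode, with illustrative data:

    import json

    truncated_body = '{"links": {"next": "https://example.com/packages?page='  # cut off
    try:
        json.loads(truncated_body)
    except json.JSONDecodeError as e:
        print(e)  # e.g. Unterminated string starting at: line 1 column 20 (char 19)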
16 changes: 14 additions & 2 deletions kingfisher_scrapy/extensions.py
@@ -9,6 +9,7 @@
 from scrapy.exceptions import NotConfigured, StopDownload
 
 from kingfisher_scrapy import util
+from kingfisher_scrapy.base_spider import CompressedFileSpider
 from kingfisher_scrapy.items import File, FileError, FileItem, PluckedItem
 from kingfisher_scrapy.kingfisher_process import Client
 from kingfisher_scrapy.util import _pluck_filename, get_file_name_and_extension
@@ -39,8 +40,19 @@ def from_crawler(cls, crawler):
         return extension
 
     def bytes_received(self, data, request, spider):
-        # We only limit the bytes received for final requests (i.e. where the callback is the default `parse` method).
-        if not spider.pluck or request.callback or request.meta['file_name'].endswith(('.rar', '.zip')):
+        if (
+            not spider.pluck
+            or spider.dont_truncate
+            # We only limit bytes received for final requests (i.e. where the callback is the default `parse` method).
+            or request.callback
+            # ijson will parse the value at `root_path`, which can go to the end of the file.
+            # https://github.com/ICRAR/ijson/issues/43
+            or spider.root_path
+            # XLSX files must be read in full.
+            or spider.unflatten
+            # ZIP and RAR files must be read in full.
+            or isinstance(spider, CompressedFileSpider)
+        ):
             return
 
         self.bytes_received_counts[spider.name] += len(data)
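The rewritten guard returns early for any request whose body must be read in full; for everything else, the handler counts bytes and stops the download at the limit. That tail end of the method is unchanged and not shown in this diff, but judging from the `StopDownload` import and the tests below, it plausibly follows the standard Scrapy `bytes_received` pattern sketched here (an approximation, not the verbatim source):

    from collections import defaultdict

    from scrapy.exceptions import StopDownload

    class PluckTruncationSketch:
        """Illustration only: the shape suggested by the StopDownload import and the tests."""

        def __init__(self, max_bytes):
            self.max_bytes = max_bytes
            self.bytes_received_counts = defaultdict(int)

        def bytes_received(self, data, request, spider):
            # ... the guard shown in the diff above would return early here ...
            self.bytes_received_counts[spider.name] += len(data)
            if self.bytes_received_counts[spider.name] >= self.max_bytes:
                # fail=False stops the download cleanly, so the truncated body
                # still reaches the spider callback for plucking.
                raise StopDownload(fail=False)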
17 changes: 13 additions & 4 deletions kingfisher_scrapy/pipelines.py
@@ -89,12 +89,21 @@ def process_item(self, item, spider):
                 value = _resolve_pointer(item['data'], pointer)
             else:
                 try:
-                    value = next(ijson.items(item['data'], pointer.replace('/', '.')[1:]))
+                    value = next(ijson.items(item['data'], pointer[1:].replace('/', '.')))
                 except StopIteration:
                     value = f'error: {pointer} not found'
-                # The JSON text can be truncated by a `bytes_received` handler.
-                except ijson.common.IncompleteJSONError:
-                    value = f'error: {pointer} not found within initial bytes'
+                except ijson.common.IncompleteJSONError as e:
+                    message = str(e).split('\n', 1)[0]
+                    if message.endswith((
+                        # The JSON text can be truncated by a `bytes_received` handler.
+                        'premature EOF',
+                        # These messages occur if the JSON text is truncated at `"\\u` or `"\\`.
+                        r"lexical error: invalid (non-hex) character occurs after '\u' inside string.",
+                        r"lexical error: inside a string, '\' occurs before a character which it may not.",
+                    )):
+                        value = f'error: {pointer} not found within initial bytes'
+                    else:
+                        raise
         else:  # spider.release_pointer
             if isinstance(item['data'], dict):
                 data = item['data']
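On the first changed line, the two expressions are equivalent for pointers that start with `/`; the new order simply reads as the intent: drop the leading slash, then turn JSON Pointer segments into ijson's dotted prefix. A small sketch with a hypothetical pointer and payload:

    import ijson

    pointer = '/publisher/name'             # hypothetical package pointer
    prefix = pointer[1:].replace('/', '.')  # -> 'publisher.name'

    value = next(ijson.items(b'{"publisher": {"name": "Acme"}}', prefix))
    assert value == 'Acme'

    # If a bytes_received handler truncated the body, ijson raises
    # IncompleteJSONError instead of yielding a value.
    try:
        next(ijson.items(b'{"publisher": {"name": "Ac', prefix))
    except ijson.common.IncompleteJSONError as e:
        print(str(e).split('\n', 1)[0])  # e.g. parse error: premature EOF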
1 change: 1 addition & 0 deletions kingfisher_scrapy/spiders/paraguay_hacienda.py
@@ -37,6 +37,7 @@ class ParaguayHacienda(BaseSpider):
     release_ids = []
     request_time_limit = 14.0
     data_type = 'release_package'
+    dont_truncate = True
 
     @classmethod
     def from_crawler(cls, crawler, *args, **kwargs):
21 changes: 14 additions & 7 deletions tests/extensions/test_kingfisher_pluck.py
@@ -6,6 +6,7 @@
 from scrapy import Request
 from scrapy.exceptions import StopDownload
 
+from kingfisher_scrapy.base_spider import BaseSpider, CompressedFileSpider
 from kingfisher_scrapy.extensions import KingfisherPluck
 from kingfisher_scrapy.items import PluckedItem
 from tests import spider_with_crawler
@@ -99,15 +100,21 @@ def test_bytes_received_dont_stop_download():
     assert extension.max_bytes == 10
 
 
-@pytest.mark.parametrize('test_request', [
-    Request('http://example.com', callback=lambda item: item, meta={'file_name': 'test.json'}),
-    Request('http://example.com', meta={'file_name': 'test.rar'}),
-    Request('http://example.com', meta={'file_name': 'test.zip'}),
+@pytest.mark.parametrize('test_request,spider_class,attributes', [
+    (Request('http://example.com', callback=lambda item: item, meta={'file_name': 'test.json'}), BaseSpider, {}),
+    (Request('http://example.com', meta={'file_name': 'test.rar'}), CompressedFileSpider, {}),
+    (Request('http://example.com', meta={'file_name': 'test.zip'}), CompressedFileSpider, {}),
+    (Request('http://example.com', meta={'file_name': 'test.xlsx'}), BaseSpider, {'unflatten': True}),
+    (Request('http://example.com', meta={'file_name': 'test.json'}), BaseSpider, {'root_path': 'item'}),
+    (Request('http://example.com', meta={'file_name': 'test.json'}), BaseSpider, {'dont_truncate': True}),
 ])
-def test_bytes_received_ignored_requests(test_request):
+def test_bytes_received_ignored_requests(test_request, spider_class, attributes):
     with TemporaryDirectory() as tmpdirname:
-        spider = spider_with_crawler(settings={'KINGFISHER_PLUCK_PATH': tmpdirname,
-                                               'KINGFISHER_PLUCK_MAX_BYTES': 10}, release_pointer='/date')
+        spider = spider_with_crawler(spider_class=spider_class, release_pointer='/date',
+                                     settings={'KINGFISHER_PLUCK_PATH': tmpdirname, 'KINGFISHER_PLUCK_MAX_BYTES': 10})
+        for attr, value in attributes.items():
+            setattr(spider, attr, value)
 
         extension = KingfisherPluck.from_crawler(spider.crawler)
 
         extension.bytes_received(data=b'12345', spider=spider, request=test_request)
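Each parametrized case above exercises one branch of the new guard in extensions.py. The hunk is cut off here; the test presumably goes on to assert that nothing was counted for these ignored requests, along the lines of (assumed, not shown in the diff):

    assert extension.bytes_received_counts[spider.name] == 0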
