Merge pull request #572 from open-contracting/329-data-types
Add KingfisherTransformMiddleware and update affected spiders
jpmckinney committed Feb 9, 2021
2 parents 01eefcb + 66af007 commit 20674ca
Showing 105 changed files with 1,063 additions and 459 deletions.
52 changes: 52 additions & 0 deletions docs/contributing/index.rst
@@ -34,6 +34,58 @@ After choosing a base class, read its documentation, as well as its parent class

After writing the spider, add a docstring for :ref:`spider metadata<spider-metadata>`.

Since many class attributes control a spider's behavior, please put the class attributes in this order, including comments with the class names:

.. code-block:: python

    class NewSpider(ParentSpider):
        """
        The typical docstring.
        """
        name = 'new_spider'
        # Any other class attributes from Scrapy, including `download_delay`, `download_timeout`, `user_agent`, `custom_settings`
        # BaseSpider
        ocds_version = '1.0'
        date_format = 'datetime'
        default_from_date = '2000-01-01T00:00:00'
        default_until_date = '2010-01-01T00:00:00'
        date_required = True
        unflatten = True
        unflatten_args = {}
        line_delimited = True
        root_path = 'item'
        skip_pluck = 'A reason'
        # SimpleSpider
        data_type = 'release_package'
        encoding = 'iso-8859-1'
        # CompressedFileSpider
        resize_package = True
        file_name_must_contain = '-'
        # LinksSpider
        next_page_formatter = staticmethod(parameters('page'))
        next_pointer = '/next_page/uri'
        # PeriodicSpider
        pattern = 'https://example.com/{}'
        start_requests_callback = 'parse_list'
        # IndexSpider
        total_pages_pointer = '/data/last_page'
        count_pointer = '/meta/count'
        limit = 1000
        use_page = True
        formatter = staticmethod(parameters('pageNumber'))
        param_page = 'pageNumber'
        param_limit = 'customLimit'
        param_offset = 'customOffset'
        additional_params = {'pageSize': 1000}
        base_url = 'https://example.com/elsewhere'
        yield_list_results = False

Test the spider
~~~~~~~~~~~~~~~

168 changes: 53 additions & 115 deletions kingfisher_scrapy/base_spider.py
@@ -1,19 +1,17 @@
import json
import os
from abc import abstractmethod
from datetime import datetime
from io import BytesIO
from math import ceil
from zipfile import ZipFile

import ijson
import scrapy
from jsonpointer import resolve_pointer
from rarfile import RarFile

from kingfisher_scrapy import util
from kingfisher_scrapy.exceptions import MissingNextLinkError, SpiderArgumentError
from kingfisher_scrapy.items import File, FileError, FileItem
from kingfisher_scrapy.exceptions import MissingNextLinkError, SpiderArgumentError, UnknownArchiveFormatError
from kingfisher_scrapy.items import File, FileError
from kingfisher_scrapy.util import add_query_string, handle_http_error

browser_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36' # noqa: E501
@@ -23,26 +21,38 @@ class BaseSpider(scrapy.Spider):
"""
- If the data source uses OCDS 1.0, add an ``ocds_version = '1.0'`` class attribute. This is used for `Kingfisher
Process integration <https://github.com/open-contracting/kingfisher-collect/issues/411>`__.
- If the spider supports ``from_date`` and ``until_date`` spider arguments, set the ``default_from_date`` class
attribute to a date string.
- If a spider requires date parameters to be set, add a ``date_required = True`` class attribute, and set the
``default_from_date`` class attribute to a date string.
- If the spider supports ``from_date`` and ``until_date`` spider arguments:
- If the source supports time components, set a ``date_format`` class attribute to "datetime".
- Set a ``default_from_date`` class attribute to a date ("YYYY-MM-DD") or datetime ("YYYY-MM-DDTHH:MM:SS").
- If the source stopped publishing, set a ``default_until_date`` class attribute to a date or datetime.
The :class:`~kingfisher_scrapy.base_spider.PeriodicSpider` class changes the allowed date formats to "year"
("YYYY") and "year-month" ("YYYY-MM").
- If a spider requires date parameters to be set, add a ``date_required = True`` class attribute, and set a
``default_from_date`` class attribute as above.
- If the spider doesn't work with the ``pluck`` command, set a ``skip_pluck`` class attribute to the reason.
- If a spider collects data as CSV or XLSX files, set the class attribute ``unflatten = True`` to convert each
- If a spider collects data as CSV or XLSX files, add an ``unflatten = True`` class attribute to convert each
item to JSON files in the Unflatten pipeline class using the ``unflatten`` command from Flatten Tool.
If you need to set more arguments for the unflatten command, set an ``unflatten_args`` dict with them.
- If the data is not formatted as OCDS (record, release, record package or release package), set a ``root_path``
class attribute to the path to the OCDS data.
- If the data is line-delimited JSON, add a ``line_delimited = True`` class attribute.
If ``date_required`` is ``True``, or if either the ``from_date`` or ``until_date`` spider arguments are set, then
``from_date`` defaults to the ``default_from_date`` class attribute, and ``until_date`` defaults to the
``get_default_until_date()`` return value (which is the current time, by default).
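For example, a spider for a source that requires a date range and stopped publishing could combine these attributes as follows (an illustrative sketch; the class name, URL and dates are assumptions, not code from this repository):

.. code-block:: python

    class ExampleHistorySpider(SimpleSpider):
        # Illustrative sketch only.
        name = 'example_history'

        # BaseSpider
        date_format = 'datetime'
        date_required = True
        default_from_date = '2015-01-01T00:00:00'
        default_until_date = '2019-12-31T23:59:59'

        # SimpleSpider
        data_type = 'release_package'

        def start_requests(self):
            # from_date and until_date are parsed into datetime objects in BaseSpider.__init__.
            url = 'https://example.com/packages?from={}&until={}'.format(
                self.from_date.strftime(self.VALID_DATE_FORMATS['datetime']),
                self.until_date.strftime(self.VALID_DATE_FORMATS['datetime']),
            )
            # components() is imported from kingfisher_scrapy.util.
            yield self.build_request(url, formatter=components(-1))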
"""
MAX_RELEASES_PER_PACKAGE = 100
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S'}

ocds_version = '1.1'
date_format = 'date'
date_required = False
unflatten = False
unflatten_args = {}
line_delimited = False
root_path = ''

def __init__(self, sample=None, note=None, from_date=None, until_date=None, crawl_time=None,
keep_collection_open=None, package_pointer=None, release_pointer=None, truncate=None, *args,
@@ -205,7 +215,7 @@ def build_file_from_response(self, response, **kwargs):
kwargs.setdefault('data', response.body)
return self.build_file(**kwargs)

def build_file(self, *, file_name=None, url=None, data=None, data_type=None, encoding='utf-8', post_to_api=True):
def build_file(self, *, file_name=None, url=None, data=None, data_type=None, encoding='utf-8'):
"""
Returns a File item to yield.
"""
@@ -215,20 +225,6 @@ def build_file(self, *, file_name=None, url=None, data=None, data_type=None, enc
'data_type': data_type,
'url': url,
'encoding': encoding,
'post_to_api': post_to_api,
})

def build_file_item(self, *, number=None, file_name=None, url=None, data=None, data_type=None, encoding='utf-8'):
"""
Returns a FileItem item to yield.
"""
return FileItem({
'number': number,
'file_name': file_name,
'data': data,
'data_type': data_type,
'url': url,
'encoding': encoding,
})

def build_file_error_from_response(self, response, **kwargs):
@@ -244,51 +240,13 @@ def build_file_error_from_response(self, response, **kwargs):
item.update(kwargs)
return item

def _get_package_metadata(self, f, skip_key):
"""
Returns the package metadata from a file object.
:param f: a file object
:param str skip_key: the key to skip
:returns: the package metadata
:rtype: dict
"""
package = {}
for item in util.items(ijson.parse(f), '', skip_key=skip_key):
package.update(item)
return package

def parse_json_lines(self, f, *, file_name='data.json', url=None, data_type=None, encoding='utf-8'):
for number, line in enumerate(f, 1):
if self.sample and number > self.sample:
break
if isinstance(line, bytes):
line = line.decode(encoding=encoding)
yield self.build_file_item(number=number, file_name=file_name, url=url, data=line, data_type=data_type,
encoding=encoding)

def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None, data_type=None, encoding='utf-8',
array_field_name='releases'):
if self.sample:
size = self.sample
else:
size = self.MAX_RELEASES_PER_PACKAGE

package = self._get_package_metadata(f_package, array_field_name)

for number, items in enumerate(util.grouper(ijson.items(f_list, f'{array_field_name}.item'), size), 1):
package[array_field_name] = filter(None, items)
data = json.dumps(package, default=util.default)
yield self.build_file_item(number=number, file_name=file_name, url=url, data=data, data_type=data_type,
encoding=encoding)
if self.sample:
break

@classmethod
def get_default_until_date(cls, spider):
"""
Returns the default value of the ``until_date`` spider argument.
Returns the ``default_until_date`` class attribute if truthy. Otherwise, returns the current time.
"""
if getattr(spider, 'default_until_date', None):
return spider.default_until_date
return datetime.utcnow()


@@ -299,7 +257,6 @@ class SimpleSpider(BaseSpider):
#. Inherit from ``SimpleSpider``
#. Set a ``data_type`` class attribute to the data type of the responses
#. Optionally, set an ``encoding`` class attribute to the encoding of the responses (default UTF-8)
#. Optionally, set a ``data_pointer`` class attribute to the JSON Pointer for OCDS data (default "")
#. Write a ``start_requests`` method (and any intermediate callbacks) to send requests
.. code-block:: python
@@ -317,37 +274,21 @@ def start_requests(self):
"""

encoding = 'utf-8'
data_pointer = ''

@handle_http_error
def parse(self, response):
kwargs = {}
if self.data_pointer:
kwargs['data'] = json.dumps(resolve_pointer(response.json(), self.data_pointer)).encode()

yield self.build_file_from_response(response, data_type=self.data_type, encoding=self.encoding, **kwargs)
yield self.build_file_from_response(response, data_type=self.data_type, encoding=self.encoding)


class CompressedFileSpider(BaseSpider):
"""
This class makes it easy to collect data from ZIP or RAR files. It assumes all files have the same data type.
Each compressed file is saved to disk. The archive file is *not* saved to disk.
#. Inherit from ``CompressedFileSpider``
#. Set a ``data_type`` class attribute to the data type of the compressed files
#. Optionally, set an ``encoding`` class attribute to the encoding of the compressed files (default UTF-8)
#. Optionally, set a ``compressed_file_format`` class attribute to the format of the compressed files
``json_lines``
Yields each line of each compressed file.
The archive file is saved to disk. The compressed files are *not* saved to disk.
``release_package``
Re-packages the releases in the compressed files in groups of
:const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE`, and yields the packages.
The archive file is saved to disk. The compressed files are *not* saved to disk.
``None``
Yields each compressed file.
Each compressed file is saved to disk. The archive file is *not* saved to disk.
#. Optionally, add a ``resize_package = True`` class attribute to split large packages (e.g. greater than 100MB)
#. Write a ``start_requests`` method to request the archive files
.. code-block:: python
@@ -361,23 +302,28 @@ class MySpider(CompressedFileSpider):
def start_requests(self):
yield self.build_request('https://example.com/api/packages.zip', formatter=components(-1))
.. note::
``resize_package = True`` is not compatible with ``line_delimited = True`` or ``root_path``.
"""

encoding = 'utf-8'
skip_pluck = 'Archive files are not supported'
compressed_file_format = None
resize_package = False
file_name_must_contain = ''

@handle_http_error
def parse(self, response):
archive_name, archive_format = os.path.splitext(response.request.meta['file_name'])
archive_format = archive_format[1:].lower()
if self.compressed_file_format:
yield self.build_file_from_response(response, data_type=archive_format, post_to_api=False)

if archive_format == 'zip':
cls = ZipFile
else:
elif archive_format == 'rar':
cls = RarFile
else:
raise UnknownArchiveFormatError(response.request.meta['file_name'])

archive_file = cls(BytesIO(response.body))
for file_info in archive_file.infolist():
filename = file_info.filename
@@ -391,21 +337,21 @@ def parse(self, response):
if not basename.endswith('.json'):
basename += '.json'

data = archive_file.open(filename)
compressed_file = archive_file.open(filename)
# If `resize_package = True`, then we need to open the file twice: once to extract the package metadata and
# then to extract the releases themselves.
if self.resize_package:
data = {'data': compressed_file, 'package': archive_file.open(filename)}
else:
data = compressed_file

kwargs = {
yield File({
'file_name': basename,
'url': response.request.url,
'data': data,
'data_type': self.data_type,
'encoding': self.encoding,
}
if self.compressed_file_format == 'json_lines':
yield from self.parse_json_lines(data, **kwargs)
elif self.compressed_file_format == 'release_package':
package = archive_file.open(filename)
yield from self.parse_json_array(package, data, **kwargs)
else:
yield self.build_file(data=data.read(), **kwargs)
'url': response.request.url,
'encoding': self.encoding
})


class LinksSpider(SimpleSpider):
Expand Down Expand Up @@ -478,9 +424,8 @@ class PeriodicSpider(SimpleSpider):
#. Implement a ``get_formatter`` method to return the formatter to use in
:meth:`~kingfisher_scrapy.base_spider.BaseSpider.build_request` calls
#. Set a ``default_from_date`` class attribute to a year ("YYYY") or year-month ("YYYY-MM") as a string
#. Optionally, set a ``default_until_date`` class attribute to a year ("YYYY") or year-month ("YYYY-MM") as a
string, if the source is known to have stopped publishing - otherwise, it defaults to today
#. Set a ``default_from_date`` class attribute to a year ("YYYY") or year-month ("YYYY-MM")
#. If the source stopped publishing, set a ``default_until_date`` class attribute to a year or year-month
#. Optionally, set a ``start_requests_callback`` class attribute to a method's name - otherwise, it defaults to
:meth:`~kingfisher_scrapy.base_spider.SimpleSpider.parse`
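For example, a sketch of a spider for a source that publishes one release package per year (illustrative only; the name, URL and start year are assumptions, not code from this repository):

.. code-block:: python

    class ExampleYearlySpider(PeriodicSpider):
        # Illustrative sketch only.
        name = 'example_yearly'

        # BaseSpider
        date_format = 'year'
        default_from_date = '2015'

        # SimpleSpider
        data_type = 'release_package'

        # PeriodicSpider
        pattern = 'https://example.com/releases?year={}'

        def get_formatter(self):
            # parameters() is imported from kingfisher_scrapy.util.
            return parameters('year')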
@@ -499,15 +444,6 @@ def __init__(self, *args, **kwargs):
else:
self.start_requests_callback = self.parse

@classmethod
def get_default_until_date(cls, spider):
"""
Returns the ``default_until_date`` class attribute if truthy. Otherwise, returns today's date.
"""
if getattr(spider, 'default_until_date', None):
return spider.default_until_date
return datetime.today()

def start_requests(self):
start = self.from_date
stop = self.until_date
@@ -550,6 +486,8 @@ class IndexSpider(SimpleSpider):
configure the spider to send a ``page`` query string parameter instead of a pair of ``limit`` and ``offset``
query string parameters. The spider then yields a request for each offset/page.
#. Set a ``formatter`` class attribute to set the file name as in
:meth:`~kingfisher_scrapy.base_spider.BaseSpider.build_request`.
#. Write a ``start_requests`` method to yield the initial URL. The request's ``callback`` parameter should be set
to ``self.parse_list``.
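For example, a sketch of a spider for an API that reports its total number of pages (illustrative only; the name, URL and JSON Pointer are assumptions, not code from this repository):

.. code-block:: python

    class ExamplePaginatedSpider(IndexSpider):
        # Illustrative sketch only.
        name = 'example_paginated'

        # SimpleSpider
        data_type = 'release_package'

        # IndexSpider
        total_pages_pointer = '/data/last_page'
        # parameters() is imported from kingfisher_scrapy.util.
        formatter = staticmethod(parameters('page'))

        def start_requests(self):
            # parse_list reads the pointer above and yields a request for each remaining page.
            yield scrapy.Request(
                'https://example.com/api/packages?page=1',
                meta={'file_name': 'page-1.json'},
                callback=self.parse_list,
            )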
5 changes: 0 additions & 5 deletions kingfisher_scrapy/commands/crawlall.py
@@ -1,8 +1,6 @@
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError

from kingfisher_scrapy.base_spider import BaseSpider, CompressedFileSpider

EXCEPTIONS = {
'fail',
# Require authentication
@@ -44,9 +42,6 @@ def run(self, args, opts):
if opts.sample:
kwargs['sample'] = opts.sample

BaseSpider.parse_json_lines = yield_nothing
CompressedFileSpider.parse = yield_nothing

# Stop after one item or error.
self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
# Disable LogStats extension.
4 changes: 4 additions & 0 deletions kingfisher_scrapy/exceptions.py
@@ -16,3 +16,7 @@ class AccessTokenError(KingfisherScrapyError):

class MissingNextLinkError(KingfisherScrapyError):
"""Raised when a next link is not found on the first page of results"""


class UnknownArchiveFormatError(KingfisherScrapyError):
"""Raised if the archive format of a file can't be determined from the filename"""