Merge pull request #572 from open-contracting/329-data-types
Add KingfisherTransformMiddleware and update affected spiders
jpmckinney committed Feb 9, 2021
2 parents 01eefcb + 66af007 commit 20674ca
Showing 105 changed files with 1,063 additions and 459 deletions.
52 changes: 52 additions & 0 deletions docs/contributing/index.rst
@@ -34,6 +34,58 @@ After choosing a base class, read its documentation, as well as its parent class

After writing the spider, add a docstring for :ref:`spider metadata<spider-metadata>`.

Since many class attributes control a spider's behavior, please put the class attributes in this order, including comments with the class names:

.. code-block:: python

    class NewSpider(ParentSpider):
        """
        The typical docstring.
        """
        name = 'new_spider'
        # Any other class attributes from Scrapy, including `download_delay`, `download_timeout`, `user_agent`, `custom_settings`
        # BaseSpider
        ocds_version = '1.0'
        date_format = 'datetime'
        default_from_date = '2000-01-01T00:00:00'
        default_until_date = '2010-01-01T00:00:00'
        date_required = True
        unflatten = True
        unflatten_args = {}
        line_delimited = True
        root_path = 'item'
        skip_pluck = 'A reason'
        # SimpleSpider
        data_type = 'release_package'
        encoding = 'iso-8859-1'
        # CompressedFileSpider
        resize_package = True
        file_name_must_contain = '-'
        # LinksSpider
        next_page_formatter = staticmethod(parameters('page'))
        next_pointer = '/next_page/uri'
        # PeriodicSpider
        pattern = 'https://example.com/{}'
        start_requests_callback = 'parse_list'
        # IndexSpider
        total_pages_pointer = '/data/last_page'
        count_pointer = '/meta/count'
        limit = 1000
        use_page = True
        formatter = staticmethod(parameters('pageNumber'))
        param_page = 'pageNumber'
        param_limit = 'customLimit'
        param_offset = 'customOffset'
        additional_params = {'pageSize': 1000}
        base_url = 'https://example.com/elsewhere'
        yield_list_results = False

Test the spider
~~~~~~~~~~~~~~~

168 changes: 53 additions & 115 deletions kingfisher_scrapy/base_spider.py
@@ -1,19 +1,17 @@
import json
import os
from abc import abstractmethod
from datetime import datetime
from io import BytesIO
from math import ceil
from zipfile import ZipFile

import ijson
import scrapy
from jsonpointer import resolve_pointer
from rarfile import RarFile

from kingfisher_scrapy import util
from kingfisher_scrapy.exceptions import MissingNextLinkError, SpiderArgumentError
from kingfisher_scrapy.items import File, FileError, FileItem
from kingfisher_scrapy.exceptions import MissingNextLinkError, SpiderArgumentError, UnknownArchiveFormatError
from kingfisher_scrapy.items import File, FileError
from kingfisher_scrapy.util import add_query_string, handle_http_error

browser_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36' # noqa: E501
@@ -23,26 +21,38 @@ class BaseSpider(scrapy.Spider):
"""
- If the data source uses OCDS 1.0, add an ``ocds_version = '1.0'`` class attribute. This is used for `Kingfisher
Process integration <https://github.com/open-contracting/kingfisher-collect/issues/411>`__.
- If the spider supports ``from_date`` and ``until_date`` spider arguments, set the ``default_from_date`` class
attribute to a date string.
- If a spider requires date parameters to be set, add a ``date_required = True`` class attribute, and set the
``default_from_date`` class attribute to a date string.
- If the spider supports ``from_date`` and ``until_date`` spider arguments:
- If the source supports time components, set a ``date_format`` class attribute to "datetime".
- Set a ``default_from_date`` class attribute to a date ("YYYY-MM-DD") or datetime ("YYYY-MM-DDTHH:MM:SS").
- If the source stopped publishing, set a ``default_until_date`` class attribute to a date or datetime.
The :class:`~kingfisher_scrapy.base_spider.PeriodicSpider` class changes the allowed date formats to "year"
("YYYY") and "year-month" ("YYYY-MM").
- If a spider requires date parameters to be set, add a ``date_required = True`` class attribute, and set a
``default_from_date`` class attribute as above.
- If the spider doesn't work with the ``pluck`` command, set a ``skip_pluck`` class attribute to the reason.
- If a spider collects data as CSV or XLSX files, set the class attribute ``unflatten = True`` to convert each
- If a spider collects data as CSV or XLSX files, add an ``unflatten = True`` class attribute to convert each
item to JSON files in the Unflatten pipeline class using the ``unflatten`` command from Flatten Tool.
If you need to set more arguments for the unflatten command, set an ``unflatten_args`` dict with them.
- If the data is not formatted as OCDS (record, release, record package or release package), set a ``root_path``
class attribute to the path to the OCDS data.
- If the data is line-delimited JSON, add a ``line_delimited = True`` class attribute.
If ``date_required`` is ``True``, or if either the ``from_date`` or ``until_date`` spider arguments are set, then
``from_date`` defaults to the ``default_from_date`` class attribute, and ``until_date`` defaults to the
``get_default_until_date()`` return value (which is the current time, by default).
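For example, a spider for a source that requires a date range and stopped publishing could combine these attributes as follows (an illustrative sketch; the class name, URL and dates are assumptions, not code from this repository):

.. code-block:: python

    class ExampleHistorySpider(SimpleSpider):
        # Illustrative sketch only.
        name = 'example_history'

        # BaseSpider
        date_format = 'datetime'
        date_required = True
        default_from_date = '2015-01-01T00:00:00'
        default_until_date = '2019-12-31T23:59:59'

        # SimpleSpider
        data_type = 'release_package'

        def start_requests(self):
            # from_date and until_date are parsed into datetime objects in BaseSpider.__init__.
            url = 'https://example.com/packages?from={}&until={}'.format(
                self.from_date.strftime(self.VALID_DATE_FORMATS['datetime']),
                self.until_date.strftime(self.VALID_DATE_FORMATS['datetime']),
            )
            # components() is imported from kingfisher_scrapy.util.
            yield self.build_request(url, formatter=components(-1))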
"""
MAX_RELEASES_PER_PACKAGE = 100
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S'}

ocds_version = '1.1'
date_format = 'date'
date_required = False
unflatten = False
unflatten_args = {}
line_delimited = False
root_path = ''

def __init__(self, sample=None, note=None, from_date=None, until_date=None, crawl_time=None,
keep_collection_open=None, package_pointer=None, release_pointer=None, truncate=None, *args,
@@ -205,7 +215,7 @@ def build_file_from_response(self, response, **kwargs):
kwargs.setdefault('data', response.body)
return self.build_file(**kwargs)

def build_file(self, *, file_name=None, url=None, data=None, data_type=None, encoding='utf-8', post_to_api=True):
def build_file(self, *, file_name=None, url=None, data=None, data_type=None, encoding='utf-8'):
"""
Returns a File item to yield.
"""
@@ -215,20 +225,6 @@ def build_file(self, *, file_name=None, url=None, data=None, data_type=None, enc
'data_type': data_type,
'url': url,
'encoding': encoding,
'post_to_api': post_to_api,
})

def build_file_item(self, *, number=None, file_name=None, url=None, data=None, data_type=None, encoding='utf-8'):
"""
Returns a FileItem item to yield.
"""
return FileItem({
'number': number,
'file_name': file_name,
'data': data,
'data_type': data_type,
'url': url,
'encoding': encoding,
})

def build_file_error_from_response(self, response, **kwargs):
@@ -244,51 +240,13 @@ def build_file_error_from_response(self, response, **kwargs):
item.update(kwargs)
return item

def _get_package_metadata(self, f, skip_key):
"""
Returns the package metadata from a file object.
:param f: a file object
:param str skip_key: the key to skip
:returns: the package metadata
:rtype: dict
"""
package = {}
for item in util.items(ijson.parse(f), '', skip_key=skip_key):
package.update(item)
return package

def parse_json_lines(self, f, *, file_name='data.json', url=None, data_type=None, encoding='utf-8'):
for number, line in enumerate(f, 1):
if self.sample and number > self.sample:
break
if isinstance(line, bytes):
line = line.decode(encoding=encoding)
yield self.build_file_item(number=number, file_name=file_name, url=url, data=line, data_type=data_type,
encoding=encoding)

def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None, data_type=None, encoding='utf-8',
array_field_name='releases'):
if self.sample:
size = self.sample
else:
size = self.MAX_RELEASES_PER_PACKAGE

package = self._get_package_metadata(f_package, array_field_name)

for number, items in enumerate(util.grouper(ijson.items(f_list, f'{array_field_name}.item'), size), 1):
package[array_field_name] = filter(None, items)
data = json.dumps(package, default=util.default)
yield self.build_file_item(number=number, file_name=file_name, url=url, data=data, data_type=data_type,
encoding=encoding)
if self.sample:
break

@classmethod
def get_default_until_date(cls, spider):
"""
Returns the default value of the ``until_date`` spider argument.
Returns the ``default_until_date`` class attribute if truthy. Otherwise, returns the current time.
"""
if getattr(spider, 'default_until_date', None):
return spider.default_until_date
return datetime.utcnow()


@@ -299,7 +257,6 @@ class SimpleSpider(BaseSpider):
#. Inherit from ``SimpleSpider``
#. Set a ``data_type`` class attribute to the data type of the responses
#. Optionally, set an ``encoding`` class attribute to the encoding of the responses (default UTF-8)
#. Optionally, set a ``data_pointer`` class attribute to the JSON Pointer for OCDS data (default "")
#. Write a ``start_requests`` method (and any intermediate callbacks) to send requests
.. code-block:: python
@@ -317,37 +274,21 @@ def start_requests(self):
"""

encoding = 'utf-8'
data_pointer = ''

@handle_http_error
def parse(self, response):
kwargs = {}
if self.data_pointer:
kwargs['data'] = json.dumps(resolve_pointer(response.json(), self.data_pointer)).encode()

yield self.build_file_from_response(response, data_type=self.data_type, encoding=self.encoding, **kwargs)
yield self.build_file_from_response(response, data_type=self.data_type, encoding=self.encoding)


class CompressedFileSpider(BaseSpider):
"""
This class makes it easy to collect data from ZIP or RAR files. It assumes all files have the same data type.
Each compressed file is saved to disk. The archive file is *not* saved to disk.
#. Inherit from ``CompressedFileSpider``
#. Set a ``data_type`` class attribute to the data type of the compressed files
#. Optionally, set an ``encoding`` class attribute to the encoding of the compressed files (default UTF-8)
#. Optionally, set a ``compressed_file_format`` class attribute to the format of the compressed files
``json_lines``
Yields each line of each compressed file.
The archive file is saved to disk. The compressed files are *not* saved to disk.
``release_package``
Re-packages the releases in the compressed files in groups of
:const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE`, and yields the packages.
The archive file is saved to disk. The compressed files are *not* saved to disk.
``None``
Yields each compressed file.
Each compressed file is saved to disk. The archive file is *not* saved to disk.
#. Optionally, add a ``resize_package = True`` class attribute to split large packages (e.g. greater than 100MB)
#. Write a ``start_requests`` method to request the archive files
.. code-block:: python
@@ -361,23 +302,28 @@ class MySpider(CompressedFileSpider):
def start_requests(self):
yield self.build_request('https://example.com/api/packages.zip', formatter=components(-1))
.. note::
``resize_package = True`` is not compatible with ``line_delimited = True`` or ``root_path``.
"""

encoding = 'utf-8'
skip_pluck = 'Archive files are not supported'
compressed_file_format = None
resize_package = False
file_name_must_contain = ''

@handle_http_error
def parse(self, response):
archive_name, archive_format = os.path.splitext(response.request.meta['file_name'])
archive_format = archive_format[1:].lower()
if self.compressed_file_format:
yield self.build_file_from_response(response, data_type=archive_format, post_to_api=False)

if archive_format == 'zip':
cls = ZipFile
else:
elif archive_format == 'rar':
cls = RarFile
else:
raise UnknownArchiveFormatError(response.request.meta['file_name'])

archive_file = cls(BytesIO(response.body))
for file_info in archive_file.infolist():
filename = file_info.filename
@@ -391,21 +337,21 @@ def parse(self, response):
if not basename.endswith('.json'):
basename += '.json'

data = archive_file.open(filename)
compressed_file = archive_file.open(filename)
# If `resize_package = True`, then we need to open the file twice: once to extract the package metadata and
# then to extract the releases themselves.
if self.resize_package:
data = {'data': compressed_file, 'package': archive_file.open(filename)}
else:
data = compressed_file

kwargs = {
yield File({
'file_name': basename,
'url': response.request.url,
'data': data,
'data_type': self.data_type,
'encoding': self.encoding,
}
if self.compressed_file_format == 'json_lines':
yield from self.parse_json_lines(data, **kwargs)
elif self.compressed_file_format == 'release_package':
package = archive_file.open(filename)
yield from self.parse_json_array(package, data, **kwargs)
else:
yield self.build_file(data=data.read(), **kwargs)
'url': response.request.url,
'encoding': self.encoding
})


class LinksSpider(SimpleSpider):
Expand Down Expand Up @@ -478,9 +424,8 @@ class PeriodicSpider(SimpleSpider):
#. Implement a ``get_formatter`` method to return the formatter to use in
:meth:`~kingfisher_scrapy.base_spider.BaseSpider.build_request` calls
#. Set a ``default_from_date`` class attribute to a year ("YYYY") or year-month ("YYYY-MM") as a string
#. Optionally, set a ``default_until_date`` class attribute to a year ("YYYY") or year-month ("YYYY-MM") as a
string, if the source is known to have stopped publishing - otherwise, it defaults to today
#. Set a ``default_from_date`` class attribute to a year ("YYYY") or year-month ("YYYY-MM")
#. If the source stopped publishing, set a ``default_until_date`` class attribute to a year or year-month
#. Optionally, set a ``start_requests_callback`` class attribute to a method's name - otherwise, it defaults to
:meth:`~kingfisher_scrapy.base_spider.SimpleSpider.parse`
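For example, a sketch of a spider for a source that publishes one release package per year (illustrative only; the name, URL and start year are assumptions, not code from this repository):

.. code-block:: python

    class ExampleYearlySpider(PeriodicSpider):
        # Illustrative sketch only.
        name = 'example_yearly'

        # BaseSpider
        date_format = 'year'
        default_from_date = '2015'

        # SimpleSpider
        data_type = 'release_package'

        # PeriodicSpider
        pattern = 'https://example.com/releases?year={}'

        def get_formatter(self):
            # parameters() is imported from kingfisher_scrapy.util.
            return parameters('year')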
@@ -499,15 +444,6 @@ def __init__(self, *args, **kwargs):
else:
self.start_requests_callback = self.parse

@classmethod
def get_default_until_date(cls, spider):
"""
Returns the ``default_until_date`` class attribute if truthy. Otherwise, returns today's date.
"""
if getattr(spider, 'default_until_date', None):
return spider.default_until_date
return datetime.today()

def start_requests(self):
start = self.from_date
stop = self.until_date
@@ -550,6 +486,8 @@ class IndexSpider(SimpleSpider):
configure the spider to send a ``page`` query string parameter instead of a pair of ``limit`` and ``offset``
query string parameters. The spider then yields a request for each offset/page.
#. Set a ``formatter`` class attribute to set the file name as in
:meth:`~kingfisher_scrapy.base_spider.BaseSpider.build_request`.
#. Write a ``start_requests`` method to yield the initial URL. The request's ``callback`` parameter should be set
to ``self.parse_list``.
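For example, a sketch of a spider for an API that reports its total number of pages (illustrative only; the name, URL and JSON Pointer are assumptions, not code from this repository):

.. code-block:: python

    class ExamplePaginatedSpider(IndexSpider):
        # Illustrative sketch only.
        name = 'example_paginated'

        # SimpleSpider
        data_type = 'release_package'

        # IndexSpider
        total_pages_pointer = '/data/last_page'
        # parameters() is imported from kingfisher_scrapy.util.
        formatter = staticmethod(parameters('page'))

        def start_requests(self):
            # parse_list reads the pointer above and yields a request for each remaining page.
            yield scrapy.Request(
                'https://example.com/api/packages?page=1',
                meta={'file_name': 'page-1.json'},
                callback=self.parse_list,
            )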
5 changes: 0 additions & 5 deletions kingfisher_scrapy/commands/crawlall.py
@@ -1,8 +1,6 @@
from scrapy.commands import ScrapyCommand
from scrapy.exceptions import UsageError

from kingfisher_scrapy.base_spider import BaseSpider, CompressedFileSpider

EXCEPTIONS = {
'fail',
# Require authentication
@@ -44,9 +42,6 @@ def run(self, args, opts):
if opts.sample:
kwargs['sample'] = opts.sample

BaseSpider.parse_json_lines = yield_nothing
CompressedFileSpider.parse = yield_nothing

# Stop after one item or error.
self.settings.set('CLOSESPIDER_ERRORCOUNT', 1)
# Disable LogStats extension.
4 changes: 4 additions & 0 deletions kingfisher_scrapy/exceptions.py
@@ -16,3 +16,7 @@ class AccessTokenError(KingfisherScrapyError):

class MissingNextLinkError(KingfisherScrapyError):
"""Raised when a next link is not found on the first page of results"""


class UnknownArchiveFormatError(KingfisherScrapyError):
"""Raised if the archive format of a file can't be determined from the filename"""