Merge branch 'master' into 347-add_docstrings_2
jpmckinney committed May 31, 2020
2 parents 2131641 + 4e634fb commit 81b9bd6
Showing 73 changed files with 634 additions and 752 deletions.
12 changes: 5 additions & 7 deletions docs/writing-spiders.rst
@@ -52,22 +52,20 @@ Here is a sample:

.. code-block:: python
from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import handle_error
class VerySimple(BaseSpider):
name = "very_simple"
class VerySimple(SimpleSpider):
name = 'very_simple'
data_type = 'release_package'
def start_requests(self):
# This API only has one URL to get. Make a request for that, and set a filename
yield scrapy.Request(
url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
meta={'kf_filename': '13-14.json'}
)
@handle_error
def parse(self, response):
yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type='release_package')
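For readability, here is a reconstruction of the updated sample after this change, keeping only the SimpleSpider-based lines from the hunk above (the ``import scrapy`` line is assumed from earlier in the original sample):

import scrapy  # assumed to be imported earlier in the original sample

from kingfisher_scrapy.base_spider import SimpleSpider


class VerySimple(SimpleSpider):
    name = 'very_simple'
    data_type = 'release_package'

    def start_requests(self):
        # This API only has one URL to get. Make a request for that, and set a filename.
        yield scrapy.Request(
            'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
            meta={'kf_filename': '13-14.json'}
        )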
Spider properties
-----------------

189 changes: 115 additions & 74 deletions kingfisher_scrapy/base_spider.py
@@ -6,6 +6,7 @@

import ijson
import scrapy
from jsonpointer import resolve_pointer

from kingfisher_scrapy import util
from kingfisher_scrapy.exceptions import SpiderArgumentError
@@ -91,6 +92,8 @@ def is_http_success(self, response):
"""
Returns whether the response status is a 2xx code.
"""
# All 2xx codes are successful.
# https://tools.ietf.org/html/rfc7231#section-6.3
return 200 <= response.status < 300

def get_start_time(self, format):
@@ -99,26 +102,32 @@ def get_start_time(self, format):
"""
return self.crawler.stats.get_value('start_time').strftime(format)

def build_file_from_response(self, response, filename, data_type=None, encoding='utf-8', post_to_api=True):
def build_file_from_response(self, response, **kwargs):
"""
Returns an item to yield, based on the response to a request.
"""
return self.build_file(response.body, filename, response.request.url, data_type, encoding, post_to_api)

def build_file(self, data, filename, url=None, data_type=None, encoding='utf-8', post_to_api=True):
if 'file_name' not in kwargs:
kwargs['file_name'] = response.request.meta['kf_filename']
if 'url' not in kwargs:
kwargs['url'] = response.request.url
if 'data' not in kwargs:
kwargs['data'] = response.body
return self.build_file(**kwargs)

def build_file(self, *, file_name=None, url=None, data=None, data_type=None, encoding='utf-8', post_to_api=True):
"""
Returns an item to yield.
"""
return File({
'file_name': filename,
'file_name': file_name,
'data': data,
'data_type': data_type,
'url': url,
'encoding': encoding,
'post_to_api': post_to_api,
})
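A minimal usage sketch of ``build_file_from_response`` above, assuming a spider whose requests set ``kf_filename`` in ``meta``; a callback only passes the keyword arguments that differ from the response-derived defaults:

def parse(self, response):
    # file_name, url and data default to the request's kf_filename, the request URL
    # and the response body, so only data_type needs to be passed here.
    yield self.build_file_from_response(response, data_type='release_package')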

def build_file_item(self, number, data, data_type, url, encoding, file_name):
def build_file_item(self, *, number=None, file_name=None, url=None, data=None, data_type=None, encoding='utf-8'):
return FileItem({
'number': number,
'file_name': file_name,
@@ -129,14 +138,14 @@ def build_file_item(self, number, data, data_type, url, encoding, file_name):
})

def build_file_error_from_response(self, response, **kwargs):
file_error = {
item = FileError({
'url': response.request.url,
'errors': {'http_code': response.status},
}
})
if 'kf_filename' in response.request.meta:
file_error['file_name'] = response.request.meta['kf_filename']
file_error.update(kwargs)
return FileError(file_error)
item['file_name'] = response.request.meta['kf_filename']
item.update(kwargs)
return item

def _get_package_metadata(self, f, skip_key):
"""
@@ -152,16 +161,17 @@ def _get_package_metadata(self, f, skip_key):
package.update(item)
return package

def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data.json'):
def parse_json_lines(self, f, *, file_name='data.json', url=None, data_type=None, encoding='utf-8'):
for number, line in enumerate(f, 1):
if self.sample and number > self.MAX_SAMPLE:
break
if isinstance(line, bytes):
line = line.decode(encoding=encoding)
yield self.build_file_item(number, line, data_type, url, encoding, file_name)
yield self.build_file_item(number=number, file_name=file_name, url=url, data=line, data_type=data_type,
encoding=encoding)
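A hedged sketch of driving ``parse_json_lines`` from a spider method (the file name and URL are illustrative, not part of the project):

def parse_local_lines(self):
    # Each line becomes a numbered FileItem; bytes lines are decoded using `encoding`.
    with open('releases.jsonl', 'rb') as f:
        yield from self.parse_json_lines(f, url='https://example.com/releases.jsonl',
                                         data_type='release_package')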

def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases',
file_name='data.json'):
def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None, data_type=None, encoding='utf-8',
array_field_name='releases'):
if self.sample:
size = self.MAX_SAMPLE
else:
@@ -172,64 +182,92 @@ def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8',
for number, items in enumerate(util.grouper(ijson.items(f_list, '{}.item'.format(array_field_name)), size), 1):
package[array_field_name] = filter(None, items)
data = json.dumps(package, default=util.default)
yield self.build_file_item(number, data, data_type, url, encoding, file_name)
yield self.build_file_item(number=number, file_name=file_name, url=url, data=data, data_type=data_type,
encoding=encoding)
if self.sample:
break
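A hedged sketch of calling ``parse_json_array``; as in ``ZipSpider.parse`` below, ``f_package`` and ``f_list`` are two independent handles on the same JSON file (the file name and URL are illustrative):

def parse_local_package(self):
    # Two handles on the same file: one to read the package metadata,
    # one to stream the releases array in groups via ijson.
    f_package = open('package.json', 'rb')
    f_list = open('package.json', 'rb')
    yield from self.parse_json_array(f_package, f_list, url='https://example.com/package.json',
                                     data_type='release_package')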


class ZipSpider(BaseSpider):
class SimpleSpider(BaseSpider):
"""
This class makes it easy to collect data from ZIP files:
Most spiders can inherit from this class. It assumes all responses have the same data type.
- Inherit from ``ZipSpider``
- Set a ``parse_zipfile_kwargs`` class attribute to the keyword arguments for the
:meth:`kingfisher_scrapy.base_spider.ZipSpider.parse_zipfile` method
- Write a ``start_requests`` method to request the ZIP files
1. Inherit from ``SimpleSpider``
1. Set a ``data_type`` class attribute to the data type of the responses
1. Optionally, set an ``encoding`` class attribute to the encoding of the responses (default UTF-8)
1. Optionally, set a ``data_pointer`` class attribute to the JSON Pointer for OCDS data (default "")
1. Write a ``start_requests`` method (and any intermediate callbacks) to send requests
.. code-block:: python
import scrapy
from kingfisher_scrapy.base_spider import ZipSpider
from kingfisher_scrapy.base_spider import SimpleSpider
class MySpider(LinksSpider):
class MySpider(SimpleSpider):
name = 'my_spider'
parse_zipfile_kwargs = {'data_type': 'release_package'}
data_type = 'release_package'
def start_requests(self):
yield scrapy.Request(
url='https://example.com/api/packages.zip',
meta={'kf_filename': 'all.json'}
)
yield scrapy.Request('https://example.com/api/package.json', meta={'kf_filename': 'all.json'})
"""

encoding = 'utf-8'
data_pointer = ''

@handle_error
def parse(self, response):
yield from self.parse_zipfile(response, **self.parse_zipfile_kwargs)
kwargs = {}
if self.data_pointer:
kwargs['data'] = json.dumps(resolve_pointer(json.loads(response.text), self.data_pointer)).encode()

def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8'):
"""
Handles a response that is a ZIP file.
:param response response: the response
:param str data_type: the compressed files' ``data_type``
:param str file_format: The compressed files' format
``json_lines``
Yields each line of the compressed files.
The ZIP file is saved to disk.
``release_package``
Re-packages the releases in the compressed files in groups of
:const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE`, and yields the packages.
The ZIP file is saved to disk.
``None``
Yields each compressed file.
Each compressed file is saved to disk.
:param str encoding: the compressed files' encoding
"""
if file_format:
yield self.build_file_from_response(response, data_type=self.data_type, encoding=self.encoding, **kwargs)
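An illustration of what ``data_pointer`` does in ``parse`` above, using a made-up response body:

from jsonpointer import resolve_pointer

# A spider with nested packages would set data_pointer = '/result'.
body = {'result': {'uri': 'https://example.com', 'releases': []}}
resolve_pointer(body, '/result')  # returns the nested package: {'uri': 'https://example.com', 'releases': []}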


class ZipSpider(BaseSpider):
"""
This class makes it easy to collect data from ZIP files. It assumes all files have the same data type.
1. Inherit from ``ZipSpider``
1. Set a ``data_type`` class attribute to the data type of the compressed files
1. Optionally, set an ``encoding`` class attribute to the encoding of the compressed files (default UTF-8)
1. Optionally, set a ``zip_file_format`` class attribute to the format of the compressed files
``json_lines``
Yields each line of the compressed files.
The ZIP file is saved to disk.
``release_package``
Re-packages the releases in the compressed files in groups of
:const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE`, and yields the packages.
The ZIP file is saved to disk.
``None``
Yields each compressed file.
Each compressed file is saved to disk.
1. Write a ``start_requests`` method to request the ZIP files
.. code-block:: python
import scrapy
from kingfisher_scrapy.base_spider import ZipSpider
class MySpider(ZipSpider):
name = 'my_spider'
data_type = 'release_package'
def start_requests(self):
yield scrapy.Request('https://example.com/api/packages.zip', meta={'kf_filename': 'all.json'})
"""

encoding = 'utf-8'
zip_file_format = None

@handle_error
def parse(self, response):
if self.zip_file_format:
filename = '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')).hexdigest())
self.build_file_from_response(response, filename, post_to_api=False)
self.build_file_from_response(response, file_name=filename, post_to_api=False)

zip_file = ZipFile(BytesIO(response.body))
for finfo in zip_file.infolist():
@@ -239,26 +277,31 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')

data = zip_file.open(finfo.filename)

if file_format == 'json_lines':
yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding,
file_name=filename)
elif file_format == 'release_package':
kwargs = {
'file_name': filename,
'url': response.request.url,
'data_type': self.data_type,
'encoding': self.encoding,
}

if self.zip_file_format == 'json_lines':
yield from self.parse_json_lines(data, **kwargs)
elif self.zip_file_format == 'release_package':
package = zip_file.open(finfo.filename)
yield from self.parse_json_array(package, data, data_type, response.request.url,
encoding=encoding, file_name=filename)
yield from self.parse_json_array(package, data, **kwargs)
else:
yield self.build_file(data.read(), filename, data_type=data_type, url=response.request.url,
encoding=encoding)
yield self.build_file(data=data.read(), **kwargs)
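A hedged example of the optional ``zip_file_format`` attribute (the spider name and URL are made up); with ``json_lines``, each line of each compressed file is yielded as a separate item and the ZIP file itself is saved to disk:

import scrapy

from kingfisher_scrapy.base_spider import ZipSpider


class MyLineDelimitedSpider(ZipSpider):
    name = 'my_line_delimited_spider'
    data_type = 'release_package'
    zip_file_format = 'json_lines'

    def start_requests(self):
        yield scrapy.Request('https://example.com/api/packages.zip', meta={'kf_filename': 'all.json'})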


class LinksSpider(BaseSpider):
class LinksSpider(SimpleSpider):
"""
This class makes it easy to collect data from an API that implements the `pagination
<https://github.com/open-contracting-extensions/ocds_pagination_extension>`__ pattern:
- Inherit from ``LinksSpider``
- Set a ``data_type`` class attribute to the data type of the API responses
- Write a ``start_requests`` method to request the first page
1. Inherit from ``LinksSpider``
1. Set a ``data_type`` class attribute to the data type of the API responses
1. Optionally, set a ``next_pointer`` class attribute to the JSON Pointer for the next link (default "/links/next")
1. Write a ``start_requests`` method to request the first page of API results
.. code-block:: python
@@ -271,25 +314,23 @@ class MySpider(LinksSpider):
data_type = 'release_package'
def start_requests(self):
yield scrapy.Request(
url='https://example.com/api/packages.json',
meta={'kf_filename': 'page1.json'}
)
yield scrapy.Request('https://example.com/api/packages.json', meta={'kf_filename': 'page1.json'})
"""

next_pointer = '/links/next'

@handle_error
def parse(self, response):
yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type=self.data_type)
yield from super().parse(response)

if not self.sample:
yield self.next_link(response)

@staticmethod
def next_link(response):
def next_link(self, response):
"""
If the JSON response has a key matching ``next_pointer``, returns a ``scrapy.Request`` for its URL.
"""
data = json.loads(response.text)
if 'links' in data and 'next' in data['links']:
url = data['links']['next']
url = resolve_pointer(data, self.next_pointer, None)
if url:
return scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'})
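An illustration of how ``next_link`` follows pagination, using a made-up response body and the default ``next_pointer``:

from jsonpointer import resolve_pointer

body = {'links': {'next': 'https://example.com/api/packages.json?page=2'}}
resolve_pointer(body, '/links/next', None)  # the next page's URL, or None if the pointer is absent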
6 changes: 5 additions & 1 deletion kingfisher_scrapy/exceptions.py
@@ -7,4 +7,8 @@ class AuthenticationError(KingfisherScrapyError):


class SpiderArgumentError(KingfisherScrapyError):
"""Raises when a spider argument's value is invalid"""
"""Raised when a spider argument's value is invalid"""


class MissingRequiredFieldError(KingfisherScrapyError, KeyError):
"""Raised when an item is missing a required field"""
2 changes: 1 addition & 1 deletion kingfisher_scrapy/extensions.py
@@ -40,7 +40,7 @@ def item_scraped(self, item, spider):
metadata = {
'url': item['url'],
'data_type': item['data_type'],
'encoding': item['encoding'],
'encoding': item.get('encoding', 'utf-8'),
}
self._write_file(path + '.fileinfo', metadata, spider)

