Commit

Merge 415f451 into 70f7f6f
jpmckinney committed May 31, 2020
2 parents 70f7f6f + 415f451 commit 87968c3
Showing 67 changed files with 447 additions and 682 deletions.
12 changes: 5 additions & 7 deletions docs/writing-spiders.rst
@@ -52,22 +52,20 @@ Here is a sample:

.. code-block:: python

    from kingfisher_scrapy.base_spider import SimpleSpider
    from kingfisher_scrapy.util import handle_error


    class VerySimple(BaseSpider):
        name = "very_simple"

    class VerySimple(SimpleSpider):
        name = 'very_simple'
        data_type = 'release_package'

        def start_requests(self):
            # This API only has one URL to get. Make a request for that, and set a filename
            yield scrapy.Request(
                url='https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
                'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
                meta={'kf_filename': '13-14.json'}
            )

        @handle_error
        def parse(self, response):
            yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type='release_package')

Spider properties
-----------------

179 changes: 110 additions & 69 deletions kingfisher_scrapy/base_spider.py
@@ -6,6 +6,7 @@

import ijson
import scrapy
from jsonpointer import resolve_pointer

from kingfisher_scrapy import util
from kingfisher_scrapy.exceptions import SpiderArgumentError
@@ -91,6 +92,8 @@ def is_http_success(self, response):
"""
Returns whether the response status is a non-2xx code.
"""
# All 2xx codes are successful.
# https://tools.ietf.org/html/rfc7231#section-6.3
return 200 <= response.status < 300
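
A usage sketch, not part of this commit: a spider that checks ``is_http_success`` by hand instead of using the ``handle_error`` decorator seen elsewhere in this diff. The spider name and URL are invented for illustration.

    import scrapy

    from kingfisher_scrapy.base_spider import BaseSpider


    class GuardedExample(BaseSpider):  # hypothetical spider, for illustration only
        name = 'guarded_example'

        def start_requests(self):
            yield scrapy.Request('https://example.com/release_package.json', meta={'kf_filename': 'all.json'})

        def parse(self, response):
            # Skip non-2xx responses instead of building a File item.
            if not self.is_http_success(response):
                self.logger.warning('%s returned HTTP %d', response.request.url, response.status)
                return
            yield self.build_file_from_response(response, data_type='release_package')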

    def get_start_time(self, format):
@@ -99,26 +102,32 @@ def get_start_time(self, format):
"""
return self.crawler.stats.get_value('start_time').strftime(format)

    def build_file_from_response(self, response, filename, data_type=None, encoding='utf-8', post_to_api=True):
    def build_file_from_response(self, response, **kwargs):
        """
        Returns an item to yield, based on the response to a request.
        """
        return self.build_file(response.body, filename, response.request.url, data_type, encoding, post_to_api)

    def build_file(self, data, filename, url=None, data_type=None, encoding='utf-8', post_to_api=True):
        if 'file_name' not in kwargs:
            kwargs['file_name'] = response.request.meta['kf_filename']
        if 'url' not in kwargs:
            kwargs['url'] = response.request.url
        if 'data' not in kwargs:
            kwargs['data'] = response.body
        return self.build_file(**kwargs)

    def build_file(self, *, file_name=None, url=None, data=None, data_type=None, encoding='utf-8', post_to_api=True):
        """
        Returns an item to yield.
        """
        return File({
            'file_name': filename,
            'file_name': file_name,
            'data': data,
            'data_type': data_type,
            'url': url,
            'encoding': encoding,
            'post_to_api': post_to_api,
        })
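
A sketch of the new keyword-only calling convention, assuming the request carries ``kf_filename`` in its ``meta`` as the spiders in this diff do; anything omitted is filled in from the response.

    # Illustrative fragment inside a spider callback (not from this commit).
    def parse(self, response):
        # file_name, url and data default to the request's kf_filename meta,
        # the request URL and the response body, respectively.
        yield self.build_file_from_response(response, data_type='release_package')

Passing ``file_name=...`` or ``post_to_api=False`` explicitly overrides those defaults, as ``ZipSpider.parse`` does further down in this file.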

    def build_file_item(self, number, data, data_type, url, encoding, file_name):
    def build_file_item(self, *, number=None, file_name=None, url=None, data=None, data_type=None, encoding='utf-8'):
        return FileItem({
            'number': number,
            'file_name': file_name,
@@ -152,16 +161,17 @@ def _get_package_metadata(self, f, skip_key):
            package.update(item)
        return package

    def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data.json'):
    def parse_json_lines(self, f, *, file_name='data.json', url=None, data_type=None, encoding='utf-8'):
        for number, line in enumerate(f, 1):
            if self.sample and number > self.MAX_SAMPLE:
                break
            if isinstance(line, bytes):
                line = line.decode(encoding=encoding)
            yield self.build_file_item(number, line, data_type, url, encoding, file_name)
            yield self.build_file_item(number=number, file_name=file_name, url=url, data=line, data_type=data_type,
                                       encoding=encoding)

    def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases',
                         file_name='data.json'):
    def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None, data_type=None, encoding='utf-8',
                         array_field_name='releases'):
        if self.sample:
            size = self.MAX_SAMPLE
        else:
@@ -172,64 +182,92 @@ def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8',
        for number, items in enumerate(util.grouper(ijson.items(f_list, '{}.item'.format(array_field_name)), size), 1):
            package[array_field_name] = filter(None, items)
            data = json.dumps(package, default=util.default)
            yield self.build_file_item(number, data, data_type, url, encoding, file_name)
            yield self.build_file_item(number=number, file_name=file_name, url=url, data=data, data_type=data_type,
                                       encoding=encoding)
            if self.sample:
                break
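
A minimal sketch of driving ``parse_json_lines`` from a spider callback, assuming the response body is newline-delimited JSON; the data type is an example.

    from io import BytesIO

    # Illustrative fragment inside a spider callback (not from this commit).
    def parse(self, response):
        # Wrap the body in a file-like object and emit one FileItem per line.
        yield from self.parse_json_lines(
            BytesIO(response.body),
            file_name='data.json',
            url=response.request.url,
            data_type='release_package',
        )

``parse_json_array`` is driven the same way, with one file handle for the package metadata and another for the array to be re-packaged.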


class ZipSpider(BaseSpider):
class SimpleSpider(BaseSpider):
"""
This class makes it easy to collect data from ZIP files:
Most spiders can inherit from this class. It assumes all responses have the same data type.
- Inherit from ``ZipSpider``
- Set a ``parse_zipfile_kwargs`` class attribute to the keyword arguments for the
:meth:`kingfisher_scrapy.base_spider.ZipSpider.parse_zipfile` method
- Write a ``start_requests`` method to request the ZIP files
1. Inherit from ``SimpleSpider``
1. Set a ``data_type`` class attribute to the data type of the responses
1. Optionally, set an ``encoding`` class attribute to the encoding of the responses (default UTF-8)
1. Optionally, set a ``data_pointer`` class attribute to the JSON Pointer for OCDS data (default "")
1. Write a ``start_requests`` method (and any intermediate callbacks) to send requests
.. code-block:: python
import scrapy
from kingfisher_scrapy.base_spider import ZipSpider
from kingfisher_scrapy.base_spider import SimpleSpider
class MySpider(LinksSpider):
class MySpider(SimpleSpider):
name = 'my_spider'
parse_zipfile_kwargs = {'data_type': 'release_package'}
data_type = 'release_package'
def start_requests(self):
yield scrapy.Request(
url='https://example.com/api/packages.zip',
meta={'kf_filename': 'all.json'}
)
yield scrapy.Request('https://example.com/api/package.json', meta={'kf_filename': 'all.json'})
"""

    encoding = 'utf-8'
    data_pointer = ''

    @handle_error
    def parse(self, response):
        yield from self.parse_zipfile(response, **self.parse_zipfile_kwargs)
        kwargs = {}
        if self.data_pointer:
            kwargs['data'] = json.dumps(resolve_pointer(json.loads(response.text), self.data_pointer)).encode()

    def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8'):
        """
        Handles a response that is a ZIP file.

        :param response response: the response
        :param str data_type: the compressed files' ``data_type``
        :param str file_format: The compressed files' format

            ``json_lines``
              Yields each line of the compressed files.
              The ZIP file is saved to disk.
            ``release_package``
              Re-packages the releases in the compressed files in groups of
              :const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE`, and yields the packages.
              The ZIP file is saved to disk.
            ``None``
              Yields each compressed file.
              Each compressed file is saved to disk.

        :param str encoding: the compressed files' encoding
        """
        if file_format:
        yield self.build_file_from_response(response, data_type=self.data_type, encoding=self.encoding, **kwargs)
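
The docstring example above doesn't exercise ``data_pointer``; here is a sketch of a spider that does, with an invented API that wraps the release package under a top-level ``results`` key.

    import scrapy

    from kingfisher_scrapy.base_spider import SimpleSpider


    class WrappedPackage(SimpleSpider):  # hypothetical spider, for illustration only
        name = 'wrapped_package'
        data_type = 'release_package'
        # Responses look like {"results": {...release package...}}; extract the package.
        data_pointer = '/results'

        def start_requests(self):
            yield scrapy.Request('https://example.com/api/wrapped.json', meta={'kf_filename': 'all.json'})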


class ZipSpider(BaseSpider):
"""
This class makes it easy to collect data from ZIP files. It assumes all files have the same data type.
1. Inherit from ``ZipSpider``
1. Set a ``data_type`` class attribute to the data type of the compressed files
1. Optionally, set an ``encoding`` class attribute to the encoding of the compressed_files (default UTF-8)
1. Optionally, set a ``zip_file_format`` class attribute to the format of the compressed files
``json_lines``
Yields each line of the compressed files.
The ZIP file is saved to disk.
``release_package``
Re-packages the releases in the compressed files in groups of
:const:`~kingfisher_scrapy.base_spider.BaseSpider.MAX_RELEASES_PER_PACKAGE`, and yields the packages.
The ZIP file is saved to disk.
``None``
Yields each compressed file.
Each compressed file is saved to disk.
1. Write a ``start_requests`` method to request the ZIP files
.. code-block:: python
import scrapy
from kingfisher_scrapy.base_spider import ZipSpider
class MySpider(ZipSpider):
name = 'my_spider'
data_type = 'release_package'
def start_requests(self):
yield scrapy.Request('https://example.com/api/packages.zip', meta={'kf_filename': 'all.json'})
"""

    encoding = 'utf-8'
    zip_file_format = None

    @handle_error
    def parse(self, response):
        if self.zip_file_format:
            filename = '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')).hexdigest())
            self.build_file_from_response(response, filename, post_to_api=False)
            self.build_file_from_response(response, file_name=filename, post_to_api=False)

        zip_file = ZipFile(BytesIO(response.body))
        for finfo in zip_file.infolist():
@@ -239,26 +277,31 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')

            data = zip_file.open(finfo.filename)

            if file_format == 'json_lines':
                yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding,
                                                 file_name=filename)
            elif file_format == 'release_package':
            kwargs = {
                'file_name': filename,
                'url': response.request.url,
                'data_type': self.data_type,
                'encoding': self.encoding,
            }

            if self.zip_file_format == 'json_lines':
                yield from self.parse_json_lines(data, **kwargs)
            elif self.zip_file_format == 'release_package':
                package = zip_file.open(finfo.filename)
                yield from self.parse_json_array(package, data, data_type, response.request.url,
                                                 encoding=encoding, file_name=filename)
                yield from self.parse_json_array(package, data, **kwargs)
            else:
                yield self.build_file(data.read(), filename, data_type=data_type, url=response.request.url,
                                      encoding=encoding)
                yield self.build_file(data=data.read(), **kwargs)
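
The docstring example above uses the default behaviour (each compressed file is yielded whole). A sketch of the ``json_lines`` variant, with an invented URL, assuming each compressed file holds one release package per line:

    import scrapy

    from kingfisher_scrapy.base_spider import ZipSpider


    class JsonLinesZip(ZipSpider):  # hypothetical spider, for illustration only
        name = 'json_lines_zip'
        data_type = 'release_package'
        # Yield the compressed files line by line and save the ZIP itself to disk.
        zip_file_format = 'json_lines'

        def start_requests(self):
            yield scrapy.Request('https://example.com/api/packages.zip', meta={'kf_filename': 'all.json'})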


class LinksSpider(BaseSpider):
class LinksSpider(SimpleSpider):
"""
This class makes it easy to collect data from an API that implements the `pagination
<https://github.com/open-contracting-extensions/ocds_pagination_extension>`__ pattern:
- Inherit from ``LinksSpider``
- Set a ``data_type`` class attribute to the data type of the API responses
- Write a ``start_requests`` method to request the first page
1. Inherit from ``LinksSpider``
1. Set a ``data_type`` class attribute to the data type of the API responses
1. Optionally, set a ``next_pointer`` class attribute to the JSON Pointer for the next link (default "/links/next")
1. Write a ``start_requests`` method to request the first page of API results
.. code-block:: python
@@ -271,25 +314,23 @@ class MySpider(LinksSpider):
            data_type = 'release_package'

            def start_requests(self):
                yield scrapy.Request(
                    url='https://example.com/api/packages.json',
                    meta={'kf_filename': 'page1.json'}
                )
                yield scrapy.Request('https://example.com/api/packages.json', meta={'kf_filename': 'page1.json'})
    """

    next_pointer = '/links/next'

    @handle_error
    def parse(self, response):
        yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type=self.data_type)
        yield from super().parse(response)

        if not self.sample:
            yield self.next_link(response)

    @staticmethod
    def next_link(response):
    def next_link(self, response):
        """
        If the JSON response has a ``links.next`` key, returns a ``scrapy.Request`` for the URL.
        """
        data = json.loads(response.text)
        if 'links' in data and 'next' in data['links']:
            url = data['links']['next']
        url = resolve_pointer(data, self.next_pointer, None)
        if url:
            return scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'})
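
For an API whose next link is not at ``links.next``, a sketch (pointer and URL invented) that only overrides ``next_pointer``:

    import scrapy

    from kingfisher_scrapy.base_spider import LinksSpider


    class CustomPagination(LinksSpider):  # hypothetical spider, for illustration only
        name = 'custom_pagination'
        data_type = 'release_package'
        # The API returns {"meta": {"next_page": "https://..."}, "releases": [...]}.
        next_pointer = '/meta/next_page'

        def start_requests(self):
            yield scrapy.Request('https://example.com/api/releases.json', meta={'kf_filename': 'page1.json'})
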
18 changes: 6 additions & 12 deletions kingfisher_scrapy/spiders/afghanistan_records.py
@@ -2,17 +2,19 @@

import scrapy

from kingfisher_scrapy.base_spider import BaseSpider
from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import handle_error


class AfghanistanRecords(BaseSpider):
class AfghanistanRecords(SimpleSpider):
    name = 'afghanistan_records'
    data_type = 'record'

    download_delay = 1

    def start_requests(self):
        yield scrapy.Request(
            url='https://ocds.ageops.net/api/ocds/records',
            'https://ocds.ageops.net/api/ocds/records',
            meta={'kf_filename': 'list.json'},
            callback=self.parse_list
        )
@@ -24,12 +26,4 @@ def parse_list(self, response):
            files_urls = [files_urls[0]]

        for file_url in files_urls:
            yield scrapy.Request(
                url=file_url,
                meta={'kf_filename': file_url.split('/')[-1] + '.json'},
                callback=self.parse_record
            )

    @handle_error
    def parse_record(self, response):
        yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type="record")
            yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'})
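
Pieced together from the hunks above, the updated spider reads roughly as follows; the ``@handle_error`` guard on ``parse_list`` and its first two lines are assumed from context rather than shown in this diff.

    import json

    import scrapy

    from kingfisher_scrapy.base_spider import SimpleSpider
    from kingfisher_scrapy.util import handle_error


    class AfghanistanRecords(SimpleSpider):
        name = 'afghanistan_records'
        data_type = 'record'

        download_delay = 1

        def start_requests(self):
            yield scrapy.Request(
                'https://ocds.ageops.net/api/ocds/records',
                meta={'kf_filename': 'list.json'},
                callback=self.parse_list
            )

        @handle_error
        def parse_list(self, response):
            files_urls = json.loads(response.text)  # assumed; not shown in this hunk
            if self.sample:
                files_urls = [files_urls[0]]

            for file_url in files_urls:
                yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'})
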
20 changes: 7 additions & 13 deletions kingfisher_scrapy/spiders/afghanistan_releases.py
@@ -2,17 +2,19 @@

import scrapy

from kingfisher_scrapy.base_spider import BaseSpider
from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import handle_error


class AfghanistanReleases(BaseSpider):
class AfghanistanReleases(SimpleSpider):
    name = 'afghanistan_releases'
    data_type = 'release'

    download_delay = 1.5

    def start_requests(self):
        yield scrapy.Request(
            url='https://ocds.ageops.net/api/ocds/releases/dates',
            'https://ocds.ageops.net/api/ocds/releases/dates',
            meta={'kf_filename': 'list.json'},
            callback=self.parse_list
        )
@@ -25,7 +27,7 @@ def parse_list(self, response):

        for file_url in files_urls:
            yield scrapy.Request(
                url=file_url,
                file_url,
                meta={'kf_filename': file_url.split('/')[-1] + '.json'},
                callback=self.parse_release_list
            )
@@ -37,12 +39,4 @@ def parse_release_list(self, response):
            files_urls = [files_urls[0]]

        for file_url in files_urls:
            yield scrapy.Request(
                url=file_url,
                meta={'kf_filename': file_url.split('/')[-1] + '.json'},
                callback=self.parse_release
            )

    @handle_error
    def parse_release(self, response):
        yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type="release")
            yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'})
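
The same reshaping, with one extra hop: ``parse_list`` requests each date's URL with ``callback=self.parse_release_list``, which in turn yields plain requests that the inherited ``SimpleSpider.parse`` turns into File items. A condensed sketch of the callback chain (sample handling omitted; the ``@handle_error`` guards are assumed):

    # Illustrative fragment, condensed from the hunks above (not a verbatim copy).
    def start_requests(self):
        # Dates index -> per-date release lists -> individual releases.
        yield scrapy.Request(
            'https://ocds.ageops.net/api/ocds/releases/dates',
            meta={'kf_filename': 'list.json'},
            callback=self.parse_list
        )

    @handle_error
    def parse_list(self, response):
        for file_url in json.loads(response.text):
            yield scrapy.Request(
                file_url,
                meta={'kf_filename': file_url.split('/')[-1] + '.json'},
                callback=self.parse_release_list
            )

    @handle_error
    def parse_release_list(self, response):
        for file_url in json.loads(response.text):
            # No callback: the inherited SimpleSpider.parse builds the File item.
            yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'})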
