
Commit

Merge pull request #409 from open-contracting/build-request
Add build_request and other helpers
jpmckinney committed Jun 2, 2020
2 parents ac0882a + 59bcf35 commit e32f339
Showing 64 changed files with 706 additions and 836 deletions.
1 change: 0 additions & 1 deletion docs/api/base_spider.rst
@@ -4,4 +4,3 @@ Base Spider
.. automodule:: kingfisher_scrapy.base_spider
:members:
:undoc-members:

1 change: 0 additions & 1 deletion docs/api/exceptions.rst
@@ -4,4 +4,3 @@ Exceptions
.. automodule:: kingfisher_scrapy.exceptions
:members:
:undoc-members:

6 changes: 6 additions & 0 deletions docs/api/extensions.rst
@@ -0,0 +1,6 @@
Extensions
==========

.. automodule:: kingfisher_scrapy.extensions
:members:
:undoc-members:
2 changes: 2 additions & 0 deletions docs/api/index.rst
@@ -4,4 +4,6 @@ API Reference
.. toctree::

base_spider.rst
extensions.rst
util.rst
exceptions.rst
6 changes: 6 additions & 0 deletions docs/api/util.rst
@@ -0,0 +1,6 @@
Utilities
=========

.. automodule:: kingfisher_scrapy.util
:members:
:undoc-members:
10 changes: 4 additions & 6 deletions docs/writing-spiders.rst
@@ -53,18 +53,16 @@ Here is a sample:
.. code-block:: python
from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import handle_error
from kingfisher_scrapy.util import components, handle_error
class VerySimple(SimpleSpider):
name = 'very_simple'
data_type = 'release_package'
def start_requests(self):
# This API only has one URL to get. Make a request for that, and set a filename
yield scrapy.Request(
'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json',
meta={'kf_filename': '13-14.json'}
)
# Request the source's only URL, and transform the URL to a file name using ``basename``.
url = 'https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json'
yield self.build_request(url, formatter=components(-1))
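For a quick check of the sample above, this is the file name the formatter produces — a doctest-style sketch, assuming ``components`` behaves as in the ``build_request`` doctests later in this commit:

.. code-block:: python

   >>> from kingfisher_scrapy.util import components
   >>> components(-1)('https://buyandsell.gc.ca/cds/public/ocds/tpsgc-pwgsc_ocds_EF-FY-13-14.json')
   'tpsgc-pwgsc_ocds_EF-FY-13-14.json'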
Spider properties
-----------------
51 changes: 47 additions & 4 deletions kingfisher_scrapy/base_spider.py
@@ -1,4 +1,3 @@
import hashlib
import json
from datetime import datetime
from io import BytesIO
@@ -102,6 +101,49 @@ def get_start_time(self, format):
"""
return self.crawler.stats.get_value('start_time').strftime(format)

def build_request(self, url, formatter, **kwargs):
"""
Returns a Scrapy request, with a file name added to the request's ``meta`` attribute. If the file name doesn't
have a ``.json`` or ``.zip`` extension, it adds a ``.json`` extension.
If the last component of a URL's path is unique, use it as the file name. For example:
>>> from kingfisher_scrapy.base_spider import BaseSpider
>>> from kingfisher_scrapy.util import components
>>> url = 'https://example.com/package.json'
>>> formatter = components(-1)
>>> BaseSpider(name='my_spider').build_request(url, formatter=formatter).meta
{'kf_filename': 'package.json'}
To use a query string parameter as the file name:
>>> from kingfisher_scrapy.util import parameters
>>> url = 'https://example.com/packages?page=1&per_page=100'
>>> formatter = parameters('page')
>>> BaseSpider(name='my_spider').build_request(url, formatter=formatter).meta
{'kf_filename': 'page-1.json'}
To add a query string parameter to the file name:
>>> from kingfisher_scrapy.util import join
>>> url = 'https://example.com/packages?page=1&per_page=100'
>>> formatter = join(components(-1), parameters('page'))
>>> BaseSpider(name='my_spider').build_request(url, formatter=formatter).meta
{'kf_filename': 'packages-page-1.json'}
:param str url: the URL to request
:param formatter: a function that accepts a URL and returns a file name
:returns: a Scrapy request
:rtype: scrapy.Request
"""
file_name = formatter(url)
if not file_name.endswith(('.json', '.zip')):
file_name += '.json'
meta = {'kf_filename': file_name}
if 'meta' in kwargs:
meta.update(kwargs.pop('meta'))
return scrapy.Request(url, meta=meta, **kwargs)

def build_file_from_response(self, response, **kwargs):
"""
Returns an item to yield, based on the response to a request.
@@ -266,8 +308,7 @@ def start_requests(self):
@handle_error
def parse(self, response):
if self.zip_file_format:
filename = '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')).hexdigest())
self.build_file_from_response(response, file_name=filename, post_to_api=False)
self.build_file_from_response(response, data_type='zip', post_to_api=False)

zip_file = ZipFile(BytesIO(response.body))
for finfo in zip_file.infolist():
@@ -300,6 +341,8 @@ class LinksSpider(SimpleSpider):
1. Inherit from ``LinksSpider``
2. Set a ``data_type`` class attribute to the data type of the API responses
3. Set a ``next_page_formatter`` class attribute to set the file name as in
:meth:`~kingfisher_scrapy.base_spider.BaseSpider.build_request`
4. Optionally, set a ``next_pointer`` class attribute to the JSON Pointer for the next link (default "/links/next")
5. Write a ``start_requests`` method to request the first page of API results (a minimal subclass sketch follows)
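For orientation, a minimal subclass following these steps might look like the sketch below; the spider name and URL are illustrative, not a real source (compare ``armenia.py`` later in this commit):

import scrapy

from kingfisher_scrapy.base_spider import LinksSpider
from kingfisher_scrapy.util import parameters


class ExampleLinks(LinksSpider):
    name = 'example_links'
    data_type = 'release_package'
    # Name each page's file after its ``page`` query string parameter.
    next_page_formatter = parameters('page')

    def start_requests(self):
        # Request the first page; next_link() follows the "/links/next"
        # pointer in each response to build subsequent requests.
        url = 'https://example.com/api/releases.json?page=1'
        yield scrapy.Request(url, meta={'kf_filename': 'page-1.json'})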
@@ -333,4 +376,4 @@ def next_link(self, response):
data = json.loads(response.text)
url = resolve_pointer(data, self.next_pointer, None)
if url:
return scrapy.Request(url, meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest() + '.json'})
return self.build_request(url, formatter=self.next_page_formatter)
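Since ``kingfisher_scrapy/util.py`` itself isn't shown in this diff, here is a plausible sketch of the three formatter helpers, inferred from the ``build_request`` doctests above; the real implementations may differ in detail:

from functools import partial
from urllib.parse import parse_qs, urlsplit


def _components(start, stop, url):
    # Join the selected non-empty path components with hyphens.
    return '-'.join([p for p in urlsplit(url).path.split('/') if p][start:stop])


def _parameters(keys, url):
    # Render each selected query string parameter as "key-value".
    query = parse_qs(urlsplit(url).query)
    return '-'.join('{}-{}'.format(key, query[key][0]) for key in keys)


def _join(formatters, url):
    # Concatenate the results of the given formatters with hyphens.
    return '-'.join(formatter(url) for formatter in formatters)


def components(start, stop=None):
    return partial(_components, start, stop)


def parameters(*keys):
    return partial(_parameters, keys)


def join(*formatters):
    return partial(_join, formatters)

Returning ``functools.partial`` objects rather than plain closures is a deliberate choice in this sketch: a ``partial`` is not a descriptor, so a spider can assign a formatter directly to a class attribute (as ``armenia.py`` does with ``next_page_formatter``) without Python binding it as an instance method.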
4 changes: 0 additions & 4 deletions kingfisher_scrapy/spiders/__init__.py
@@ -1,4 +0,0 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
20 changes: 9 additions & 11 deletions kingfisher_scrapy/spiders/afghanistan_records.py
@@ -3,7 +3,7 @@
import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import handle_error
from kingfisher_scrapy.util import components, handle_error


class AfghanistanRecords(SimpleSpider):
@@ -13,17 +13,15 @@ class AfghanistanRecords(SimpleSpider):
download_delay = 1

def start_requests(self):
yield scrapy.Request(
'https://ocds.ageops.net/api/ocds/records',
meta={'kf_filename': 'list.json'},
callback=self.parse_list
)
# A JSON array of URL strings, in reverse chronological order.
url = 'https://ocds.ageops.net/api/ocds/records'
yield scrapy.Request(url, meta={'kf_filename': 'list.json'}, callback=self.parse_list)

@handle_error
def parse_list(self, response):
files_urls = json.loads(response.text)
urls = json.loads(response.text)
if self.sample:
files_urls = [files_urls[0]]

for file_url in files_urls:
yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'})
urls = [urls[0]]
for url in urls:
# URL looks like https://ocds.ageops.net/api/record/5ed2a62c4192f32c8c74a4e5
yield self.build_request(url, formatter=components(-1))
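To trace the renaming: ``components(-1)`` keeps only the record ID, and ``build_request`` then appends ``.json`` because the name has no recognized extension. A doctest-style sketch, assuming the helper behaves as in the ``build_request`` doctests above:

>>> from kingfisher_scrapy.util import components
>>> components(-1)('https://ocds.ageops.net/api/record/5ed2a62c4192f32c8c74a4e5')
'5ed2a62c4192f32c8c74a4e5'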
35 changes: 15 additions & 20 deletions kingfisher_scrapy/spiders/afghanistan_releases.py
@@ -3,7 +3,7 @@
import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import handle_error
from kingfisher_scrapy.util import components, handle_error


class AfghanistanReleases(SimpleSpider):
@@ -13,30 +13,25 @@ class AfghanistanReleases(SimpleSpider):
download_delay = 1.5

def start_requests(self):
yield scrapy.Request(
'https://ocds.ageops.net/api/ocds/releases/dates',
meta={'kf_filename': 'list.json'},
callback=self.parse_list
)
# A JSON array of URL strings, in reverse chronological order.
url = 'https://ocds.ageops.net/api/ocds/releases/dates'
yield scrapy.Request(url, meta={'kf_filename': 'list.json'}, callback=self.parse_list)

@handle_error
def parse_list(self, response):
files_urls = json.loads(response.text)
urls = json.loads(response.text)
if self.sample:
files_urls = [files_urls[0]]

for file_url in files_urls:
yield scrapy.Request(
file_url,
meta={'kf_filename': file_url.split('/')[-1] + '.json'},
callback=self.parse_release_list
)
urls = [urls[0]]
for url in urls:
# A JSON array of URL strings, in reverse chronological order.
# URL looks like https://ocds.ageops.net/api/ocds/releases/2020-05-30
yield self.build_request(url, formatter=components(-1), callback=self.parse_release_list)

@handle_error
def parse_release_list(self, response):
files_urls = json.loads(response.text)
urls = json.loads(response.text)
if self.sample:
files_urls = [files_urls[0]]

for file_url in files_urls:
yield scrapy.Request(file_url, meta={'kf_filename': file_url.split('/')[-1] + '.json'})
urls = [urls[0]]
for url in urls:
# URL looks like https://ocds.ageops.net/api/release/5ed2a62c4192f32c8c74a4e3
yield self.build_request(url, formatter=components(-1))
13 changes: 6 additions & 7 deletions kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -3,7 +3,7 @@
import scrapy

from kingfisher_scrapy.base_spider import ZipSpider
from kingfisher_scrapy.util import handle_error
from kingfisher_scrapy.util import components, handle_error


class ArgentinaBuenosAires(ZipSpider):
@@ -24,15 +24,14 @@ class ArgentinaBuenosAires(ZipSpider):
download_timeout = 1000

def start_requests(self):
yield scrapy.Request(
'https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras',
meta={'kf_filename': 'list.json'},
callback=self.parse_list
)
# A CKAN API JSON response.
url = 'https://data.buenosaires.gob.ar/api/3/action/package_show?id=buenos-aires-compras'
yield scrapy.Request(url, meta={'kf_filename': 'list.json'}, callback=self.parse_list)

@handle_error
def parse_list(self, response):
data = json.loads(response.text)
for resource in data['result']['resources']:
if resource['format'].upper() == 'JSON':
yield scrapy.Request(resource['url'], meta={'kf_filename': resource['url'].rsplit('/', 1)[-1]})
# Presently, only one URL matches.
yield self.build_request(resource['url'], formatter=components(-1))
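For context, a trimmed sketch of the CKAN ``package_show`` response that ``parse_list`` consumes; only the fields the spider reads are kept, and the resource URLs are illustrative:

data = {
    'result': {
        'resources': [
            {'format': 'JSON', 'url': 'https://example.com/buenos-aires-compras.json'},
            {'format': 'CSV', 'url': 'https://example.com/buenos-aires-compras.csv'},
        ]
    }
}
# parse_list keeps only the JSON resource:
# [r['url'] for r in data['result']['resources'] if r['format'].upper() == 'JSON']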
6 changes: 2 additions & 4 deletions kingfisher_scrapy/spiders/argentina_vialidad.py
@@ -8,7 +8,5 @@ class ArgentinaVialidad(SimpleSpider):
data_type = 'release_package_list'

def start_requests(self):
yield scrapy.Request(
'https://datosabiertos.vialidad.gob.ar/api/ocds/package/all',
meta={'kf_filename': 'all.json'}
)
url = 'https://datosabiertos.vialidad.gob.ar/api/ocds/package/all'
yield scrapy.Request(url, meta={'kf_filename': 'all.json'})
5 changes: 4 additions & 1 deletion kingfisher_scrapy/spiders/armenia.py
@@ -1,12 +1,15 @@
import scrapy

from kingfisher_scrapy.base_spider import LinksSpider
from kingfisher_scrapy.util import parameters


class Armenia(LinksSpider):
name = 'armenia'
data_type = 'release_package'
next_pointer = '/next_page/uri'
next_page_formatter = parameters('offset')

def start_requests(self):
yield scrapy.Request('https://armeps.am/ocds/release', meta={'kf_filename': 'page1.json'})
url = 'https://armeps.am/ocds/release'
yield scrapy.Request(url, meta={'kf_filename': 'offset-0.json'})
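The seed file name matches the formatter: once the API's next link carries an ``offset`` parameter, ``parameters('offset')`` continues the same naming scheme. A doctest-style sketch with an illustrative offset value:

>>> from kingfisher_scrapy.util import parameters
>>> parameters('offset')('https://armeps.am/ocds/release?offset=1590000000')
'offset-1590000000'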
20 changes: 6 additions & 14 deletions kingfisher_scrapy/spiders/australia.py
@@ -1,26 +1,18 @@
import datetime
from datetime import date

import scrapy

from kingfisher_scrapy.base_spider import LinksSpider
from kingfisher_scrapy.util import parameters


class Australia(LinksSpider):
name = 'australia'
data_type = 'release_package'
next_page_formatter = parameters('cursor')

def start_requests(self):
url_prefix = 'https://api.tenders.gov.au/ocds/findByDates/contractPublished/'
url = f'https://api.tenders.gov.au/ocds/findByDates/contractPublished/' \
f'2004-01-01T00:00:00Z/{date.today().year}-12-31T23:59:59Z'

if self.sample:
yield scrapy.Request(
url_prefix + '2018-01-01T00:00:00Z/2018-12-31T23:59:59Z',
meta={'kf_filename': 'year-2018.json'}
)
else:
current_year = datetime.datetime.now().year + 1
for year in range(2004, current_year):
yield scrapy.Request(
url_prefix + '{}-01-01T00:00:00Z/{}-12-31T23:59:59Z'.format(year, year),
meta={'kf_filename': 'year-{}.json'.format(year)}
)
yield scrapy.Request(url, meta={'kf_filename': 'start.json'})