Merge pull request #497 from open-contracting/337-index-spider
Add IndexSpider
romifz committed Sep 21, 2020
2 parents 93aef51 + 33cb386 commit 282eaf9
Showing 17 changed files with 294 additions and 138 deletions.
118 changes: 118 additions & 0 deletions kingfisher_scrapy/base_spider.py
@@ -3,6 +3,7 @@
from abc import abstractmethod
from datetime import datetime
from io import BytesIO
from math import ceil
from zipfile import ZipFile

import ijson
@@ -409,6 +410,8 @@ class LinksSpider(SimpleSpider):
1. Write a ``start_requests`` method to request the first page of API results
1. Optionally, set a ``next_pointer`` class attribute to the JSON Pointer for the next link (default "/links/next")
If the API returns the total number of pages or results in the response, consider using ``IndexSpider`` instead.
.. code-block:: python
import scrapy
@@ -517,3 +520,118 @@ def get_formatter(self):

def build_urls(self, pattern, date):
yield pattern.format(date)


class IndexSpider(SimpleSpider):
"""
This class can be used to collect data from an API that includes the total number of results or pages in its
response data and accepts pagination parameters like ``page``, ``limit`` and ``offset``. The values for these
parameters are calculated from the first response, and the requests for all pages are yielded at once. To create a
spider that inherits from ``IndexSpider``:
1. Set a JSON Pointer to the element in the response data that contains the total number of pages or results, in
one of two ways:
1. Set ``total_pages_pointer`` to the JSON Pointer for the total number of pages in the response data. The
spider will add a 'page' GET parameter to the URL in subsequent requests.
1. Set ``count_pointer`` to the JSON Pointer for the total number of results. If you use ``count_pointer``, you
must set ``limit`` to the number of results to return per page; the ``limit`` attribute can be either a number
or a JSON Pointer. Optionally, set ``use_page`` to ``True`` to calculate a 'page' parameter instead of
'limit' and 'offset' parameters.
1. Write a ``start_requests`` method that requests the initial URL, with the request's callback set to
``self.parse_list``, as shown in the example below.
If neither ``total_pages_pointer`` nor ``count_pointer`` can be used to create the URLs (e.g. if you need to query
a separate URL that does not return JSON), you can provide a custom range of parameters by defining a
``range_generator`` method, which should return page or offset numbers. You also need to define a ``url_builder``
method, which receives each page/offset generated by ``range_generator``. See the ``kenya_makueni`` spider for an
example.
The names of the 'page', 'limit' and 'offset' GET parameters to include in the URLs are customizable: define the
``param_page``, ``param_limit`` and ``param_offset`` class attributes to set custom names. Any additional GET
parameters can be added by defining ``additional_params``, which should be a dictionary.
The base URL is taken from the first URL yielded by ``start_requests``. If you need a different URL for the pages,
define the ``base_url`` class attribute.
By default, the content received in ``parse_list`` is also yielded. To avoid this, set ``yield_list_results``
to ``False``.
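For example, a minimal sketch of a spider using ``total_pages_pointer`` (the URL and JSON layout are hypothetical,
not a real API):

.. code-block:: python

    import scrapy

    from kingfisher_scrapy.base_spider import IndexSpider
    from kingfisher_scrapy.util import parameters


    class Hypothetical(IndexSpider):
        name = 'hypothetical'
        data_type = 'release_package'

        # Assumes the first response looks like {"pages": 10, "releases": [...]}.
        total_pages_pointer = '/pages'
        formatter = staticmethod(parameters('page'))

        def start_requests(self):
            # The first page sets the base URL and reports the total page count;
            # parse_list then yields requests for pages 2..10 at once.
            url = 'https://example.com/api/packages.json?page=1'
            yield scrapy.Request(url, meta={'file_name': 'page-1.json'}, callback=self.parse_list)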
"""

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

self.param_page = getattr(self, 'param_page', 'page')
self.param_limit = getattr(self, 'param_limit', 'limit')
self.param_offset = getattr(self, 'param_offset', 'offset')
self.additional_params = getattr(self, 'additional_params', {})
self.base_url = getattr(self, 'base_url', '')
self.yield_list_results = getattr(self, 'yield_list_results', True)

if hasattr(self, 'total_pages_pointer') and self.total_pages_pointer:
self.range_generator = self.pages_from_total_range_generator
if not hasattr(self, 'url_builder'):
self.url_builder = self.pages_url_builder
elif hasattr(self, 'count_pointer') and self.count_pointer:
if hasattr(self, 'use_page') and self.use_page:
self.range_generator = self.page_size_range_generator
if not hasattr(self, 'url_builder'):
self.url_builder = self.pages_url_builder
else:
self.range_generator = self.limit_offset_range_generator
if not hasattr(self, 'url_builder'):
self.url_builder = self.limit_offset_url_builder

@handle_http_error
def parse_list(self, response, **kwargs):
if self.yield_list_results:
yield from self.parse(response)
if not self.base_url:
self._set_base_url(response.request.url)
try:
data = json.loads(response.text)
except json.JSONDecodeError:
data = None
if self.sample:
return
for generated_params in self.range_generator(data, response):
yield self.build_request(self.url_builder(generated_params, data, response), formatter=self.formatter,
**kwargs)

def pages_from_total_range_generator(self, data, response):
pages = resolve_pointer(data, self.total_pages_pointer)
return range(2, pages + 1)

def pages_url_builder(self, params, data, response):
return self._build_url({
self.param_page: params
})

def limit_offset_range_generator(self, data, response):
limit = self._resolve_limit(data)
count = resolve_pointer(data, self.count_pointer)
return range(limit, count, limit)

def limit_offset_url_builder(self, params, data, response):
return self._build_url({
self.param_limit: self.limit,
self.param_offset: params
})

def page_size_range_generator(self, data, response):
limit = self._resolve_limit(data)
count = resolve_pointer(data, self.count_pointer)
return range(2, ceil(count / limit) + 1)

def _resolve_limit(self, data):
if isinstance(self.limit, str) and self.limit.startswith('/'):
return resolve_pointer(data, self.limit)
return int(self.limit)

def _set_base_url(self, url):
self.base_url = util.replace_parameters(url, page=None, limit=None, offset=None)

def _build_url(self, params):
url_params = params.copy()
url_params.update(self.additional_params)
return util.replace_parameters(self.base_url, **url_params)
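
A brief sketch of how ``_build_url`` composes a URL, under hypothetical values (the exact query-string ordering
depends on ``util.replace_parameters``):

from kingfisher_scrapy import util

base_url = 'https://example.com/api/releases.json'  # hypothetical, as set by _set_base_url
params = {'page': 3}  # the pagination parameter built by the url_builder
params.update({'pageSize': 10})  # merged in from additional_params
print(util.replace_parameters(base_url, **params))
# e.g. https://example.com/api/releases.json?page=3&pageSize=10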
8 changes: 4 additions & 4 deletions kingfisher_scrapy/spiders/armenia.py
@@ -1,7 +1,7 @@
import scrapy

from kingfisher_scrapy.base_spider import LinksSpider
from kingfisher_scrapy.util import get_parameter_value, parameters, replace_parameter
from kingfisher_scrapy.util import get_parameter_value, parameters, replace_parameters

MILLISECONDS_PER_DAY = 86400000
EXPONENT_LIMIT = 10 # 1024 days
Expand Down Expand Up @@ -65,7 +65,7 @@ def parse_date_range(self, response):
# Otherwise, continue.
else:
new_offset = min(first_offset + MILLISECONDS_PER_DAY * 2 ** exponent, start_time)
url = replace_parameter(response.request.url, 'offset', new_offset)
url = replace_parameters(response.request.url, offset=new_offset)
yield self._build_request(url, self.parse_date_range, {'prev': offset, 'exponent': exponent,
'first': first_offset})

@@ -93,10 +93,10 @@ def parse_binary_search(self, response, minimum=None, maximum=None):
# If the last request used the offset, we can reuse its response.
yield from self.parse(response)
else:
url = replace_parameter(response.request.url, 'offset', maximum)
url = replace_parameters(response.request.url, offset=maximum)
yield self._build_request(url, self.parse, {})
else:
url = replace_parameter(response.request.url, 'offset', (minimum + maximum) // 2)
url = replace_parameters(response.request.url, offset=(minimum + maximum) // 2)
yield self._build_request(url, self.parse_binary_search, {'minimum': minimum, 'maximum': maximum,
'first': first_offset})

26 changes: 7 additions & 19 deletions kingfisher_scrapy/spiders/canada_montreal.py
@@ -1,12 +1,10 @@
import json

import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import handle_http_error, parameters, replace_parameter
from kingfisher_scrapy.base_spider import IndexSpider
from kingfisher_scrapy.util import parameters


class CanadaMontreal(SimpleSpider):
class CanadaMontreal(IndexSpider):
"""
API documentation
http://donnees.ville.montreal.qc.ca/dataset/contrats-et-subventions-api
@@ -16,21 +14,11 @@ class CanadaMontreal(SimpleSpider):
"""
name = 'canada_montreal'
data_type = 'release_package'
step = 10000
limit = 10000
ocds_version = '1.0'
count_pointer = '/meta/count'
formatter = staticmethod(parameters('offset'))

def start_requests(self):
url = 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit={step}'.format(step=self.step)
url = 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit={step}'.format(step=self.limit)
yield scrapy.Request(url, meta={'file_name': 'offset-0.json'}, callback=self.parse_list)

@handle_http_error
def parse_list(self, response):
yield from self.parse(response)

if not self.sample:
data = json.loads(response.text)
offset = data['meta']['pagination']['limit']
total = data['meta']['count']
for offset in range(offset, total, self.step):
url = replace_parameter(response.request.url, 'offset', offset)
yield self.build_request(url, formatter=parameters('offset'))
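
A worked sketch of the limit/offset flow this migration relies on (the count is hypothetical): with
``limit = 10000`` and a first response reporting a ``/meta/count`` of 25000, ``limit_offset_range_generator``
yields the offsets for the follow-up requests.

limit = 10000
count = 25000  # hypothetical value at /meta/count
print(list(range(limit, count, limit)))  # [10000, 20000]
# i.e. two more requests: ...?limit=10000&offset=10000 and ...?limit=10000&offset=20000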
34 changes: 17 additions & 17 deletions kingfisher_scrapy/spiders/chile_base.py
@@ -1,17 +1,20 @@
import json
from datetime import date

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.base_spider import IndexSpider
from kingfisher_scrapy.util import components, date_range_by_month, handle_http_error


class ChileCompraBaseSpider(SimpleSpider):
class ChileCompraBaseSpider(IndexSpider):
custom_settings = {
'DOWNLOAD_FAIL_ON_DATALOSS': False,
}

limit = 100
base_list_url = 'https://apis.mercadopublico.cl/OCDS/data/listaA%C3%B1oMes/{0.year:d}/{0.month:02d}/{1}/{2}'
formatter = staticmethod(components(-4, -1))
count_pointer = '/pagination/total'
yield_list_results = False

def start_requests(self):
today = date.today()
@@ -40,7 +43,7 @@ def start_requests(self):
)

@handle_http_error
def parse_list(self, response):
def parse_list(self, response, **kwargs):
data = json.loads(response.text)
# Some files contain invalid packages, e.g.:
# {
@@ -52,6 +55,12 @@ def parse_list(self, response):
yield self.build_file_error_from_response(response, errors=data)
return

kwargs['callback'] = self.parse_items
yield from super().parse_list(response, **kwargs)
yield from self.parse_items(response)

def parse_items(self, response):
data = json.loads(response.text)
for item in data['data']:
# An item looks like:
#
@@ -63,17 +72,8 @@ def parse_list(self, response):
# }
yield from self.handle_item(item)

if 'pagination' in data and (data['pagination']['offset'] + self.limit) < data['pagination']['total']\
and not self.sample:
year = response.request.meta['year']
month = response.request.meta['month']
offset = data['pagination']['offset']
yield self.build_request(
self.base_list_url.format(date(year, month, 1), offset + self.limit, self.limit),
formatter=components(-4, -1),
meta={
'year': year,
'month': month,
},
callback=self.parse_list
)
def url_builder(self, params, data, response):
year = response.request.meta['year']
month = response.request.meta['month']

return self.base_list_url.format(date(year, month, 1), params, self.limit)
9 changes: 5 additions & 4 deletions kingfisher_scrapy/spiders/honduras_portal_base.py
@@ -1,12 +1,13 @@
import scrapy

from kingfisher_scrapy.base_spider import LinksSpider
from kingfisher_scrapy.base_spider import IndexSpider
from kingfisher_scrapy.util import parameters


class HondurasPortalBase(LinksSpider):
class HondurasPortalBase(IndexSpider):
next_pointer = '/next'
next_page_formatter = staticmethod(parameters('page'))
formatter = staticmethod(parameters('page'))
total_pages_pointer = '/pages'
publishers = ['oncae', 'sefin']

download_delay = 0.9
@@ -23,4 +24,4 @@ def start_requests(self):
url = self.url
if self.publisher:
url = url + '&publisher=' + self.publisher
yield scrapy.Request(url, meta={'file_name': 'page-1.json'})
yield scrapy.Request(url, meta={'file_name': 'page-1.json'}, callback=self.parse_list)
24 changes: 14 additions & 10 deletions kingfisher_scrapy/spiders/kenya_makueni.py
@@ -2,11 +2,11 @@

import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import handle_http_error, parameters
from kingfisher_scrapy.base_spider import IndexSpider
from kingfisher_scrapy.util import parameters


class KenyaMakueni(SimpleSpider):
class KenyaMakueni(IndexSpider):
"""
Swagger API documentation
https://opencontracting.makueni.go.ke/swagger-ui.html#/ocds-controller
@@ -17,8 +17,12 @@ class KenyaMakueni(SimpleSpider):
name = 'kenya_makueni'
data_type = 'release_package_list'
step = 10
additional_params = {'pageSize': step}
yield_list_results = False
param_page = 'pageNumber'
formatter = staticmethod(parameters('pageNumber'))

url = 'https://opencontracting.makueni.go.ke/api/ocds/package/all?pageSize={step}&pageNumber={page}'
base_url = 'https://opencontracting.makueni.go.ke/api/ocds/package/all?pageSize={step}&pageNumber={page}'

def start_requests(self):
if self.sample:
@@ -28,12 +32,12 @@ def start_requests(self):
yield scrapy.Request(
'https://opencontracting.makueni.go.ke/api/ocds/release/count',
meta={'file_name': 'count.json'},
callback=self.parse_count
callback=self.parse_list
)

@handle_http_error
def parse_count(self, response):
def range_generator(self, data, response):
total = int(response.text)
for page in range(ceil(total / self.step)):
url = self.url.format(step=self.step, page=page)
yield self.build_request(url, formatter=parameters('pageNumber'))
return range(ceil(total / self.step))

def url_builder(self, params, data, response):
return self.pages_url_builder(params, data, response)
26 changes: 7 additions & 19 deletions kingfisher_scrapy/spiders/mexico_administracion_publica_federal.py
@@ -1,13 +1,10 @@
import json
from math import ceil

import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import handle_http_error, parameters, replace_parameter
from kingfisher_scrapy.base_spider import IndexSpider
from kingfisher_scrapy.util import parameters


class MexicoAdministracionPublicaFederal(SimpleSpider):
class MexicoAdministracionPublicaFederal(IndexSpider):
"""
Bulk download documentation
https://datos.gob.mx/busca/dataset/concentrado-de-contrataciones-abiertas-de-la-apf
@@ -17,20 +14,11 @@ class MexicoAdministracionPublicaFederal(SimpleSpider):
"""
name = 'mexico_administracion_publica_federal'
data_type = 'record_package_list_in_results'
count_pointer = '/pagination/total'
limit = '/pagination/pageSize'
use_page = True
formatter = staticmethod(parameters('page'))

def start_requests(self):
url = 'https://api.datos.gob.mx/v1/contratacionesabiertas'
yield scrapy.Request(url, meta={'file_name': 'page-1.json'}, callback=self.parse_list)

@handle_http_error
def parse_list(self, response):
yield from self.parse(response)

if not self.sample:
data = json.loads(response.text)
page = data['pagination']['page']
total = data['pagination']['total']
limit = data['pagination']['pageSize']
for page in range(page + 1, ceil(total / limit)):
url = replace_parameter(response.request.url, 'page', page)
yield self.build_request(url, formatter=parameters('page'))
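
A worked sketch of the ``use_page`` mode configured above (values hypothetical): if the first response reports a
total of 1000 results with a page size of 100, ``page_size_range_generator`` yields pages 2 through 10.

from math import ceil

count = 1000  # hypothetical value at /pagination/total
limit = 100  # hypothetical value at /pagination/pageSize
print(list(range(2, ceil(count / limit) + 1)))  # [2, 3, ..., 10]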
