Merge branch 'master' into command-sample
jpmckinney committed Oct 21, 2020
2 parents 77c2b15 + 3bf61bf commit a8e493e
Showing 9 changed files with 26 additions and 32 deletions.
5 changes: 1 addition & 4 deletions docs/index.rst
@@ -27,12 +27,9 @@ By default, these files are written to a ``data`` directory (you can :ref:`chang
└── zambia
└── 20200102_030405
├── <...>.json
├── <...>.fileinfo
└── <...>
As you can see, the ``data`` directory contains a ``zambia`` spider directory (matching the spider's name), which in turn contains a ``20200102_030405`` crawl directory (matching the time at which you started the crawl – in this case, 2020-01-02 03:04:05).

The crawl's directory will contain ``.json`` and ``.fileinfo`` files. The JSON files are the OCDS data. Each ``.fileinfo`` file contains metadata about a corresponding JSON file: the URL at which the JSON file was retrieved, along with other details.
As you can see, the ``data`` directory contains a ``zambia`` spider directory (matching the spider's name), which in turn contains a ``20200102_030405`` crawl directory (matching the time at which you started the crawl – in this case, 2020-01-02 03:04:05). The crawl directory contains ``.json`` files – the OCDS data.

.. toctree::
:caption: Contents
25 changes: 12 additions & 13 deletions kingfisher_scrapy/base_spider.py
@@ -565,15 +565,15 @@ class IndexSpider(SimpleSpider):
response data. The API will add the 'page' GET parameter to the URL in the subsequent requests.
1. Set ``count_pointer`` to point to the JSON element with the total number of results. If you use
``count_pointer``, you must set ``limit`` to indicate the number of results to return for each page. The
``limit`` attribute can either a number or a JSON pointer. Optionally, set ``use_page`` to ``True``
to calculate a 'page' parameter instead of the 'limit' and 'offset'.
``limit`` attribute can be either a number or a JSON pointer. Optionally, set ``use_page`` to ``True`` to
calculate a 'page' parameter instead of the 'limit' and 'offset'.
1. Write a ``start_request`` method with a request to the initial URL. The request's callback should be set to
``self.parse_list``.
If neither ``total_pages_counter`` nor ``count_pointer`` can be used to create the URLs (e.g. if you need to query
If neither ``total_pages_pointer`` nor ``count_pointer`` can be used to create the URLs (e.g. if you need to query
a separate URL that does not return JSON), you can provide a custom range of parameters defining the
``range_generator`` method. This method should return page or offset numbers. You also need to define a
``build_url`` method, that receives the pages/offset generated by ``range_generator``. See the ``kenya_makueni``
``url_builder`` method, that receives the pages/offset generated by ``range_generator``. See the ``kenya_makueni``
spider for an example.
The names of the GET parameters 'page', 'limit' and 'offset' to include in the URLS are customizable. Define the
@@ -585,7 +585,6 @@ class IndexSpider(SimpleSpider):
By default the content received in ``parse_list`` is yielded. If you want to avoid this, set ``yield_list_results``
to ``False``.
"""

def __init__(self, *args, **kwargs):
@@ -622,34 +621,34 @@ def parse_list(self, response, **kwargs):
data = json.loads(response.text)
except json.JSONDecodeError:
data = None
for generated_params in self.range_generator(data, response):
yield self.build_request(self.url_builder(generated_params, data, response), formatter=self.formatter,
**kwargs)
for value in self.range_generator(data, response):
yield self.build_request(self.url_builder(value, data, response), formatter=self.formatter, **kwargs)

def pages_from_total_range_generator(self, data, response):
pages = resolve_pointer(data, self.total_pages_pointer)
return range(2, pages + 1)

def pages_url_builder(self, params, data, response):
def pages_url_builder(self, value, data, response):
return self._build_url({
self.param_page: params
self.param_page: value,
})

def limit_offset_range_generator(self, data, response):
limit = self._resolve_limit(data)
count = resolve_pointer(data, self.count_pointer)
return range(self.limit, count, limit)

def limit_offset_url_builder(self, params, data, response):
def limit_offset_url_builder(self, value, data, response):
return self._build_url({
self.param_limit: self.limit,
self.param_offset: params
self.param_offset: value,
})

def page_size_range_generator(self, data, response):
limit = self._resolve_limit(data)
count = resolve_pointer(data, self.count_pointer)
return range(2, (ceil(count/limit))+1)
# Assumes the first page is page 1, not page 0.
return range(2, ceil(count / limit) + 1)

def _resolve_limit(self, data):
if isinstance(self.limit, str) and self.limit.startswith('/'):
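The ``IndexSpider`` docstring above describes three ways to drive the paging. As a minimal sketch (not part of this diff), a subclass using ``count_pointer`` with a fixed ``limit`` might look like the following; it is modelled loosely on the ``canada_montreal`` spider further down, and the spider name, URL, JSON pointer and import path marked below are assumptions, not code from this repository.

```python
import scrapy

from kingfisher_scrapy.base_spider import IndexSpider
from kingfisher_scrapy.util import parameters  # assumed import path


class ExampleCountSpider(IndexSpider):
    """
    Hypothetical spider: pages through an API using ``count_pointer`` and a fixed ``limit``.
    """
    name = 'example_count'
    data_type = 'release_package'

    # The total number of results is read from /count in the first response;
    # parse_list then yields one follow-up request per limit/offset window.
    count_pointer = '/count'
    limit = 100
    formatter = staticmethod(parameters('offset'))

    def start_requests(self):
        # The initial request must be handled by parse_list, as the docstring requires.
        url = 'https://example.com/api/releases.json?limit=100'  # hypothetical URL
        yield scrapy.Request(url, meta={'file_name': 'offset-0.json'}, callback=self.parse_list)
```

With these attributes, ``limit_offset_range_generator`` above starts at ``self.limit``, so a count of 250 would yield follow-up requests for offset 100 and offset 200 only; the first window is covered by the initial request.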
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/canada_montreal.py
Expand Up @@ -17,5 +17,5 @@ class CanadaMontreal(IndexSpider):
formatter = staticmethod(parameters('offset'))

def start_requests(self):
url = 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit={step}'.format(step=self.limit)
url = f'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit={self.limit}'
yield scrapy.Request(url, meta={'file_name': 'offset-0.json'}, callback=self.parse_list)
4 changes: 2 additions & 2 deletions kingfisher_scrapy/spiders/chile_base.py
@@ -69,8 +69,8 @@ def parse_items(self, response):
# }
yield from self.handle_item(item)

def url_builder(self, params, data, response):
def url_builder(self, value, data, response):
year = response.request.meta['year']
month = response.request.meta['month']

return self.base_list_url.format(date(year, month, 1), params, self.limit)
return self.base_list_url.format(date(year, month, 1), value, self.limit)
3 changes: 1 addition & 2 deletions kingfisher_scrapy/spiders/honduras_oncae.py
@@ -1,4 +1,3 @@
from os.path import split
from urllib.parse import urlparse

import scrapy
@@ -46,7 +45,7 @@ def start_requests(self):
def parse_list(self, response):
urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
for url in urls:
path, file = split(urlparse(url).path)
path, file = urlparse(url).path.rsplit('/', 1)
current_system = path.replace('/datosabiertos/', "")
if self.system and current_system != self.system:
continue
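As a side note, a quick check (with a made-up URL of the same shape as the paths handled above) that the ``rsplit`` call introduced here behaves like the ``os.path.split`` call it replaces:

```python
from os.path import split
from urllib.parse import urlparse

url = 'https://example.hn/datosabiertos/HC1/example.json'  # hypothetical URL
path = urlparse(url).path

# Both split the path into its directory and file name parts.
assert split(path) == ('/datosabiertos/HC1', 'example.json')
assert path.rsplit('/', 1) == ['/datosabiertos/HC1', 'example.json']
```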
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/honduras_portal_base.py
@@ -23,5 +23,5 @@ def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
def start_requests(self):
url = self.url
if self.publisher:
url = url + '&publisher=' + self.publisher
url = f'{url}&publisher={self.publisher}'
yield scrapy.Request(url, meta={'file_name': 'page-1.json'}, callback=self.parse_list)
6 changes: 3 additions & 3 deletions kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -26,10 +26,10 @@ class HondurasPortalBulkFiles(SimpleSpider):
@classmethod
def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
spider = super().from_crawler(crawler, publisher=publisher, *args, **kwargs)
if publisher and publisher not in spider.publishers.keys():
if publisher and publisher not in spider.publishers:
raise scrapy.exceptions.CloseSpider('Specified publisher is not recognized')

spider.publisher_filter = spider.publishers.get(publisher)
spider.publisher_name = spider.publishers.get(publisher)

return spider

@@ -44,7 +44,7 @@ def parse_list(self, response):
def parse_list(self, response):
items = json.loads(response.text)
for item in items:
if self.publisher and self.publisher_filter not in item['publicador']:
if self.publisher and self.publisher_name not in item['publicador']:
continue
url = item['urls']['json']
yield self.build_request(url, formatter=components(-1))
9 changes: 4 additions & 5 deletions kingfisher_scrapy/spiders/kenya_makueni.py
@@ -13,13 +13,13 @@ class KenyaMakueni(IndexSpider):
"""
name = 'kenya_makueni'
data_type = 'release_package_list'
step = 10
additional_params = {'pageSize': step}
limit = 10
additional_params = {'pageSize': limit}
yield_list_results = False
param_page = 'pageNumber'
formatter = staticmethod(parameters('pageNumber'))

base_url = 'https://opencontracting.makueni.go.ke/api/ocds/package/all?pageSize={step}&pageNumber={page}'
base_url = 'https://opencontracting.makueni.go.ke/api/ocds/package/all?pageSize={limit}&pageNumber={page}'

def start_requests(self):
yield scrapy.Request(
@@ -29,8 +29,7 @@ def start_requests(self):
)

def range_generator(self, data, response):
total = int(response.text)
return range(ceil(total / self.step))
return range(ceil(int(response.text) / self.limit))

def url_builder(self, params, data, response):
return self.pages_url_builder(params, data, response)
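A quick sanity check of the paging arithmetic above, with hypothetical numbers: a plain-text count of 95 and ``limit = 10`` produce requests for ``pageNumber`` 0 through 9.

```python
from math import ceil

limit = 10   # pageSize
total = 95   # hypothetical plain-text count returned by the initial request

# ceil(95 / 10) == 10 pages, numbered from 0.
assert list(range(ceil(total / limit))) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
```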
2 changes: 1 addition & 1 deletion tests/test_util.py
@@ -8,7 +8,7 @@
('http://example.com/?page=1', None, 'http://example.com/'),
('http://example.com/', None, 'http://example.com/'),
])
def test_replace_parameter(url, value, expected):
def test_replace_parameters(url, value, expected):
assert replace_parameters(url, page=value) == expected

