diff --git a/docs/index.rst b/docs/index.rst
index c64dd86a..d7645bea 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -27,12 +27,9 @@ By default, these files are written to a ``data`` directory (you can :ref:`chang
     └── zambia
         └── 20200102_030405
             ├── <...>.json
-            ├── <...>.fileinfo
             └── <...>
 
-As you can see, the ``data`` directory contains a ``zambia`` spider directory (matching the spider's name), which in turn contains a ``20200102_030405`` crawl directory (matching the time at which you started the crawl – in this case, 2020-01-02 03:04:05).
-
-The crawl's directory will contain ``.json`` and ``.fileinfo`` files. The JSON files are the OCDS data. Each ``.fileinfo`` file contains metadata about a corresponding JSON file: the URL at which the JSON file was retrieved, along with other details.
+As you can see, the ``data`` directory contains a ``zambia`` spider directory (matching the spider's name), which in turn contains a ``20200102_030405`` crawl directory (matching the time at which you started the crawl – in this case, 2020-01-02 03:04:05). The crawl directory contains ``.json`` files – the OCDS data.
 
 .. toctree::
    :caption: Contents
diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index 8d4248f8..73e2805e 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -565,15 +565,15 @@ class IndexSpider(SimpleSpider):
        response data. The API will add the 'page' GET parameter to the URL in the subsequent requests.
     1. Set ``count_pointer`` to point to the JSON element with the total number of results. If you use
        ``count_pointer``, you must set ``limit`` to indicate the number of results to return for each page. The
-       ``limit`` attribute can either a number or a JSON pointer. Optionally, set ``use_page`` to ``True``
-       to calculate a 'page' parameter instead of the 'limit' and 'offset'.
+       ``limit`` attribute can be either a number or a JSON pointer. Optionally, set ``use_page`` to ``True`` to
+       calculate a 'page' parameter instead of the 'limit' and 'offset'.
     1. Write a ``start_request`` method with a request to the initial URL. The request's callback should be set to
        ``self.parse_list``.
 
-    If neither ``total_pages_counter`` nor ``count_pointer`` can be used to create the URLs (e.g. if you need to query
+    If neither ``total_pages_pointer`` nor ``count_pointer`` can be used to create the URLs (e.g. if you need to query
     a separate URL that does not return JSON), you can provide a custom range of parameters defining the
     ``range_generator`` method. This method should return page or offset numbers. You also need to define a
-    ``build_url`` method, that receives the pages/offset generated by ``range_generator``. See the ``kenya_makueni``
+    ``url_builder`` method, that receives the pages/offset generated by ``range_generator``. See the ``kenya_makueni``
     spider for an example.
 
     The names of the GET parameters 'page', 'limit' and 'offset' to include in the URLS are customizable. Define the
@@ -585,7 +585,6 @@ class IndexSpider(SimpleSpider):
 
     By default the content received in ``parse_list`` is yielded. If you want to avoid this, set
     ``yield_list_results`` to ``False``.
- """ def __init__(self, *args, **kwargs): @@ -622,17 +621,16 @@ def parse_list(self, response, **kwargs): data = json.loads(response.text) except json.JSONDecodeError: data = None - for generated_params in self.range_generator(data, response): - yield self.build_request(self.url_builder(generated_params, data, response), formatter=self.formatter, - **kwargs) + for value in self.range_generator(data, response): + yield self.build_request(self.url_builder(value, data, response), formatter=self.formatter, **kwargs) def pages_from_total_range_generator(self, data, response): pages = resolve_pointer(data, self.total_pages_pointer) return range(2, pages + 1) - def pages_url_builder(self, params, data, response): + def pages_url_builder(self, value, data, response): return self._build_url({ - self.param_page: params + self.param_page: value, }) def limit_offset_range_generator(self, data, response): @@ -640,16 +638,17 @@ def limit_offset_range_generator(self, data, response): count = resolve_pointer(data, self.count_pointer) return range(self.limit, count, limit) - def limit_offset_url_builder(self, params, data, response): + def limit_offset_url_builder(self, value, data, response): return self._build_url({ self.param_limit: self.limit, - self.param_offset: params + self.param_offset: value, }) def page_size_range_generator(self, data, response): limit = self._resolve_limit(data) count = resolve_pointer(data, self.count_pointer) - return range(2, (ceil(count/limit))+1) + # Assumes the first page is page 1, not page 0. + return range(2, ceil(count / limit) + 1) def _resolve_limit(self, data): if isinstance(self.limit, str) and self.limit.startswith('/'): diff --git a/kingfisher_scrapy/spiders/canada_montreal.py b/kingfisher_scrapy/spiders/canada_montreal.py index 15b68d1f..3a726972 100644 --- a/kingfisher_scrapy/spiders/canada_montreal.py +++ b/kingfisher_scrapy/spiders/canada_montreal.py @@ -17,5 +17,5 @@ class CanadaMontreal(IndexSpider): formatter = staticmethod(parameters('offset')) def start_requests(self): - url = 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit={step}'.format(step=self.limit) + url = f'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit={self.limit}' yield scrapy.Request(url, meta={'file_name': 'offset-0.json'}, callback=self.parse_list) diff --git a/kingfisher_scrapy/spiders/chile_base.py b/kingfisher_scrapy/spiders/chile_base.py index 9df58f1f..69762420 100644 --- a/kingfisher_scrapy/spiders/chile_base.py +++ b/kingfisher_scrapy/spiders/chile_base.py @@ -69,8 +69,8 @@ def parse_items(self, response): # } yield from self.handle_item(item) - def url_builder(self, params, data, response): + def url_builder(self, value, data, response): year = response.request.meta['year'] month = response.request.meta['month'] - return self.base_list_url.format(date(year, month, 1), params, self.limit) + return self.base_list_url.format(date(year, month, 1), value, self.limit) diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py index 04e6a9bc..81405995 100644 --- a/kingfisher_scrapy/spiders/honduras_oncae.py +++ b/kingfisher_scrapy/spiders/honduras_oncae.py @@ -1,4 +1,3 @@ -from os.path import split from urllib.parse import urlparse import scrapy @@ -46,7 +45,7 @@ def start_requests(self): def parse_list(self, response): urls = response.xpath('//a[contains(., "[json]")]/@href').getall() for url in urls: - path, file = split(urlparse(url).path) + path, file = urlparse(url).path.rsplit('/', 1) 
             current_system = path.replace('/datosabiertos/', "")
             if self.system and current_system != self.system:
                 continue
diff --git a/kingfisher_scrapy/spiders/honduras_portal_base.py b/kingfisher_scrapy/spiders/honduras_portal_base.py
index 3ebf6474..fe6f6620 100644
--- a/kingfisher_scrapy/spiders/honduras_portal_base.py
+++ b/kingfisher_scrapy/spiders/honduras_portal_base.py
@@ -23,5 +23,5 @@ def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
     def start_requests(self):
         url = self.url
         if self.publisher:
-            url = url + '&publisher=' + self.publisher
+            url = f'{url}&publisher={self.publisher}'
         yield scrapy.Request(url, meta={'file_name': 'page-1.json'}, callback=self.parse_list)
diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
index e83461c8..3024e304 100644
--- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
+++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -26,10 +26,10 @@ class HondurasPortalBulkFiles(SimpleSpider):
     @classmethod
     def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
         spider = super().from_crawler(crawler, publisher=publisher, *args, **kwargs)
-        if publisher and publisher not in spider.publishers.keys():
+        if publisher and publisher not in spider.publishers:
             raise scrapy.exceptions.CloseSpider('Specified publisher is not recognized')
 
-        spider.publisher_filter = spider.publishers.get(publisher)
+        spider.publisher_name = spider.publishers.get(publisher)
 
         return spider
 
@@ -44,7 +44,7 @@ def start_requests(self):
     def parse_list(self, response):
         items = json.loads(response.text)
         for item in items:
-            if self.publisher and self.publisher_filter not in item['publicador']:
+            if self.publisher and self.publisher_name not in item['publicador']:
                 continue
             url = item['urls']['json']
             yield self.build_request(url, formatter=components(-1))
diff --git a/kingfisher_scrapy/spiders/kenya_makueni.py b/kingfisher_scrapy/spiders/kenya_makueni.py
index 1ee88b5c..0289cf10 100644
--- a/kingfisher_scrapy/spiders/kenya_makueni.py
+++ b/kingfisher_scrapy/spiders/kenya_makueni.py
@@ -13,13 +13,13 @@ class KenyaMakueni(IndexSpider):
     """
     name = 'kenya_makueni'
     data_type = 'release_package_list'
-    step = 10
-    additional_params = {'pageSize': step}
+    limit = 10
+    additional_params = {'pageSize': limit}
     yield_list_results = False
     param_page = 'pageNumber'
     formatter = staticmethod(parameters('pageNumber'))
 
-    base_url = 'https://opencontracting.makueni.go.ke/api/ocds/package/all?pageSize={step}&pageNumber={page}'
+    base_url = 'https://opencontracting.makueni.go.ke/api/ocds/package/all?pageSize={limit}&pageNumber={page}'
 
     def start_requests(self):
         yield scrapy.Request(
@@ -29,8 +29,7 @@ def start_requests(self):
         )
 
     def range_generator(self, data, response):
-        total = int(response.text)
-        return range(ceil(total / self.step))
+        return range(ceil(int(response.text) / self.limit))
 
     def url_builder(self, params, data, response):
         return self.pages_url_builder(params, data, response)
diff --git a/tests/test_util.py b/tests/test_util.py
index 93da8889..62c73437 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -8,7 +8,7 @@
     ('http://example.com/?page=1', None, 'http://example.com/'),
     ('http://example.com/', None, 'http://example.com/'),
 ])
-def test_replace_parameter(url, value, expected):
+def test_replace_parameters(url, value, expected):
     assert replace_parameters(url, page=value) == expected
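
Note on the IndexSpider pagination arithmetic touched above: subsequent offsets come from range(limit, count, limit) and subsequent pages from range(2, ceil(count / limit) + 1), since the first request (offset 0 / page 1) is issued by start_requests. A minimal, self-contained sketch of that arithmetic follows; the helper names and the example numbers are illustrative only and are not part of the codebase or any real API.

from math import ceil

def limit_offset_range(count, limit):
    # Offsets for the follow-up requests; offset 0 was already requested,
    # so the range starts at `limit` and stops before `count`.
    return range(limit, count, limit)

def page_range(count, limit):
    # Page numbers for the follow-up requests, assuming the first page is
    # page 1 (already requested), not page 0.
    return range(2, ceil(count / limit) + 1)

# With 95 results and a limit of 10, the remaining requests use offsets
# 10, 20, ..., 90, or equivalently pages 2 through 10.
assert list(limit_offset_range(95, 10)) == [10, 20, 30, 40, 50, 60, 70, 80, 90]
assert list(page_range(95, 10)) == list(range(2, 11))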