Merge branch 'master' into command-sample
jpmckinney committed Oct 21, 2020
2 parents 77c2b15 + 3bf61bf commit a8e493e
Showing 9 changed files with 26 additions and 32 deletions.
5 changes: 1 addition & 4 deletions docs/index.rst
@@ -27,12 +27,9 @@ By default, these files are written to a ``data`` directory (you can :ref:`chang
└── zambia
└── 20200102_030405
├── <...>.json
├── <...>.fileinfo
└── <...>
As you can see, the ``data`` directory contains a ``zambia`` spider directory (matching the spider's name), which in turn contains a ``20200102_030405`` crawl directory (matching the time at which you started the crawl – in this case, 2020-01-02 03:04:05).

The crawl's directory will contain ``.json`` and ``.fileinfo`` files. The JSON files are the OCDS data. Each ``.fileinfo`` file contains metadata about a corresponding JSON file: the URL at which the JSON file was retrieved, along with other details.
As you can see, the ``data`` directory contains a ``zambia`` spider directory (matching the spider's name), which in turn contains a ``20200102_030405`` crawl directory (matching the time at which you started the crawl – in this case, 2020-01-02 03:04:05). The crawl directory contains ``.json`` files – the OCDS data.

.. toctree::
:caption: Contents
25 changes: 12 additions & 13 deletions kingfisher_scrapy/base_spider.py
@@ -565,15 +565,15 @@ class IndexSpider(SimpleSpider):
response data. The API will add the 'page' GET parameter to the URL in the subsequent requests.
1. Set ``count_pointer`` to point to the JSON element with the total number of results. If you use
``count_pointer``, you must set ``limit`` to indicate the number of results to return for each page. The
``limit`` attribute can either a number or a JSON pointer. Optionally, set ``use_page`` to ``True``
to calculate a 'page' parameter instead of the 'limit' and 'offset'.
``limit`` attribute can be either a number or a JSON pointer. Optionally, set ``use_page`` to ``True`` to
calculate a 'page' parameter instead of the 'limit' and 'offset'.
1. Write a ``start_request`` method with a request to the initial URL. The request's callback should be set to
``self.parse_list``.
If neither ``total_pages_counter`` nor ``count_pointer`` can be used to create the URLs (e.g. if you need to query
If neither ``total_pages_pointer`` nor ``count_pointer`` can be used to create the URLs (e.g. if you need to query
a separate URL that does not return JSON), you can provide a custom range of parameters defining the
``range_generator`` method. This method should return page or offset numbers. You also need to define a
``build_url`` method, that receives the pages/offset generated by ``range_generator``. See the ``kenya_makueni``
``url_builder`` method, that receives the pages/offset generated by ``range_generator``. See the ``kenya_makueni``
spider for an example.
The names of the GET parameters 'page', 'limit' and 'offset' to include in the URLS are customizable. Define the
@@ -585,7 +585,6 @@ class IndexSpider(SimpleSpider):
By default the content received in ``parse_list`` is yielded. If you want to avoid this, set ``yield_list_results``
to ``False``.
"""

def __init__(self, *args, **kwargs):
@@ -622,34 +621,34 @@ def parse_list(self, response, **kwargs):
data = json.loads(response.text)
except json.JSONDecodeError:
data = None
for generated_params in self.range_generator(data, response):
yield self.build_request(self.url_builder(generated_params, data, response), formatter=self.formatter,
**kwargs)
for value in self.range_generator(data, response):
yield self.build_request(self.url_builder(value, data, response), formatter=self.formatter, **kwargs)

def pages_from_total_range_generator(self, data, response):
pages = resolve_pointer(data, self.total_pages_pointer)
return range(2, pages + 1)

def pages_url_builder(self, params, data, response):
def pages_url_builder(self, value, data, response):
return self._build_url({
self.param_page: params
self.param_page: value,
})

def limit_offset_range_generator(self, data, response):
limit = self._resolve_limit(data)
count = resolve_pointer(data, self.count_pointer)
return range(self.limit, count, limit)

def limit_offset_url_builder(self, params, data, response):
def limit_offset_url_builder(self, value, data, response):
return self._build_url({
self.param_limit: self.limit,
self.param_offset: params
self.param_offset: value,
})

def page_size_range_generator(self, data, response):
limit = self._resolve_limit(data)
count = resolve_pointer(data, self.count_pointer)
return range(2, (ceil(count/limit))+1)
# Assumes the first page is page 1, not page 0.
return range(2, ceil(count / limit) + 1)

def _resolve_limit(self, data):
if isinstance(self.limit, str) and self.limit.startswith('/'):
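The ``IndexSpider`` docstring above describes three ways to drive the paging. As a minimal sketch (not part of this diff), a subclass using ``count_pointer`` with a fixed ``limit`` might look like the following; it is modelled loosely on the ``canada_montreal`` spider further down, and the spider name, URL, JSON pointer and import path marked below are assumptions, not code from this repository.

```python
import scrapy

from kingfisher_scrapy.base_spider import IndexSpider
from kingfisher_scrapy.util import parameters  # assumed import path


class ExampleCountSpider(IndexSpider):
    """
    Hypothetical spider: pages through an API using ``count_pointer`` and a fixed ``limit``.
    """
    name = 'example_count'
    data_type = 'release_package'

    # The total number of results is read from /count in the first response;
    # parse_list then yields one follow-up request per limit/offset window.
    count_pointer = '/count'
    limit = 100
    formatter = staticmethod(parameters('offset'))

    def start_requests(self):
        # The initial request must be handled by parse_list, as the docstring requires.
        url = 'https://example.com/api/releases.json?limit=100'  # hypothetical URL
        yield scrapy.Request(url, meta={'file_name': 'offset-0.json'}, callback=self.parse_list)
```

With these attributes, ``limit_offset_range_generator`` above starts at ``self.limit``, so a count of 250 would yield follow-up requests for offset 100 and offset 200 only; the first window is covered by the initial request.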
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/canada_montreal.py
Expand Up @@ -17,5 +17,5 @@ class CanadaMontreal(IndexSpider):
formatter = staticmethod(parameters('offset'))

def start_requests(self):
url = 'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit={step}'.format(step=self.limit)
url = f'https://ville.montreal.qc.ca/vuesurlescontrats/api/releases.json?limit={self.limit}'
yield scrapy.Request(url, meta={'file_name': 'offset-0.json'}, callback=self.parse_list)
4 changes: 2 additions & 2 deletions kingfisher_scrapy/spiders/chile_base.py
@@ -69,8 +69,8 @@ def parse_items(self, response):
# }
yield from self.handle_item(item)

def url_builder(self, params, data, response):
def url_builder(self, value, data, response):
year = response.request.meta['year']
month = response.request.meta['month']

return self.base_list_url.format(date(year, month, 1), params, self.limit)
return self.base_list_url.format(date(year, month, 1), value, self.limit)
3 changes: 1 addition & 2 deletions kingfisher_scrapy/spiders/honduras_oncae.py
@@ -1,4 +1,3 @@
from os.path import split
from urllib.parse import urlparse

import scrapy
@@ -46,7 +45,7 @@ def start_requests(self):
def parse_list(self, response):
urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
for url in urls:
path, file = split(urlparse(url).path)
path, file = urlparse(url).path.rsplit('/', 1)
current_system = path.replace('/datosabiertos/', "")
if self.system and current_system != self.system:
continue
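As a side note, a quick check (with a made-up URL of the same shape as the paths handled above) that the ``rsplit`` call introduced here behaves like the ``os.path.split`` call it replaces:

```python
from os.path import split
from urllib.parse import urlparse

url = 'https://example.hn/datosabiertos/HC1/example.json'  # hypothetical URL
path = urlparse(url).path

# Both split the path into its directory and file name parts.
assert split(path) == ('/datosabiertos/HC1', 'example.json')
assert path.rsplit('/', 1) == ['/datosabiertos/HC1', 'example.json']
```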
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/honduras_portal_base.py
@@ -23,5 +23,5 @@ def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
def start_requests(self):
url = self.url
if self.publisher:
url = url + '&publisher=' + self.publisher
url = f'{url}&publisher={self.publisher}'
yield scrapy.Request(url, meta={'file_name': 'page-1.json'}, callback=self.parse_list)
6 changes: 3 additions & 3 deletions kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -26,10 +26,10 @@ class HondurasPortalBulkFiles(SimpleSpider):
@classmethod
def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
spider = super().from_crawler(crawler, publisher=publisher, *args, **kwargs)
if publisher and publisher not in spider.publishers.keys():
if publisher and publisher not in spider.publishers:
raise scrapy.exceptions.CloseSpider('Specified publisher is not recognized')

spider.publisher_filter = spider.publishers.get(publisher)
spider.publisher_name = spider.publishers.get(publisher)

return spider

@@ -44,7 +44,7 @@ def parse_list(self, response):
def parse_list(self, response):
items = json.loads(response.text)
for item in items:
if self.publisher and self.publisher_filter not in item['publicador']:
if self.publisher and self.publisher_name not in item['publicador']:
continue
url = item['urls']['json']
yield self.build_request(url, formatter=components(-1))
9 changes: 4 additions & 5 deletions kingfisher_scrapy/spiders/kenya_makueni.py
@@ -13,13 +13,13 @@ class KenyaMakueni(IndexSpider):
"""
name = 'kenya_makueni'
data_type = 'release_package_list'
step = 10
additional_params = {'pageSize': step}
limit = 10
additional_params = {'pageSize': limit}
yield_list_results = False
param_page = 'pageNumber'
formatter = staticmethod(parameters('pageNumber'))

base_url = 'https://opencontracting.makueni.go.ke/api/ocds/package/all?pageSize={step}&pageNumber={page}'
base_url = 'https://opencontracting.makueni.go.ke/api/ocds/package/all?pageSize={limit}&pageNumber={page}'

def start_requests(self):
yield scrapy.Request(
@@ -29,8 +29,7 @@ def start_requests(self):
)

def range_generator(self, data, response):
total = int(response.text)
return range(ceil(total / self.step))
return range(ceil(int(response.text) / self.limit))

def url_builder(self, params, data, response):
return self.pages_url_builder(params, data, response)
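A quick sanity check of the paging arithmetic above, with hypothetical numbers: a plain-text count of 95 and ``limit = 10`` produce requests for ``pageNumber`` 0 through 9.

```python
from math import ceil

limit = 10   # pageSize
total = 95   # hypothetical plain-text count returned by the initial request

# ceil(95 / 10) == 10 pages, numbered from 0.
assert list(range(ceil(total / limit))) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
```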
2 changes: 1 addition & 1 deletion tests/test_util.py
@@ -8,7 +8,7 @@
('http://example.com/?page=1', None, 'http://example.com/'),
('http://example.com/', None, 'http://example.com/'),
])
def test_replace_parameter(url, value, expected):
def test_replace_parameters(url, value, expected):
assert replace_parameters(url, page=value) == expected

