Merge c4182b6 into 279e79c

open-contracting · Sep 17, 2020 · 12a5a97 · 12a5a97
2 parents 279e79c + c4182b6
commit 12a5a97
Show file tree

Hide file tree

Showing 4 changed files with 66 additions and 29 deletions.
diff --git a/kingfisher_scrapy/spiders/honduras_portal_base.py b/kingfisher_scrapy/spiders/honduras_portal_base.py
@@ -0,0 +1,26 @@
+import scrapy
+
+from kingfisher_scrapy.base_spider import LinksSpider
+from kingfisher_scrapy.util import parameters
+
+
+class HondurasPortalBase(LinksSpider):
+    next_pointer = '/next'
+    next_page_formatter = staticmethod(parameters('page'))
+    publishers = ['oncae', 'sefin']
+
+    download_delay = 0.9
+
+    @classmethod
+    def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
+        spider = super().from_crawler(crawler, publisher=publisher, *args, **kwargs)
+        if publisher and publisher not in spider.publishers:
+            raise scrapy.exceptions.CloseSpider('Specified publisher is not recognized')
+
+        return spider
+
+    def start_requests(self):
+        url = self.url
+        if self.publisher:
+            url = url + '&publisher=' + self.publisher
+        yield scrapy.Request(url, meta={'file_name': 'page-1.json'})
diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -11,12 +11,31 @@ class HondurasPortalBulkFiles(SimpleSpider):
     Bulk download documentation
       http://www.contratacionesabiertas.gob.hn/descargas/
     Spider arguments
+      publisher
+        Filter the data by a specific publisher.
+        ``oncae`` for "Oficina Normativa de Contratación y Adquisiciones del Estado" publisher.
+        ``sefin`` for "Secretaria de Finanzas de Honduras" publisher.
       sample
         Downloads the first package listed in http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json.
+        If ``publisher`` is also provided, a single package is downloaded from that publisher.
     """
     name = 'honduras_portal_bulk_files'
     data_type = 'release_package'
     skip_pluck = 'Already covered (see code for details)'  # honduras_portal_releases
+    publishers = ['oncae', 'sefin']
+
+    @classmethod
+    def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
+        spider = super().from_crawler(crawler, publisher=publisher, *args, **kwargs)
+        if publisher and publisher not in spider.publishers:
+            raise scrapy.exceptions.CloseSpider('Specified publisher is not recognized')
+
+        if publisher == 'oncae':
+            spider.publisher_filter = 'ONCAE'
+        elif publisher == 'sefin':
+            spider.publisher_filter = 'Secretaria de Finanzas'
+
+        return spider
 
     def start_requests(self):
         yield scrapy.Request(
@@ -28,9 +47,11 @@ def start_requests(self):
     @handle_http_error
     def parse_list(self, response):
         items = json.loads(response.text)
-        if self.sample:
-            items = [items[0]]
-
         for item in items:
+            if self.publisher and self.publisher_filter not in item['publicador']:
+                continue
             url = item['urls']['json']
             yield self.build_request(url, formatter=components(-1))
+
+            if self.sample:
+                return
diff --git a/kingfisher_scrapy/spiders/honduras_portal_records.py b/kingfisher_scrapy/spiders/honduras_portal_records.py
@@ -1,28 +1,23 @@
-import scrapy
+from kingfisher_scrapy.spiders.honduras_portal_base import HondurasPortalBase
 
-from kingfisher_scrapy.base_spider import LinksSpider
-from kingfisher_scrapy.util import parameters
 
-
-class HondurasPortalRecords(LinksSpider):
+class HondurasPortalRecords(HondurasPortalBase):
     """
     API documentation
       http://www.contratacionesabiertas.gob.hn/manual_api/
     Swagger API documentation
       http://www.contratacionesabiertas.gob.hn/servicio/
     Spider arguments
+      publisher
+        Filter the data by a specific publisher.
+        ``oncae`` for "Oficina Normativa de Contratación y Adquisiciones del Estado" publisher.
+        ``sefin`` for "Secretaria de Finanzas de Honduras" publisher.
       sample
         Download only the first record package in the dataset.
+        If ``publisher`` is also provided, a single package is downloaded from that publisher.
     """
     name = 'honduras_portal_records'
     data_type = 'record_package'
     data_pointer = '/recordPackage'
-    next_pointer = '/next'
-    next_page_formatter = staticmethod(parameters('page'))
     skip_pluck = 'Already covered (see code for details)'  # honduras_portal_releases
-
-    download_delay = 0.9
-
-    def start_requests(self):
-        url = 'http://www.contratacionesabiertas.gob.hn/api/v1/record/?format=json'
-        yield scrapy.Request(url, meta={'file_name': 'page-1.json'})
+    url = 'http://www.contratacionesabiertas.gob.hn/api/v1/record/?format=json'
diff --git a/kingfisher_scrapy/spiders/honduras_portal_releases.py b/kingfisher_scrapy/spiders/honduras_portal_releases.py
@@ -1,27 +1,22 @@
-import scrapy
+from kingfisher_scrapy.spiders.honduras_portal_base import HondurasPortalBase
 
-from kingfisher_scrapy.base_spider import LinksSpider
-from kingfisher_scrapy.util import parameters
 
-
-class HondurasPortalReleases(LinksSpider):
+class HondurasPortalReleases(HondurasPortalBase):
     """
     API documentation
       http://www.contratacionesabiertas.gob.hn/manual_api/
     Swagger API documentation
       http://www.contratacionesabiertas.gob.hn/servicio/
     Spider arguments
+      publisher
+        Filter the data by a specific publisher.
+        ``oncae`` for "Oficina Normativa de Contratación y Adquisiciones del Estado" publisher.
+        ``sefin`` for "Secretaria de Finanzas de Honduras" publisher.
       sample
         Download only the first release package in the dataset.
+        If ``publisher`` is also provided, a single package is downloaded from that publisher.
     """
     name = 'honduras_portal_releases'
     data_type = 'release_package'
     data_pointer = '/releasePackage'
-    next_pointer = '/next'
-    next_page_formatter = staticmethod(parameters('page'))
-
-    download_delay = 0.9
-
-    def start_requests(self):
-        url = 'http://www.contratacionesabiertas.gob.hn/api/v1/release/?format=json'
-        yield scrapy.Request(url, meta={'file_name': 'page-1.json'})
+    url = 'http://www.contratacionesabiertas.gob.hn/api/v1/release/?format=json'