Merge pull request #555 from open-contracting/346-add-date-args
Update Honduras scrapers
yolile committed Nov 17, 2020
2 parents 9dde2cf + bf4d753 commit 7911579
Showing 2 changed files with 71 additions and 49 deletions.
44 changes: 21 additions & 23 deletions kingfisher_scrapy/spiders/honduras_oncae.py
@@ -1,17 +1,19 @@
from urllib.parse import urlparse

import scrapy

from kingfisher_scrapy.base_spider import CompressedFileSpider
from kingfisher_scrapy.base_spider import CompressedFileSpider, PeriodicSpider
from kingfisher_scrapy.exceptions import SpiderArgumentError
from kingfisher_scrapy.util import components, handle_http_error
from kingfisher_scrapy.util import components


class HondurasONCAE(CompressedFileSpider):
class HondurasONCAE(CompressedFileSpider, PeriodicSpider):
"""
Domain
Oficina Normativa de Contratación y Adquisiciones del Estado (ONCAE)
Spider arguments
from_date
Download only releases from this year onward (YYYY format).
If ``until_date`` is provided and ``from_date`` is not, it defaults to '2005'.
until_date
Download only releases until this year (YYYY format).
If ``from_date`` is provided and ``until_date`` is not, it defaults to the current year.
system
Filter by system:
@@ -32,27 +34,23 @@ class HondurasONCAE(CompressedFileSpider):
# The files take a long time to download, so we increase the download timeout.
download_timeout = 900

# PeriodicSpider variables
date_format = 'year'
default_from_date = '2005'
pattern = 'http://200.13.162.79/datosabiertos/{}'

@classmethod
def from_crawler(cls, crawler, system=None, *args, **kwargs):
spider = super().from_crawler(crawler, system=system, *args, **kwargs)
if system and spider.system not in spider.available_systems:
raise SpiderArgumentError(f'spider argument `system`: {spider.system!r} not recognized')
return spider

def start_requests(self):
yield scrapy.Request(
'http://oncae.gob.hn/datosabiertos',
meta={'file_name': 'list.html'},
callback=self.parse_list
)

@handle_http_error
def parse_list(self, response):
urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
for url in urls:
path, file = urlparse(url).path.rsplit('/', 1)
current_system = path.replace('/datosabiertos/', "")
if self.system and current_system != self.system:
def build_urls(self, date):
for system in self.available_systems:
if self.system and system != self.system:
continue
# URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
yield self.build_request(url, formatter=components(-1))
yield self.pattern.format(f"{system}/{system}_datos_{date}_json.zip")

def get_formatter(self):
return components(-1)
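
The first file's refactor drops the HTML scraping of http://oncae.gob.hn/datosabiertos and instead builds year-based ZIP URLs via PeriodicSpider. Below is a minimal standalone sketch (not part of the commit) of what the new build_urls yields, assuming PeriodicSpider calls it once per year between from_date and until_date, and assuming available_systems holds the ONCAE system codes ('CE', 'DDC', 'HC1') named in the docstrings; the attribute itself sits in the collapsed part of the diff.

# Standalone sketch, not part of the commit.
pattern = 'http://200.13.162.79/datosabiertos/{}'
available_systems = ['CE', 'DDC', 'HC1']  # assumed from the docstrings; see note above

def build_urls(date, system=None):
    # Yield one ZIP URL per ONCAE system for the given year, honoring the
    # optional `system` filter, mirroring the logic in the diff above.
    for current_system in available_systems:
        if system and current_system != system:
            continue
        # e.g. http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
        yield pattern.format(f"{current_system}/{current_system}_datos_{date}_json.zip")

print(list(build_urls(2020, system='HC1')))
# ['http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip']

With the new date arguments in place, a crawl limited to one system and year range would use Scrapy's standard -a spider-argument syntax, e.g. scrapy crawl honduras_oncae -a system=HC1 -a from_date=2018 (a hypothetical invocation; the project may wrap this in its own tooling).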
76 changes: 50 additions & 26 deletions kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -1,54 +1,78 @@
import json

import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.base_spider import PeriodicSpider
from kingfisher_scrapy.exceptions import SpiderArgumentError
from kingfisher_scrapy.util import components, handle_http_error
from kingfisher_scrapy.util import components


class HondurasPortalBulkFiles(SimpleSpider):
class HondurasPortalBulkFiles(PeriodicSpider):
"""
Domain
Oficina Normativa de Contratación y Adquisiciones del Estado (ONCAE) / Secretaria de Finanzas de Honduras (SEFIN)
Spider arguments
from_date
Download only releases from this date onward (YYYY-MM format).
If ``until_date`` is provided and ``from_date`` is not, it defaults to '2005-11'.
until_date
Download only releases until this date (YYYY-MM format).
If ``from_date`` is provided and ``until_date`` is not, it defaults to the current year and month.
publisher
Filter by publisher:
oncae
Oficina Normativa de Contratación y Adquisiciones del Estado
sefin
Secretaria de Finanzas de Honduras
system
Filter by oncae system:
CE
Catálogo Electrónico
DDC
Módulo de Difusión Directa de Contratos
HC1
HonduCompras 1.0 (Módulo de Difusión de Compras y Contrataciones)
Bulk download documentation
http://www.contratacionesabiertas.gob.hn/descargas/
"""
name = 'honduras_portal_bulk_files'
data_type = 'release_package'
skip_pluck = 'Already covered (see code for details)' # honduras_portal_releases
publishers = {'oncae': 'ONCAE', 'sefin': 'Secretaria de Finanzas'}
available_publishers = {'oncae': 'oficina_normativa', 'sefin': 'secretaria_de_fin_HN.SIAFI2'}
oncae_systems = {'HC1': 'honducompras-1', 'CE': 'catalogo-electronico', 'DDC': 'difusion-directa-contrato'}

# PeriodicSpider variables
date_format = 'year-month'
default_from_date = '2005-11'
pattern = 'http://www.contratacionesabiertas.gob.hn/api/v1/descargas/{}'

@classmethod
def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
spider = super().from_crawler(crawler, publisher=publisher, *args, **kwargs)
if publisher and spider.publisher not in spider.publishers:
def from_crawler(cls, crawler, publisher=None, system=None, *args, **kwargs):
spider = super().from_crawler(crawler, publisher=publisher, system=system, *args, **kwargs)
if publisher and spider.publisher not in spider.available_publishers:
raise SpiderArgumentError(f'spider argument `publisher`: {spider.publisher!r} not recognized')

spider.publisher_name = spider.publishers.get(publisher)
if system:
if spider.publisher != 'oncae':
raise SpiderArgumentError(f'spider argument `system` is not supported for publisher: '
f'{spider.publisher!r}')
if spider.system not in spider.oncae_systems:
raise SpiderArgumentError(f'spider argument `system`: {spider.system!r} not recognized')

return spider

def start_requests(self):
yield scrapy.Request(
'http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json',
meta={'file_name': 'list.json'},
callback=self.parse_list,
)

@handle_http_error
def parse_list(self, response):
items = json.loads(response.text)
for item in items:
if self.publisher and self.publisher_name not in item['publicador']:
def build_urls(self, date):
for publisher in self.available_publishers:
if self.publisher and publisher != self.publisher:
continue
url = item['urls']['json']
yield self.build_request(url, formatter=components(-1))

if publisher == 'oncae':
for system in self.oncae_systems:
if self.system and system != self.system:
continue
yield self.pattern.format(f"{self.available_publishers[publisher]}_"
f"{self.oncae_systems[system]}_{date.year}_{date.month:02d}.json")
else:
yield self.pattern.format(f"{self.available_publishers[publisher]}_"
f"{date.year}_{date.month:02d}.json")

def get_formatter(self):
return components(-1)
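
The second file follows the same pattern: rather than fetching the JSON listing from the descargas API and filtering it in parse_list, the spider now composes the bulk-file URLs directly from the publisher and system mappings for every year-month in the requested range. A minimal standalone sketch (not part of the commit) of what the new build_urls produces for one month, assuming PeriodicSpider passes it a date object per month:

# Standalone sketch, not part of the commit; mappings copied from the diff above.
from datetime import date

pattern = 'http://www.contratacionesabiertas.gob.hn/api/v1/descargas/{}'
available_publishers = {'oncae': 'oficina_normativa', 'sefin': 'secretaria_de_fin_HN.SIAFI2'}
oncae_systems = {'HC1': 'honducompras-1', 'CE': 'catalogo-electronico', 'DDC': 'difusion-directa-contrato'}

def build_urls(d, publisher=None, system=None):
    # Yield one JSON URL per publisher (and, for ONCAE, per system) for the
    # given year-month, honoring the optional publisher/system filters.
    for current_publisher in available_publishers:
        if publisher and current_publisher != publisher:
            continue
        if current_publisher == 'oncae':
            for current_system in oncae_systems:
                if system and current_system != system:
                    continue
                yield pattern.format(f"{available_publishers[current_publisher]}_"
                                     f"{oncae_systems[current_system]}_{d.year}_{d.month:02d}.json")
        else:
            yield pattern.format(f"{available_publishers[current_publisher]}_{d.year}_{d.month:02d}.json")

for url in build_urls(date(2020, 11, 1)):
    print(url)
# http://www.contratacionesabiertas.gob.hn/api/v1/descargas/oficina_normativa_honducompras-1_2020_11.json
# ... one URL per ONCAE system, plus one SEFIN file for the month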
