Merge 4f3c774 into 279e79c

open-contracting · Sep 17, 2020 · d004b38 · d004b38
2 parents 279e79c + 4f3c774
commit d004b38
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 4 deletions.
diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py
@@ -23,7 +23,7 @@ class Colombia(LinksSpider):
         The year to crawl. See API documentation for valid values.
       from_date
         Download only releases from this release.date onward (YYYY-MM-DD format).
-        If `until_date` is provided and ``from_date`` don't, defaults to '2011-01-01'.
+        If ``until_date`` is provided and ``from_date`` don't, defaults to '2011-01-01'.
       until_date
         Download only releases until this release.date (YYYY-MM-DD format).
         If ``from_date`` is provided and ``until_date`` don't, defaults to today.

diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -1,3 +1,6 @@
+from os.path import split
+from urllib.parse import urlparse
+
 import scrapy
 
 from kingfisher_scrapy.base_spider import CompressedFileSpider
@@ -9,16 +12,29 @@ class HondurasONCAE(CompressedFileSpider):
     Bulk download documentation
       http://oncae.gob.hn/datosabiertos
     Spider arguments
+      system
+        Download only data from the provided system.
+        ``HC1`` for "HonduCompras 1.0 - Módulo de Difusión de Compras y Contrataciones" system.
+        ``CE`` for "Catálogo Electrónico" system.
+        ``DDC`` for "Módulo de Difusión Directa de Contratos" system.
       sample
-        Downloads the first package listed on the downloads page.
+        Downloads the first package listed on the downloads page for each system.
     """
     name = 'honduras_oncae'
     data_type = 'release_package'
     skip_pluck = 'Already covered (see code for details)'  # honduras_portal_releases
+    systems = ['HC1', 'CE', 'DDC']
 
     # the files take too long to be downloaded, so we increase the download timeout
     download_timeout = 900
 
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        spider = super().from_crawler(crawler, *args, **kwargs)  # build the spider through the parent class first
+        if hasattr(spider, 'system') and spider.system not in spider.systems:  # only validated when set
+            raise scrapy.exceptions.CloseSpider('Specified system is not recognized')
+        return spider
+
     def start_requests(self):
         yield scrapy.Request(
             'http://oncae.gob.hn/datosabiertos',
@@ -28,9 +44,18 @@ def start_requests(self):
 
     @handle_http_error
     def parse_list(self, response):
+        systems_flags = {system: False for system in self.systems}  # system code -> sample already requested?
         urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
-        if self.sample:
-            urls = [urls[0]]
         for url in urls:
+            path, file = split(urlparse(url).path)
+            current_system = path.replace('/datosabiertos/', "")
+            if hasattr(self, 'system') and current_system != self.system:
+                continue
+            if self.sample:
+                if all(systems_flags.values()):  # every system already has its sample package
+                    return
+                if systems_flags[current_system]:  # this system's first package was already requested
+                    continue
+                systems_flags[current_system] = True
             # URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
             yield self.build_request(url, formatter=components(-1))