From dec74c8b327e251cbb095232323a1c4ecdf24441 Mon Sep 17 00:00:00 2001
From: Andres Aguilera
Date: Wed, 16 Sep 2020 20:18:00 -0400
Subject: [PATCH 1/6] Update sample argument and add system argument

---
 kingfisher_scrapy/spiders/honduras_oncae.py | 31 +++++++++++++++++++--
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
index 038947bb5..449d6fcd7 100644
--- a/kingfisher_scrapy/spiders/honduras_oncae.py
+++ b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -1,3 +1,6 @@
+from os.path import split
+from urllib.parse import urlparse
+
 import scrapy
 
 from kingfisher_scrapy.base_spider import CompressedFileSpider
@@ -9,16 +12,29 @@ class HondurasONCAE(CompressedFileSpider):
     Bulk download documentation
       http://oncae.gob.hn/datosabiertos
     Spider arguments
+      system
+        Download only data from the provided system.
+        ``HC1`` for "HonduCompras 1.0 - Módulo de Difusión de Compras y Contrataciones" system.
+        ``CE`` for "Módulo de Difusión Directa de Contratos" system.
+        ``DDC`` for "Catálogo Electrónico" system.
       sample
-        Downloads the first package listed on the downloads page.
+        Downloads the first package listed on the downloads page for each system.
     """
     name = 'honduras_oncae'
     data_type = 'release_package'
     skip_pluck = 'Already covered (see code for details)'  # honduras_portal_releases
+    systems = ['HC1', 'CE', 'DDC']
     # the files take too long to be downloaded, so we increase the download timeout
     download_timeout = 900
 
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        spider = super().from_crawler(crawler, *args, **kwargs)
+        if hasattr(spider, 'system') and spider.system not in spider.systems:
+            raise scrapy.exceptions.CloseSpider('Specified system is not recognized')
+        return spider
+
     def start_requests(self):
         yield scrapy.Request(
             'http://oncae.gob.hn/datosabiertos',
@@ -28,9 +44,18 @@ def start_requests(self):
 
     @handle_http_error
     def parse_list(self, response):
+        systems_flags = {system: False for system in self.systems}
         urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
-        if self.sample:
-            urls = [urls[0]]
         for url in urls:
+            path, file = split(urlparse(url).path)
+            current_system = path.replace('/datosabiertos/', "")
+            if hasattr(self, 'system') and current_system != self.system:
+                continue
+            if self.sample:
+                if systems_flags[current_system]:
+                    continue
+                if next((system for system in systems_flags if not system), False):
+                    return
+                systems_flags[current_system] = True
             # URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
             yield self.build_request(url, formatter=components(-1))

From 4f3c77437dcc1e04f98ef470c1aa576a6114a843 Mon Sep 17 00:00:00 2001
From: Andres Aguilera
Date: Wed, 16 Sep 2020 20:18:55 -0400
Subject: [PATCH 2/6] Correct docstrings

---
 kingfisher_scrapy/spiders/colombia.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py
index e4eb357de..e8a2abde0 100644
--- a/kingfisher_scrapy/spiders/colombia.py
+++ b/kingfisher_scrapy/spiders/colombia.py
@@ -23,7 +23,7 @@ class Colombia(LinksSpider):
         The year to crawl. See API documentation for valid values.
       from_date
         Download only releases from this release.date onward (YYYY-MM-DD format).
-        If `until_date` is provided and ``from_date`` don't, defaults to '2011-01-01'.
+        If ``until_date`` is provided and ``from_date`` don't, defaults to '2011-01-01'.
       until_date
         Download only releases until this release.date (YYYY-MM-DD format).
         If ``from_date`` is provided and ``until_date`` don't, defaults to today.

From 9399a48b4207a952ddf0c62f95bbe50ad955fa2c Mon Sep 17 00:00:00 2001
From: Andres Aguilera
Date: Wed, 16 Sep 2020 20:30:25 -0400
Subject: [PATCH 3/6] Update docstrings

---
 kingfisher_scrapy/spiders/honduras_oncae.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
index 449d6fcd7..caa2c0b04 100644
--- a/kingfisher_scrapy/spiders/honduras_oncae.py
+++ b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -19,6 +19,7 @@ class HondurasONCAE(CompressedFileSpider):
         ``DDC`` for "Catálogo Electrónico" system.
       sample
         Downloads the first package listed on the downloads page for each system.
+        If ``system'' is also provided, a single package is downloaded from that system.
     """
     name = 'honduras_oncae'
     data_type = 'release_package'

From 5b4723ddcc1526ece85040d14e02681967d9d6bb Mon Sep 17 00:00:00 2001
From: Andres Aguilera
Date: Wed, 16 Sep 2020 21:41:03 -0400
Subject: [PATCH 4/6] Update changes

---
 kingfisher_scrapy/spiders/honduras_oncae.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
index caa2c0b04..de112a85e 100644
--- a/kingfisher_scrapy/spiders/honduras_oncae.py
+++ b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -30,9 +30,9 @@ class HondurasONCAE(CompressedFileSpider):
     download_timeout = 900
 
     @classmethod
-    def from_crawler(cls, crawler, *args, **kwargs):
-        spider = super().from_crawler(crawler, *args, **kwargs)
-        if hasattr(spider, 'system') and spider.system not in spider.systems:
+    def from_crawler(cls, crawler, system=None, *args, **kwargs):
+        spider = super().from_crawler(crawler, system=system, *args, **kwargs)
+        if system and spider.system not in spider.systems:
             raise scrapy.exceptions.CloseSpider('Specified system is not recognized')
         return spider
 
@@ -50,7 +50,7 @@ def parse_list(self, response):
         for url in urls:
             path, file = split(urlparse(url).path)
             current_system = path.replace('/datosabiertos/', "")
-            if hasattr(self, 'system') and current_system != self.system:
+            if self.system and current_system != self.system:
                 continue
             if self.sample:
                 if systems_flags[current_system]:

From 8a8898b45dcde476efc2ba2793527a42e0e61de2 Mon Sep 17 00:00:00 2001
From: Andres Aguilera
Date: Mon, 21 Sep 2020 12:08:16 -0400
Subject: [PATCH 5/6] Update changes from review

---
 kingfisher_scrapy/spiders/honduras_oncae.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
index de112a85e..fc46f8f04 100644
--- a/kingfisher_scrapy/spiders/honduras_oncae.py
+++ b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -45,7 +45,6 @@ def start_requests(self):
 
     @handle_http_error
     def parse_list(self, response):
-        systems_flags = {system: False for system in self.systems}
         urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
         for url in urls:
             path, file = split(urlparse(url).path)
@@ -53,10 +52,12 @@ def parse_list(self, response):
             if self.system and current_system != self.system:
                 continue
             if self.sample:
-                if systems_flags[current_system]:
-                    continue
-                if next((system for system in systems_flags if not system), False):
+                # if we already downloaded a package for all the available systems
+                if not self.systems:
                     return
-                systems_flags[current_system] = True
+                # if we already processed a file for the current system
+                if current_system not in self.systems:
+                    continue
+                self.systems.remove(current_system)
             # URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
             yield self.build_request(url, formatter=components(-1))

From bb69f7cd6d2a51a54c1e14e27099a7b75137447f Mon Sep 17 00:00:00 2001
From: Andres Aguilera
Date: Mon, 21 Sep 2020 17:04:40 -0400
Subject: [PATCH 6/6] Update changes

---
 kingfisher_scrapy/spiders/honduras_oncae.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
index fc46f8f04..3f3c80fe5 100644
--- a/kingfisher_scrapy/spiders/honduras_oncae.py
+++ b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -24,7 +24,7 @@ class HondurasONCAE(CompressedFileSpider):
     name = 'honduras_oncae'
     data_type = 'release_package'
     skip_pluck = 'Already covered (see code for details)'  # honduras_portal_releases
-    systems = ['HC1', 'CE', 'DDC']
+    available_systems = ['HC1', 'CE', 'DDC']
     # the files take too long to be downloaded, so we increase the download timeout
     download_timeout = 900
 
@@ -32,7 +32,7 @@ class HondurasONCAE(CompressedFileSpider):
     @classmethod
     def from_crawler(cls, crawler, system=None, *args, **kwargs):
         spider = super().from_crawler(crawler, system=system, *args, **kwargs)
-        if system and spider.system not in spider.systems:
+        if system and spider.system not in spider.available_systems:
             raise scrapy.exceptions.CloseSpider('Specified system is not recognized')
         return spider
 
@@ -45,6 +45,7 @@ def start_requests(self):
 
     @handle_http_error
     def parse_list(self, response):
+        downloaded_systems = set()
         urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
         for url in urls:
             path, file = split(urlparse(url).path)
@@ -53,11 +54,12 @@ def parse_list(self, response):
             if self.system and current_system != self.system:
                 continue
             if self.sample:
                 # if we already downloaded a package for all the available systems
-                if not self.systems:
+                if downloaded_systems == self.available_systems:
                     return
                 # if we already processed a file for the current system
-                if current_system not in self.systems:
+                if current_system in downloaded_systems:
                     continue
-                self.systems.remove(current_system)
+                # add the current system to the set of downloaded_systems
+                downloaded_systems.add(current_system)
             # URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
             yield self.build_request(url, formatter=components(-1))
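
Taken together, these patches add a ``system`` spider argument (validated in ``from_crawler()``) and change ``sample`` to download one package per system. Both are ordinary Scrapy spider arguments, so they are passed on the command line with ``-a``, for example ``scrapy crawl honduras_oncae -a system=HC1 -a sample=1``. The sketch below shows the programmatic equivalent; the ``CrawlerProcess`` setup and the string value passed for ``sample`` are assumptions for illustration, since ``sample`` parsing happens in the base spider, which is not part of these patches.

# A minimal sketch, assuming the kingfisher_scrapy project and its settings are
# importable; roughly equivalent to:
#   scrapy crawl honduras_oncae -a system=HC1 -a sample=1
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from kingfisher_scrapy.spiders.honduras_oncae import HondurasONCAE

process = CrawlerProcess(get_project_settings())
# Keyword arguments are forwarded to HondurasONCAE.from_crawler(), where an
# unrecognized `system` value raises scrapy.exceptions.CloseSpider.
process.crawl(HondurasONCAE, system='HC1', sample='1')
process.start()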