From dec74c8b327e251cbb095232323a1c4ecdf24441 Mon Sep 17 00:00:00 2001
From: Andres Aguilera
Date: Wed, 16 Sep 2020 20:18:00 -0400
Subject: [PATCH 1/6] Update sample argument and add system argument

---
 kingfisher_scrapy/spiders/honduras_oncae.py | 31 +++++++++++++++++++--
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
index 038947bb5..449d6fcd7 100644
--- a/kingfisher_scrapy/spiders/honduras_oncae.py
+++ b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -1,3 +1,6 @@
+from os.path import split
+from urllib.parse import urlparse
+
 import scrapy
 
 from kingfisher_scrapy.base_spider import CompressedFileSpider
@@ -9,16 +12,29 @@ class HondurasONCAE(CompressedFileSpider):
     Bulk download documentation
       http://oncae.gob.hn/datosabiertos
     Spider arguments
+      system
+        Download only data from the provided system.
+        ``HC1`` for "HonduCompras 1.0 - Módulo de Difusión de Compras y Contrataciones" system.
+        ``CE`` for "Módulo de Difusión Directa de Contratos" system.
+        ``DDC`` for "Catálogo Electrónico" system.
       sample
-        Downloads the first package listed on the downloads page.
+        Downloads the first package listed on the downloads page for each system.
     """
     name = 'honduras_oncae'
     data_type = 'release_package'
     skip_pluck = 'Already covered (see code for details)'  # honduras_portal_releases
+    systems = ['HC1', 'CE', 'DDC']
     # the files take too long to be downloaded, so we increase the download timeout
     download_timeout = 900
 
+    @classmethod
+    def from_crawler(cls, crawler, *args, **kwargs):
+        spider = super().from_crawler(crawler, *args, **kwargs)
+        if hasattr(spider, 'system') and spider.system not in spider.systems:
+            raise scrapy.exceptions.CloseSpider('Specified system is not recognized')
+        return spider
+
     def start_requests(self):
         yield scrapy.Request(
             'http://oncae.gob.hn/datosabiertos',
@@ -28,9 +44,18 @@ def start_requests(self):
 
     @handle_http_error
     def parse_list(self, response):
+        systems_flags = {system: False for system in self.systems}
         urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
-        if self.sample:
-            urls = [urls[0]]
         for url in urls:
+            path, file = split(urlparse(url).path)
+            current_system = path.replace('/datosabiertos/', "")
+            if hasattr(self, 'system') and current_system != self.system:
+                continue
+            if self.sample:
+                if systems_flags[current_system]:
+                    continue
+                if next((system for system in systems_flags if not system), False):
+                    return
+                systems_flags[current_system] = True
             # URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
             yield self.build_request(url, formatter=components(-1))

From 4f3c77437dcc1e04f98ef470c1aa576a6114a843 Mon Sep 17 00:00:00 2001
From: Andres Aguilera
Date: Wed, 16 Sep 2020 20:18:55 -0400
Subject: [PATCH 2/6] Correct docstrings

---
 kingfisher_scrapy/spiders/colombia.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py
index e4eb357de..e8a2abde0 100644
--- a/kingfisher_scrapy/spiders/colombia.py
+++ b/kingfisher_scrapy/spiders/colombia.py
@@ -23,7 +23,7 @@ class Colombia(LinksSpider):
         The year to crawl. See API documentation for valid values.
       from_date
         Download only releases from this release.date onward (YYYY-MM-DD format).
-        If `until_date` is provided and ``from_date`` don't, defaults to '2011-01-01'.
+        If ``until_date`` is provided and ``from_date`` don't, defaults to '2011-01-01'.
       until_date
         Download only releases until this release.date (YYYY-MM-DD format).
         If ``from_date`` is provided and ``until_date`` don't, defaults to today.

From 9399a48b4207a952ddf0c62f95bbe50ad955fa2c Mon Sep 17 00:00:00 2001
From: Andres Aguilera
Date: Wed, 16 Sep 2020 20:30:25 -0400
Subject: [PATCH 3/6] Update docstrings

---
 kingfisher_scrapy/spiders/honduras_oncae.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
index 449d6fcd7..caa2c0b04 100644
--- a/kingfisher_scrapy/spiders/honduras_oncae.py
+++ b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -19,6 +19,7 @@ class HondurasONCAE(CompressedFileSpider):
         ``DDC`` for "Catálogo Electrónico" system.
       sample
         Downloads the first package listed on the downloads page for each system.
+        If ``system'' is also provided, a single package is downloaded from that system.
     """
     name = 'honduras_oncae'
     data_type = 'release_package'

From 5b4723ddcc1526ece85040d14e02681967d9d6bb Mon Sep 17 00:00:00 2001
From: Andres Aguilera
Date: Wed, 16 Sep 2020 21:41:03 -0400
Subject: [PATCH 4/6] Update changes

---
 kingfisher_scrapy/spiders/honduras_oncae.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
index caa2c0b04..de112a85e 100644
--- a/kingfisher_scrapy/spiders/honduras_oncae.py
+++ b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -30,9 +30,9 @@ class HondurasONCAE(CompressedFileSpider):
     download_timeout = 900
 
     @classmethod
-    def from_crawler(cls, crawler, *args, **kwargs):
-        spider = super().from_crawler(crawler, *args, **kwargs)
-        if hasattr(spider, 'system') and spider.system not in spider.systems:
+    def from_crawler(cls, crawler, system=None, *args, **kwargs):
+        spider = super().from_crawler(crawler, system=system, *args, **kwargs)
+        if system and spider.system not in spider.systems:
             raise scrapy.exceptions.CloseSpider('Specified system is not recognized')
         return spider
 
@@ -50,7 +50,7 @@ def parse_list(self, response):
         for url in urls:
             path, file = split(urlparse(url).path)
             current_system = path.replace('/datosabiertos/', "")
-            if hasattr(self, 'system') and current_system != self.system:
+            if self.system and current_system != self.system:
                 continue
             if self.sample:
                 if systems_flags[current_system]:

From 8a8898b45dcde476efc2ba2793527a42e0e61de2 Mon Sep 17 00:00:00 2001
From: Andres Aguilera
Date: Mon, 21 Sep 2020 12:08:16 -0400
Subject: [PATCH 5/6] Update changes from review

---
 kingfisher_scrapy/spiders/honduras_oncae.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
index de112a85e..fc46f8f04 100644
--- a/kingfisher_scrapy/spiders/honduras_oncae.py
+++ b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -45,7 +45,6 @@ def start_requests(self):
 
     @handle_http_error
     def parse_list(self, response):
-        systems_flags = {system: False for system in self.systems}
         urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
         for url in urls:
             path, file = split(urlparse(url).path)
@@ -53,10 +52,12 @@ def parse_list(self, response):
             if self.system and current_system != self.system:
                 continue
             if self.sample:
-                if systems_flags[current_system]:
-                    continue
-                if next((system for system in systems_flags if not system), False):
+                # if we already downloaded a package for all the available systems
+                if not self.systems:
                     return
-                systems_flags[current_system] = True
+                # if we already processed a file for the current system
+                if current_system not in self.systems:
+                    continue
+                self.systems.remove(current_system)
             # URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
             yield self.build_request(url, formatter=components(-1))

From bb69f7cd6d2a51a54c1e14e27099a7b75137447f Mon Sep 17 00:00:00 2001
From: Andres Aguilera
Date: Mon, 21 Sep 2020 17:04:40 -0400
Subject: [PATCH 6/6] Update changes

---
 kingfisher_scrapy/spiders/honduras_oncae.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
index fc46f8f04..3f3c80fe5 100644
--- a/kingfisher_scrapy/spiders/honduras_oncae.py
+++ b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -24,7 +24,7 @@ class HondurasONCAE(CompressedFileSpider):
     name = 'honduras_oncae'
     data_type = 'release_package'
     skip_pluck = 'Already covered (see code for details)'  # honduras_portal_releases
-    systems = ['HC1', 'CE', 'DDC']
+    available_systems = ['HC1', 'CE', 'DDC']
     # the files take too long to be downloaded, so we increase the download timeout
     download_timeout = 900
 
@@ -32,7 +32,7 @@ class HondurasONCAE(CompressedFileSpider):
     @classmethod
     def from_crawler(cls, crawler, system=None, *args, **kwargs):
         spider = super().from_crawler(crawler, system=system, *args, **kwargs)
-        if system and spider.system not in spider.systems:
+        if system and spider.system not in spider.available_systems:
             raise scrapy.exceptions.CloseSpider('Specified system is not recognized')
         return spider
 
@@ -45,6 +45,7 @@ def start_requests(self):
 
     @handle_http_error
     def parse_list(self, response):
+        downloaded_systems = set()
         urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
         for url in urls:
             path, file = split(urlparse(url).path)
@@ -53,11 +54,12 @@ def parse_list(self, response):
             if self.system and current_system != self.system:
                 continue
             if self.sample:
                 # if we already downloaded a package for all the available systems
-                if not self.systems:
+                if downloaded_systems == self.available_systems:
                     return
                 # if we already processed a file for the current system
-                if current_system not in self.systems:
+                if current_system in downloaded_systems:
                     continue
-                self.systems.remove(current_system)
+                # add the current system to the set of downloaded_systems
+                downloaded_systems.add(current_system)
             # URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
             yield self.build_request(url, formatter=components(-1))
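
Taken together, these patches add a ``system`` spider argument (validated in ``from_crawler()``) and change ``sample`` to download one package per system. Both are ordinary Scrapy spider arguments, so they are passed on the command line with ``-a``, for example ``scrapy crawl honduras_oncae -a system=HC1 -a sample=1``. The sketch below shows the programmatic equivalent; the ``CrawlerProcess`` setup and the string value passed for ``sample`` are assumptions for illustration, since ``sample`` parsing happens in the base spider, which is not part of these patches.

# A minimal sketch, assuming the kingfisher_scrapy project and its settings are
# importable; roughly equivalent to:
#   scrapy crawl honduras_oncae -a system=HC1 -a sample=1
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from kingfisher_scrapy.spiders.honduras_oncae import HondurasONCAE

process = CrawlerProcess(get_project_settings())
# Keyword arguments are forwarded to HondurasONCAE.from_crawler(), where an
# unrecognized `system` value raises scrapy.exceptions.CloseSpider.
process.crawl(HondurasONCAE, system='HC1', sample='1')
process.start()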