From dec74c8b327e251cbb095232323a1c4ecdf24441 Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Wed, 16 Sep 2020 20:18:00 -0400 Subject: [PATCH 1/2] Update sample argument and add system argument --- kingfisher_scrapy/spiders/honduras_oncae.py | 31 +++++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py index 038947bb5..449d6fcd7 100644 --- a/kingfisher_scrapy/spiders/honduras_oncae.py +++ b/kingfisher_scrapy/spiders/honduras_oncae.py @@ -1,3 +1,6 @@ +from os.path import split +from urllib.parse import urlparse + import scrapy from kingfisher_scrapy.base_spider import CompressedFileSpider @@ -9,16 +12,29 @@ class HondurasONCAE(CompressedFileSpider): Bulk download documentation http://oncae.gob.hn/datosabiertos Spider arguments + system + Download only data from the provided system. + ``HC1`` for "HonduCompras 1.0 - Módulo de Difusión de Compras y Contrataciones" system. + ``CE`` for "Catálogo Electrónico" system. + ``DDC`` for "Módulo de Difusión Directa de Contratos" system. sample - Downloads the first package listed on the downloads page. + Downloads the first package listed on the downloads page for each system. 
""" name = 'honduras_oncae' data_type = 'release_package' skip_pluck = 'Already covered (see code for details)' # honduras_portal_releases + systems = ['HC1', 'CE', 'DDC'] # the files take too long to be downloaded, so we increase the download timeout download_timeout = 900 + @classmethod + def from_crawler(cls, crawler, *args, **kwargs): + spider = super().from_crawler(crawler, *args, **kwargs) + if hasattr(spider, 'system') and spider.system not in spider.systems: + raise scrapy.exceptions.CloseSpider('Specified system is not recognized') + return spider + def start_requests(self): yield scrapy.Request( 'http://oncae.gob.hn/datosabiertos', @@ -28,9 +44,18 @@ def start_requests(self): @handle_http_error def parse_list(self, response): + systems_flags = {system: False for system in self.systems} urls = response.xpath('//a[contains(., "[json]")]/@href').getall() - if self.sample: - urls = [urls[0]] for url in urls: + path, file = split(urlparse(url).path) + current_system = path.replace('/datosabiertos/', "") + if hasattr(self, 'system') and current_system != self.system: + continue + if self.sample: + if systems_flags[current_system]: + if all(systems_flags.values()): + return + continue + systems_flags[current_system] = True # URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip yield self.build_request(url, formatter=components(-1)) From 4f3c77437dcc1e04f98ef470c1aa576a6114a843 Mon Sep 17 00:00:00 2001 From: Andres Aguilera Date: Wed, 16 Sep 2020 20:18:55 -0400 Subject: [PATCH 2/2] Correct docstrings --- kingfisher_scrapy/spiders/colombia.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py index e4eb357de..e8a2abde0 100644 --- a/kingfisher_scrapy/spiders/colombia.py +++ b/kingfisher_scrapy/spiders/colombia.py @@ -23,7 +23,7 @@ class Colombia(LinksSpider): The year to crawl. 
See API documentation for valid values. from_date Download only releases from this release.date onward (YYYY-MM-DD format). - If `until_date` is provided and ``from_date`` don't, defaults to '2011-01-01'. + If ``until_date`` is provided and ``from_date`` isn't, defaults to '2011-01-01'. until_date Download only releases until this release.date (YYYY-MM-DD format). If ``from_date`` is provided and ``until_date`` don't, defaults to today.