diff --git a/kingfisher_scrapy/spiders/colombia.py b/kingfisher_scrapy/spiders/colombia.py
index e4eb357de..e8a2abde0 100644
--- a/kingfisher_scrapy/spiders/colombia.py
+++ b/kingfisher_scrapy/spiders/colombia.py
@@ -23,7 +23,7 @@ class Colombia(LinksSpider):
         The year to crawl. See API documentation for valid values.
       from_date
         Download only releases from this release.date onward (YYYY-MM-DD format).
-        If `until_date` is provided and ``from_date`` don't, defaults to '2011-01-01'.
+        If ``until_date`` is provided and ``from_date`` is not, defaults to '2011-01-01'.
       until_date
         Download only releases until this release.date (YYYY-MM-DD format).
         If ``from_date`` is provided and ``until_date`` don't, defaults to today.
diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py
index 038947bb5..3f3c80fe5 100644
--- a/kingfisher_scrapy/spiders/honduras_oncae.py
+++ b/kingfisher_scrapy/spiders/honduras_oncae.py
@@ -1,3 +1,6 @@
+from os.path import split
+from urllib.parse import urlparse
+
 import scrapy
 
 from kingfisher_scrapy.base_spider import CompressedFileSpider
@@ -9,16 +12,30 @@ class HondurasONCAE(CompressedFileSpider):
     Bulk download documentation
       http://oncae.gob.hn/datosabiertos
     Spider arguments
+      system
+        Download only data from the provided system.
+        ``HC1`` for "HonduCompras 1.0 - Módulo de Difusión de Compras y Contrataciones" system.
+        ``CE`` for "Módulo de Difusión Directa de Contratos" system.
+        ``DDC`` for "Catálogo Electrónico" system.
       sample
-        Downloads the first package listed on the downloads page.
+        Downloads the first package listed on the downloads page for each system.
+        If ``system`` is also provided, a single package is downloaded from that system.
     """
     name = 'honduras_oncae'
     data_type = 'release_package'
     skip_pluck = 'Already covered (see code for details)'  # honduras_portal_releases
+    available_systems = ['HC1', 'CE', 'DDC']
 
     # the files take too long to be downloaded, so we increase the download timeout
     download_timeout = 900
 
+    @classmethod
+    def from_crawler(cls, crawler, system=None, *args, **kwargs):
+        spider = super().from_crawler(crawler, system=system, *args, **kwargs)
+        if system and spider.system not in spider.available_systems:
+            raise scrapy.exceptions.CloseSpider('Specified system is not recognized')
+        return spider
+
     def start_requests(self):
         yield scrapy.Request(
             'http://oncae.gob.hn/datosabiertos',
@@ -28,9 +45,21 @@ def start_requests(self):
 
     @handle_http_error
     def parse_list(self, response):
+        downloaded_systems = set()
         urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
-        if self.sample:
-            urls = [urls[0]]
         for url in urls:
+            path, file = split(urlparse(url).path)
+            current_system = path.replace('/datosabiertos/', "")
+            if self.system and current_system != self.system:
+                continue
+            if self.sample:
+                # if we already downloaded a package for all the available systems
+                if downloaded_systems == set(self.available_systems):
+                    return
+                # if we already processed a file for the current system
+                if current_system in downloaded_systems:
+                    continue
+                # add the current system to the set of downloaded_systems
+                downloaded_systems.add(current_system)
             # URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
             yield self.build_request(url, formatter=components(-1))