diff --git a/kingfisher_scrapy/spiders/honduras_oncae.py b/kingfisher_scrapy/spiders/honduras_oncae.py index 256bf212..cce198ba 100644 --- a/kingfisher_scrapy/spiders/honduras_oncae.py +++ b/kingfisher_scrapy/spiders/honduras_oncae.py @@ -1,17 +1,19 @@ -from urllib.parse import urlparse - -import scrapy - -from kingfisher_scrapy.base_spider import CompressedFileSpider +from kingfisher_scrapy.base_spider import CompressedFileSpider, PeriodicSpider from kingfisher_scrapy.exceptions import SpiderArgumentError -from kingfisher_scrapy.util import components, handle_http_error +from kingfisher_scrapy.util import components -class HondurasONCAE(CompressedFileSpider): +class HondurasONCAE(CompressedFileSpider, PeriodicSpider): """ Domain Oficina Normativa de Contratación y Adquisiciones del Estado (ONCAE) Spider arguments + from_date + Download only releases from this year onward (YYYY format). + If ``until_date`` is provided and ``from_date`` isn't, defaults to '2005'. + until_date + Download only releases until this year (YYYY format). + If ``from_date`` is provided and ``until_date`` isn't, defaults to the current year. 
system Filter by system: @@ -32,6 +34,11 @@ class HondurasONCAE(CompressedFileSpider): # the files take too long to be downloaded, so we increase the download timeout download_timeout = 900 + # PeriodicSpider variables + date_format = 'year' + default_from_date = '2005' + pattern = 'http://200.13.162.79/datosabiertos/{}' + @classmethod def from_crawler(cls, crawler, system=None, *args, **kwargs): spider = super().from_crawler(crawler, system=system, *args, **kwargs) @@ -39,20 +46,11 @@ def from_crawler(cls, crawler, system=None, *args, **kwargs): raise SpiderArgumentError(f'spider argument `system`: {spider.system!r} not recognized') return spider - def start_requests(self): - yield scrapy.Request( - 'http://oncae.gob.hn/datosabiertos', - meta={'file_name': 'list.html'}, - callback=self.parse_list - ) - - @handle_http_error - def parse_list(self, response): - urls = response.xpath('//a[contains(., "[json]")]/@href').getall() - for url in urls: - path, file = urlparse(url).path.rsplit('/', 1) - current_system = path.replace('/datosabiertos/', "") - if self.system and current_system != self.system: + def build_urls(self, date): + for system in self.available_systems: + if self.system and system != self.system: continue - # URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip - yield self.build_request(url, formatter=components(-1)) + yield self.pattern.format(f"{system}/{system}_datos_{date}_json.zip") + + def get_formatter(self): + return components(-1) diff --git a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py index 9fef8e71..5df17f0c 100644 --- a/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py +++ b/kingfisher_scrapy/spiders/honduras_portal_bulk_files.py @@ -1,17 +1,19 @@ -import json - -import scrapy - -from kingfisher_scrapy.base_spider import SimpleSpider +from kingfisher_scrapy.base_spider import PeriodicSpider from kingfisher_scrapy.exceptions import 
SpiderArgumentError -from kingfisher_scrapy.util import components, handle_http_error +from kingfisher_scrapy.util import components -class HondurasPortalBulkFiles(SimpleSpider): +class HondurasPortalBulkFiles(PeriodicSpider): """ Domain Oficina Normativa de Contratación y Adquisiciones del Estado (ONCAE) / Secretaria de Finanzas de Honduras (SEFIN) Spider arguments + from_date + Download only releases from this date onward (YYYY-MM format). + If ``until_date`` is provided and ``from_date`` isn't, defaults to '2005-11'. + until_date + Download only releases until this date (YYYY-MM format). + If ``from_date`` is provided and ``until_date`` isn't, defaults to the current year-month. publisher Filter by publisher: @@ -19,36 +21,58 @@ class HondurasPortalBulkFiles(SimpleSpider): Oficina Normativa de Contratación y Adquisiciones del Estado sefin Secretaria de Finanzas de Honduras + system + Filter by oncae system: + + CE + Catálogo Electrónico + DDC + Módulo de Difusión Directa de Contratos + HC1 + HonduCompras 1.0 (Módulo de Difusión de Compras y Contrataciones) Bulk download documentation http://www.contratacionesabiertas.gob.hn/descargas/ """ name = 'honduras_portal_bulk_files' data_type = 'release_package' skip_pluck = 'Already covered (see code for details)' # honduras_portal_releases - publishers = {'oncae': 'ONCAE', 'sefin': 'Secretaria de Finanzas'} + available_publishers = {'oncae': 'oficina_normativa', 'sefin': 'secretaria_de_fin_HN.SIAFI2'} + oncae_systems = {'HC1': 'honducompras-1', 'CE': 'catalogo-electronico', 'DDC': 'difusion-directa-contrato'} + + # PeriodicSpider variables + date_format = 'year-month' + default_from_date = '2005-11' + pattern = 'http://www.contratacionesabiertas.gob.hn/api/v1/descargas/{}' @classmethod - def from_crawler(cls, crawler, publisher=None, *args, **kwargs): - spider = super().from_crawler(crawler, publisher=publisher, *args, **kwargs) - if publisher and spider.publisher not in spider.publishers: + def from_crawler(cls, crawler, 
publisher=None, system=None, *args, **kwargs): + spider = super().from_crawler(crawler, publisher=publisher, system=system, *args, **kwargs) + if publisher and spider.publisher not in spider.available_publishers: raise SpiderArgumentError(f'spider argument `publisher`: {spider.publisher!r} not recognized') - spider.publisher_name = spider.publishers.get(publisher) + if system: + if spider.publisher != 'oncae': + raise SpiderArgumentError(f'spider argument `system` is not supported for publisher: ' + f'{spider.publisher!r}') + if spider.system not in spider.oncae_systems: + raise SpiderArgumentError(f'spider argument `system`: {spider.system!r} not recognized') return spider - def start_requests(self): - yield scrapy.Request( - 'http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json', - meta={'file_name': 'list.json'}, - callback=self.parse_list, - ) - - @handle_http_error - def parse_list(self, response): - items = json.loads(response.text) - for item in items: - if self.publisher and self.publisher_name not in item['publicador']: + def build_urls(self, date): + for publisher in self.available_publishers: + if self.publisher and publisher != self.publisher: continue - url = item['urls']['json'] - yield self.build_request(url, formatter=components(-1)) + + if publisher == 'oncae': + for system in self.oncae_systems: + if self.system and system != self.system: + continue + yield self.pattern.format(f"{self.available_publishers[publisher]}_" + f"{self.oncae_systems[system]}_{date.year}_{date.month:02d}.json") + else: + yield self.pattern.format(f"{self.available_publishers[publisher]}_" + f"{date.year}_{date.month:02d}.json") + + def get_formatter(self): + return components(-1)