Skip to content

Commit

Permalink
Merge 4f3c774 into 279e79c
Browse files Browse the repository at this point in the history
  • Loading branch information
aguilerapy committed Sep 17, 2020
2 parents 279e79c + 4f3c774 commit d004b38
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 4 deletions.
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/colombia.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class Colombia(LinksSpider):
The year to crawl. See API documentation for valid values.
from_date
Download only releases from this release.date onward (YYYY-MM-DD format).
If `until_date` is provided and ``from_date`` don't, defaults to '2011-01-01'.
If ``until_date`` is provided and ``from_date`` isn't, defaults to '2011-01-01'.
until_date
Download only releases until this release.date (YYYY-MM-DD format).
If ``from_date`` is provided and ``until_date`` isn't, defaults to today.
Expand Down
31 changes: 28 additions & 3 deletions kingfisher_scrapy/spiders/honduras_oncae.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from os.path import split
from urllib.parse import urlparse

import scrapy

from kingfisher_scrapy.base_spider import CompressedFileSpider
Expand All @@ -9,16 +12,29 @@ class HondurasONCAE(CompressedFileSpider):
Bulk download documentation
http://oncae.gob.hn/datosabiertos
Spider arguments
system
Download only data from the provided system.
``HC1`` for "HonduCompras 1.0 - Módulo de Difusión de Compras y Contrataciones" system.
``CE`` for "Catálogo Electrónico" system.
``DDC`` for "Módulo de Difusión Directa de Contratos" system.
sample
Downloads the first package listed on the downloads page.
Downloads the first package listed on the downloads page for each system.
"""
name = 'honduras_oncae'
data_type = 'release_package'
skip_pluck = 'Already covered (see code for details)' # honduras_portal_releases
systems = ['HC1', 'CE', 'DDC']

# the files take too long to be downloaded, so we increase the download timeout
download_timeout = 900

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
    """
    Build the spider through the parent ``from_crawler`` and validate the optional ``system`` argument.

    :param crawler: the crawler passed through to the parent implementation
    :returns: the spider instance created by the parent implementation
    :raises scrapy.exceptions.CloseSpider: if the spider has a ``system`` attribute whose value is not
        listed in ``systems``
    """
    instance = super().from_crawler(crawler, *args, **kwargs)
    try:
        requested = instance.system
    except AttributeError:
        # No ``system`` attribute is set on the spider, so there is nothing to validate.
        return instance
    if requested in instance.systems:
        return instance
    raise scrapy.exceptions.CloseSpider('Specified system is not recognized')

def start_requests(self):
yield scrapy.Request(
'http://oncae.gob.hn/datosabiertos',
Expand All @@ -28,9 +44,18 @@ def start_requests(self):

@handle_http_error
def parse_list(self, response):
    """
    Yield one download request per "[json]" link on the downloads page.

    Each link is assigned to a system by taking the directory part of its URL path and removing the
    ``/datosabiertos/`` prefix.

    - If the spider has a ``system`` attribute, links belonging to any other system are skipped.
    - If ``self.sample`` is truthy, only the first link of each system is requested, and iteration stops
      as soon as every wanted system has been sampled.

    :param response: the response for the downloads page
    :returns: a generator of the requests built by ``self.build_request``
    """
    only_system = getattr(self, 'system', None)
    # In sample mode, these are the systems for which one package is wanted.
    wanted = {only_system} if only_system is not None else set(self.systems)
    # A set (rather than a dict pre-filled from ``self.systems``) so that a directory that isn't
    # listed in ``self.systems`` can't raise a KeyError.
    sampled = set()

    for url in response.xpath('//a[contains(., "[json]")]/@href').getall():
        # URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
        directory, _ = split(urlparse(url).path)
        current_system = directory.replace('/datosabiertos/', "")

        if only_system is not None and current_system != only_system:
            continue

        if self.sample:
            if current_system in sampled:
                continue
            sampled.add(current_system)

        yield self.build_request(url, formatter=components(-1))

        # Stop once every wanted system has contributed its first package.
        if self.sample and sampled >= wanted:
            return

0 comments on commit d004b38

Please sign in to comment.