Skip to content

Commit

Permalink
Merge bb69f7c into 282eaf9
Browse files Browse the repository at this point in the history
  • Loading branch information
aguilerapy committed Sep 21, 2020
2 parents 282eaf9 + bb69f7c commit f5b9e57
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 4 deletions.
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/colombia.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class Colombia(LinksSpider):
The year to crawl. See API documentation for valid values.
from_date
Download only releases from this release.date onward (YYYY-MM-DD format).
If `until_date` is provided and ``from_date`` don't, defaults to '2011-01-01'.
If ``until_date`` is provided and ``from_date`` is not, defaults to '2011-01-01'.
until_date
Download only releases until this release.date (YYYY-MM-DD format).
If ``from_date`` is provided and ``until_date`` is not, defaults to today.
Expand Down
35 changes: 32 additions & 3 deletions kingfisher_scrapy/spiders/honduras_oncae.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from os.path import split
from urllib.parse import urlparse

import scrapy

from kingfisher_scrapy.base_spider import CompressedFileSpider
Expand All @@ -9,16 +12,30 @@ class HondurasONCAE(CompressedFileSpider):
Bulk download documentation
http://oncae.gob.hn/datosabiertos
Spider arguments
system
Download only data from the provided system.
``HC1`` for "HonduCompras 1.0 - Módulo de Difusión de Compras y Contrataciones" system.
``CE`` for "Catálogo Electrónico" system.
``DDC`` for "Módulo de Difusión Directa de Contratos" system.
sample
Downloads the first package listed on the downloads page.
Downloads the first package listed on the downloads page for each system.
If ``system`` is also provided, a single package is downloaded from that system.
"""
name = 'honduras_oncae'
data_type = 'release_package'
skip_pluck = 'Already covered (see code for details)' # honduras_portal_releases
available_systems = ['HC1', 'CE', 'DDC']

# the files take too long to be downloaded, so we increase the download timeout
download_timeout = 900

@classmethod
def from_crawler(cls, crawler, system=None, *args, **kwargs):
spider = super().from_crawler(crawler, system=system, *args, **kwargs)
if system and spider.system not in spider.available_systems:
raise scrapy.exceptions.CloseSpider('Specified system is not recognized')
return spider

def start_requests(self):
yield scrapy.Request(
'http://oncae.gob.hn/datosabiertos',
Expand All @@ -28,9 +45,21 @@ def start_requests(self):

@handle_http_error
def parse_list(self, response):
downloaded_systems = set()
urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
if self.sample:
urls = [urls[0]]
for url in urls:
path, file = split(urlparse(url).path)
current_system = path.replace('/datosabiertos/', "")
if self.system and current_system != self.system:
continue
if self.sample:
# if we already downloaded a package for all the available systems
if downloaded_systems == self.available_systems:
return
# if we already processed a file for the current system
if current_system in downloaded_systems:
continue
# add the current system to the set of downloaded_systems
downloaded_systems.add(current_system)
# URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
yield self.build_request(url, formatter=components(-1))

0 comments on commit f5b9e57

Please sign in to comment.