Merge pull request #555 from open-contracting/346-add-date-args
Update Honduras scrapers
yolile committed Nov 17, 2020
2 parents 9dde2cf + bf4d753 commit 7911579
Showing 2 changed files with 71 additions and 49 deletions.
44 changes: 21 additions & 23 deletions kingfisher_scrapy/spiders/honduras_oncae.py
@@ -1,17 +1,19 @@
from urllib.parse import urlparse

import scrapy

from kingfisher_scrapy.base_spider import CompressedFileSpider
from kingfisher_scrapy.base_spider import CompressedFileSpider, PeriodicSpider
from kingfisher_scrapy.exceptions import SpiderArgumentError
from kingfisher_scrapy.util import components, handle_http_error
from kingfisher_scrapy.util import components


class HondurasONCAE(CompressedFileSpider):
class HondurasONCAE(CompressedFileSpider, PeriodicSpider):
"""
Domain
Oficina Normativa de Contratación y Adquisiciones del Estado (ONCAE)
Spider arguments
from_date
Download only releases from this year onward (YYYY format).
If ``until_date`` is provided and ``from_date`` is not, it defaults to '2005'.
until_date
Download only releases until this year (YYYY format).
If ``from_date`` is provided and ``until_date`` is not, it defaults to the current year.
system
Filter by system:
@@ -32,27 +34,23 @@ class HondurasONCAE(CompressedFileSpider):
# The files take a long time to download, so we increase the download timeout.
download_timeout = 900

# PeriodicSpider variables
date_format = 'year'
default_from_date = '2005'
pattern = 'http://200.13.162.79/datosabiertos/{}'

@classmethod
def from_crawler(cls, crawler, system=None, *args, **kwargs):
spider = super().from_crawler(crawler, system=system, *args, **kwargs)
if system and spider.system not in spider.available_systems:
raise SpiderArgumentError(f'spider argument `system`: {spider.system!r} not recognized')
return spider

def start_requests(self):
yield scrapy.Request(
'http://oncae.gob.hn/datosabiertos',
meta={'file_name': 'list.html'},
callback=self.parse_list
)

@handle_http_error
def parse_list(self, response):
urls = response.xpath('//a[contains(., "[json]")]/@href').getall()
for url in urls:
path, file = urlparse(url).path.rsplit('/', 1)
current_system = path.replace('/datosabiertos/', "")
if self.system and current_system != self.system:
def build_urls(self, date):
for system in self.available_systems:
if self.system and system != self.system:
continue
# URL looks like http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
yield self.build_request(url, formatter=components(-1))
yield self.pattern.format(f"{system}/{system}_datos_{date}_json.zip")

def get_formatter(self):
return components(-1)
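
The first file's refactor drops the HTML scraping of http://oncae.gob.hn/datosabiertos and instead builds year-based ZIP URLs via PeriodicSpider. Below is a minimal standalone sketch (not part of the commit) of what the new build_urls yields, assuming PeriodicSpider calls it once per year between from_date and until_date, and assuming available_systems holds the ONCAE system codes ('CE', 'DDC', 'HC1') named in the docstrings; the attribute itself sits in the collapsed part of the diff.

# Standalone sketch, not part of the commit.
pattern = 'http://200.13.162.79/datosabiertos/{}'
available_systems = ['CE', 'DDC', 'HC1']  # assumed from the docstrings; see note above

def build_urls(date, system=None):
    # Yield one ZIP URL per ONCAE system for the given year, honoring the
    # optional `system` filter, mirroring the logic in the diff above.
    for current_system in available_systems:
        if system and current_system != system:
            continue
        # e.g. http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip
        yield pattern.format(f"{current_system}/{current_system}_datos_{date}_json.zip")

print(list(build_urls(2020, system='HC1')))
# ['http://200.13.162.79/datosabiertos/HC1/HC1_datos_2020_json.zip']

With the new date arguments in place, a crawl limited to one system and year range would use Scrapy's standard -a spider-argument syntax, e.g. scrapy crawl honduras_oncae -a system=HC1 -a from_date=2018 (a hypothetical invocation; the project may wrap this in its own tooling).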
76 changes: 50 additions & 26 deletions kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
@@ -1,54 +1,78 @@
import json

import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.base_spider import PeriodicSpider
from kingfisher_scrapy.exceptions import SpiderArgumentError
from kingfisher_scrapy.util import components, handle_http_error
from kingfisher_scrapy.util import components


class HondurasPortalBulkFiles(SimpleSpider):
class HondurasPortalBulkFiles(PeriodicSpider):
"""
Domain
Oficina Normativa de Contratación y Adquisiciones del Estado (ONCAE) / Secretaria de Finanzas de Honduras (SEFIN)
Spider arguments
from_date
Download only releases from this date onward (YYYY-MM format).
If ``until_date`` is provided and ``from_date`` is not, it defaults to '2005-11'.
until_date
Download only releases until this date (YYYY-MM format).
If ``from_date`` is provided and ``until_date`` is not, it defaults to the current year and month.
publisher
Filter by publisher:
oncae
Oficina Normativa de Contratación y Adquisiciones del Estado
sefin
Secretaria de Finanzas de Honduras
system
Filter by oncae system:
CE
Catálogo Electrónico
DDC
Módulo de Difusión Directa de Contratos
HC1
HonduCompras 1.0 (Módulo de Difusión de Compras y Contrataciones)
Bulk download documentation
http://www.contratacionesabiertas.gob.hn/descargas/
"""
name = 'honduras_portal_bulk_files'
data_type = 'release_package'
skip_pluck = 'Already covered (see code for details)' # honduras_portal_releases
publishers = {'oncae': 'ONCAE', 'sefin': 'Secretaria de Finanzas'}
available_publishers = {'oncae': 'oficina_normativa', 'sefin': 'secretaria_de_fin_HN.SIAFI2'}
oncae_systems = {'HC1': 'honducompras-1', 'CE': 'catalogo-electronico', 'DDC': 'difusion-directa-contrato'}

# PeriodicSpider variables
date_format = 'year-month'
default_from_date = '2005-11'
pattern = 'http://www.contratacionesabiertas.gob.hn/api/v1/descargas/{}'

@classmethod
def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
spider = super().from_crawler(crawler, publisher=publisher, *args, **kwargs)
if publisher and spider.publisher not in spider.publishers:
def from_crawler(cls, crawler, publisher=None, system=None, *args, **kwargs):
spider = super().from_crawler(crawler, publisher=publisher, system=system, *args, **kwargs)
if publisher and spider.publisher not in spider.available_publishers:
raise SpiderArgumentError(f'spider argument `publisher`: {spider.publisher!r} not recognized')

spider.publisher_name = spider.publishers.get(publisher)
if system:
if spider.publisher != 'oncae':
raise SpiderArgumentError(f'spider argument `system` is not supported for publisher: '
f'{spider.publisher!r}')
if spider.system not in spider.oncae_systems:
raise SpiderArgumentError(f'spider argument `system`: {spider.system!r} not recognized')

return spider

def start_requests(self):
yield scrapy.Request(
'http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json',
meta={'file_name': 'list.json'},
callback=self.parse_list,
)

@handle_http_error
def parse_list(self, response):
items = json.loads(response.text)
for item in items:
if self.publisher and self.publisher_name not in item['publicador']:
def build_urls(self, date):
for publisher in self.available_publishers:
if self.publisher and publisher != self.publisher:
continue
url = item['urls']['json']
yield self.build_request(url, formatter=components(-1))

if publisher == 'oncae':
for system in self.oncae_systems:
if self.system and system != self.system:
continue
yield self.pattern.format(f"{self.available_publishers[publisher]}_"
f"{self.oncae_systems[system]}_{date.year}_{date.month:02d}.json")
else:
yield self.pattern.format(f"{self.available_publishers[publisher]}_"
f"{date.year}_{date.month:02d}.json")

def get_formatter(self):
return components(-1)
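
The second file follows the same pattern: rather than fetching the JSON listing from the descargas API and filtering it in parse_list, the spider now composes the bulk-file URLs directly from the publisher and system mappings for every year-month in the requested range. A minimal standalone sketch (not part of the commit) of what the new build_urls produces for one month, assuming PeriodicSpider passes it a date object per month:

# Standalone sketch, not part of the commit; mappings copied from the diff above.
from datetime import date

pattern = 'http://www.contratacionesabiertas.gob.hn/api/v1/descargas/{}'
available_publishers = {'oncae': 'oficina_normativa', 'sefin': 'secretaria_de_fin_HN.SIAFI2'}
oncae_systems = {'HC1': 'honducompras-1', 'CE': 'catalogo-electronico', 'DDC': 'difusion-directa-contrato'}

def build_urls(d, publisher=None, system=None):
    # Yield one JSON URL per publisher (and, for ONCAE, per system) for the
    # given year-month, honoring the optional publisher/system filters.
    for current_publisher in available_publishers:
        if publisher and current_publisher != publisher:
            continue
        if current_publisher == 'oncae':
            for current_system in oncae_systems:
                if system and current_system != system:
                    continue
                yield pattern.format(f"{available_publishers[current_publisher]}_"
                                     f"{oncae_systems[current_system]}_{d.year}_{d.month:02d}.json")
        else:
            yield pattern.format(f"{available_publishers[current_publisher]}_{d.year}_{d.month:02d}.json")

for url in build_urls(date(2020, 11, 1)):
    print(url)
# http://www.contratacionesabiertas.gob.hn/api/v1/descargas/oficina_normativa_honducompras-1_2020_11.json
# ... one URL per ONCAE system, plus one SEFIN file for the month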
