Skip to content

Commit

Permalink
Merge c4182b6 into 279e79c
Browse files Browse the repository at this point in the history
  • Loading branch information
aguilerapy committed Sep 17, 2020
2 parents 279e79c + c4182b6 commit 12a5a97
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 29 deletions.
26 changes: 26 additions & 0 deletions kingfisher_scrapy/spiders/honduras_portal_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import scrapy

from kingfisher_scrapy.base_spider import LinksSpider
from kingfisher_scrapy.util import parameters


class HondurasPortalBase(LinksSpider):
    """
    Common behaviour for spiders that accept an optional ``publisher`` argument.

    Classes deriving from this one must provide a ``url`` attribute, which
    ``start_requests`` reads to build the first request.
    """
    # Pagination settings; presumably consumed by LinksSpider to follow the
    # "next" link of each response -- confirm against the base spider.
    next_pointer = '/next'
    next_page_formatter = staticmethod(parameters('page'))
    # The only values accepted for the ``publisher`` spider argument.
    publishers = ['oncae', 'sefin']

    download_delay = 0.9

    @classmethod
    def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
        """
        Build the spider, then reject a ``publisher`` that is not listed in ``publishers``.

        :param crawler: the crawler, forwarded unchanged to the parent ``from_crawler``
        :param publisher: optional publisher code; forwarded to the parent as a keyword argument
        :returns: the spider created by the parent ``from_crawler``
        :raises scrapy.exceptions.CloseSpider: if ``publisher`` is truthy and not in ``publishers``
        """
        spider = super().from_crawler(crawler, publisher=publisher, *args, **kwargs)

        # Nothing to validate when no publisher was given, or when it is a known one.
        if not publisher or publisher in spider.publishers:
            return spider

        raise scrapy.exceptions.CloseSpider('Specified publisher is not recognized')

    def start_requests(self):
        """
        Yield the single initial request, named ``page-1.json``.

        When ``self.publisher`` is truthy, ``&publisher=<value>`` is appended to
        ``self.url``. The leading ``&`` suggests ``self.url`` is expected to carry
        a query string already -- confirm in the classes that define ``url``.
        """
        first_page_url = self.url
        # NOTE(review): ``self.publisher`` is presumably set on the spider from the
        # keyword forwarded in ``from_crawler`` -- verify against the base spider.
        if self.publisher:
            first_page_url = first_page_url + '&publisher=' + self.publisher

        request_meta = {'file_name': 'page-1.json'}
        yield scrapy.Request(first_page_url, meta=request_meta)
27 changes: 24 additions & 3 deletions kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,31 @@ class HondurasPortalBulkFiles(SimpleSpider):
Bulk download documentation
http://www.contratacionesabiertas.gob.hn/descargas/
Spider arguments
publisher
Filter the data by a specific publisher.
``oncae`` for "Oficina Normativa de Contratación y Adquisiciones del Estado" publisher.
``sefin`` for "Secretaria de Finanzas de Honduras" publisher.
sample
Downloads the first package listed in http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json.
If ``publisher`` is also provided, a single package is downloaded from that publisher.
"""
name = 'honduras_portal_bulk_files'
data_type = 'release_package'
skip_pluck = 'Already covered (see code for details)' # honduras_portal_releases
publishers = ['oncae', 'sefin']

@classmethod
def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
    """
    Build the spider, validate ``publisher`` and set the matching ``publisher_filter``.

    :param crawler: the crawler, forwarded unchanged to the parent ``from_crawler``
    :param publisher: optional publisher code; forwarded to the parent as a keyword argument
    :returns: the spider created by the parent ``from_crawler``
    :raises scrapy.exceptions.CloseSpider: if ``publisher`` is truthy and not in ``publishers``
    """
    spider = super().from_crawler(crawler, publisher=publisher, *args, **kwargs)

    unknown_publisher = publisher and publisher not in spider.publishers
    if unknown_publisher:
        raise scrapy.exceptions.CloseSpider('Specified publisher is not recognized')

    # Publisher code -> text stored in ``publisher_filter``. The attribute is
    # left unset when ``publisher`` matches neither code (e.g. when it is None).
    filter_by_code = (
        ('oncae', 'ONCAE'),
        ('sefin', 'Secretaria de Finanzas'),
    )
    for code, filter_text in filter_by_code:
        if publisher == code:
            spider.publisher_filter = filter_text
            break

    return spider

def start_requests(self):
yield scrapy.Request(
Expand All @@ -28,9 +47,11 @@ def start_requests(self):
@handle_http_error
def parse_list(self, response):
items = json.loads(response.text)
if self.sample:
items = [items[0]]

for item in items:
if self.publisher and self.publisher_filter not in item['publicador']:
continue
url = item['urls']['json']
yield self.build_request(url, formatter=components(-1))

if self.sample:
return
21 changes: 8 additions & 13 deletions kingfisher_scrapy/spiders/honduras_portal_records.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,23 @@
import scrapy
from kingfisher_scrapy.spiders.honduras_portal_base import HondurasPortalBase

from kingfisher_scrapy.base_spider import LinksSpider
from kingfisher_scrapy.util import parameters


class HondurasPortalRecords(LinksSpider):
class HondurasPortalRecords(HondurasPortalBase):
"""
API documentation
http://www.contratacionesabiertas.gob.hn/manual_api/
Swagger API documentation
http://www.contratacionesabiertas.gob.hn/servicio/
Spider arguments
publisher
Filter the data by a specific publisher.
``oncae`` for "Oficina Normativa de Contratación y Adquisiciones del Estado" publisher.
``sefin`` for "Secretaria de Finanzas de Honduras" publisher.
sample
Download only the first record package in the dataset.
If ``publisher`` is also provided, a single package is downloaded from that publisher.
"""
name = 'honduras_portal_records'
data_type = 'record_package'
data_pointer = '/recordPackage'
next_pointer = '/next'
next_page_formatter = staticmethod(parameters('page'))
skip_pluck = 'Already covered (see code for details)' # honduras_portal_releases

download_delay = 0.9

def start_requests(self):
url = 'http://www.contratacionesabiertas.gob.hn/api/v1/record/?format=json'
yield scrapy.Request(url, meta={'file_name': 'page-1.json'})
url = 'http://www.contratacionesabiertas.gob.hn/api/v1/record/?format=json'
21 changes: 8 additions & 13 deletions kingfisher_scrapy/spiders/honduras_portal_releases.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,22 @@
import scrapy
from kingfisher_scrapy.spiders.honduras_portal_base import HondurasPortalBase

from kingfisher_scrapy.base_spider import LinksSpider
from kingfisher_scrapy.util import parameters


class HondurasPortalReleases(LinksSpider):
class HondurasPortalReleases(HondurasPortalBase):
"""
API documentation
http://www.contratacionesabiertas.gob.hn/manual_api/
Swagger API documentation
http://www.contratacionesabiertas.gob.hn/servicio/
Spider arguments
publisher
Filter the data by a specific publisher.
``oncae`` for "Oficina Normativa de Contratación y Adquisiciones del Estado" publisher.
``sefin`` for "Secretaria de Finanzas de Honduras" publisher.
sample
Download only the first release package in the dataset.
If ``publisher`` is also provided, a single package is downloaded from that publisher.
"""
name = 'honduras_portal_releases'
data_type = 'release_package'
data_pointer = '/releasePackage'
next_pointer = '/next'
next_page_formatter = staticmethod(parameters('page'))

download_delay = 0.9

def start_requests(self):
url = 'http://www.contratacionesabiertas.gob.hn/api/v1/release/?format=json'
yield scrapy.Request(url, meta={'file_name': 'page-1.json'})
url = 'http://www.contratacionesabiertas.gob.hn/api/v1/release/?format=json'

0 comments on commit 12a5a97

Please sign in to comment.