-
Notifications
You must be signed in to change notification settings - Fork 12
/
honduras_portal_bulk_files.py
50 lines (41 loc) · 1.86 KB
/
honduras_portal_bulk_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import json
import scrapy
from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, handle_http_error
class HondurasPortalBulkFiles(SimpleSpider):
    """
    Downloads release packages listed by the Honduras portal's bulk-download
    API, optionally restricted to a single publisher.

    Bulk download documentation
      http://www.contratacionesabiertas.gob.hn/descargas/

    Spider arguments
      publisher
        Filter the data by a specific publisher.
        ``oncae`` for "Oficina Normativa de Contratación y Adquisiciones del Estado" publisher.
        ``sefin`` for "Secretaria de Finanzas de Honduras" publisher.
      sample
        If ``publisher`` is also provided, the set number of release packages
        is downloaded from that publisher.
    """
    name = 'honduras_portal_bulk_files'
    data_type = 'release_package'
    skip_pluck = 'Already covered (see code for details)'  # honduras_portal_releases

    # Maps each accepted ``publisher`` argument to the substring matched
    # against the 'publicador' field of items in the API's file listing.
    publishers = {'oncae': 'ONCAE', 'sefin': 'Secretaria de Finanzas'}

    @classmethod
    def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
        """Build the spider, rejecting any unrecognized ``publisher`` value."""
        spider = super().from_crawler(crawler, publisher=publisher, *args, **kwargs)
        if publisher and publisher not in spider.publishers:
            raise scrapy.exceptions.CloseSpider('Specified publisher is not recognized')
        # None when no publisher filter was requested.
        spider.publisher_name = spider.publishers.get(publisher)
        return spider

    def start_requests(self):
        # Single entry point: the API returns the complete list of bulk files.
        yield scrapy.Request(
            'http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json',
            meta={'file_name': 'list.json'},
            callback=self.parse_list,
        )

    @handle_http_error
    def parse_list(self, response):
        """Request the JSON file of every listed item, honoring the publisher filter."""
        for entry in json.loads(response.text):
            # Skip entries from other publishers when a filter is active.
            if self.publisher and self.publisher_name not in entry['publicador']:
                continue
            yield self.build_request(entry['urls']['json'], formatter=components(-1))