-
Notifications
You must be signed in to change notification settings - Fork 12
/
mexico_inai_portal.py
41 lines (35 loc) · 1.44 KB
/
mexico_inai_portal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import scrapy
from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, handle_http_error
class MexicoINAIPortal(SimpleSpider):
    """
    Spider arguments
      sample
        Downloads the records listed on the first page at
        http://contratacionesabiertas.inai.org.mx/contratacionesabiertas/contratos/.
    """
    name = 'mexico_inai_portal'
    data_type = 'record'

    def build_url(self, page):
        """
        Return the form request for one page of the contract list.

        Despite its name, this returns a ``scrapy.FormRequest`` (whose response is handled by
        :meth:`parse_list`), not a URL string.

        :param page: the page number to request; it is converted to a string because it is sent
                     as a form field and used in the ``file_name`` meta value
        """
        page = str(page)
        return scrapy.FormRequest(
            'http://contratacionesabiertas.inai.org.mx/contratacionesabiertas/pagination',
            meta={'file_name': f'{page}.html'},
            callback=self.parse_list,
            formdata={
                'npage': page,
                # The other form fields are sent empty; results are ordered by 'datesigned'.
                'keyword': '',
                'process': '',
                'stage': '',
                'status': '',
                'year': '',
                'orderby': 'datesigned',
            },
        )

    def start_requests(self):
        """Yield the request for the first page of the contract list."""
        yield self.build_url('1')

    @handle_http_error
    def parse_list(self, response):
        """
        Yield one request per download link on a list page, then the request for the next page.

        The next page is not requested when the ``sample`` spider argument is set.
        """
        for row in response.xpath('//div[@class="contract-download col-md-1"]'):
            href = row.xpath('div/a/@href').extract_first()
            if not href:
                # Without this guard, a download cell lacking a link would pass None to build_request.
                self.logger.warning('Skipping a row without a download link on %s', response.url)
                continue
            # Resolve relative links against the page URL; absolute links are returned unchanged.
            yield self.build_request(response.urljoin(href), formatter=components(-1))

        next_page = response.xpath('//ul[@class="pagination"]/li/a[@aria-label="Next"]/@data-page').extract_first()
        if next_page and not self.sample:
            yield self.build_url(next_page)