diff --git a/kingfisher_scrapy/spiders/ecuador_emergency.py b/kingfisher_scrapy/spiders/ecuador_emergency.py
index 1e631f1e..0e8431a9 100644
--- a/kingfisher_scrapy/spiders/ecuador_emergency.py
+++ b/kingfisher_scrapy/spiders/ecuador_emergency.py
@@ -14,43 +14,16 @@ class EcuadorEmergency(SimpleSpider):
     """
     name = 'ecuador_emergency'
     data_type = 'release_package'
-    custom_settings = {
-        'CONCURRENT_REQUESTS': 1,
-    }
-    urls = []
 
     def start_requests(self):
-        url = 'https://portal.compraspublicas.gob.ec/sercop/data-estandar-ocds/'
+        url = 'https://datosabiertos.compraspublicas.gob.ec/OCDS/'
         yield scrapy.Request(url, meta={'file_name': 'list.html'}, callback=self.parse_list)
 
     @handle_http_error
     def parse_list(self, response):
-        for row in response.xpath('//tr'):
-            html_url = row.xpath('td/strong/a/@href').extract_first()
-            filename = row.xpath('td/p/strong/text()').extract_first()
-            if html_url:
-                data_url = f'{html_url.replace("sharing", "fsdownload")}/ocds-{filename}.json'
-                self.urls.append((html_url, data_url))
-                if self.sample:
-                    break
+        html_urls = response.xpath('//a/@href').getall()
+        for html_url in html_urls:
+            yield self.build_request(response.request.url + html_url, formatter=components(-1))
 
-        yield self.request_cookie()
-
-    def request_cookie(self):
-        # This request sets a cookie, which must be used immediately to download the data. So, we set
-        # `CONCURRENT_REQUESTS` to 1, and yield the requests in order.
-        html_url, data_url = self.urls.pop()
-        return self.build_request(html_url, meta={'next': data_url}, formatter=components(-1),
-                                  callback=self.parse_page)
-
-    @handle_http_error
-    def parse_page(self, response):
-        # If there is an error, a request for the data URL redirects to the html URL. To treat this as an error, we set
-        # `dont_redirect`.
-        yield self.build_request(response.meta['next'], meta={'dont_redirect': True}, formatter=components(-1))
-
-    def parse(self, response):
-        yield from super().parse(response)
-
-        if self.urls:
-            yield self.request_cookie()
+            if self.sample:
+                break