Skip to content

Commit

Permalink
Merge c6755df into e3140c2
Browse files — browse the repository at this point in the history
  • Loading branch information
aguilerapy committed Nov 12, 2020
2 parents e3140c2 + c6755df commit 4031d00
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 17 deletions.
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/bolivia_agetic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import handle_http_error, components
from kingfisher_scrapy.util import components, handle_http_error


class BoliviaAgetic(SimpleSpider):
Expand Down
44 changes: 31 additions & 13 deletions kingfisher_scrapy/spiders/nigeria_portal.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

import scrapy

from kingfisher_scrapy.base_spider import SimpleSpider, browser_user_agent
Expand All @@ -18,22 +20,38 @@ class NigeriaPortal(SimpleSpider):
def start_requests(self):
    """
    Request the first page of the NOCOPO OpenData listing.

    Seeds ``meta['page']`` with 0 so that :meth:`parse_list` can name each
    downloaded file after the page it came from and compute the next page
    number while paginating.
    """
    yield scrapy.Request(
        'http://nocopo.bpp.gov.ng/OpenData.aspx',
        meta={'file_name': 'page-0.html', 'page': 0},
        callback=self.parse_list,
    )

@handle_http_error
def parse_list(self, response):
    """
    Parse one listing page of the OpenData table.

    Checks every per-row "chbIsDoing" checkbox, submits the
    "Export Checked to JSON" button to download the page's data as JSON,
    then simulates a click on the next pagination link (an ASP.NET
    ``__doPostBack`` postback) and calls itself on the result.
    """
    self.logger.debug('Crawled page {}'.format(response.xpath('//td[@colspan="8"]/span/text()').get()))

    page = response.request.meta['page']

    # Tick every row's "chbIsDoing" checkbox so the export covers all rows
    # on the current page. The checkbox names follow ASP.NET's generated
    # pattern, e.g. dnn$ctr561$no_JsonReport$DGno_Proc_PlanningPublished$ctl02$chbIsDoing.
    formdata = {}
    for name in response.xpath('//input/@name').getall():
        if re.search(r'^dnn\$ctr561\$no_JsonReport\$DGno_Proc_PlanningPublished\$ctl\d+\$chbIsDoing$', name):
            formdata[name] = 'on'

    # Submit the "Export Checked to JSON" button for the checked rows.
    yield scrapy.FormRequest.from_response(
        response,
        formdata=formdata,
        clickdata={'name': 'dnn$ctr561$no_JsonReport$lbtnExportAll'},
        meta={'file_name': f'page-{page}.json'},
    )

    # Get the next pagination link, and simulate clicking on it. The href
    # looks like javascript:__doPostBack('target','') — extract the target
    # and post it back with dont_click so no submit button is triggered.
    href = response.xpath('//td[@colspan="8"]/span/following-sibling::a[1]/@href').get()
    if href:
        yield scrapy.FormRequest.from_response(
            response,
            formdata={
                '__EVENTTARGET': re.search(r"'(.+?)'", href)[1],
                '__EVENTARGUMENT': '',
            },
            dont_click=True,
            meta={'file_name': f'page-{page}.html', 'page': page + 1},
            callback=self.parse_list,
        )
5 changes: 2 additions & 3 deletions tests/pipelines/test_unflatten.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from openpyxl import Workbook
from openpyxl.writer.excel import save_virtual_workbook

import pytest
from flattentool.input import BadXLSXZipFile
from openpyxl import Workbook
from openpyxl.writer.excel import save_virtual_workbook

from kingfisher_scrapy.items import File
from kingfisher_scrapy.pipelines import Unflatten
Expand Down

0 comments on commit 4031d00

Please sign in to comment.