Skip to content

Commit

Permalink
Update Colombia Base spider to use ZipSpider
Browse files: browse the repository at this point in the history
closes #376

Signed-off-by: Yohanna Lisnichuk <yohanitalisnichuk@gmail.com>
  • Loading branch information
yolile committed May 6, 2020
1 parent 24fa049 commit a625a32
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 44 deletions.
7 changes: 4 additions & 3 deletions kingfisher_scrapy/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,12 +169,12 @@ def _build_file_item(self, number, line, data_type, url, encoding):
'encoding': encoding,
}

def parse_json_lines(self, f, data_type, url, encoding='utf-8'):
def parse_json_lines(self, f, data_type, url, encoding='utf-8', in_object=None):
for number, line in enumerate(f, 1):
if self.sample and number > self.MAX_SAMPLE:
break
if isinstance(line, bytes):
line = line.decode()
line = line.decode(encoding=encoding)
yield self._build_file_item(number, line, data_type, url, encoding)

def get_package(self, f, array_name):
Expand Down Expand Up @@ -217,7 +217,8 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
"""
if response.status == 200:
if file_format:
self.save_response_to_disk(response, 'file.zip')
self.save_response_to_disk(response, '{}.zip'.format(hashlib.md5(response.url.encode('utf-8'))
.hexdigest()))
zip_file = ZipFile(BytesIO(response.body))
for finfo in zip_file.infolist():
filename = finfo.filename
Expand Down
61 changes: 20 additions & 41 deletions kingfisher_scrapy/spiders/colombia_bulk.py
Original file line number Diff line number Diff line change
@@ -1,65 +1,44 @@
import codecs
import json
from io import BytesIO
from urllib.parse import urlparse
from zipfile import ZipFile

import scrapy

from kingfisher_scrapy.base_spider import BaseSpider
from kingfisher_scrapy.base_spider import ZipSpider


class ColombiaBulk(BaseSpider):
class ColombiaBulk(ZipSpider):
"""
Bulk download documentation
https://www.colombiacompra.gov.co/transparencia/datos-json
Spider arguments
sample
Downloads the zip file and sends 10 releases to kingfisher process.
"""
name = 'colombia_bulk'
start_urls = ['https://www.colombiacompra.gov.co/transparencia/datos-json']
download_warnsize = 0
download_timeout = 99999
custom_settings = {
'DOWNLOAD_FAIL_ON_DATALOSS': False,
}

def parse(self, response):
def start_requests(self):
yield scrapy.Request(
url='https://www.colombiacompra.gov.co/transparencia/datos-json',
callback=self.parse_list
)

def parse_list(self, response):
if response.status == 200:
urls = response.css('.enlaces_contenido').css('a::attr(href)').getall()
urls = [urls[0]] if self.is_sample() else urls
urls = [urls[0]] if self.sample else urls
for url in urls:
filename = urlparse(url).path.split('/')[-1]
yield scrapy.Request(url, meta={'kf_filename': filename}, callback=self.parse_items)
yield scrapy.Request(url, meta={'kf_filename': filename})
else:
yield {
'success': False,
'url': response.request.url,
'errors': {'http_code': response.status}
}

def parse_items(self, response):
if response.status == 200:
release_list = []
page = 0
with ZipFile(BytesIO(response.body)) as zfile:
for name in zfile.namelist():
with zfile.open(name, 'r') as read_file:
for line in codecs.iterdecode(read_file, 'iso-8859-1'):
release_data = json.loads(line)['Release']
release_list.append(release_data)
if len(release_list) == 1000:
yield self.save_data_to_disk(release_list, '{}-{}'.format(page,
name),
encoding='iso-8859-1',
data_type='release_list', url=response.request.url)
release_list = []
page = page + 1
if release_list:
yield self.save_data_to_disk(release_list, '{}-{}'.format(page,
name),
encoding='iso-8859-1',
data_type='release_list', url=response.request.url)
if self.is_sample():
break
else:
yield {
'success': False,
'file_name': response.request.meta['kf_filename'],
'url': response.request.url,
'errors': {'http_code': response.status}
}
def parse(self, response):
yield from self.parse_zipfile(response, 'release_in_Release', file_format='json_lines', encoding='iso-8859-1')

0 comments on commit a625a32

Please sign in to comment.