diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index 744c831c..c03534ab 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -169,12 +169,12 @@ def _build_file_item(self, number, line, data_type, url, encoding):
             'encoding': encoding,
         }
 
-    def parse_json_lines(self, f, data_type, url, encoding='utf-8'):
+    def parse_json_lines(self, f, data_type, url, encoding='utf-8', in_object=None):
         for number, line in enumerate(f, 1):
             if self.sample and number > self.MAX_SAMPLE:
                 break
             if isinstance(line, bytes):
-                line = line.decode()
+                line = line.decode(encoding=encoding)
             yield self._build_file_item(number, line, data_type, url, encoding)
 
     def get_package(self, f, array_name):
@@ -217,7 +217,8 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
         """
         if response.status == 200:
             if file_format:
-                self.save_response_to_disk(response, 'file.zip')
+                self.save_response_to_disk(response, '{}.zip'.format(hashlib.md5(response.url.encode('utf-8'))
+                                                                     .hexdigest()))
             zip_file = ZipFile(BytesIO(response.body))
             for finfo in zip_file.infolist():
                 filename = finfo.filename
diff --git a/kingfisher_scrapy/spiders/colombia_bulk.py b/kingfisher_scrapy/spiders/colombia_bulk.py
index 13d03815..908dcf5a 100644
--- a/kingfisher_scrapy/spiders/colombia_bulk.py
+++ b/kingfisher_scrapy/spiders/colombia_bulk.py
@@ -1,30 +1,38 @@
-import codecs
-import json
-from io import BytesIO
 from urllib.parse import urlparse
-from zipfile import ZipFile
 
 import scrapy
 
-from kingfisher_scrapy.base_spider import BaseSpider
+from kingfisher_scrapy.base_spider import ZipSpider
 
 
-class ColombiaBulk(BaseSpider):
+class ColombiaBulk(ZipSpider):
+    """
+    Bulk download documentation
+    https://www.colombiacompra.gov.co/transparencia/datos-json
+
+    Spider arguments
+      sample
+        Downloads the zip file and sends 10 releases to kingfisher process.
+    """
     name = 'colombia_bulk'
-    start_urls = ['https://www.colombiacompra.gov.co/transparencia/datos-json']
     download_warnsize = 0
     download_timeout = 99999
     custom_settings = {
         'DOWNLOAD_FAIL_ON_DATALOSS': False,
     }
 
-    def parse(self, response):
+    def start_requests(self):
+        yield scrapy.Request(
+            url='https://www.colombiacompra.gov.co/transparencia/datos-json',
+            callback=self.parse_list
+        )
+
+    def parse_list(self, response):
         if response.status == 200:
             urls = response.css('.enlaces_contenido').css('a::attr(href)').getall()
-            urls = [urls[0]] if self.is_sample() else urls
+            urls = [urls[0]] if self.sample else urls
             for url in urls:
                 filename = urlparse(url).path.split('/')[-1]
-                yield scrapy.Request(url, meta={'kf_filename': filename}, callback=self.parse_items)
+                yield scrapy.Request(url, meta={'kf_filename': filename})
         else:
             yield {
                 'success': False,
@@ -32,34 +40,5 @@ def parse(self, response):
             'errors': {'http_code': response.status}
         }
 
-    def parse_items(self, response):
-        if response.status == 200:
-            release_list = []
-            page = 0
-            with ZipFile(BytesIO(response.body)) as zfile:
-                for name in zfile.namelist():
-                    with zfile.open(name, 'r') as read_file:
-                        for line in codecs.iterdecode(read_file, 'iso-8859-1'):
-                            release_data = json.loads(line)['Release']
-                            release_list.append(release_data)
-                            if len(release_list) == 1000:
-                                yield self.save_data_to_disk(release_list, '{}-{}'.format(page,
-                                                                                         name),
-                                                             encoding='iso-8859-1',
-                                                             data_type='release_list', url=response.request.url)
-                                release_list = []
-                                page = page + 1
-                    if release_list:
-                        yield self.save_data_to_disk(release_list, '{}-{}'.format(page,
-                                                                                 name),
-                                                     encoding='iso-8859-1',
-                                                     data_type='release_list', url=response.request.url)
-                    if self.is_sample():
-                        break
-        else:
-            yield {
-                'success': False,
-                'file_name': response.request.meta['kf_filename'],
-                'url': response.request.url,
-                'errors': {'http_code': response.status}
-            }
+    def parse(self, response):
+        yield from self.parse_zipfile(response, 'release_in_Release', file_format='json_lines', encoding='iso-8859-1')