Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix portugal scraper with json lines #357

Merged
merged 14 commits into from
Apr 22, 2020
34 changes: 27 additions & 7 deletions kingfisher_scrapy/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,20 +107,40 @@ def _get_crawl_path(self):
# https://docs.python.org/3.8/library/functions.html#super
# https://rhettinger.wordpress.com/2011/05/26/super-considered-super/
class BaseSpider(KingfisherSpiderMixin, scrapy.Spider):
def parse_zipfile(self, response, data_type):

def parse_json_lines(self, f, data_type, url, encoding='utf-8'):
    """
    Yield one Kingfisher file item per line of ``f``.

    :param f: an iterable of JSON lines (e.g. a file object or a
        ``ZipFile.open(...)`` stream)
    :param str data_type: the OCDS data type of each line
        (e.g. ``'release_package'``)
    :param str url: the URL the data was downloaded from, echoed in
        each yielded item
    :param str encoding: the character encoding of the data
        (default ``'utf-8'``)

    Line numbers start at 1. When ``self.sample`` is truthy, stops
    after yielding the 11th line (the check runs after the yield, so
    lines 1-11 are emitted — kept for backward compatibility with the
    original counter logic).
    """
    # enumerate(f, 1) replaces the manual `number += 1` counter.
    for number, line in enumerate(f, 1):
        yield {
            'success': True,
            'number': number,
            'file_name': 'data.json',
            'data': line,
            'data_type': data_type,
            'url': url,
            'encoding': encoding,
        }
        if self.sample and number > 10:
            break

def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8'):
"""
Handling response with JSON data in ZIP files
"""
if response.status == 200:
if file_format == 'json_lines':
self.save_response_to_disk(response, 'file.zip')
zip_file = ZipFile(BytesIO(response.body))
for finfo in zip_file.infolist():
data = zip_file.open(finfo.filename).read()
if finfo.filename.endswith('.json'):
filename = finfo.filename
filename = finfo.filename
if not filename.endswith('.json'):
filename += '.json'
data = zip_file.open(finfo.filename)
if file_format == 'json_lines':
yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding)
else:
filename = finfo.filename + '.json'
yield self.save_data_to_disk(data, filename, data_type=data_type, url=response.request.url)

yield self.save_data_to_disk(data.read(), filename, data_type, response.request.url,
encoding=encoding)
else:
yield {
'success': False,
Expand Down
17 changes: 1 addition & 16 deletions kingfisher_scrapy/spiders/digiwhist_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,9 @@ def parse(self, response):
}))

# Load a line at the time, pass it to API
number = 0
with tarfile.open(save_file_name, "r:gz") as tar:
with tar.extractfile(tar.getnames()[0]) as readfp:
line = readfp.readline()
while line:
number += 1
yield {
'success': True,
'number': number,
'file_name': 'data.json',
'data': line,
'data_type': 'release_package',
'url': self.start_urls[0],
}
line = readfp.readline()
if self.sample and number > 10:
break

yield from self.parse_json_lines(readfp, 'release_package', url=self.start_urls[0])
jpmckinney marked this conversation as resolved.
Show resolved Hide resolved
else:
yield {
'success': False,
Expand Down
3 changes: 2 additions & 1 deletion kingfisher_scrapy/spiders/portugal.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,5 @@ def parse_list(self, response):
}

def parse(self, response):
    """
    Delegate to ``BaseSpider.parse_zipfile``, treating the downloaded
    ZIP's entries as JSON-lines record packages in ISO-8859-1.

    NOTE(review): the pasted diff contained both the old call (without
    ``file_format``/``encoding``) and the new one, which would have
    processed the ZIP twice; only the merged version is kept.
    """
    yield from self.parse_zipfile(response, data_type='record_package_json_lines',
                                  file_format='json_lines', encoding='ISO-8859-1')
jpmckinney marked this conversation as resolved.
Show resolved Hide resolved