Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Fix portugal scraper with json lines #357

Merged
merged 14 commits into from
Apr 22, 2020
40 changes: 33 additions & 7 deletions kingfisher_scrapy/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,21 @@ def _get_crawl_path(self):
# https://docs.python.org/3.8/library/functions.html#super
# https://rhettinger.wordpress.com/2011/05/26/super-considered-super/
class BaseSpider(KingfisherSpiderMixin, scrapy.Spider):

def parse_json_lines(self, f, data_type, url, encoding='utf-8'):
    """
    Yield one Kingfisher Process item per line read from ``f``.

    :param f: an iterable of lines (e.g. an open file object)
    :param str data_type: the data type reported on every yielded item
    :param str url: the URL reported on every yielded item
    :param str encoding: the encoding reported on every yielded item (defaults to utf-8)
    """
    line_number = 0
    for line in f:
        line_number += 1
        yield dict(
            success=True,
            number=line_number,
            file_name='data.json',
            data=line,
            data_type=data_type,
            url=url,
            encoding=encoding,
        )
        # In sample mode, stop once ten lines have been yielded.
        if self.sample and line_number > 9:
            break

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(BaseSpider, cls).from_crawler(crawler, *args, **kwargs)
Expand Down Expand Up @@ -151,20 +166,31 @@ def from_crawler(cls, crawler, *args, **kwargs):

return spider

def parse_zipfile(self, response, data_type):
def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8'):
"""
Handling response with JSON data in ZIP files

:param str file_format: The zipped files' format. If this is set to 'json_lines', then the zipped file will be
split into lines before sending it to kingfisher-process, and only the zip file will be
stored as a file.
:param response response: the response that contains the zip file.
:param str data_type: the zipped files' data_type
:param str encoding: the zipped files' encoding. Defaults to utf-8
"""
if response.status == 200:
if file_format == 'json_lines':
self.save_response_to_disk(response, 'file.zip')
zip_file = ZipFile(BytesIO(response.body))
for finfo in zip_file.infolist():
data = zip_file.open(finfo.filename).read()
if finfo.filename.endswith('.json'):
filename = finfo.filename
filename = finfo.filename
if not filename.endswith('.json'):
filename += '.json'
data = zip_file.open(finfo.filename)
if file_format == 'json_lines':
yield from self.parse_json_lines(data, data_type, response.request.url, encoding=encoding)
else:
filename = finfo.filename + '.json'
yield self.save_data_to_disk(data, filename, data_type=data_type, url=response.request.url)

yield self.save_data_to_disk(data.read(), filename, data_type, response.request.url,
encoding=encoding)
else:
yield {
'success': False,
Expand Down
17 changes: 1 addition & 16 deletions kingfisher_scrapy/spiders/digiwhist_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,9 @@ def parse(self, response):
}))

# Load a line at the time, pass it to API
number = 0
with tarfile.open(save_file_name, "r:gz") as tar:
with tar.extractfile(tar.getnames()[0]) as readfp:
line = readfp.readline()
while line:
number += 1
yield {
'success': True,
'number': number,
'file_name': 'data.json',
'data': line,
'data_type': 'release_package',
'url': self.start_urls[0],
}
line = readfp.readline()
if self.sample and number > 10:
break

yield from self.parse_json_lines(readfp, 'release_package', self.start_urls[0])
else:
yield {
'success': False,
Expand Down
3 changes: 2 additions & 1 deletion kingfisher_scrapy/spiders/portugal.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,5 @@ def parse_list(self, response):
}

def parse(self, response):
yield from self.parse_zipfile(response, data_type='record_package_json_lines')
yield from self.parse_zipfile(response, data_type='record_package_json_lines',
file_format='json_lines', encoding='iso-8859-1')
30 changes: 30 additions & 0 deletions tests/test_base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,36 @@ def test_parse_zipfile_200():
assert actual['success'] is True and actual['file_name'].find('.json')


def test_parse_zipfile_json_lines():
    """
    parse_zipfile() with file_format='json_lines' yields one item per line of
    the zipped file, and at most 10 items when sampling is enabled.
    """
    response = text.TextResponse('test')
    response.status = 200
    response.request = Mock()
    response.request.meta = {'kf_filename': 'test.json'}
    response.request.url = 'url'
    with TemporaryDirectory() as tmpdirname:
        files_store = os.path.join(tmpdirname, 'data')
        tmp = os.path.join(files_store, 'test/20010203_040506')
        os.makedirs(tmp)
        # Write 11 JSON lines so the sample limit (10) can be exercised.
        # Use os.path.join: the previous `tmp + "test.json"` was missing a
        # separator and wrote the fixture beside, not inside, the directory.
        json_path = os.path.join(tmp, 'test.json')
        with open(json_path, 'w') as f:
            f.write('\n'.join(['{"key": "value"}'] * 11))
        zip_path = os.path.join(tmp, 'test.zip')
        with ZipFile(zip_path, 'w') as z:
            z.write(json_path)
        with open(zip_path, 'rb') as z:
            response = response.replace(body=z.read())
        spider = spider_with_crawler(spider_class=BaseSpider)
        spider.crawler.settings['FILES_STORE'] = files_store
        # Without sampling, the first item is line number 1.
        actual = next(spider.parse_zipfile(response, None, file_format='json_lines'))
        assert actual['success'] is True and actual['number'] == 1
        # With sampling, exactly 10 items are yielded, numbered sequentially.
        spider.sample = True
        total = 0
        for total, item in enumerate(spider.parse_zipfile(response, None, file_format='json_lines'), 1):
            assert item['success'] is True and item['number'] == total
        assert total == 10


def test_date_arguments():
test_date = '2000-01-01'
error_message = "time data 'test' does not match format '%Y-%m-%d'"
Expand Down