Skip to content

Commit

Permalink
utils: correctly write chunks of records to files
Browse files Browse the repository at this point in the history
* Previously, the root tag was not repeated in every file.
* Fixes inveniosoftware#41.

Signed-off-by: Micha Moskovic <michamos@gmail.com>
  • Loading branch information
michamos committed Sep 22, 2017
1 parent c8a8f6c commit 50da7be
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 14 deletions.
36 changes: 22 additions & 14 deletions invenio_oaiharvester/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from __future__ import absolute_import, print_function, unicode_literals

import codecs
import itertools
import os
import re
import tempfile
Expand Down Expand Up @@ -182,6 +183,16 @@ def create_file_name(output_dir):
return file_name


def chunks(iterable, size):
    """Split ``iterable`` into consecutive tuples of at most ``size`` items.

    The input is consumed lazily, one group at a time. Every yielded tuple
    holds ``size`` items except possibly the last one, which holds whatever
    remained. An empty tuple is never yielded: iteration simply ends once
    the input is exhausted.
    """
    source = iter(iterable)

    def take():
        # Pull the next (up to) ``size`` items off the shared iterator.
        return tuple(itertools.islice(source, size))

    # Two-argument ``iter`` keeps calling ``take`` until it returns the
    # empty tuple, which only happens when ``source`` has run dry.
    for group in iter(take, ()):
        yield group


def write_to_dir(records, output_dir, max_records=1000, encoding='utf-8'):
"""Check if the output directory exists, and creates it if it does not.
Expand All @@ -195,19 +206,16 @@ def write_to_dir(records, output_dir, max_records=1000, encoding='utf-8'):
return [], 0

output_path = check_or_create_dir(output_dir)

files_created = [create_file_name(output_path)]
files_created = []
total = 0 # total number of records processed
f = codecs.open(files_created[0], 'w+', encoding=encoding)
f.write('<ListRecords>')
for record in records:
total += 1
if total > 1 and total % max_records == 0:
# we need a new file to write to
f.close()
files_created.append(create_file_name(output_path))
f = codecs.open(files_created[-1], 'w+', encoding=encoding)
f.write(record.raw)
f.write('</ListRecords>')
f.close()

for chunk in chunks(records, max_records):
files_created.append(create_file_name(output_path))
with codecs.open(files_created[-1], 'w+', encoding=encoding) as f:
f.write('<ListRecords>')
for record in chunk:
f.write(record.raw)
total += 1
f.write('</ListRecords>')

return files_created, total
6 changes: 6 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,12 @@ def test_write_to_dir(app, tmpdir):
assert len(files) == 2
assert total == 2

for file_name in files:
with open(file_name) as f:
content = f.read()
assert content.startswith('<ListRecords>')
assert content.endswith('</ListRecords>')

files, total = write_to_dir([], tmpdir.dirname, max_records=1)
assert len(files) == 0
assert total == 0
Expand Down

0 comments on commit 50da7be

Please sign in to comment.