Skip to content

Commit

Permalink
Fix decoding error when chunk breaks multibyte character
Browse files Browse the repository at this point in the history
When a chunk ends with an incomplete multibyte sequence, process the data
only up to the start of that sequence. When the next chunk is read, prepend
the leftovers to it. This should complete the sequence, and processing
should continue normally.

Fixes: release-engineering#119
  • Loading branch information
lubomir committed Oct 2, 2019
1 parent 750a0eb commit e03a8b4
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 6 deletions.
30 changes: 29 additions & 1 deletion kobo/shortcuts.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,8 @@ def run(self):

output = "" if universal_newlines else b""
sentinel = "" if universal_newlines else b""
leftover = None
exception = None
while True:
if buffer_size == -1:
lines = proc.stdout.readline()
Expand All @@ -336,16 +338,42 @@ def run(self):

if lines == sentinel:
break

if leftover:
lines = leftover + lines
leftover = None

if stdout:
if not universal_newlines:
sys.stdout.write(lines.decode('utf-8'))
try:
sys.stdout.write(lines.decode('utf-8'))
except UnicodeDecodeError as exc:
if exc.reason != "unexpected end of data":
# This error was not caused by us. If there is an
# incomplete sequence in the middle of the string,
# we would get "invalid continuation byte".
raise
# We split the chunk in the middle of a multibyte
# sequence. Print text until this character, and save
# the rest for later. It will be prepended to the next
# chunk. If there is no next chunk, we will re-raise
# the error.
exception = exc
leftover = lines[exc.start:]
lines = lines[:exc.start]
sys.stdout.write(lines.decode('utf-8'))
else:
sys.stdout.write(lines)
if logfile:
log.write(lines)
if return_stdout:
output += lines
proc.wait()
if leftover:
# There is some data left over. That means there was an unfinished
# multibyte sequence not caused by our splitting. Let's raise the
# stored exception to report it.
raise exception

finally:
if logfile:
Expand Down
19 changes: 14 additions & 5 deletions tests/test_shortcuts.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
import tempfile
from six.moves import StringIO

import pytest

from kobo.shortcuts import force_list, force_tuple, allof, anyof, noneof, oneof, is_empty, iter_chunks, save_to_file, read_from_file, run, read_checksum_file, compute_file_checksums, makedirs, split_path, relative_path
from six.moves import range

Expand Down Expand Up @@ -201,12 +199,23 @@ def test_run_show_cmd_logfile_stdout(self, mock_out):
self.assertEqual(mock_out.getvalue(),
'COMMAND: echo foo\n-----------------\nfoo\n')

@pytest.mark.xfail(reason="Not fixed yet (#119)", strict=True)
def test_run_split_in_middle_of_utf8_sequence(self):
logfile = os.path.join(self.tmp_dir, 'output.log')
cmd = "printf ' ' && bash -c \"printf 'č%.0s' {1..10000}\""
ret, out = run(cmd, show_cmd=True, logfile=logfile, stdout=True)
ret, out = run(cmd, stdout=True)
self.assertEqual(ret, 0)
self.assertEqual(out, b" " + b"\xc4\x8d" * 10000)

def test_run_chunk_ends_with_incomplete_char(self):
    # The command's output stops right after 0xC4, the first byte of a
    # two-byte UTF-8 sequence, and no further data follows to complete it.
    # Echoing that output to stdout must therefore fail to decode.
    command = "bash -c \"printf 'a b \\xc4'\""
    self.assertRaises(
        UnicodeDecodeError,
        run,
        command,
        stdout=True
    )

def test_run_chunk_with_incomplete_char_in_middle(self):
    # 0xC4 opens a two-byte UTF-8 sequence but is followed by a space
    # instead of a continuation byte, so the broken sequence sits in the
    # middle of the output rather than at its end. Decoding must fail.
    command = "bash -c \"printf 'a \\xc4 b'\""
    self.assertRaises(
        UnicodeDecodeError,
        run,
        command,
        stdout=True
    )

def test_run_other_unicode_decode_error(self):
    # 0x80 cannot be the first byte of a UTF-8 sequence, so this output is
    # invalid for a reason unrelated to a truncated character. The decode
    # error is expected to reach the caller.
    command = "bash -c \"printf 'a \\x80 b'\""
    self.assertRaises(
        UnicodeDecodeError,
        run,
        command,
        stdout=True
    )

@mock.patch('sys.stdout', new_callable=StringIO)
def test_run_univ_nl_logfile_stdout(self, mock_out):
Expand Down

0 comments on commit e03a8b4

Please sign in to comment.