Fix decoding error when a chunk boundary splits a multibyte character

When a chunk ends with an incomplete multibyte sequence, process the data only up to the start of that sequence. When the next chunk is read, prepend the leftover bytes to it. This should complete the sequence, and processing should then continue normally. Fixes: release-engineering#119
lubomir · Oct 2, 2019 · e03a8b4 · e03a8b4
1 parent 750a0eb
commit e03a8b4
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 6 deletions.
diff --git a/kobo/shortcuts.py b/kobo/shortcuts.py
@@ -321,6 +321,8 @@ def run(self):
 
         output = "" if universal_newlines else b""
         sentinel = "" if universal_newlines else b""
+        leftover = None
+        exception = None
         while True:
             if buffer_size == -1:
                 lines = proc.stdout.readline()
@@ -336,16 +338,42 @@ def run(self):
 
             if lines == sentinel:
                 break
+
+            if leftover:
+                lines = leftover + lines
+                leftover = None
+
             if stdout:
                 if not universal_newlines:
-                    sys.stdout.write(lines.decode('utf-8'))
+                    try:
+                        sys.stdout.write(lines.decode('utf-8'))
+                    except UnicodeDecodeError as exc:
+                        if exc.reason != "unexpected end of data":
+                            # This error was not caused by us. If there is an
+                            # incomplete sequence in the middle of the string,
+                            # we would get "invalid continuation byte".
+                            raise
+                        # We split the chunk in the middle of a multibyte
+                        # sequence. Print text until this character, and save
+                        # the rest for later. It will be prepended to the next
+                        # chunk. If there is no next chunk, we will re-raise
+                        # the error.
+                        exception = exc
+                        leftover = lines[exc.start:]
+                        lines = lines[:exc.start]
+                        sys.stdout.write(lines.decode('utf-8'))
                 else:
                     sys.stdout.write(lines)
             if logfile:
                 log.write(lines)
             if return_stdout:
                 output += lines
         proc.wait()
+        if leftover:
+            # There is some data left over. That means there was an unfinished
+            # multibyte sequence not caused by our splitting. Let's raise the
+            # stored exception to report it.
+            raise exception
 
     finally:
         if logfile:

diff --git a/tests/test_shortcuts.py b/tests/test_shortcuts.py
@@ -10,8 +10,6 @@
 import tempfile
 from six.moves import StringIO
 
-import pytest
-
 from kobo.shortcuts import force_list, force_tuple, allof, anyof, noneof, oneof, is_empty, iter_chunks, save_to_file, read_from_file, run, read_checksum_file, compute_file_checksums, makedirs, split_path, relative_path
 from six.moves import range
 
@@ -201,12 +199,23 @@ def test_run_show_cmd_logfile_stdout(self, mock_out):
         self.assertEqual(mock_out.getvalue(),
                          'COMMAND: echo foo\n-----------------\nfoo\n')
 
-    @pytest.mark.xfail(reason="Not fixed yet (#119)", strict=True)
     def test_run_split_in_middle_of_utf8_sequence(self):
-        logfile = os.path.join(self.tmp_dir, 'output.log')
         cmd = "printf ' ' && bash -c \"printf 'č%.0s' {1..10000}\""
-        ret, out = run(cmd, show_cmd=True, logfile=logfile, stdout=True)
+        ret, out = run(cmd, stdout=True)
         self.assertEqual(ret, 0)
+        self.assertEqual(out, b" " + b"\xc4\x8d" * 10000)
+
+    def test_run_chunk_ends_with_incomplete_char(self):
+        cmd = "bash -c \"printf 'a b \\xc4'\""
+        self.assertRaises(UnicodeDecodeError, run, cmd, stdout=True)
+
+    def test_run_chunk_with_incomplete_char_in_middle(self):
+        cmd = "bash -c \"printf 'a \\xc4 b'\""
+        self.assertRaises(UnicodeDecodeError, run, cmd, stdout=True)
+
+    def test_run_other_unicode_decode_error(self):
+        cmd = "bash -c \"printf 'a \\x80 b'\""
+        self.assertRaises(UnicodeDecodeError, run, cmd, stdout=True)
 
     @mock.patch('sys.stdout', new_callable=StringIO)
     def test_run_univ_nl_logfile_stdout(self, mock_out):