raise a parser error even in recovery mode when encountering undecodable input to avoid having to deal with mixed-encoding trees
lxml · May 24, 2014 · 46e9309 · 46e9309
1 parent fd4c255
commit 46e9309
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 4 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -14,13 +14,18 @@ Features added
 * ``xmlfile()`` accepts a new argument ``close=True`` to close file(-like)
   objects after writing to them.
 
+* Allow "bytearray" type for text input.
+
 Bugs fixed
 ----------
 
 Other changes
 -------------
 
-* Allow "bytearray" type for text input.
+* LP#400588: decoding errors have become hard errors even in recovery mode.
+  Previously, they could lead to an internal tree representation in a mixed
+  encoding state, which led to very late errors or even silently incorrect
+  behaviour during tree traversal or serialisation.
 
 * Requires Python 2.6, 2.7, 3.2 or later. No longer supports
   Python 2.4, 2.5 and 3.1, use lxml 3.3.x for those.

diff --git a/src/lxml/parser.pxi b/src/lxml/parser.pxi
@@ -642,9 +642,18 @@ cdef xmlDoc* _handleParseResult(_ParserContext context,
         c_ctxt.myDoc = NULL
 
     if result is not NULL:
-        if context._validator is not None and \
+        if (context._validator is not None and
-                not context._validator.isvalid():
+                not context._validator.isvalid()):
-            well_formed = 0 # actually not 'valid', but anyway ...
+            well_formed = 0  # actually not 'valid', but anyway ...
+        elif (not c_ctxt.wellFormed and
+                c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
+                [1 for error in context._error_log
+                 if error.type == ErrorTypes.ERR_INVALID_CHAR]):
+            # An encoding error occurred and libxml2 switched from UTF-8
+            # input to (undecoded) Latin-1, at some arbitrary point in the
+            # document.  Better raise an error than allowing for a broken
+            # tree with mixed encodings.
+            well_formed = 0
         elif recover or (c_ctxt.wellFormed and
                          c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
             well_formed = 1

diff --git a/src/lxml/tests/test_unicode.py b/src/lxml/tests/test_unicode.py
@@ -26,6 +26,7 @@
 uxml = _bytes("<test><title>test \\xc3\\xa1\\u3120</title><h1>page \\xc3\\xa1\\u3120 title</h1></test>"
               ).decode("unicode_escape")
 
+
 class UnicodeTestCase(HelperTestCase):
     def test_unicode_xml(self):
         tree = etree.XML('<p>%s</p>' % uni)
@@ -95,7 +96,20 @@ def test_unicode_parse_stringio(self):
 ##         self.assertEqual(unicode(etree.tostring(root, 'UTF-8'), 'UTF-8'),
 ##                           uxml)
 
+
+class EncodingsTestCase(HelperTestCase):
+    def test_illegal_utf8(self):
+        data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1')
+        self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data)
+
+    def test_illegal_utf8_recover(self):
+        data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1')
+        parser = etree.XMLParser(recover=True)
+        self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data, parser)
+
+
 def test_suite():
     suite = unittest.TestSuite()
     suite.addTests([unittest.makeSuite(UnicodeTestCase)])
+    suite.addTests([unittest.makeSuite(EncodingsTestCase)])
     return suite