Permalink
Browse files

raise a parser error even in recovery mode when encountering undecodable input to avoid having to deal with mixed-encoding trees
  • Loading branch information...
scoder committed May 24, 2014
1 parent fd4c255 commit 46e9309a585da7b3de7ecab137e7427cf2dcebf4
Showing with 32 additions and 4 deletions.
  1. +6 −1 CHANGES.txt
  2. +12 −3 src/lxml/parser.pxi
  3. +14 −0 src/lxml/tests/test_unicode.py
View
@@ -14,13 +14,18 @@ Features added
* ``xmlfile()`` accepts a new argument ``close=True`` to close file(-like)
objects after writing to them.
+* Allow "bytearray" type for text input.
+
Bugs fixed
----------
Other changes
-------------
-* Allow "bytearray" type for text input.
+* LP#400588: decoding errors have become hard errors even in recovery mode.
+ Previously, they could lead to an internal tree representation in a mixed
+ encoding state, which led to very late errors or even silently incorrect
+ behaviour during tree traversal or serialisation.
* Requires Python 2.6, 2.7, 3.2 or later. No longer supports
Python 2.4, 2.5 and 3.1, use lxml 3.3.x for those.
View
@@ -642,9 +642,18 @@ cdef xmlDoc* _handleParseResult(_ParserContext context,
c_ctxt.myDoc = NULL
if result is not NULL:
- if context._validator is not None and \
- not context._validator.isvalid():
- well_formed = 0 # actually not 'valid', but anyway ...
+ if (context._validator is not None and
+ not context._validator.isvalid()):
+ well_formed = 0 # actually not 'valid', but anyway ...
+ elif (not c_ctxt.wellFormed and
+ c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
+ [1 for error in context._error_log
+ if error.type == ErrorTypes.ERR_INVALID_CHAR]):
+ # An encoding error occurred and libxml2 switched from UTF-8
+ # input to (undecoded) Latin-1, at some arbitrary point in the
+ # document. Better raise an error than allowing for a broken
+ # tree with mixed encodings.
+ well_formed = 0
elif recover or (c_ctxt.wellFormed and
c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
well_formed = 1
@@ -26,6 +26,7 @@
uxml = _bytes("<test><title>test \\xc3\\xa1\\u3120</title><h1>page \\xc3\\xa1\\u3120 title</h1></test>"
).decode("unicode_escape")
+
class UnicodeTestCase(HelperTestCase):
def test_unicode_xml(self):
tree = etree.XML('<p>%s</p>' % uni)
@@ -95,7 +96,20 @@ def test_unicode_parse_stringio(self):
## self.assertEqual(unicode(etree.tostring(root, 'UTF-8'), 'UTF-8'),
## uxml)
+
+class EncodingsTestCase(HelperTestCase):
+ def test_illegal_utf8(self):
+ data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1')
+ self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data)
+
+ def test_illegal_utf8_recover(self):
+ data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1')
+ parser = etree.XMLParser(recover=True)
+ self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data, parser)
+
+
def test_suite():
suite = unittest.TestSuite()
suite.addTests([unittest.makeSuite(UnicodeTestCase)])
+ suite.addTests([unittest.makeSuite(EncodingsTestCase)])
return suite

0 comments on commit 46e9309

Please sign in to comment.