Skip to content

Commit

Permalink
raise a parser error even in recovery mode when encountering undecoda…
Browse files Browse the repository at this point in the history
…ble input to avoid having to deal with mixed-encoding trees
  • Loading branch information
scoder committed May 24, 2014
1 parent fd4c255 commit 46e9309
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 4 deletions.
7 changes: 6 additions & 1 deletion CHANGES.txt
Expand Up @@ -14,13 +14,18 @@ Features added
* ``xmlfile()`` accepts a new argument ``close=True`` to close file(-like) * ``xmlfile()`` accepts a new argument ``close=True`` to close file(-like)
objects after writing to them. objects after writing to them.


* Allow "bytearray" type for text input.

Bugs fixed Bugs fixed
---------- ----------


Other changes Other changes
------------- -------------


* Allow "bytearray" type for text input. * LP#400588: decoding errors have become hard errors even in recovery mode.
Previously, they could lead to an internal tree representation in a mixed
encoding state, which led to very late errors or even silently incorrect
behaviour during tree traversal or serialisation.


* Requires Python 2.6, 2.7, 3.2 or later. No longer supports * Requires Python 2.6, 2.7, 3.2 or later. No longer supports
Python 2.4, 2.5 and 3.1, use lxml 3.3.x for those. Python 2.4, 2.5 and 3.1, use lxml 3.3.x for those.
Expand Down
15 changes: 12 additions & 3 deletions src/lxml/parser.pxi
Expand Up @@ -642,9 +642,18 @@ cdef xmlDoc* _handleParseResult(_ParserContext context,
c_ctxt.myDoc = NULL c_ctxt.myDoc = NULL


if result is not NULL: if result is not NULL:
if context._validator is not None and \ if (context._validator is not None and
not context._validator.isvalid(): not context._validator.isvalid()):
well_formed = 0 # actually not 'valid', but anyway ... well_formed = 0 # actually not 'valid', but anyway ...
elif (not c_ctxt.wellFormed and
c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
[1 for error in context._error_log
if error.type == ErrorTypes.ERR_INVALID_CHAR]):
# An encoding error occurred and libxml2 switched from UTF-8
# input to (undecoded) Latin-1, at some arbitrary point in the
# document. Better raise an error than allowing for a broken
# tree with mixed encodings.
well_formed = 0
elif recover or (c_ctxt.wellFormed and elif recover or (c_ctxt.wellFormed and
c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR): c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
well_formed = 1 well_formed = 1
Expand Down
14 changes: 14 additions & 0 deletions src/lxml/tests/test_unicode.py
Expand Up @@ -26,6 +26,7 @@
uxml = _bytes("<test><title>test \\xc3\\xa1\\u3120</title><h1>page \\xc3\\xa1\\u3120 title</h1></test>" uxml = _bytes("<test><title>test \\xc3\\xa1\\u3120</title><h1>page \\xc3\\xa1\\u3120 title</h1></test>"
).decode("unicode_escape") ).decode("unicode_escape")



class UnicodeTestCase(HelperTestCase): class UnicodeTestCase(HelperTestCase):
def test_unicode_xml(self): def test_unicode_xml(self):
tree = etree.XML('<p>%s</p>' % uni) tree = etree.XML('<p>%s</p>' % uni)
Expand Down Expand Up @@ -95,7 +96,20 @@ def test_unicode_parse_stringio(self):
## self.assertEqual(unicode(etree.tostring(root, 'UTF-8'), 'UTF-8'), ## self.assertEqual(unicode(etree.tostring(root, 'UTF-8'), 'UTF-8'),
## uxml) ## uxml)



class EncodingsTestCase(HelperTestCase):
def test_illegal_utf8(self):
data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1')
self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data)

def test_illegal_utf8_recover(self):
data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1')
parser = etree.XMLParser(recover=True)
self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data, parser)


def test_suite(): def test_suite():
suite = unittest.TestSuite() suite = unittest.TestSuite()
suite.addTests([unittest.makeSuite(UnicodeTestCase)]) suite.addTests([unittest.makeSuite(UnicodeTestCase)])
suite.addTests([unittest.makeSuite(EncodingsTestCase)])
return suite return suite

0 comments on commit 46e9309

Please sign in to comment.