From cb4db7ccb62af45193f693ade582ef539a3421ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mi=C5=A1o=20Belica?= Date: Tue, 27 May 2014 22:22:45 +0200 Subject: [PATCH] Respect parameter "errors" for default encoding (closes #9) --- CHANGELOG.rst | 4 ++++ justext/__init__.py | 2 +- justext/core.py | 2 +- setup.py | 2 +- tests/test_html_encoding.py | 11 ++++++++--- 5 files changed, 15 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d3996a5..ad02ed6 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,10 @@ Changelog for jusText ------------------ - *INCOMPATIBLE CHANGE:* Stop words are case insensitive. +2.1.1 (2014-05-27) +------------------ +- *BUG FIX:* Function ``decode_html`` now respects parameter ``errors`` when falling to ``default_encoding`` `#9 `_. + 2.1.0 (2014-01-25) ------------------ - *FEATURE:* Added XPath selector to the paragrahs. XPath selector is also available in detailed output as ``xpath`` attribute of ``

`` tag `#5 `_. diff --git a/justext/__init__.py b/justext/__init__.py index 88043a3..15b2b6b 100644 --- a/justext/__init__.py +++ b/justext/__init__.py @@ -11,4 +11,4 @@ from .core import justext, get_stoplists, get_stoplist -__version__ = "2.1.0" +__version__ = "2.1.1" diff --git a/justext/core.py b/justext/core.py index 639967c..bc34691 100644 --- a/justext/core.py +++ b/justext/core.py @@ -94,7 +94,7 @@ def decode_html(html, default_encoding=DEFAULT_ENCODING, encoding=None, errors=D except UnicodeDecodeError: # try lucky with default encoding try: - return html.decode(default_encoding) + return html.decode(default_encoding, errors) except UnicodeDecodeError as e: raise JustextError("Unable to decode the HTML to Unicode: " + unicode(e)) diff --git a/setup.py b/setup.py index 46ab482..ef92629 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ setup( name="jusText", - version="2.1.0", + version="2.1.1", description="Heuristic based boilerplate removal tool", long_description=long_description, author="Jan Pomikálek", diff --git a/tests/test_html_encoding.py b/tests/test_html_encoding.py index d5cfe36..dcd9427 100644 --- a/tests/test_html_encoding.py +++ b/tests/test_html_encoding.py @@ -98,9 +98,14 @@ def test_meta_detection_charset_outside_3(self): self.assert_strings_equal(html, decoded_html) - def test_unknown_encoding(self): + def test_unknown_encoding_in_strict_mode(self): html = 'ľščťžäňôě' - tools.assert_raises(JustextError, decode_html, html.encode("iso-8859-2")) + tools.assert_raises(JustextError, decode_html, html.encode("iso-8859-2"), errors='strict') + + def test_unknown_encoding_with_default_error_handler(self): + html = 'ľščťžäňôě' + decoded = decode_html(html.encode("iso-8859-2"), default_encoding="iso-8859-2") + self.assertEqual(decoded, html) def test_default_encoding(self): html = 'ľščťžäňôě' @@ -123,4 +128,4 @@ def test_given_wrong_encoding(self): def test_fake_encoding_in_meta(self): html = ' ľščťžäňôě' - tools.assert_raises(JustextError, decode_html, html.encode("iso-8859-2")) + tools.assert_raises(JustextError, decode_html, html.encode("iso-8859-2"), errors='strict')