Skip to content

Commit

Permalink
Respect parameter "errors" for default encoding (closes #9)
Browse files Browse the repository at this point in the history
  • Loading branch information
miso-belica committed May 27, 2014
1 parent 3dd3525 commit cb4db7c
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 6 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Expand Up @@ -7,6 +7,10 @@ Changelog for jusText
------------------
- *INCOMPATIBLE CHANGE:* Stop words are case insensitive.

2.1.1 (2014-05-27)
------------------
- *BUG FIX:* Function ``decode_html`` now respects the parameter ``errors`` when falling back to ``default_encoding`` `#9 <https://github.com/miso-belica/jusText/issues/9>`_.

2.1.0 (2014-01-25)
------------------
- *FEATURE:* Added XPath selector to the paragraphs. The XPath selector is also available in detailed output as the ``xpath`` attribute of the ``<p>`` tag `#5 <https://github.com/miso-belica/jusText/pull/5>`_.
Expand Down
2 changes: 1 addition & 1 deletion justext/__init__.py
Expand Up @@ -11,4 +11,4 @@
from .core import justext, get_stoplists, get_stoplist


__version__ = "2.1.0"
__version__ = "2.1.1"
2 changes: 1 addition & 1 deletion justext/core.py
Expand Up @@ -94,7 +94,7 @@ def decode_html(html, default_encoding=DEFAULT_ENCODING, encoding=None, errors=D
except UnicodeDecodeError:
# try our luck with the default encoding
try:
return html.decode(default_encoding)
return html.decode(default_encoding, errors)
except UnicodeDecodeError as e:
raise JustextError("Unable to decode the HTML to Unicode: " + unicode(e))

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -21,7 +21,7 @@

setup(
name="jusText",
version="2.1.0",
version="2.1.1",
description="Heuristic based boilerplate removal tool",
long_description=long_description,
author="Jan Pomikálek",
Expand Down
11 changes: 8 additions & 3 deletions tests/test_html_encoding.py
Expand Up @@ -98,9 +98,14 @@ def test_meta_detection_charset_outside_3(self):

self.assert_strings_equal(html, decoded_html)

def test_unknown_encoding(self):
def test_unknown_encoding_in_strict_mode(self):
html = 'ľščťžäňôě'
tools.assert_raises(JustextError, decode_html, html.encode("iso-8859-2"))
tools.assert_raises(JustextError, decode_html, html.encode("iso-8859-2"), errors='strict')

def test_unknown_encoding_with_default_error_handler(self):
html = 'ľščťžäňôě'
decoded = decode_html(html.encode("iso-8859-2"), default_encoding="iso-8859-2")
self.assertEqual(decoded, html)

def test_default_encoding(self):
html = 'ľščťžäňôě'
Expand All @@ -123,4 +128,4 @@ def test_given_wrong_encoding(self):
def test_fake_encoding_in_meta(self):
html = '<meta charset="iso-fake-2"/> ľščťžäňôě'

tools.assert_raises(JustextError, decode_html, html.encode("iso-8859-2"))
tools.assert_raises(JustextError, decode_html, html.encode("iso-8859-2"), errors='strict')

0 comments on commit cb4db7c

Please sign in to comment.