From cb4db7ccb62af45193f693ade582ef539a3421ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mi=C5=A1o=20Belica?= <miso.belica@gmail.com>
Date: Tue, 27 May 2014 22:22:45 +0200
Subject: [PATCH] Respect parameter "errors" for default encoding (closes #9)

---
 CHANGELOG.rst               |  4 ++++
 justext/__init__.py         |  2 +-
 justext/core.py             |  2 +-
 setup.py                    |  2 +-
 tests/test_html_encoding.py | 11 ++++++++---
 5 files changed, 15 insertions(+), 6 deletions(-)
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index d3996a5..ad02ed6 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,6 +7,10 @@ Changelog for jusText
 ------------------
 - *INCOMPATIBLE CHANGE:* Stop words are case insensitive.
 
+2.1.1 (2014-05-27)
+------------------
+- *BUG FIX:* Function ``decode_html`` now respects parameter ``errors`` when falling back to ``default_encoding`` `#9 <https://github.com/miso-belica/jusText/issues/9>`_.
+
 2.1.0 (2014-01-25)
 ------------------
 - *FEATURE:* Added XPath selector to the paragrahs. XPath selector is also available in detailed output as ``xpath`` attribute of ``<p>`` tag `#5 <https://github.com/miso-belica/jusText/pull/5>`_.
diff --git a/justext/__init__.py b/justext/__init__.py
index 88043a3..15b2b6b 100644
--- a/justext/__init__.py
+++ b/justext/__init__.py
@@ -11,4 +11,4 @@
 from .core import justext, get_stoplists, get_stoplist
 
 
-__version__ = "2.1.0"
+__version__ = "2.1.1"
diff --git a/justext/core.py b/justext/core.py
index 639967c..bc34691 100644
--- a/justext/core.py
+++ b/justext/core.py
@@ -94,7 +94,7 @@ def decode_html(html, default_encoding=DEFAULT_ENCODING, encoding=None, errors=D
     except UnicodeDecodeError:
         # try lucky with default encoding
         try:
-            return html.decode(default_encoding)
+            return html.decode(default_encoding, errors)
         except UnicodeDecodeError as e:
             raise JustextError("Unable to decode the HTML to Unicode: " + unicode(e))
 
diff --git a/setup.py b/setup.py
index 46ab482..ef92629 100644
--- a/setup.py
+++ b/setup.py
@@ -21,7 +21,7 @@
 
 setup(
     name="jusText",
-    version="2.1.0",
+    version="2.1.1",
     description="Heuristic based boilerplate removal tool",
     long_description=long_description,
     author="Jan Pomikálek",
diff --git a/tests/test_html_encoding.py b/tests/test_html_encoding.py
index d5cfe36..dcd9427 100644
--- a/tests/test_html_encoding.py
+++ b/tests/test_html_encoding.py
@@ -98,9 +98,14 @@ def test_meta_detection_charset_outside_3(self):
 
         self.assert_strings_equal(html, decoded_html)
 
-    def test_unknown_encoding(self):
+    def test_unknown_encoding_in_strict_mode(self):
         html = 'ľščťžäňôě'
-        tools.assert_raises(JustextError, decode_html, html.encode("iso-8859-2"))
+        tools.assert_raises(JustextError, decode_html, html.encode("iso-8859-2"), errors='strict')
+
+    def test_unknown_encoding_with_default_error_handler(self):
+        html = 'ľščťžäňôě'
+        decoded = decode_html(html.encode("iso-8859-2"), default_encoding="iso-8859-2")
+        self.assertEqual(decoded, html)
 
     def test_default_encoding(self):
         html = 'ľščťžäňôě'
@@ -123,4 +128,4 @@ def test_given_wrong_encoding(self):
     def test_fake_encoding_in_meta(self):
         html = '<meta charset="iso-fake-2"/> ľščťžäňôě'
 
-        tools.assert_raises(JustextError, decode_html, html.encode("iso-8859-2"))
+        tools.assert_raises(JustextError, decode_html, html.encode("iso-8859-2"), errors='strict')