diff --git a/src/mog_commons/__init__.py b/src/mog_commons/__init__.py
index 8ce9b36..7525d19 100644
--- a/src/mog_commons/__init__.py
+++ b/src/mog_commons/__init__.py
@@ -1 +1 @@
-__version__ = '0.1.3'
+__version__ = '0.1.4'
diff --git a/src/mog_commons/collection.py b/src/mog_commons/collection.py
index 21a8f62..621e572 100644
--- a/src/mog_commons/collection.py
+++ b/src/mog_commons/collection.py
@@ -2,6 +2,8 @@
 
 import six
 
+__all__ = ['get_single_item', 'get_single_key', 'get_single_value', 'distinct']
+
 
 def get_single_item(d):
     """Get an item from a dict which contains just one item."""
@@ -19,3 +21,16 @@ def get_single_value(d):
     """Get a value from a dict which contains just one item."""
     assert len(d) == 1, 'Single-item dict must have just one item, not %d.' % len(d)
     return next(six.itervalues(d))
+
+
+def distinct(xs):
+    """
+    Get the list of distinct values, preserving the order of first occurrence.
+
+    :param xs: iterable of hashable values
+    :return: list: values of xs with later duplicates removed
+    """
+    # don't use collections.OrderedDict because we do support Python 2.6
+    seen = set()
+    # set.add() returns None, so 'not seen.add(x)' is always true and just records x as seen
+    return [x for x in xs if x not in seen and not seen.add(x)]
diff --git a/src/mog_commons/string.py b/src/mog_commons/string.py
index 0e6c894..321c6a2 100644
--- a/src/mog_commons/string.py
+++ b/src/mog_commons/string.py
@@ -3,6 +3,21 @@
 from unicodedata import east_asian_width
 import six
 
+from mog_commons.collection import distinct
+
+__all__ = [
+    'is_unicode',
+    'is_strlike',
+    'unicode_width',
+    'to_unicode',
+    'to_str',
+    'to_bytes',
+    'edge_just',
+    'unicode_right',
+    'unicode_left',
+    'unicode_decode',
+]
+
 __unicode_width_mapping = {'F': 2, 'H': 1, 'W': 2, 'Na': 1, 'A': 2, 'N': 1}
 
 
@@ -104,3 +119,25 @@ def unicode_right(s, width):
             break
         i -= 1
     return s[i:]
+
+
+def unicode_decode(data, encoding_list):
+    """
+    Decode string data with one or more encodings, trying sequentially
+    :param data: bytes: encoded string data
+    :param encoding_list: string or iterable of strings (list, tuple, ...): encoding names
+    :return: string: decoded string
+    :raise UnicodeDecodeError: when data cannot be decoded even with the last encoding
+    """
+    assert encoding_list, 'encodings must not be empty.'
+
+    # wrap only a bare encoding name; lists, tuples and other iterables are used as they are
+    xs = distinct([encoding_list] if isinstance(encoding_list, six.string_types) else encoding_list)
+    # the last encoding is tried outside the loop so that its UnicodeDecodeError reaches the caller
+    init, last = xs[:-1], xs[-1]
+    for encoding in init:
+        try:
+            return data.decode(encoding)
+        except UnicodeDecodeError:
+            pass
+    return data.decode(last)
diff --git a/src/mog_commons/unittest.py b/src/mog_commons/unittest.py
index 27445ba..06bc135 100644
--- a/src/mog_commons/unittest.py
+++ b/src/mog_commons/unittest.py
@@ -19,6 +19,7 @@ class StringBuffer(object):
 
     We don't use StringIO because there are many differences between PY2 and PY3.
     """
+
     def __init__(self, init_buffer=None):
         self._buffer = init_buffer or b''
 
@@ -37,9 +38,18 @@ def getvalue(self, encoding='utf-8', errors='strict'):
 
 class TestCase(base_unittest.TestCase):
     def assertRaisesRegexp(self, expected_exception, expected_regexp, callable_obj=None, *args, **kwargs):
-        """Accept difference of the function name between PY2 and PY3."""
-        f = base_unittest.TestCase.assertRaisesRegex if six.PY3 else base_unittest.TestCase.assertRaisesRegexp
-        f(self, expected_exception, expected_regexp, callable_obj, *args, **kwargs)
+        """
+        Accept difference of the function name between PY2 and PY3.
+
+        We don't use built-in assertRaisesRegexp because it is unicode-unsafe.
+        callable_obj is required; unlike the built-in, this cannot be used as a context manager.
+        """
+        with self.assertRaises(expected_exception) as cm:
+            callable_obj(*args, **kwargs)
+        if six.PY2:
+            self.assertRegexpMatches(str(cm.exception), expected_regexp)
+        else:
+            self.assertRegex(str(cm.exception), expected_regexp)
 
     def assertOutput(self, expected_stdout, expected_stderr, function, encoding='utf-8'):
         with self.withOutput() as (out, err):
diff --git a/tests/mog_commons/test_collection.py b/tests/mog_commons/test_collection.py
index 2bd136e..da6fda2 100644
--- a/tests/mog_commons/test_collection.py
+++ b/tests/mog_commons/test_collection.py
@@ -1,6 +1,6 @@
 from __future__ import division, print_function, absolute_import, unicode_literals
 
-from mog_commons.collection import get_single_item, get_single_key, get_single_value
+from mog_commons.collection import get_single_item, get_single_key, get_single_value, distinct
 from mog_commons import unittest
 
 
@@ -31,3 +31,12 @@ def test_get_single_value_error(self):
                                 {})
         self.assertRaisesRegexp(AssertionError, 'Single-item dict must have just one item, not 2.',
                                 get_single_value, {'x': 123, 'y': 45})
+
+    def test_distinct(self):
+        self.assertEqual(distinct([]), [])
+        self.assertEqual(distinct([1]), [1])
+        self.assertEqual(distinct([1] * 100), [1])
+        self.assertEqual(distinct([1, 2, 3, 4, 5]), [1, 2, 3, 4, 5])
+        self.assertEqual(distinct([1, 2, 1, 2, 1]), [1, 2])
+        self.assertEqual(distinct([2, 1, 2, 1, 1]), [2, 1])
+        self.assertEqual(distinct('mog-commons-python'), ['m', 'o', 'g', '-', 'c', 'n', 's', 'p', 'y', 't', 'h'])
diff --git a/tests/mog_commons/test_string.py b/tests/mog_commons/test_string.py
index 2f00173..399abc8 100644
--- a/tests/mog_commons/test_string.py
+++ b/tests/mog_commons/test_string.py
@@ -92,3 +92,20 @@ def test_unicode_right(self):
         self.assertEqual(string.unicode_right('あいうえお', 11), 'あいうえお')
         self.assertEqual(string.unicode_right('あxいxうxえxお', 4), 'xお')
         self.assertEqual(string.unicode_right('あxいxうxえxお', 5), 'えxお')
+
+    def test_unicode_decode(self):
+        self.assertRaisesRegexp(AssertionError, 'encodings must not be empty.', string.unicode_decode, 'abc', [])
+        self.assertEqual(string.unicode_decode(b'abc', 'ascii'), 'abc')
+        self.assertEqual(string.unicode_decode(b'abc', ['ascii']), 'abc')
+        self.assertRaisesRegexp(
+            UnicodeDecodeError, "'ascii' codec can't decode",
+            string.unicode_decode, 'あいうえお'.encode('utf-8'), 'ascii')
+        self.assertEqual(string.unicode_decode('あいうえお'.encode('utf-8'), ['ascii', 'sjis', 'utf-8']), 'あいうえお')
+        self.assertEqual(string.unicode_decode('あいうえお'.encode('utf-8'), ['ascii', 'utf-8', 'sjis']), 'あいうえお')
+        self.assertEqual(string.unicode_decode('あいうえお'.encode('utf-8'), ['utf-8', 'ascii', 'sjis']), 'あいうえお')
+        self.assertEqual(string.unicode_decode('あいうえお'.encode('utf-8'), ['utf-8', 'utf-8', 'utf-8']), 'あいうえお')
+        self.assertEqual(string.unicode_decode('あいうえお'.encode('sjis'), ['ascii', 'utf-8', 'sjis']), 'あいうえお')
+        self.assertEqual(string.unicode_decode('あいうえお'.encode('sjis'), ('ascii', 'utf-8', 'sjis')), 'あいうえお')
+        self.assertRaisesRegexp(
+            UnicodeDecodeError, "'shift_jis' codec can't decode",
+            string.unicode_decode, 'あいうえお'.encode('utf-8'), ['ascii', 'sjis'])