mogproject · mogproject · Oct 24, 2015 · Oct 24, 2015 · Oct 24, 2015 · Oct 24, 2015
diff --git a/src/mog_commons/__init__.py b/src/mog_commons/__init__.py
@@ -1 +1 @@
-__version__ = '0.1.3'
+__version__ = '0.1.4'
diff --git a/src/mog_commons/collection.py b/src/mog_commons/collection.py
@@ -2,6 +2,8 @@
 
 import six
 
+__all__ = ['get_single_item', 'get_single_key', 'get_single_value', 'distinct']
+
 
 def get_single_item(d):
     """Get an item from a dict which contains just one item."""
@@ -19,3 +21,10 @@ def get_single_value(d):
     """Get a value from a dict which contains just one item."""
     assert len(d) == 1, 'Single-item dict must have just one item, not %d.' % len(d)
     return next(six.itervalues(d))
+
+
+def distinct(xs):
+    """Return the distinct values of ``xs`` as a list, keeping first-occurrence order."""
+    # collections.OrderedDict is deliberately avoided: Python 2.6, which we support, lacks it
+    visited = set()
+    return [item for item in xs if not (item in visited or visited.add(item))]
diff --git a/src/mog_commons/string.py b/src/mog_commons/string.py
@@ -3,6 +3,21 @@
 from unicodedata import east_asian_width
 import six
 
+from mog_commons.collection import distinct
+
+__all__ = [
+    'is_unicode',
+    'is_strlike',
+    'unicode_width',
+    'to_unicode',
+    'to_str',
+    'to_bytes',
+    'edge_just',
+    'unicode_right',
+    'unicode_left',
+    'unicode_decode',
+]
+
 __unicode_width_mapping = {'F': 2, 'H': 1, 'W': 2, 'Na': 1, 'A': 2, 'N': 1}
 
 
@@ -104,3 +119,37 @@ def unicode_right(s, width):
             break
         i -= 1
     return s[i:]
+
+
+def unicode_decode(data, encoding_list):
+    """
+    Decode string data with one or more encodings, trying sequentially
+
+    Duplicated encoding names are tried only once. A UnicodeDecodeError raised by any encoding
+    but the last one is swallowed; whatever the last encoding raises is propagated to the caller.
+
+    :param data: bytes: encoded string data
+    :param encoding_list: string or iterable of strings (list, tuple, ...): encoding names
+    :return: string: decoded string
+    :raise AssertionError: when no encoding is given
+    :raise UnicodeDecodeError: when none of the encodings can decode the data
+    """
+    if isinstance(encoding_list, six.string_types):
+        # a bare encoding name; an empty name counts as "no encoding"
+        candidates = [encoding_list] if encoding_list else []
+    else:
+        # materialize so that tuples and one-shot iterators work as well as lists
+        candidates = list(encoding_list or [])
+
+    # raised explicitly rather than via an assert statement, which "python -O" strips
+    if not candidates:
+        raise AssertionError('encodings must not be empty.')
+
+    xs = distinct(candidates)
+    init, last = xs[:-1], xs[-1]
+    for encoding in init:
+        try:
+            return data.decode(encoding)
+        except UnicodeDecodeError:
+            pass  # expected for a non-matching encoding: try the next candidate
+    return data.decode(last)
diff --git a/src/mog_commons/unittest.py b/src/mog_commons/unittest.py
@@ -19,6 +19,7 @@ class StringBuffer(object):
 
     We don't use StringIO because there are many differences between PY2 and PY3.
     """
+
     def __init__(self, init_buffer=None):
         self._buffer = init_buffer or b''
 
@@ -37,9 +38,29 @@ def getvalue(self, encoding='utf-8', errors='strict'):
 
 class TestCase(base_unittest.TestCase):
     def assertRaisesRegexp(self, expected_exception, expected_regexp, callable_obj=None, *args, **kwargs):
-        """Accept difference of the function name between PY2 and PY3."""
-        f = base_unittest.TestCase.assertRaisesRegex if six.PY3 else base_unittest.TestCase.assertRaisesRegexp
-        f(self, expected_exception, expected_regexp, callable_obj, *args, **kwargs)
+        """
+        Accept difference of the function name between PY2 and PY3.
+
+        We don't use built-in assertRaisesRegexp because it is unicode-unsafe.
+
+        :param expected_exception: exception class which callable_obj must raise
+        :param expected_regexp: regular expression matched against the string form of the raised exception
+        :param callable_obj: callable to invoke with args and kwargs; when omitted, this falls back to
+                             the built-in context manager so that the "with" form keeps working
+        :return: context manager if callable_obj is None, otherwise None
+        """
+        if callable_obj is None:
+            # calling None below would raise TypeError, which escapes assertRaises, or is even
+            # mistaken for the expected error when expected_exception is TypeError
+            f = base_unittest.TestCase.assertRaisesRegex if six.PY3 else base_unittest.TestCase.assertRaisesRegexp
+            return f(self, expected_exception, expected_regexp)
+
+        with self.assertRaises(expected_exception) as cm:
+            callable_obj(*args, **kwargs)
+        if six.PY2:
+            self.assertRegexpMatches(str(cm.exception), expected_regexp)
+        else:
+            self.assertRegex(str(cm.exception), expected_regexp)
 
     def assertOutput(self, expected_stdout, expected_stderr, function, encoding='utf-8'):
         with self.withOutput() as (out, err):

diff --git a/tests/mog_commons/test_collection.py b/tests/mog_commons/test_collection.py
@@ -1,6 +1,6 @@
 from __future__ import division, print_function, absolute_import, unicode_literals
 
-from mog_commons.collection import get_single_item, get_single_key, get_single_value
+from mog_commons.collection import *
 from mog_commons import unittest
 
 
@@ -31,3 +31,17 @@ def test_get_single_value_error(self):
                                 {})
         self.assertRaisesRegexp(AssertionError, 'Single-item dict must have just one item, not 2.', get_single_value,
                                 {'x': 123, 'y': 45})
+
+    def test_distinct(self):
+        """distinct() keeps the first occurrence of each value, in the original order."""
+        cases = [
+            ([], []),
+            ([1], [1]),
+            ([1] * 100, [1]),
+            ([1, 2, 3, 4, 5], [1, 2, 3, 4, 5]),
+            ([1, 2, 1, 2, 1], [1, 2]),
+            ([2, 1, 2, 1, 1], [2, 1]),
+            ('mog-commons-python', ['m', 'o', 'g', '-', 'c', 'n', 's', 'p', 'y', 't', 'h']),
+        ]
+        for values, expected in cases:
+            self.assertEqual(distinct(values), expected)
diff --git a/tests/mog_commons/test_string.py b/tests/mog_commons/test_string.py
@@ -92,3 +92,35 @@ def test_unicode_right(self):
         self.assertEqual(string.unicode_right('あいうえお', 11), 'あいうえお')
         self.assertEqual(string.unicode_right('あxいxうxえxお', 4), 'xお')
         self.assertEqual(string.unicode_right('あxいxうxえxお', 5), 'えxお')
+
+    def test_unicode_decode(self):
+        """unicode_decode() tries the encodings in order and reports the failure of the last one."""
+        text = 'あいうえお'
+        utf8_data = text.encode('utf-8')
+        sjis_data = text.encode('sjis')
+
+        # no encoding at all
+        self.assertRaisesRegexp(AssertionError, 'encodings must not be empty.', string.unicode_decode, 'abc', [])
+
+        # one encoding, given either as a bare name or as a single-element list
+        for encodings in ('ascii', ['ascii']):
+            self.assertEqual(string.unicode_decode(b'abc', encodings), 'abc')
+        self.assertRaisesRegexp(
+            UnicodeDecodeError, "'ascii' codec can't decode",
+            string.unicode_decode, utf8_data, 'ascii')
+
+        # several candidates, possibly repeated
+        utf8_candidates = [
+            ['ascii', 'sjis', 'utf-8'],
+            ['ascii', 'utf-8', 'sjis'],
+            ['utf-8', 'ascii', 'sjis'],
+            ['utf-8', 'utf-8', 'utf-8'],
+        ]
+        for encodings in utf8_candidates:
+            self.assertEqual(string.unicode_decode(utf8_data, encodings), text)
+        self.assertEqual(string.unicode_decode(sjis_data, ['ascii', 'utf-8', 'sjis']), text)
+
+        # every candidate fails: the error comes from the last encoding in the list
+        self.assertRaisesRegexp(
+            UnicodeDecodeError, "'shift_jis' codec can't decode",
+            string.unicode_decode, utf8_data, ['ascii', 'sjis'])