numba · seibert · Feb 19, 2019 · Jan 14, 2019 · Jan 14, 2019 · Jan 14, 2019
diff --git a/.flake8 b/.flake8
@@ -35,7 +35,6 @@ exclude =
     numba/withcontexts.py
     numba/analysis.py
     numba/_version.py
-    numba/unicode.py
     numba/inline_closurecall.py
     numba/ir_utils.py
     numba/pylowering.py
@@ -230,7 +229,6 @@ exclude =
     numba/tests/enum_usecases.py
     numba/tests/test_func_lifetime.py
     numba/tests/test_typeinfer.py
-    numba/tests/test_unicode.py
     numba/tests/test_return_values.py
     numba/tests/test_parallel_backend.py
     numba/tests/test_nrt.py
@@ -297,8 +295,6 @@ exclude =
     numba/tests/test_indexing.py
     numba/tests/test_pycc.py
     numba/tests/annotation_usecases.py
-    numba/tests/test_unicode_names.py
-    numba/tests/test_unicode_literals.py
     numba/tests/test_extended_arg.py
     numba/tests/test_alignment.py
     numba/tests/test_multi3.py

diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst
@@ -171,15 +171,21 @@ The following functions, attributes and methods are currently supported:
 * ``.startswith()``
 * ``.endswith()``
 * ``.find()``
+* ``.split()``
+* ``.join()``
 
 Additional operations as well as support for Python 2 strings / Python 3 bytes
 will be added in a future version of Numba.  Python 2 Unicode objects will
 likely never be supported.
 
 .. warning::
-    The performance of the substring search operations (``in``,
-    ``.contains()`` and ``find()``) is poor in version 0.41 and will be improved in
-    version 0.42.
+    The performance of some operations is known to be slower than the CPython
+    implementation. These include substring search (``in``, ``.contains()``
+    and ``.find()``) and string creation (like ``.split()``).  Improving the
+    string performance is an ongoing task, but the speed of CPython is
+    unlikely to be surpassed for basic string operations in isolation.
+    Numba is most successfully used for larger algorithms that happen to
+    involve strings, where basic string operations are not the bottleneck.
 
 
 tuple

diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py
@@ -11,6 +11,7 @@
 from numba import njit
 import numba.unittest_support as unittest
 from .support import (TestCase, no_pyobj_flags, MemoryLeakMixin)
+from numba.errors import TypingError
 
 _py34_or_later = sys.version_info[:2] >= (3, 4)
 
@@ -38,10 +39,12 @@ def getitem_usecase(x, i):
 def concat_usecase(x, y):
     return x + y
 
+
 def inplace_concat_usecase(x, y):
     x += y
     return x
 
+
 def in_usecase(x, y):
     return x in y
 
@@ -74,6 +77,33 @@ def endswith_usecase(x, y):
     return x.endswith(y)
 
 
+def split_usecase(x, y):
+    return x.split(y)
+
+
+def split_with_maxsplit_usecase(x, y, maxsplit):
+    return x.split(y, maxsplit)
+
+
+def split_with_maxsplit_kwarg_usecase(x, y, maxsplit):
+    return x.split(y, maxsplit=maxsplit)
+
+
+def split_whitespace_usecase(x):
+    return x.split()
+
+
+def join_usecase(x, y):
+    return x.join(y)
+
+
+def join_empty_usecase(x):
+    # hack to make empty typed list
+    l = ['']
+    l.pop()
+    return x.join(l)
+
+
 class BaseTest(MemoryLeakMixin, TestCase):
     def setUp(self):
         super(BaseTest, self).setUp()
@@ -138,21 +168,21 @@ def _check_ordering_op(self, usecase):
                 pyfunc(a, a),
                 cfunc(a, a),
                 '%s: "%s", "%s"' % (usecase.__name__, a, a),
-                )
+            )
 
         # Check comparison to adjacent
         for a, b in permutations(UNICODE_ORDERING_EXAMPLES, r=2):
             self.assertEqual(
                 pyfunc(a, b),
                 cfunc(a, b),
                 '%s: "%s", "%s"' % (usecase.__name__, a, b),
-                )
+            )
             # and reversed
             self.assertEqual(
                 pyfunc(b, a),
                 cfunc(b, a),
                 '%s: "%s", "%s"' % (usecase.__name__, b, a),
-                )
+            )
 
     def test_lt(self, flags=no_pyobj_flags):
         self._check_ordering_op(lt_usecase)
@@ -294,6 +324,161 @@ def test_concat(self, flags=no_pyobj_flags):
                                  cfunc(a, b),
                                  "'%s' + '%s'?" % (a, b))
 
+    def test_split_exception_empty_sep(self):
+        self.disable_leak_check()
+
+        pyfunc = split_usecase
+        cfunc = njit(pyfunc)
+
+        # Handle empty separator exception
+        for func in [pyfunc, cfunc]:
+            with self.assertRaises(ValueError) as raises:
+                    func('a', '')
+            self.assertIn('empty separator', str(raises.exception))
+
+    def test_split_exception_noninteger_maxsplit(self):
+        pyfunc = split_with_maxsplit_usecase
+        cfunc = njit(pyfunc)
+
+        # Handle non-integer maxsplit exception
+        for sep in [' ', None]:
+            with self.assertRaises(TypingError) as raises:
+                    cfunc('a', sep, 2.4)
+            self.assertIn('float64', str(raises.exception),
+                          'non-integer maxsplit with sep = %s' % sep)
+
+    def test_split(self):
+        pyfunc = split_usecase
+        cfunc = njit(pyfunc)
+
+        CASES = [
+            (' a ', None),
+            ('', '⚡'),
+            ('abcabc', '⚡'),
+            ('🐍⚡', '⚡'),
+            ('🐍⚡🐍', '⚡'),
+            ('abababa', 'a'),
+            ('abababa', 'b'),
+            ('abababa', 'c'),
+            ('abababa', 'ab'),
+            ('abababa', 'aba'),
+        ]
+
+        for test_str, splitter in CASES:
+            self.assertEqual(pyfunc(test_str, splitter),
+                             cfunc(test_str, splitter),
+                             "'%s'.split('%s')?" % (test_str, splitter))
+
+    def test_split_with_maxsplit(self):
+        CASES = [
+            (' a ', None, 1),
+            ('', '⚡', 1),
+            ('abcabc', '⚡', 1),
+            ('🐍⚡', '⚡', 1),
+            ('🐍⚡🐍', '⚡', 1),
+            ('abababa', 'a', 2),
+            ('abababa', 'b', 1),
+            ('abababa', 'c', 2),
+            ('abababa', 'ab', 1),
+            ('abababa', 'aba', 5),
+        ]
+
+        for pyfunc, fmt_str in [(split_with_maxsplit_usecase, "'%s'.split('%s', %d)?"),
+                                (split_with_maxsplit_kwarg_usecase, "'%s'.split('%s', maxsplit=%d)?")]:
+
+            cfunc = njit(pyfunc)
+            for test_str, splitter, maxsplit in CASES:
+                self.assertEqual(pyfunc(test_str, splitter, maxsplit),
+                                 cfunc(test_str, splitter, maxsplit),
+                                 fmt_str % (test_str, splitter, maxsplit))
+
+    def test_split_whitespace(self):
+        # explicit sep=None cases covered in test_split and test_split_with_maxsplit
+        pyfunc = split_whitespace_usecase
+        cfunc = njit(pyfunc)
+
+        # list copied from https://github.com/python/cpython/blob/master/Objects/unicodetype_db.h
+        all_whitespace = ''.join(map(chr, [
+            0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x001C, 0x001D, 0x001E, 0x001F, 0x0020,
+            0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006,
+            0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000
+        ]))
+
+        CASES = [
+            '',
+            'abcabc',
+            '🐍 ⚡',
+            '🐍 ⚡ 🐍',
+            '🐍   ⚡ 🐍  ',
+            '  🐍   ⚡ 🐍',
+            ' 🐍' + all_whitespace + '⚡ 🐍  ',
+        ]
+        for test_str in CASES:
+            self.assertEqual(pyfunc(test_str),
+                             cfunc(test_str),
+                             "'%s'.split()?" % (test_str,))
+
+    def test_join_empty(self):
+        # Can't pass empty list to nopython mode, so we have to make a
+        # separate test case
+        pyfunc = join_empty_usecase
+        cfunc = njit(pyfunc)
+
+        CASES = [
+            '',
+            '🐍🐍🐍',
+        ]
+
+        for sep in CASES:
+            self.assertEqual(pyfunc(sep),
+                             cfunc(sep),
+                             "'%s'.join([])?" % (sep,))
+
+    def test_join_non_string_exception(self):
+        # Verify that join of list of integers raises typing exception
+        pyfunc = join_usecase
+        cfunc = njit(pyfunc)
+
+        # Joining a list of integers must be rejected at typing time
+        with self.assertRaises(TypingError) as raises:
+            cfunc('', [1,2,3])
+        # This error message is obscure, but indicates the error was trapped in typing of str.join()
+        # Feel free to change this as we update error messages.
+        exc_message = str(raises.exception)
+        self.assertIn("Invalid use of BoundFunction", exc_message)
+        self.assertIn("(reflected list(int", exc_message)  # could be int32 or int64
+
+    def test_join(self):
+        pyfunc = join_usecase
+        cfunc = njit(pyfunc)
+
+        CASES = [
+            ('', ['', '', '']),
+            ('a', ['', '', '']),
+            ('', ['a', 'bbbb', 'c']),
+            ('🐍🐍🐍', ['⚡⚡'] * 5),
+        ]
+
+        for sep, parts in CASES:
+            self.assertEqual(pyfunc(sep, parts),
+                             cfunc(sep, parts),
+                             "'%s'.join('%s')?" % (sep, parts))
+
+    def test_join_interleave_str(self):
+        # can pass a string as the parts iterable
+        pyfunc = join_usecase
+        cfunc = njit(pyfunc)
+
+        CASES = [
+            ('abc', '123'),
+            ('🐍🐍🐍', '⚡⚡'),
+        ]
+
+        for sep, parts in CASES:
+            self.assertEqual(pyfunc(sep, parts),
+                             cfunc(sep, parts),
+                             "'%s'.join('%s')?" % (sep, parts))
+
     def test_inplace_concat(self, flags=no_pyobj_flags):
         pyfunc = inplace_concat_usecase
         cfunc = njit(pyfunc)
@@ -361,7 +546,7 @@ def pyfunc(option, x, y):
             for cmpop in ['==', '!=', '<', '>', '<=', '>=', '']:
                 args = [cmpop, x, y]
                 self.assertEqual(pyfunc(*args), cfunc(*args),
-                                msg='failed on {}'.format(args))
+                                 msg='failed on {}'.format(args))
 
     def test_literal_concat(self):
         def pyfunc(x):

diff --git a/numba/tests/test_unicode_literals.py b/numba/tests/test_unicode_literals.py
@@ -2,10 +2,8 @@
 
 import sys
 
-import numpy as np
-
 import numba.unittest_support as unittest
-from numba import utils, jit
+from numba import jit
 from .support import TestCase
 
 

diff --git a/numba/tests/test_unicode_names.py b/numba/tests/test_unicode_names.py
@@ -64,4 +64,3 @@ def test_normalize_ir_text_py3(self):
 
 if __name__ == '__main__':
     unittest.main()
-
diff --git a/numba/tests/test_unsafe_intrinsics.py b/numba/tests/test_unsafe_intrinsics.py
@@ -8,6 +8,7 @@
 from numba import njit
 from numba.unsafe.tuple import tuple_setitem
 from numba.unsafe.ndarray import to_fixed_tuple, empty_inferred
+from numba.unsafe.bytes import memcpy_region
 from numba.errors import TypingError
 
 
@@ -109,3 +110,23 @@ def func():
         got = func()
         expect = np.asarray([3.1] * 10)
         np.testing.assert_array_equal(got, expect)
+
+
+class TestBytesIntrinsic(TestCase):
+    """Tests for numba.unsafe.bytes
+    """
+    def test_memcpy_region(self):
+        @njit
+        def foo(dst, dst_index, src, src_index, nbytes):
+            # last arg tells memcpy_region to assume 1-byte alignment
+            memcpy_region(dst.ctypes.data, dst_index,
+                          src.ctypes.data, src_index, nbytes, 1)
+
+        d = np.zeros(10, dtype=np.int8)
+        s = np.arange(10, dtype=np.int8)
+
+        # copy s[1:6] to d[4:9]
+        foo(d, 4, s, 1, 5)
+
+        expected = [0, 0, 0, 0, 1, 2, 3, 4, 5, 0]
+        np.testing.assert_array_equal(d, expected)