Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for str.split and str.join #3678

Merged
merged 30 commits into from
Feb 19, 2019
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
31adc51
fix existing flake8 failures and remove flake8 exclusions for these f…
seibert Jan 14, 2019
81f2020
support for str.split
seibert Jan 14, 2019
e1f4313
str.join support
seibert Jan 14, 2019
dfc55a6
Special cast join for lists of strings for massive speedup.
seibert Jan 15, 2019
5ee14c9
Add fast path memcpy when string character widths match
seibert Jan 18, 2019
5e7dbf9
Speed up split by 20x
seibert Jan 21, 2019
f8b5973
add split, join to docs
seibert Jan 21, 2019
d753704
Disable NRT on functions that should not change refcount
seibert Jan 21, 2019
c927c97
Fast path for slicing with stride 1
seibert Jan 22, 2019
b266931
Clarify string performance caveats
seibert Jan 22, 2019
8dda70c
Merge branch 'master' into str_split_join
seibert Jan 23, 2019
fd1bbac
Fix flake8 fail caused by merging with master
seibert Jan 23, 2019
96f2219
Merge master
seibert Feb 4, 2019
70abb58
Add maxsplit support
seibert Feb 4, 2019
28cedc4
Fix flake8
seibert Feb 7, 2019
c0d31d3
Fix up some docs
seibert Feb 18, 2019
ff7e835
Add split on whitespace
seibert Feb 18, 2019
93e047f
merge master
seibert Feb 18, 2019
bc2b9ad
Only accept str.join on list<str>
seibert Feb 18, 2019
0d8d96e
Support join on standalone string
seibert Feb 18, 2019
802d9ed
check maxsplit as kwarg
seibert Feb 18, 2019
9f99521
Create memcpy_region intrinsic in numba.unsafe.bytes
seibert Feb 18, 2019
1f0b6b6
Fix flake8
seibert Feb 18, 2019
78c489f
Make exception message test more general to cover all platforms
seibert Feb 19, 2019
1a14c1d
respond to review comments
seibert Feb 19, 2019
592be35
Fix error message test
seibert Feb 19, 2019
feec704
raise typing error on non-integer maxsplit
seibert Feb 19, 2019
07f4691
missed testing non-integer maxsplit when sep is None
seibert Feb 19, 2019
b23b152
flake8
seibert Feb 19, 2019
ccb00de
fix merge conflict with master
seibert Feb 19, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 0 additions & 4 deletions .flake8
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ exclude =
numba/withcontexts.py
numba/analysis.py
numba/_version.py
numba/unicode.py
numba/inline_closurecall.py
numba/ir_utils.py
numba/pylowering.py
Expand Down Expand Up @@ -230,7 +229,6 @@ exclude =
numba/tests/enum_usecases.py
numba/tests/test_func_lifetime.py
numba/tests/test_typeinfer.py
numba/tests/test_unicode.py
numba/tests/test_return_values.py
numba/tests/test_parallel_backend.py
numba/tests/test_nrt.py
Expand Down Expand Up @@ -297,8 +295,6 @@ exclude =
numba/tests/test_indexing.py
numba/tests/test_pycc.py
numba/tests/annotation_usecases.py
numba/tests/test_unicode_names.py
numba/tests/test_unicode_literals.py
numba/tests/test_extended_arg.py
numba/tests/test_alignment.py
numba/tests/test_multi3.py
Expand Down
12 changes: 9 additions & 3 deletions docs/source/reference/pysupported.rst
Original file line number Diff line number Diff line change
Expand Up @@ -171,15 +171,21 @@ The following functions, attributes and methods are currently supported:
* ``.startswith()``
* ``.endswith()``
* ``.find()``
* ``.split()``
* ``.join()``

Additional operations as well as support for Python 2 strings / Python 3 bytes
will be added in a future version of Numba. Python 2 Unicode objects will
likely never be supported.

.. warning::
The performance of the substring search operations (``in``,
``.contains()`` and ``find()``) is poor in version 0.41 and will be improved in
version 0.42.
The performance of some operations is known to be slower than the CPython
implementation. These include substring search (``in``, ``.contains()``
and ``.find()``) and string creation (like ``.split()``). Improving the
string performance is an ongoing task, but the speed of CPython is
unlikely to be surpassed for basic string operations in isolation.
Numba is most successfully used for larger algorithms that happen to
involve strings, where basic string operations are not the bottleneck.


tuple
Expand Down
193 changes: 189 additions & 4 deletions numba/tests/test_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from numba import njit
import numba.unittest_support as unittest
from .support import (TestCase, no_pyobj_flags, MemoryLeakMixin)
from numba.errors import TypingError

_py34_or_later = sys.version_info[:2] >= (3, 4)

Expand Down Expand Up @@ -38,10 +39,12 @@ def getitem_usecase(x, i):
def concat_usecase(x, y):
    """Return the concatenation of ``x`` and ``y`` using the ``+`` operator."""
    combined = x + y
    return combined


def inplace_concat_usecase(x, y):
    """Append ``y`` to ``x`` with the ``+=`` operator and return the result."""
    # The augmented assignment is the operation being exercised here.
    x += y
    return x


def in_usecase(x, y):
    """Return whether ``x`` is contained in ``y`` according to the ``in`` operator."""
    found = x in y
    return found

Expand Down Expand Up @@ -74,6 +77,33 @@ def endswith_usecase(x, y):
return x.endswith(y)


def split_usecase(x, y):
    """Return the list produced by ``x.split(y)``."""
    pieces = x.split(y)
    return pieces


def split_with_maxsplit_usecase(x, y, maxsplit):
    """Return ``x.split(y, maxsplit)`` with ``maxsplit`` passed positionally."""
    pieces = x.split(y, maxsplit)
    return pieces
seibert marked this conversation as resolved.
Show resolved Hide resolved


def split_with_maxsplit_kwarg_usecase(x, y, maxsplit):
    """Return ``x.split(y, maxsplit=...)`` with ``maxsplit`` passed by keyword."""
    pieces = x.split(y, maxsplit=maxsplit)
    return pieces


def split_whitespace_usecase(x):
    """Return ``x.split()`` called with no arguments."""
    pieces = x.split()
    return pieces


def join_usecase(x, y):
    """Return ``x.join(y)``: the items of ``y`` joined with ``x`` as separator."""
    joined = x.join(y)
    return joined


def join_empty_usecase(x):
    """Return ``x.join`` applied to an empty list of strings."""
    # hack to make empty typed list: start from a one-string list and
    # remove its only element
    no_parts = ['']
    no_parts.pop()
    return x.join(no_parts)


class BaseTest(MemoryLeakMixin, TestCase):
def setUp(self):
super(BaseTest, self).setUp()
Expand Down Expand Up @@ -138,21 +168,21 @@ def _check_ordering_op(self, usecase):
pyfunc(a, a),
cfunc(a, a),
'%s: "%s", "%s"' % (usecase.__name__, a, a),
)
)

# Check comparison to adjacent
for a, b in permutations(UNICODE_ORDERING_EXAMPLES, r=2):
self.assertEqual(
pyfunc(a, b),
cfunc(a, b),
'%s: "%s", "%s"' % (usecase.__name__, a, b),
)
)
# and reversed
self.assertEqual(
pyfunc(b, a),
cfunc(b, a),
'%s: "%s", "%s"' % (usecase.__name__, b, a),
)
)

def test_lt(self, flags=no_pyobj_flags):
self._check_ordering_op(lt_usecase)
Expand Down Expand Up @@ -294,6 +324,161 @@ def test_concat(self, flags=no_pyobj_flags):
cfunc(a, b),
"'%s' + '%s'?" % (a, b))

def test_split_exception_empty_sep(self):
    # An empty separator must raise ValueError mentioning
    # 'empty separator' from both the interpreted and the jitted function.
    # Leak checking is switched off for this test.
    self.disable_leak_check()

    interpreted = split_usecase
    compiled = njit(interpreted)

    for variant in (interpreted, compiled):
        with self.assertRaises(ValueError) as caught:
            variant('a', '')
        message = str(caught.exception)
        self.assertIn('empty separator', message)

def test_split_exception_noninteger_maxsplit(self):
    # A float maxsplit must be rejected with a TypingError that names
    # float64, both with an explicit separator and with sep=None.
    compiled = njit(split_with_maxsplit_usecase)

    for sep in (' ', None):
        with self.assertRaises(TypingError) as caught:
            compiled('a', sep, 2.4)
        failure_note = 'non-integer maxsplit with sep = %s' % sep
        self.assertIn('float64', str(caught.exception), failure_note)

def test_split(self):
    # The jitted split must return the same list as the interpreter for
    # every (string, separator) pair below.
    interpreted = split_usecase
    compiled = njit(interpreted)

    samples = [
        (' a ', None),
        ('', '⚡'),
        ('abcabc', '⚡'),
        ('🐍⚡', '⚡'),
        ('🐍⚡🐍', '⚡'),
        ('abababa', 'a'),
        ('abababa', 'b'),
        ('abababa', 'c'),
        ('abababa', 'ab'),
        ('abababa', 'aba'),
    ]

    for text, sep in samples:
        expected = interpreted(text, sep)
        actual = compiled(text, sep)
        self.assertEqual(expected, actual,
                         "'%s'.split('%s')?" % (text, sep))

def test_split_with_maxsplit(self):
    # Compare jitted and interpreted results for split with a maxsplit
    # argument, passed positionally and then by keyword.
    samples = [
        (' a ', None, 1),
        ('', '⚡', 1),
        ('abcabc', '⚡', 1),
        ('🐍⚡', '⚡', 1),
        ('🐍⚡🐍', '⚡', 1),
        ('abababa', 'a', 2),
        ('abababa', 'b', 1),
        ('abababa', 'c', 2),
        ('abababa', 'ab', 1),
        ('abababa', 'aba', 5),
    ]

    # Each variant pairs a usecase with the failure-message template
    # describing how maxsplit was passed.
    variants = [
        (split_with_maxsplit_usecase, "'%s'.split('%s', %d)?"),
        (split_with_maxsplit_kwarg_usecase, "'%s'.split('%s', maxsplit=%d)?"),
    ]

    for interpreted, template in variants:
        compiled = njit(interpreted)
        for text, sep, limit in samples:
            expected = interpreted(text, sep, limit)
            actual = compiled(text, sep, limit)
            self.assertEqual(expected, actual,
                             template % (text, sep, limit))

def test_split_whitespace(self):
    # Argument-less split only; the explicit sep=None form is exercised
    # by test_split and test_split_with_maxsplit.
    interpreted = split_whitespace_usecase
    compiled = njit(interpreted)

    # Code points copied from
    # https://github.com/python/cpython/blob/master/Objects/unicodetype_db.h
    whitespace_code_points = [
        0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x001C, 0x001D, 0x001E, 0x001F, 0x0020,
        0x0085, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006,
        0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000
    ]
    all_whitespace = ''.join(chr(cp) for cp in whitespace_code_points)

    samples = [
        '',
        'abcabc',
        '🐍 ⚡',
        '🐍 ⚡ 🐍',
        '🐍 ⚡ 🐍 ',
        ' 🐍 ⚡ 🐍',
        ' 🐍' + all_whitespace + '⚡ 🐍 ',
    ]

    for text in samples:
        expected = interpreted(text)
        actual = compiled(text)
        self.assertEqual(expected, actual,
                         "'%s'.split()?" % (text,))

def test_join_empty(self):
    # An empty list cannot be passed into nopython mode, so the usecase
    # builds it internally; hence this separate test.
    interpreted = join_empty_usecase
    compiled = njit(interpreted)

    separators = [
        '',
        '🐍🐍🐍',
    ]

    for sep in separators:
        expected = interpreted(sep)
        actual = compiled(sep)
        self.assertEqual(expected, actual,
                         "'%s'.join([])?" % (sep,))

def test_join_non_string_exception(self):
# Verify that join of list of integers raises typing exception
pyfunc = join_usecase
cfunc = njit(pyfunc)

# Joining a list of integers must be rejected at typing time
with self.assertRaises(TypingError) as raises:
cfunc('', [1,2,3])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did flake8 not complain about this line ?!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I enabled flake8, I copied over Dask's error suppressions (as that seemed a good style to copy):

ignore =
    E20,   # Extra space in brackets
    E231,E241,  # Multiple spaces around ","
    E26,   # Comments
    E731,  # Assigning lambda expression
    E741,  # Ambiguous variable names
    W503,  # line break before binary operator
    W504,  # line break after binary operator
max-line-length = 120

# This error message is obscure, but indicates the error was trapped in typing of str.join()
# Feel free to change this as we update error messages.
exc_message = str(raises.exception)
self.assertIn("Invalid use of BoundFunction", exc_message)
self.assertIn("(reflected list(int", exc_message) # could be int32 or int64

def test_join(self):
    # Joining a list of strings must give the same result jitted and
    # interpreted for every (separator, parts) pair below.
    interpreted = join_usecase
    compiled = njit(interpreted)

    samples = [
        ('', ['', '', '']),
        ('a', ['', '', '']),
        ('', ['a', 'bbbb', 'c']),
        ('🐍🐍🐍', ['⚡⚡'] * 5),
    ]

    for sep, parts in samples:
        expected = interpreted(sep, parts)
        actual = compiled(sep, parts)
        self.assertEqual(expected, actual,
                         "'%s'.join('%s')?" % (sep, parts))

def test_join_interleave_str(self):
    # A plain string is accepted as the parts iterable, so the separator
    # is interleaved between its characters.
    interpreted = join_usecase
    compiled = njit(interpreted)

    samples = [
        ('abc', '123'),
        ('🐍🐍🐍', '⚡⚡'),
    ]

    for sep, parts in samples:
        expected = interpreted(sep, parts)
        actual = compiled(sep, parts)
        self.assertEqual(expected, actual,
                         "'%s'.join('%s')?" % (sep, parts))

def test_inplace_concat(self, flags=no_pyobj_flags):
pyfunc = inplace_concat_usecase
cfunc = njit(pyfunc)
Expand Down Expand Up @@ -361,7 +546,7 @@ def pyfunc(option, x, y):
for cmpop in ['==', '!=', '<', '>', '<=', '>=', '']:
args = [cmpop, x, y]
self.assertEqual(pyfunc(*args), cfunc(*args),
msg='failed on {}'.format(args))
msg='failed on {}'.format(args))

def test_literal_concat(self):
def pyfunc(x):
Expand Down
4 changes: 1 addition & 3 deletions numba/tests/test_unicode_literals.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@

import sys

import numpy as np

import numba.unittest_support as unittest
from numba import utils, jit
from numba import jit
from .support import TestCase


Expand Down
1 change: 0 additions & 1 deletion numba/tests/test_unicode_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,3 @@ def test_normalize_ir_text_py3(self):

if __name__ == '__main__':
unittest.main()

21 changes: 21 additions & 0 deletions numba/tests/test_unsafe_intrinsics.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from numba import njit
from numba.unsafe.tuple import tuple_setitem
from numba.unsafe.ndarray import to_fixed_tuple, empty_inferred
from numba.unsafe.bytes import memcpy_region
from numba.errors import TypingError


Expand Down Expand Up @@ -109,3 +110,23 @@ def func():
got = func()
expect = np.asarray([3.1] * 10)
np.testing.assert_array_equal(got, expect)


class TestBytesIntrinsic(TestCase):
    """Tests for numba.unsafe.bytes
    """
    def test_memcpy_region(self):
        # Copy a run of bytes between two int8 arrays through the
        # memcpy_region intrinsic and check the destination contents.
        @njit
        def copy_region(dst, dst_index, src, src_index, nbytes):
            # the final argument (1) is the assumed alignment: 1 byte
            memcpy_region(dst.ctypes.data, dst_index,
                          src.ctypes.data, src_index, nbytes, 1)

        dest = np.zeros(10, dtype=np.int8)
        source = np.arange(10, dtype=np.int8)

        # copy source[1:6] to dest[4:9]
        copy_region(dest, 4, source, 1, 5)

        wanted = [0, 0, 0, 0, 1, 2, 3, 4, 5, 0]
        np.testing.assert_array_equal(dest, wanted)