numba · seibert · Dec 10, 2019 · Nov 16, 2019 · Nov 16, 2019 · Nov 16, 2019
diff --git a/docs/source/reference/pysupported.rst b/docs/source/reference/pysupported.rst
@@ -171,6 +171,7 @@ The following functions, attributes and methods are currently supported:
 * ``==``, ``<``, ``<=``, ``>``, ``>=`` (comparison)
 * ``.startswith()``
 * ``.endswith()``
+* ``.expandtabs()``
 * ``.isspace()``
 * ``.isidentifier()``
 * ``.find()``

diff --git a/numba/tests/test_unicode.py b/numba/tests/test_unicode.py
@@ -118,6 +118,18 @@ def endswith_usecase(x, y):
     return x.endswith(y)
 
 
+def expandtabs_usecase(s):  # tabsize omitted
+    return s.expandtabs()
+
+
+def expandtabs_with_tabsize_usecase(s, tabsize):  # positional tabsize
+    return s.expandtabs(tabsize)
+
+
+def expandtabs_with_tabsize_kwarg_usecase(s, tabsize):  # keyword tabsize
+    return s.expandtabs(tabsize=tabsize)
+
+
 def split_usecase(x, y):
     return x.split(y)
 
@@ -398,6 +410,43 @@ def test_endswith(self, flags=no_pyobj_flags):
                                  cfunc(a, b),
                                  '%s, %s' % (a, b))
 
+    def test_expandtabs(self):
+        pyfunc = expandtabs_usecase
+        cfunc = njit(pyfunc)
+        # inputs combine tabs with line breaks and non-ASCII code points
+        cases = ['', '\t', 't\tt\t', 'a\t', '\t⚡', 'a\tbc\nab\tc',
+                 '🐍\t⚡', '🐍⚡\n\t\t🐍\t', 'ab\rab\t\t\tab\r\n\ta']
+
+        msg = 'Results of "{}".expandtabs() must be equal'
+        for s in cases:  # the jitted result must equal the pure-Python one
+            self.assertEqual(pyfunc(s), cfunc(s), msg=msg.format(s))
+
+    def test_expandtabs_with_tabsize(self):
+        pyfuncs = [expandtabs_with_tabsize_usecase,
+                   expandtabs_with_tabsize_kwarg_usecase]
+        messages = ['Results of "{}".expandtabs({}) must be equal',
+                    'Results of "{}".expandtabs(tabsize={}) must be equal']
+
+        cases = ['', '\t', 't\tt\t', 'a\t', '\t⚡', 'a\tbc\nab\tc',
+                 '🐍\t⚡', '🐍⚡\n\t\t🐍\t', 'ab\rab\t\t\tab\r\n\ta']
+        # wrap each usecase with njit once instead of once per (s, tabsize)
+        for pyfunc, msg in zip(pyfuncs, messages):
+            cfunc = njit(pyfunc)
+            for s in cases:
+                for tabsize in range(-1, 10):
+                    self.assertEqual(pyfunc(s, tabsize), cfunc(s, tabsize),
+                                     msg=msg.format(s, tabsize))
+
+    def test_expandtabs_exception_noninteger_tabsize(self):
+        pyfunc = expandtabs_with_tabsize_usecase
+        cfunc = njit(pyfunc)
+        # calling the jitted function with a float tabsize must not type
+        accepted_types = (types.Integer, int)
+        with self.assertRaises(TypingError) as raises:
+            cfunc('\t', 2.4)
+        msg = '"tabsize" must be {}, not float'.format(accepted_types)
+        self.assertIn(msg, str(raises.exception))  # substring check only
+
     def test_in(self, flags=no_pyobj_flags):
         pyfunc = in_usecase
         cfunc = njit(pyfunc)

diff --git a/numba/unicode.py b/numba/unicode.py
@@ -1,4 +1,5 @@
 import operator
+import sys
 
 import numpy as np
 from llvmlite.ir import IntType, Constant
@@ -675,6 +676,71 @@ def endswith_impl(a, b):
         return endswith_impl
 
 
+# https://github.com/python/cpython/blob/1d4b6ba19466aba0eb91c4ba01ba509acf18c723/Objects/unicodeobject.c#L11519-L11595    # noqa: E501
+@overload_method(types.UnicodeType, 'expandtabs')
+def unicode_expandtabs(data, tabsize=8):
+    """Implements str.expandtabs(); only an integer ``tabsize`` is accepted."""
+    thety = tabsize
+    if isinstance(tabsize, types.Omitted):
+        thety = tabsize.value
+    # if the type is optional, the concrete type is the captured type
+    elif isinstance(tabsize, types.Optional):
+        thety = tabsize.type
+
+    accepted = (types.Integer, int)
+    if thety is not None and not isinstance(thety, accepted):
+        raise TypingError(
+            '"tabsize" must be {}, not {}'.format(accepted, tabsize))
+
+    def expandtabs_impl(data, tabsize=8):
+        length = len(data)
+        j = line_pos = 0
+        found = False
+        for i in range(length):  # first pass: compute the output length j
+            code_point = _get_code_point(data, i)
+            if code_point == 9:  # 0x9 '\t'
+                found = True
+                if tabsize > 0:
+                    # incr is in [1, tabsize]: cannot overflow
+                    incr = tabsize - (line_pos % tabsize)
+                    if j > sys.maxsize - incr:
+                        raise OverflowError('new string is too long')
+                    line_pos += incr
+                    j += incr
+            else:
+                if j > sys.maxsize - 1:
+                    raise OverflowError('new string is too long')
+                line_pos += 1
+                j += 1
+                if code_point in (10, 13):  # (0xa '\n', 0xd '\r')
+                    line_pos = 0
+
+        if not found:
+            return data
+
+        res = _empty_string(data._kind, j, data._is_ascii)
+        j = line_pos = 0
+        for i in range(length):  # second pass: copy, padding tabs with ' '
+            code_point = _get_code_point(data, i)
+            if code_point == 9:  # 0x9 '\t'
+                if tabsize > 0:
+                    incr = tabsize - (line_pos % tabsize)
+                    line_pos += incr
+                    for idx in range(j, j + incr):
+                        _set_code_point(res, idx, 32)  # 0x20 ' '
+                    j += incr
+            else:
+                line_pos += 1
+                _set_code_point(res, j, code_point)
+                j += 1
+                if code_point in (10, 13):  # (0xa '\n', 0xd '\r')
+                    line_pos = 0
+
+        return res
+
+    return expandtabs_impl
+
+
 @overload_method(types.UnicodeType, 'split')
 def unicode_split(a, sep=None, maxsplit=-1):
     if not (maxsplit == -1 or