Make device_array_like create contiguous arrays (Fixes #4832) #4975

Merged: 8 commits, Jan 9, 2020
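
A minimal sketch of the behavior change, assuming a CUDA-capable setup (the calls are the public Numba API shown in the diff below; the printed strides follow NumPy's like-array semantics for a float64 array of this shape):

# Before this PR, device_array_like copied the input's strides verbatim, so a
# non-contiguous view produced a device array with the same non-contiguous
# strides. With this change it allocates a contiguous array, matching what
# np.zeros_like produces for the same view.
import numpy as np
from numba import cuda

d_a = cuda.device_array((10, 12))        # C-contiguous, float64 by default
d_view = d_a[::2, ::2]                   # non-contiguous view, strides (192, 16)
d_like = cuda.device_array_like(d_view)  # now contiguous
print(d_like.strides)                    # (48, 8), same as np.zeros_like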
31 changes: 29 additions & 2 deletions numba/cuda/api.py
@@ -231,8 +231,35 @@ def _fill_stride_by_order(shape, dtype, order):
def device_array_like(ary, stream=0):
"""Call cuda.devicearray() with information from the array.
"""
return device_array(shape=ary.shape, dtype=ary.dtype,
strides=ary.strides, stream=stream)
# Avoid attempting to recompute strides if the default strides will be
# sufficient to create a contiguous array.
if ary.flags['C_CONTIGUOUS'] or ary.ndim <= 1:
return device_array(shape=ary.shape, dtype=ary.dtype, stream=stream)
elif ary.flags['F_CONTIGUOUS']:
return device_array(shape=ary.shape, dtype=ary.dtype, order='F',
stream=stream)

# Otherwise, we need to compute new strides using an algorithm adapted from
# NumPy v1.17.4's PyArray_NewLikeArrayWithShape in
# core/src/multiarray/ctors.c. We permute the strides in ascending order
# then compute the stride for the dimensions with the same permutation.

# Stride permutation. E.g. a stride array (4, -2, 12) becomes
# [(1, -2), (0, 4), (2, 12)]
strideperm = list(enumerate(ary.strides))
strideperm.sort(key=lambda x: x[1])

# Compute new strides using permutation
strides = [0] * len(ary.strides)
stride = ary.dtype.itemsize
for i_perm, _ in strideperm:
strides[i_perm] = stride
stride *= ary.shape[i_perm]
strides = tuple(strides)

return device_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
stream=stream)


# Stream helper
@require_context
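As a standalone illustration of the stride-permutation algorithm above (not part of the diff; the helper name contiguous_strides_like is ours), the same computation in plain NumPy terms:

import numpy as np

def contiguous_strides_like(ary):
    # Pair each dimension with its stride, then sort by stride so the
    # fastest-varying (smallest-stride) dimension is handled first.
    strideperm = sorted(enumerate(ary.strides), key=lambda x: x[1])

    # Assign contiguous strides to the dimensions in that order.
    strides = [0] * ary.ndim
    stride = ary.dtype.itemsize
    for i_perm, _ in strideperm:
        strides[i_perm] = stride
        stride *= ary.shape[i_perm]
    return tuple(strides)

view = np.zeros((10, 12))[::2, ::2]     # float64, shape (5, 6)
print(view.strides)                     # (192, 16): non-contiguous
print(contiguous_strides_like(view))    # (48, 8): contiguous, C-like order
print(np.zeros_like(view).strides)      # (48, 8): NumPy agrees
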
29 changes: 27 additions & 2 deletions numba/cuda/simulator/cudadrv/devicearray.py
@@ -219,7 +219,33 @@ def device_array(*args, **kwargs):


def device_array_like(ary, stream=0):
return FakeCUDAArray(np.empty_like(ary))
# Avoid attempting to recompute strides if the default strides will be
# sufficient to create a contiguous array.
if ary.flags['C_CONTIGUOUS'] or ary.ndim <= 1:
return FakeCUDAArray(np.ndarray(shape=ary.shape, dtype=ary.dtype))
elif ary.flags['F_CONTIGUOUS']:
return FakeCUDAArray(np.ndarray(shape=ary.shape, dtype=ary.dtype,
order='F'))

# Otherwise, we need to compute new strides using an algorithm adapted from
# NumPy v1.17.4's PyArray_NewLikeArrayWithShape in
# core/src/multiarray/ctors.c. We permute the strides in ascending order
# then compute the stride for the dimensions with the same permutation.

# Stride permutation. E.g. a stride array (4, -2, 12) becomes
# [(1, -2), (0, 4), (2, 12)]
strideperm = list(enumerate(ary.strides))
strideperm.sort(key=lambda x: x[1])

# Compute new strides using permutation
strides = [0] * len(ary.strides)
stride = ary.dtype.itemsize
for i_perm, _ in strideperm:
strides[i_perm] = stride
stride *= ary.shape[i_perm]
strides = tuple(strides)

return FakeCUDAArray(np.ndarray(shape=ary.shape, dtype=ary.dtype, strides=strides))


def auto_device(ary, stream=0, copy=True):
@@ -260,4 +286,3 @@ def require_cuda_ndarray(obj):
if not is_cuda_ndarray(obj):
raise ValueError('require a cuda ndarray object')


123 changes: 122 additions & 1 deletion numba/cuda/tests/cudapy/test_array.py
@@ -3,7 +3,7 @@
import numpy as np

from numba.cuda.testing import unittest, SerialMixin
from numba.cuda.testing import skip_on_cudasim
from numba.cuda.testing import skip_on_cudasim, skip_unless_cudasim
from numba import cuda


@@ -62,6 +62,127 @@ def test_auto_device_const(self):
d, _ = cuda.devicearray.auto_device(2)
self.assertTrue(np.all(d.copy_to_host() == np.array(2)))

def _test_device_array_like_same(self, d_a):
"""
Tests of device_array_like where shape, strides, dtype, and flags should
all be equal.
"""
d_a_like = cuda.device_array_like(d_a)
self.assertEqual(d_a.shape, d_a_like.shape)
self.assertEqual(d_a.strides, d_a_like.strides)
self.assertEqual(d_a.dtype, d_a_like.dtype)
self.assertEqual(d_a.flags['C_CONTIGUOUS'], d_a_like.flags['C_CONTIGUOUS'])
self.assertEqual(d_a.flags['F_CONTIGUOUS'], d_a_like.flags['F_CONTIGUOUS'])

def test_device_array_like_1d(self):
d_a = cuda.device_array(10, order='C')
self._test_device_array_like_same(d_a)

def test_device_array_like_2d(self):
d_a = cuda.device_array((10, 12), order='C')
self._test_device_array_like_same(d_a)

def test_device_array_like_2d_transpose(self):
d_a = cuda.device_array((10, 12), order='C')
self._test_device_array_like_same(d_a.T)

def test_device_array_like_3d(self):
d_a = cuda.device_array((10, 12, 14), order='C')
self._test_device_array_like_same(d_a)

def test_device_array_like_1d_f(self):
d_a = cuda.device_array(10, order='F')
self._test_device_array_like_same(d_a)

def test_device_array_like_2d_f(self):
d_a = cuda.device_array((10, 12), order='F')
self._test_device_array_like_same(d_a)

def test_device_array_like_2d_f_transpose(self):
d_a = cuda.device_array((10, 12), order='F')
self._test_device_array_like_same(d_a.T)

def test_device_array_like_3d_f(self):
d_a = cuda.device_array((10, 12, 14), order='F')
self._test_device_array_like_same(d_a)

def _test_device_array_like_view(self, view, d_view):
"""
Tests of device_array_like where the original array is a view - the
strides should not be equal because a contiguous array is expected.
"""
d_like = cuda.device_array_like(d_view)
self.assertEqual(d_view.shape, d_like.shape)
self.assertEqual(d_view.dtype, d_like.dtype)

# Use NumPy as a reference for the expected strides
like = np.zeros_like(view)
self.assertEqual(d_like.strides, like.strides)
self.assertEqual(d_like.flags['C_CONTIGUOUS'], like.flags['C_CONTIGUOUS'])
self.assertEqual(d_like.flags['F_CONTIGUOUS'], like.flags['F_CONTIGUOUS'])
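
As a point of reference for the helper above, a short NumPy-only sketch showing that a like-array of a strided view is contiguous rather than a copy of the view's strides:

import numpy as np

view = np.zeros((10, 12))[::2, ::2]
print(view.strides)                  # (192, 16): the view is non-contiguous
print(np.zeros_like(view).strides)   # (48, 8): the like-array is contiguous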

def test_device_array_like_1d_view(self):
shape = 10
view = np.zeros(shape)[::2]
d_view = cuda.device_array(shape)[::2]
self._test_device_array_like_view(view, d_view)

def test_device_array_like_1d_view_f(self):
shape = 10
view = np.zeros(shape, order='F')[::2]
d_view = cuda.device_array(shape, order='F')[::2]
self._test_device_array_like_view(view, d_view)

def test_device_array_like_2d_view(self):
shape = (10, 12)
view = np.zeros(shape)[::2, ::2]
d_view = cuda.device_array(shape)[::2, ::2]
self._test_device_array_like_view(view, d_view)

def test_device_array_like_2d_view_f(self):
shape = (10, 12)
view = np.zeros(shape, order='F')[::2, ::2]
d_view = cuda.device_array(shape, order='F')[::2, ::2]
self._test_device_array_like_view(view, d_view)

@skip_on_cudasim('Numba and NumPy stride semantics differ for transpose')
def test_device_array_like_2d_view_transpose_device(self):
shape = (10, 12)
view = np.zeros(shape)[::2, ::2].T
d_view = cuda.device_array(shape)[::2, ::2].T
# This is a special case (see issue #4974): on the device, taking the
# transpose of a non-contiguous view creates a new contiguous allocation
# with different strides. Rather than comparing against NumPy, we can
# only compare against expected values.
d_like = cuda.device_array_like(d_view)
self.assertEqual(d_view.shape, d_like.shape)
self.assertEqual(d_view.dtype, d_like.dtype)
self.assertEqual((40, 8), d_like.strides)
self.assertTrue(d_like.is_c_contiguous())
self.assertFalse(d_like.is_f_contiguous())

@skip_unless_cudasim('Numba and NumPy stride semantics differ for transpose')
def test_device_array_like_2d_view_transpose_simulator(self):
shape = (10, 12)
view = np.zeros(shape)[::2, ::2].T
d_view = cuda.device_array(shape)[::2, ::2].T
# On the simulator, the transpose has different strides from those on a
# CUDA device (see issue #4974). Here we can compare strides against
# NumPy as a reference.
like = np.zeros_like(view)
d_like = cuda.device_array_like(d_view)
self.assertEqual(d_view.shape, d_like.shape)
self.assertEqual(d_view.dtype, d_like.dtype)
self.assertEqual(like.strides, d_like.strides)
self.assertEqual(like.flags['C_CONTIGUOUS'], d_like.flags['C_CONTIGUOUS'])
self.assertEqual(like.flags['F_CONTIGUOUS'], d_like.flags['F_CONTIGUOUS'])
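
A brief NumPy-only sketch of why the device and simulator tests are split: for a transposed view, NumPy's like-array keeps the F-like layout, while the device test above expects a C-contiguous (40, 8) result:

import numpy as np

view = np.zeros((10, 12))[::2, ::2].T   # shape (6, 5), strides (16, 192)
print(np.zeros_like(view).strides)      # (8, 48): F-contiguous for (6, 5)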

def test_device_array_like_2d_view_f_transpose(self):
shape = (10, 12)
view = np.zeros(shape, order='F')[::2, ::2].T
d_view = cuda.device_array(shape, order='F')[::2, ::2].T
self._test_device_array_like_view(view, d_view)

@skip_on_cudasim('Kernel definitions not created in the simulator')
def test_issue_4628(self):
# CUDA Device arrays were reported as always being typed with 'A' order
15 changes: 15 additions & 0 deletions numba/cuda/tests/cudapy/test_transpose.py
@@ -60,6 +60,21 @@ def test_transpose_bool(self):
host_transposed = d_transposed.copy_to_host()
np.testing.assert_array_equal(transposed, host_transposed)

def test_transpose_view(self):
# Because the strides of transposes of views differ from those in NumPy
# (see issue #4974), we test the shape and strides of a transpose.
a = np.arange(120, dtype=np.int64).reshape((10, 12))
a_view_t = a[::2, ::2].T

d_a = cuda.to_device(a)
d_a_view_t = d_a[::2, ::2].T

self.assertEqual(d_a_view_t.shape, (6, 5))
self.assertEqual(d_a_view_t.strides, (40, 8))

h_a_view_t = d_a_view_t.copy_to_host()
np.testing.assert_array_equal(a_view_t, h_a_view_t)
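
For comparison, the same expression on the host, sketched with NumPy only: the transpose merely swaps the strides of the view and makes no copy, which is why the device strides (40, 8) above cannot be checked against NumPy:

import numpy as np

a = np.arange(120, dtype=np.int64).reshape((10, 12))
a_view_t = a[::2, ::2].T
print(a_view_t.shape)    # (6, 5)
print(a_view_t.strides)  # (16, 192): a strided view, not a new allocation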


if __name__ == '__main__':
unittest.main()