Make device_array_like create contiguous arrays (Fixes #4832) #4975

Merged: 8 commits, Jan 9, 2020
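
A minimal sketch of the behavior change, assuming a CUDA-capable setup (the calls are the public Numba API shown in the diff below; the printed strides follow NumPy's like-array semantics for a float64 array of this shape):

# Before this PR, device_array_like copied the input's strides verbatim, so a
# non-contiguous view produced a device array with the same non-contiguous
# strides. With this change it allocates a contiguous array, matching what
# np.zeros_like produces for the same view.
import numpy as np
from numba import cuda

d_a = cuda.device_array((10, 12))        # C-contiguous, float64 by default
d_view = d_a[::2, ::2]                   # non-contiguous view, strides (192, 16)
d_like = cuda.device_array_like(d_view)  # now contiguous
print(d_like.strides)                    # (48, 8), same as np.zeros_like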
31 changes: 29 additions & 2 deletions numba/cuda/api.py
@@ -231,8 +231,35 @@ def _fill_stride_by_order(shape, dtype, order):
def device_array_like(ary, stream=0):
"""Call cuda.devicearray() with information from the array.
"""
return device_array(shape=ary.shape, dtype=ary.dtype,
strides=ary.strides, stream=stream)
# Avoid attempting to recompute strides if the default strides will be
# sufficient to create a contiguous array.
if ary.flags['C_CONTIGUOUS'] or ary.ndim <= 1:
return device_array(shape=ary.shape, dtype=ary.dtype, stream=stream)
elif ary.flags['F_CONTIGUOUS']:
return device_array(shape=ary.shape, dtype=ary.dtype, order='F',
stream=stream)

# Otherwise, we need to compute new strides using an algorithm adapted from
# NumPy v1.17.4's PyArray_NewLikeArrayWithShape in
# core/src/multiarray/ctors.c. We permute the strides in ascending order
# then compute the stride for the dimensions with the same permutation.

# Stride permutation. E.g. a stride array (4, -2, 12) becomes
# [(1, -2), (0, 4), (2, 12)]
strideperm = list(enumerate(ary.strides))
strideperm.sort(key=lambda x: x[1])

# Compute new strides using permutation
strides = [0] * len(ary.strides)
stride = ary.dtype.itemsize
for i_perm, _ in strideperm:
strides[i_perm] = stride
stride *= ary.shape[i_perm]
strides = tuple(strides)

return device_array(shape=ary.shape, dtype=ary.dtype, strides=strides,
stream=stream)


# Stream helper
@require_context
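As a standalone illustration of the stride-permutation algorithm above (not part of the diff; the helper name contiguous_strides_like is ours), the same computation in plain NumPy terms:

import numpy as np

def contiguous_strides_like(ary):
    # Pair each dimension with its stride, then sort by stride so the
    # fastest-varying (smallest-stride) dimension is handled first.
    strideperm = sorted(enumerate(ary.strides), key=lambda x: x[1])

    # Assign contiguous strides to the dimensions in that order.
    strides = [0] * ary.ndim
    stride = ary.dtype.itemsize
    for i_perm, _ in strideperm:
        strides[i_perm] = stride
        stride *= ary.shape[i_perm]
    return tuple(strides)

view = np.zeros((10, 12))[::2, ::2]     # float64, shape (5, 6)
print(view.strides)                     # (192, 16): non-contiguous
print(contiguous_strides_like(view))    # (48, 8): contiguous, C-like order
print(np.zeros_like(view).strides)      # (48, 8): NumPy agrees
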
29 changes: 27 additions & 2 deletions numba/cuda/simulator/cudadrv/devicearray.py
@@ -219,7 +219,33 @@ def device_array(*args, **kwargs):


def device_array_like(ary, stream=0):
return FakeCUDAArray(np.empty_like(ary))
# Avoid attempting to recompute strides if the default strides will be
# sufficient to create a contiguous array.
if ary.flags['C_CONTIGUOUS'] or ary.ndim <= 1:
return FakeCUDAArray(np.ndarray(shape=ary.shape, dtype=ary.dtype))
elif ary.flags['F_CONTIGUOUS']:
return FakeCUDAArray(np.ndarray(shape=ary.shape, dtype=ary.dtype,
order='F'))

# Otherwise, we need to compute new strides using an algorithm adapted from
# NumPy v1.17.4's PyArray_NewLikeArrayWithShape in
# core/src/multiarray/ctors.c. We permute the strides in ascending order
# then compute the stride for the dimensions with the same permutation.

# Stride permutation. E.g. a stride array (4, -2, 12) becomes
# [(1, -2), (0, 4), (2, 12)]
strideperm = list(enumerate(ary.strides))
strideperm.sort(key=lambda x: x[1])

# Compute new strides using permutation
strides = [0] * len(ary.strides)
stride = ary.dtype.itemsize
for i_perm, _ in strideperm:
strides[i_perm] = stride
stride *= ary.shape[i_perm]
strides = tuple(strides)

return FakeCUDAArray(np.ndarray(shape=ary.shape, dtype=ary.dtype, strides=strides))


def auto_device(ary, stream=0, copy=True):
@@ -260,4 +286,3 @@ def require_cuda_ndarray(obj):
if not is_cuda_ndarray(obj):
raise ValueError('require a cuda ndarray object')


123 changes: 122 additions & 1 deletion numba/cuda/tests/cudapy/test_array.py
@@ -3,7 +3,7 @@
import numpy as np

from numba.cuda.testing import unittest, SerialMixin
from numba.cuda.testing import skip_on_cudasim
from numba.cuda.testing import skip_on_cudasim, skip_unless_cudasim
from numba import cuda


@@ -62,6 +62,127 @@ def test_auto_device_const(self):
d, _ = cuda.devicearray.auto_device(2)
self.assertTrue(np.all(d.copy_to_host() == np.array(2)))

def _test_device_array_like_same(self, d_a):
"""
Tests of device_array_like where shape, strides, dtype, and flags should
all be equal.
"""
d_a_like = cuda.device_array_like(d_a)
self.assertEqual(d_a.shape, d_a_like.shape)
self.assertEqual(d_a.strides, d_a_like.strides)
self.assertEqual(d_a.dtype, d_a_like.dtype)
self.assertEqual(d_a.flags['C_CONTIGUOUS'], d_a_like.flags['C_CONTIGUOUS'])
self.assertEqual(d_a.flags['F_CONTIGUOUS'], d_a_like.flags['F_CONTIGUOUS'])

def test_device_array_like_1d(self):
d_a = cuda.device_array(10, order='C')
self._test_device_array_like_same(d_a)

def test_device_array_like_2d(self):
d_a = cuda.device_array((10, 12), order='C')
self._test_device_array_like_same(d_a)

def test_device_array_like_2d_transpose(self):
d_a = cuda.device_array((10, 12), order='C')
self._test_device_array_like_same(d_a.T)

def test_device_array_like_3d(self):
d_a = cuda.device_array((10, 12, 14), order='C')
self._test_device_array_like_same(d_a)

def test_device_array_like_1d_f(self):
d_a = cuda.device_array(10, order='F')
self._test_device_array_like_same(d_a)

def test_device_array_like_2d_f(self):
d_a = cuda.device_array((10, 12), order='F')
self._test_device_array_like_same(d_a)

def test_device_array_like_2d_f_transpose(self):
d_a = cuda.device_array((10, 12), order='F')
self._test_device_array_like_same(d_a.T)

def test_device_array_like_3d_f(self):
d_a = cuda.device_array((10, 12, 14), order='F')
self._test_device_array_like_same(d_a)

def _test_device_array_like_view(self, view, d_view):
"""
Tests of device_array_like where the original array is a view - the
strides should not be equal because a contiguous array is expected.
"""
d_like = cuda.device_array_like(d_view)
self.assertEqual(d_view.shape, d_like.shape)
self.assertEqual(d_view.dtype, d_like.dtype)

# Use NumPy as a reference for the expected strides
like = np.zeros_like(view)
self.assertEqual(d_like.strides, like.strides)
self.assertEqual(d_like.flags['C_CONTIGUOUS'], like.flags['C_CONTIGUOUS'])
self.assertEqual(d_like.flags['F_CONTIGUOUS'], like.flags['F_CONTIGUOUS'])
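
As a point of reference for the helper above, a short NumPy-only sketch showing that a like-array of a strided view is contiguous rather than a copy of the view's strides:

import numpy as np

view = np.zeros((10, 12))[::2, ::2]
print(view.strides)                  # (192, 16): the view is non-contiguous
print(np.zeros_like(view).strides)   # (48, 8): the like-array is contiguous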

def test_device_array_like_1d_view(self):
shape = 10
view = np.zeros(shape)[::2]
d_view = cuda.device_array(shape)[::2]
self._test_device_array_like_view(view, d_view)

def test_device_array_like_1d_view_f(self):
shape = 10
view = np.zeros(shape, order='F')[::2]
d_view = cuda.device_array(shape, order='F')[::2]
self._test_device_array_like_view(view, d_view)

def test_device_array_like_2d_view(self):
shape = (10, 12)
view = np.zeros(shape)[::2, ::2]
d_view = cuda.device_array(shape)[::2, ::2]
self._test_device_array_like_view(view, d_view)

def test_device_array_like_2d_view_f(self):
shape = (10, 12)
view = np.zeros(shape, order='F')[::2, ::2]
d_view = cuda.device_array(shape, order='F')[::2, ::2]
self._test_device_array_like_view(view, d_view)

@skip_on_cudasim('Numba and NumPy stride semantics differ for transpose')
def test_device_array_like_2d_view_transpose_device(self):
shape = (10, 12)
view = np.zeros(shape)[::2, ::2].T
d_view = cuda.device_array(shape)[::2, ::2].T
# This is a special case (see issue #4974): on the device, taking the
# transpose of a non-contiguous view creates a new contiguous allocation
# with different strides. Rather than comparing against NumPy, we can
# only compare against expected values.
d_like = cuda.device_array_like(d_view)
self.assertEqual(d_view.shape, d_like.shape)
self.assertEqual(d_view.dtype, d_like.dtype)
self.assertEqual((40, 8), d_like.strides)
self.assertTrue(d_like.is_c_contiguous())
self.assertFalse(d_like.is_f_contiguous())

@skip_unless_cudasim('Numba and NumPy stride semantics differ for transpose')
def test_device_array_like_2d_view_transpose_simulator(self):
shape = (10, 12)
view = np.zeros(shape)[::2, ::2].T
d_view = cuda.device_array(shape)[::2, ::2].T
# On the simulator, the transpose has different strides from those on a
# CUDA device (see issue #4974). Here we can compare strides against
# NumPy as a reference.
like = np.zeros_like(view)
d_like = cuda.device_array_like(d_view)
self.assertEqual(d_view.shape, d_like.shape)
self.assertEqual(d_view.dtype, d_like.dtype)
self.assertEqual(like.strides, d_like.strides)
self.assertEqual(like.flags['C_CONTIGUOUS'], d_like.flags['C_CONTIGUOUS'])
self.assertEqual(like.flags['F_CONTIGUOUS'], d_like.flags['F_CONTIGUOUS'])
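
A brief NumPy-only sketch of why the device and simulator tests are split: for a transposed view, NumPy's like-array keeps the F-like layout, while the device test above expects a C-contiguous (40, 8) result:

import numpy as np

view = np.zeros((10, 12))[::2, ::2].T   # shape (6, 5), strides (16, 192)
print(np.zeros_like(view).strides)      # (8, 48): F-contiguous for (6, 5)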

def test_device_array_like_2d_view_f_transpose(self):
shape = (10, 12)
view = np.zeros(shape, order='F')[::2, ::2].T
d_view = cuda.device_array(shape, order='F')[::2, ::2].T
self._test_device_array_like_view(view, d_view)

@skip_on_cudasim('Kernel definitions not created in the simulator')
def test_issue_4628(self):
# CUDA Device arrays were reported as always being typed with 'A' order
15 changes: 15 additions & 0 deletions numba/cuda/tests/cudapy/test_transpose.py
@@ -60,6 +60,21 @@ def test_transpose_bool(self):
host_transposed = d_transposed.copy_to_host()
np.testing.assert_array_equal(transposed, host_transposed)

def test_transpose_view(self):
# Because the strides of transposes of views differ from those in NumPy
# (see issue #4974), we test the shape and strides of a transpose.
a = np.arange(120, dtype=np.int64).reshape((10, 12))
a_view_t = a[::2, ::2].T

d_a = cuda.to_device(a)
d_a_view_t = d_a[::2, ::2].T

self.assertEqual(d_a_view_t.shape, (6, 5))
self.assertEqual(d_a_view_t.strides, (40, 8))

h_a_view_t = d_a_view_t.copy_to_host()
np.testing.assert_array_equal(a_view_t, h_a_view_t)
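
For comparison, the same expression on the host, sketched with NumPy only: the transpose merely swaps the strides of the view and makes no copy, which is why the device strides (40, 8) above cannot be checked against NumPy:

import numpy as np

a = np.arange(120, dtype=np.int64).reshape((10, 12))
a_view_t = a[::2, ::2].T
print(a_view_t.shape)    # (6, 5)
print(a_view_t.strides)  # (16, 192): a strided view, not a new allocation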


if __name__ == '__main__':
unittest.main()