Skip to content

Commit

Permalink
Merge pull request #3186 from njwhite/feature/const
Browse files Browse the repository at this point in the history
Support Records in CUDA Const Memory
  • Loading branch information
seibert committed Sep 10, 2018
2 parents f849cc7 + 972358f commit 73fc6fe
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 34 deletions.
27 changes: 6 additions & 21 deletions numba/cuda/cudaimpl.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from functools import reduce
import operator
import six

from llvmlite.llvmpy.core import Type
import llvmlite.llvmpy.core as lc
Expand Down Expand Up @@ -100,29 +101,13 @@ def ptx_sreg_impl(context, builder, sig, args):
def ptx_cmem_arylike(context, builder, sig, args):
lmod = builder.module
[arr] = args
flat = arr.flatten(order='A')
aryty = sig.return_type
dtype = aryty.dtype

if isinstance(dtype, types.Complex):
elemtype = (types.float32
if dtype == types.complex64
else types.float64)
constvals = []
for i in range(flat.size):
elem = flat[i]
real = context.get_constant(elemtype, elem.real)
imag = context.get_constant(elemtype, elem.imag)
constvals.extend([real, imag])

elif dtype in types.number_domain:
constvals = [context.get_constant(dtype, flat[i])
for i in range(flat.size)]

else:
raise TypeError("unsupport type: %s" % dtype)

constary = lc.Constant.array(constvals[0].type, constvals)
constvals = [
context.get_constant(types.byte, i)
for i in six.iterbytes(arr.tobytes(order='A'))
]
constary = lc.Constant.array(Type.int(8), constvals)

addrspace = nvvm.ADDRSPACE_CONSTANT
gv = lmod.add_global_variable(constary.type, name="_cudapy_cmem",
Expand Down
4 changes: 4 additions & 0 deletions numba/cuda/simulator/kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,10 @@ def __getitem__(self, configuration):
def bind(self):
    """Do nothing.

    Takes no arguments besides ``self``, has no side effects and returns
    ``None``.
    NOTE(review): presumably kept so the simulator kernel offers the same
    method as the hardware kernel object -- confirm against the real API.
    """


def specialize(self, *args):
    """Return this same object unchanged, ignoring ``args``.

    Any positional arguments are accepted and discarded; no new object is
    created.
    NOTE(review): presumably mirrors the hardware dispatcher's
    ``specialize`` so that code calling
    ``cuda.jit(fn).specialize(...)`` also runs under the simulator --
    confirm against the real API.
    """
    return self

def forall(self, ntasks, tpb=0, stream=0, sharedmem=0):
    """Index ``self`` with a launch configuration built from ``ntasks``.

    The configuration tuple is ``(ntasks, 1, stream, sharedmem)`` and the
    result of ``self[...]`` is returned as-is.

    ``tpb`` is accepted but is not used by this body: the second element
    of the configuration is always ``1``.
    """
    return self[ntasks, 1, stream, sharedmem]

Expand Down
160 changes: 147 additions & 13 deletions numba/cuda/tests/cudapy/test_constmem.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,46 @@

from numba import cuda
from numba.cuda.testing import unittest, SerialMixin
from numba.config import ENABLE_CUDASIM


# Host-side arrays that the kernels below copy into CUDA constant memory
# through ``cuda.const.array_like``.

# Zero-length array (default dtype).
CONST_EMPTY = np.array([])
# 1-D float64 array: 0.0, 0.5, ..., 4.5.
CONST1D = np.arange(10, dtype=np.float64) / 2.
# 2-D int32 array stored in Fortran (column-major) order.
CONST2D = np.asfortranarray(
    np.arange(100, dtype=np.int32).reshape(10, 10))
# 3-D complex array.
CONST3D = ((np.arange(5 * 5 * 5, dtype=np.complex64).reshape(5, 5, 5) + 1j) /
           2j)

# Zero-length record array with a float field and an int field.
CONST_RECORD_EMPTY = np.array(
    [],
    dtype=[('x', float), ('y', int)])
# Two records with the same layout as CONST_RECORD_EMPTY.
CONST_RECORD = np.array(
    [(1.0, 2), (3.0, 4)],
    dtype=[('x', float), ('y', int)])
# Two records whose dtype is built with ``align=True``: three uint8 fields,
# then a uint32, then a trailing uint8.
CONST_RECORD_ALIGN = np.array(
    [(1, 2, 3, 0xDEADBEEF, 8), (4, 5, 6, 0xBEEFDEAD, 10)],
    dtype=np.dtype(
        dtype=[
            ('a', np.uint8),
            ('b', np.uint8),
            ('x', np.uint8),
            ('y', np.uint32),
            ('z', np.uint8),
        ],
        align=True))


def cuconstEmpty(A):
    """Kernel body: store the length of the constant copy of CONST_EMPTY.

    Each thread writes ``len(C)`` into ``A`` at its 1-D grid index, where
    ``C`` is ``cuda.const.array_like(CONST_EMPTY)``.
    """
    C = cuda.const.array_like(CONST_EMPTY)
    i = cuda.grid(1)
    A[i] = len(C)


def cuconst(A):
    """Kernel body: write ``CONST1D[i] + 1.0`` into ``A[i]``.

    ``C`` is the ``cuda.const.array_like`` copy of CONST1D and ``i`` is the
    thread's 1-D grid index.
    """
    C = cuda.const.array_like(CONST1D)
    i = cuda.grid(1)

    # +1 or it'll be loaded & stored as a u32
    A[i] = C[i] + 1.0


def cuconst2d(A):
Expand All @@ -33,28 +60,135 @@ def cuconst3d(A):
A[i, j, k] = C[i, j, k]


def cuconstRecEmpty(A):
    """Kernel body: store the length of the constant copy of
    CONST_RECORD_EMPTY.

    Each thread writes ``len(C)`` into ``A`` at its 1-D grid index, where
    ``C`` is ``cuda.const.array_like(CONST_RECORD_EMPTY)``.
    """
    C = cuda.const.array_like(CONST_RECORD_EMPTY)
    i = cuda.grid(1)
    A[i] = len(C)


def cuconstRec(A, B):
    """Kernel body: split the constant copy of CONST_RECORD into its fields.

    For the thread's 1-D grid index ``i``, field ``'x'`` of record ``i`` is
    written to ``A[i]`` and field ``'y'`` to ``B[i]``.
    """
    C = cuda.const.array_like(CONST_RECORD)
    i = cuda.grid(1)
    A[i] = C[i]['x']
    B[i] = C[i]['y']


def cuconstRecAlign(A, B, C, D, E):
    """Kernel body: split the constant copy of CONST_RECORD_ALIGN into its
    fields.

    For the thread's 1-D grid index ``i``, fields ``'a'``, ``'b'``, ``'x'``,
    ``'y'`` and ``'z'`` of record ``i`` are written to ``A[i]``, ``B[i]``,
    ``C[i]``, ``D[i]`` and ``E[i]`` respectively.
    """
    Z = cuda.const.array_like(CONST_RECORD_ALIGN)
    i = cuda.grid(1)
    A[i] = Z[i]['a']
    B[i] = Z[i]['b']
    C[i] = Z[i]['x']
    D[i] = Z[i]['y']
    E[i] = Z[i]['z']


class TestCudaConstantMemory(SerialMixin, unittest.TestCase):
    """Tests for ``cuda.const.array_like`` on numeric and record arrays.

    Every test launches one of the module-level kernels and compares the
    output arrays with the host-side constant.  When ``ENABLE_CUDASIM`` is
    false the tests additionally look for specific ``ld.const`` instructions
    in the kernel's PTX.
    """

    def test_const_array(self):
        """1-D float64 constant: values come back as ``CONST1D + 1``."""
        jcuconst = cuda.jit('void(float64[:])')(cuconst)
        self.assertIn('.const', jcuconst.ptx)
        A = np.zeros_like(CONST1D)
        jcuconst[2, 5](A)
        np.testing.assert_array_equal(A, CONST1D + 1)

        if not ENABLE_CUDASIM:
            self.assertIn(
                'ld.const.f64',
                jcuconst.ptx,
                "as we're adding to it, load as a double")

    def test_const_empty(self):
        """Zero-length constant: the kernel reports a length of 0."""
        # The kernel stores an integer length, so compile it for an int64
        # array and pass an array of exactly that dtype.  A float64 signature
        # with a platform-``int`` array only matches by accident of the
        # all-zero bit pattern.
        jcuconstEmpty = cuda.jit('void(int64[:])')(cuconstEmpty)
        A = np.full(1, fill_value=-1, dtype=np.int64)
        jcuconstEmpty[1, 1](A)
        np.testing.assert_array_equal(A, 0)

    def test_const_array_2d(self):
        """2-D int32 Fortran-ordered constant copied into a C-ordered array."""
        jcuconst2d = cuda.jit('void(int32[:,:])')(cuconst2d)
        self.assertIn('.const', jcuconst2d.ptx)
        A = np.zeros_like(CONST2D, order='C')
        jcuconst2d[(2, 2), (5, 5)](A)
        np.testing.assert_array_equal(A, CONST2D)

        if not ENABLE_CUDASIM:
            self.assertIn(
                'ld.const.u32',
                jcuconst2d.ptx,
                "load the ints as ints")

    def test_const_array_3d(self):
        """3-D complex64 constant copied into a Fortran-ordered array."""
        jcuconst3d = cuda.jit('void(complex64[:,:,:])')(cuconst3d)
        self.assertIn('.const', jcuconst3d.ptx)
        A = np.zeros_like(CONST3D, order='F')
        jcuconst3d[1, (5, 5, 5)](A)
        np.testing.assert_array_equal(A, CONST3D)

        if not ENABLE_CUDASIM:
            self.assertIn(
                'ld.const.v2.u32',
                jcuconst3d.ptx,
                "load the two halves of the complex as u32s")

    def test_const_record_empty(self):
        """Zero-length record constant: the kernel reports a length of 0."""
        # Same dtype reasoning as test_const_empty: the output is an integer.
        jcuconstRecEmpty = cuda.jit('void(int64[:])')(cuconstRecEmpty)
        A = np.full(1, fill_value=-1, dtype=np.int64)
        jcuconstRecEmpty[1, 1](A)
        np.testing.assert_array_equal(A, 0)

    def test_const_record(self):
        """Record constant: fields 'x' and 'y' are read back separately."""
        A = np.zeros(2, dtype=float)
        B = np.zeros(2, dtype=int)
        jcuconst = cuda.jit(cuconstRec).specialize(A, B)

        if not ENABLE_CUDASIM:
            accepted_loads = [
                # a vector load: the compiler fuses the load
                # of the x and y fields into a single instruction!
                'ld.const.v2.u64',

                # for some reason Win64 / Py3 / CUDA 9.1 decides
                # to do two u32 loads, and shifts and ors the
                # values to get the float `x` field, then uses
                # another ld.const.u32 to load the int `y` as
                # a 32-bit value!
                'ld.const.u32',
            ]
            if not any(c in jcuconst.ptx for c in accepted_loads):
                raise AssertionError(
                    "the compiler should realise it doesn't "
                    "need to interpret the bytes as float!")

        jcuconst[2, 1](A, B)
        np.testing.assert_allclose(A, CONST_RECORD['x'])
        np.testing.assert_allclose(B, CONST_RECORD['y'])

    def test_const_record_align(self):
        """Aligned record constant: every field is read back correctly."""
        A = np.zeros(2, dtype=np.float64)
        B = np.zeros(2, dtype=np.float64)
        C = np.zeros(2, dtype=np.float64)
        D = np.zeros(2, dtype=np.float64)
        E = np.zeros(2, dtype=np.float64)
        jcuconst = cuda.jit(cuconstRecAlign).specialize(A, B, C, D, E)

        if not ENABLE_CUDASIM:
            self.assertIn(
                'ld.const.v4.u8',
                jcuconst.ptx,
                'load the first three bytes as a vector')

            self.assertIn(
                'ld.const.u32',
                jcuconst.ptx,
                'load the uint32 natively')

            self.assertIn(
                'ld.const.u8',
                jcuconst.ptx,
                'load the last byte by itself')

        jcuconst[2, 1](A, B, C, D, E)
        np.testing.assert_allclose(A, CONST_RECORD_ALIGN['a'])
        np.testing.assert_allclose(B, CONST_RECORD_ALIGN['b'])
        np.testing.assert_allclose(C, CONST_RECORD_ALIGN['x'])
        np.testing.assert_allclose(D, CONST_RECORD_ALIGN['y'])
        np.testing.assert_allclose(E, CONST_RECORD_ALIGN['z'])


if __name__ == '__main__':
unittest.main()

0 comments on commit 73fc6fe

Please sign in to comment.