From 7e169e1f9dc796f0c89f78e2a510af245412871d Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 10 Jan 2020 05:51:34 -0800 Subject: [PATCH 1/9] Remove the autotuner The autotuner has been deprecated since Numba 0.40. --- numba/cuda/compiler.py | 48 +-- numba/cuda/cudadrv/autotune.py | 343 -------------------- numba/cuda/tests/cudapy/test_deprecation.py | 44 --- 3 files changed, 2 insertions(+), 433 deletions(-) delete mode 100644 numba/cuda/cudadrv/autotune.py delete mode 100644 numba/cuda/tests/cudapy/test_deprecation.py diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index af8e9ad1c71..19d6b0ccea6 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -16,7 +16,6 @@ from numba import funcdesc, typing, utils, serialize from numba.compiler_lock import global_compiler_lock -from .cudadrv.autotune import AutoTuner from .cudadrv.devices import get_context from .cudadrv import nvvm, devicearray, driver from .errors import normalize_kernel_dimensions @@ -284,7 +283,7 @@ def _compute_thread_per_block(self, kernel): # Prefer user-specified config if tpb != 0: return tpb - # Else, ask the driver to give a good cofnig + # Else, ask the driver to give a good config else: ctx = get_context() kwargs = dict( @@ -293,24 +292,9 @@ def _compute_thread_per_block(self, kernel): memsize=self.sharedmem, blocksizelimit=1024, ) - try: - # Raises from the driver if the feature is unavailable - _, tpb = ctx.get_max_potential_block_size(**kwargs) - except AttributeError: - # Fallback to table-based approach. - tpb = self._fallback_autotune_best(kernel) - raise + _, tpb = ctx.get_max_potential_block_size(**kwargs) return tpb - def _fallback_autotune_best(self, kernel): - try: - tpb = kernel.autotune.best() - except ValueError: - warnings.warn('Could not autotune, using default tpb of 128') - tpb = 128 - - return tpb - class CUDAKernelBase(object): """Define interface for configurable kernels @@ -719,34 +703,6 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs): else: raise NotImplementedError(ty, val) - @property - def autotune(self): - """Return the autotuner object associated with this kernel.""" - warnings.warn(_deprec_warn_msg.format('autotune'), DeprecationWarning) - has_autotune = hasattr(self, '_autotune') - if has_autotune and self._autotune.dynsmem == self.sharedmem: - return self._autotune - else: - # Get CUDA Function - cufunc = self._func.get() - at = AutoTuner(info=cufunc.attrs, cc=cufunc.device.compute_capability) - self._autotune = at - return self._autotune - - @property - def occupancy(self): - """Occupancy is the ratio of the number of active warps per multiprocessor to the maximum - number of warps that can be active on the multiprocessor at once. 
- Calculate the theoretical occupancy of the kernel given the - current configuration.""" - warnings.warn(_deprec_warn_msg.format('occupancy'), DeprecationWarning) - thread_per_block = reduce(operator.mul, self.blockdim, 1) - return self.autotune.closest(thread_per_block) - - -_deprec_warn_msg = ("The .{} attribute is is deprecated and will be " - "removed in a future release") - class AutoJitCUDAKernel(CUDAKernelBase): ''' diff --git a/numba/cuda/cudadrv/autotune.py b/numba/cuda/cudadrv/autotune.py deleted file mode 100644 index 2dae5293f24..00000000000 --- a/numba/cuda/cudadrv/autotune.py +++ /dev/null @@ -1,343 +0,0 @@ -""" -- Parse jit compile info -- Compute warp occupancy histogram -""" -from __future__ import division, absolute_import, print_function -import math -import re - -SMEM0K = 0 -SMEM8K = 8 * 2 ** 10 -SMEM16K = 16 * 2 ** 10 -SMEM48K = 48 * 2 ** 10 -SMEM64K = 64 * 2 ** 10 -SMEM80K = 80 * 2 ** 10 -SMEM96K = 96 * 2 ** 10 -SMEM112K = 112 * 2 ** 10 - -#------------------------------------------------------------------------------ -# autotuning - - -class OccupancyThreadKey(object): - def __init__(self, item): - self.occupancy, self.threads = item - self.comparison = self.occupancy, 1 / self.threads - - def __lt__(self, other): - return self.comparison < other.comparison - - def __eq__(self, other): - return self.comparison == other.comparison - - def __ne__(self, other): - return self.comparison != other.comparison - - def __gt__(self, other): - return self.comparison > other.comparison - - def __le__(self, other): - return self.comparison <= other.comparison - - def __ge__(self, other): - return self.comparison >= other.comparison - - -class AutoTuner(object): - """Autotune a kernel based upon the theoretical occupancy. - """ - def __init__(self, cc, info, smem_config=None, dynsmem=0): - self.cc = cc - self.dynsmem = dynsmem - self._table = warp_occupancy(info=info, cc=cc) - self._by_occupancy = list(reversed(sorted(((occup, tpb) - for tpb, (occup, factor) - in self.table.items()), - key=OccupancyThreadKey))) - - @property - def table(self): - """A dict with thread-per-block as keys and tuple-2 of - (occupency, limiting factor) as values. - """ - return self._table - - @property - def by_occupancy(self): - """A list of tuple-2 of (occupancy, thread-per-block) sorted in - descending. - - The first item has the highest occupancy and the lowest number of - thread-per-block. - """ - return self._by_occupancy - - def best(self): - return self.max_occupancy_min_blocks() - - def max_occupancy_min_blocks(self): - """Returns the thread-per-block that optimizes for - maximum occupancy and minimum blocks. - - Maximum blocks allows for the best utilization of parallel execution - because each block can be executed concurrently on different SM. - """ - return self.by_occupancy[0][1] - - def closest(self, tpb): - """Find the occupancy of the closest tpb - """ - # round to the nearest multiple of warpsize - warpsize = PHYSICAL_LIMITS[self.cc]['thread_per_warp'] - tpb = ceil(tpb, warpsize) - # search - return self.table.get(tpb, [0])[0] - - - def best_within(self, mintpb, maxtpb): - """Returns the best tpb in the given range inclusively. - """ - warpsize = PHYSICAL_LIMITS[self.cc]['thread_per_warp'] - mintpb = int(ceil(mintpb, warpsize)) - maxtpb = int(floor(maxtpb, warpsize)) - return self.prefer(*range(mintpb, maxtpb + 1, warpsize)) - - def prefer(self, *tpblist): - """Prefer the thread-per-block with the highest warp occupancy - and the lowest thread-per-block. 
- - May return None if all threads-per-blocks are invalid - """ - bin = [] - for tpb in tpblist: - occ = self.closest(tpb) - if occ > 0: - bin.append((occ, tpb)) - if bin: - return sorted(bin, key=OccupancyThreadKey)[-1][1] - - -#------------------------------------------------------------------------------ -# warp occupancy calculator - -# Reference: NVIDIA CUDA Toolkit v10.2.89 Programming Guide, Appendix H. -# URL: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities - -LIMITS_CC_20 = { - 'thread_per_warp': 32, - 'warp_per_sm': 48, - 'thread_per_sm': 1536, - 'block_per_sm': 8, - 'registers': 32768, - 'reg_alloc_unit': 64, - 'reg_alloc_gran': 'warp', - 'reg_per_thread': 63, - 'smem_per_sm': SMEM48K, - 'smem_alloc_unit': 128, - 'warp_alloc_gran': 2, - 'max_block_size': 1024, - 'default_smem_config': SMEM16K, -} - -LIMITS_CC_21 = LIMITS_CC_20 - -LIMITS_CC_30 = { - 'thread_per_warp': 32, - 'warp_per_sm': 64, - 'thread_per_sm': 2048, - 'block_per_sm': 16, - 'registers': 65536, - 'reg_alloc_unit': 256, - 'reg_alloc_gran': 'warp', - 'reg_per_thread': 63, - 'smem_per_sm': SMEM48K, - 'smem_alloc_unit': 256, - 'warp_alloc_gran': 4, - 'max_block_size': 1024, - 'default_smem_config': SMEM48K, -} - -LIMITS_CC_35 = LIMITS_CC_30.copy() -LIMITS_CC_35.update({ - 'reg_per_thread': 255, -}) - -LIMITS_CC_37 = LIMITS_CC_35.copy() - -LIMITS_CC_37.update({ - 'registers': 131072, - 'default_smem_config': SMEM112K, -}) - - -LIMITS_CC_50 = { - 'thread_per_warp': 32, - 'warp_per_sm': 64, - 'thread_per_sm': 2048, - 'block_per_sm': 32, - 'registers': 65536, - 'reg_alloc_unit': 256, - 'reg_alloc_gran': 'warp', - 'reg_per_thread': 255, - 'smem_per_sm': SMEM64K, - 'smem_per_block': SMEM48K, - 'smem_alloc_unit': 256, - 'warp_alloc_gran': 4, - 'max_block_size': 1024, - 'default_smem_config': SMEM64K, -} - -LIMITS_CC_52 = LIMITS_CC_50.copy() -LIMITS_CC_52.update({ - 'smem_per_sm': SMEM96K, - 'default_smem_config': SMEM96K, -}) -LIMITS_CC_53 = LIMITS_CC_50.copy() -LIMITS_CC_53.update({ - 'registers': 32768, -}) - -LIMITS_CC_60 = LIMITS_CC_50.copy() -LIMITS_CC_60.update({ - 'warp_alloc_gran': 2, -}) -LIMITS_CC_61 = LIMITS_CC_60.copy() -LIMITS_CC_61.update({ - 'smem_per_sm': SMEM96K, - 'default_smem_config': SMEM96K, - 'warp_alloc_gran': 4, -}) -LIMITS_CC_62 = LIMITS_CC_60.copy() -LIMITS_CC_62.update({ - 'thread_per_sm': 4096, - 'warp_per_sm': 128, - 'warp_alloc_gran': 4, -}) - -LIMITS_CC_70 = LIMITS_CC_62.copy() -LIMITS_CC_70.update({ - 'smem_per_sm': SMEM96K, - 'smem_per_block': SMEM96K, - 'default_smem_config': SMEM96K, -}) - -LIMITS_CC_75 = LIMITS_CC_70.copy() -LIMITS_CC_75.update({ - 'warp_per_sm': 32, - 'thread_per_sm': 1024, - 'block_per_sm': 16, - 'smem_per_sm': SMEM64K, - 'smem_per_block': SMEM64K, - 'default_smem_config': SMEM64K, -}) - -PHYSICAL_LIMITS = { - (2, 0): LIMITS_CC_20, - (2, 1): LIMITS_CC_21, - (3, 0): LIMITS_CC_30, - (3, 5): LIMITS_CC_35, - (3, 7): LIMITS_CC_35, - (5, 0): LIMITS_CC_50, - (5, 2): LIMITS_CC_52, - (5, 3): LIMITS_CC_53, - (6, 0): LIMITS_CC_50, - (6, 1): LIMITS_CC_61, - (6, 2): LIMITS_CC_62, - (7, 0): LIMITS_CC_70, - (7, 5): LIMITS_CC_75, -} - - -def ceil(x, s=1): - return s * math.ceil(x / s) - - -def floor(x, s=1): - return s * math.floor(x / s) - - -def warp_occupancy(info, cc, smem_config=None): - """Returns a dictionary of {threadperblock: occupancy, factor} - - Only threadperblock of multiple of warpsize is used. - Only threadperblock of non-zero occupancy is returned. 
- """ - ret = {} - try: - limits = PHYSICAL_LIMITS[cc] - except KeyError: - raise ValueError("%s is not a supported compute capability" - % ".".join(str(c) for c in cc)) - if smem_config is None: - smem_config = limits['default_smem_config'] - warpsize = limits['thread_per_warp'] - max_thread = info.maxthreads - - for tpb in range(warpsize, max_thread + 1, warpsize): - result = compute_warp_occupancy(tpb=tpb, - reg=info.regs, - smem=info.shared, - smem_config=smem_config, - limits=limits) - if result[0]: - ret[tpb] = result - return ret - - -def compute_warp_occupancy(tpb, reg, smem, smem_config, limits): - assert limits['reg_alloc_gran'] == 'warp', \ - "assume warp register allocation granularity" - limit_block_per_sm = limits['block_per_sm'] - limit_warp_per_sm = limits['warp_per_sm'] - limit_thread_per_warp = limits['thread_per_warp'] - limit_reg_per_thread = limits['reg_per_thread'] - limit_total_regs = limits['registers'] - limit_total_smem = min(limits['smem_per_sm'], smem_config) - my_smem_alloc_unit = limits['smem_alloc_unit'] - reg_alloc_unit = limits['reg_alloc_unit'] - warp_alloc_gran = limits['warp_alloc_gran'] - - my_warp_per_block = ceil(tpb / limit_thread_per_warp) - my_reg_count = reg - my_reg_per_block = my_warp_per_block - my_smem = smem - my_smem_per_block = ceil(my_smem, my_smem_alloc_unit) - - # allocated resource - limit_blocks_due_to_warps = min(limit_block_per_sm, - floor( - limit_warp_per_sm / my_warp_per_block)) - - c39 = floor(limit_total_regs / ceil(my_reg_count * limit_thread_per_warp, - reg_alloc_unit), - warp_alloc_gran) - - limit_blocks_due_to_regs = (0 - if my_reg_count > limit_reg_per_thread - else (floor(c39 / my_reg_per_block) - if my_reg_count > 0 - else limit_block_per_sm)) - - limit_blocks_due_to_smem = (floor(limit_total_smem / - my_smem_per_block) - if my_smem_per_block > 0 - else limit_block_per_sm) - - # occupancy - active_block_per_sm = min(limit_blocks_due_to_smem, - limit_blocks_due_to_warps, - limit_blocks_due_to_regs) - - if active_block_per_sm == limit_blocks_due_to_warps: - factor = 'warps' - elif active_block_per_sm == limit_blocks_due_to_regs: - factor = 'regs' - else: - factor = 'smem' - - active_warps_per_sm = active_block_per_sm * my_warp_per_block - #active_threads_per_sm = active_warps_per_sm * limit_thread_per_warp - - occupancy = active_warps_per_sm / limit_warp_per_sm - return occupancy, factor - diff --git a/numba/cuda/tests/cudapy/test_deprecation.py b/numba/cuda/tests/cudapy/test_deprecation.py deleted file mode 100644 index f3888425020..00000000000 --- a/numba/cuda/tests/cudapy/test_deprecation.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import print_function, absolute_import - -import warnings -from contextlib import contextmanager - -from numba.tests.support import override_config, TestCase -from numba.cuda.testing import skip_on_cudasim -from numba import unittest_support as unittest -from numba import cuda, types -from numba.cuda.testing import SerialMixin - - -@skip_on_cudasim("Skipped on simulator") -class TestCudaDebugInfo(SerialMixin, TestCase): - """Tests features that will be deprecated - """ - @contextmanager - def assert_deprecation_warning(self): - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - yield w - - def test_autotune(self): - @cuda.jit("(int32[:],)") - def foo(xs): - xs[0] = 1 - - with self.assert_deprecation_warning() as w: - foo.autotune - assert len(w) == 1 - assert issubclass(w[-1].category, DeprecationWarning) - assert ".autotune" in str(w[-1].message) - - 
with self.assert_deprecation_warning() as w: - foo.occupancy - assert len(w) == 2 - assert issubclass(w[0].category, DeprecationWarning) - assert ".occupancy" in str(w[0].message) - assert issubclass(w[1].category, DeprecationWarning) - assert ".autotune" in str(w[1].message) - - -if __name__ == '__main__': - unittest.main() From c67e2b37c21522926c2cb5e9af5d64f6c1bf150f Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 10 Jan 2020 05:43:07 -0800 Subject: [PATCH 2/9] Tests: ensure all kernel calls are configured Any unconfigured calls were using the default of one thread and one block, so this is made explicit. --- .../cuda/tests/cudadrv/test_context_stack.py | 2 +- numba/cuda/tests/cudadrv/test_linker.py | 2 +- numba/cuda/tests/cudapy/test_autojit.py | 10 ++--- numba/cuda/tests/cudapy/test_boolean.py | 4 +- numba/cuda/tests/cudapy/test_casting.py | 2 +- numba/cuda/tests/cudapy/test_freevar.py | 2 +- numba/cuda/tests/cudapy/test_idiv.py | 4 +- numba/cuda/tests/cudapy/test_inspect.py | 4 +- numba/cuda/tests/cudapy/test_intrinsics.py | 40 +++++++++---------- numba/cuda/tests/cudapy/test_lang.py | 4 +- numba/cuda/tests/cudapy/test_localmem.py | 6 +-- numba/cuda/tests/cudapy/test_macro.py | 6 +-- numba/cuda/tests/cudapy/test_multithreads.py | 2 +- numba/cuda/tests/cudapy/test_print.py | 4 +- numba/cuda/tests/cudapy/test_record_dtype.py | 16 ++++---- .../test_retrieve_autoconverted_arrays.py | 16 ++++---- numba/cuda/tests/cudapy/test_serialize.py | 6 +-- numba/cuda/tests/cudapy/test_sm.py | 4 +- numba/cuda/tests/cudapy/test_userexc.py | 6 +-- 19 files changed, 70 insertions(+), 70 deletions(-) diff --git a/numba/cuda/tests/cudadrv/test_context_stack.py b/numba/cuda/tests/cudadrv/test_context_stack.py index e41ceb2d1f2..35b1e490783 100644 --- a/numba/cuda/tests/cudadrv/test_context_stack.py +++ b/numba/cuda/tests/cudadrv/test_context_stack.py @@ -118,7 +118,7 @@ def foo(a): a[i] = i a = cuda.device_array(10) - foo(a) + foo[1, 1](a) self.assertEqual(list(a.copy_to_host()), list(range(10))) self.test_attached_primary(do) diff --git a/numba/cuda/tests/cudadrv/test_linker.py b/numba/cuda/tests/cudadrv/test_linker.py index ffee8be5a5a..293dc466af2 100644 --- a/numba/cuda/tests/cudadrv/test_linker.py +++ b/numba/cuda/tests/cudadrv/test_linker.py @@ -82,7 +82,7 @@ def foo(x, y): A = np.array([123]) B = np.array([321]) - foo(A, B) + foo[1, 1](A, B) self.assertTrue(A[0] == 123 + 2 * 321) diff --git a/numba/cuda/tests/cudapy/test_autojit.py b/numba/cuda/tests/cudapy/test_autojit.py index ab349b52e63..ba65735108f 100644 --- a/numba/cuda/tests/cudapy/test_autojit.py +++ b/numba/cuda/tests/cudapy/test_autojit.py @@ -12,11 +12,11 @@ def test_autojit(self): def what(a, b, c): pass - what(np.empty(1), 1.0, 21) - what(np.empty(1), 1.0, 21) - what(np.empty(1), np.empty(1, dtype=np.int32), 21) - what(np.empty(1), np.empty(1, dtype=np.int32), 21) - what(np.empty(1), 1.0, 21) + what[1, 1](np.empty(1), 1.0, 21) + what[1, 1](np.empty(1), 1.0, 21) + what[1, 1](np.empty(1), np.empty(1, dtype=np.int32), 21) + what[1, 1](np.empty(1), np.empty(1, dtype=np.int32), 21) + what[1, 1](np.empty(1), 1.0, 21) self.assertTrue(len(what.definitions) == 2) diff --git a/numba/cuda/tests/cudapy/test_boolean.py b/numba/cuda/tests/cudapy/test_boolean.py index 3c788b02c18..ba81abbb6a1 100644 --- a/numba/cuda/tests/cudapy/test_boolean.py +++ b/numba/cuda/tests/cudapy/test_boolean.py @@ -15,9 +15,9 @@ class TestCudaBoolean(SerialMixin, unittest.TestCase): def test_boolean(self): func = cuda.jit('void(float64[:], bool_)')(boolean_func) 
A = np.array([0], dtype='float64') - func(A, True) + func[1, 1](A, True) self.assertTrue(A[0] == 123) - func(A, False) + func[1, 1](A, False) self.assertTrue(A[0] == 321) diff --git a/numba/cuda/tests/cudapy/test_casting.py b/numba/cuda/tests/cudapy/test_casting.py index 33e63966f27..f4b91526936 100644 --- a/numba/cuda/tests/cudapy/test_casting.py +++ b/numba/cuda/tests/cudapy/test_casting.py @@ -33,7 +33,7 @@ def wrapper_fn(arg): argarray = np.zeros(1, dtype=intype) argarray[0] = arg resarray = np.zeros(1, dtype=outtype) - cuda_wrapper_fn(argarray, resarray) + cuda_wrapper_fn[1, 1](argarray, resarray) return resarray[0] return wrapper_fn diff --git a/numba/cuda/tests/cudapy/test_freevar.py b/numba/cuda/tests/cudapy/test_freevar.py index 24c18b873b6..592da9aced0 100644 --- a/numba/cuda/tests/cudapy/test_freevar.py +++ b/numba/cuda/tests/cudapy/test_freevar.py @@ -23,7 +23,7 @@ def foo(A, i): A[i] = sdata[i] A = np.arange(2, dtype="float32") - foo(A, 0) + foo[1, 1](A, 0) if __name__ == '__main__': diff --git a/numba/cuda/tests/cudapy/test_idiv.py b/numba/cuda/tests/cudapy/test_idiv.py index 635a9e4c1d0..bf233d38285 100644 --- a/numba/cuda/tests/cudapy/test_idiv.py +++ b/numba/cuda/tests/cudapy/test_idiv.py @@ -15,7 +15,7 @@ def div(grid, l_x, l_y): x = np.ones((2, 2), dtype=np.float32) grid = cuda.to_device(x) - div(grid, 2, 2) + div[1, 1](grid, 2, 2) y = grid.copy_to_host() self.assertTrue(np.all(y == 0.5)) @@ -30,7 +30,7 @@ def div_double(grid, l_x, l_y): x = np.ones((2, 2), dtype=np.float64) grid = cuda.to_device(x) - div_double(grid, 2, 2) + div_double[1, 1](grid, 2, 2) y = grid.copy_to_host() self.assertTrue(np.all(y == 0.5)) diff --git a/numba/cuda/tests/cudapy/test_inspect.py b/numba/cuda/tests/cudapy/test_inspect.py index 9ecacd06381..c57198b8a04 100644 --- a/numba/cuda/tests/cudapy/test_inspect.py +++ b/numba/cuda/tests/cudapy/test_inspect.py @@ -39,8 +39,8 @@ def test_polytyped(self): def foo(x, y): pass - foo(1, 1) - foo(1.2, 2.4) + foo[1, 1](1, 1) + foo[1, 1](1.2, 2.4) file = StringIO() foo.inspect_types(file=file) diff --git a/numba/cuda/tests/cudapy/test_intrinsics.py b/numba/cuda/tests/cudapy/test_intrinsics.py index 3dab9f3d034..432e3c6daa8 100644 --- a/numba/cuda/tests/cudapy/test_intrinsics.py +++ b/numba/cuda/tests/cudapy/test_intrinsics.py @@ -117,7 +117,7 @@ class TestCudaIntrinsic(SerialMixin, unittest.TestCase): def test_simple_threadidx(self): compiled = cuda.jit("void(int32[:])")(simple_threadidx) ary = np.ones(1, dtype=np.int32) - compiled(ary) + compiled[1, 1](ary) self.assertTrue(ary[0] == 0) def test_fill_threadidx(self): @@ -264,44 +264,44 @@ def foo(out): def test_popc_u4(self): compiled = cuda.jit("void(int32[:], uint32)")(simple_popc) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0xF0) + compiled[1, 1](ary, 0xF0) self.assertEquals(ary[0], 4) def test_popc_u8(self): compiled = cuda.jit("void(int32[:], uint64)")(simple_popc) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0xF00000000000) + compiled[1, 1](ary, 0xF00000000000) self.assertEquals(ary[0], 4) def test_fma_f4(self): compiled = cuda.jit("void(f4[:], f4, f4, f4)")(simple_fma) ary = np.zeros(1, dtype=np.float32) - compiled(ary, 2., 3., 4.) + compiled[1, 1](ary, 2., 3., 4.) np.testing.assert_allclose(ary[0], 2 * 3 + 4) def test_fma_f8(self): compiled = cuda.jit("void(f8[:], f8, f8, f8)")(simple_fma) ary = np.zeros(1, dtype=np.float64) - compiled(ary, 2., 3., 4.) + compiled[1, 1](ary, 2., 3., 4.) 
np.testing.assert_allclose(ary[0], 2 * 3 + 4) def test_brev_u4(self): compiled = cuda.jit("void(uint32[:], uint32)")(simple_brev) ary = np.zeros(1, dtype=np.uint32) - compiled(ary, 0x000030F0) + compiled[1, 1](ary, 0x000030F0) self.assertEquals(ary[0], 0x0F0C0000) @skip_on_cudasim('only get given a Python "int", assumes 32 bits') def test_brev_u8(self): compiled = cuda.jit("void(uint64[:], uint64)")(simple_brev) ary = np.zeros(1, dtype=np.uint64) - compiled(ary, 0x000030F0000030F0) + compiled[1, 1](ary, 0x000030F0000030F0) self.assertEquals(ary[0], 0x0F0C00000F0C0000) def test_clz_i4(self): compiled = cuda.jit("void(int32[:], int32)")(simple_clz) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0x00100000) + compiled[1, 1](ary, 0x00100000) self.assertEquals(ary[0], 11) def test_clz_u4(self): @@ -314,57 +314,57 @@ def test_clz_u4(self): """ compiled = cuda.jit("void(int32[:], uint32)")(simple_clz) ary = np.zeros(1, dtype=np.uint32) - compiled(ary, 0x00100000) + compiled[1, 1](ary, 0x00100000) self.assertEquals(ary[0], 11) def test_clz_i4_1s(self): compiled = cuda.jit("void(int32[:], int32)")(simple_clz) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0xFFFFFFFF) + compiled[1, 1](ary, 0xFFFFFFFF) self.assertEquals(ary[0], 0) def test_clz_i4_0s(self): compiled = cuda.jit("void(int32[:], int32)")(simple_clz) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0x0) + compiled[1, 1](ary, 0x0) self.assertEquals(ary[0], 32, "CUDA semantics") @skip_on_cudasim('only get given a Python "int", assumes 32 bits') def test_clz_i8(self): compiled = cuda.jit("void(int32[:], int64)")(simple_clz) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0x000000000010000) + compiled[1, 1](ary, 0x000000000010000) self.assertEquals(ary[0], 47) def test_ffs_i4(self): compiled = cuda.jit("void(int32[:], int32)")(simple_ffs) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0x00100000) + compiled[1, 1](ary, 0x00100000) self.assertEquals(ary[0], 20) def test_ffs_u4(self): compiled = cuda.jit("void(int32[:], uint32)")(simple_ffs) ary = np.zeros(1, dtype=np.uint32) - compiled(ary, 0x00100000) + compiled[1, 1](ary, 0x00100000) self.assertEquals(ary[0], 20) def test_ffs_i4_1s(self): compiled = cuda.jit("void(int32[:], int32)")(simple_ffs) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0xFFFFFFFF) + compiled[1, 1](ary, 0xFFFFFFFF) self.assertEquals(ary[0], 0) def test_ffs_i4_0s(self): compiled = cuda.jit("void(int32[:], int32)")(simple_ffs) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0x0) + compiled[1, 1](ary, 0x0) self.assertEquals(ary[0], 32, "CUDA semantics") @skip_on_cudasim('only get given a Python "int", assumes 32 bits') def test_ffs_i8(self): compiled = cuda.jit("void(int32[:], int64)")(simple_ffs) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0x000000000010000) + compiled[1, 1](ary, 0x000000000010000) self.assertEquals(ary[0], 16) def test_simple_laneid(self): @@ -378,7 +378,7 @@ def test_simple_laneid(self): def test_simple_warpsize(self): compiled = cuda.jit("void(int32[:])")(simple_warpsize) ary = np.zeros(1, dtype=np.int32) - compiled(ary) + compiled[1, 1](ary) self.assertEquals(ary[0], 32, "CUDA semantics") @unittest.skipUnless(IS_PY3, "round() returns float on Py2") @@ -387,7 +387,7 @@ def test_round_f4(self): ary = np.zeros(1, dtype=np.int32) for i in [-3.0, -2.5, -2.25, -1.5, 1.5, 2.25, 2.5, 2.75]: - compiled(ary, i) + compiled[1, 1](ary, i) self.assertEquals(ary[0], round(i)) @unittest.skipUnless(IS_PY3, "round() returns float on Py2") @@ -396,7 +396,7 @@ def test_round_f8(self): ary = 
np.zeros(1, dtype=np.int32) for i in [-3.0, -2.5, -2.25, -1.5, 1.5, 2.25, 2.5, 2.75]: - compiled(ary, i) + compiled[1, 1](ary, i) self.assertEquals(ary[0], round(i)) diff --git a/numba/cuda/tests/cudapy/test_lang.py b/numba/cuda/tests/cudapy/test_lang.py index f0133a09226..0166df6e23a 100644 --- a/numba/cuda/tests/cudapy/test_lang.py +++ b/numba/cuda/tests/cudapy/test_lang.py @@ -19,7 +19,7 @@ def foo(a): a[i] = v a = np.zeros(len(tup)) - foo(a) + foo[1, 1](a) self.assertTrue(np.all(a == tup)) def test_zip(self): @@ -34,7 +34,7 @@ def foo(a): a[0] = c a = np.zeros(1) - foo(a) + foo[1, 1](a) b = np.array(t1) c = np.array(t2) self.assertTrue(np.all(a == (b + c).sum())) diff --git a/numba/cuda/tests/cudapy/test_localmem.py b/numba/cuda/tests/cudapy/test_localmem.py index 605beae5676..f50ecf9f9db 100644 --- a/numba/cuda/tests/cudapy/test_localmem.py +++ b/numba/cuda/tests/cudapy/test_localmem.py @@ -36,7 +36,7 @@ def test_local_array(self): self.assertTrue('.local' in jculocal.ptx) A = np.arange(1000, dtype='int32') B = np.zeros_like(A) - jculocal(A, B) + jculocal[1, 1](A, B) self.assertTrue(np.all(A == B)) def test_local_array_1_tuple(self): @@ -47,7 +47,7 @@ def test_local_array_1_tuple(self): # may reduce it to registers. A = np.arange(5, dtype='int32') B = np.zeros_like(A) - jculocal(A, B) + jculocal[1, 1](A, B) self.assertTrue(np.all(A == B)) def test_local_array_complex(self): @@ -57,7 +57,7 @@ def test_local_array_complex(self): # self.assertTrue('.local' in jculocalcomplex.ptx) A = (np.arange(100, dtype='complex128') - 1) / 2j B = np.zeros_like(A) - jculocalcomplex(A, B) + jculocalcomplex[1, 1](A, B) self.assertTrue(np.all(A == B)) diff --git a/numba/cuda/tests/cudapy/test_macro.py b/numba/cuda/tests/cudapy/test_macro.py index b6644c4feb8..2caf65f06c0 100644 --- a/numba/cuda/tests/cudapy/test_macro.py +++ b/numba/cuda/tests/cudapy/test_macro.py @@ -57,11 +57,11 @@ def getarg2(self): def test_global_constants(self): udt = cuda.jit((float32[:],))(udt_global_constants) - udt(self.getarg()) + udt[1, 1](self.getarg()) def test_global_build_tuple(self): udt = cuda.jit((float32[:, :],))(udt_global_build_tuple) - udt(self.getarg2()) + udt[1, 1](self.getarg2()) @skip_on_cudasim('Simulator does not perform macro expansion') def test_global_build_list(self): @@ -73,7 +73,7 @@ def test_global_build_list(self): def test_global_constant_tuple(self): udt = cuda.jit((float32[:, :],))(udt_global_constant_tuple) - udt(self.getarg2()) + udt[1, 1](self.getarg2()) @skip_on_cudasim("Can't check for constants in simulator") def test_invalid_1(self): diff --git a/numba/cuda/tests/cudapy/test_multithreads.py b/numba/cuda/tests/cudapy/test_multithreads.py index 7337241b259..6d0b1c6df15 100644 --- a/numba/cuda/tests/cudapy/test_multithreads.py +++ b/numba/cuda/tests/cudapy/test_multithreads.py @@ -23,7 +23,7 @@ def foo(x): x[0] += 1 def use_foo(x): - foo(x) + foo[1, 1](x) return x arrays = [np.arange(10) for i in range(10)] diff --git a/numba/cuda/tests/cudapy/test_print.py b/numba/cuda/tests/cudapy/test_print.py index 59513d127a2..4fe1b2ba859 100644 --- a/numba/cuda/tests/cudapy/test_print.py +++ b/numba/cuda/tests/cudapy/test_print.py @@ -43,7 +43,7 @@ def test_cuhello(self): def test_printfloat(self): jprintfloat = cuda.jit('void()', debug=False)(printfloat) with captured_cuda_stdout() as stdout: - jprintfloat() + jprintfloat[1, 1]() # CUDA and the simulator use different formats for float formatting self.assertIn(stdout.getvalue(), ["0 23 34.750000 321\n", "0 23 34.75 321\n"]) @@ -51,7 +51,7 @@ def 
test_printfloat(self): def test_printempty(self): cufunc = cuda.jit('void()', debug=False)(printempty) with captured_cuda_stdout() as stdout: - cufunc() + cufunc[1, 1]() self.assertEqual(stdout.getvalue(), "\n") def test_string(self): diff --git a/numba/cuda/tests/cudapy/test_record_dtype.py b/numba/cuda/tests/cudapy/test_record_dtype.py index b07770dd4af..4781fcf819a 100644 --- a/numba/cuda/tests/cudapy/test_record_dtype.py +++ b/numba/cuda/tests/cudapy/test_record_dtype.py @@ -137,7 +137,7 @@ def _test_set_equal(self, pyfunc, value, valuetype): else: expect = got.copy().view(np.recarray) - cfunc(got, i, value) + cfunc[1, 1](got, i, value) pyfunc(expect, i, value) # Match the entire array to ensure no memory corruption @@ -169,7 +169,7 @@ def test_set_record(self): pyfunc(expect, i, j) got = self.sample1d.copy() - cfunc(got, i, j) + cfunc[1, 1](got, i, j) # Match the entire array to ensure no memory corruption self.assertEqual(expect[i], expect[j]) @@ -180,7 +180,7 @@ def _test_rec_set(self, v, pyfunc, f): rec = self.sample1d.copy()[0] nbrecord = numpy_support.from_dtype(recordtype) cfunc = self.get_cfunc(pyfunc, (nbrecord,)) - cfunc(rec, v) + cfunc[1, 1](rec, v) np.testing.assert_equal(rec[f], v) def test_rec_set_a(self): @@ -198,7 +198,7 @@ def _test_rec_read(self, v, pyfunc, f): arr = np.zeros(1, v.dtype) nbrecord = numpy_support.from_dtype(recordtype) cfunc = self.get_cfunc(pyfunc, (nbrecord,)) - cfunc(rec, arr) + cfunc[1, 1](rec, arr) np.testing.assert_equal(arr[0], v) def test_rec_read_a(self): @@ -218,7 +218,7 @@ def test_record_write_1d_array(self): nbrecord = numpy_support.from_dtype(recordwitharray) cfunc = self.get_cfunc(record_write_array, (nbrecord,)) - cfunc(rec) + cfunc[1, 1](rec) expected = self.samplerec1darr.copy() expected['g'] = 2 expected['h'][0] = 3.0 @@ -233,7 +233,7 @@ def test_record_write_2d_array(self): rec = self.samplerec2darr.copy() nbrecord = numpy_support.from_dtype(recordwith2darray) cfunc = self.get_cfunc(record_write_2d_array, (nbrecord,)) - cfunc(rec) + cfunc[1, 1](rec) expected = self.samplerec2darr.copy() expected['i'] = 3 @@ -252,7 +252,7 @@ def test_record_read_1d_array(self): nbrecord = numpy_support.from_dtype(recordwitharray) cfunc = self.get_cfunc(record_read_array, (nbrecord,)) arr = np.zeros(2, dtype=rec['h'].dtype) - cfunc(rec, arr) + cfunc[1, 1](rec, arr) np.testing.assert_equal(rec['h'], arr) @@ -268,7 +268,7 @@ def test_record_read_2d_array(self): nbrecord = numpy_support.from_dtype(recordwith2darray) cfunc = self.get_cfunc(record_read_2d_array, (nbrecord,)) arr = np.zeros((3,2), dtype=rec['j'].dtype) - cfunc(rec, arr) + cfunc[1, 1](rec, arr) np.testing.assert_equal(rec['j'], arr) diff --git a/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py b/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py index 81a98279a9b..1432b2dc970 100644 --- a/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +++ b/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py @@ -41,42 +41,42 @@ def setUp(self): def test_array_inout(self): host_arr = np.zeros(1, dtype=np.int64) - self.set_array_to_three(cuda.InOut(host_arr)) + self.set_array_to_three[1, 1](cuda.InOut(host_arr)) self.assertEqual(3, host_arr[0]) def test_array_in(self): host_arr = np.zeros(1, dtype=np.int64) - self.set_array_to_three(cuda.In(host_arr)) + self.set_array_to_three[1, 1](cuda.In(host_arr)) self.assertEqual(0, host_arr[0]) def test_array_in_from_config(self): host_arr = np.zeros(1, dtype=np.int64) - self.set_array_to_three_nocopy(host_arr) + 
self.set_array_to_three_nocopy[1, 1](host_arr) self.assertEqual(0, host_arr[0]) def test_array_default(self): host_arr = np.zeros(1, dtype=np.int64) - self.set_array_to_three(host_arr) + self.set_array_to_three[1, 1](host_arr) self.assertEqual(3, host_arr[0]) def test_record_in(self): host_rec = np.zeros(1, dtype=recordtype) - self.set_record_to_three(cuda.In(host_rec)) + self.set_record_to_three[1, 1](cuda.In(host_rec)) self.assertEqual(0, host_rec[0]['b']) def test_record_inout(self): host_rec = np.zeros(1, dtype=recordtype) - self.set_record_to_three(cuda.InOut(host_rec)) + self.set_record_to_three[1, 1](cuda.InOut(host_rec)) self.assertEqual(3, host_rec[0]['b']) def test_record_default(self): host_rec = np.zeros(1, dtype=recordtype) - self.set_record_to_three(host_rec) + self.set_record_to_three[1, 1](host_rec) self.assertEqual(3, host_rec[0]['b']) def test_record_in_from_config(self): host_rec = np.zeros(1, dtype=recordtype) - self.set_record_to_three_nocopy(host_rec) + self.set_record_to_three_nocopy[1, 1](host_rec) self.assertEqual(0, host_rec[0]['b']) diff --git a/numba/cuda/tests/cudapy/test_serialize.py b/numba/cuda/tests/cudapy/test_serialize.py index c2289e32724..6f1f59d4e3e 100644 --- a/numba/cuda/tests/cudapy/test_serialize.py +++ b/numba/cuda/tests/cudapy/test_serialize.py @@ -11,13 +11,13 @@ class TestPickle(SerialMixin, unittest.TestCase): def check_call(self, callee): arr = np.array([100]) - expected = callee(arr) + expected = callee[1, 1](arr) # serialize and rebuild foo1 = pickle.loads(pickle.dumps(callee)) del callee # call rebuild function - got1 = foo1(arr) + got1 = foo1[1, 1](arr) np.testing.assert_equal(got1, expected) del got1 @@ -25,7 +25,7 @@ def check_call(self, callee): foo2 = pickle.loads(pickle.dumps(foo1)) del foo1 # call rebuild function - got2 = foo2(arr) + got2 = foo2[1, 1](arr) np.testing.assert_equal(got2, expected) del got2 diff --git a/numba/cuda/tests/cudapy/test_sm.py b/numba/cuda/tests/cudapy/test_sm.py index 4e6e1ea8fa1..0d188d9e8c1 100644 --- a/numba/cuda/tests/cudapy/test_sm.py +++ b/numba/cuda/tests/cudapy/test_sm.py @@ -21,7 +21,7 @@ def outer(): outer_arr = cuda.shared.array(1, dtype=int32) inner() - outer() + outer[1, 1]() def _check_shared_array_size(self, shape, expected): @cuda.jit @@ -30,7 +30,7 @@ def s(a): a[0] = arr.size result = np.zeros(1, dtype=np.int32) - s(result) + s[1, 1](result) self.assertEqual(result[0], expected) def test_issue_1051_shared_size_broken_1d(self): diff --git a/numba/cuda/tests/cudapy/test_userexc.py b/numba/cuda/tests/cudapy/test_userexc.py index 073a5dba8bd..dcb05961431 100644 --- a/numba/cuda/tests/cudapy/test_userexc.py +++ b/numba/cuda/tests/cudapy/test_userexc.py @@ -23,14 +23,14 @@ def test_exc(x): elif x == 2: raise MyError("foo") - test_exc(0) # no raise + test_exc[1, 1](0) # no raise with self.assertRaises(MyError) as cm: - test_exc(1) + test_exc[1, 1](1) if not config.ENABLE_CUDASIM: self.assertRegexpMatches(str(cm.exception), regex_pattern) self.assertIn("tid=[0, 0, 0] ctaid=[0, 0, 0]", str(cm.exception)) with self.assertRaises(MyError) as cm: - test_exc(2) + test_exc[1, 1](2) if not config.ENABLE_CUDASIM: self.assertRegexpMatches(str(cm.exception), regex_pattern) self.assertRegexpMatches(str(cm.exception), regex_pattern) From b74e4099ec366d45fccdb51569e2702ec8c2cf88 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 20 Aug 2019 09:36:08 +0100 Subject: [PATCH 3/9] Prevent CUDA kernel launch without a specified launch config. 
This patch prevents the launch of a CUDA kernel with no configuration as this causes confusion for (especially) first time users. Current behaviour is that if no launch config is specified then a default everything-set-to-1 config is used, new behaviour is that if no launch config is specified then an exception is raised pointing users to the syntax and documentation. --- numba/cuda/compiler.py | 4 ++-- numba/cuda/errors.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index 19d6b0ccea6..3bef3b4bcf2 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -301,8 +301,8 @@ class CUDAKernelBase(object): """ def __init__(self): - self.griddim = (1, 1) - self.blockdim = (1, 1, 1) + self.griddim = None + self.blockdim = None self.sharedmem = 0 self.stream = 0 diff --git a/numba/cuda/errors.py b/numba/cuda/errors.py index 79da66f91d3..c6a28e77de5 100644 --- a/numba/cuda/errors.py +++ b/numba/cuda/errors.py @@ -13,6 +13,16 @@ def __init__(self, msg, tid=None, ctaid=None): msg = t % (self.tid, self.ctaid, self.msg) super(KernelRuntimeError, self).__init__(msg) +_launch_help_url = ("https://numba.pydata.org/numba-doc/" + "latest/cuda/kernels.html#kernel-invocation") +_missing_launch_config_msg = """ +Kernel launch configuration was not specified. Use the syntax: + +kernel_function[blockspergrid, threadsperblock](arg0, arg1, ..., argn) + +See {} for help. + +""".format(_launch_help_url) def normalize_kernel_dimensions(griddim, blockdim): """ @@ -35,6 +45,9 @@ def check_dim(dim, name): dim.append(1) return dim + if None in (griddim, blockdim): + raise ValueError(_missing_launch_config_msg) + griddim = check_dim(griddim, 'griddim') blockdim = check_dim(blockdim, 'blockdim') From ff4f86c8c7c39a3e598f32a1d8083447b2650e6f Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 10 Jan 2020 05:41:13 -0800 Subject: [PATCH 4/9] Normalize kernel dimensions in CUDAKernel.__call__ The `normalize_kernel_dimensions` function validates that the kernel has been configured, so it must be called in a CUDAKernel call (in addition to an AutoJitCUDAKernel call) to ensure that the kernel has been configured. 
--- numba/cuda/compiler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index 3bef3b4bcf2..ed44fc32730 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -519,9 +519,10 @@ def __reduce__(self): def __call__(self, *args, **kwargs): assert not kwargs + griddim, blockdim = normalize_kernel_dimensions(self.griddim, self.blockdim) self._kernel_call(args=args, - griddim=self.griddim, - blockdim=self.blockdim, + griddim=griddim, + blockdim=blockdim, stream=self.stream, sharedmem=self.sharedmem) From dc120bce9f46bf3965dad141bf87febff83af7ff Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 10 Jan 2020 06:55:34 -0800 Subject: [PATCH 5/9] Raise ValueError in simulator if kernel not configured --- numba/cuda/simulator/kernel.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/numba/cuda/simulator/kernel.py b/numba/cuda/simulator/kernel.py index aa44f675a07..b737630eef2 100644 --- a/numba/cuda/simulator/kernel.py +++ b/numba/cuda/simulator/kernel.py @@ -53,16 +53,23 @@ def __init__(self, fn, device, fastmath=False, extensions=[]): self._device = device self._fastmath = fastmath self.extensions = list(extensions) # defensive copy - # Initial configuration: 1 block, 1 thread, stream 0, no dynamic shared + # Initial configuration: grid unconfigured, stream 0, no dynamic shared # memory. - self[1, 1, 0, 0] + self.grid_dim = None + self.block_dim = None + self.stream = 0 + self.dynshared_size = 0 def __call__(self, *args): if self._device: with swapped_cuda_module(self.fn, _get_kernel_context()): return self.fn(*args) - fake_cuda_module = FakeCUDAModule(self.grid_dim, self.block_dim, + # Ensure we've been given a valid grid configuration + grid_dim, block_dim = normalize_kernel_dimensions(self.grid_dim, + self.block_dim) + + fake_cuda_module = FakeCUDAModule(grid_dim, block_dim, self.dynshared_size) with _push_kernel_context(fake_cuda_module): # fake_args substitutes all numpy arrays for FakeCUDAArrays @@ -90,14 +97,13 @@ def fake_arg(arg): fake_args = [fake_arg(arg) for arg in args] with swapped_cuda_module(self.fn, fake_cuda_module): # Execute one block at a time - for grid_point in np.ndindex(*self.grid_dim): - bm = BlockManager(self.fn, self.grid_dim, self.block_dim) + for grid_point in np.ndindex(*grid_dim): + bm = BlockManager(self.fn, grid_dim, block_dim) bm.run(grid_point, *fake_args) for wb in retr: wb() - def __getitem__(self, configuration): self.grid_dim, self.block_dim = \ normalize_kernel_dimensions(*configuration[:2]) From ad75d115a7783c8dec1536dd38f4b2e5f6a301c5 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 10 Jan 2020 06:20:17 -0800 Subject: [PATCH 6/9] Add tests for unconfigured kernel error --- numba/cuda/tests/cudapy/test_errors.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/numba/cuda/tests/cudapy/test_errors.py b/numba/cuda/tests/cudapy/test_errors.py index 3028f0e8503..24f80a00336 100644 --- a/numba/cuda/tests/cudapy/test_errors.py +++ b/numba/cuda/tests/cudapy/test_errors.py @@ -41,6 +41,20 @@ def test_non_integral_dims(self): self.assertIn("blockdim must be a sequence of integers, got [3.0]", str(raises.exception)) + def _test_unconfigured(self, kernfunc): + with self.assertRaises(ValueError) as raises: + kernfunc(0) + self.assertIn("launch configuration was not specified", + str(raises.exception)) + + def test_unconfigured_cudakernel(self): + kernfunc = cuda.jit("void(int32)")(noop) + 
self._test_unconfigured(kernfunc) + + def test_unconfigured_autojitcudakernel(self): + kernfunc = cuda.jit(noop) + self._test_unconfigured(kernfunc) + if __name__ == '__main__': unittest.main() From 23334fd4d2c6ab9e71563253075269893bb5b949 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 10 Jan 2020 08:02:51 -0800 Subject: [PATCH 7/9] Fix flake8 errors in errors.py --- numba/cuda/errors.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numba/cuda/errors.py b/numba/cuda/errors.py index c6a28e77de5..42586a43d8f 100644 --- a/numba/cuda/errors.py +++ b/numba/cuda/errors.py @@ -13,6 +13,7 @@ def __init__(self, msg, tid=None, ctaid=None): msg = t % (self.tid, self.ctaid, self.msg) super(KernelRuntimeError, self).__init__(msg) + _launch_help_url = ("https://numba.pydata.org/numba-doc/" "latest/cuda/kernels.html#kernel-invocation") _missing_launch_config_msg = """ @@ -24,6 +25,7 @@ def __init__(self, msg, tid=None, ctaid=None): """.format(_launch_help_url) + def normalize_kernel_dimensions(griddim, blockdim): """ Normalize and validate the user-supplied kernel dimensions. From 773b6690832d709984ad3c557687a2efd21de177 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Mon, 13 Jan 2020 01:04:30 -0800 Subject: [PATCH 8/9] Add SerialMixin to TestJitErrors Now that the TestJitErrors class contains test cases that call CUDA functions, it requires the SerialMixin, otherwise it will be executed in a child process after the parent already used CUDA (which is not supported) when testing in parallel. --- contrib/valgrind-numba.supp | 21 +++++++++++++++++++++ numba/cuda/tests/cudapy/test_errors.py | 4 ++-- 2 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 contrib/valgrind-numba.supp diff --git a/contrib/valgrind-numba.supp b/contrib/valgrind-numba.supp new file mode 100644 index 00000000000..26271eb4ef0 --- /dev/null +++ b/contrib/valgrind-numba.supp @@ -0,0 +1,21 @@ +{ + + Memcheck:Cond + fun:_ZN4llvm3sys14getHostCPUNameEv + fun:LLVMPY_GetHostCPUName +} + +{ + + Memcheck:Value8 + fun:_ZN4llvm3sys14getHostCPUNameEv + fun:LLVMPY_GetHostCPUName +} + +{ + + Memcheck:Cond + fun:__intel_sse2_strrchr + fun:_ZN67_INTERNAL_45_______src_thirdparty_tbb_omp_dynamic_link_cpp_c306cade5__kmp12init_dl_dataEv + fun:__sti__$E +} diff --git a/numba/cuda/tests/cudapy/test_errors.py b/numba/cuda/tests/cudapy/test_errors.py index 24f80a00336..135a4baa443 100644 --- a/numba/cuda/tests/cudapy/test_errors.py +++ b/numba/cuda/tests/cudapy/test_errors.py @@ -3,14 +3,14 @@ import numpy as np from numba import cuda -from numba.cuda.testing import unittest +from numba.cuda.testing import unittest, SerialMixin def noop(x): pass -class TestJitErrors(unittest.TestCase): +class TestJitErrors(SerialMixin, unittest.TestCase): """ Test compile-time errors with @jit. 
""" From 252f2266fa765a559428cc385eb9b242f78596b8 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 21 Jan 2020 07:54:40 -0800 Subject: [PATCH 9/9] Remove accidentally-committed suppressions file --- contrib/valgrind-numba.supp | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 contrib/valgrind-numba.supp diff --git a/contrib/valgrind-numba.supp b/contrib/valgrind-numba.supp deleted file mode 100644 index 26271eb4ef0..00000000000 --- a/contrib/valgrind-numba.supp +++ /dev/null @@ -1,21 +0,0 @@ -{ - - Memcheck:Cond - fun:_ZN4llvm3sys14getHostCPUNameEv - fun:LLVMPY_GetHostCPUName -} - -{ - - Memcheck:Value8 - fun:_ZN4llvm3sys14getHostCPUNameEv - fun:LLVMPY_GetHostCPUName -} - -{ - - Memcheck:Cond - fun:__intel_sse2_strrchr - fun:_ZN67_INTERNAL_45_______src_thirdparty_tbb_omp_dynamic_link_cpp_c306cade5__kmp12init_dl_dataEv - fun:__sti__$E -}