From 7e169e1f9dc796f0c89f78e2a510af245412871d Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 10 Jan 2020 05:51:34 -0800 Subject: [PATCH 1/9] Remove the autotuner The autotuner has been deprecated since Numba 0.40. --- numba/cuda/compiler.py | 48 +-- numba/cuda/cudadrv/autotune.py | 343 -------------------- numba/cuda/tests/cudapy/test_deprecation.py | 44 --- 3 files changed, 2 insertions(+), 433 deletions(-) delete mode 100644 numba/cuda/cudadrv/autotune.py delete mode 100644 numba/cuda/tests/cudapy/test_deprecation.py diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index af8e9ad1c71..19d6b0ccea6 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -16,7 +16,6 @@ from numba import funcdesc, typing, utils, serialize from numba.compiler_lock import global_compiler_lock -from .cudadrv.autotune import AutoTuner from .cudadrv.devices import get_context from .cudadrv import nvvm, devicearray, driver from .errors import normalize_kernel_dimensions @@ -284,7 +283,7 @@ def _compute_thread_per_block(self, kernel): # Prefer user-specified config if tpb != 0: return tpb - # Else, ask the driver to give a good cofnig + # Else, ask the driver to give a good config else: ctx = get_context() kwargs = dict( @@ -293,24 +292,9 @@ def _compute_thread_per_block(self, kernel): memsize=self.sharedmem, blocksizelimit=1024, ) - try: - # Raises from the driver if the feature is unavailable - _, tpb = ctx.get_max_potential_block_size(**kwargs) - except AttributeError: - # Fallback to table-based approach. - tpb = self._fallback_autotune_best(kernel) - raise + _, tpb = ctx.get_max_potential_block_size(**kwargs) return tpb - def _fallback_autotune_best(self, kernel): - try: - tpb = kernel.autotune.best() - except ValueError: - warnings.warn('Could not autotune, using default tpb of 128') - tpb = 128 - - return tpb - class CUDAKernelBase(object): """Define interface for configurable kernels @@ -719,34 +703,6 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs): else: raise NotImplementedError(ty, val) - @property - def autotune(self): - """Return the autotuner object associated with this kernel.""" - warnings.warn(_deprec_warn_msg.format('autotune'), DeprecationWarning) - has_autotune = hasattr(self, '_autotune') - if has_autotune and self._autotune.dynsmem == self.sharedmem: - return self._autotune - else: - # Get CUDA Function - cufunc = self._func.get() - at = AutoTuner(info=cufunc.attrs, cc=cufunc.device.compute_capability) - self._autotune = at - return self._autotune - - @property - def occupancy(self): - """Occupancy is the ratio of the number of active warps per multiprocessor to the maximum - number of warps that can be active on the multiprocessor at once. 
- Calculate the theoretical occupancy of the kernel given the - current configuration.""" - warnings.warn(_deprec_warn_msg.format('occupancy'), DeprecationWarning) - thread_per_block = reduce(operator.mul, self.blockdim, 1) - return self.autotune.closest(thread_per_block) - - -_deprec_warn_msg = ("The .{} attribute is is deprecated and will be " - "removed in a future release") - class AutoJitCUDAKernel(CUDAKernelBase): ''' diff --git a/numba/cuda/cudadrv/autotune.py b/numba/cuda/cudadrv/autotune.py deleted file mode 100644 index 2dae5293f24..00000000000 --- a/numba/cuda/cudadrv/autotune.py +++ /dev/null @@ -1,343 +0,0 @@ -""" -- Parse jit compile info -- Compute warp occupancy histogram -""" -from __future__ import division, absolute_import, print_function -import math -import re - -SMEM0K = 0 -SMEM8K = 8 * 2 ** 10 -SMEM16K = 16 * 2 ** 10 -SMEM48K = 48 * 2 ** 10 -SMEM64K = 64 * 2 ** 10 -SMEM80K = 80 * 2 ** 10 -SMEM96K = 96 * 2 ** 10 -SMEM112K = 112 * 2 ** 10 - -#------------------------------------------------------------------------------ -# autotuning - - -class OccupancyThreadKey(object): - def __init__(self, item): - self.occupancy, self.threads = item - self.comparison = self.occupancy, 1 / self.threads - - def __lt__(self, other): - return self.comparison < other.comparison - - def __eq__(self, other): - return self.comparison == other.comparison - - def __ne__(self, other): - return self.comparison != other.comparison - - def __gt__(self, other): - return self.comparison > other.comparison - - def __le__(self, other): - return self.comparison <= other.comparison - - def __ge__(self, other): - return self.comparison >= other.comparison - - -class AutoTuner(object): - """Autotune a kernel based upon the theoretical occupancy. - """ - def __init__(self, cc, info, smem_config=None, dynsmem=0): - self.cc = cc - self.dynsmem = dynsmem - self._table = warp_occupancy(info=info, cc=cc) - self._by_occupancy = list(reversed(sorted(((occup, tpb) - for tpb, (occup, factor) - in self.table.items()), - key=OccupancyThreadKey))) - - @property - def table(self): - """A dict with thread-per-block as keys and tuple-2 of - (occupency, limiting factor) as values. - """ - return self._table - - @property - def by_occupancy(self): - """A list of tuple-2 of (occupancy, thread-per-block) sorted in - descending. - - The first item has the highest occupancy and the lowest number of - thread-per-block. - """ - return self._by_occupancy - - def best(self): - return self.max_occupancy_min_blocks() - - def max_occupancy_min_blocks(self): - """Returns the thread-per-block that optimizes for - maximum occupancy and minimum blocks. - - Maximum blocks allows for the best utilization of parallel execution - because each block can be executed concurrently on different SM. - """ - return self.by_occupancy[0][1] - - def closest(self, tpb): - """Find the occupancy of the closest tpb - """ - # round to the nearest multiple of warpsize - warpsize = PHYSICAL_LIMITS[self.cc]['thread_per_warp'] - tpb = ceil(tpb, warpsize) - # search - return self.table.get(tpb, [0])[0] - - - def best_within(self, mintpb, maxtpb): - """Returns the best tpb in the given range inclusively. - """ - warpsize = PHYSICAL_LIMITS[self.cc]['thread_per_warp'] - mintpb = int(ceil(mintpb, warpsize)) - maxtpb = int(floor(maxtpb, warpsize)) - return self.prefer(*range(mintpb, maxtpb + 1, warpsize)) - - def prefer(self, *tpblist): - """Prefer the thread-per-block with the highest warp occupancy - and the lowest thread-per-block. 
- - May return None if all threads-per-blocks are invalid - """ - bin = [] - for tpb in tpblist: - occ = self.closest(tpb) - if occ > 0: - bin.append((occ, tpb)) - if bin: - return sorted(bin, key=OccupancyThreadKey)[-1][1] - - -#------------------------------------------------------------------------------ -# warp occupancy calculator - -# Reference: NVIDIA CUDA Toolkit v10.2.89 Programming Guide, Appendix H. -# URL: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities - -LIMITS_CC_20 = { - 'thread_per_warp': 32, - 'warp_per_sm': 48, - 'thread_per_sm': 1536, - 'block_per_sm': 8, - 'registers': 32768, - 'reg_alloc_unit': 64, - 'reg_alloc_gran': 'warp', - 'reg_per_thread': 63, - 'smem_per_sm': SMEM48K, - 'smem_alloc_unit': 128, - 'warp_alloc_gran': 2, - 'max_block_size': 1024, - 'default_smem_config': SMEM16K, -} - -LIMITS_CC_21 = LIMITS_CC_20 - -LIMITS_CC_30 = { - 'thread_per_warp': 32, - 'warp_per_sm': 64, - 'thread_per_sm': 2048, - 'block_per_sm': 16, - 'registers': 65536, - 'reg_alloc_unit': 256, - 'reg_alloc_gran': 'warp', - 'reg_per_thread': 63, - 'smem_per_sm': SMEM48K, - 'smem_alloc_unit': 256, - 'warp_alloc_gran': 4, - 'max_block_size': 1024, - 'default_smem_config': SMEM48K, -} - -LIMITS_CC_35 = LIMITS_CC_30.copy() -LIMITS_CC_35.update({ - 'reg_per_thread': 255, -}) - -LIMITS_CC_37 = LIMITS_CC_35.copy() - -LIMITS_CC_37.update({ - 'registers': 131072, - 'default_smem_config': SMEM112K, -}) - - -LIMITS_CC_50 = { - 'thread_per_warp': 32, - 'warp_per_sm': 64, - 'thread_per_sm': 2048, - 'block_per_sm': 32, - 'registers': 65536, - 'reg_alloc_unit': 256, - 'reg_alloc_gran': 'warp', - 'reg_per_thread': 255, - 'smem_per_sm': SMEM64K, - 'smem_per_block': SMEM48K, - 'smem_alloc_unit': 256, - 'warp_alloc_gran': 4, - 'max_block_size': 1024, - 'default_smem_config': SMEM64K, -} - -LIMITS_CC_52 = LIMITS_CC_50.copy() -LIMITS_CC_52.update({ - 'smem_per_sm': SMEM96K, - 'default_smem_config': SMEM96K, -}) -LIMITS_CC_53 = LIMITS_CC_50.copy() -LIMITS_CC_53.update({ - 'registers': 32768, -}) - -LIMITS_CC_60 = LIMITS_CC_50.copy() -LIMITS_CC_60.update({ - 'warp_alloc_gran': 2, -}) -LIMITS_CC_61 = LIMITS_CC_60.copy() -LIMITS_CC_61.update({ - 'smem_per_sm': SMEM96K, - 'default_smem_config': SMEM96K, - 'warp_alloc_gran': 4, -}) -LIMITS_CC_62 = LIMITS_CC_60.copy() -LIMITS_CC_62.update({ - 'thread_per_sm': 4096, - 'warp_per_sm': 128, - 'warp_alloc_gran': 4, -}) - -LIMITS_CC_70 = LIMITS_CC_62.copy() -LIMITS_CC_70.update({ - 'smem_per_sm': SMEM96K, - 'smem_per_block': SMEM96K, - 'default_smem_config': SMEM96K, -}) - -LIMITS_CC_75 = LIMITS_CC_70.copy() -LIMITS_CC_75.update({ - 'warp_per_sm': 32, - 'thread_per_sm': 1024, - 'block_per_sm': 16, - 'smem_per_sm': SMEM64K, - 'smem_per_block': SMEM64K, - 'default_smem_config': SMEM64K, -}) - -PHYSICAL_LIMITS = { - (2, 0): LIMITS_CC_20, - (2, 1): LIMITS_CC_21, - (3, 0): LIMITS_CC_30, - (3, 5): LIMITS_CC_35, - (3, 7): LIMITS_CC_35, - (5, 0): LIMITS_CC_50, - (5, 2): LIMITS_CC_52, - (5, 3): LIMITS_CC_53, - (6, 0): LIMITS_CC_50, - (6, 1): LIMITS_CC_61, - (6, 2): LIMITS_CC_62, - (7, 0): LIMITS_CC_70, - (7, 5): LIMITS_CC_75, -} - - -def ceil(x, s=1): - return s * math.ceil(x / s) - - -def floor(x, s=1): - return s * math.floor(x / s) - - -def warp_occupancy(info, cc, smem_config=None): - """Returns a dictionary of {threadperblock: occupancy, factor} - - Only threadperblock of multiple of warpsize is used. - Only threadperblock of non-zero occupancy is returned. 
- """ - ret = {} - try: - limits = PHYSICAL_LIMITS[cc] - except KeyError: - raise ValueError("%s is not a supported compute capability" - % ".".join(str(c) for c in cc)) - if smem_config is None: - smem_config = limits['default_smem_config'] - warpsize = limits['thread_per_warp'] - max_thread = info.maxthreads - - for tpb in range(warpsize, max_thread + 1, warpsize): - result = compute_warp_occupancy(tpb=tpb, - reg=info.regs, - smem=info.shared, - smem_config=smem_config, - limits=limits) - if result[0]: - ret[tpb] = result - return ret - - -def compute_warp_occupancy(tpb, reg, smem, smem_config, limits): - assert limits['reg_alloc_gran'] == 'warp', \ - "assume warp register allocation granularity" - limit_block_per_sm = limits['block_per_sm'] - limit_warp_per_sm = limits['warp_per_sm'] - limit_thread_per_warp = limits['thread_per_warp'] - limit_reg_per_thread = limits['reg_per_thread'] - limit_total_regs = limits['registers'] - limit_total_smem = min(limits['smem_per_sm'], smem_config) - my_smem_alloc_unit = limits['smem_alloc_unit'] - reg_alloc_unit = limits['reg_alloc_unit'] - warp_alloc_gran = limits['warp_alloc_gran'] - - my_warp_per_block = ceil(tpb / limit_thread_per_warp) - my_reg_count = reg - my_reg_per_block = my_warp_per_block - my_smem = smem - my_smem_per_block = ceil(my_smem, my_smem_alloc_unit) - - # allocated resource - limit_blocks_due_to_warps = min(limit_block_per_sm, - floor( - limit_warp_per_sm / my_warp_per_block)) - - c39 = floor(limit_total_regs / ceil(my_reg_count * limit_thread_per_warp, - reg_alloc_unit), - warp_alloc_gran) - - limit_blocks_due_to_regs = (0 - if my_reg_count > limit_reg_per_thread - else (floor(c39 / my_reg_per_block) - if my_reg_count > 0 - else limit_block_per_sm)) - - limit_blocks_due_to_smem = (floor(limit_total_smem / - my_smem_per_block) - if my_smem_per_block > 0 - else limit_block_per_sm) - - # occupancy - active_block_per_sm = min(limit_blocks_due_to_smem, - limit_blocks_due_to_warps, - limit_blocks_due_to_regs) - - if active_block_per_sm == limit_blocks_due_to_warps: - factor = 'warps' - elif active_block_per_sm == limit_blocks_due_to_regs: - factor = 'regs' - else: - factor = 'smem' - - active_warps_per_sm = active_block_per_sm * my_warp_per_block - #active_threads_per_sm = active_warps_per_sm * limit_thread_per_warp - - occupancy = active_warps_per_sm / limit_warp_per_sm - return occupancy, factor - diff --git a/numba/cuda/tests/cudapy/test_deprecation.py b/numba/cuda/tests/cudapy/test_deprecation.py deleted file mode 100644 index f3888425020..00000000000 --- a/numba/cuda/tests/cudapy/test_deprecation.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import print_function, absolute_import - -import warnings -from contextlib import contextmanager - -from numba.tests.support import override_config, TestCase -from numba.cuda.testing import skip_on_cudasim -from numba import unittest_support as unittest -from numba import cuda, types -from numba.cuda.testing import SerialMixin - - -@skip_on_cudasim("Skipped on simulator") -class TestCudaDebugInfo(SerialMixin, TestCase): - """Tests features that will be deprecated - """ - @contextmanager - def assert_deprecation_warning(self): - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - yield w - - def test_autotune(self): - @cuda.jit("(int32[:],)") - def foo(xs): - xs[0] = 1 - - with self.assert_deprecation_warning() as w: - foo.autotune - assert len(w) == 1 - assert issubclass(w[-1].category, DeprecationWarning) - assert ".autotune" in str(w[-1].message) - - 
with self.assert_deprecation_warning() as w: - foo.occupancy - assert len(w) == 2 - assert issubclass(w[0].category, DeprecationWarning) - assert ".occupancy" in str(w[0].message) - assert issubclass(w[1].category, DeprecationWarning) - assert ".autotune" in str(w[1].message) - - -if __name__ == '__main__': - unittest.main() From c67e2b37c21522926c2cb5e9af5d64f6c1bf150f Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 10 Jan 2020 05:43:07 -0800 Subject: [PATCH 2/9] Tests: ensure all kernel calls are configured Any unconfigured calls were using the default of one thread and one block, so this is made explicit. --- .../cuda/tests/cudadrv/test_context_stack.py | 2 +- numba/cuda/tests/cudadrv/test_linker.py | 2 +- numba/cuda/tests/cudapy/test_autojit.py | 10 ++--- numba/cuda/tests/cudapy/test_boolean.py | 4 +- numba/cuda/tests/cudapy/test_casting.py | 2 +- numba/cuda/tests/cudapy/test_freevar.py | 2 +- numba/cuda/tests/cudapy/test_idiv.py | 4 +- numba/cuda/tests/cudapy/test_inspect.py | 4 +- numba/cuda/tests/cudapy/test_intrinsics.py | 40 +++++++++---------- numba/cuda/tests/cudapy/test_lang.py | 4 +- numba/cuda/tests/cudapy/test_localmem.py | 6 +-- numba/cuda/tests/cudapy/test_macro.py | 6 +-- numba/cuda/tests/cudapy/test_multithreads.py | 2 +- numba/cuda/tests/cudapy/test_print.py | 4 +- numba/cuda/tests/cudapy/test_record_dtype.py | 16 ++++---- .../test_retrieve_autoconverted_arrays.py | 16 ++++---- numba/cuda/tests/cudapy/test_serialize.py | 6 +-- numba/cuda/tests/cudapy/test_sm.py | 4 +- numba/cuda/tests/cudapy/test_userexc.py | 6 +-- 19 files changed, 70 insertions(+), 70 deletions(-) diff --git a/numba/cuda/tests/cudadrv/test_context_stack.py b/numba/cuda/tests/cudadrv/test_context_stack.py index e41ceb2d1f2..35b1e490783 100644 --- a/numba/cuda/tests/cudadrv/test_context_stack.py +++ b/numba/cuda/tests/cudadrv/test_context_stack.py @@ -118,7 +118,7 @@ def foo(a): a[i] = i a = cuda.device_array(10) - foo(a) + foo[1, 1](a) self.assertEqual(list(a.copy_to_host()), list(range(10))) self.test_attached_primary(do) diff --git a/numba/cuda/tests/cudadrv/test_linker.py b/numba/cuda/tests/cudadrv/test_linker.py index ffee8be5a5a..293dc466af2 100644 --- a/numba/cuda/tests/cudadrv/test_linker.py +++ b/numba/cuda/tests/cudadrv/test_linker.py @@ -82,7 +82,7 @@ def foo(x, y): A = np.array([123]) B = np.array([321]) - foo(A, B) + foo[1, 1](A, B) self.assertTrue(A[0] == 123 + 2 * 321) diff --git a/numba/cuda/tests/cudapy/test_autojit.py b/numba/cuda/tests/cudapy/test_autojit.py index ab349b52e63..ba65735108f 100644 --- a/numba/cuda/tests/cudapy/test_autojit.py +++ b/numba/cuda/tests/cudapy/test_autojit.py @@ -12,11 +12,11 @@ def test_autojit(self): def what(a, b, c): pass - what(np.empty(1), 1.0, 21) - what(np.empty(1), 1.0, 21) - what(np.empty(1), np.empty(1, dtype=np.int32), 21) - what(np.empty(1), np.empty(1, dtype=np.int32), 21) - what(np.empty(1), 1.0, 21) + what[1, 1](np.empty(1), 1.0, 21) + what[1, 1](np.empty(1), 1.0, 21) + what[1, 1](np.empty(1), np.empty(1, dtype=np.int32), 21) + what[1, 1](np.empty(1), np.empty(1, dtype=np.int32), 21) + what[1, 1](np.empty(1), 1.0, 21) self.assertTrue(len(what.definitions) == 2) diff --git a/numba/cuda/tests/cudapy/test_boolean.py b/numba/cuda/tests/cudapy/test_boolean.py index 3c788b02c18..ba81abbb6a1 100644 --- a/numba/cuda/tests/cudapy/test_boolean.py +++ b/numba/cuda/tests/cudapy/test_boolean.py @@ -15,9 +15,9 @@ class TestCudaBoolean(SerialMixin, unittest.TestCase): def test_boolean(self): func = cuda.jit('void(float64[:], bool_)')(boolean_func) 
A = np.array([0], dtype='float64') - func(A, True) + func[1, 1](A, True) self.assertTrue(A[0] == 123) - func(A, False) + func[1, 1](A, False) self.assertTrue(A[0] == 321) diff --git a/numba/cuda/tests/cudapy/test_casting.py b/numba/cuda/tests/cudapy/test_casting.py index 33e63966f27..f4b91526936 100644 --- a/numba/cuda/tests/cudapy/test_casting.py +++ b/numba/cuda/tests/cudapy/test_casting.py @@ -33,7 +33,7 @@ def wrapper_fn(arg): argarray = np.zeros(1, dtype=intype) argarray[0] = arg resarray = np.zeros(1, dtype=outtype) - cuda_wrapper_fn(argarray, resarray) + cuda_wrapper_fn[1, 1](argarray, resarray) return resarray[0] return wrapper_fn diff --git a/numba/cuda/tests/cudapy/test_freevar.py b/numba/cuda/tests/cudapy/test_freevar.py index 24c18b873b6..592da9aced0 100644 --- a/numba/cuda/tests/cudapy/test_freevar.py +++ b/numba/cuda/tests/cudapy/test_freevar.py @@ -23,7 +23,7 @@ def foo(A, i): A[i] = sdata[i] A = np.arange(2, dtype="float32") - foo(A, 0) + foo[1, 1](A, 0) if __name__ == '__main__': diff --git a/numba/cuda/tests/cudapy/test_idiv.py b/numba/cuda/tests/cudapy/test_idiv.py index 635a9e4c1d0..bf233d38285 100644 --- a/numba/cuda/tests/cudapy/test_idiv.py +++ b/numba/cuda/tests/cudapy/test_idiv.py @@ -15,7 +15,7 @@ def div(grid, l_x, l_y): x = np.ones((2, 2), dtype=np.float32) grid = cuda.to_device(x) - div(grid, 2, 2) + div[1, 1](grid, 2, 2) y = grid.copy_to_host() self.assertTrue(np.all(y == 0.5)) @@ -30,7 +30,7 @@ def div_double(grid, l_x, l_y): x = np.ones((2, 2), dtype=np.float64) grid = cuda.to_device(x) - div_double(grid, 2, 2) + div_double[1, 1](grid, 2, 2) y = grid.copy_to_host() self.assertTrue(np.all(y == 0.5)) diff --git a/numba/cuda/tests/cudapy/test_inspect.py b/numba/cuda/tests/cudapy/test_inspect.py index 9ecacd06381..c57198b8a04 100644 --- a/numba/cuda/tests/cudapy/test_inspect.py +++ b/numba/cuda/tests/cudapy/test_inspect.py @@ -39,8 +39,8 @@ def test_polytyped(self): def foo(x, y): pass - foo(1, 1) - foo(1.2, 2.4) + foo[1, 1](1, 1) + foo[1, 1](1.2, 2.4) file = StringIO() foo.inspect_types(file=file) diff --git a/numba/cuda/tests/cudapy/test_intrinsics.py b/numba/cuda/tests/cudapy/test_intrinsics.py index 3dab9f3d034..432e3c6daa8 100644 --- a/numba/cuda/tests/cudapy/test_intrinsics.py +++ b/numba/cuda/tests/cudapy/test_intrinsics.py @@ -117,7 +117,7 @@ class TestCudaIntrinsic(SerialMixin, unittest.TestCase): def test_simple_threadidx(self): compiled = cuda.jit("void(int32[:])")(simple_threadidx) ary = np.ones(1, dtype=np.int32) - compiled(ary) + compiled[1, 1](ary) self.assertTrue(ary[0] == 0) def test_fill_threadidx(self): @@ -264,44 +264,44 @@ def foo(out): def test_popc_u4(self): compiled = cuda.jit("void(int32[:], uint32)")(simple_popc) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0xF0) + compiled[1, 1](ary, 0xF0) self.assertEquals(ary[0], 4) def test_popc_u8(self): compiled = cuda.jit("void(int32[:], uint64)")(simple_popc) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0xF00000000000) + compiled[1, 1](ary, 0xF00000000000) self.assertEquals(ary[0], 4) def test_fma_f4(self): compiled = cuda.jit("void(f4[:], f4, f4, f4)")(simple_fma) ary = np.zeros(1, dtype=np.float32) - compiled(ary, 2., 3., 4.) + compiled[1, 1](ary, 2., 3., 4.) np.testing.assert_allclose(ary[0], 2 * 3 + 4) def test_fma_f8(self): compiled = cuda.jit("void(f8[:], f8, f8, f8)")(simple_fma) ary = np.zeros(1, dtype=np.float64) - compiled(ary, 2., 3., 4.) + compiled[1, 1](ary, 2., 3., 4.) 
np.testing.assert_allclose(ary[0], 2 * 3 + 4) def test_brev_u4(self): compiled = cuda.jit("void(uint32[:], uint32)")(simple_brev) ary = np.zeros(1, dtype=np.uint32) - compiled(ary, 0x000030F0) + compiled[1, 1](ary, 0x000030F0) self.assertEquals(ary[0], 0x0F0C0000) @skip_on_cudasim('only get given a Python "int", assumes 32 bits') def test_brev_u8(self): compiled = cuda.jit("void(uint64[:], uint64)")(simple_brev) ary = np.zeros(1, dtype=np.uint64) - compiled(ary, 0x000030F0000030F0) + compiled[1, 1](ary, 0x000030F0000030F0) self.assertEquals(ary[0], 0x0F0C00000F0C0000) def test_clz_i4(self): compiled = cuda.jit("void(int32[:], int32)")(simple_clz) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0x00100000) + compiled[1, 1](ary, 0x00100000) self.assertEquals(ary[0], 11) def test_clz_u4(self): @@ -314,57 +314,57 @@ def test_clz_u4(self): """ compiled = cuda.jit("void(int32[:], uint32)")(simple_clz) ary = np.zeros(1, dtype=np.uint32) - compiled(ary, 0x00100000) + compiled[1, 1](ary, 0x00100000) self.assertEquals(ary[0], 11) def test_clz_i4_1s(self): compiled = cuda.jit("void(int32[:], int32)")(simple_clz) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0xFFFFFFFF) + compiled[1, 1](ary, 0xFFFFFFFF) self.assertEquals(ary[0], 0) def test_clz_i4_0s(self): compiled = cuda.jit("void(int32[:], int32)")(simple_clz) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0x0) + compiled[1, 1](ary, 0x0) self.assertEquals(ary[0], 32, "CUDA semantics") @skip_on_cudasim('only get given a Python "int", assumes 32 bits') def test_clz_i8(self): compiled = cuda.jit("void(int32[:], int64)")(simple_clz) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0x000000000010000) + compiled[1, 1](ary, 0x000000000010000) self.assertEquals(ary[0], 47) def test_ffs_i4(self): compiled = cuda.jit("void(int32[:], int32)")(simple_ffs) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0x00100000) + compiled[1, 1](ary, 0x00100000) self.assertEquals(ary[0], 20) def test_ffs_u4(self): compiled = cuda.jit("void(int32[:], uint32)")(simple_ffs) ary = np.zeros(1, dtype=np.uint32) - compiled(ary, 0x00100000) + compiled[1, 1](ary, 0x00100000) self.assertEquals(ary[0], 20) def test_ffs_i4_1s(self): compiled = cuda.jit("void(int32[:], int32)")(simple_ffs) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0xFFFFFFFF) + compiled[1, 1](ary, 0xFFFFFFFF) self.assertEquals(ary[0], 0) def test_ffs_i4_0s(self): compiled = cuda.jit("void(int32[:], int32)")(simple_ffs) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0x0) + compiled[1, 1](ary, 0x0) self.assertEquals(ary[0], 32, "CUDA semantics") @skip_on_cudasim('only get given a Python "int", assumes 32 bits') def test_ffs_i8(self): compiled = cuda.jit("void(int32[:], int64)")(simple_ffs) ary = np.zeros(1, dtype=np.int32) - compiled(ary, 0x000000000010000) + compiled[1, 1](ary, 0x000000000010000) self.assertEquals(ary[0], 16) def test_simple_laneid(self): @@ -378,7 +378,7 @@ def test_simple_laneid(self): def test_simple_warpsize(self): compiled = cuda.jit("void(int32[:])")(simple_warpsize) ary = np.zeros(1, dtype=np.int32) - compiled(ary) + compiled[1, 1](ary) self.assertEquals(ary[0], 32, "CUDA semantics") @unittest.skipUnless(IS_PY3, "round() returns float on Py2") @@ -387,7 +387,7 @@ def test_round_f4(self): ary = np.zeros(1, dtype=np.int32) for i in [-3.0, -2.5, -2.25, -1.5, 1.5, 2.25, 2.5, 2.75]: - compiled(ary, i) + compiled[1, 1](ary, i) self.assertEquals(ary[0], round(i)) @unittest.skipUnless(IS_PY3, "round() returns float on Py2") @@ -396,7 +396,7 @@ def test_round_f8(self): ary = 
np.zeros(1, dtype=np.int32) for i in [-3.0, -2.5, -2.25, -1.5, 1.5, 2.25, 2.5, 2.75]: - compiled(ary, i) + compiled[1, 1](ary, i) self.assertEquals(ary[0], round(i)) diff --git a/numba/cuda/tests/cudapy/test_lang.py b/numba/cuda/tests/cudapy/test_lang.py index f0133a09226..0166df6e23a 100644 --- a/numba/cuda/tests/cudapy/test_lang.py +++ b/numba/cuda/tests/cudapy/test_lang.py @@ -19,7 +19,7 @@ def foo(a): a[i] = v a = np.zeros(len(tup)) - foo(a) + foo[1, 1](a) self.assertTrue(np.all(a == tup)) def test_zip(self): @@ -34,7 +34,7 @@ def foo(a): a[0] = c a = np.zeros(1) - foo(a) + foo[1, 1](a) b = np.array(t1) c = np.array(t2) self.assertTrue(np.all(a == (b + c).sum())) diff --git a/numba/cuda/tests/cudapy/test_localmem.py b/numba/cuda/tests/cudapy/test_localmem.py index 605beae5676..f50ecf9f9db 100644 --- a/numba/cuda/tests/cudapy/test_localmem.py +++ b/numba/cuda/tests/cudapy/test_localmem.py @@ -36,7 +36,7 @@ def test_local_array(self): self.assertTrue('.local' in jculocal.ptx) A = np.arange(1000, dtype='int32') B = np.zeros_like(A) - jculocal(A, B) + jculocal[1, 1](A, B) self.assertTrue(np.all(A == B)) def test_local_array_1_tuple(self): @@ -47,7 +47,7 @@ def test_local_array_1_tuple(self): # may reduce it to registers. A = np.arange(5, dtype='int32') B = np.zeros_like(A) - jculocal(A, B) + jculocal[1, 1](A, B) self.assertTrue(np.all(A == B)) def test_local_array_complex(self): @@ -57,7 +57,7 @@ def test_local_array_complex(self): # self.assertTrue('.local' in jculocalcomplex.ptx) A = (np.arange(100, dtype='complex128') - 1) / 2j B = np.zeros_like(A) - jculocalcomplex(A, B) + jculocalcomplex[1, 1](A, B) self.assertTrue(np.all(A == B)) diff --git a/numba/cuda/tests/cudapy/test_macro.py b/numba/cuda/tests/cudapy/test_macro.py index b6644c4feb8..2caf65f06c0 100644 --- a/numba/cuda/tests/cudapy/test_macro.py +++ b/numba/cuda/tests/cudapy/test_macro.py @@ -57,11 +57,11 @@ def getarg2(self): def test_global_constants(self): udt = cuda.jit((float32[:],))(udt_global_constants) - udt(self.getarg()) + udt[1, 1](self.getarg()) def test_global_build_tuple(self): udt = cuda.jit((float32[:, :],))(udt_global_build_tuple) - udt(self.getarg2()) + udt[1, 1](self.getarg2()) @skip_on_cudasim('Simulator does not perform macro expansion') def test_global_build_list(self): @@ -73,7 +73,7 @@ def test_global_build_list(self): def test_global_constant_tuple(self): udt = cuda.jit((float32[:, :],))(udt_global_constant_tuple) - udt(self.getarg2()) + udt[1, 1](self.getarg2()) @skip_on_cudasim("Can't check for constants in simulator") def test_invalid_1(self): diff --git a/numba/cuda/tests/cudapy/test_multithreads.py b/numba/cuda/tests/cudapy/test_multithreads.py index 7337241b259..6d0b1c6df15 100644 --- a/numba/cuda/tests/cudapy/test_multithreads.py +++ b/numba/cuda/tests/cudapy/test_multithreads.py @@ -23,7 +23,7 @@ def foo(x): x[0] += 1 def use_foo(x): - foo(x) + foo[1, 1](x) return x arrays = [np.arange(10) for i in range(10)] diff --git a/numba/cuda/tests/cudapy/test_print.py b/numba/cuda/tests/cudapy/test_print.py index 59513d127a2..4fe1b2ba859 100644 --- a/numba/cuda/tests/cudapy/test_print.py +++ b/numba/cuda/tests/cudapy/test_print.py @@ -43,7 +43,7 @@ def test_cuhello(self): def test_printfloat(self): jprintfloat = cuda.jit('void()', debug=False)(printfloat) with captured_cuda_stdout() as stdout: - jprintfloat() + jprintfloat[1, 1]() # CUDA and the simulator use different formats for float formatting self.assertIn(stdout.getvalue(), ["0 23 34.750000 321\n", "0 23 34.75 321\n"]) @@ -51,7 +51,7 @@ def 
test_printfloat(self): def test_printempty(self): cufunc = cuda.jit('void()', debug=False)(printempty) with captured_cuda_stdout() as stdout: - cufunc() + cufunc[1, 1]() self.assertEqual(stdout.getvalue(), "\n") def test_string(self): diff --git a/numba/cuda/tests/cudapy/test_record_dtype.py b/numba/cuda/tests/cudapy/test_record_dtype.py index b07770dd4af..4781fcf819a 100644 --- a/numba/cuda/tests/cudapy/test_record_dtype.py +++ b/numba/cuda/tests/cudapy/test_record_dtype.py @@ -137,7 +137,7 @@ def _test_set_equal(self, pyfunc, value, valuetype): else: expect = got.copy().view(np.recarray) - cfunc(got, i, value) + cfunc[1, 1](got, i, value) pyfunc(expect, i, value) # Match the entire array to ensure no memory corruption @@ -169,7 +169,7 @@ def test_set_record(self): pyfunc(expect, i, j) got = self.sample1d.copy() - cfunc(got, i, j) + cfunc[1, 1](got, i, j) # Match the entire array to ensure no memory corruption self.assertEqual(expect[i], expect[j]) @@ -180,7 +180,7 @@ def _test_rec_set(self, v, pyfunc, f): rec = self.sample1d.copy()[0] nbrecord = numpy_support.from_dtype(recordtype) cfunc = self.get_cfunc(pyfunc, (nbrecord,)) - cfunc(rec, v) + cfunc[1, 1](rec, v) np.testing.assert_equal(rec[f], v) def test_rec_set_a(self): @@ -198,7 +198,7 @@ def _test_rec_read(self, v, pyfunc, f): arr = np.zeros(1, v.dtype) nbrecord = numpy_support.from_dtype(recordtype) cfunc = self.get_cfunc(pyfunc, (nbrecord,)) - cfunc(rec, arr) + cfunc[1, 1](rec, arr) np.testing.assert_equal(arr[0], v) def test_rec_read_a(self): @@ -218,7 +218,7 @@ def test_record_write_1d_array(self): nbrecord = numpy_support.from_dtype(recordwitharray) cfunc = self.get_cfunc(record_write_array, (nbrecord,)) - cfunc(rec) + cfunc[1, 1](rec) expected = self.samplerec1darr.copy() expected['g'] = 2 expected['h'][0] = 3.0 @@ -233,7 +233,7 @@ def test_record_write_2d_array(self): rec = self.samplerec2darr.copy() nbrecord = numpy_support.from_dtype(recordwith2darray) cfunc = self.get_cfunc(record_write_2d_array, (nbrecord,)) - cfunc(rec) + cfunc[1, 1](rec) expected = self.samplerec2darr.copy() expected['i'] = 3 @@ -252,7 +252,7 @@ def test_record_read_1d_array(self): nbrecord = numpy_support.from_dtype(recordwitharray) cfunc = self.get_cfunc(record_read_array, (nbrecord,)) arr = np.zeros(2, dtype=rec['h'].dtype) - cfunc(rec, arr) + cfunc[1, 1](rec, arr) np.testing.assert_equal(rec['h'], arr) @@ -268,7 +268,7 @@ def test_record_read_2d_array(self): nbrecord = numpy_support.from_dtype(recordwith2darray) cfunc = self.get_cfunc(record_read_2d_array, (nbrecord,)) arr = np.zeros((3,2), dtype=rec['j'].dtype) - cfunc(rec, arr) + cfunc[1, 1](rec, arr) np.testing.assert_equal(rec['j'], arr) diff --git a/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py b/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py index 81a98279a9b..1432b2dc970 100644 --- a/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py +++ b/numba/cuda/tests/cudapy/test_retrieve_autoconverted_arrays.py @@ -41,42 +41,42 @@ def setUp(self): def test_array_inout(self): host_arr = np.zeros(1, dtype=np.int64) - self.set_array_to_three(cuda.InOut(host_arr)) + self.set_array_to_three[1, 1](cuda.InOut(host_arr)) self.assertEqual(3, host_arr[0]) def test_array_in(self): host_arr = np.zeros(1, dtype=np.int64) - self.set_array_to_three(cuda.In(host_arr)) + self.set_array_to_three[1, 1](cuda.In(host_arr)) self.assertEqual(0, host_arr[0]) def test_array_in_from_config(self): host_arr = np.zeros(1, dtype=np.int64) - self.set_array_to_three_nocopy(host_arr) + 
self.set_array_to_three_nocopy[1, 1](host_arr) self.assertEqual(0, host_arr[0]) def test_array_default(self): host_arr = np.zeros(1, dtype=np.int64) - self.set_array_to_three(host_arr) + self.set_array_to_three[1, 1](host_arr) self.assertEqual(3, host_arr[0]) def test_record_in(self): host_rec = np.zeros(1, dtype=recordtype) - self.set_record_to_three(cuda.In(host_rec)) + self.set_record_to_three[1, 1](cuda.In(host_rec)) self.assertEqual(0, host_rec[0]['b']) def test_record_inout(self): host_rec = np.zeros(1, dtype=recordtype) - self.set_record_to_three(cuda.InOut(host_rec)) + self.set_record_to_three[1, 1](cuda.InOut(host_rec)) self.assertEqual(3, host_rec[0]['b']) def test_record_default(self): host_rec = np.zeros(1, dtype=recordtype) - self.set_record_to_three(host_rec) + self.set_record_to_three[1, 1](host_rec) self.assertEqual(3, host_rec[0]['b']) def test_record_in_from_config(self): host_rec = np.zeros(1, dtype=recordtype) - self.set_record_to_three_nocopy(host_rec) + self.set_record_to_three_nocopy[1, 1](host_rec) self.assertEqual(0, host_rec[0]['b']) diff --git a/numba/cuda/tests/cudapy/test_serialize.py b/numba/cuda/tests/cudapy/test_serialize.py index c2289e32724..6f1f59d4e3e 100644 --- a/numba/cuda/tests/cudapy/test_serialize.py +++ b/numba/cuda/tests/cudapy/test_serialize.py @@ -11,13 +11,13 @@ class TestPickle(SerialMixin, unittest.TestCase): def check_call(self, callee): arr = np.array([100]) - expected = callee(arr) + expected = callee[1, 1](arr) # serialize and rebuild foo1 = pickle.loads(pickle.dumps(callee)) del callee # call rebuild function - got1 = foo1(arr) + got1 = foo1[1, 1](arr) np.testing.assert_equal(got1, expected) del got1 @@ -25,7 +25,7 @@ def check_call(self, callee): foo2 = pickle.loads(pickle.dumps(foo1)) del foo1 # call rebuild function - got2 = foo2(arr) + got2 = foo2[1, 1](arr) np.testing.assert_equal(got2, expected) del got2 diff --git a/numba/cuda/tests/cudapy/test_sm.py b/numba/cuda/tests/cudapy/test_sm.py index 4e6e1ea8fa1..0d188d9e8c1 100644 --- a/numba/cuda/tests/cudapy/test_sm.py +++ b/numba/cuda/tests/cudapy/test_sm.py @@ -21,7 +21,7 @@ def outer(): outer_arr = cuda.shared.array(1, dtype=int32) inner() - outer() + outer[1, 1]() def _check_shared_array_size(self, shape, expected): @cuda.jit @@ -30,7 +30,7 @@ def s(a): a[0] = arr.size result = np.zeros(1, dtype=np.int32) - s(result) + s[1, 1](result) self.assertEqual(result[0], expected) def test_issue_1051_shared_size_broken_1d(self): diff --git a/numba/cuda/tests/cudapy/test_userexc.py b/numba/cuda/tests/cudapy/test_userexc.py index 073a5dba8bd..dcb05961431 100644 --- a/numba/cuda/tests/cudapy/test_userexc.py +++ b/numba/cuda/tests/cudapy/test_userexc.py @@ -23,14 +23,14 @@ def test_exc(x): elif x == 2: raise MyError("foo") - test_exc(0) # no raise + test_exc[1, 1](0) # no raise with self.assertRaises(MyError) as cm: - test_exc(1) + test_exc[1, 1](1) if not config.ENABLE_CUDASIM: self.assertRegexpMatches(str(cm.exception), regex_pattern) self.assertIn("tid=[0, 0, 0] ctaid=[0, 0, 0]", str(cm.exception)) with self.assertRaises(MyError) as cm: - test_exc(2) + test_exc[1, 1](2) if not config.ENABLE_CUDASIM: self.assertRegexpMatches(str(cm.exception), regex_pattern) self.assertRegexpMatches(str(cm.exception), regex_pattern) From b74e4099ec366d45fccdb51569e2702ec8c2cf88 Mon Sep 17 00:00:00 2001 From: Stuart Archibald Date: Tue, 20 Aug 2019 09:36:08 +0100 Subject: [PATCH 3/9] Prevent CUDA kernel launch without a specified launch config. 
This patch prevents the launch of a CUDA kernel with no configuration as this causes confusion for (especially) first time users. Current behaviour is that if no launch config is specified then a default everything-set-to-1 config is used, new behaviour is that if no launch config is specified then an exception is raised pointing users to the syntax and documentation. --- numba/cuda/compiler.py | 4 ++-- numba/cuda/errors.py | 13 +++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index 19d6b0ccea6..3bef3b4bcf2 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -301,8 +301,8 @@ class CUDAKernelBase(object): """ def __init__(self): - self.griddim = (1, 1) - self.blockdim = (1, 1, 1) + self.griddim = None + self.blockdim = None self.sharedmem = 0 self.stream = 0 diff --git a/numba/cuda/errors.py b/numba/cuda/errors.py index 79da66f91d3..c6a28e77de5 100644 --- a/numba/cuda/errors.py +++ b/numba/cuda/errors.py @@ -13,6 +13,16 @@ def __init__(self, msg, tid=None, ctaid=None): msg = t % (self.tid, self.ctaid, self.msg) super(KernelRuntimeError, self).__init__(msg) +_launch_help_url = ("https://numba.pydata.org/numba-doc/" + "latest/cuda/kernels.html#kernel-invocation") +_missing_launch_config_msg = """ +Kernel launch configuration was not specified. Use the syntax: + +kernel_function[blockspergrid, threadsperblock](arg0, arg1, ..., argn) + +See {} for help. + +""".format(_launch_help_url) def normalize_kernel_dimensions(griddim, blockdim): """ @@ -35,6 +45,9 @@ def check_dim(dim, name): dim.append(1) return dim + if None in (griddim, blockdim): + raise ValueError(_missing_launch_config_msg) + griddim = check_dim(griddim, 'griddim') blockdim = check_dim(blockdim, 'blockdim') From ff4f86c8c7c39a3e598f32a1d8083447b2650e6f Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 10 Jan 2020 05:41:13 -0800 Subject: [PATCH 4/9] Normalize kernel dimensions in CUDAKernel.__call__ The `normalize_kernel_dimensions` function validates that the kernel has been configured, so it must be called in a CUDAKernel call (in addition to an AutoJitCUDAKernel call) to ensure that the kernel has been configured. 
--- numba/cuda/compiler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/numba/cuda/compiler.py b/numba/cuda/compiler.py index 3bef3b4bcf2..ed44fc32730 100644 --- a/numba/cuda/compiler.py +++ b/numba/cuda/compiler.py @@ -519,9 +519,10 @@ def __reduce__(self): def __call__(self, *args, **kwargs): assert not kwargs + griddim, blockdim = normalize_kernel_dimensions(self.griddim, self.blockdim) self._kernel_call(args=args, - griddim=self.griddim, - blockdim=self.blockdim, + griddim=griddim, + blockdim=blockdim, stream=self.stream, sharedmem=self.sharedmem) From dc120bce9f46bf3965dad141bf87febff83af7ff Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 10 Jan 2020 06:55:34 -0800 Subject: [PATCH 5/9] Raise ValueError in simulator if kernel not configured --- numba/cuda/simulator/kernel.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/numba/cuda/simulator/kernel.py b/numba/cuda/simulator/kernel.py index aa44f675a07..b737630eef2 100644 --- a/numba/cuda/simulator/kernel.py +++ b/numba/cuda/simulator/kernel.py @@ -53,16 +53,23 @@ def __init__(self, fn, device, fastmath=False, extensions=[]): self._device = device self._fastmath = fastmath self.extensions = list(extensions) # defensive copy - # Initial configuration: 1 block, 1 thread, stream 0, no dynamic shared + # Initial configuration: grid unconfigured, stream 0, no dynamic shared # memory. - self[1, 1, 0, 0] + self.grid_dim = None + self.block_dim = None + self.stream = 0 + self.dynshared_size = 0 def __call__(self, *args): if self._device: with swapped_cuda_module(self.fn, _get_kernel_context()): return self.fn(*args) - fake_cuda_module = FakeCUDAModule(self.grid_dim, self.block_dim, + # Ensure we've been given a valid grid configuration + grid_dim, block_dim = normalize_kernel_dimensions(self.grid_dim, + self.block_dim) + + fake_cuda_module = FakeCUDAModule(grid_dim, block_dim, self.dynshared_size) with _push_kernel_context(fake_cuda_module): # fake_args substitutes all numpy arrays for FakeCUDAArrays @@ -90,14 +97,13 @@ def fake_arg(arg): fake_args = [fake_arg(arg) for arg in args] with swapped_cuda_module(self.fn, fake_cuda_module): # Execute one block at a time - for grid_point in np.ndindex(*self.grid_dim): - bm = BlockManager(self.fn, self.grid_dim, self.block_dim) + for grid_point in np.ndindex(*grid_dim): + bm = BlockManager(self.fn, grid_dim, block_dim) bm.run(grid_point, *fake_args) for wb in retr: wb() - def __getitem__(self, configuration): self.grid_dim, self.block_dim = \ normalize_kernel_dimensions(*configuration[:2]) From ad75d115a7783c8dec1536dd38f4b2e5f6a301c5 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 10 Jan 2020 06:20:17 -0800 Subject: [PATCH 6/9] Add tests for unconfigured kernel error --- numba/cuda/tests/cudapy/test_errors.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/numba/cuda/tests/cudapy/test_errors.py b/numba/cuda/tests/cudapy/test_errors.py index 3028f0e8503..24f80a00336 100644 --- a/numba/cuda/tests/cudapy/test_errors.py +++ b/numba/cuda/tests/cudapy/test_errors.py @@ -41,6 +41,20 @@ def test_non_integral_dims(self): self.assertIn("blockdim must be a sequence of integers, got [3.0]", str(raises.exception)) + def _test_unconfigured(self, kernfunc): + with self.assertRaises(ValueError) as raises: + kernfunc(0) + self.assertIn("launch configuration was not specified", + str(raises.exception)) + + def test_unconfigured_cudakernel(self): + kernfunc = cuda.jit("void(int32)")(noop) + 
self._test_unconfigured(kernfunc) + + def test_unconfigured_autojitcudakernel(self): + kernfunc = cuda.jit(noop) + self._test_unconfigured(kernfunc) + if __name__ == '__main__': unittest.main() From 23334fd4d2c6ab9e71563253075269893bb5b949 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 10 Jan 2020 08:02:51 -0800 Subject: [PATCH 7/9] Fix flake8 errors in errors.py --- numba/cuda/errors.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/numba/cuda/errors.py b/numba/cuda/errors.py index c6a28e77de5..42586a43d8f 100644 --- a/numba/cuda/errors.py +++ b/numba/cuda/errors.py @@ -13,6 +13,7 @@ def __init__(self, msg, tid=None, ctaid=None): msg = t % (self.tid, self.ctaid, self.msg) super(KernelRuntimeError, self).__init__(msg) + _launch_help_url = ("https://numba.pydata.org/numba-doc/" "latest/cuda/kernels.html#kernel-invocation") _missing_launch_config_msg = """ @@ -24,6 +25,7 @@ def __init__(self, msg, tid=None, ctaid=None): """.format(_launch_help_url) + def normalize_kernel_dimensions(griddim, blockdim): """ Normalize and validate the user-supplied kernel dimensions. From 773b6690832d709984ad3c557687a2efd21de177 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Mon, 13 Jan 2020 01:04:30 -0800 Subject: [PATCH 8/9] Add SerialMixin to TestJitErrors Now that the TestJitErrors class contains test cases that call CUDA functions, it requires the SerialMixin, otherwise it will be executed in a child process after the parent already used CUDA (which is not supported) when testing in parallel. --- contrib/valgrind-numba.supp | 21 +++++++++++++++++++++ numba/cuda/tests/cudapy/test_errors.py | 4 ++-- 2 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 contrib/valgrind-numba.supp diff --git a/contrib/valgrind-numba.supp b/contrib/valgrind-numba.supp new file mode 100644 index 00000000000..26271eb4ef0 --- /dev/null +++ b/contrib/valgrind-numba.supp @@ -0,0 +1,21 @@ +{ + + Memcheck:Cond + fun:_ZN4llvm3sys14getHostCPUNameEv + fun:LLVMPY_GetHostCPUName +} + +{ + + Memcheck:Value8 + fun:_ZN4llvm3sys14getHostCPUNameEv + fun:LLVMPY_GetHostCPUName +} + +{ + + Memcheck:Cond + fun:__intel_sse2_strrchr + fun:_ZN67_INTERNAL_45_______src_thirdparty_tbb_omp_dynamic_link_cpp_c306cade5__kmp12init_dl_dataEv + fun:__sti__$E +} diff --git a/numba/cuda/tests/cudapy/test_errors.py b/numba/cuda/tests/cudapy/test_errors.py index 24f80a00336..135a4baa443 100644 --- a/numba/cuda/tests/cudapy/test_errors.py +++ b/numba/cuda/tests/cudapy/test_errors.py @@ -3,14 +3,14 @@ import numpy as np from numba import cuda -from numba.cuda.testing import unittest +from numba.cuda.testing import unittest, SerialMixin def noop(x): pass -class TestJitErrors(unittest.TestCase): +class TestJitErrors(SerialMixin, unittest.TestCase): """ Test compile-time errors with @jit. 
""" From 252f2266fa765a559428cc385eb9b242f78596b8 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 21 Jan 2020 07:54:40 -0800 Subject: [PATCH 9/9] Remove accidentally-committed suppressions file --- contrib/valgrind-numba.supp | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 contrib/valgrind-numba.supp diff --git a/contrib/valgrind-numba.supp b/contrib/valgrind-numba.supp deleted file mode 100644 index 26271eb4ef0..00000000000 --- a/contrib/valgrind-numba.supp +++ /dev/null @@ -1,21 +0,0 @@ -{ - - Memcheck:Cond - fun:_ZN4llvm3sys14getHostCPUNameEv - fun:LLVMPY_GetHostCPUName -} - -{ - - Memcheck:Value8 - fun:_ZN4llvm3sys14getHostCPUNameEv - fun:LLVMPY_GetHostCPUName -} - -{ - - Memcheck:Cond - fun:__intel_sse2_strrchr - fun:_ZN67_INTERNAL_45_______src_thirdparty_tbb_omp_dynamic_link_cpp_c306cade5__kmp12init_dl_dataEv - fun:__sti__$E -}