
Commit

Merge pull request #5061 from gmarkall/grm-fix-no-launch-config
Prevent kernel launch with no configuration, remove autotuner
sklam committed Feb 27, 2020
2 parents 0f2f216 + 7d28818 commit 5c4c82d
Showing 25 changed files with 120 additions and 513 deletions.
57 changes: 7 additions & 50 deletions numba/cuda/compiler.py
@@ -12,7 +12,6 @@
 from numba.core import types, typing, utils, funcdesc, serialize, config, compiler, sigutils
 from numba.core.compiler_lock import global_compiler_lock
 
-from .cudadrv.autotune import AutoTuner
 from .cudadrv.devices import get_context
 from .cudadrv import nvvm, devicearray, driver
 from .errors import normalize_kernel_dimensions
@@ -280,7 +279,7 @@ def _compute_thread_per_block(self, kernel):
         # Prefer user-specified config
         if tpb != 0:
             return tpb
-        # Else, ask the driver to give a good cofnig
+        # Else, ask the driver to give a good config
         else:
             ctx = get_context()
             kwargs = dict(
@@ -289,32 +288,17 @@ def _compute_thread_per_block(self, kernel):
                 memsize=self.sharedmem,
                 blocksizelimit=1024,
             )
-            try:
-                # Raises from the driver if the feature is unavailable
-                _, tpb = ctx.get_max_potential_block_size(**kwargs)
-            except AttributeError:
-                # Fallback to table-based approach.
-                tpb = self._fallback_autotune_best(kernel)
-                raise
+            _, tpb = ctx.get_max_potential_block_size(**kwargs)
             return tpb
 
-    def _fallback_autotune_best(self, kernel):
-        try:
-            tpb = kernel.autotune.best()
-        except ValueError:
-            warnings.warn('Could not autotune, using default tpb of 128')
-            tpb = 128
-
-        return tpb
-
 
 class CUDAKernelBase(object):
     """Define interface for configurable kernels
     """
 
     def __init__(self):
-        self.griddim = (1, 1)
-        self.blockdim = (1, 1, 1)
+        self.griddim = None
+        self.blockdim = None
         self.sharedmem = 0
         self.stream = 0
 
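Note on the hunk above: with griddim and blockdim now defaulting to None instead of placeholder shapes, and the table-based autotuner fallback gone, a block size comes either from the user's launch configuration or from the driver's get_max_potential_block_size query. A minimal usage sketch of what callers are expected to do after this change; the kernel, array, and launch shape below are illustrative and not part of the patch:

import numpy as np
from numba import cuda

@cuda.jit
def inc_by_one(x):
    # One thread per element; guard the tail of the last block.
    i = cuda.grid(1)
    if i < x.size:
        x[i] += 1

d_x = cuda.to_device(np.zeros(256, dtype=np.float32))

# A launch now needs an explicit [griddim, blockdim] configuration;
# an unconfigured call such as inc_by_one(d_x) is rejected.
inc_by_one[4, 64](d_x)    # 4 blocks of 64 threads = 256 threads
cuda.synchronize()
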
@@ -531,9 +515,10 @@ def __reduce__(self):
 
     def __call__(self, *args, **kwargs):
         assert not kwargs
+        griddim, blockdim = normalize_kernel_dimensions(self.griddim, self.blockdim)
         self._kernel_call(args=args,
-                          griddim=self.griddim,
-                          blockdim=self.blockdim,
+                          griddim=griddim,
+                          blockdim=blockdim,
                           stream=self.stream,
                           sharedmem=self.sharedmem)
 
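Note on the hunk above: normalize_kernel_dimensions, imported from .errors in the first hunk, is what turns an unconfigured launch into an error before _kernel_call runs. A rough sketch of the kind of validation such a helper performs, assuming None dimensions are rejected and scalar dimensions are promoted to tuples; the helper name and error message are illustrative, not the actual implementation:

def normalize_dims_sketch(griddim, blockdim):
    # Illustrative stand-in, not numba.cuda.errors.normalize_kernel_dimensions.
    if griddim is None or blockdim is None:
        # The new unconfigured defaults (None, None) end up here.
        raise ValueError("kernel launched without a launch configuration; "
                         "use kernel[griddim, blockdim](...)")
    # Promote scalars so later code can always treat the dims as tuples.
    griddim = (griddim,) if isinstance(griddim, int) else tuple(griddim)
    blockdim = (blockdim,) if isinstance(blockdim, int) else tuple(blockdim)
    return griddim, blockdim
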
@@ -715,34 +700,6 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs):
         else:
             raise NotImplementedError(ty, val)
 
-    @property
-    def autotune(self):
-        """Return the autotuner object associated with this kernel."""
-        warnings.warn(_deprec_warn_msg.format('autotune'), DeprecationWarning)
-        has_autotune = hasattr(self, '_autotune')
-        if has_autotune and self._autotune.dynsmem == self.sharedmem:
-            return self._autotune
-        else:
-            # Get CUDA Function
-            cufunc = self._func.get()
-            at = AutoTuner(info=cufunc.attrs, cc=cufunc.device.compute_capability)
-            self._autotune = at
-            return self._autotune
-
-    @property
-    def occupancy(self):
-        """Occupancy is the ratio of the number of active warps per multiprocessor to the maximum
-        number of warps that can be active on the multiprocessor at once.
-        Calculate the theoretical occupancy of the kernel given the
-        current configuration."""
-        warnings.warn(_deprec_warn_msg.format('occupancy'), DeprecationWarning)
-        thread_per_block = reduce(operator.mul, self.blockdim, 1)
-        return self.autotune.closest(thread_per_block)
-
-
-_deprec_warn_msg = ("The .{} attribute is is deprecated and will be "
-                    "removed in a future release")
-
 
 class AutoJitCUDAKernel(CUDAKernelBase):
     '''
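
Note on the hunk above: the removed occupancy property defined occupancy as the ratio of active warps per multiprocessor to the maximum number of warps that can be resident at once, and derived threads per block with reduce(operator.mul, self.blockdim, 1). A small worked example of that arithmetic; the per-SM figures are assumed for illustration, not queried from any real device:

import operator
from functools import reduce

blockdim = (16, 16, 1)                                 # example launch shape
threads_per_block = reduce(operator.mul, blockdim, 1)  # 16 * 16 * 1 = 256
warps_per_block = -(-threads_per_block // 32)          # ceil(256 / 32) = 8

# Assumed figures for a hypothetical multiprocessor:
active_warps_per_sm = 48
max_warps_per_sm = 64
occupancy = active_warps_per_sm / max_warps_per_sm     # 0.75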

0 comments on commit 5c4c82d
