
Commit

Merge pull request #5061 from gmarkall/grm-fix-no-launch-config
Prevent kernel launch with no configuration, remove autotuner
sklam committed Feb 27, 2020
2 parents 0f2f216 + 7d28818 commit 5c4c82d
Showing 25 changed files with 120 additions and 513 deletions.
57 changes: 7 additions & 50 deletions numba/cuda/compiler.py
@@ -12,7 +12,6 @@
 from numba.core import types, typing, utils, funcdesc, serialize, config, compiler, sigutils
 from numba.core.compiler_lock import global_compiler_lock
 
-from .cudadrv.autotune import AutoTuner
 from .cudadrv.devices import get_context
 from .cudadrv import nvvm, devicearray, driver
 from .errors import normalize_kernel_dimensions
@@ -280,7 +279,7 @@ def _compute_thread_per_block(self, kernel):
         # Prefer user-specified config
         if tpb != 0:
             return tpb
-        # Else, ask the driver to give a good cofnig
+        # Else, ask the driver to give a good config
         else:
             ctx = get_context()
             kwargs = dict(
@@ -289,32 +288,17 @@ def _compute_thread_per_block(self, kernel):
                 memsize=self.sharedmem,
                 blocksizelimit=1024,
             )
-            try:
-                # Raises from the driver if the feature is unavailable
-                _, tpb = ctx.get_max_potential_block_size(**kwargs)
-            except AttributeError:
-                # Fallback to table-based approach.
-                tpb = self._fallback_autotune_best(kernel)
-                raise
+            _, tpb = ctx.get_max_potential_block_size(**kwargs)
             return tpb
 
-    def _fallback_autotune_best(self, kernel):
-        try:
-            tpb = kernel.autotune.best()
-        except ValueError:
-            warnings.warn('Could not autotune, using default tpb of 128')
-            tpb = 128
-
-        return tpb
-
 
 class CUDAKernelBase(object):
     """Define interface for configurable kernels
     """
 
     def __init__(self):
-        self.griddim = (1, 1)
-        self.blockdim = (1, 1, 1)
+        self.griddim = None
+        self.blockdim = None
         self.sharedmem = 0
         self.stream = 0
 
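Note on the hunk above: with griddim and blockdim now defaulting to None instead of placeholder shapes, and the table-based autotuner fallback gone, a block size comes either from the user's launch configuration or from the driver's get_max_potential_block_size query. A minimal usage sketch of what callers are expected to do after this change; the kernel, array, and launch shape below are illustrative and not part of the patch:

import numpy as np
from numba import cuda

@cuda.jit
def inc_by_one(x):
    # One thread per element; guard the tail of the last block.
    i = cuda.grid(1)
    if i < x.size:
        x[i] += 1

d_x = cuda.to_device(np.zeros(256, dtype=np.float32))

# A launch now needs an explicit [griddim, blockdim] configuration;
# an unconfigured call such as inc_by_one(d_x) is rejected.
inc_by_one[4, 64](d_x)    # 4 blocks of 64 threads = 256 threads
cuda.synchronize()
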
@@ -531,9 +515,10 @@ def __reduce__(self):
 
     def __call__(self, *args, **kwargs):
         assert not kwargs
+        griddim, blockdim = normalize_kernel_dimensions(self.griddim, self.blockdim)
         self._kernel_call(args=args,
-                          griddim=self.griddim,
-                          blockdim=self.blockdim,
+                          griddim=griddim,
+                          blockdim=blockdim,
                           stream=self.stream,
                           sharedmem=self.sharedmem)
 
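Note on the hunk above: normalize_kernel_dimensions, imported from .errors in the first hunk, is what turns an unconfigured launch into an error before _kernel_call runs. A rough sketch of the kind of validation such a helper performs, assuming None dimensions are rejected and scalar dimensions are promoted to tuples; the helper name and error message are illustrative, not the actual implementation:

def normalize_dims_sketch(griddim, blockdim):
    # Illustrative stand-in, not numba.cuda.errors.normalize_kernel_dimensions.
    if griddim is None or blockdim is None:
        # The new unconfigured defaults (None, None) end up here.
        raise ValueError("kernel launched without a launch configuration; "
                         "use kernel[griddim, blockdim](...)")
    # Promote scalars so later code can always treat the dims as tuples.
    griddim = (griddim,) if isinstance(griddim, int) else tuple(griddim)
    blockdim = (blockdim,) if isinstance(blockdim, int) else tuple(blockdim)
    return griddim, blockdim
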
@@ -715,34 +700,6 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs):
         else:
             raise NotImplementedError(ty, val)
 
-    @property
-    def autotune(self):
-        """Return the autotuner object associated with this kernel."""
-        warnings.warn(_deprec_warn_msg.format('autotune'), DeprecationWarning)
-        has_autotune = hasattr(self, '_autotune')
-        if has_autotune and self._autotune.dynsmem == self.sharedmem:
-            return self._autotune
-        else:
-            # Get CUDA Function
-            cufunc = self._func.get()
-            at = AutoTuner(info=cufunc.attrs, cc=cufunc.device.compute_capability)
-            self._autotune = at
-            return self._autotune
-
-    @property
-    def occupancy(self):
-        """Occupancy is the ratio of the number of active warps per multiprocessor to the maximum
-        number of warps that can be active on the multiprocessor at once.
-        Calculate the theoretical occupancy of the kernel given the
-        current configuration."""
-        warnings.warn(_deprec_warn_msg.format('occupancy'), DeprecationWarning)
-        thread_per_block = reduce(operator.mul, self.blockdim, 1)
-        return self.autotune.closest(thread_per_block)
-
-
-_deprec_warn_msg = ("The .{} attribute is is deprecated and will be "
-                    "removed in a future release")
-
 
 class AutoJitCUDAKernel(CUDAKernelBase):
     '''
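
Note on the hunk above: the removed occupancy property defined occupancy as the ratio of active warps per multiprocessor to the maximum number of warps that can be resident at once, and derived threads per block with reduce(operator.mul, self.blockdim, 1). A small worked example of that arithmetic; the per-SM figures are assumed for illustration, not queried from any real device:

import operator
from functools import reduce

blockdim = (16, 16, 1)                                 # example launch shape
threads_per_block = reduce(operator.mul, blockdim, 1)  # 16 * 16 * 1 = 256
warps_per_block = -(-threads_per_block // 32)          # ceil(256 / 32) = 8

# Assumed figures for a hypothetical multiprocessor:
active_warps_per_sm = 48
max_warps_per_sm = 64
occupancy = active_warps_per_sm / max_warps_per_sm     # 0.75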

0 comments on commit 5c4c82d
