Prevent kernel launch with no configuration, remove autotuner #5061

Merged (10 commits) on Feb 27, 2020
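In short, a kernel launch now requires an explicit configuration: `griddim` and `blockdim` default to `None` and are validated by `normalize_kernel_dimensions` at call time, instead of silently falling back to a single-thread launch of griddim `(1, 1)` and blockdim `(1, 1, 1)` or to an autotuned block size. A minimal sketch of the intended usage (the kernel and sizes are illustrative, not taken from this PR):

```python
from numba import cuda
import numpy as np

@cuda.jit
def axpy(r, a, x, y):
    i = cuda.grid(1)
    if i < r.size:
        r[i] = a * x[i] + y[i]

n = 1 << 16
x = np.ones(n, dtype=np.float32)
y = np.ones(n, dtype=np.float32)
r = np.zeros_like(x)

threads_per_block = 128
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

# Explicit launch configuration: fine before and after this change.
axpy[blocks_per_grid, threads_per_block](r, 2.0, x, y)

# Unconfigured launch: previously ran with a single thread; with this
# change it is rejected when the dimensions are normalized.
# axpy(r, 2.0, x, y)
```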
numba/cuda/compiler.py: 57 changes (7 additions, 50 deletions)
@@ -12,7 +12,6 @@
from numba.core import types, typing, utils, funcdesc, serialize, config, compiler, sigutils
from numba.core.compiler_lock import global_compiler_lock

from .cudadrv.autotune import AutoTuner
from .cudadrv.devices import get_context
from .cudadrv import nvvm, devicearray, driver
from .errors import normalize_kernel_dimensions
@@ -280,7 +279,7 @@ def _compute_thread_per_block(self, kernel):
# Prefer user-specified config
if tpb != 0:
return tpb
# Else, ask the driver to give a good cofnig
# Else, ask the driver to give a good config
else:
ctx = get_context()
kwargs = dict(
@@ -289,32 +288,17 @@ def _compute_thread_per_block(self, kernel):
memsize=self.sharedmem,
blocksizelimit=1024,
)
try:
# Raises from the driver if the feature is unavailable
_, tpb = ctx.get_max_potential_block_size(**kwargs)
except AttributeError:
# Fallback to table-based approach.
tpb = self._fallback_autotune_best(kernel)
raise
_, tpb = ctx.get_max_potential_block_size(**kwargs)
return tpb

def _fallback_autotune_best(self, kernel):
try:
tpb = kernel.autotune.best()
except ValueError:
warnings.warn('Could not autotune, using default tpb of 128')
tpb = 128

return tpb


class CUDAKernelBase(object):
"""Define interface for configurable kernels
"""

def __init__(self):
self.griddim = (1, 1)
self.blockdim = (1, 1, 1)
self.griddim = None
self.blockdim = None
self.sharedmem = 0
self.stream = 0
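With the table-based fallback gone, the automatic block-size path in the `_compute_thread_per_block` hunk above relies entirely on `ctx.get_max_potential_block_size`, i.e. the driver's occupancy calculator. A hedged sketch of how this path is typically reached from user code via `forall`, which picks the launch configuration itself (kernel and array size are illustrative):

```python
from numba import cuda
import numpy as np

@cuda.jit
def increment(x):
    i = cuda.grid(1)
    if i < x.size:
        x[i] += 1

data = cuda.to_device(np.zeros(100000, dtype=np.float32))

# forall(ntasks) computes griddim/blockdim automatically; after this PR the
# block size comes from the driver occupancy query, with no table fallback.
increment.forall(data.size)(data)
```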

@@ -531,9 +515,10 @@ def __reduce__(self):

def __call__(self, *args, **kwargs):
assert not kwargs
griddim, blockdim = normalize_kernel_dimensions(self.griddim, self.blockdim)
self._kernel_call(args=args,
griddim=self.griddim,
blockdim=self.blockdim,
griddim=griddim,
blockdim=blockdim,
stream=self.stream,
sharedmem=self.sharedmem)

@@ -715,34 +700,6 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs):
else:
raise NotImplementedError(ty, val)

@property
def autotune(self):
"""Return the autotuner object associated with this kernel."""
warnings.warn(_deprec_warn_msg.format('autotune'), DeprecationWarning)
has_autotune = hasattr(self, '_autotune')
if has_autotune and self._autotune.dynsmem == self.sharedmem:
return self._autotune
else:
# Get CUDA Function
cufunc = self._func.get()
at = AutoTuner(info=cufunc.attrs, cc=cufunc.device.compute_capability)
self._autotune = at
return self._autotune

@property
def occupancy(self):
"""Occupancy is the ratio of the number of active warps per multiprocessor to the maximum
number of warps that can be active on the multiprocessor at once.
Calculate the theoretical occupancy of the kernel given the
current configuration."""
warnings.warn(_deprec_warn_msg.format('occupancy'), DeprecationWarning)
thread_per_block = reduce(operator.mul, self.blockdim, 1)
return self.autotune.closest(thread_per_block)


_deprec_warn_msg = ("The .{} attribute is is deprecated and will be "
"removed in a future release")


class AutoJitCUDAKernel(CUDAKernelBase):
'''
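For reference, the ratio described in the removed `occupancy` docstring can still be worked out by hand; a small sketch using purely hypothetical numbers (not tied to any particular GPU):

```python
# Illustrative values only; real limits come from the device and the kernel.
threads_per_block = 256
warps_per_block = threads_per_block // 32             # 32 threads per warp -> 8 warps
resident_blocks_per_sm = 6                            # assumed occupancy-query result
max_warps_per_sm = 64                                 # assumed hardware limit
active_warps = resident_blocks_per_sm * warps_per_block   # 48
occupancy = active_warps / max_warps_per_sm               # 48 / 64 = 0.75
print(occupancy)
```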