Support NVIDIA's CUDA Python bindings #7461

Merged
merged 77 commits on Nov 24, 2021
Changes from 73 commits
77 commits
ccfce50
CUDA: Start of support for CUDA Python bindings
gmarkall Sep 7, 2021
c51a9d1
CUDA testsuite runs with CUDA Python bindings
gmarkall Sep 7, 2021
8db1185
Implement memory allocation for CUDA Python
gmarkall Sep 7, 2021
1244c4e
Some fixes for views with CUDA Python
gmarkall Sep 7, 2021
2ba0e4b
CUDA: Add framework for two separate linker implementations
gmarkall Sep 7, 2021
cd57e37
CUDA: Implement load_module_image with CUDA Python bindings
gmarkall Sep 7, 2021
cb3d680
Implement modules and functions for CUDA Python
gmarkall Sep 8, 2021
b4e93c6
Correct argument preparation with CUDA Python
gmarkall Sep 8, 2021
e4db9a5
Kernel launches now starting with CUDA Python
gmarkall Sep 8, 2021
bd539e8
Skip record test with CUDA Python
gmarkall Sep 8, 2021
0670f2d
Fix block size and occupancy functions and kernel launch for CUDA Python
gmarkall Sep 8, 2021
5e69273
CUDA: Only handle device pointers in launch_kernel
gmarkall Sep 9, 2021
84ce354
Fix CUDA Python stream creation and skip some tests
gmarkall Sep 9, 2021
d25ee6e
Revert changes to context stack
gmarkall Sep 9, 2021
03dd908
CUDA Python fixes for IPC, streams, and CAI
gmarkall Sep 9, 2021
c033420
CUDA Python IPC and context fixes
gmarkall Sep 9, 2021
d1c1eb9
CUDA Python host allocation fixes
gmarkall Sep 9, 2021
96f617a
CUDA Python host allocation fixes
gmarkall Sep 9, 2021
e1dfe1e
Fix managed allocation with CUDA Python
gmarkall Sep 9, 2021
afa0d1b
Fix views with CUDA Python
gmarkall Sep 9, 2021
0b974c7
Some CUDA Python IPC fixes
gmarkall Sep 9, 2021
991b58f
Fix test_cuda_memory with CUDA Python
gmarkall Sep 9, 2021
92e053d
Fix record argument passing with CUDA Python
gmarkall Sep 9, 2021
24d7b48
Unskip remaining skipped CUDA Python tests
gmarkall Sep 9, 2021
00d568c
Fix CUDA driver tests with CUDA Python
gmarkall Sep 10, 2021
5d757be
Fix a couple of CAI tests with CUDA Python
gmarkall Sep 10, 2021
55161ec
Fix CUDA Array Interface tests with CUDA Python
gmarkall Sep 16, 2021
b1ef00f
Mark PTDS as unsupported with CUDA Python
gmarkall Sep 16, 2021
58fd927
Fix context stack tests with CUDA Python
gmarkall Sep 16, 2021
eb959b2
Add file extension map for CUDA Python
gmarkall Sep 28, 2021
be49f23
Fix async callbacks for CUDA Python
gmarkall Sep 28, 2021
217b658
Fix event recording for CUDA Python
gmarkall Sep 28, 2021
3d07299
Fix device_memory_size for CUDA Python
gmarkall Sep 28, 2021
cca4e4e
Fix test_managed_alloc for CUDA Python
gmarkall Sep 28, 2021
7c9b3c3
Fix a few more CUDA Python fails
gmarkall Sep 28, 2021
93cc0f1
Fix remaining CUDA Python test fails
gmarkall Sep 29, 2021
08182d0
Fix import when CUDA Python not available
gmarkall Sep 29, 2021
25eb71c
Merge remote-tracking branch 'numba/master' into cuda-python
gmarkall Sep 29, 2021
0dd03dd
Small comment and whitespace change undo
gmarkall Sep 29, 2021
7cc1f53
Merge remote-tracking branch 'numba/master' into cuda-python
gmarkall Oct 6, 2021
3b0a363
Reuse alloc_key for allocations key in memhostalloc
gmarkall Oct 6, 2021
6404dfb
Simplify getting pointers for ctypes functions
gmarkall Oct 6, 2021
0c6ed5b
Don't use CUDA Python by default
gmarkall Oct 6, 2021
c2d4d8d
Remove some dead code
gmarkall Oct 6, 2021
12effed
Document CUDA Python environment variable
gmarkall Oct 6, 2021
379ac22
Merge remote-tracking branch 'numba/master' into cuda-python
gmarkall Nov 1, 2021
7847d51
driver.py: rename cuda_driver to binding (PR #7461 feedback)
gmarkall Nov 1, 2021
4beb7dd
Rename CUDA_USE_CUDA_PYTHON to CUDA_USE_NV_BINDING
gmarkall Nov 1, 2021
caf34c0
Update docs for CUDA_USE_NVIDIA_BINDING
gmarkall Nov 1, 2021
004d74a
CUDA driver: Use defined values instead of magic numbers for streams
gmarkall Nov 1, 2021
c3e7fdb
CUDA driver error checking: factor out fork detection
gmarkall Nov 1, 2021
2eef758
Use CU_STREAM_DEFAULT in Stream.__repr__
gmarkall Nov 1, 2021
af14cc8
Fix spelling of CU_JIT_INPUT_FATBINARY
gmarkall Nov 1, 2021
d110d1c
Add docstring to add_file_guess_ext
gmarkall Nov 1, 2021
3d964cd
CUDA: Remove a needless del from the Ctypes linker
gmarkall Nov 1, 2021
84d46d4
Some small fixups from PR #7461 feedback
gmarkall Nov 1, 2021
d4d5176
CUDA: Fix simulator by adding missing USE_NV_BINDING to simulator
gmarkall Nov 1, 2021
96776f7
CUDA: Use helper function in test_derived_pointer
gmarkall Nov 1, 2021
0a7a8d8
Re-enable profiler with CUDA Python
gmarkall Nov 1, 2021
0617911
Update documentation for NVIDIA bindings
gmarkall Nov 3, 2021
6eb1924
Merge remote-tracking branch 'numba/master' into cuda-python
gmarkall Nov 4, 2021
43f3ae7
PR #7461 feedback on deprecation wording
gmarkall Nov 8, 2021
57413cf
Merge remote-tracking branch 'numba/master' into cuda-python
gmarkall Nov 22, 2021
37ef39b
CUDA: Add function to get driver version
gmarkall Nov 22, 2021
771bc38
Report CUDA binding availability and use in Numba sysinfo
gmarkall Nov 22, 2021
81809f7
Merge remote-tracking branch 'gmarkall/cuda-python' into cuda-python
gmarkall Nov 22, 2021
5699b91
CUDA: Attempt to test with NVIDIA binding on CUDA 11.4
gmarkall Nov 22, 2021
dd57c0c
CUDA: Add docs for NVIDIA binding support
gmarkall Nov 22, 2021
b86d4fa
Correct spelling of NUMBA_CUDA_USE_NVIDIA_BINDING
gmarkall Nov 22, 2021
29b3ea8
Revert "Re-enable profiler with CUDA Python"
gmarkall Nov 22, 2021
22a4b74
CUDA docs: Note that profiler not supported with NV bindings
gmarkall Nov 22, 2021
1b59892
Correct mis-spelled env var in docs
gmarkall Nov 23, 2021
60321d5
Update CUDA docs based on PR #7461 feedback
gmarkall Nov 23, 2021
1ca9acb
Warn when NVIDIA bindings requested but not found
gmarkall Nov 23, 2021
4a50d0c
Mention env var in NVIDIA bindings warning
gmarkall Nov 23, 2021
8ab1535
Update NV binding env var docs
gmarkall Nov 23, 2021
f93c602
Update numba/core/config.py
gmarkall Nov 23, 2021
14 changes: 14 additions & 0 deletions buildscripts/gpuci/build.sh
@@ -16,6 +16,14 @@ cd "$WORKSPACE"
# Determine CUDA release version
export CUDA_REL=${CUDA_VERSION%.*}

# Test with NVIDIA Bindings on CUDA 11.4
if [ $CUDA_TOOLKIT_VER == "11.4" ]
then
export NUMBA_CUDA_USE_NVIDIA_BINDING=1;
else
export NUMBA_CUDA_USE_NVIDIA_BINDING=0;
fi;

################################################################################
# SETUP - Check environment
################################################################################
@@ -41,6 +49,12 @@ gpuci_mamba_retry create -n numba_ci -y \

conda activate numba_ci

if [ $NUMBA_CUDA_USE_NVIDIA_BINDING == "1" ]
then
gpuci_logger "Install NVIDIA CUDA Python bindings";
gpuci_mamba_retry install nvidia::cuda-python;
fi;

gpuci_logger "Install numba"
python setup.py develop

31 changes: 31 additions & 0 deletions docs/source/cuda/bindings.rst
@@ -0,0 +1,31 @@
CUDA Bindings
=============

Numba supports two bindings to the CUDA Driver APIs: its own internal bindings
based on ctypes, and the official `NVIDIA CUDA Python bindings
<https://nvidia.github.io/cuda-python/>`_. Functionality is equivalent between
the two bindings, with two exceptions:

* The NVIDIA bindings presently do not support Per-Thread Default Streams
(PTDS), and an exception will be raised on import if PTDS is enabled along
with the NVIDIA bindings.
* The profiling APIs are not available with the NVIDIA bindings.

The internal bindings are used by default. If the NVIDIA bindings are installed,
then they can be used by setting the environment variable
``NUMBA_CUDA_USE_NVIDIA_BINDING`` to ``1`` prior to the import of Numba. Once
Numba has been imported, the selected binding cannot be changed.


Roadmap
-------

In future versions of Numba:

- The NVIDIA Bindings will be used by default, if they are installed.
- The internal bindings will be deprecated.
- The internal bindings will be removed.

It is expected that the NVIDIA bindings will be the default in Numba 0.56; at
present, no specific release is planned for the deprecation or removal of the
internal bindings.
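
The binding selection described above has to happen before Numba is first imported. A minimal sketch of what that looks like in practice (assumes ``cuda-python`` is installed and a CUDA-capable GPU is present)::

    import os

    # Select the NVIDIA binding; this must run before the first import of Numba.
    os.environ["NUMBA_CUDA_USE_NVIDIA_BINDING"] = "1"

    import numpy as np
    from numba import cuda

    @cuda.jit
    def add_one(x):
        i = cuda.grid(1)
        if i < x.size:
            x[i] += 1

    arr = cuda.to_device(np.zeros(16))
    add_one[1, 16](arr)
    print(arr.copy_to_host())

Once this has run, the selection is fixed for the lifetime of the process, matching the note above that the binding cannot be changed after Numba is imported.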
1 change: 1 addition & 0 deletions docs/source/cuda/index.rst
@@ -23,4 +23,5 @@ Numba for CUDA GPUs
ipc.rst
cuda_array_interface.rst
external-memory.rst
bindings.rst
faq.rst
23 changes: 23 additions & 0 deletions docs/source/cuda/overview.rst
@@ -57,6 +57,29 @@ If you are not using Conda or if you want to use a different version of CUDA
toolkit, the following describe how Numba searches for a CUDA toolkit
installation.

.. _cuda-bindings:

CUDA Bindings
~~~~~~~~~~~~~

Numba supports interacting with the CUDA Driver API via the `NVIDIA CUDA Python
bindings <https://nvidia.github.io/cuda-python/>`_ and its own ctypes-based
binding. The ctypes-based binding is presently the default as Per-Thread
Default Streams and the profiler APIs are not supported with the NVIDIA
bindings, but otherwise functionality is equivalent between the two. You can
install the NVIDIA bindings with::

$ conda install nvidia::cuda-python

if you are using Conda, or::

$ pip install cuda-python

if you are using pip.

The use of the NVIDIA bindings is enabled by setting the environment variable
:envvar:`NUMBA_CUDA_USE_NVIDIA_BINDING` to ``"1"``.

.. _cudatoolkit-lookup:

Setting CUDA Installation Path
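A small companion sketch to the overview text above: checking at runtime which binding Numba actually picked up. ``USE_NV_BINDING`` is the module-level flag introduced by this PR in ``numba.cuda.cudadrv.driver``; treat the check as illustrative rather than a stable public API::

    import os
    os.environ.setdefault("NUMBA_CUDA_USE_NVIDIA_BINDING", "1")

    from numba.cuda.cudadrv import driver

    if driver.USE_NV_BINDING:
        print("Using the NVIDIA cuda-python binding")
    else:
        print("Using Numba's internal ctypes binding")
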
9 changes: 8 additions & 1 deletion docs/source/reference/envvars.rst
@@ -516,11 +516,18 @@ GPU support
heuristic needs to check the number of SMs available on the device in the
current context.

.. envvar:: CUDA_WARN_ON_IMPLICIT_COPY
.. envvar:: NUMBA_CUDA_WARN_ON_IMPLICIT_COPY

Enable warnings if a kernel is launched with host memory, which forces a copy to and
from the device. This option is on by default (default value is 1).
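
For context, a minimal sketch of the behaviour this warning guards: launching a kernel on a host NumPy array forces a copy to the device and back, whereas passing a device array does not (the exact warning text may vary between Numba versions)::

    import numpy as np
    from numba import cuda

    @cuda.jit
    def scale(x):
        i = cuda.grid(1)
        if i < x.size:
            x[i] *= 2.0

    host_arr = np.ones(1024)
    scale[8, 128](host_arr)        # implicit copies; emits a performance warning by default

    dev_arr = cuda.to_device(host_arr)
    scale[8, 128](dev_arr)         # no implicit copy, no warning
    result = dev_arr.copy_to_host()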

.. envvar:: NUMBA_CUDA_USE_NVIDIA_BINDING

When set to 1, Numba will use the `NVIDIA CUDA Python binding
<https://nvidia.github.io/cuda-python/>`_ to make calls to the driver API
instead of using its own ctypes binding. This defaults to 0 (off), as the
NVIDIA binding is currently missing support for Per-Thread Default
Streams.

Threading Control
-----------------
2 changes: 2 additions & 0 deletions docs/source/user/installing.rst
@@ -246,6 +246,8 @@ vary with target operating system and hardware. The following lists them all
Python 3.7.
* ``typeguard`` - used by ``runtests.py`` for
:ref:`runtime type-checking <type_anno_check>`.
* ``cuda-python`` - The NVIDIA CUDA Python bindings. See :ref:`cuda-bindings`.
Numba is tested with Version 11.5 of the bindings.

* To build the documentation:

18 changes: 18 additions & 0 deletions numba/core/config.py
@@ -119,6 +119,20 @@ def update(self, force=False):
# Store a copy
self.old_environ = dict(new_environ)

self.validate()

def validate(self):
if CUDA_USE_NVIDIA_BINDING: # noqa: F821
try:
import cuda # noqa: F401
except ImportError as ie:
msg = ("CUDA Python bindings requested, "
"but they are not importable")
raise RuntimeError(msg) from ie

if CUDA_PER_THREAD_DEFAULT_STREAM: # noqa: F821
warnings.warn("PTDS is not supported with CUDA Python")

def process_environ(self, environ):
def _readenv(name, ctor, default):
value = environ.get(name)
@@ -170,6 +184,10 @@ def optional_str(x):
CUDA_LOW_OCCUPANCY_WARNINGS = _readenv(
"NUMBA_CUDA_LOW_OCCUPANCY_WARNINGS", int, 1)

# Whether to use the official CUDA Python API Bindings
CUDA_USE_NVIDIA_BINDING = _readenv(
"NUMBA_CUDA_USE_NVIDIA_BINDING", int, 0)

# Debug flag to control compiler debug print
DEBUG = _readenv("NUMBA_DEBUG", int, 0)

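The ``validate`` logic above amounts to two checks: the ``cuda`` package must be importable when the NVIDIA binding is requested, and PTDS is reported as unsupported. A standalone sketch of the same idea, with names chosen for illustration rather than taken from Numba's internals::

    import os
    import warnings

    def check_cuda_binding_config():
        use_nv_binding = int(os.environ.get("NUMBA_CUDA_USE_NVIDIA_BINDING", "0"))
        use_ptds = int(os.environ.get("NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM", "0"))

        if use_nv_binding:
            try:
                import cuda  # noqa: F401  -- the NVIDIA cuda-python package
            except ImportError as ie:
                raise RuntimeError("CUDA Python bindings requested, "
                                   "but they are not importable") from ie
            if use_ptds:
                warnings.warn("PTDS is not supported with the NVIDIA binding")

    check_cuda_binding_config()
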
8 changes: 6 additions & 2 deletions numba/cuda/api.py
@@ -227,9 +227,13 @@ def open_ipc_array(handle, shape, dtype, strides=None, offset=0):
# compute size
size = np.prod(shape) * dtype.itemsize
# manually recreate the IPC mem handle
handle = driver.drvapi.cu_ipc_mem_handle(*handle)
if driver.USE_NV_BINDING:
driver_handle = driver.binding.CUipcMemHandle()
driver_handle.reserved = handle
else:
driver_handle = driver.drvapi.cu_ipc_mem_handle(*handle)
# use *IpcHandle* to open the IPC memory
ipchandle = driver.IpcHandle(None, handle, size, offset=offset)
ipchandle = driver.IpcHandle(None, driver_handle, size, offset=offset)
yield ipchandle.open_array(current_context(), shape=shape,
strides=strides, dtype=dtype)
ipchandle.close()
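For reference, the workflow that ``open_ipc_array`` and ``IpcHandle`` support looks roughly like the following two-process sketch, based on Numba's documented IPC API (Linux only; the handle must be transferred between processes, e.g. by pickling)::

    import numpy as np
    from numba import cuda

    # Producer process: allocate device memory and export an IPC handle.
    darr = cuda.to_device(np.arange(16, dtype=np.float64))
    ipch = darr.get_ipc_handle()   # picklable; send it to another process

    # Consumer process: open the handle as a device array view.
    with ipch as remote_darr:
        print(remote_darr.copy_to_host())
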
2 changes: 1 addition & 1 deletion numba/cuda/codegen.py
@@ -167,7 +167,7 @@ def get_cubin(self, cc=None):
if cubin:
return cubin

linker = driver.Linker(max_registers=self._max_registers, cc=cc)
linker = driver.Linker.new(max_registers=self._max_registers, cc=cc)

ptxes = self._get_ptxes(cc=cc)
for ptx in ptxes:
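The change from ``driver.Linker(...)`` to ``driver.Linker.new(...)`` reflects that there are now two linker implementations, one per binding, chosen behind a factory method. A rough sketch of that pattern; the class layout is illustrative rather than a copy of Numba's internals::

    USE_NV_BINDING = False  # in Numba this comes from the configuration

    class Linker:
        """Base linker; construct concrete instances via Linker.new()."""
        @classmethod
        def new(cls, max_registers=0, cc=None):
            # Select the implementation for the active driver binding.
            if USE_NV_BINDING:
                return CudaPythonLinker(max_registers, cc)
            return CtypesLinker(max_registers, cc)

    class CtypesLinker(Linker):
        def __init__(self, max_registers=0, cc=None):
            self.max_registers, self.cc = max_registers, cc
            # ... drive cuLinkCreate/cuLinkAddData through the ctypes binding

    class CudaPythonLinker(Linker):
        def __init__(self, max_registers=0, cc=None):
            self.max_registers, self.cc = max_registers, cc
            # ... drive the same driver APIs through cuda-python

    linker = Linker.new(max_registers=32, cc=(7, 5))
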
21 changes: 18 additions & 3 deletions numba/cuda/compiler.py
@@ -563,7 +563,12 @@ def launch(self, args, griddim, blockdim, stream=0, sharedmem=0):
for t, v in zip(self.argument_types, args):
self._prepare_args(t, v, stream, retr, kernelargs)

stream_handle = stream and stream.handle or None
if driver.USE_NV_BINDING:
zero_stream = driver.binding.CUstream(0)
else:
zero_stream = None

stream_handle = stream and stream.handle or zero_stream

# Invoke kernel
driver.launch_kernel(cufunc.handle,
@@ -634,7 +639,14 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs):
parent = ctypes.c_void_p(0)
nitems = c_intp(devary.size)
itemsize = c_intp(devary.dtype.itemsize)
data = ctypes.c_void_p(driver.device_pointer(devary))

ptr = driver.device_pointer(devary)

if driver.USE_NV_BINDING:
ptr = int(ptr)

data = ctypes.c_void_p(ptr)

kernelargs.append(meminfo)
kernelargs.append(parent)
kernelargs.append(nitems)
@@ -674,7 +686,10 @@ def _prepare_args(self, ty, val, stream, retr, kernelargs):

elif isinstance(ty, types.Record):
devrec = wrap_arg(val).to_device(retr, stream)
kernelargs.append(devrec)
ptr = devrec.device_ctypes_pointer
if driver.USE_NV_BINDING:
ptr = ctypes.c_void_p(int(ptr))
kernelargs.append(ptr)

elif isinstance(ty, types.BaseTuple):
assert len(ty) == len(val)
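The ``int(ptr)`` conversions above exist because the two bindings represent device pointers differently: the internal binding yields ``ctypes``-style pointers with a ``.value``, while cuda-python yields ``CUdeviceptr`` objects that convert to a plain integer with ``int()``. A hypothetical helper (not Numba API) that normalises either form before it is packed into the kernel argument list::

    import ctypes

    def as_c_void_p(ptr):
        """Return a ctypes.c_void_p for a device pointer from either binding."""
        if isinstance(ptr, ctypes.c_void_p):
            return ptr                       # internal ctypes binding
        return ctypes.c_void_p(int(ptr))     # cuda-python CUdeviceptr, or a plain int
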
23 changes: 18 additions & 5 deletions numba/cuda/cudadrv/devicearray.py
@@ -105,19 +105,29 @@ def __init__(self, shape, strides, dtype, stream=0, gpu_data=None):
self.alloc_size = _driver.device_memory_size(gpu_data)
else:
# Make NULL pointer for empty allocation
if _driver.USE_NV_BINDING:
null = _driver.binding.CUdeviceptr(0)
else:
null = c_void_p(0)
gpu_data = _driver.MemoryPointer(context=devices.get_context(),
pointer=c_void_p(0), size=0)
pointer=null, size=0)
self.alloc_size = 0

self.gpu_data = gpu_data
self.stream = stream

@property
def __cuda_array_interface__(self):
if self.device_ctypes_pointer.value is not None:
ptr = self.device_ctypes_pointer.value
else:
ptr = 0
if _driver.USE_NV_BINDING:
if self.device_ctypes_pointer is not None:
ptr = int(self.device_ctypes_pointer)
else:
ptr = 0
else:
if self.device_ctypes_pointer.value is not None:
ptr = self.device_ctypes_pointer.value
else:
ptr = 0

return {
'shape': tuple(self.shape),
@@ -191,7 +201,10 @@ def device_ctypes_pointer(self):
"""Returns the ctypes pointer to the GPU data buffer
"""
if self.gpu_data is None:
return c_void_p(0)
if _driver.USE_NV_BINDING:
return _driver.binding.CUdeviceptr(0)
else:
return c_void_p(0)
else:
return self.gpu_data.device_ctypes_pointer

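Whichever branch runs in ``__cuda_array_interface__`` above, the result is the raw device pointer reported as a plain integer. A quick sketch of inspecting the interface on a device array (requires a GPU; the pointer value shown is of course machine-specific)::

    import numpy as np
    from numba import cuda

    darr = cuda.device_array(8, dtype=np.float32)
    cai = darr.__cuda_array_interface__
    print(cai["shape"], cai["typestr"], cai["data"])
    # e.g. (8,) '<f4' (139637124366336, False)
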
14 changes: 11 additions & 3 deletions numba/cuda/cudadrv/devices.py
@@ -14,7 +14,7 @@
import threading
from contextlib import contextmanager

from .driver import driver
from .driver import driver, USE_NV_BINDING


class _DeviceList(object):
@@ -139,6 +139,8 @@ def get_or_create_context(self, devnum):
else:
return attached_ctx
else:
if USE_NV_BINDING:
devnum = int(devnum)
return self._activate_context_for(devnum)

def _get_or_create_context_uncached(self, devnum):
@@ -155,10 +157,16 @@ def _get_or_create_context_uncached(self, devnum):
# Get primary context for the active device
ctx = self.gpus[ac.devnum].get_primary_context()
# Is active context the primary context?
if ctx.handle.value != ac.context_handle.value:
if USE_NV_BINDING:
ctx_handle = int(ctx.handle)
ac_ctx_handle = int(ac.context_handle)
else:
ctx_handle = ctx.handle.value
ac_ctx_handle = ac.context_handle.value
if ctx_handle != ac_ctx_handle:
msg = ('Numba cannot operate on non-primary'
' CUDA context {:x}')
raise RuntimeError(msg.format(ac.context_handle.value))
raise RuntimeError(msg.format(ac_ctx_handle))
# Ensure the context is ready
ctx.prepare_for_use()
return ctx
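The primary-context check above applies the same normalisation idea as the pointer handling earlier in the PR, this time to context handles: ``.value`` for the ctypes binding, ``int()`` for cuda-python. A hypothetical helper expressing the comparison::

    def handle_as_int(handle, use_nv_binding):
        """Normalise a context handle from either binding to a plain int."""
        return int(handle) if use_nv_binding else handle.value

    # The check in _get_or_create_context_uncached is then effectively:
    #   handle_as_int(ctx.handle, USE_NV_BINDING) != handle_as_int(ac.context_handle, USE_NV_BINDING)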