Merge pull request #5136 from gmarkall/grm-default-stream

CUDA: Enable asynchronous operations on the default stream
numba · Feb 26, 2020 · f504112 · f504112
2 parents c80caf6 + ec38019
commit f504112
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 5 deletions.
diff --git a/docs/source/cuda-reference/host.rst b/docs/source/cuda-reference/host.rst
@@ -141,10 +141,14 @@ transfers and kernel execution. For further details on streams, see the `CUDA C
 Programming Guide Streams section
 <http://docs.nvidia.com/cuda/cuda-c-programming-guide/#streams>`_.
 
-To create a stream:
+To create a new stream:
 
 .. autofunction:: numba.cuda.stream
 
+To get the default stream:
+
+.. autofunction:: numba.cuda.default_stream
+
 Streams are instances of :class:`numba.cuda.cudadrv.driver.Stream`:
 
 .. autoclass:: numba.cuda.cudadrv.driver.Stream

diff --git a/docs/source/cuda/memory.rst b/docs/source/cuda/memory.rst
@@ -56,9 +56,16 @@ Pinned memory
 Streams
 =======
 
+Streams can be passed to functions that accept them (e.g. copies between the
+host and device) and into kernel launch configurations so that the operations
+are executed asynchronously.
+
 .. autofunction:: numba.cuda.stream
    :noindex:
 
+.. autofunction:: numba.cuda.default_stream
+   :noindex:
+
 CUDA streams have the following methods:
 
 .. autoclass:: numba.cuda.cudadrv.driver.Stream
@@ -184,4 +191,4 @@ Sometimes, it is desired to defer resource deallocation until a code section
 ends.  Most often, users want to avoid any implicit synchronization due to
 deallocation.  This can be done by using the following context manager:
 
-.. autofunction:: numba.cuda.defer_cleanup
+.. autofunction:: numba.cuda.defer_cleanup
diff --git a/numba/cuda/api.py b/numba/cuda/api.py
@@ -269,6 +269,14 @@ def stream():
     """
     return current_context().create_stream()
 
+@require_context
+def default_stream():
+    """default_stream()
+
+    Get the default CUDA stream.
+    """
+    return current_context().get_default_stream()
+
 # Page lock
 @require_context
 @contextlib.contextmanager

diff --git a/numba/cuda/cudadrv/driver.py b/numba/cuda/cudadrv/driver.py
@@ -891,6 +891,9 @@ def create_module_image(self, image):
     def unload_module(self, module):
         del self.modules[module.handle.value]
 
+    def get_default_stream(self):
+        return Stream(weakref.proxy(self), drvapi.cu_stream(0), None)
+
     def create_stream(self):
         handle = drvapi.cu_stream()
         driver.cuStreamCreate(byref(handle), 0)
@@ -1407,10 +1410,14 @@ def __init__(self, context, handle, finalizer):
             weakref.finalize(self, finalizer)
 
     def __int__(self):
-        return self.handle.value
+        # handle.value is `None` for the default (null-handle) stream; map to 0
+        return self.handle.value or 0
 
     def __repr__(self):
-        return "<CUDA stream %d on %s>" % (self.handle.value, self.context)
+        if self.handle.value:
+            return "<CUDA stream %d on %s>" % (self.handle.value, self.context)
+        else:
+            return "<Default CUDA stream on %s>" % self.context
 
     def synchronize(self):
         '''

diff --git a/numba/cuda/tests/cudadrv/test_cuda_driver.py b/numba/cuda/tests/cudadrv/test_cuda_driver.py
@@ -91,7 +91,7 @@ def test_cuda_driver_basic(self):
 
         module.unload()
 
-    def test_cuda_driver_stream(self):
+    def test_cuda_driver_stream_operations(self):
         module = self.context.create_module_ptx(self.ptx)
         function = module.get_function('_Z10helloworldPi')
 
@@ -111,6 +111,24 @@ def test_cuda_driver_stream(self):
         for i, v in enumerate(array):
             self.assertEqual(i, v)
 
+    def test_cuda_driver_default_stream(self):
+        # Test properties of the default stream
+        ds = self.context.get_default_stream()
+        self.assertIn("Default CUDA stream", repr(ds))
+        self.assertEqual(0, int(ds))
+        # bool(stream) is the check that memcpy uses to decide whether the
+        # async version should be used, so the default (0) stream must be
+        # truthy even though 0 is normally falsy in Python.
+        self.assertTrue(ds)
+
+    def test_cuda_driver_stream(self):
+        # Test properties of non-default streams
+        s = self.context.create_stream()
+        self.assertIn("CUDA stream", repr(s))
+        self.assertNotIn("Default", repr(s))
+        self.assertNotEqual(0, int(s))
+        self.assertTrue(s)
+
     def test_cuda_driver_occupancy(self):
         module = self.context.create_module_ptx(self.ptx)
         function = module.get_function('_Z10helloworldPi')