Merge remote-tracking branch 'origin/master' into develop
anton-malakhov committed Sep 27, 2018
2 parents ffe2bf8 + ae1acab commit 753282b
Showing 13 changed files with 138 additions and 32 deletions.
3 changes: 3 additions & 0 deletions CHANGE_LOG
@@ -53,6 +53,7 @@ CUDA Enhancements:
* PR #3198: Fix GPU datetime timedelta types usage
* PR #3221: Support datetime/timedelta scalar argument to a CUDA kernel.
* PR #3259: Add DeviceNDArray.view method to reinterpret data as a different type.
* PR #3310: Fix IPC handling of sliced cuda array.

ROCm Enhancements:

@@ -126,9 +127,11 @@ Documentation Updates:
* PR #3274: Update installation instructions
* PR #3275: Note pyobject and voidptr are types in docs
* PR #3288: Do not need to call parallel optimizations "experimental" anymore
* PR #3318: Tweak spacing to avoid search box wrapping onto second line

Contributors:

* Anton Malakhov
* Alex Ford
* Anthony Bisulco
* Ehsan Totoni (core dev)
3 changes: 2 additions & 1 deletion buildscripts/azure/azure-linux.yml
@@ -77,7 +77,8 @@ jobs:

steps:
- script: |
if [ "$(uname)" == "Linux" ]; then sudo apt-get install -y libc6-dev-i386; fi
# Sleep 10 seconds because of race condition with background apt process
if [ "$(uname)" == "Linux" ]; then sleep 10; sudo apt-get install -y libc6-dev-i386; fi
echo "Installing Miniconda"
buildscripts/incremental/install_miniconda.sh
export PATH=$HOME/miniconda3/bin:$PATH
5 changes: 5 additions & 0 deletions docs/_static/numba-docs.css
@@ -23,6 +23,11 @@ body {
height: 135%;
}

.nav li a {
padding-left: 11px;
padding-right: 11px;
}

@media (min-width: 768px) {
.container {
width: 700px;
6 changes: 4 additions & 2 deletions docs/source/reference/numpysupported.rst
@@ -156,7 +156,8 @@ Other methods

The following methods of Numpy arrays are supported:

* :meth:`~numpy.ndarray.argsort` (without arguments)
* :meth:`~numpy.ndarray.argsort` (``kind`` keyword argument supported for
values ``'quicksort'`` and ``'mergesort'``)
* :meth:`~numpy.ndarray.astype` (only the 1-argument form)
* :meth:`~numpy.ndarray.copy` (without arguments)
* :meth:`~numpy.ndarray.dot` (only the 1-argument form)
@@ -247,7 +248,8 @@ Other functions
The following top-level functions are supported:

* :func:`numpy.arange`
* :func:`numpy.argsort` (no optional arguments)
* :func:`numpy.argsort` (``kind`` keyword argument supported for values
``'quicksort'`` and ``'mergesort'``)
* :func:`numpy.array` (only the first 2 arguments)
* :func:`numpy.asfortranarray` (only the first argument)
* :func:`numpy.atleast_1d`
12 changes: 11 additions & 1 deletion docs/source/reference/pysemantics.rst
@@ -1,8 +1,18 @@
.. _pysemantics:

Deviations from Python semantics
Deviations from Python Semantics
================================


Exceptions and Memory Allocation
--------------------------------

Due to limitations in the current compiler when handling exceptions, memory
allocated (almost always NumPy arrays) within a function that raises an
exception will **leak**. This is a known issue that will be fixed, but in the
meantime, it is best to do memory allocation outside of functions that can
also raise exceptions.
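
For example, performing the allocation in the caller avoids the leak (a
minimal sketch using hypothetical names)::

    import numpy as np
    from numba import njit

    @njit
    def scale_into(out, x, factor):
        # *out* is allocated by the caller, so nothing leaks if this raises
        if factor == 0:
            raise ValueError("factor must be non-zero")
        for i in range(x.size):
            out[i] = x[i] * factor

    x = np.arange(10.0)
    out = np.empty_like(x)  # allocation happens outside the jitted function
    scale_into(out, x, 2.0)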

Integer width
-------------

6 changes: 6 additions & 0 deletions docs/source/reference/pysupported.rst
@@ -8,6 +8,12 @@ Apart from the :ref:`pysupported-language` part below, which applies to both
:term:`object mode` and :term:`nopython mode`, this page only lists the
features supported in :term:`nopython mode`.

.. warning::
Numba behavior differs from Python semantics in some situations. We
strongly advise reviewing :ref:`pysemantics` to become familiar with these
differences.


.. _pysupported-language:

Language
24 changes: 24 additions & 0 deletions docs/source/user/faq.rst
@@ -197,6 +197,30 @@ compiled version on disk for later use.
A more radical alternative is :ref:`ahead-of-time compilation <pycc>`.


GPU Programming
===============

How do I work around the ``CUDA initialized before forking`` error?
-------------------------------------------------------------------

On Linux, the ``multiprocessing`` module in the Python standard library
defaults to using the ``fork`` method for creating new processes. Because of
the way process forking duplicates state between the parent and child
processes, CUDA will not work correctly in the child process if the CUDA
runtime was initialized *prior* to the fork. Numba detects this and raises a
``CudaDriverError`` with the message ``CUDA initialized before forking``.

One approach to avoid this error is to make all calls to ``numba.cuda``
functions inside the child processes or after the process pool is created.
However, this is not always possible, as you might want to query the number of
available GPUs before starting the process pool. In Python 3, you can change
the process start method, as described in the `multiprocessing documentation
<https://docs.python.org/3.6/library/multiprocessing.html#contexts-and-start-methods>`_.
Switching from ``fork`` to ``spawn`` or ``forkserver`` will avoid the CUDA
initialization issue, although the child processes will not inherit any global
variables from their parent.
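
For example, a minimal sketch (Python 3, with a hypothetical ``worker``
function) that queries the GPU count in the parent and still uses CUDA in
the child processes::

    import multiprocessing as mp

    from numba import cuda

    def worker(i):
        # CUDA is initialized here, inside the spawned child process
        cuda.select_device(0)
        return i

    if __name__ == '__main__':
        mp.set_start_method('spawn')  # avoid inheriting CUDA state via fork
        print(len(cuda.gpus))         # safe: CUDA may be used in the parent
        with mp.Pool(2) as pool:
            print(pool.map(worker, range(4)))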


Integration with other utilities
================================

4 changes: 2 additions & 2 deletions numba/cuda/api.py
@@ -162,7 +162,7 @@ def mapped_array(shape, dtype=np.float, strides=None, order='C', stream=0,

@contextlib.contextmanager
@require_context
def open_ipc_array(handle, shape, dtype, strides=None):
def open_ipc_array(handle, shape, dtype, strides=None, offset=0):
"""
A context manager that opens an IPC *handle* (*CUipcMemHandle*) that is
represented as a sequence of bytes (e.g. *bytes*, tuple of int)
@@ -180,7 +180,7 @@ def open_ipc_array(handle, shape, dtype, strides=None):
# manually recreate the IPC mem handle
handle = driver.drvapi.cu_ipc_mem_handle(*handle)
# use *IpcHandle* to open the IPC memory
ipchandle = driver.IpcHandle(None, handle, size)
ipchandle = driver.IpcHandle(None, handle, size, offset=offset)
yield ipchandle.open_array(current_context(), shape=shape,
strides=strides, dtype=dtype)
ipchandle.close()
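# Usage sketch (hypothetical values): the exporting process sends the
# serialized handle bytes together with shape/dtype/offset, and the
# importing process reconstructs the sliced array roughly as:
#
#     with cuda.open_ipc_array(handle_bytes, shape=(5,), dtype=np.intp,
#                              offset=offset) as darr:
#         result = darr.copy_to_host()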
1 change: 0 additions & 1 deletion numba/cuda/cudadrv/devicearray.py
@@ -552,7 +552,6 @@ def _do_setitem(self, key, value, stream=0):
_assign_kernel(lhs.ndim).forall(n_elements, stream=stream)(lhs, rhs)



class IpcArrayHandle(object):
"""
An IPC array handle that can be serialized and transferred to another process
25 changes: 13 additions & 12 deletions numba/cuda/cudadrv/driver.py
@@ -785,15 +785,13 @@ def get_ipc_handle(self, memory):
raise OSError('OS does not support CUDA IPC')
ipchandle = drvapi.cu_ipc_mem_handle()
driver.cuIpcGetMemHandle(
ctypes.cast(
ipchandle,
ctypes.POINTER(drvapi.cu_ipc_mem_handle),
),
memory.handle,
ctypes.byref(ipchandle),
memory.owner.handle,
)

source_info = self.device.get_device_identity()
return IpcHandle(memory, ipchandle, memory.size, source_info)
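# a sliced array's device pointer can sit inside a larger owning
# allocation; record its byte offset from the owner's base so the
# importing process can recover the slice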
offset = memory.handle.value - memory.owner.handle.value
return IpcHandle(memory, ipchandle, memory.size, source_info,
offset=offset)

def open_ipc_handle(self, handle, size):
# open the IPC handle to get the device pointer
@@ -972,6 +970,7 @@ def __init__(self, parent):
self.base = parent.base
self.handle = parent.handle
self.size = parent.size
self.offset = parent.offset
# remember if the handle is already opened
self._opened_mem = None

@@ -985,12 +984,12 @@ def open(self, context):
if self._opened_mem is not None:
raise ValueError('IpcHandle is already opened')

mem = context.open_ipc_handle(self.handle, self.size)
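# the IPC handle always refers to the owner's base allocation, so map
# offset + size bytes here and slice off a view at *offset* below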
mem = context.open_ipc_handle(self.handle, self.offset + self.size)
# this object owns the opened allocation
# note: it is required the memory be freed after the ipc handle is
# closed by the importing context.
self._opened_mem = mem
return mem.own()
return mem.own().view(self.offset)

def close(self):
if self._opened_mem is None:
@@ -1047,12 +1046,13 @@ class IpcHandle(object):
alive. The *handle* is a ctypes object of the CUDA IPC handle. The *size*
is the allocation size.
"""
def __init__(self, base, handle, size, source_info=None):
def __init__(self, base, handle, size, source_info=None, offset=0):
self.base = base
self.handle = handle
self.size = size
self.source_info = source_info
self._impl = None
self.offset = offset

def _sentry_source_info(self):
if self.source_info is None:
@@ -1132,14 +1132,15 @@ def __reduce__(self):
preprocessed_handle,
self.size,
self.source_info,
self.offset,
)
return (serialize._rebuild_reduction, args)

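# note: the byte offset is serialized alongside the handle so that a
# consumer process rebuilds the same sliced view (see __reduce__ above)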
@classmethod
def _rebuild(cls, handle_ary, size, source_info):
def _rebuild(cls, handle_ary, size, source_info, offset):
handle = drvapi.cu_ipc_mem_handle(*handle_ary)
return cls(base=None, handle=handle, size=size,
source_info=source_info)
source_info=source_info, offset=offset)


class MemoryPointer(object):
30 changes: 26 additions & 4 deletions numba/cuda/tests/cudapy/test_ipc.py
@@ -111,10 +111,13 @@ def test_ipc_handle(self):
np.testing.assert_equal(arr, out)
proc.join(3)

def test_ipc_handle_serialization(self):
def check_ipc_handle_serialization(self, index_arg=None):
# prepare data for IPC
arr = np.arange(10, dtype=np.intp)
devarr = cuda.to_device(arr)
if index_arg is not None:
devarr = devarr[index_arg]
expect = devarr.copy_to_host()

# create IPC handle
ctx = cuda.current_context()
@@ -137,13 +140,25 @@ def test_ipc_handle_serialization(self):
if not succ:
self.fail(out)
else:
np.testing.assert_equal(arr, out)
np.testing.assert_equal(expect, out)
proc.join(3)

def test_ipc_array(self):
def test_ipc_handle_serialization(self):
# test no slicing
self.check_ipc_handle_serialization()
# slicing tests
self.check_ipc_handle_serialization(slice(3, None))
self.check_ipc_handle_serialization(slice(3, 8))
self.check_ipc_handle_serialization(slice(None, 8))

def check_ipc_array(self, index_arg=None):
# prepare data for IPC
arr = np.arange(10, dtype=np.intp)
devarr = cuda.to_device(arr)
# Slice
if index_arg is not None:
devarr = devarr[index_arg]
expect = devarr.copy_to_host()
ipch = devarr.get_ipc_handle()

# spawn new process for testing
@@ -156,9 +171,16 @@ def test_ipc_array(self):
if not succ:
self.fail(out)
else:
np.testing.assert_equal(arr, out)
np.testing.assert_equal(expect, out)
proc.join(3)

def test_ipc_array(self):
# test no slicing
self.check_ipc_array()
# slicing tests
self.check_ipc_array(slice(3, None))
self.check_ipc_array(slice(3, 8))
self.check_ipc_array(slice(None, 8))

@unittest.skipUnless(not_linux, "Only on OS other than Linux")
@skip_on_cudasim('Ipc not available in CUDASIM')
35 changes: 26 additions & 9 deletions numba/pycc/platform.py
@@ -11,7 +11,8 @@
import os
import subprocess
import sys
from tempfile import NamedTemporaryFile, gettempdir
from tempfile import NamedTemporaryFile, mkdtemp, gettempdir
from contextlib import contextmanager

_configs = {
# DLL suffix, Python C extension suffix
@@ -27,26 +28,42 @@ def get_configs(arg):
find_shared_ending = functools.partial(get_configs, 0)
find_pyext_ending = functools.partial(get_configs, 1)

@contextmanager
def _gentmpfile(suffix):
# windows locks the tempfile so use a tempdir + file, see
# https://github.com/numba/numba/issues/3304
try:
tmpdir = mkdtemp()
ntf = open(os.path.join(tmpdir, "temp%s" % suffix), 'wt')
yield ntf
finally:
try:
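# best-effort cleanup: close and remove the temp file; remove the
# directory only if the file removal succeeded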
ntf.close()
os.remove(ntf.name)
except:
pass
else:
os.rmdir(tmpdir)

def _check_external_compiler():
# see if the external compiler bound in numpy.distutils is present
# and working
compiler = new_compiler()
customize_compiler(compiler)
for suffix in ['.c', '.cxx']:
with NamedTemporaryFile('wt', suffix=suffix) as ntf:
simple_c = "int main(void) { return 0; }"
ntf.write(simple_c)
ntf.flush()
try:
try:
with _gentmpfile(suffix) as ntf:
simple_c = "int main(void) { return 0; }"
ntf.write(simple_c)
ntf.flush()
ntf.close()
# *output_dir* is set to avoid the compiler putting temp files
# in the current directory.
compiler.compile([ntf.name], output_dir=gettempdir())
except Exception: # likely CompileError
return False
except Exception: # likely CompileError or file system issue
return False
return True


# boolean on whether the externally provided compiler is present and
# functioning correctly
_external_compiler_ok = _check_external_compiler()
16 changes: 16 additions & 0 deletions numba/tests/test_pycc.py
@@ -28,6 +28,9 @@
_skip_reason = 'AOT compatible compilers missing'
_skip_missing_compilers = unittest.skipIf(not _external_compiler_ok,
_skip_reason)
_skip_reason = 'windows only'
_windows_only = unittest.skipIf(not sys.platform.startswith('win'),
_skip_reason)

from .matmul_usecase import has_blas
from .support import TestCase, tag, import_dynamic, temp_directory
@@ -43,6 +46,19 @@ def unset_macosx_deployment_target():
if 'MACOSX_DEPLOYMENT_TARGET' in os.environ:
del os.environ['MACOSX_DEPLOYMENT_TARGET']

class TestCompilerChecks(TestCase):

# NOTE: THIS TEST MUST ALWAYS RUN ON WINDOWS, DO NOT SKIP
@_windows_only
def test_windows_compiler_validity(self):
# When inside conda-build VSINSTALLDIR should be set and windows should
# have a valid compiler available, `_external_compiler_ok` should agree
# with this. If this is not the case then error out to alert devs.
is_running_conda_build = os.environ.get('CONDA_BUILD', None) is not None
if is_running_conda_build:
if os.environ.get('VSINSTALLDIR', None) is not None:
self.assertTrue(_external_compiler_ok)


class BasePYCCTest(TestCase):

Expand Down
