From 29bc72f6f8dc48c464d5f3fd98d91207cc4ec802 Mon Sep 17 00:00:00 2001
From: Graham Markall <gmarkall@nvidia.com>
Date: Fri, 17 Jan 2020 08:14:21 -0800
Subject: [PATCH] Fix #5073: Slices of dynamic shared memory all alias

The root of the issue was that computations of indices and slice bounds
were incorrect because the shape of dynamic shared memory is generally
(0,). To fix this, we compute the shape (1-D only) of dynamic shared
arrays using the dynamic shared memory size and the itemsize of the type
of the array when it is created.

This is implemented by reading the special register %dynamic_smem_size -
unfortunately NVVM doesn't provide an intrinsic for this, so we access
it using inline assembly.

As part of these changes, the `_generic_array` function is refactored
and `_make_array` folded into it (`_generic_array` was the only caller
and its parameters were the same as its local variables). The refactored
version checks for errors earlier on and moves groups the creation of
various variables more closely with their usages.
---
 numba/cuda/cudaimpl.py             |  63 +++++---
 numba/cuda/tests/cudapy/test_sm.py | 251 +++++++++++++++++++++++++++++
 2 files changed, 291 insertions(+), 23 deletions(-)

diff --git a/numba/cuda/cudaimpl.py b/numba/cuda/cudaimpl.py
index 31ede321e35..3a1c0b86355 100644
--- a/numba/cuda/cudaimpl.py
+++ b/numba/cuda/cudaimpl.py
@@ -4,7 +4,7 @@
 import operator
 import math
 
-from llvmlite.llvmpy.core import Type
+from llvmlite.llvmpy.core import Type, InlineAsm
 import llvmlite.llvmpy.core as lc
 import llvmlite.binding as ll
 
@@ -627,18 +627,29 @@ def _get_target_data(context):
 def _generic_array(context, builder, shape, dtype, symbol_name, addrspace,
                    can_dynsized=False):
     elemcount = reduce(operator.mul, shape)
+
+    # Check for valid shape for this type of allocation
+    dynamic_smem = elemcount <= 0 and can_dynsized
+    if elemcount <= 0 and not dynamic_smem:
+        raise ValueError("array length <= 0")
+
+    # Check that we support the requested dtype
+    other_supported_type = isinstance(dtype, (types.Record, types.Boolean))
+    if dtype not in types.number_domain and not other_supported_type:
+        raise TypeError("unsupported type: %s" % dtype)
+
     lldtype = context.get_data_type(dtype)
     laryty = Type.array(lldtype, elemcount)
 
     if addrspace == nvvm.ADDRSPACE_LOCAL:
-        # Special case local addrespace allocation to use alloca
+        # Special case local address space allocation to use alloca
         # NVVM is smart enough to only use local memory if no register is
         # available
         dataptr = cgutils.alloca_once(builder, laryty, name=symbol_name)
     else:
         lmod = builder.module
 
-        # Create global variable in the requested address-space
+        # Create global variable in the requested address space
         gvmem = lmod.add_global_variable(laryty, symbol_name, addrspace)
         # Specify alignment to avoid misalignment bug
         align = context.get_abi_sizeof(lldtype)
@@ -646,11 +657,8 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace,
         # not a power of 2 (e.g. for a Record array) then round up accordingly.
         gvmem.align = 1 << (align - 1 ).bit_length()
 
-        if elemcount <= 0:
-            if can_dynsized:    # dynamic shared memory
-                gvmem.linkage = lc.LINKAGE_EXTERNAL
-            else:
-                raise ValueError("array length <= 0")
+        if dynamic_smem:
+            gvmem.linkage = lc.LINKAGE_EXTERNAL
         else:
             ## Comment out the following line to workaround a NVVM bug
             ## which generates a invalid symbol name when the linkage
@@ -660,36 +668,45 @@ def _generic_array(context, builder, shape, dtype, symbol_name, addrspace,
 
             gvmem.initializer = lc.Constant.undef(laryty)
 
-        other_supported_type = isinstance(dtype, (types.Record, types.Boolean))
-        if dtype not in types.number_domain and not other_supported_type:
-            raise TypeError("unsupported type: %s" % dtype)
-
         # Convert to generic address-space
         conv = nvvmutils.insert_addrspace_conv(lmod, Type.int(8), addrspace)
         addrspaceptr = gvmem.bitcast(Type.pointer(Type.int(8), addrspace))
         dataptr = builder.call(conv, [addrspaceptr])
 
-    return _make_array(context, builder, dataptr, dtype, shape)
-
-
-def _make_array(context, builder, dataptr, dtype, shape, layout='C'):
-    ndim = len(shape)
-    # Create array object
-    aryty = types.Array(dtype=dtype, ndim=ndim, layout='C')
-    ary = context.make_array(aryty)(context, builder)
-
     targetdata = _get_target_data(context)
     lldtype = context.get_data_type(dtype)
     itemsize = lldtype.get_abi_size(targetdata)
+
     # Compute strides
     rstrides = [itemsize]
     for i, lastsize in enumerate(reversed(shape[1:])):
         rstrides.append(lastsize * rstrides[-1])
     strides = [s for s in reversed(rstrides)]
-
-    kshape = [context.get_constant(types.intp, s) for s in shape]
     kstrides = [context.get_constant(types.intp, s) for s in strides]
 
+    # Compute shape
+    if dynamic_smem:
+        # Compute the shape based on the dynamic shared memory configuration.
+        # Unfortunately NVVM does not provide an intrinsic for the
+        # %dynamic_smem_size register, so we must read it using inline
+        # assembly.
+        get_dynshared_size = InlineAsm.get(Type.function(Type.int(), []),
+                                           "mov.u32 $0, %dynamic_smem_size;",
+                                           '=r', side_effect=True)
+        dynsmem_size = builder.zext(builder.call(get_dynshared_size, []),
+                                    Type.int(width=64))
+        # Only 1-D dynamic shared memory is supported so the following is a
+        # sufficient construction of the shape
+        kitemsize = context.get_constant(types.intp, itemsize)
+        kshape = [builder.udiv(dynsmem_size, kitemsize)]
+    else:
+        kshape = [context.get_constant(types.intp, s) for s in shape]
+
+    # Create array object
+    ndim = len(shape)
+    aryty = types.Array(dtype=dtype, ndim=ndim, layout='C')
+    ary = context.make_array(aryty)(context, builder)
+
     context.populate_array(ary,
                            data=builder.bitcast(dataptr, ary.data.type),
                            shape=cgutils.pack_array(builder, kshape),
diff --git a/numba/cuda/tests/cudapy/test_sm.py b/numba/cuda/tests/cudapy/test_sm.py
index 4e6e1ea8fa1..b2eebbc3f2d 100644
--- a/numba/cuda/tests/cudapy/test_sm.py
+++ b/numba/cuda/tests/cudapy/test_sm.py
@@ -124,6 +124,257 @@ def test_shared_bool(self):
         arr = np.random.randint(2, size=(1024,), dtype=np.bool_)
         self._test_shared(arr)
 
+    def _test_dynshared_slice(self, func, arr, expected):
+        # Check that slices of shared memory are correct
+        # (See Bug #5073 - prior to the addition of these tests and
+        # corresponding fix, slices of dynamic shared arrays all aliased each
+        # other)
+        nshared = arr.size * arr.dtype.itemsize
+        func[1, 1, 0, nshared](arr)
+        np.testing.assert_array_equal(expected, arr)
+
+    def test_dynshared_slice_write(self):
+        # Test writing values into disjoint slices of dynamic shared memory
+        @cuda.jit
+        def slice_write(x):
+            dynsmem = cuda.shared.array(0, dtype=int32)
+            sm1 = dynsmem[0:1]
+            sm2 = dynsmem[1:2]
+
+            sm1[0] = 1
+            sm2[0] = 2
+            x[0] = dynsmem[0]
+            x[1] = dynsmem[1]
+
+        arr = np.zeros(2, dtype=np.int32)
+        expected = np.array([1, 2], dtype=np.int32)
+        self._test_dynshared_slice(slice_write, arr, expected)
+
+    def test_dynshared_slice_read(self):
+        # Test reading values from disjoint slices of dynamic shared memory
+        @cuda.jit
+        def slice_read(x):
+            dynsmem = cuda.shared.array(0, dtype=int32)
+            sm1 = dynsmem[0:1]
+            sm2 = dynsmem[1:2]
+
+            dynsmem[0] = 1
+            dynsmem[1] = 2
+            x[0] = sm1[0]
+            x[1] = sm2[0]
+
+        arr = np.zeros(2, dtype=np.int32)
+        expected = np.array([1, 2], dtype=np.int32)
+        self._test_dynshared_slice(slice_read, arr, expected)
+
+    def test_dynshared_slice_diff_sizes(self):
+        # Test reading values from disjoint slices of dynamic shared memory
+        # with different sizes
+        @cuda.jit
+        def slice_diff_sizes(x):
+            dynsmem = cuda.shared.array(0, dtype=int32)
+            sm1 = dynsmem[0:1]
+            sm2 = dynsmem[1:3]
+
+            dynsmem[0] = 1
+            dynsmem[1] = 2
+            dynsmem[2] = 3
+            x[0] = sm1[0]
+            x[1] = sm2[0]
+            x[2] = sm2[1]
+
+        arr = np.zeros(3, dtype=np.int32)
+        expected = np.array([1, 2, 3], dtype=np.int32)
+        self._test_dynshared_slice(slice_diff_sizes, arr, expected)
+
+    def test_dynshared_slice_overlap(self):
+        # Test reading values from overlapping slices of dynamic shared memory
+        @cuda.jit
+        def slice_overlap(x):
+            dynsmem = cuda.shared.array(0, dtype=int32)
+            sm1 = dynsmem[0:2]
+            sm2 = dynsmem[1:4]
+
+            dynsmem[0] = 1
+            dynsmem[1] = 2
+            dynsmem[2] = 3
+            dynsmem[3] = 4
+            x[0] = sm1[0]
+            x[1] = sm1[1]
+            x[2] = sm2[0]
+            x[3] = sm2[1]
+            x[4] = sm2[2]
+
+        arr = np.zeros(5, dtype=np.int32)
+        expected = np.array([1, 2, 2, 3, 4], dtype=np.int32)
+        self._test_dynshared_slice(slice_overlap, arr, expected)
+
+    def test_dynshared_slice_gaps(self):
+        # Test writing values to slices of dynamic shared memory doesn't write
+        # outside the slice
+        @cuda.jit
+        def slice_gaps(x):
+            dynsmem = cuda.shared.array(0, dtype=int32)
+            sm1 = dynsmem[1:3]
+            sm2 = dynsmem[4:6]
+
+            # Initial values for dynamic shared memory, some to be overwritten
+            dynsmem[0] = 99
+            dynsmem[1] = 99
+            dynsmem[2] = 99
+            dynsmem[3] = 99
+            dynsmem[4] = 99
+            dynsmem[5] = 99
+            dynsmem[6] = 99
+
+            sm1[0] = 1
+            sm1[1] = 2
+            sm2[0] = 3
+            sm2[1] = 4
+
+            x[0] = dynsmem[0]
+            x[1] = dynsmem[1]
+            x[2] = dynsmem[2]
+            x[3] = dynsmem[3]
+            x[4] = dynsmem[4]
+            x[5] = dynsmem[5]
+            x[6] = dynsmem[6]
+
+        arr = np.zeros(7, dtype=np.int32)
+        expected = np.array([99, 1, 2, 99, 3, 4, 99], dtype=np.int32)
+        self._test_dynshared_slice(slice_gaps, arr, expected)
+
+    def test_dynshared_slice_write_backwards(self):
+        # Test writing values into disjoint slices of dynamic shared memory
+        # with negative steps
+        @cuda.jit
+        def slice_write_backwards(x):
+            dynsmem = cuda.shared.array(0, dtype=int32)
+            sm1 = dynsmem[1::-1]
+            sm2 = dynsmem[3:1:-1]
+
+            sm1[0] = 1
+            sm1[1] = 2
+            sm2[0] = 3
+            sm2[1] = 4
+            x[0] = dynsmem[0]
+            x[1] = dynsmem[1]
+            x[2] = dynsmem[2]
+            x[3] = dynsmem[3]
+
+        arr = np.zeros(4, dtype=np.int32)
+        expected = np.array([2, 1, 4, 3], dtype=np.int32)
+        self._test_dynshared_slice(slice_write_backwards, arr, expected)
+
+    def test_dynshared_slice_nonunit_stride(self):
+        # Test writing values into slice of dynamic shared memory with
+        # non-unit stride
+        @cuda.jit
+        def slice_nonunit_stride(x):
+            dynsmem = cuda.shared.array(0, dtype=int32)
+            sm1 = dynsmem[::2]
+
+            # Initial values for dynamic shared memory, some to be overwritten
+            dynsmem[0] = 99
+            dynsmem[1] = 99
+            dynsmem[2] = 99
+            dynsmem[3] = 99
+            dynsmem[4] = 99
+            dynsmem[5] = 99
+
+            sm1[0] = 1
+            sm1[1] = 2
+            sm1[2] = 3
+
+            x[0] = dynsmem[0]
+            x[1] = dynsmem[1]
+            x[2] = dynsmem[2]
+            x[3] = dynsmem[3]
+            x[4] = dynsmem[4]
+            x[5] = dynsmem[5]
+
+        arr = np.zeros(6, dtype=np.int32)
+        expected = np.array([1, 99, 2, 99, 3, 99], dtype=np.int32)
+        self._test_dynshared_slice(slice_nonunit_stride, arr, expected)
+
+    def test_dynshared_slice_nonunit_reverse_stride(self):
+        # Test writing values into slice of dynamic shared memory with
+        # reverse non-unit stride
+        @cuda.jit
+        def slice_nonunit_reverse_stride(x):
+            dynsmem = cuda.shared.array(0, dtype=int32)
+            sm1 = dynsmem[-1::-2]
+
+            # Initial values for dynamic shared memory, some to be overwritten
+            dynsmem[0] = 99
+            dynsmem[1] = 99
+            dynsmem[2] = 99
+            dynsmem[3] = 99
+            dynsmem[4] = 99
+            dynsmem[5] = 99
+
+            sm1[0] = 1
+            sm1[1] = 2
+            sm1[2] = 3
+
+            x[0] = dynsmem[0]
+            x[1] = dynsmem[1]
+            x[2] = dynsmem[2]
+            x[3] = dynsmem[3]
+            x[4] = dynsmem[4]
+            x[5] = dynsmem[5]
+
+        arr = np.zeros(6, dtype=np.int32)
+        expected = np.array([99, 3, 99, 2, 99, 1], dtype=np.int32)
+        self._test_dynshared_slice(slice_nonunit_reverse_stride, arr, expected)
+
+    def test_issue_5073(self):
+        # An example with which Bug #5073 (slices of dynamic shared memory all
+        # alias) was discovered. The kernel uses all threads in the block to
+        # load values into slices of dynamic shared memory. One thread per
+        # block then writes the loaded values back to a global array after
+        # syncthreads().
+
+        arr = np.arange(1024)
+        nelem = len(arr)
+        nthreads = 16
+        nblocks = int(nelem / nthreads)
+        dt = nps.from_dtype(arr.dtype)
+        nshared = nthreads * arr.dtype.itemsize
+        chunksize = int(nthreads / 2)
+
+        @cuda.jit
+        def sm_slice_copy(x, y, chunksize):
+            dynsmem = cuda.shared.array(0, dtype=dt)
+            sm1 = dynsmem[0:chunksize]
+            sm2 = dynsmem[chunksize:chunksize*2]
+
+            tx = cuda.threadIdx.x
+            bx = cuda.blockIdx.x
+            bd = cuda.blockDim.x
+
+            # load this block's chunk into shared
+            i = bx * bd + tx
+            if i < len(x):
+                if tx < chunksize:
+                    sm1[tx] = x[i]
+                else:
+                    sm2[tx - chunksize] = x[i]
+
+            cuda.syncthreads()
+
+            # one thread per block writes this block's chunk
+            if tx == 0:
+                for j in range(chunksize):
+                    y[bd * bx + j] = sm1[j]
+                    y[bd * bx + j + chunksize] = sm2[j]
+
+
+        d_result = cuda.device_array_like(arr)
+        sm_slice_copy[nblocks, nthreads, 0, nshared](arr, d_result, chunksize)
+        host_result = d_result.copy_to_host()
+        np.testing.assert_array_equal(arr, host_result)
+
 
 if __name__ == '__main__':
     unittest.main()