In [41]:
import numba

In [47]:
#! numba -s

In [1]:
! ls

ex_tst.py
ex1.py
ex2.py
ex3.py
img
Section_1.ipynb
Section_2_GPU_ufunc.ipynb
Section_3_Memory_Management.ipynb
Section_4_CUDA_Kernels.ipynb
Section_5_Debugging.ipynb


# Printing

Numba supports printing from CUDA kernels, with some restrictions. 
Note that output printed from a CUDA kernel will not be captured by Jupyter, so you will need to debug with a script you can run from the terminal.

In [48]:
! cat ex1.py

import numpy as np

from numba import cuda

@cuda.jit
def histogram(x, xmin, xmax, histogram_out):
    nbins = histogram_out.shape[0]
    bin_width = (xmax - xmin) / nbins

    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    for i in range(start, x.shape[0], stride):
        bin_number = np.int32((x[i] - xmin)/bin_width)
        if bin_number >= 0 and bin_number < histogram_out.shape[0]:
            print('in range', x[i], bin_number)
            histogram_out[bin_number] += 1 # synchronization?
        else:
        	print('out of range', x[i], bin_number)

x = np.random.normal(size=50, loc=0, scale=1).astype(np.float32)
xmin = np.float32(-4.0)
xmax = np.float32(4.0)
histogram_out = np.zeros(shape=10, dtype=np.int32)

histogram[64, 64](x, xmin, xmax, histogram_out)

print('input count:', x.shape[0])
print('histogram:', histogram_out)
print('count:', histogram_out.sum())



In [49]:
! python ex1.py

in range -1.323514 3
in range 1.697249 7
in range -0.336293 4
in range -0.135460 4
in range -0.161487 4
in range -0.761789 4
in range -0.236587 4
in range 0.227503 5
in range -1.212478 3
in range 0.324774 5
in range -0.892641 3
in range 0.570581 5
in range -0.361130 4
in range -1.112782 3
in range 0.765541 5
in range -0.942633 3
in range -0.998648 3
in range 1.464560 6
in range 0.423738 5
in range 0.823576 6
in range 0.538805 5
in range -1.533019 3
in range 1.986257 7
in range 0.615398 5
in range -1.753933 2
in range -1.356655 3
in range -0.854929 3
in range 1.094279 6
in range -0.024274 4
in range 0.653723 5
in range -0.285262 4
in range -0.002720 4
in range 0.482828 5
in range 0.534120 5
in range -1.992800 2
in range 0.279898 5
in range -1.002313 3
in range 0.036957 5
in range 0.067615 5
in range -1.386677 3
in range 0.752976 5
in range -0.064104 4
in range 0.227325 5
in range 0.051657 5
in range 0.301618 5
in range 0.145294 5
in range -0.414238 4
in range -0.123766 4
in range 0.4801

After adding the `print` statements to the kernel:

In [50]:
! python ex1.py

in range -2.136343 2
in range 0.902413 6
in range -0.621482 4
in range 1.047049 6
in range -2.228859 2
in range -1.198175 3
in range 0.665069 5
in range 0.328402 5
in range -1.724847 2
in range 0.666625 5
in range -1.307602 3
in range 1.335494 6
in range 0.487079 5
in range -0.396419 4
in range 1.082031 6
in range -0.090555 4
in range 1.185914 6
in range 0.698263 5
in range 0.824187 6
in range -2.031569 2
in range -0.336351 4
in range -0.187361 4
in range 0.260466 5
in range -0.386369 4
in range -0.593185 4
in range -0.584813 4
in range 0.464575 5
in range -0.621168 4
in range -1.542048 3
in range 1.579781 6
in range 1.032547 6
in range 0.163719 5
in range 0.690543 5
in range 0.085653 5
in range 1.343960 6
in range 0.465475 5
in range -0.823949 3
in range -0.260188 4
in range 0.144786 5
in range 2.487358 8
in range -0.019695 4
in range -0.333395 4
in range 0.118706 5
in range -0.847674 3
in range 0.215269 5
in range 0.092835 5
in range 0.304421 5
in range -0.048209 4
in range -1.175677

Scanning down that output, we see that all 50 values should be in range. Clearly we have some kind of race condition updating the histogram. In fact, the culprit line is:

`histogram_out[bin_number] += 1`

which should be (as you may have seen in a previous exercise)

`cuda.atomic.add(histogram_out, bin_number, 1)`

In [51]:
! python ex2.py

in range 0.079367 5
in range 1.363287 6
in range 1.873459 7
in range -0.017616 4
in range 0.001278 5
in range 0.302049 5
in range 0.177706 5
in range -0.782068 4
in range 0.860395 6
in range 1.314295 6
in range 0.711960 5
in range 0.255307 5
in range 0.983536 6
in range 1.068653 6
in range 0.864480 6
in range -2.185833 2
in range -1.418863 3
in range -0.099831 4
in range 0.277479 5
in range -1.054719 3
in range -1.592868 3
in range 0.263342 5
in range 0.447606 5
in range 0.134533 5
in range -1.909023 2
in range -0.359674 4
in range -2.663350 1
in range -1.071304 3
in range -1.168933 3
in range 0.683264 5
in range 0.293292 5
in range -0.141315 4
in range -0.130128 4
in range -0.547374 4
in range 0.186888 5
in range 0.427405 5
in range 0.190016 5
in range 0.452734 5
in range 0.055578 5
in range 0.565442 5
in range 0.747505 5
in range -1.077742 3
in range -2.163671 2
in range -0.827153 3
in range 0.234512 5
in range -1.177214 3
in range -1.025811 3
in range -1.484583 3
in range 0.103086 5

<b>The count is now 50!</b>

# CUDA Simulator

Back in the early days of CUDA, nvcc had an "emulator" mode that would execute CUDA code on the CPU. That functionality was dropped in later CUDA releases after cuda-gdb was created. We missed emulator mode so much that Numba includes a "CUDA simulator" that runs your CUDA code with the Python interpreter on the host CPU. This allows you to debug the logic of your code using Python modules and functions that would otherwise not be allowed by the compiler.

A very common use case is to start the Python debugger inside one thread of a CUDA kernel:


In [54]:
! cat ex3.py

import numpy as np
import numba
from numba import cuda
from pdb import set_trace

@cuda.jit()
def histogram(x, xmin, xmax, histogram_out):
    nbins = histogram_out.shape[0]
    bin_width = (xmax - xmin) / nbins

    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    ### DEBUG FIRST THREAD
    if start == 0:
        print("In")#set_trace()
    ###

    for i in range(start, x.shape[0], stride):
        bin_number = np.int32((x[i] + xmin)/bin_width)

        if bin_number >= 0 and bin_number < histogram_out.shape[0]:
            cuda.atomic.add(histogram_out, bin_number, 1)

x = np.random.normal(size=50, loc=0, scale=1).astype(np.float32)
xmin = np.float32(-4.0)
xmax = np.float32(4.0)
histogram_out = np.zeros(shape=10, dtype=np.int32)

histogram[64, 64](x, xmin, xmax, histogram_out)

print('input count:', x.shape[0])
print('histogram:', histogram_out)
print('count:', histogram_out.sum())


In [65]:
! SET NUMBA_ENABLE_CUDASIM=1
! python ex3.py

Traceback (most recent call last):
  File "ex3.py", line 29, in <module>
    histogram[64, 64](x, xmin, xmax, histogram_out)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\compiler.py", line 736, in __call__
    kernel = self.specialize(*args)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\compiler.py", line 747, in specialize
    kernel = self.compile(argtypes)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\compiler.py", line 762, in compile
    **self.targetoptions)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\compiler.py", line 37, in core
    return fn(*args, **kwargs)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\compiler.py", line 76, in compile_kernel
    cres = compile_cuda(pyfunc, types.void, args, debug=debug, inline=inline)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\compiler.py", line 37, in core
    return fn(*args, **kwargs)
  File "C:\Users\Nuclear\Anaconda3\lib\si


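<b>It works in cmd!</b> There, `SET` and `python` run in the same shell session, whereas each `!` line in the notebook starts a separate shell, so the variable set by the first line never reaches `python ex3.py`.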

# CUDA Memcheck

Another common error occurs when a CUDA kernel has an invalid memory access, typically caused by running off the end of an array. The full CUDA toolkit from NVIDIA (not the cudatoolkit conda package) contains a utility called cuda-memcheck that can check for a wide range of memory access mistakes in CUDA code.

In [75]:
! cat ex4.py

import numpy as np

from numba import cuda

@cuda.jit
def histogram(x, xmin, xmax, histogram_out):
    nbins = histogram_out.shape[0]
    bin_width = (xmax - xmin) / nbins

    start = cuda.grid(1)
    stride = cuda.gridsize(1)

    for i in range(start, x.shape[0], stride):
        bin_number = np.int32((x[i] + xmin)/bin_width)

        if bin_number >= 0 or bin_number < histogram_out.shape[0]:
            cuda.atomic.add(histogram_out, bin_number, 1)

x = np.random.normal(size=50, loc=0, scale=1).astype(np.float32)
xmin = np.float32(-4.0)
xmax = np.float32(4.0)
histogram_out = np.zeros(shape=10, dtype=np.int32)

histogram[64, 64](x, xmin, xmax, histogram_out)

print('input count:', x.shape[0])
print('histogram:', histogram_out)
print('count:', histogram_out.sum())



In [74]:
! cuda-memcheck python ex4.py



Traceback (most recent call last):
  File "ex4.py", line 24, in <module>
    histogram[64, 64](x, xmin, xmax, histogram_out)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\compiler.py", line 738, in __call__
    cfg(*args)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\compiler.py", line 489, in __call__
    sharedmem=self.sharedmem)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\compiler.py", line 591, in _kernel_call
    wb()
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\args.py", line 65, in <lambda>
    retr.append(lambda: devary.copy_to_host(self.value, stream=stream))
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\cudadrv\devices.py", line 212, in _require_cuda_context
    return fn(*args, **kws)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\cudadrv\devicearray.py", line 252, in copy_to_host
    _driver.device_to_host(hostary, self, self.alloc_size, stream=stream)
  File "C:\Us

In [77]:
! cuda-memcheck python ex4.py



Traceback (most recent call last):
  File "ex4.py", line 24, in <module>
    histogram[64, 64](x, xmin, xmax, histogram_out)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\compiler.py", line 738, in __call__
    cfg(*args)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\compiler.py", line 489, in __call__
    sharedmem=self.sharedmem)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\compiler.py", line 566, in _kernel_call
    driver.device_to_host(ctypes.addressof(excval), excmem, excsz)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py", line 1776, in device_to_host
    fn(host_pointer(dst), device_pointer(src), size, *varargs)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py", line 288, in safe_cuda_api_call
    self._check_error(fname, retcode)
  File "C:\Users\Nuclear\Anaconda3\lib\site-packages\numba\cuda\cudadrv\driver.py", line 323, in _check_error
    raise CudaAPIEr

Now we get an error message that includes a source file and line number: ex4.py:17.

In [79]:
! cat -n ex4.py | grep -C 2 "17"

    15	
    16	        if bin_number >= 0 or bin_number < histogram_out.shape[0]:
    17	            cuda.atomic.add(histogram_out, bin_number, 1)
    18	
    19	x = np.random.normal(size=50, loc=0, scale=1).astype(np.float32)




At this point, we might realize that our `if` statement incorrectly has an `or` instead of an `and`.

cuda-memcheck has different modes for detecting different kinds of problems (similar to valgrind for debugging CPU memory access errors). Take a look at the documentation for more information: http://docs.nvidia.com/cuda/cuda-memcheck/
