In [17]:
## Cuda Device

In [1]:
from numba import cuda

In [2]:
# List the CUDA devices Numba can find and report whether each one is supported.
cuda.detect()

Found 1 CUDA devices
id 0    b'NVIDIA GeForce RTX 2060 SUPER'                              [SUPPORTED]
                      Compute Capability: 7.5
                           PCI Device ID: 0
                              PCI Bus ID: 1
                                    UUID: GPU-8500da8a-ebb6-595d-cbda-381e217fd292
                                Watchdog: Enabled
             FP32/FP64 Performance Ratio: 32
Summary:
	1/1 devices are supported


True

In [3]:
# In Numba, a CUDA kernel is defined by decorating a Python function with @cuda.jit.
@cuda.jit
def an_empty_kernel():
    """Kernel with no observable effect: each thread only looks up its grid position."""
    # 2D coordinates of this thread in the global grid (not used afterwards)
    pos_x, pos_y = cuda.grid(2)

The following commands define a two-dimensional launch configuration of 16x16 threads per block and 256x256 blocks per grid. In total this gives us 4096x4096 threads.


In [4]:
# 16 * 16 = 256 threads per block. It is the total number of threads per block
# (not each dimension) that should be a multiple of 32, the warp size, if possible.
threadsperblock = (16, 16)
blockspergrid = (256, 256) # Blocks per grid: 4096 x 4096 = 16,777,216 threads in total

We can now launch all 16.8 million threads by calling

In [5]:
# The launch configuration goes in square brackets, grid dimensions first and block
# dimensions second; the trailing () holds the kernel arguments (none here).
an_empty_kernel[blockspergrid, threadsperblock]()

### Memory management


In [7]:
import numpy as np

# Build a small host-side NumPy array (0..9) and copy it to the GPU.
arr = np.arange(10)
device_arr = cuda.to_device(arr)

In [8]:
# Copy the device array back to the host; called without an argument, the data comes back as the return value.
host_arr = device_arr.copy_to_host()

In [41]:
# Same transfer, but into a host buffer allocated up front with the device array's shape and dtype.
host_array = np.empty(shape=device_arr.shape, dtype=device_arr.dtype)
device_arr.copy_to_host(host_array)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [9]:
from numba import vectorize

@vectorize(['int64(int64, int64)'], target='cuda')
def add_ufunc(x, y):
    """Return x + y for one pair of int64 scalars; @vectorize turns this into an element-wise ufunc on the CUDA target."""
    return x + y

In [10]:
# Two length-4 vectors, a column view of the second one, and a 4x4 matrix.
a = np.array([1, 2, 3, 4])
b = np.array([10, 20, 30, 40])
b_col = b[:, None]  # b as a (4, 1) column; None is what np.newaxis stands for
c = np.arange(16).reshape(4, 4)

# Same-shape inputs are added element by element.
print('a+b:\n', add_ufunc(a, b))
print()
# The (4, 1) column is broadcast against the (4, 4) matrix.
print('b_col + c:\n', add_ufunc(b_col, c))

a+b:
 [11 22 33 44]

b_col + c:
 [[10 11 12 13]
 [24 25 26 27]
 [38 39 40 41]
 [52 53 54 55]]




In [11]:
# Baseline: the same broadcasted addition using NumPy's own add ufunc.
%timeit np.add(b_col, c)   # NumPy on CPU

894 ns ± 11.5 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [12]:
# Same inputs through the CUDA ufunc; b_col and c are host NumPy arrays here.
%timeit add_ufunc(b_col, c) # Numba on GPU


713 µs ± 2.02 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


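The GPU is a lot slower than the CPU here (roughly 713 µs versus 0.9 µs per call). What happened? The inputs hold only 16 elements, so the fixed cost of copying data to and from the device and launching a kernel far outweighs the handful of additions being performed.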

In [13]:
import math  # Note that for the CUDA target, we need to use the scalar functions from the math module, not NumPy

# sqrt(2*pi), computed once as a float32 so that Numba can inline it at compile time.
SQRT_2PI = np.float32((2*math.pi)**0.5)

@vectorize(['float32(float32, float32, float32)'], target='cuda')
def gaussian_pdf(x, mean, sigma):
    """Gaussian probability density evaluated at x for the given mean and sigma."""
    z = (x - mean) / sigma  # distance from the mean in units of sigma
    return math.exp(-0.5 * z**2) / (sigma * SQRT_2PI)

In [14]:
# Evaluate the Gaussian distribution PDF a million times!
# NOTE(review): no random seed is set, so x (and the quick-test value below) changes from run to run.
x = np.random.uniform(-3, 3, size=1000000).astype(np.float32)
mean = np.float32(0.0)
sigma = np.float32(1.0)

# Quick test on a single sample, with mean and sigma given as plain Python floats
gaussian_pdf(x[0], 0.0, 1.0)



array([0.01306025])

In [15]:
import scipy.stats # for definition of gaussian distribution
# norm_pdf is bound to the scipy.stats.norm object itself; its .pdf method is the CPU reference timed below.
norm_pdf = scipy.stats.norm
%timeit norm_pdf.pdf(x, loc=mean, scale=sigma)

32.8 ms ± 56.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
# Same million-element evaluation through the CUDA ufunc, for comparison with the SciPy timing.
%timeit gaussian_pdf(x, mean, sigma)


2.79 ms ± 44.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
