In [2]:
"""
Reference: https://nvidia.github.io/cuda-python/install.html
"""
%pip install cuda-python

'\nReference: https://nvidia.github.io/cuda-python/install.html\n'

In [3]:

from cuda import cuda, nvrtc
import numpy as np

def ASSERT_DRV(err):
    """Raise if a CUDA driver or NVRTC status code signals a failure.

    Args:
        err: A ``cuda.CUresult`` or ``nvrtc.nvrtcResult`` status value.

    Returns:
        None when ``err`` is the success value of its enum.

    Raises:
        RuntimeError: If ``err`` is a non-success status of either enum, or
            if ``err`` is not an instance of either enum at all.
    """
    # Driver API status codes.
    if isinstance(err, cuda.CUresult):
        if err != cuda.CUresult.CUDA_SUCCESS:
            raise RuntimeError("Cuda Error: {}".format(err))
        return

    # NVRTC (runtime compilation) status codes.
    if isinstance(err, nvrtc.nvrtcResult):
        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
            raise RuntimeError("Nvrtc Error: {}".format(err))
        return

    # Anything else is not a status code this helper understands.
    raise RuntimeError("Unknown error type: {}".format(err))
    
# CUDA C source for the SAXPY kernel: out[i] = a * x[i] + y[i] for i < n.
# `extern "C"` keeps the symbol name unmangled so it can be looked up as
# b"saxpy". The first operand of the index computation is cast to size_t so
# the multiplication is done at size_t width rather than in 32-bit arithmetic.
saxpy = """\
extern "C" __global__
void saxpy(float a, float *x, float *y, float *out, size_t n)
{
 size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
 if (tid < n) {
   out[tid] = a * x[tid] + y[tid];
 }
}
"""

# Every call below returns a status code as the first tuple element; each one
# is checked with ASSERT_DRV so a failure stops the cell at the call that
# caused it instead of surfacing later as an unrelated error.

# Create the NVRTC program from the kernel source (no extra headers)
err, prog = nvrtc.nvrtcCreateProgram(str.encode(saxpy), b"saxpy.cu", 0, [], [])
ASSERT_DRV(err)

# Compile program
# The PTX is generated for compute_60; change --gpu-architecture
# (e.g. to compute_75) if a different target is required.
opts = [b"--fmad=false", b"--gpu-architecture=compute_60"]
# Pass the option count derived from the list so the two cannot drift apart
err, = nvrtc.nvrtcCompileProgram(prog, len(opts), opts)
ASSERT_DRV(err)

# Get PTX from compilation: query the size, then fill a buffer of that size
err, ptxSize = nvrtc.nvrtcGetPTXSize(prog)
ASSERT_DRV(err)
ptx = b" " * ptxSize
err, = nvrtc.nvrtcGetPTX(prog, ptx)
ASSERT_DRV(err)

# Initialize CUDA Driver API
err, = cuda.cuInit(0)
ASSERT_DRV(err)

# Retrieve handle for device 0
err, cuDevice = cuda.cuDeviceGet(0)
ASSERT_DRV(err)

# Create context
err, context = cuda.cuCtxCreate(0, cuDevice)
ASSERT_DRV(err)

# Load PTX as module data and retrieve function
# Wrapping the bytes in a NumPy array exposes the buffer address via .ctypes.data
ptx = np.char.array(ptx)
# Note: Incompatible --gpu-architecture would be detected here
err, module = cuda.cuModuleLoadData(ptx.ctypes.data)
ASSERT_DRV(err)
err, kernel = cuda.cuModuleGetFunction(module, b"saxpy")
ASSERT_DRV(err)

In [4]:
# A simple vector operation (out = a * x + y) executed on the GPU through the
# CUDA driver API bindings, then checked against the same computation in NumPy.
# NOTE: this cell unloads the module and destroys the context created by the
# setup code above, so that setup must be re-run before running this cell again.

NUM_THREADS = 512  # Threads per block
NUM_BLOCKS = 32768  # Blocks in the (1-D) grid

##### Input Data Preparation
# Scalar multiplier 'a'; float32 to match the kernel's `float a` parameter
a = np.array([2.0], dtype=np.float32)

# One vector element per launched thread
numElements = NUM_THREADS * NUM_BLOCKS

# The kernel declares `size_t n`, and the launch copies sizeof(size_t) bytes
# (8 on 64-bit platforms) from this buffer, so it must be a 64-bit integer.
# A 32-bit value would let the launch read past the end of the argument.
n = np.array(numElements, dtype=np.uint64)

# Bytes per vector; computed from Python ints so the result is an exact int
bufferSize = numElements * a.itemsize

# hX, hY, hOut are host vectors
hX = np.random.rand(numElements).astype(dtype=np.float32)
hY = np.random.rand(numElements).astype(dtype=np.float32)
hOut = np.zeros(numElements, dtype=np.float32)

##### Device Memory Allocation
err, dXclass = cuda.cuMemAlloc(bufferSize)
ASSERT_DRV(err)
err, dYclass = cuda.cuMemAlloc(bufferSize)
ASSERT_DRV(err)
err, dOutclass = cuda.cuMemAlloc(bufferSize)
ASSERT_DRV(err)

##### Stream Creation
err, stream = cuda.cuStreamCreate(0)
ASSERT_DRV(err)

##### Data Transfer to CUDA Device (Host to Device)
err, = cuda.cuMemcpyHtoDAsync(
   dXclass, hX.ctypes.data, bufferSize, stream
)
ASSERT_DRV(err)
err, = cuda.cuMemcpyHtoDAsync(
   dYclass, hY.ctypes.data, bufferSize, stream
)
ASSERT_DRV(err)

##### Kernel Launch Preparation
# Note: Subject to change in a future release
# Each device pointer is stored in its own 8-byte host buffer so that its
# address can be handed to the launch as a kernel argument.
dX = np.array([int(dXclass)], dtype=np.uint64)
dY = np.array([int(dYclass)], dtype=np.uint64)
dOut = np.array([int(dOutclass)], dtype=np.uint64)

# Host buffers holding each kernel argument, in the kernel's parameter order
args = [a, dX, dY, dOut, n]
# Pack the address of every argument buffer into one array of pointers
args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)

##### Launch the CUDA kernel with specified grid and block dimensions
err, = cuda.cuLaunchKernel(
   kernel,
   NUM_BLOCKS,  # grid x dim
   1,  # grid y dim
   1,  # grid z dim
   NUM_THREADS,  # block x dim
   1,  # block y dim
   1,  # block z dim
   0,  # dynamic shared memory
   stream,  # stream
   args.ctypes.data,  # kernel arguments
   0,  # extra (ignore)
)
ASSERT_DRV(err)

# Copy the result 'dOut' from device to host memory asynchronously
err, = cuda.cuMemcpyDtoHAsync(
   hOut.ctypes.data, dOutclass, bufferSize, stream
)
ASSERT_DRV(err)

# Synchronize the stream, wait for all operations to complete
err, = cuda.cuStreamSynchronize(stream)
ASSERT_DRV(err)

##### Clean up resources
# Done before validation: hOut already holds the result on the host, and
# releasing first means a failed comparison does not leak GPU resources.
err, = cuda.cuStreamDestroy(stream)
ASSERT_DRV(err)
err, = cuda.cuMemFree(dXclass)
ASSERT_DRV(err)
err, = cuda.cuMemFree(dYclass)
ASSERT_DRV(err)
err, = cuda.cuMemFree(dOutclass)
ASSERT_DRV(err)
err, = cuda.cuModuleUnload(module)
ASSERT_DRV(err)
err, = cuda.cuCtxDestroy(context)
ASSERT_DRV(err)

# Assert values are same as host after running kernel
hZ = a * hX + hY
if not np.allclose(hOut, hZ):
   raise ValueError("Error outside tolerance for host-device vectors")
else:
   print("Success: Host and Device vectors are equal")
   print("result from CUDA (hOut): ", hOut)
   print("result from CPU (hZ): ", hZ)

Success: Host and Device vectors are equal
result from CUDA (hOut):  [0.8312741 1.719444  2.4221642 ... 1.0741441 1.29529   1.7780536]
result from CPU (hZ):  [0.8312741 1.719444  2.4221642 ... 1.0741441 1.29529   1.7780536]
