In [17]:
import sys
import numba
import numpy

# Record the interpreter and library versions this notebook was run with.
version_report = (
    f"Python version: {sys.version}",
    f"Numba version: {numba.__version__}",
    f"Numpy version: {numpy.__version__}",
)
for report_line in version_report:
    print(report_line)

Python version: 3.8.10 (default, Nov 14 2022, 12:59:47) 
[GCC 9.4.0]
Numba version: 0.56.4
Numpy version: 1.23.5


In [18]:
# Shell escape: print the version of the installed CUDA compiler driver (nvcc).
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Tue_Mar__8_18:18:20_PST_2022
Cuda compilation tools, release 11.6, V11.6.124
Build cuda_11.6.r11.6/compiler.31057947_0


In [19]:
# Shell escape: list the NVIDIA GPUs with driver version, memory use and utilisation.
!nvidia-smi

Thu Apr 13 23:37:55 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A40          Off  | 00000000:00:10.0 Off |                    0 |
|  0%   37C    P0    75W / 300W |   1113MiB / 46068MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:00:11.0 Off |                  N/A |
|  0%   27C    P8     5W / 120W |    153MiB /  6144MiB |      0%      Default |
|       

# Python CUDA examples

## Importing required packages

In [20]:
from numba import cuda
import numpy as np
import time

## CUDA Kernels

In [5]:
@cuda.jit
def cudakernel0(array):
    """Add 0.5 to every element of ``array``, looping over it serially.

    The loop is executed in full by each thread that is launched, so the
    kernel only gives a single increment per element when it is launched
    with exactly one thread.
    """
    element_count = array.size
    for idx in range(element_count):
        array[idx] = array[idx] + 0.5
        
@cuda.jit
def cudakernel1(array):
    """Add 0.5 to the single element owned by the calling thread.

    Each thread selects its element by its absolute position in the 1-D grid.
    Launches are normally rounded up to a whole number of blocks, so threads
    whose position falls past the end of ``array`` do nothing instead of
    writing out of bounds.
    """
    # cuda.grid(1) = cuda.threadIdx.x + cuda.blockIdx.x * cuda.blockDim.x
    thread_position = cuda.grid(1)
    if thread_position < array.size:
        array[thread_position] += 0.5

## Running Kernels

In [21]:
# Two-element float32 input so the effect of the kernel is easy to read off.
array = np.array([0, 1], dtype=np.float32)
print(f"Initial Array: {array}")

# One block containing one thread: the kernel's loop runs once, serially.
gridSize, blockSize = 1, 1

print('Kernel launch: cudakernel0[1, 1](array)')
# Wall-clock the launch. A host NumPy array is passed directly, so this
# presumably also covers the host<->device copies (and, on a first call, JIT
# compilation) -- confirm before reading it as pure kernel time.
start_time = time.time()
cudakernel0[gridSize, blockSize](array)
time_taken = time.time() - start_time

print(f"Updated array: {array} \n Time taken: {time_taken}")

Initial Array: [0. 1.]
Kernel launch: cudakernel0[1, 1](array)
Updated array: [0.5 1.5] 
 Time taken: 0.0654442310333252


In [25]:
# Same two-element input as the serial run, for a direct comparison.
array = np.array([0, 1], dtype=np.float32)
print(f"Initial Array: {array}")

# 1024 blocks x 1024 threads. cudakernel0 makes *every* launched thread loop
# over the whole array, so the unsynchronised read-modify-write updates collide
# and the final values depend on thread timing rather than being a clean
# multiple of 0.5 per thread.
gridSize = 1024
blockSize = 1024

# Build the label from the actual launch configuration so the printed text
# cannot drift from what is really executed.
print(f'Kernel launch: cudakernel0[{gridSize}, {blockSize}](array)')
start_time = time.time()
cudakernel0[gridSize, blockSize](array)
time_taken = time.time() - start_time

print(f"Updated array: {array} \n Time taken: {time_taken}")

Initial Array: [0. 1.]
Kernel launch: cudakernel0[1, 1](array)
Updated array: [14.  15.5] 
 Time taken: 0.0020678043365478516


In [26]:
# Two-element input: one thread will be responsible for each element.
array = np.array([0, 1], dtype=np.float32)
print(f"Initial Array: {array}")

# A single block of two threads; thread i updates array[i].
gridSize, blockSize = 1, 2

print('Kernel launch: cudakernel1[1, 2](array)')
start_time = time.time()
cudakernel1[gridSize, blockSize](array)
time_taken = time.time() - start_time

print(f"Updated array: {array} \n Time taken: {time_taken}")

Initial Array: [0. 1.]
Kernel launch: cudakernel1[1, 2](array)
Updated array: [0.5 1.5] 
 Time taken: 0.00250244140625


In [27]:
# 1024 * 1024 zeros: exactly one element per launched thread below.
array = np.zeros(1024 * 1024)
print(f"Initial Array: {array}")

# 1024 blocks of 1024 threads cover the array with no threads left over.
gridSize = blockSize = 1024

print('Kernel launch: cudakernel1[1024, 1024](array)')
start_time = time.time()
cudakernel1[gridSize, blockSize](array)
time_taken = time.time() - start_time

print(f"Updated array: {array} \n Time taken: {time_taken}")

Initial Array: [0. 0. 0. ... 0. 0. 0.]
Kernel launch: cudakernel1[1024, 1024](array)
Updated array: [0.5 0.5 0.5 ... 0.5 0.5 0.5] 
 Time taken: 0.006869316101074219


## Streams and Events

In [28]:
# Number of CUDA streams the work will be split across.
N_STREAMS = 10

# Input buffer of 1024 * 1024 zeros that the streams will process in chunks.
array = np.zeros(1024 * 1024)

# One stream object per chunk of work.
streams = [
    cuda.stream()
    for _stream_number in range(N_STREAMS)
]

In [29]:
# Peek at the first five stream objects.
streams[:5]

[<CUDA stream 50925936 on <CUDA context c_void_p(40175984) of device 0>>,
 <CUDA stream 50866272 on <CUDA context c_void_p(40175984) of device 0>>,
 <CUDA stream 51243200 on <CUDA context c_void_p(40175984) of device 0>>,
 <CUDA stream 50919984 on <CUDA context c_void_p(40175984) of device 0>>,
 <CUDA stream 50518320 on <CUDA context c_void_p(40175984) of device 0>>]

In [30]:
# Ceiling division written as -(-a // b): elements handled by each stream,
# rounded up so that N_STREAMS chunks cover the whole array.
stream_size = -(-len(array) // N_STREAMS)

# Threads per block, and the number of blocks needed to cover one chunk
# (again rounded up).
blockSize = 256
gridSize = -(-stream_size // blockSize)

print(f"Per stream blockSize: {blockSize}")
print(f"Per stream gridSize: {gridSize}")

Per stream blockSize: 256
Per stream gridSize: 410


In [31]:
# Host buffer that collects the result of every chunk.
result_array = np.empty_like(array)
start_time = time.time()

# NOTE(review): the host buffers are ordinary NumPy arrays; for the copies to
# genuinely overlap with kernels on other streams they presumably need pinned
# host memory (e.g. cuda.pinned_array) -- confirm.
for i, stream in enumerate(streams):
    # Slice bounds are clamped to the array length, so the final (shorter)
    # chunk needs no special case.
    chunk = slice(i * stream_size, (i + 1) * stream_size)
    host_chunk = array[chunk]
    if host_chunk.size == 0:
        # More streams than chunks of work: nothing to process here.
        continue

    # copy this chunk to the device on its own stream
    d_array = cuda.to_device(host_chunk, stream=stream)

    # run the kernel; the launch configuration order is
    # [blocks per grid, threads per block, stream].
    # gridSize * blockSize is rounded up to whole blocks and can exceed the
    # chunk length, so the kernel must ignore out-of-range thread indices.
    cudakernel1[gridSize, blockSize, stream](d_array)

    # copy the result back into the matching part of the host buffer
    d_array.copy_to_host(result_array[chunk], stream=stream)

# Wait for all streams to finish before stopping the clock.
cuda.synchronize()
time_taken = time.time() - start_time

print(f"Time Taken : {time_taken}")

Time Taken : 0.022527217864990234


In [32]:
# Display the gathered results (NumPy abbreviates large arrays).
result_array

array([0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5])

In [33]:
# Sanity check: every element started at 0 and should now be exactly 0.5.
expected = np.full(1024 * 1024, 0.5)
np.all(result_array == expected)

True