# Element-wise update

<a href="https://colab.research.google.com/github/mark-hobbs/articles/blob/main/cuda/element-wise-update.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Update particle positions

In [16]:
!uv pip install -q --system numba-cuda==0.15.0

import numpy as np
from numba import  njit, prange, config
config.CUDA_ENABLE_PYNVJITLINK = 1

try:
    import google.colab
    !git clone https://github.com/mark-hobbs/articles.git
    import os
    os.chdir('articles/cuda')  # Navigate to the cuda subdirectory
except ImportError:
    pass  # Already local, no need to clone

import utils

Cloning into 'articles'...
remote: Enumerating objects: 595, done.[K
remote: Counting objects: 100% (131/131), done.[K
remote: Compressing objects: 100% (112/112), done.[K
remote: Total 595 (delta 73), reused 61 (delta 19), pack-reused 464 (from 1)[K
Receiving objects: 100% (595/595), 101.10 MiB | 37.12 MiB/s, done.
Resolving deltas: 100% (287/287), done.


In [17]:
np.random.seed(42)

n_particles = 1500000
n_dimensions = 2

f = np.random.randn(n_particles, n_dimensions)
u = np.zeros((n_particles, n_dimensions))
v = np.zeros((n_particles, n_dimensions))
a = np.zeros((n_particles, n_dimensions))
bc_flag = np.ones((n_particles, n_dimensions))
bc_unit_vector = np.ones((n_particles, n_dimensions))
bc_magnitude = 1
density = 1.0
damping = 1.0
dt = 1.0

In [18]:
@utils.profile(runs=10)
def euler_cromer_a(
    f, u, v, a, density, bc_flag, bc_magnitude, bc_unit_vector, damping, dt
):
    n_nodes = np.shape(f)[0]
    n_dimensions = np.shape(f)[1]

    for node_i in range(n_nodes):
        for dof in range(n_dimensions):
            a[node_i, dof] = (f[node_i, dof] - damping * v[node_i, dof]) / density
            v[node_i, dof] += (a[node_i, dof] * dt)
            u[node_i, dof] += (v[node_i, dof] * dt)

            if bc_flag[node_i, dof] != 0:
                u[node_i, dof] = bc_magnitude * bc_unit_vector[node_i, dof]

    return u, v

In [19]:
@utils.profile(runs=10)
def euler_cromer_b(
    f, u, v, a, density, bc_flag, bc_magnitude, bc_unit_vector, damping, dt
):
    a[:] = (f - damping * v) / density
    v += a * dt
    u += v * dt

    mask = bc_flag != 0
    u[mask] = bc_magnitude * bc_unit_vector[mask]

    return u, v

In [20]:
@utils.profile(runs=10)
@njit(parallel=True)
def euler_cromer_c(
    f, u, v, a, density, bc_flag, bc_magnitude, bc_unit_vector, damping, dt
):
    n_nodes = np.shape(f)[0]
    n_dimensions = np.shape(f)[1]

    for node_i in prange(n_nodes):
        for dof in range(n_dimensions):
            a[node_i, dof] = (f[node_i, dof] - damping * v[node_i, dof]) / density
            v[node_i, dof] += (a[node_i, dof] * dt)
            u[node_i, dof] += (v[node_i, dof] * dt)

            if bc_flag[node_i, dof] != 0:
                u[node_i, dof] = bc_magnitude * bc_unit_vector[node_i, dof]

    return u, v

In [21]:
euler_cromer_a(f, u, v, a, density, bc_flag, bc_magnitude, bc_unit_vector, damping, dt);
euler_cromer_b(f, u, v, a, density, bc_flag, bc_magnitude, bc_unit_vector, damping, dt);
euler_cromer_c(f, u, v, a, density, bc_flag, bc_magnitude, bc_unit_vector, damping, dt);

Function 'euler_cromer_a' executed 10 time(s)
Average execution time: 6.3897 seconds
Min: 6.3321s, Max: 6.4412s

Function 'euler_cromer_b' executed 10 time(s)
Average execution time: 0.0551 seconds
Min: 0.0533s, Max: 0.0583s

Function 'euler_cromer_c' executed 10 time(s)
Average execution time: 0.0678 seconds
Min: 0.0033s, Max: 0.6475s



## Numba CUDA

In [22]:
from numba import cuda, float32

In [23]:
utils.get_cuda_device_info()

CUDA Device Information:
----------------------------------------
CUDA Runtime Version:          12.6
Device Name:                   NVIDIA L4                                                                                                                       
Compute Capability:            (8, 9)

Memory:
Total Memory:                  23.80 GB
Free Memory:                   23.52 GB

Compute Resources:
Streaming Multiprocessors:     58
Max Threads per Block:         1024

Grid Limitations:
Max Grid Dimensions X:         2147483647
Max Grid Dimensions Y:         65535
Max Grid Dimensions Z:         65535

Additional Characteristics:
Warp Size:                     32
Clock Rate:                    2.04 GHz
Memory Clock Rate:             6.25 GHz


In [24]:
@utils.profile(runs=10)
def euler_cromer_gpu(
    f, u, v, a, density, bc_flag, bc_magnitude, bc_unit_vector, damping, dt
):
    """
    Update particle positions using an Euler-Cromer time integration scheme

    This function is a wrapper for the CUDA kernel `euler_cromer_kernel`
    """
    THREADS_PER_BLOCK = 256
    BLOCKS_PER_GRID = (n_particles + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK
    euler_cromer_kernel[BLOCKS_PER_GRID, THREADS_PER_BLOCK](
        f, u, v, a, density, bc_flag, bc_magnitude, bc_unit_vector, damping, dt
    )


@cuda.jit
def euler_cromer_kernel(
    f, u, v, a, density, bc_flag, bc_magnitude, bc_unit_vector, damping, dt
):
    """
    CUDA kernel for Euler-Cromer time integration scheme
    """
    n_nodes = f.shape[0]
    n_dimensions = f.shape[1]

    idx = cuda.grid(1)
    total = n_nodes * n_dimensions

    if idx < total:
        node_i = idx // n_dimensions
        dof = idx % n_dimensions

        a[node_i, dof] = (f[node_i, dof] - damping * v[node_i, dof]) / density
        v[node_i, dof] += a[node_i, dof] * dt
        u[node_i, dof] += v[node_i, dof] * dt

        if bc_flag[node_i, dof] != 0:
            u[node_i, dof] = bc_magnitude * bc_unit_vector[node_i, dof]

In [25]:
f = cuda.to_device(f.astype(np.float32))
u = cuda.to_device(u.astype(np.float32))
v = cuda.to_device(v.astype(np.float32))
a = cuda.to_device(a.astype(np.float32))
bc_flag = cuda.to_device(bc_flag.astype(np.int32))
bc_unit_vector = cuda.to_device(bc_unit_vector.astype(np.float32))

euler_cromer_gpu(f, u, v, a, density, bc_flag, bc_magnitude, bc_unit_vector, damping, dt)

Function 'euler_cromer_gpu' executed 10 time(s)
Average execution time: 0.0236 seconds
Min: 0.0001s, Max: 0.2352s



In [28]:
import time

def benchmark_kernel(f, u, v, a, density, bc_flag, bc_magnitude, bc_unit_vector, damping, dt, num_runs=100):

    n_particles = f.shape[0]
    THREADS_PER_BLOCK = 256
    BLOCKS_PER_GRID = (n_particles + THREADS_PER_BLOCK - 1) // THREADS_PER_BLOCK

    # Warm up the kernel
    for _ in range(5):
        euler_cromer_kernel[BLOCKS_PER_GRID, THREADS_PER_BLOCK](f, u, v, a, density, bc_flag, bc_magnitude, bc_unit_vector, damping, dt)

    cuda.synchronize()

    start = time.perf_counter()

    for _ in range(num_runs):
        euler_cromer_kernel[BLOCKS_PER_GRID, THREADS_PER_BLOCK](f, u, v, a, density, bc_flag, bc_magnitude, bc_unit_vector, damping, dt)

    cuda.synchronize()

    end = time.perf_counter()

    avg = (end - start) / num_runs
    return avg, u.copy_to_host()

In [40]:
cuda_event_time, result = benchmark_kernel(f, u, v, a, density, bc_flag, bc_magnitude, bc_unit_vector, damping, dt, num_runs=100)
print(f"Kernel executed in {cuda_event_time:.4f} seconds")

Kernel executed in 0.0003 seconds
