# Parallel reduction

[Optimising parallel reduction in CUDA](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf)

## Reduce bond forces to particle forces

Bond forces can be stored as a bondlist or neighbour list

- bondlist [n_bonds, 2]
- neighbourlist [n_particles, n_family_members]

Reduce:
- particles.forces [n_particles, 1]

In [46]:
import time
from functools import wraps

import numpy as np
from numba import njit, prange

In [47]:
def timeit(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        print(f"Function '{func.__name__}' executed in {end - start:.4f} seconds")
        return result

    return wrapper

In [48]:
np.random.seed(42)
n_particles = 1500000
n_family_members = 256

neighbourlist = np.random.rand(n_particles, n_family_members)

### Numpy and Numba

In [49]:
@timeit
def reduce_bond_forces_a(neighbourlist):
    n_particles = neighbourlist.shape[0]
    f = np.zeros((n_particles))
    for i in range(n_particles):
        f[i] = np.sum(neighbourlist[i, :])
    return f

In [50]:
@timeit
def reduce_bond_forces_b(neighbourlist):
    return np.sum(neighbourlist, axis=1)

In [51]:
@timeit
@njit(parallel=True, fastmath=True)
def reduce_bond_forces_c(neighbourlist):
    n_particles = neighbourlist.shape[0]
    f = np.zeros((n_particles))
    for i in prange(n_particles):
        f[i] = np.sum(neighbourlist[i, :])
    return f

In [62]:
f_a = reduce_bond_forces_a(neighbourlist)
f_b = reduce_bond_forces_b(neighbourlist)
f_c = reduce_bond_forces_c(neighbourlist)
assert np.allclose(f_a, f_b) and np.allclose(f_b, f_c), "Results are not equal"

Function 'reduce_bond_forces_a' executed in 4.8073 seconds
Function 'reduce_bond_forces_b' executed in 0.2593 seconds
Function 'reduce_bond_forces_c' executed in 0.2392 seconds


### Numba CUDA

In [53]:
from numba import cuda, float32

In [54]:
from numba import cuda

def get_cuda_device_info(verbose=True):
    """
    Retrieve comprehensive information about the current CUDA device.

    Parameters:
    -----------
    verbose : bool, optional
        If True, print device information. If False, return as dictionary.

    Returns:
    --------
    dict or None
        Dictionary of device properties if verbose=False, otherwise None
    """
    try:
        device = cuda.get_current_device()
        context = cuda.current_context()
        cuda_version = cuda.runtime.get_version()  # (major, minor)

        device_info = {
            "cuda_runtime_version": f"{cuda_version[0]}.{cuda_version[1]}",
            "name": device.name,
            "compute_capability": device.compute_capability,
            "total_memory_gb": context.get_memory_info().total / 1e9,
            "free_memory_gb": context.get_memory_info().free / 1e9,
            "multiprocessors": device.MULTIPROCESSOR_COUNT,
            "max_threads_per_block": device.MAX_THREADS_PER_BLOCK,
            "max_grid_dimensions": {
                "x": device.MAX_GRID_DIM_X,
                "y": device.MAX_GRID_DIM_Y,
                "z": device.MAX_GRID_DIM_Z
            },
            "warp_size": device.WARP_SIZE,
            "clock_rate_khz": device.CLOCK_RATE,
            "memory_clock_rate_khz": device.MEMORY_CLOCK_RATE,
        }

        if verbose:
            print("CUDA Device Information:")
            print("-" * 40)
            print(f"{'CUDA Runtime Version:':<30} {device_info['cuda_runtime_version']}")
            print(f"{'Device Name:':<30} {device_info['name']}")
            print(f"{'Compute Capability:':<30} {device_info['compute_capability']}")

            print("\nMemory:")
            print(f"{'Total Memory:':<30} {device_info['total_memory_gb']:.2f} GB")
            print(f"{'Free Memory:':<30} {device_info['free_memory_gb']:.2f} GB")

            print("\nCompute Resources:")
            print(f"{'Streaming Multiprocessors:':<30} {device_info['multiprocessors']}")
            print(f"{'Max Threads per Block:':<30} {device_info['max_threads_per_block']}")

            print("\nGrid Limitations:")
            print(f"{'Max Grid Dimensions X:':<30} {device_info['max_grid_dimensions']['x']}")
            print(f"{'Max Grid Dimensions Y:':<30} {device_info['max_grid_dimensions']['y']}")
            print(f"{'Max Grid Dimensions Z:':<30} {device_info['max_grid_dimensions']['z']}")

            print("\nAdditional Characteristics:")
            print(f"{'Warp Size:':<30} {device_info['warp_size']}")
            print(f"{'Clock Rate:':<30} {device_info['clock_rate_khz']/1e6:.2f} GHz")
            print(f"{'Memory Clock Rate:':<30} {device_info['memory_clock_rate_khz']/1e6:.2f} GHz")

        return device_info if not verbose else None

    except Exception as e:
        print(f"Error retrieving CUDA device information: {e}")
        return None

In [55]:
@cuda.jit
def row_sum_kernel(neighbourlist, output):
    row = cuda.blockIdx.x
    tid = cuda.threadIdx.x
    n_cols = neighbourlist.shape[1]

    # Allocate shared memory for each thread to load one value
    sdata = cuda.shared.array(512, dtype=float32)  # Adjust size if needed

    val = 0.0
    if tid < n_cols:
        val = neighbourlist[row, tid]

    sdata[tid] = val
    cuda.syncthreads()

    s = cuda.blockDim.x // 2
    while s > 0:
        if tid < s and tid + s < n_cols:
            sdata[tid] += sdata[tid + s]
        cuda.syncthreads()
        s //= 2

    if tid == 0:
        output[row] = sdata[0]

@timeit
def reduce_bond_forces_gpu(neighbourlist):
    n_particles, n_family_members = neighbourlist.shape
    threads_per_block = 512  # Match shared memory allocation
    shared_mem = threads_per_block * 4  # float32: 4 bytes

    d_neigh = cuda.to_device(neighbourlist.astype(np.float32))
    d_out = cuda.device_array(n_particles, dtype=np.float32)

    row_sum_kernel[n_particles, threads_per_block](d_neigh, d_out)
    return d_out.copy_to_host()

In [56]:
get_cuda_device_info()

CUDA Device Information:
----------------------------------------
CUDA Runtime Version:          12.5
Device Name:                   b'Tesla T4'
Compute Capability:            (7, 5)

Memory:
Total Memory:                  15.83 GB
Free Memory:                   14.17 GB

Compute Resources:
Streaming Multiprocessors:     40
Max Threads per Block:         1024

Grid Limitations:
Max Grid Dimensions X:         2147483647
Max Grid Dimensions Y:         65535
Max Grid Dimensions Z:         65535

Additional Characteristics:
Warp Size:                     32
Clock Rate:                    1.59 GHz
Memory Clock Rate:             5.00 GHz


See [this post](https://github.com/googlecolab/colabtools/issues/5081) to understand compatability issues with Google Colab and Numba CUDA

In [57]:
!uv pip install -q --system numba-cuda==0.4.0

In [58]:
from numba import config
config.CUDA_ENABLE_PYNVJITLINK = 1

In [60]:
f_gpu = reduce_bond_forces_gpu(neighbourlist)
assert np.allclose(f_a, f_gpu), "Results are not equal"

Function 'reduce_bond_forces_gpu' executed in 1.0020 seconds
