# LBM - Structure-of-Arrays (SoA)

In [1]:
import lbm
import time
import warp as wp
# Clear Warp's kernel cache so the kernels are compiled again during this run.
wp.clear_kernel_cache()

# Prefix handed to the lbm export helpers and echoed in the final report.
exercise_name = "01-lbm-singleGPU-soa"

# Problem definition: grid size, number of time steps, and flow parameters.
params = lbm.Parameters(
    num_steps=5000,
    nx=1024,
    ny=1024,
    prescribed_vel=0.5,
    Re=10000.0,
)
print(params)


Warp 1.7.1 initialized:
   CUDA Toolkit 12.8, Driver 12.8
   Devices:
     "cpu"      : "x86_64"
     "cuda:0"   : "NVIDIA RTX A4000" (16 GiB, sm_86, mempool enabled)
   Kernel cache:
     /root/.cache/warp/1.7.1
LBM Problem Parameters(nx=1024, ny=1024, num_steps=5000, Re=10000.0, prescribed_vel=0.5)


## A Structure-of-Arrays Layout for the LBM Populations

In the previous example, we introduced an implementation of the LBM based on the Array-of-Structures (AoS) layout.  
In this exercise, we are going to implement a Structure-of-Arrays (SoA) solution and compare it with the AoS version.  

<figure>
  <img src="img/panda.png" alt="Exercise" width="7%" style="float: left; margin-right: 10px; margin-bottom: 10px;">
  <figcaption><strong>Exercise</strong>: Currently, the next two cells define allocation and access functionalities for the populations based on the AoS layout. Please change them to match an SoA layout. Remember to always check the correctness of the results by comparing the final velocity magnitude.
  </figcaption>
</figure>

In [2]:
# SoA layout: the direction index is the leading dimension, followed by the
# two grid coordinates, so every direction owns its own (nx, ny) plane.
soa_shape = (params.Q, params.nx, params.ny)

# Two zero-initialized population buffers, used as ping-pong storage by the time loop.
f_0 = wp.zeros(soa_shape, dtype=wp.float64)
f_1 = wp.zeros(soa_shape, dtype=wp.float64)

In [3]:
@wp.func
def read_field(
        field: wp.array3d(dtype=wp.float64),
        card: wp.int32,
        xi: wp.int32,
        yi: wp.int32,
):
    """Return the population of direction ``card`` stored at grid node ``(xi, yi)``.

    The field is indexed as ``field[card, xi, yi]``, i.e. direction first (SoA).
    """
    population = field[card, xi, yi]
    return population

@wp.func
def write_field(
        field: wp.array3d(dtype=wp.float64),
        card: wp.int32,
        xi: wp.int32,
        yi: wp.int32,
        value: wp.float64,
):
    """Store ``value`` as the population of direction ``card`` at grid node ``(xi, yi)``.

    Uses the same ``field[card, xi, yi]`` indexing as ``read_field``.
    """
    field[card, xi, yi] = value

## LBM Kernels

Next, we report the same kernels presented in the previous example.

In [4]:
# Bundle the two population buffers with the accessors that define their layout.
mem = lbm.Memory(params, f_0=f_0, f_1=f_1, read=read_field, write=write_field)

# Helper objects provided by the lbm package (device functions and prebuilt kernels).
functions = lbm.Functions(params)
kernels = lbm.Kernels(params, mem)

# Module-level aliases of the parameters that the kernels below refer to by name.
Q, D = params.Q, params.D
bc_bulk = params.bc_bulk
c_dev, dim_dev = params.c_dev, params.dim_dev

In [5]:
@wp.kernel
def stream(
        f_in: wp.array3d(dtype=wp.float64),
        f_out: wp.array3d(dtype=wp.float64),
):
    """Streaming step in pull form.

    Each thread owns one grid node ``(ix, iy)``. For every direction ``q`` it
    reads population ``q`` of ``f_in`` at the node shifted by ``-c_dev[:, q]``
    and writes it as population ``q`` of ``f_out`` at its own node. If the
    shifted node lies outside ``dim_dev``, no read happens and the entry keeps
    the value the local vector was constructed with.
    """
    ix, iy = wp.tid()
    node = wp.vec2i(ix, iy)
    streamed = wp.vec(length=Q, dtype=wp.float64)

    for q in range(params.Q):
        # Source node this direction is pulled from.
        src = wp.vec2i(0, 0)
        inside = True

        for d in range(D):
            src[d] = node[d] - c_dev[d, q]
            if src[d] < 0 or src[d] >= dim_dev[d]:
                inside = False

        if inside:
            streamed[q] = read_field(field=f_in, card=q, xi=src[0], yi=src[1])

    # Store all gathered populations at this thread's own node.
    for q in range(params.Q):
        write_field(field=f_out, card=q, xi=ix, yi=iy, value=streamed[q])

In [6]:
# Function obtained from the lbm helpers; the kernel below calls it with a
# node's boundary type and indexes its result per direction.
compute_boundaries = functions.get_apply_boundary_conditions()

@wp.kernel
def apply_boundary_conditions(
        bc_type_field: wp.array2d(dtype=wp.uint8),
        f_out: wp.array3d(dtype=wp.float64),
):
    """Overwrite the populations of every non-bulk node with its boundary values.

    Each thread owns one grid node ``(ix, iy)``. Nodes whose type equals
    ``bc_bulk`` are left untouched; for all others, the vector returned by
    ``compute_boundaries`` is written into ``f_out`` direction by direction.
    """
    ix, iy = wp.tid()
    node_type = bc_type_field[ix, iy]

    # Only boundary nodes are modified.
    if node_type != bc_bulk:
        imposed = compute_boundaries(node_type)

        for q in range(params.Q):
            write_field(field=f_out, card=q, xi=ix, yi=iy, value=imposed[q])

In [7]:
# Functions obtained from the lbm helpers and called from the collide kernel.
compute_macroscopic = functions.get_macroscopic()
compute_equilibrium = functions.get_equilibrium()
compute_collision = functions.get_kbc()

@wp.kernel
def collide(
        f: wp.array3d(dtype=wp.float64),
        omega: wp.float64,
):
    """Collision step, applied in place on ``f``.

    Each thread owns one grid node ``(ix, iy)``: it gathers the node's ``Q``
    populations, derives the macroscopic quantities and the equilibrium from
    them, applies the collision operator with relaxation parameter ``omega``,
    and writes the result back to the same node.
    """
    ix, iy = wp.tid()

    # Gather this node's post-streaming populations into a local vector.
    pops = wp.vec(length=Q, dtype=wp.float64)
    for q in range(params.Q):
        pops[q] = read_field(field=f, card=q, xi=ix, yi=iy)

    macro = compute_macroscopic(pops)
    equilibrium = compute_equilibrium(macro)
    relaxed = compute_collision(pops, equilibrium, macro, omega)

    # Write the post-collision populations back to the same node.
    for q in range(params.Q):
        write_field(field=f, card=q, xi=ix, yi=iy, value=relaxed[q])

In [8]:
# Prepare the LDC problem (presumably lid-driven cavity — confirm in the lbm package)
# on the memory object built above.
lbm.setup_LDC_problem(params=params, mem=mem)
# Export the problem setup, using the exercise name as prefix.
lbm.export_setup(prefix=exercise_name, params=params, mem=mem)

Module lbm.kernels 218bc5d load on device 'cuda:0' took 132.82 ms  (compiled)
Module lbm.kernels ba5c45c load on device 'cuda:0' took 283.15 ms  (compiled)


<Figure size 640x480 with 0 Axes>

## LBM Iteration


In [9]:
def iterate():
    """Advance the simulation by one time step.

    Launches, in order, ``stream`` (``f_0`` -> ``f_1``),
    ``apply_boundary_conditions`` (on ``f_1``) and ``collide`` (in place on
    ``f_1``), then swaps the two buffers so that ``mem.f_0`` holds the newest
    populations when the function returns.
    """
    launch_dim = params.launch_dim

    # (kernel, inputs) pairs in execution order.
    passes = (
        (stream, [mem.f_0, mem.f_1]),
        (apply_boundary_conditions, [mem.bc_type, mem.f_1]),
        (collide, [mem.f_1, params.omega]),
    )
    for kernel, kernel_inputs in passes:
        wp.launch(kernel, dim=launch_dim, inputs=kernel_inputs, device="cuda")

    # Swap the fields
    mem.f_0, mem.f_1 = mem.f_1, mem.f_0

## Main Loop

In [10]:
# Warm-up iteration: run one step before timing so that one-off costs of the
# first launch are not included in the measurement.
iterate()

# Wait for the warm-up to finish.
wp.synchronize()

# Start timer. perf_counter() is a monotonic, high-resolution clock meant for
# measuring intervals; time.time() follows the wall clock and can jump if the
# system time is adjusted while the loop is running.
start = time.perf_counter()
for it in range(params.num_steps):
    iterate()

# Wait for all queued GPU work to complete before stopping the timer.
wp.synchronize()
stop = time.perf_counter()

# Saving the velocity magnitude. iterate() swaps the buffers at the end of
# every step, so mem.f_0 holds the most recent populations.
lbm.export_final(prefix=exercise_name, params=params, mem=mem, f=mem.f_0)

Module __main__ b947c69 load on device 'cuda:0' took 946.59 ms  (compiled)
Module lbm.kernels 641e288 load on device 'cuda:0' took 366.78 ms  (compiled)


<Figure size 640x480 with 0 Axes>

## Performance

<figure>
  <img src="img/panda.png" alt="Exercise" width="7%" style="float: left; margin-right: 10px; margin-bottom: 10px;">
  <figcaption><strong>Exercise</strong>: How does performance change when moving from AoS to SoA? Why? Also, run the profiler to observe the performance differences for each kernel.
  </figcaption>
</figure>

In [11]:
# Summarize the timed main loop: wall time and throughput in MLUPS.
elapsed_time = stop - start
mlups = params.compute_mlups(elapsed_time)

separator = "--------------------------------------------"
report_lines = [
    separator,
    f"Exercise: {exercise_name}",
    f"Main loop time: {elapsed_time:5.3f} seconds",
    f"MLUPS:          {mlups:5.1f}",
    separator,
]
print("\n".join(report_lines))

--------------------------------------------
Exercise: 01-lbm-singleGPU-soa
Main loop time: 16.916 seconds
MLUPS:          309.9
--------------------------------------------
