# LBM Multi-GPU Step 0

<figure>
  <img src="img/partitions.png" alt="Domain partitioning in LBM" width="50%">
  <figcaption><strong>Figure 3: Domain Partitioning</strong></figcaption>
</figure>


In [1]:
import time
import lbm_mgpu
import warp as wp

# Identifier of this exercise; passed as the `prefix` argument to
# lbm_mgpu.export_setup and lbm_mgpu.export_final later in the notebook.
exercise_name = "01-lbm-mgpu-occ"


In [2]:
# Clear Warp's kernel cache before anything is built in this session.
wp.clear_kernel_cache()

gpus = wp.get_cuda_devices()

# Testing aid: on a single-GPU machine, list that one device four times so
# the multi-partition code path can still be exercised (oversubscription).
if len(gpus) == 1:
    gpus = gpus * 4

# Simulation configuration shared by every later cell.
params = lbm_mgpu.Parameters(
    num_steps=5000,
    gpus=gpus,
    nx=1024,
    ny=1024,
    prescribed_vel=0.5,
    Re=10000.0,
)

Warp 1.7.1 initialized:
   CUDA Toolkit 12.8, Driver 12.8
   Devices:
     "cpu"      : "x86_64"
     "cuda:0"   : "NVIDIA RTX A4000" (16 GiB, sm_86, mempool enabled)
   Kernel cache:
     /root/.cache/warp/1.7.1



<figure>
  <img src="img/timeline.png" alt="Execution timeline of the multi-GPU LBM iteration" width="50%">
  <figcaption><strong>Figure: Execution Timeline</strong></figcaption>
</figure>

In [3]:
# Warp struct describing one partition of the simulation domain.
# NOTE(review): the cells below instantiate lbm_mgpu.Partition, not this
# class; this definition presumably mirrors it for reference - confirm.
# The field descriptions follow how the notebook fills in the
# lbm_mgpu.Partition objects that use the same field names.
@wp.struct
class Partition:
    # Index of this partition.
    id: wp.int32
    # Total number of partitions.
    num_partitions: wp.int32
    # Number of slices along dimension 0 owned by each partition.
    slices_per_partition: wp.int32
    # Domain coordinates of the partition's first cell.
    origin: wp.vec(length=2, dtype=wp.int32)
    # Extent of the cells owned by the partition (no halo).
    shape: wp.vec(length=2, dtype=wp.int32)
    # Extent of the allocated field: `shape` plus two extra slices along dimension 0.
    shape_with_halo: wp.vec(length=2, dtype=wp.int32)
    # Extent of the whole (unpartitioned) domain.
    shape_domain: wp.vec(length=2, dtype=wp.int32)
    # Launch dimensions used for the `red` kernel.
    shape_red: wp.vec(length=2, dtype=wp.int32)
    # Launch dimensions used for the `green` kernel.
    shape_green: wp.vec(length=2, dtype=wp.int32)

<figure>
  <img src="img/green-red-dim.png" alt="Dimensions of the green and red regions of a partition" width="50%">
  <figcaption><strong>Figure: Green and Red Region Dimensions</strong></figcaption>
</figure>

<figure>
  <img src="img/panda.png" alt="Exercise" width="7%" style="float: left; margin-right: 10px; margin-bottom: 10px;">
  <figcaption><strong>Exercise</strong>: Complete the missing parameters in the initialization of the partitions.</figcaption>
</figure>


In [4]:
# Build one partition descriptor per GPU. The domain is split along
# dimension 0 only; every partition spans the full extent of dimension 1.
partitions = []

for i in range(params.num_gpsu):
    partition = lbm_mgpu.Partition()
    partition.id = i
    partition.num_partitions = params.num_gpsu
    # Integer division: assumes params.dim[0] is a multiple of the number of
    # partitions, otherwise the remainder slices are not assigned - TODO confirm.
    partition.slices_per_partition = params.dim[0] // params.num_gpsu
    # Domain coordinates of the first cell owned by this partition.
    partition.origin[0] = i * partition.slices_per_partition
    partition.origin[1] = 0

    # Cells owned by this partition, and the size of the full domain.
    partition.shape[0] = partition.slices_per_partition
    partition.shape[1] = params.dim[1]
    partition.shape_domain[0] = params.dim[0]
    partition.shape_domain[1] = params.dim[1]

    # Two extra slices along dimension 0 (one halo slice on each side).
    partition.shape_with_halo[0] = partition.shape[0] + 2
    partition.shape_with_halo[1] = partition.shape[1]  # no halo along dimension 1

    # Exercise placeholders: launch dimensions of the `green` kernel.
    partition.shape_green[0] = MISSING  
    partition.shape_green[1] = MISSING 

    # Exercise placeholders: launch dimensions of the `red` kernel.
    partition.shape_red[0] = MISSING
    partition.shape_red[1] = MISSING

    partitions.append(partition)

In [5]:
# Visit every ordered (source, destination) device pair, including a device
# paired with itself, in the same order as two nested loops over params.gpus.
device_pairs = [(src_gpu, dst_gpu) for src_gpu in params.gpus for dst_gpu in params.gpus]

for src_gpu, dst_gpu in device_pairs:
    # Turn peer access on wherever the hardware reports support for it.
    if wp.is_peer_access_supported(src_gpu, dst_gpu):
        wp.set_peer_access_enabled(src_gpu, dst_gpu, True)

    # Report the pairs for which peer access is actually active.
    if wp.is_peer_access_enabled(src_gpu, dst_gpu):
        print(f"peer access {src_gpu}->{dst_gpu}")

peer access cuda:0->cuda:0
peer access cuda:0->cuda:0
peer access cuda:0->cuda:0
peer access cuda:0->cuda:0
peer access cuda:0->cuda:0
peer access cuda:0->cuda:0
peer access cuda:0->cuda:0
peer access cuda:0->cuda:0
peer access cuda:0->cuda:0
peer access cuda:0->cuda:0
peer access cuda:0->cuda:0
peer access cuda:0->cuda:0
peer access cuda:0->cuda:0
peer access cuda:0->cuda:0
peer access cuda:0->cuda:0
peer access cuda:0->cuda:0


<figure>
  <img src="img/panda.png" alt="Exercise" width="7%" style="float: left; margin-right: 10px; margin-bottom: 10px;">
  <figcaption><strong>Exercise</strong>: What happens if we don’t enable peer access?
  </figcaption>
</figure>


In [6]:
def get_fields(partitions):
    """Allocate one zero-filled field per partition.

    The k-th array has shape ``(params.Q, nx, ny)``, where ``nx`` and ``ny``
    are the k-th partition's ``shape_with_halo`` extents, holds ``wp.float64``
    values and is placed on ``params.gpus[k]``.

    Args:
        partitions: Sequence of partition descriptors.

    Returns:
        List of Warp arrays, in the same order as ``partitions``.
    """
    return [
        wp.zeros(
            (params.Q, part.shape_with_halo[0], part.shape_with_halo[1]),
            dtype=wp.float64,
            device=params.gpus[index],
        )
        for index, part in enumerate(partitions)
    ]

# Two independent sets of per-partition arrays; they are handed to
# lbm_mgpu.Memory as `f_0` and `f_1` in the next cell.
f_0 = get_fields(partitions)
f_1 = get_fields(partitions)

@wp.func
def read_field(
        field: wp.array3d(dtype=wp.float64),
        card: wp.int32,
        xi: wp.int32,
        yi: wp.int32,
):
    # `xi` is a halo-free coordinate; stored slices are shifted by one along
    # the second array axis, so xi == -1 addresses slice 0.
    stored_xi = xi + 1
    return field[card, stored_xi, yi]

@wp.func
def write_field(
        field: wp.array3d(dtype=wp.float64),
        card: wp.int32,
        xi: wp.int32,
        yi: wp.int32,
        value: wp.float64,
):
    # Same one-slice shift as read_field: halo-free `xi` maps to slice xi + 1.
    stored_xi = xi + 1
    field[card, stored_xi, yi] = value

In [7]:
# Initialize the memory
mem = lbm_mgpu.Memory(params,
                      partitions,
                      f_0=f_0,
                      f_1=f_1,
                      read=read_field,
                      write=write_field)

# Initialize the kernels
functions = lbm_mgpu.Functions(params)
kernels = lbm_mgpu.Kernels(params, mem)

Q = params.Q
D = params.D
bc_bulk = params.bc_bulk
c_dev = params.c_dev

compute_boundaries = functions.get_apply_boundary_conditions()

compute_macroscopic = functions.get_macroscopic()
compute_equilibrium = functions.get_equilibrium()
compute_collision = functions.get_kbc()


<figure>
  <img src="img/events.png" alt="Events used to synchronize the streams" width="50%">
  <figcaption><strong>Figure: Stream Synchronization Events</strong></figcaption>
</figure>

In [8]:
# Per-GPU streams: one for each compute kernel and one per halo direction.
streams_compute_red, streams_compute_green = [], []
streams_halo_pull_left, streams_halo_pull_right = [], []

# Per-GPU events: one per compute kernel and one per halo direction.
events_red_done, events_green_done = [], []
events_halo_left, events_halo_right = [], []

for i in range(params.num_gpsu):
    gpu = params.gpus[i]

    streams_compute_red.append(wp.Stream(gpu))
    streams_compute_green.append(wp.Stream(gpu))

    streams_halo_pull_left.append(wp.Stream(gpu))
    streams_halo_pull_right.append(wp.Stream(gpu))

    events_red_done.append(wp.Event(gpu))
    events_green_done.append(wp.Event(gpu))

    events_halo_left.append(wp.Event(gpu))
    events_halo_right.append(wp.Event(gpu))


<figure>
  <img src="img/panda.png" alt="Exercise" width="7%" style="float: left; margin-right: 10px; margin-bottom: 10px;">
  <figcaption><strong>Exercise</strong>: Complete the missing initializations.
  </figcaption>
</figure>

In [9]:

# Kernel performing one pull-stream + boundary + collision update per thread.
# It is launched by iterate() with dim=p.shape_green and the inputs
# (partition, params.omega, mem.f_0[i], mem.bc_type[i], mem.f_1[i]).
#
#   partition     - descriptor of the partition this launch works on
#   omega         - forwarded unchanged to compute_collision
#   f_in          - input field, read through read_field (halo-shifted)
#   bc_type_field - per-cell boundary type, indexed with halo-free coordinates
#   f_out         - output field, written through write_field (halo-shifted)
#
# Q, D, c_dev, bc_bulk, params and the compute_* functions are module-level
# names defined in earlier cells.
@wp.kernel
def green(
        partition: lbm_mgpu.Partition,
        omega: wp.float64,
        f_in: wp.array3d(dtype=wp.float64),
        bc_type_field: wp.array2d(dtype=wp.uint8),
        f_out: wp.array3d(dtype=wp.float64),
):
    # Thread coordinates within the launch grid
    it, jt = wp.tid()
    # Exercise placeholder: to be completed by the reader.
    MISSING
    # Halo-free cell coordinates inside the partition, and the same cell in
    # whole-domain coordinates.
    partition_index = wp.vec2i(it, jt)
    domain_index = partition_index + partition.origin

    f_post = wp.vec(length=Q, dtype=wp.float64)
    bc_type = bc_type_field[partition_index[0], partition_index[1]]

    # Streaming (pull): for each q, read the value stored at the cell offset
    # by -c_dev[:, q] from this one.
    for q in range(params.Q):
        partition_pull_ngh = wp.vec2i(0, 0)
        domain_pull_ngh = wp.vec2i(0, 0)

        outside_domain = False

        for d in range(D):
            partition_pull_ngh[d] = partition_index[d] - c_dev[d, q]
            domain_pull_ngh[d] = domain_index[d] - c_dev[d, q]
            # The bounds test uses whole-domain coordinates, so a neighbour
            # that only leaves the partition (i.e. lies in the halo) is still read.
            if domain_pull_ngh[d] < 0 or domain_pull_ngh[d] >= partition.shape_domain[d]:
                outside_domain = True
        # f_post[q] is left as constructed when the neighbour is outside the domain.
        if not outside_domain:
            f_post[q] = read_field(field=f_in, card=q, xi=partition_pull_ngh[0], yi=partition_pull_ngh[1])

    # Non-bulk cells: replace the pulled values with the boundary-condition result.
    if bc_type != bc_bulk:
        f_post = compute_boundaries(bc_type)

    # Quantities derived from f_post that the next two calls consume.
    mcrpc = compute_macroscopic(f_post)

    # Compute the equilibrium
    f_eq = compute_equilibrium(mcrpc)

    # Collision step (compute_collision comes from functions.get_kbc()).
    f_post = compute_collision(f_post, f_eq, mcrpc, omega)

    # Store the updated values for this cell in the output field
    for q in range(params.Q):
        write_field(field=f_out, card=q, xi=partition_index[0], yi=partition_index[1], value=f_post[q])


<figure>
  <img src="img/panda.png" alt="Exercise" width="7%" style="float: left; margin-right: 10px; margin-bottom: 10px;">
  <figcaption><strong>Exercise</strong>: Complete the missing initializations.
  </figcaption>
</figure>

In [10]:
# Kernel performing one pull-stream + boundary + collision update per thread.
# It is launched by iterate() with dim=p.shape_red and the inputs
# (partition, params.omega, mem.f_0[i], mem.bc_type[i], mem.f_1[i]).
# Apart from the exercise placeholder after wp.tid(), the body matches the
# `green` kernel.
#
#   partition     - descriptor of the partition this launch works on
#   omega         - forwarded unchanged to compute_collision
#   f_in          - input field, read through read_field (halo-shifted)
#   bc_type_field - per-cell boundary type, indexed with halo-free coordinates
#   f_out         - output field, written through write_field (halo-shifted)
#
# Q, D, c_dev, bc_bulk, params and the compute_* functions are module-level
# names defined in earlier cells.
@wp.kernel
def red(
        partition: lbm_mgpu.Partition,
        omega: wp.float64,
        f_in: wp.array3d(dtype=wp.float64),
        bc_type_field: wp.array2d(dtype=wp.uint8),
        f_out: wp.array3d(dtype=wp.float64),
):
    # Thread coordinates within the launch grid
    it, jt = wp.tid()
    # Exercise placeholders: condition and statement to be completed by the reader.
    if MISSING:
        MISSING
    # Halo-free cell coordinates inside the partition, and the same cell in
    # whole-domain coordinates.
    partition_index = wp.vec2i(it, jt)
    domain_index = partition_index + partition.origin

    f_post = wp.vec(length=Q, dtype=wp.float64)
    bc_type = bc_type_field[partition_index[0], partition_index[1]]

    # Streaming (pull): for each q, read the value stored at the cell offset
    # by -c_dev[:, q] from this one.
    for q in range(params.Q):
        partition_pull_ngh = wp.vec2i(0, 0)
        domain_pull_ngh = wp.vec2i(0, 0)

        outside_domain = False

        for d in range(D):
            partition_pull_ngh[d] = partition_index[d] - c_dev[d, q]
            domain_pull_ngh[d] = domain_index[d] - c_dev[d, q]
            # The bounds test uses whole-domain coordinates, so a neighbour
            # that only leaves the partition (i.e. lies in the halo) is still read.
            if domain_pull_ngh[d] < 0 or domain_pull_ngh[d] >= partition.shape_domain[d]:
                outside_domain = True
        # f_post[q] is left as constructed when the neighbour is outside the domain.
        if not outside_domain:
            f_post[q] = read_field(field=f_in, card=q, xi=partition_pull_ngh[0], yi=partition_pull_ngh[1])

    # Non-bulk cells: replace the pulled values with the boundary-condition result.
    if bc_type != bc_bulk:
        f_post = compute_boundaries(bc_type)

    # Quantities derived from f_post that the next two calls consume.
    mcrpc = compute_macroscopic(f_post)

    # Compute the equilibrium
    f_eq = compute_equilibrium(mcrpc)

    # Collision step (compute_collision comes from functions.get_kbc()).
    f_post = compute_collision(f_post, f_eq, mcrpc, omega)

    # Store the updated values for this cell in the output field
    for q in range(params.Q):
        write_field(field=f_out, card=q, xi=partition_index[0], yi=partition_index[1], value=f_post[q])

In [11]:
# Prepare the problem in `mem` for all partitions, then export the setup
# under the exercise prefix.
# NOTE(review): "LDC" presumably stands for lid-driven cavity - confirm.
lbm_mgpu.setup_LDC_problem(params=params, partitions=partitions, mem=mem)
lbm_mgpu.export_setup(prefix=exercise_name, params=params, partitions=partitions, mem=mem)

Module lbm_mgpu.kernels 0620e2b load on device 'cuda:0' took 134.27 ms  (compiled)
Module lbm_mgpu.kernels 9701ec0 load on device 'cuda:0' took 283.91 ms  (compiled)


<Figure size 640x480 with 0 Axes>

<figure>
  <img src="img/panda.png" alt="Exercise" width="7%" style="float: left; margin-right: 10px; margin-bottom: 10px;">
  <figcaption><strong>Exercise</strong>: Add record and wait event operations.
  </figcaption>
</figure>

In [12]:
# #mem.save_magnituge_vtk(0)
def iterate():
    for i, p in enumerate(partitions):
        for q in range(params.Q):
            if i != params.num_gpsu - 1:
                src = mem.f_0[i + 1][q, 1]
                dst = mem.f_0[i][q, partition.shape_with_halo[0] - 1]
                wp.copy(src=src, dest=dst, count=p.shape[1], stream=streams_halo_pull_right[i])
            if i != 0:
                src = mem.f_0[i - 1][q, partition.shape_with_halo[0] - 2]
                dst = mem.f_0[i][q, 0]
                wp.copy(src=src, dest=dst, count=p.shape[1], stream=streams_halo_pull_left[i])

        MISSING?
        MISSING?


    for i, p in enumerate(partitions):

        MISSING?
        MISSING?

        wp.launch(red,
                  dim=p.shape_red,
                  inputs=[p, params.omega, mem.f_0[i], mem.bc_type[i], mem.f_1[i]],
                  device=params.gpus[i],
                  stream=streams_compute_red[i])

    for i, p in enumerate(partitions):
        wp.launch(green,
                  dim=p.shape_green,
                  inputs=[p, params.omega, mem.f_0[i], mem.bc_type[i], mem.f_1[i]],
                  device=params.gpus[i],
                  stream=streams_compute_green[i])

    MISSING?

    MISSING?
    # Swap the fields
    mem.f_0, mem.f_1 = mem.f_1, mem.f_0


In [13]:
# Warm-up iteration, kept outside the timed region.
iterate()

# Wait for the warm-up to finish before starting the clock.
wp.synchronize()

# time.perf_counter() is monotonic and intended for measuring intervals,
# unlike time.time(), which follows the adjustable wall clock.
start = time.perf_counter()
for step in range(params.num_steps):
    iterate()

# Wait for all outstanding GPU work so the interval covers the whole run.
wp.synchronize()
stop = time.perf_counter()

lbm_mgpu.export_final(prefix=exercise_name, params=params, partitions=partitions, mem=mem, f=mem.f_0)

# Statistics
elapsed_time = stop - start
mlups = params.compute_mlups(elapsed_time)
print(f"Main loop time: {elapsed_time:5.3f} seconds")
print(f"MLUPS:          {mlups:5.1f}")

Module __main__ 269b5e9 load on device 'cuda:0' took 1043.47 ms  (compiled)
Module lbm_mgpu.kernels 65155cc load on device 'cuda:0' took 371.97 ms  (compiled)
Main loop time: 14.486 seconds
MLUPS:          361.9


<Figure size 640x480 with 0 Axes>

<figure>
  <img src="img/panda.png" alt="Exercise" width="7%" style="float: left; margin-right: 10px; margin-bottom: 10px;">
  <figcaption><strong>Exercise</strong>: Optimize the iteration after the first profiling session.
  </figcaption>
</figure>