# Asynchronous and concurrent execution on GPUs
>*Melina Abeling, Julian Aeissen, Michele Pagani*. Supervised by *Oliver Fuhrer*

*GPUs allow for asynchronous (CPU simply launches work on GPU and then continues) and concurrent (multiple tasks are executed in parallel on the GPU) execution. In this project you will implement a simple stencil program and investigate performance using different stencil motifs as a function of grid size and amount of concurrency. It is foreseen to use either CuPy or CUDA for this project.*


---

## Introduction
*TODO*

---

## Methods
*TODO*

In [None]:
# Imports
import time
import numpy as np
import cupy as cp
import matplotlib.pyplot as plt
import math

In [None]:
# Utils
def update_2d_halo(field, num_halo):
    """Fill the halo of a 2D field in place using periodic boundaries.

    The outermost ``num_halo`` rows and columns on each side are overwritten
    with the interior values found on the opposite side of the field.

    Args:
        field: 2D array (NumPy/CuPy-style slicing) that already contains the
            halo points. Modified in place.
        num_halo: Halo width in grid points. ``0`` leaves ``field`` untouched.
    """
    if num_halo == 0:
        # ``-0:`` would select the whole axis instead of an empty halo, so a
        # zero-width halo has to be skipped explicitly.
        return

    # bottom edge (without corners)
    field[:num_halo, num_halo:-num_halo] = field[
        -2 * num_halo : -num_halo, num_halo:-num_halo
    ]

    # top edge (without corners)
    field[-num_halo:, num_halo:-num_halo] = field[
        num_halo : 2 * num_halo, num_halo:-num_halo
    ]

    # The source columns are taken along the second axis over all rows, so the
    # corners pick up the top/bottom halo values written just above.

    # left edge (including corners)
    field[:, :num_halo] = field[:, -2 * num_halo : -num_halo]

    # right edge (including corners)
    field[:, -num_halo:] = field[:, num_halo : 2 * num_halo]

def update_3d_halo(field, num_halo):
    """Fill the horizontal halo of a 3D field in place using periodic boundaries.

    The first axis is left untouched; for every index along it, the outermost
    ``num_halo`` points of the last two axes are overwritten with the interior
    values found on the opposite side.

    Args:
        field: 3D array (NumPy/CuPy-style slicing) that already contains the
            halo points on its last two axes. Modified in place.
        num_halo: Halo width in grid points. ``0`` leaves ``field`` untouched.
    """
    if num_halo == 0:
        # ``-0:`` would select the whole axis instead of an empty halo, so a
        # zero-width halo has to be skipped explicitly.
        return

    # bottom edge (without corners)
    field[:, :num_halo, num_halo:-num_halo] = field[
        :, -2 * num_halo : -num_halo, num_halo:-num_halo
    ]

    # top edge (without corners)
    field[:, -num_halo:, num_halo:-num_halo] = field[
        :, num_halo : 2 * num_halo, num_halo:-num_halo
    ]

    # The last-axis copies span the full second axis, so the corners pick up
    # the top/bottom halo values written just above.

    # left edge (including corners)
    field[:, :, :num_halo] = field[:, :, -2 * num_halo : -num_halo]

    # right edge (including corners)
    field[:, :, -num_halo:] = field[:, :, num_halo : 2 * num_halo]

### Stencils
*TODO*

*Example*

In [None]:
def step_stencil_example(in_field, out_field, n_halo):
    """Apply one 5-point Laplacian step from ``in_field`` into ``out_field``.

    Args:
        in_field: 2D input array including a halo of width ``n_halo`` on
            every side.
        out_field: 2D output array of the interior size, i.e. ``in_field``
            without its halo. Overwritten in place.
        n_halo: Halo width; this stencil requires exactly ``1``.

    Raises:
        AssertionError: If the arrays are not 2D, if their shapes do not
            differ by exactly the halo, or if ``n_halo`` is not ``1``.
    """
    # Checks
    assert len(in_field.shape) == 2
    assert len(out_field.shape) == 2
    h, w = out_field.shape
    h_in, w_in = in_field.shape
    assert h_in == h + 2 * n_halo
    assert w_in == w + 2 * n_halo

    # Example with laplacian
    # IMPORTANT always have an expected halo
    assert n_halo == 1

    # Computation: centre point weighted by -4 plus its four direct neighbours
    out_field[:, :] = (
        -4.0 * in_field[1:-1, 1:-1]
        + in_field[2:, 1:-1]
        + in_field[:-2, 1:-1]
        + in_field[1:-1, 2:]
        + in_field[1:-1, :-2]
    )

#### A
*TODO*

In [None]:
def step_stencil_a(field):
    """Placeholder for stencil A: does nothing yet and returns ``None``."""
    return None

#### B
*TODO*

In [None]:
def step_stencil_b(field):
    """Placeholder for stencil B: does nothing yet and returns ``None``."""
    return None

### Sequential
*TODO*

In [None]:
def sequential_computation(stencil, field):
    """Placeholder for the sequential driver: does nothing yet, returns ``None``."""
    return None

### Concurrent
*TODO*

In [None]:
def compute_2d_gpu(in_field, stencil, n_stream, n_iter, n_halo):
    """Iterate a 2D stencil on the GPU, one CUDA stream per tile.

    The interior of the field is split into a square grid of
    ``sqrt(n_stream) x sqrt(n_stream)`` equally sized tiles. In every
    iteration the halo is refreshed with ``update_2d_halo`` and then each
    tile is processed by ``stencil`` inside its own stream.

    Args:
        in_field: 2D CuPy array including a halo of width ``n_halo`` on every
            side. It is copied, so the caller's array is left unchanged.
        stencil: Callable ``stencil(in_tile, out_tile, n_halo)`` that reads a
            tile including its halo and writes the matching interior tile.
        n_stream: Number of streams/tiles; must be a positive perfect square.
        n_iter: Number of stencil iterations.
        n_halo: Halo width in grid points.

    Returns:
        A new array with the same shape as ``in_field`` whose interior holds
        the result of the last iteration. With ``n_iter == 0`` it is a plain
        copy of the input.

    Raises:
        AssertionError: If ``n_stream`` is not a positive perfect square, if
            ``in_field`` is not 2D, or if its interior cannot be divided
            evenly into the tile grid.
    """
    # Check n_stream (integer square root avoids float rounding issues)
    assert n_stream > 0
    stream_per_side = math.isqrt(n_stream)
    assert stream_per_side * stream_per_side == n_stream

    # Check in_field
    assert len(in_field.shape) == 2
    h, w = in_field.shape
    h -= 2 * n_halo
    w -= 2 * n_halo
    assert h % stream_per_side == 0
    assert w % stream_per_side == 0
    # Interior size of a single tile; integer division keeps these usable
    # as slice bounds.
    h_stream = h // stream_per_side
    w_stream = w // stream_per_side

    # Init: work on private copies so the input and output buffers are
    # distinct (the stencil must not read what it is writing) and the
    # caller's array is not modified.
    in_field = in_field.copy()
    out_field = in_field.copy()

    # Create streams
    streams = [cp.cuda.Stream() for _ in range(n_stream)]

    for it in range(n_iter):
        # Refresh the halo, then mark the point every tile has to wait for
        update_2d_halo(in_field, n_halo)
        halo_done = cp.cuda.Event()
        halo_done.record()

        # Iterate over streams
        for idx, s in enumerate(streams):
            # Tile indices on the square grid
            i, j = idx // stream_per_side, idx % stream_per_side
            row0 = i * h_stream
            col0 = j * w_stream

            # The tile reads halo values, so it must not start before the
            # halo update has finished.
            s.wait_event(halo_done)
            with s:
                # Stencil iteration: the input tile includes its halo, the
                # output tile is the matching interior region.
                stencil(
                    in_field[
                        row0 : row0 + h_stream + 2 * n_halo,
                        col0 : col0 + w_stream + 2 * n_halo,
                    ],
                    out_field[
                        n_halo + row0 : n_halo + row0 + h_stream,
                        n_halo + col0 : n_halo + col0 + w_stream,
                    ],
                    n_halo,
                )

        # Synchronize all streams before the buffers are reused
        for s in streams:
            s.synchronize()

        # Update out_field: the result becomes the next iteration's input
        if it < n_iter - 1:
            in_field, out_field = out_field, in_field

    return out_field

### Initial field
*TODO*

In [None]:
def get_initial_field(size):
    """Placeholder for the initial-field factory: returns ``None`` for now."""
    return None

---

## Results
*TODO*

### Performance over concurrency
*TODO*

In [None]:
# Settings
# compute_2d_gpu asserts that the stream count is a perfect square (the tiles
# form a square grid), so only such counts are benchmarked.
steps_concurrency = [1, 4, 16, 64]
field_size = 512*512
# NOTE(review): placeholder values — confirm the iteration count and the halo
# width expected by stencils A and B before running the experiment.
n_iter_concur = 100
n_halo_concur = 1

# Setup
input_field_concur = get_initial_field(field_size)
output_fields_a_concur = []
output_fields_b_concur = []
times_a_concur = []
times_b_concur = []

for concurrency in steps_concurrency:

    # Compute for stencil A
    tic = time.perf_counter()
    output_fields_a_concur.append(
        compute_2d_gpu(
            input_field_concur, step_stencil_a, concurrency,
            n_iter_concur, n_halo_concur,
        )
    )
    # GPU work is asynchronous: wait for the device before reading the clock
    cp.cuda.Device().synchronize()
    times_a_concur.append(time.perf_counter() - tic)

    # Compute for stencil B
    tic = time.perf_counter()
    output_fields_b_concur.append(
        compute_2d_gpu(
            input_field_concur, step_stencil_b, concurrency,
            n_iter_concur, n_halo_concur,
        )
    )
    cp.cuda.Device().synchronize()
    times_b_concur.append(time.perf_counter() - tic)


In [None]:
# Plots: runtime of each stencil against the number of streams
plt.title("Concurrency analysis")

for timings, series_name in (
    (times_a_concur, "Stencil A"),
    (times_b_concur, "Stencil B"),
):
    plt.plot(steps_concurrency, timings, label=series_name)

plt.xlabel('# streams')
plt.ylabel('Time [s]')
plt.legend()


### Performance over grid size
*TODO*

In [None]:
# TODO

In [None]:
# Plots

---

## Discussion
*TODO*

---

## Conclusion
*TODO*