# Overlapping data transfer and computations with arctan dummy computation

*Disclaimer: we used the venv from the HPC4WC course, see https://github.com/ofuhrer/HPC4WC for the setup.

This file includes experimental code used along the way of finding the 'optimal' solution. It is included more for completeness, the important results with explanations are located in the `Report.ipynb`. I have added some comments (marked as 'Comment: ' so my process and thoughts are more understandable.

In [31]:
import random
import time

import cupy as cp
import cupyx as cpx
import matplotlib.pyplot as plt
import numpy as np

random.seed(42)

Comment: I was experimenting with numpy/cupy as I thought we might benchmark against numpy version aswell. We decided against that (obviously this would greatly distort results as numpy is way slower than cupy) because we wanted to get an efficiency gain using the same architecture & libraries.

In [32]:
def naive_np_iterate_arctan(x, i):
    result = x
    for _ in range(i):
        result = np.arctan(result)
    return result

def naive_cp_iterate_arctan(x, i):
    result = x
    for _ in range(i):
        result = cp.arctan(result)
    return result

In [33]:
# generate the random array with numpy
def generate_random_array_np(n):
    return np.random.rand(n)

# generate the random array with random
def naive_generate_random_array(n):
    return [random.random() for _ in range(n)]

# generate the random array directly on the gpu
def generate_random_array_cp(n):
    return cp.random.random(n)

In [34]:
# numpy naive implementation
def np_timed(num_elems, iters):
    random_array = generate_random_array_np(num_elems)

    start_time = time.time()
    result_array = naive_np_iterate_arctan(random_array, iters)
    computation_duration = time.time() - start_time
    print(
        f"number of elements: {num_elems}, iterations: {iters},  computation time np: {computation_duration:.6f}"
    )

Comment: this was before we realized that the copy to device cannot be done asynchronously with the older cupy version. Note that I'm freeing memory at the end because with my experiments I kept running out of RAM on the GPU as I used very large arrays.

In [35]:
# cupy with transfer
def cp_timed(num_elems, iters):
    random_array = generate_random_array_np(num_elems)
    T_out = cpx.zeros_pinned(random_array.shape, dtype=np.float64)

    # transfer
    start_time = time.time()
    random_gpu_array = cp.asarray(random_array)
    transfer_duration = time.time() - start_time
    print(f"Transfer duration: {transfer_duration:.6f} seconds")

    start_time = time.time()
    random_gpu_array = naive_cp_iterate_arctan(random_gpu_array, iters)
    computation_duration = time.time() - start_time
    print(f"Computation time CuPy: {computation_duration:.6f} seconds")

    # Copy result back to CPU
    start_time = time.time()
    random_gpu_array.get(out=T_out)
    # result_array = cp.asnumpy(random_gpu_array)
    cp.cuda.runtime.deviceSynchronize()
    copyback_duration = time.time() - start_time
    print(f"Copy back to CPU duration: {copyback_duration:.6f} seconds")

    print(
        f"number of elements: {num_elems}, iterations: {iters}, total computation time naive cp: {transfer_duration + computation_duration + copyback_duration:.6f}"
    )

    del random_array, random_gpu_array
    cp.get_default_memory_pool().free_all_blocks() # ran into out of memory problems because I experimented with too large arrays

Comment: This is a left-over of me not understanding streams, naively I tried using 3 streams, one for copying HtoD, one for the computations, and one for copying DtoH

In [36]:
# Pipelining
def cp_pipelining(num_elems, iters, block_size):
    random_array = generate_random_array_np(num_elems)

    # Transfer and compute in blocks
    start_time = time.time()
    streams = [
        cp.cuda.Stream(non_blocking=True) for _ in range(3)
    ]  # Create two streams for overlap

    result_blocks = []
    for i in range(0, n, block_size):
        block = random_array[i : i + block_size]
        with streams[0]:  # Transfer to GPU
            gpu_block = cp.asarray(block)

        with streams[1]:  # Compute on GPU
            result_block = naive_cp_iterate_arctan(gpu_block, iters)
            result_blocks.append(result_block)

        with streams[2]:  # Copy back to CPU
            result_cpu_block = cp.asnumpy(result_block)
            result_blocks.append(result_cpu_block)

        del gpu_block, result_block
    # Ensure all operations are completed
    for stream in streams:
        stream.synchronize()

    transfer_computation_duration = time.time() - start_time
    print(
        f"number of elements: {num_elems}, iterations: {iters}, block_size: {block_size}, total computation time pipelining cp: {transfer_computation_duration:.6f}"
    )
    cp.get_default_memory_pool().free_all_blocks()

Comment: Same as before. But maybe pinned memory fixes 

In [37]:
# pinned memory
def cp_pipelining_pinned_mem(num_elems, iters, block_size):
    random_array = generate_random_array_np(num_elems)

    # Allocate pinned memory
    pinned_memory = cp.cuda.alloc_pinned_memory(random_array.nbytes)
    pinned_array = np.frombuffer(pinned_memory, random_array.dtype).reshape(
        random_array.shape
    )
    np.copyto(pinned_array, random_array)

    # Transfer and compute in blocks
    start_time = time.time()
    streams = [cp.cuda.Stream(non_blocking=True) for _ in range(3)]

    result_blocks = []
    for i in range(0, num_elems, block_size):
        block = pinned_array[i : i + block_size]

        with streams[0]:  # Transfer to GPU
            gpu_block = cp.asarray(block)

        with streams[1]:  # Compute on GPU
            result_block = naive_cp_iterate_arctan(gpu_block, iters)

        with streams[2]:  # Copy back to CPU
            result_cpu_block = cp.asnumpy(result_block)
            result_blocks.append(result_cpu_block)

        # Free GPU memory
        del gpu_block, result_block

    # Ensure all operations are completed
    for stream in streams:
        stream.synchronize()

    transfer_computation_duration = time.time() - start_time
    print(
        f"number of elements: {num_elems}, iterations: {iters}, block_size: {block_size}, total computation time pipelining cp: {transfer_computation_duration:.6f}"
    )
    del result_blocks
    cp.get_default_memory_pool().free_all_blocks()

Comment: We've understood that streams work differently, and even better with pinned memory.

In [38]:
# Pipelining
def cp_pipelining_all(num_elems, iters, block_size):
    random_array = generate_random_array_np(num_elems)
    T_out = cpx.zeros_pinned(random_array.shape, dtype=np.float64)

    # Transfer and compute in blocks
    start_time = time.time()
    streams = [
        cp.cuda.Stream(non_blocking=True) for _ in range(0, num_elems, block_size)
    ]  # Create streams for overlap

    result_blocks = []
    for i, s in zip(range(0, num_elems, block_size), streams):
        block = random_array[i : i + block_size]

        with s:
            gpu_block = cp.asarray(block)
            result_block = naive_cp_iterate_arctan(gpu_block, iters)
            result_blocks.append(result_block)
            result_block.get(out=T_out[i : i + block_size])
        del gpu_block, result_block

    # Ensure all operations are completed
    for stream in streams:
        stream.synchronize()

    transfer_computation_duration = time.time() - start_time
    print(
        f"number of elements: {num_elems}, iterations: {iters}, block_size: {block_size}, total computation time pipelining cp: {transfer_computation_duration:.6f}"
    )
    cp.get_default_memory_pool().free_all_blocks()

In [39]:
n = 2**20
iterations = 10**3

In [40]:
np_timed(n, iterations)

number of elements: 1048576, iterations: 1000,  computation time np: 18.018865


In [41]:
cp_timed(n, iterations) #gpu needs some warmup

Transfer duration: 0.001263 seconds
Computation time CuPy: 0.011194 seconds
Copy back to CPU duration: 0.024143 seconds
number of elements: 1048576, iterations: 1000, total computation time naive cp: 0.036600


In [42]:
cp_timed(n, iterations)

Transfer duration: 0.000970 seconds
Computation time CuPy: 0.010591 seconds
Copy back to CPU duration: 0.024732 seconds
number of elements: 1048576, iterations: 1000, total computation time naive cp: 0.036292


Comment: I wonder why the 'pipelining' is not faster than the normal synchronous call :D

In [43]:
cp_pipelining(n, iterations, n)

number of elements: 1048576, iterations: 1000, block_size: 1048576, total computation time pipelining cp: 0.035711


In [44]:
n = 2**24
iterations = 2 * 10**3

elements_sizes_array = np.array([2**i for i in range(14, 24)])
iterations_array = np.array([2**i for i in range(8, 12)])
block_sizes_array = np.array([int(n / 2**i) for i in range(3, 7)])

Comment: Wanted to find a configuration where we have speedups, but that didn't go that well ended up using only different block sizes

In [45]:
cp_timed(n, iterations)  # for reference

Transfer duration: 0.050344 seconds
Computation time CuPy: 0.498564 seconds
Copy back to CPU duration: 0.520006 seconds
number of elements: 16777216, iterations: 2000, total computation time naive cp: 1.068914


In [46]:
for block_size in block_sizes_array:
    cp_pipelining_all(n, iterations, block_size)

number of elements: 16777216, iterations: 2000, block_size: 2097152, total computation time pipelining cp: 1.005827
number of elements: 16777216, iterations: 2000, block_size: 1048576, total computation time pipelining cp: 1.005080
number of elements: 16777216, iterations: 2000, block_size: 524288, total computation time pipelining cp: 1.006255
number of elements: 16777216, iterations: 2000, block_size: 262144, total computation time pipelining cp: 1.274939


In [47]:
for block_size in block_sizes_array:
    cp_pipelining(n, iterations, block_size)

number of elements: 16777216, iterations: 2000, block_size: 2097152, total computation time pipelining cp: 1.042706
number of elements: 16777216, iterations: 2000, block_size: 1048576, total computation time pipelining cp: 1.081355
number of elements: 16777216, iterations: 2000, block_size: 524288, total computation time pipelining cp: 1.142774
number of elements: 16777216, iterations: 2000, block_size: 262144, total computation time pipelining cp: 1.333033


In [48]:
for block_size in block_sizes_array:
    cp_pipelining_pinned_mem(n, iterations, block_size)

number of elements: 16777216, iterations: 2000, block_size: 2097152, total computation time pipelining cp: 1.043696
number of elements: 16777216, iterations: 2000, block_size: 1048576, total computation time pipelining cp: 1.081404
number of elements: 16777216, iterations: 2000, block_size: 524288, total computation time pipelining cp: 1.144172
number of elements: 16777216, iterations: 2000, block_size: 262144, total computation time pipelining cp: 1.300752


Comment: cp_pipelining_all should've been faster, but it wasn't. The story continues in the main file...