# Multi-GPU Computing with [CuPy](https://cupy.chainer.org/)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import cupy as cp
from cupy.cuda import Device as Device

In [None]:
from timers import cpu_timer

### Getting the number of GPUs on the node

In [None]:
# Ask the CUDA runtime how many devices it reports; later cells loop over
# `range(device_count)` to address each GPU by its ID.
device_count = cp.cuda.runtime.getDeviceCount()
print(f'The node has "{device_count}" CUDA GPUs')

### Get the properties of each device

In [None]:
# One properties mapping per device, indexed by device ID.
props = [cp.cuda.runtime.getDeviceProperties(i) for i in range(device_count)]

In [None]:
# List the model name of every GPU on the node.
for i, p in enumerate(props):
    # The runtime binding reports the device name as `bytes`; decode it so the
    # output reads `Tesla ...` instead of `b'Tesla ...'`. The isinstance guard
    # keeps this working should the binding ever hand back a `str`.
    name = p["name"]
    if isinstance(name, bytes):
        name = name.decode()
    print(f'Device {i}: {name}')

In [None]:
# Dump every property reported for the first device, one `key: value` per line.
for key, value in props[0].items():
    print(f'{key}: {value}')

### The concept of the `Current Device`

#### `CuPy` makes use of the *Current Device* when performing array allocations and kernel launches

#### Getting the ID of the current device

In [None]:
# Report the ID of the device that is current at this point in the notebook.
print(f'The current device is: {cp.cuda.get_device_id()}')

#### To make use of a different device you can call `cupy.cuda.Device.use`

In [None]:
# Handles for the first two GPUs; `device1` is reused by a later cell.
device0, device1 = Device(0), Device(1)

# Switch to device 1 and then back to device 0, printing the current device
# after each switch.
for device in (device1, device0):
    device.use()
    print(f'The current device is: {cp.cuda.get_device_id()}')

#### Alternatively an instance of `cupy.cuda.Device` can be used as a context manager

In [None]:
# Inside the `with` block, `device1` is used as the context manager.
with device1:
    print(f'The current device is: {cp.cuda.get_device_id()}')

# Printed again after leaving the block, to show which device is current once
# the context manager has exited.
print(f'The current device is: {cp.cuda.get_device_id()}')

#### Allocate an array on a given device

In [None]:
# Allocate the array while device 1 is selected through the context manager.
with Device(1):
    x1 = cp.zeros(1000)

# `ndarray.device` reports the device on which the array was allocated.
print(f'Array x1 is allocated on device: {x1.device}')

#### CuPy handles copying of arrays between devices, using Peer-to-Peer functionality

In [None]:
# Source array, created while device 0 is selected.
with Device(0):
    array_dev0 = cp.ones(100000)

# Destination array of matching shape and dtype, created while device 1 is
# selected.
with Device(1):
    array_dev1 = cp.zeros_like(array_dev0)

# Copy the contents of the device-0 array into the device-1 array.
cp.copyto(array_dev1, array_dev0)

### <mark>Exercise</mark> Measure the bandwidth achieved when copying arrays between devices

In [None]:
# Elementwise kernel computing the Julia-set escape count for each input point.
#
# Inputs (per element): the starting point (X, Y), the constant (cx, cy) added
# on every iteration, the iteration cap `itermax`, and the squared escape
# radius `radius2`.
# Output: `julia`, the number of iterations of z -> z*z + c performed before
# x*x + y*y reached `radius2`, or `itermax` if it never did.
#
# The kernel body is a plain string: it contains no Python placeholders, so an
# f-string would only force every C brace to be doubled.
julia_kernel = cp.ElementwiseKernel('float64 X, float64 Y, float64 cx, float64 cy, int32 itermax, float64 radius2',
                                    'int32 julia',
                                    '''julia = 0;
                                    double x = X, y = Y;
                                    double xtemp;
                                    int nit = 0;
                                    while(x * x + y * y < radius2 && nit < itermax) {
                                        xtemp = x * x - y * y + cx;
                                        y = 2.0 * x * y + cy;
                                        x = xtemp;
                                        nit += 1;
                                    }
                                    julia = nit;''', 'julia_kernel')

### The concept of the `Current Stream`

#### `CuPy` makes use of the *Current Stream* when launching operations on the GPU

#### Use instances of `cupy.cuda.Stream` to launch kernels asynchronously

In [None]:
# Launch `kernel_count` Julia-set kernels, each on its own non-blocking stream,
# time them with CUDA events, and plot the resulting images.
kernel_count = 9
rng = np.random.default_rng(29)
# One pseudo-random constant c = re + i*im per kernel, both parts in [-1, 1).
complex_values = [complex(rng.uniform(-1, 1), rng.uniform(-1, 1)) for _ in range(kernel_count)]
# 5000 x 5000 grid of starting points covering [-2, 2] x [-2, 2].
X, Y = cp.meshgrid(cp.linspace(-2.0, 2.0, 5000), cp.linspace(-2.0, 2.0, 5000))
events = [None] * kernel_count
julia_arrays = [None] * kernel_count

# Warmup, so that one-off start-up cost of the first call is not timed.
julia_kernel(X, Y, complex_values[0].real, complex_values[0].imag, 1000, 4.0)

# X, Y and the warmup were issued on the current (default) stream, while the
# timed kernels below run on non-blocking streams, which do not implicitly
# synchronise with it. Wait for the device here so that the grids are fully
# written before any kernel reads them and the warmup cannot overlap the
# timed region.
cp.cuda.Device().synchronize()

with cpu_timer(log=True):
    for i, c in enumerate(complex_values):
        stream = cp.cuda.Stream(non_blocking=True)
        with stream:
            # Events recorded inside the `with` block go to `stream`, so they
            # bracket exactly this kernel.
            start = cp.cuda.Event()
            stop = cp.cuda.Event()
            start.record()
            julia_arrays[i] = julia_kernel(X, Y, c.real, c.imag, 1000, 4.0)
            stop.record()
        events[i] = (start, stop)

    # The launches above are asynchronous; wait for every kernel to finish
    # before the host timer stops.
    for _, stop in events:
        stop.synchronize()

for i, (start, stop) in enumerate(events):
    print(f'Kernel {i}: {cp.cuda.get_elapsed_time(start, stop)} ms')

fig = plt.figure(figsize=(15, 15))

# Smallest square grid that holds all images (3 x 3 for nine kernels).
grid_size = int(np.ceil(np.sqrt(kernel_count)))

for i in range(kernel_count):
    ax = fig.add_subplot(grid_size, grid_size, i + 1)
    # Keep every 5th point in each direction before copying to the host.
    julia_array = julia_arrays[i][::5, ::5].get()
    ax.imshow(julia_array, extent=[-2, 2, -2, 2]);

### Multi-GPU kernel launching

#### CuPy is going to use the *Current Device* and the *Current Stream* to launch a kernel

In [None]:
# Launch one Julia-set kernel per GPU, each on its own non-blocking stream,
# time every kernel with CUDA events, and plot the resulting images.
julia_arrays = [None] * device_count
complex_values = [-0.9 + 0.22143j, -0.4 + 0.9j, 0.3 + 0.58j, -2.0 + 0.16j]
events = [None] * device_count

# NOTE(review): the stream cell above passes `log=True` to `cpu_timer`; this
# one does not. Confirm against `timers.cpu_timer` whether that is intended.
with cpu_timer():
    for i in range(device_count):
        with Device(i):
            # Cycle through the constants so that nodes with more GPUs than
            # listed constants do not raise IndexError.
            c = complex_values[i % len(complex_values)]
            stream = cp.cuda.Stream(non_blocking=True)
            with stream:
                start = cp.cuda.Event()
                stop = cp.cuda.Event()
                start.record()
                # The 50000 x 50000 grids are passed as unnamed temporaries on
                # purpose: no reference to them is kept after the call.
                julia_arrays[i] = julia_kernel(
                    *cp.meshgrid(cp.linspace(-2.0, 2.0, 50000),
                                 cp.linspace(-2.0, 2.0, 50000)),
                    c.real, c.imag, 10000, 4.0)
                stop.record()
                events[i] = (start, stop)

    # The launches above are asynchronous; wait for every device to finish
    # before the host timer stops.
    for i in range(device_count):
        with Device(i):
            stop = events[i][1]
            stop.synchronize()

for i in range(device_count):
    start, stop = events[i]
    with Device(i):
        print(f'Device {i}: {cp.cuda.get_elapsed_time(start, stop)} ms')

fig = plt.figure(figsize=(10, 10))

# Keep the 2 x 2 layout for up to four GPUs and grow the grid beyond that.
n_cols = max(2, int(np.ceil(np.sqrt(device_count))))
n_rows = max(2, -(-device_count // n_cols))

for i in range(device_count):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    # Subsample and copy to the host with the owning device selected, so any
    # device-side work for the strided copy is issued on that device.
    with Device(i):
        julia_array = julia_arrays[i][::100, ::100].get()
    ax.imshow(julia_array, extent=[-2, 2, -2, 2]);