### Element-wise multiplication of two arrays 

In [24]:
from numba import cuda
import numpy as np 

In [19]:
# Show the CUDA device list that Numba exposes.
print(cuda.gpus)

<Managed Device 0>


#### CPU method:

In [20]:
def multip(a,b,c):
    """Multiply ``a`` and ``b`` element by element on the CPU, writing into ``c``.

    This is the plain-Python reference implementation: one interpreted loop
    iteration per element, no vectorisation.

    Parameters
    ----------
    a, b : indexable sequences
        Operands; ``b`` is indexed at every position of ``a``.
    c : mutable indexable sequence
        Output buffer; ``c[k]`` receives ``a[k] * b[k]``.

    Returns
    -------
    None
        The result is stored in ``c`` in place.
    """
    for pos, left in enumerate(a):
        c[pos] = left * b[pos]

#### input arrays

In [21]:
# Problem size and host-side operands: two constant float32 vectors plus an
# uninitialised output buffer with the same shape and dtype.
n = 100_000_000
a = np.full(shape=n, fill_value=2.12474, dtype=np.float32)
b = np.full(shape=n, fill_value=3.12445, dtype=np.float32)
c = np.empty(a.shape, dtype=a.dtype)

In [22]:
# Time one run of the pure-Python loop over all n elements (CPU baseline).
%time multip(a, b, c)

CPU times: user 29.2 s, sys: 152 ms, total: 29.4 s
Wall time: 29.4 s


In [23]:
# Display the CPU result; NumPy abbreviates long arrays with "...".
print(c)

[6.6386433 6.6386433 6.6386433 ... 6.6386433 6.6386433 6.6386433]


#### GPU kernel: 

In [6]:
@cuda.jit('void(float32[:], float32[:], float32[:])')
def cuda_mult(a,b,c):
    """Element-wise product kernel: each thread writes at most one element of ``c``.

    Parameters
    ----------
    a, b : 1-D float32 arrays
        Input operands. They are indexed with the same index as ``c``, so they
        are assumed to hold at least ``c.size`` elements (not checked here).
    c : 1-D float32 array
        Output; ``c[i]`` receives ``a[i] * b[i]``.
    """
    # Global 1-D index of this thread across the whole grid.
    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    # Valid indices are 0 .. c.size - 1. The grid is normally rounded up to a
    # whole number of blocks, so surplus threads (including i == c.size) must
    # exit without touching memory.
    if i >= c.size:
        return
    c[i] = a[i]*b[i]

In [7]:
#### Allocating GPU memory and copying the arrays to the device

In [8]:
# Handle to the currently active CUDA device; its warp size is read below.
device = cuda.get_current_device()

In [9]:
# Warp size reported by the device.
device.WARP_SIZE 

32

In [10]:
# Copy both host operands to the device and allocate a device output buffer
# with the same shape and dtype as `a`.
d_a = cuda.to_device(a)
d_b = cuda.to_device(b)
d_c = cuda.device_array_like(a)

# Launch configuration: one warp per block, and enough blocks to cover all n
# elements (integer ceiling of n / tpb).
tpb = device.WARP_SIZE        # threads per block (block size)
bpg = (n + tpb - 1) // tpb    # blocks per grid

In [11]:
# Threads per block, blocks per grid, and the total number of threads launched.
tpb, bpg, tpb*bpg

(32, 3125000, 100000000)

In [None]:
# Integer ceiling division: the number of blocks needed to cover every element
# of `a` with `tpb` threads per block. The element count comes from the array
# itself, because `n` is a plain int and has no `.size` attribute.
blockspergrid = (a.size + (tpb - 1)) // tpb

In [55]:
%time cuda_mult[bpg, tpb](d_a, d_b, d_c)

CPU times: user 1.25 ms, sys: 68 µs, total: 1.32 ms
Wall time: 924 µs


In [56]:
# Bring the kernel's output back from device memory into a host array
# (this rebinds `c`, replacing the CPU result computed earlier) and show it.
c = d_c.copy_to_host()
print(c)

[6.6386433 6.6386433 6.6386433 ... 6.6386433 6.6386433 6.6386433]


In [57]:
# Length of the array copied back from the device.
len(c)

100000000

### Race condition:

CUDA, like many general-purpose parallel execution frameworks, makes it possible to have race conditions in your code. A race condition in CUDA arises when **threads read or write a memory location that might be modified by another independent thread**. Generally speaking, you need to worry about:

* `read-after-write hazards`: One thread is reading a memory location at the same time another thread might be writing to it.
* `write-after-write hazards`: Two threads are writing to the same memory location, and only one write will be visible when the kernel is complete.

In [13]:
@cuda.jit('void(int32[:])')
def thread_counter_race_condition(global_counter):
    """Deliberately racy kernel: every thread increments ``global_counter[0]``.

    The ``+=`` is an ordinary read / add / write sequence with no
    synchronisation between threads, so concurrent increments can overwrite
    one another and the final count can be lower than the number of threads
    launched. Kept as written to demonstrate the hazard.

    Parameters
    ----------
    global_counter : 1-D int32 array
        Only element 0 is used; it is updated in place.
    """
    global_counter[0] += 1  # This is bad

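Every thread increments a global counter, but the increment is not atomic. Each thread performs three separate steps, and other threads can interleave between them: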

1. Read the current value of a global counter.
2. Compute counter + 1.
3. Write that value back to global memory.

In [14]:
# One-element int32 counter starting at zero, plus its copy in device memory.
global_counter = np.zeros(1, dtype=np.int32)
d_global_counter = cuda.to_device(global_counter)

In [15]:
# Launch configuration for the race-condition demo: 64 threads per block (tbp)
# and 64 blocks per grid (bpg).
# NOTE: this rebinds `bpg`, which the element-wise multiplication launch also
# uses; re-run that section's setup cell before launching `cuda_mult` again.
tbp = bpg = 64

In [16]:
# Launch bpg blocks of tbp threads; every thread increments the same counter slot.
thread_counter_race_condition[bpg, tbp](d_global_counter)

# A correct count would equal the total number of launched threads.
launched_threads = tbp * bpg
print('Should be %d:' % launched_threads, d_global_counter.copy_to_host())

Should be 4096: [1]
