In [None]:
!pip install pyopencl

### Kernel

In [None]:
%%writefile program.cl

/*
 * Element-wise vector addition: c[i] = a[i] + b[i].
 *
 * Each work-item processes the single element selected by its global id in
 * dimension 0. There is no bounds check, so the host must not launch more
 * work-items than the buffers have elements.
 */
__kernel void sum(
    __global const float* a,
    __global const float* b,
    __global float* c)
{
    /* get_global_id() returns size_t; keep that width instead of narrowing to int. */
    const size_t gid = get_global_id(0);

    c[gid] = a[gid] + b[gid];
}

### Runtime

In [None]:
import numpy as np
import pyopencl as cl

# Seed NumPy's legacy global RNG so the random inputs drawn later in the
# notebook are identical on every fresh run.
SEED = 0
np.random.seed(SEED)

In [None]:
# Number of elements in each vector.
SIZE = 100000

# Two random input vectors, drawn in order (a first, then b) and narrowed to
# float32 to match the `float` pointers in the kernel signature.
a, b = (np.random.randn(SIZE).astype(np.float32) for _ in range(2))

# Zero-filled host array with the same shape and dtype that receives the result.
c = np.zeros_like(a)

for host_array in (a, b, c):
    print(host_array)

In [None]:
# Create the OpenCL context and a command queue on it; every buffer, transfer
# and kernel launch later in the notebook goes through these two objects.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

In [None]:
mf = cl.mem_flags

# Device buffers sized byte-for-byte like their host arrays. The two inputs
# are flagged read-only and the output write-only.
a_buf, b_buf = (
    cl.Buffer(ctx, mf.READ_ONLY, size=host_array.nbytes) for host_array in (a, b)
)
c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=c.nbytes)

In [None]:
# Upload only the two input vectors. The output buffer needs no initial
# contents: the kernel is launched over all SIZE elements and assigns every
# c[id], so copying the zero-filled host array `c` to the device was a wasted
# transfer.
cl.enqueue_copy(queue, a_buf, a)
cl.enqueue_copy(queue, b_buf, b)

In [None]:
# Read the kernel source written by the %%writefile cell. The context manager
# closes the file handle, which a bare open() call would leave open.
with open("program.cl", "r") as program_file:
    program_src = program_file.read()

# Build the program in `ctx` and fetch the kernel named `sum` from it.
program = cl.Program(ctx, program_src)
program.build()
kernel = program.sum

# Bind the device buffers positionally, matching the kernel's (a, b, c) parameters.
kernel.set_args(a_buf, b_buf, c_buf)

In [None]:
# One work-item per vector element along a single dimension; no explicit
# work-group size is supplied (None).
global_size = (SIZE,)
local_size = None
cl.enqueue_nd_range_kernel(queue, kernel, global_size, local_size)

In [None]:
# Copy the result back into the host array `c`, then check it against the same
# sum computed by NumPy so a broken kernel or transfer fails loudly instead of
# just printing wrong numbers.
cl.enqueue_copy(queue, c, c_buf)
np.testing.assert_allclose(c, a + b, rtol=1e-6, atol=1e-6)
print(c)