In [None]:
!pip install pyopencl

### Kernel

In [None]:
%%writefile program.cl

/*
 * matrix_mul: integer matrix product c = a * b, one work-item per element of c.
 *
 * Launch with a 2-D global range: dimension 0 selects the row of c and
 * dimension 1 selects the column. All matrices are dense, row-major int arrays.
 *
 *   k  shared inner dimension (row stride of a, number of rows of b)
 *   m  number of columns of b and c (row stride of both)
 *   a  left operand, only read
 *   b  right operand, only read
 *   c  result; each element is written exactly once and never read
 *
 * There is no bounds checking: the global range must match the shape of c.
 */
__kernel void matrix_mul(const int k, const int m,
                         __global const int* a,
                         __global const int* b,
                         __global int* c) {

	const int id_x = get_global_id(1);	// column of c
	const int id_y = get_global_id(0);	// row of c

	// Accumulate in a private variable instead of doing a read-modify-write
	// on c every iteration: it avoids k round trips to global memory and,
	// more importantly, never reads c, which is undefined behaviour when the
	// host allocates the output buffer as write-only.
	int acc = 0;

	for(int i = 0; i < k; i++)
		acc += a[id_y*k + i] * b[i*m + id_x];

	c[id_y*m + id_x] = acc;
}

### Runtime

In [None]:
import numpy as np
import pyopencl as cl

# Fixed seed so the random operands (and therefore the check) are reproducible.
np.random.seed(0)

# Problem size: a is (N, K), b is (K, M), so c = a @ b is (N, M).
N = 6
M = 8
K = 10
RANGE = 5  # operand entries are drawn from [-RANGE, RANGE)

a = np.random.randint(-RANGE, RANGE, size=(N, K), dtype=np.int32)
b = np.random.randint(-RANGE, RANGE, size=(K, M), dtype=np.int32)

# Host-side destination array for the device result.
c = np.zeros((N, M), dtype=np.int32)

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

mf = cl.mem_flags
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
# Every element of c is assigned by the kernel before the result is copied
# back, so the initial device contents do not matter: allocate by size only
# instead of uploading the zero-filled host array.
c_buf = cl.Buffer(ctx, mf.WRITE_ONLY, size=c.nbytes)

# Read the kernel source with a context manager so the file handle is closed.
with open("program.cl", "r") as src_file:
    program_src = src_file.read()
program = cl.Program(ctx, program_src)
program.build()
matrix_mul = program.matrix_mul

# 2-D launch with one work-item per element of c: global size is c.shape
# (N, M); passing None lets the implementation pick the work-group size.
matrix_mul(queue, c.shape, None, np.int32(K), np.int32(M), a_buf, b_buf, c_buf)
# Device -> host transfer; enqueue_copy blocks by default for host transfers,
# so c is fully populated when it returns.
cl.enqueue_copy(queue, c, c_buf)

# Integer data: compare exactly rather than with a floating-point tolerance.
print("Results matching:", np.array_equal(a @ b, c))