In [1]:
import numpy as np
from numba import cuda
import math

In [2]:
@cuda.jit
def gpu_cos(a, out):
    """CUDA kernel: store the cosine of each element of ``a`` in ``out``.

    Each thread handles the single element at its own 1-D grid position.
    Threads positioned at or beyond ``a.shape[0]`` exit without writing, so
    the launch may contain more threads than there are elements.
    """
    idx = cuda.grid(1)
    # Guard clause: surplus threads in the final block have nothing to do.
    if idx >= a.shape[0]:
        return
    out[idx] = math.cos(a[idx])

In [3]:
# Benchmark input: 10**8 float32 samples evenly spaced over [0, 2*pi].
# The sample count is cast to int because np.linspace requires an integer
# `num`; passing the float literal 1e8 directly raises TypeError.
x = np.linspace(0, 2 * math.pi, int(1e8), dtype=np.float32)

# Separate output buffers (same shape and dtype as x) for the CPU and GPU runs.
cpu_out = np.empty_like(x)
gpu_out = np.empty_like(x)

# Kernel launch configuration: (blocks per grid, threads per block).
# The "+ 1" rounds the block count up so every element gets a thread; the
# kernel's own bounds check makes any surplus threads harmless.
thread_config = (len(x) // 512 + 1), 512

In [4]:
# CPU baseline: time NumPy's cosine over x, writing the result into cpu_out.
%timeit np.cos(x, cpu_out)

1 loops, best of 3: 802 ms per loop


In [5]:
# GPU run: launch gpu_cos with thread_config over x, writing into gpu_out.
# NOTE(review): x and gpu_out are host NumPy arrays, so this timing presumably
# includes host<->device transfers in addition to the kernel itself -- confirm
# against Numba's CUDA memory-management docs before comparing with the CPU.
%timeit gpu_cos[thread_config](x, gpu_out)

1 loops, best of 3: 557 ms per loop


In [6]:
# Reduce the CPU result to one number for a quick comparison with the GPU result.
np.sum(cpu_out)

33.234375

In [7]:
# Reduce the GPU result to one number for a quick comparison with the CPU result.
np.sum(gpu_out)

33.234375