# Numba vs. NumPy and Nvidia vs. Intel

In [1]:
import math
import numpy as np
from numba import jit, cuda

In [2]:
# List the CUDA devices Numba can see. The guard means this cell prints
# nothing on a machine without a usable CUDA installation.
if cuda.is_available():
    cuda.detect()

Found 1 CUDA devices
id 0         b'Quadro K2200'                              [SUPPORTED]
                      compute capability: 5.0
                           pci device id: 0
                              pci bus id: 1
Summary:
	1/1 devices are supported


In [3]:
import platform

# Show the CPU identification string so the CPU timings in this notebook
# can be attributed to specific hardware.
cpu_description = platform.processor()
print(cpu_description)

Intel64 Family 6 Model 42 Stepping 7, GenuineIntel


In [4]:
# Benchmark inputs: two float64 vectors of n elements each, plus an output
# buffer of the same shape that the compiled kernels write into.
n = 10 ** 7
a = np.arange(1, n + 1, dtype=np.float64)           # 1, 2, ..., n
b = np.arange(n + 1, 1, step=-1, dtype=np.float64)  # n+1, n, ..., 2
result = np.zeros_like(a)

In [5]:
def calc_numpy(a, b):
    """Evaluate sqrt(|exp(log(a) + log(b))|) element-wise with NumPy ufuncs.

    Parameters
    ----------
    a, b : array-like
        Inputs passed straight to ``np.log``; they are combined with ``+``.

    Returns
    -------
    The result of the chained NumPy calls (a new array for array inputs).
    """
    log_sum = np.log(a) + np.log(b)
    magnitude = np.abs(np.exp(log_sum))
    return np.sqrt(magnitude)

# Time the pure-NumPy version. The `global` statement keeps the last result
# in `r_numpy` so that a later cell can compare it with the other results.
%timeit global r_numpy; r_numpy = calc_numpy(a, b)

1 loop, best of 3: 845 ms per loop


In [6]:
@jit
def calc_c(x, y):
    """Scalar kernel: return sqrt(|exp(log(x) + log(y))|) for one pair of values.

    Uses the `math` module rather than NumPy ufuncs and is compiled with
    Numba's `@jit`.
    """
    log_sum = math.log(x) + math.log(y)
    return math.sqrt(abs(math.exp(log_sum)))

@jit
def calc_numba(a, b, result, n):
    """Fill ``result[0:n]`` with ``calc_c(a[k], b[k])`` using an explicit loop.

    Parameters
    ----------
    a, b : indexable inputs read at positions 0..n-1.
    result : output buffer, written in place at positions 0..n-1.
    n : number of elements to process.

    Returns nothing; the output is delivered through ``result``.
    """
    for k in range(n):
        result[k] = calc_c(a[k], b[k])
        
# Time the Numba-compiled CPU loop; every run writes into the shared `result` buffer.
%timeit calc_numba(a, b, result, n)
# Snapshot the output: `result` is an in-place output buffer, so keep a copy
# of the CPU answer under its own name for the later comparison.
r_numba = np.copy(result)

1 loop, best of 3: 639 ms per loop


In [7]:
@cuda.jit(device=True)
def calc_device(x, y):
    """Device function: return sqrt(|exp(log(x) + log(y))|) for one pair of values.

    Declared with ``device=True``, so it is meant to be called from a CUDA
    kernel (see ``calc_cuda``) rather than launched directly.
    """
    log_sum = math.log(x) + math.log(y)
    return math.sqrt(abs(math.exp(log_sum)))

@cuda.jit
def calc_cuda(a, b, result, n):
    """CUDA kernel: each thread computes one element of ``result``.

    Parameters
    ----------
    a, b : input arrays, read at the thread's global index.
    result : output array, written at the thread's global index.
    n : number of valid elements; threads whose index is >= n do nothing.
    """
    # Global 1-D thread index: threadIdx.x + blockIdx.x * blockDim.x.
    i = cuda.grid(1)
    if i >= n:
        # The launch grid is rounded up to whole blocks, so surplus threads exit here.
        return
    result[i] = calc_device(a[i], b[i])

# Run and time the GPU kernel only when a CUDA device is present.
if cuda.is_available():    
    # Launch configuration: 1024 threads per block and (n+1023)//1024 blocks,
    # i.e. n/1024 rounded up so that every element gets a thread.
    # NOTE(review): host NumPy arrays are passed directly to the kernel, so the
    # measured time presumably includes host<->device copies on every call, not
    # just kernel execution — confirm before comparing against the CPU numbers.
    %timeit calc_cuda[(n+1023)//1024, 1024](a, b, result, n)
    # Keep a copy of the GPU output for the comparison cell below.
    r_cuda = np.copy(result)

The slowest run took 13.17 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 251 ms per loop


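The two CPU results (NumPy and Numba) are exactly equal, but the array calculated on the GPU is not bitwise identical to them: it agrees only within the default tolerance of `np.allclose`.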

In [8]:
# CPU vs. CPU: compare the NumPy and Numba results, first for exact equality,
# then for equality within np.allclose's default tolerance.
cpu_exact = np.array_equal(r_numpy, r_numba)
cpu_close = np.allclose(r_numpy, r_numba)
print(cpu_exact, cpu_close)

True True


In [9]:
# CPU vs. GPU: only meaningful when the CUDA cell actually ran and defined r_cuda.
if cuda.is_available():
    gpu_exact = np.array_equal(r_numba, r_cuda)
    gpu_close = np.allclose(r_numba, r_cuda)
    print(gpu_exact, gpu_close)

False True
