# Benchmark runs

In [1]:
import numba as nb
import numpy as np
import time

## Hadamard product

In [2]:
def hadamard(A, B):
    """Return the element-wise (Hadamard) product of ``A`` and ``B``.

    The body only relies on ``*``, so it accepts scalars as well as
    NumPy arrays (for arrays, ``*`` multiplies element by element).
    """
    product = A * B
    return product

In [3]:
# Wrap the scalar `hadamard` function with `nb.vectorize`, declaring one
# explicit signature for float32 and one for float64.
hadamard_ufunc = nb.vectorize(
    [
        "float32(float32, float32)",
        "float64(float64, float64)",
    ]
)(hadamard)

In [4]:
def hadamardcore(A, B, C):
    """Store the element-wise (Hadamard) product of ``A`` and ``B`` in ``C``.

    The kernel follows the output-argument convention: the result is
    written into ``C`` in place and nothing is returned.

    Parameters
    ----------
    A, B : 2-D arrays
        Operands. The loop bounds are taken from ``B.shape`` and every
        such index is also read from ``A``, so both are expected to have
        the same shape.
    C : 2-D array
        Output buffer; every index within ``B.shape`` is overwritten.
    """
    # A single shape lookup is enough: all three arrays are indexed with
    # the same (i, j) pairs.
    rows, cols = B.shape
    for i in range(rows):
        for j in range(cols):
            C[i, j] = A[i, j] * B[i, j]

In [5]:
# Build a generalized ufunc from `hadamardcore`.  An element-wise product
# requires both inputs and the output to share a single shape, hence the
# layout "(m,n),(m,n)->(m,n)".  The matrix-product layout
# "(m,n),(n,p)->(n,p)" would reject equal-shaped non-square operands and
# accept mismatched ones.
gu_hadamard = nb.guvectorize(
    [
        "float32[:,:], float32[:,:], float32[:,:]",
        "float64[:,:], float64[:,:], float64[:,:]",
    ],
    "(m,n),(m,n)->(m,n)",
)(hadamardcore)

In [6]:
# Square float64 operands for the Hadamard benchmark: A counts upwards
# from 0 in row-major order, B holds the same values in reverse order.
n = 4000
A = np.arange(n * n, dtype=np.float64).reshape(n, n)
B = np.arange(n * n, dtype=np.float64)[::-1].reshape(n, n)

### `numba`

In [7]:
# Apply the Numba-vectorized ufunc to A and B and display the product.
C = hadamard_ufunc(A, B)
C

array([[0.00000000e+00, 1.59999980e+07, 3.19999940e+07, ...,
        6.39360200e+10, 6.39520120e+10, 6.39680040e+10],
       [6.39839960e+10, 6.39999880e+10, 6.40159800e+10, ...,
        1.27888040e+11, 1.27904024e+11, 1.27920008e+11],
       [1.27935992e+11, 1.27951976e+11, 1.27967960e+11, ...,
        1.91808060e+11, 1.91824036e+11, 1.91840012e+11],
       ...,
       [1.91840012e+11, 1.91824036e+11, 1.91808060e+11, ...,
        1.27967960e+11, 1.27951976e+11, 1.27935992e+11],
       [1.27920008e+11, 1.27904024e+11, 1.27888040e+11, ...,
        6.40159800e+10, 6.39999880e+10, 6.39839960e+10],
       [6.39680040e+10, 6.39520120e+10, 6.39360200e+10, ...,
        3.19999940e+07, 1.59999980e+07, 0.00000000e+00]])

In [8]:
# Time the Numba-vectorized ufunc on the same operands.
%timeit hadamard_ufunc(A, B)

15.4 ms ± 291 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
# Apply the Numba generalized ufunc to A and B and display the product.
C = gu_hadamard(A, B)
C

array([[0.00000000e+00, 1.59999980e+07, 3.19999940e+07, ...,
        6.39360200e+10, 6.39520120e+10, 6.39680040e+10],
       [6.39839960e+10, 6.39999880e+10, 6.40159800e+10, ...,
        1.27888040e+11, 1.27904024e+11, 1.27920008e+11],
       [1.27935992e+11, 1.27951976e+11, 1.27967960e+11, ...,
        1.91808060e+11, 1.91824036e+11, 1.91840012e+11],
       ...,
       [1.91840012e+11, 1.91824036e+11, 1.91808060e+11, ...,
        1.27967960e+11, 1.27951976e+11, 1.27935992e+11],
       [1.27920008e+11, 1.27904024e+11, 1.27888040e+11, ...,
        6.40159800e+10, 6.39999880e+10, 6.39839960e+10],
       [6.39680040e+10, 6.39520120e+10, 6.39360200e+10, ...,
        3.19999940e+07, 1.59999980e+07, 0.00000000e+00]])

In [10]:
# Time the Numba generalized ufunc on the same operands.
%timeit gu_hadamard(A, B)

15.3 ms ± 222 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### `numpy`

In [11]:
# Call the undecorated Python function, so `A * B` is evaluated by NumPy
# itself, and display the product.
C = hadamard(A, B)
C

array([[0.00000000e+00, 1.59999980e+07, 3.19999940e+07, ...,
        6.39360200e+10, 6.39520120e+10, 6.39680040e+10],
       [6.39839960e+10, 6.39999880e+10, 6.40159800e+10, ...,
        1.27888040e+11, 1.27904024e+11, 1.27920008e+11],
       [1.27935992e+11, 1.27951976e+11, 1.27967960e+11, ...,
        1.91808060e+11, 1.91824036e+11, 1.91840012e+11],
       ...,
       [1.91840012e+11, 1.91824036e+11, 1.91808060e+11, ...,
        1.27967960e+11, 1.27951976e+11, 1.27935992e+11],
       [1.27920008e+11, 1.27904024e+11, 1.27888040e+11, ...,
        6.40159800e+10, 6.39999880e+10, 6.39839960e+10],
       [6.39680040e+10, 6.39520120e+10, 6.39360200e+10, ...,
        3.19999940e+07, 1.59999980e+07, 0.00000000e+00]])

In [12]:
# Time the plain NumPy multiplication on the same operands.
%timeit hadamard(A, B)

16 ms ± 413 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Matrix multiplication

An example from <http://numba.pydata.org/numba-doc/0.12/tutorial_numpy_and_numba.html>. The original example is incorrect, however: its calls to `gu_matmul` and `np.matmul` produce result matrices of different dimensions.

In [13]:
def matmulcore(A, B, C):
    """Write the matrix product of ``A`` and ``B`` into ``C``.

    Naive triple-loop kernel using the output-argument convention: the
    result is stored in ``C`` in place and nothing is returned.

    Parameters
    ----------
    A : 2-D array
        Left operand; its first axis gives the number of result rows.
    B : 2-D array
        Right operand; its first axis is the summation length and its
        second axis gives the number of result columns.
    C : 2-D array
        Output buffer; each addressed entry is reset to 0.0 and then
        accumulated in place.
    """
    n_rows, _ = A.shape
    n_inner, n_cols = B.shape
    for row in range(n_rows):
        for col in range(n_cols):
            # Accumulate directly in the output element.
            C[row, col] = 0.0
            for k in range(n_inner):
                C[row, col] += A[row, k] * B[k, col]

In [14]:
# Build a generalized ufunc from `matmulcore`.  Multiplying an (m,n)
# matrix by an (n,p) matrix yields an (m,p) matrix, so that is the
# declared output layout; an "(n,p)" output only matches when m == n.
gu_matmul = nb.guvectorize(
    [
        "float32[:,:], float32[:,:], float32[:,:]",
        "float64[:,:], float64[:,:], float64[:,:]",
    ],
    "(m,n),(n,p)->(m,p)",
)(matmulcore)

Note: the `numba` kernel above is a naive triple loop and will not compute a result in reasonable time for `n = 4000`, so a smaller `n` is used below.

In [15]:
# Square float64 operands for the matrix-multiplication benchmark: A
# counts upwards from 0 in row-major order, B holds the same values in
# reverse order.
n = 1000
A = np.arange(n * n, dtype=np.float64).reshape(n, n)
B = np.arange(n * n, dtype=np.float64)[::-1].reshape(n, n)

### `numba`

In [16]:
# Multiply A and B with the Numba generalized ufunc and display the result.
C = gu_matmul(A, B)
C

array([[1.66666000e+11, 1.66665501e+11, 1.66665002e+11, ...,
        1.66167999e+11, 1.66167500e+11, 1.66167000e+11],
       [6.67165000e+11, 6.67163501e+11, 6.67162002e+11, ...,
        6.65669999e+11, 6.65668500e+11, 6.65667000e+11],
       [1.16766400e+12, 1.16766150e+12, 1.16765900e+12, ...,
        1.16517200e+12, 1.16516950e+12, 1.16516700e+12],
       ...,
       [4.99164169e+14, 4.99163172e+14, 4.99162174e+14, ...,
        4.98169662e+14, 4.98168664e+14, 4.98167667e+14],
       [4.99664668e+14, 4.99663670e+14, 4.99662671e+14, ...,
        4.98669164e+14, 4.98668165e+14, 4.98667167e+14],
       [5.00165167e+14, 5.00164168e+14, 5.00163168e+14, ...,
        4.99168666e+14, 4.99167666e+14, 4.99166667e+14]])

In [17]:
# Time the Numba generalized ufunc on the same operands.
%timeit gu_matmul(A, B)

959 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### `numpy`

In [18]:
# Multiply A and B with NumPy's own `matmul` and display the result.
C = np.matmul(A, B)
C

array([[1.66666000e+11, 1.66665501e+11, 1.66665002e+11, ...,
        1.66167999e+11, 1.66167500e+11, 1.66167000e+11],
       [6.67165000e+11, 6.67163501e+11, 6.67162002e+11, ...,
        6.65669999e+11, 6.65668500e+11, 6.65667000e+11],
       [1.16766400e+12, 1.16766150e+12, 1.16765900e+12, ...,
        1.16517200e+12, 1.16516950e+12, 1.16516700e+12],
       ...,
       [4.99164169e+14, 4.99163172e+14, 4.99162174e+14, ...,
        4.98169662e+14, 4.98168664e+14, 4.98167667e+14],
       [4.99664668e+14, 4.99663670e+14, 4.99662671e+14, ...,
        4.98669164e+14, 4.98668165e+14, 4.98667167e+14],
       [5.00165167e+14, 5.00164168e+14, 5.00163168e+14, ...,
        4.99168666e+14, 4.99167666e+14, 4.99166667e+14]])

In [19]:
# Time NumPy's `matmul` on the same operands.
%timeit np.matmul(A, B)

1.25 s ± 25.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Linear interpolation

Another example from <http://numba.pydata.org/numba-doc/0.12/tutorial_numpy_and_numba.html>.

In [20]:
def lerp(a, b, t):
    """Linearly interpolate between ``a`` and ``b`` with weight ``t``.

    Evaluates ``a + t * (b - a)``; the body only uses arithmetic
    operators, so scalars and NumPy arrays are both accepted.
    """
    span = b - a
    return a + t * span

In [21]:
# Wrap the scalar `lerp` function with `nb.vectorize`, declaring one
# explicit signature for float32 and one for float64.
lerp_ufunc = nb.vectorize(
    [
        "float32(float32, float32, float32)",
        "float64(float64, float64, float64)",
    ]
)(lerp)

In [22]:
# Inputs for the interpolation benchmark: `a` holds n evenly spaced
# values starting at 0.0 with spacing 1/n (1.0 itself is excluded), `b`
# is `a` reversed, and `t` is a constant interpolation weight of 0.5.
n = 100000
# `linspace(..., endpoint=False)` always returns exactly n samples.  With
# a fractional step, `arange` derives the length from a floating-point
# division and may not produce exactly n elements, which would no longer
# match the length of `t`.
a = np.linspace(0.0, 1.0, n, endpoint=False)
b = np.flip(a)
t = np.full(n, 0.5)

### `numba`

In [23]:
# Interpolate with the Numba-vectorized ufunc and display the result.
result = lerp_ufunc(a, b, t)
result

array([0.499995, 0.499995, 0.499995, ..., 0.499995, 0.499995, 0.499995])

In [24]:
# Time the Numba-vectorized ufunc on the same inputs.
%timeit lerp_ufunc(a, b, t)

53.4 μs ± 169 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


### `numpy`

In [25]:
# Call the undecorated Python function, so the arithmetic is evaluated by
# NumPy on whole arrays, and display the result.
result = lerp(a, b, t)
result

array([0.499995, 0.499995, 0.499995, ..., 0.499995, 0.499995, 0.499995])

In [26]:
# Time the plain NumPy evaluation on the same inputs.
%timeit lerp(a, b, t)

87.9 μs ± 236 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
