In [1]:
import numpy as np

In [2]:
# Benchmark input: two equally sized vectors of uniform samples from [0, 1).
N = 1_000_000
a = np.random.random(size=N)
b = np.random.random(size=N)

# Sanity check: report the vector shape and its first few entries.
print(f"Shape: {a.shape}\nHead: {a[:5]}")

Shape: (1000000,)
Head: [0.61716441 0.52736192 0.112841   0.50037939 0.22061619]


# Naive approach

In [3]:
%%timeit
sum_naive = 0
i = 0
while i < N:
    sum_naive += a[i] * b[i]
    i += 1

153 ms ± 4.75 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


# 2 times loop unrolling

In [4]:
%%timeit
sum_two1 = 0
sum_two2 = 0
i = 0
while i < N/2 - 1:
    sum_two1 += a[2*i] * b[2*i]
    sum_two2 += a[2*i + 1] * b[2*i + 1]
    i += 1
sum_two = sum_two1 + sum_two2


209 ms ± 1.63 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# 4 times loop unrolling

In [5]:
%%timeit
sum_four1 = 0
sum_four2 = 0
sum_four3 = 0
sum_four4 = 0
i = 0
while i < N/4 - 1:
    sum_four1 += a[4*i] * b[4*i]
    sum_four2 += a[4*i + 1] * b[4*i + 1]
    sum_four3 += a[4*i + 2] * b[4*i + 2]
    sum_four4 += a[4*i + 3] * b[4*i + 3]
    i += 1
sum_four = sum_four1 + sum_four2 + sum_four3 + sum_four4

197 ms ± 447 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Numpy Vectorization

In [6]:
%%timeit
sum_vec = np.dot(a, b)

116 µs ± 7.08 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


# NUMBA power

In [7]:
from numba import jit

# Numba Naive approach

In [8]:
%%timeit
@jit(nopython=True)
def speed():
    sum_naive = 0
    i = 0
    while i < N:
        sum_naive += a[i] * b[i]
        i += 1
return sum_naive

22.3 µs ± 59.1 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


# Numba 2 times loop unrolling

In [9]:
%%timeit
@jit(nopython=True)
def speed():
    sum_two1 = 0
    sum_two2 = 0
    i = 0
    while i < N/2 - 1:
        sum_two1 += a[2*i] * b[2*i]
        sum_two2 += a[2*i + 1] * b[2*i + 1]
        i += 1
    return sum_two1 + sum_two2

23.8 µs ± 743 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


# Numba 4 times loop unrolling

In [10]:
%%timeit
@jit(nopython=True)
def speed():
    sum_four1 = 0
    sum_four2 = 0
    sum_four3 = 0
    sum_four4 = 0
    i = 0
    while i < N/4 - 1:
        sum_four1 += a[4*i] * b[4*i]
        sum_four2 += a[4*i + 1] * b[4*i + 1]
        sum_four3 += a[4*i + 2] * b[4*i + 2]
        sum_four4 += a[4*i + 3] * b[4*i + 3]
        i += 1
    return sum_four1 + sum_four2 + sum_four3 + sum_four4

23.6 µs ± 489 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


# Numba Numpy vectorization 

In [11]:
%%timeit
@jit(nopython=True)
def speed():
    return np.dot(a, b)

22.8 µs ± 392 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
