In [1]:
%load_ext Cython
import numpy as np
# Seed the legacy global RNG so the benchmark inputs -- and therefore the sums
# displayed by later cells -- are identical on every Restart & Run All.
np.random.seed(0)
# X: 10 million integers drawn uniformly from [0, 100).
X = np.random.randint(0, 100, int(1e+7))
# Y: 300 million integers drawn uniformly from [0, 10).
# NOTE: with a 64-bit default integer dtype this array alone needs roughly 2.4 GB of RAM.
Y = np.random.randint(0, 10, int(3e+8))

In [2]:
def list_summation(a):
    """Add up the items of an iterable with an explicit Python-level loop.

    The running total starts at the integer 0 and each item is folded in with
    ``+=``.  An empty iterable therefore yields 0.  This is intentionally the
    naive, interpreter-bound baseline that the compiled variants are compared
    against.
    """
    total = 0
    for item in a:
        total += item
    return total

In [3]:
# Baseline: time the pure-Python loop on X (7 runs, 1 call per run).
%timeit -r7 -n1 list_summation(X)

1.11 s ± 51.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [4]:
%%cython --compile-args=-fopenmp --link-args=-fopenmp --force
def c_list_summation(a):
    """Sum the items of an iterable; same untyped loop, compiled by Cython.

    No C types are declared here, so the accumulator and the items remain
    ordinary Python objects.  Returns 0 for an empty iterable.
    """
    acc = 0
    for element in a:
        acc += element
    return acc

In [5]:
# Time the Cython-compiled but untyped loop on X (7 runs, 1 call per run).
%timeit -r7 -n1 c_list_summation(X)

919 ms ± 37.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%%cython --compile-args=-fopenmp --link-args=-fopenmp --force
import numpy as np
cimport numpy as np
from cython import boundscheck

# Runtime dtype object corresponding to the C element type declared below.
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, where accessing it raises AttributeError.
# `np.int_` is the NumPy scalar type that pairs with `np.int_t`.
DTYPE = np.int_
# Compile-time element type for the typed buffer argument in this cell.
# NOTE(review): confirm np.int_t matches the dtype of the arrays passed in on
# the target NumPy version / platform.
ctypedef np.int_t DTYPE_t

@boundscheck(False)
def c_type_list_summation(np.ndarray[DTYPE_t, ndim=1] a):
    """Sum a 1-D integer array with a statically typed C loop.

    Parameters
    ----------
    a : np.ndarray[DTYPE_t, ndim=1]
        One-dimensional array whose element type matches ``DTYPE_t``.

    Returns
    -------
    int
        Sum of all elements (0 for an empty array).

    Notes
    -----
    Bounds checking is disabled for the indexed reads; the loop index only
    ever takes values in ``range(a.shape[0])``.
    """
    # Py_ssize_t is the index type NumPy shapes use; a C `int` would truncate
    # the length of arrays with 2**31 or more elements.
    cdef Py_ssize_t i
    cdef Py_ssize_t n = a.shape[0]
    # Accumulate in at least 64 bits: a C `int` (typically 32-bit) silently
    # wraps around once the running total exceeds 2**31 - 1.
    cdef long long s = 0
    for i in range(n):
        s += a[i]
    return s

In [7]:
# Time the typed Cython loop on both inputs (7 runs, 1 call per run), then
# display the sum of Y so it can be compared with the other implementations.
%timeit -r7 -n1 c_type_list_summation(X)
# NOTE(review): with a single call per run this measurement is sensitive to
# one-off costs; the parallel benchmark further down uses -n10 -- consider
# using the same setting here so the two Y timings are comparable.
%timeit -r7 -n1 c_type_list_summation(Y)
c_type_list_summation(Y)

9.77 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
The slowest run took 5.67 times longer than the fastest. This could mean that an intermediate result is being cached.
437 ms ± 421 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


1350018003

In [8]:
%%cython --compile-args=-fopenmp --link-args=-fopenmp --force
import numpy as np
cimport numpy as np
from cython.parallel import prange
from cython import boundscheck

# Runtime dtype object corresponding to the C element type declared below.
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, where accessing it raises AttributeError.
# `np.int_` is the NumPy scalar type that pairs with `np.int_t`.
DTYPE = np.int_
# Compile-time element type for the typed buffer argument in this cell.
# NOTE(review): confirm np.int_t matches the dtype of the arrays passed in on
# the target NumPy version / platform.
ctypedef np.int_t DTYPE_t

@boundscheck(False)
def c_type_parallel_list_summation(np.ndarray[DTYPE_t, ndim=1] a):
    """Sum a 1-D integer array using an OpenMP-parallel typed loop.

    Parameters
    ----------
    a : np.ndarray[DTYPE_t, ndim=1]
        One-dimensional array whose element type matches ``DTYPE_t``.

    Returns
    -------
    int
        Sum of all elements (0 for an empty array).

    Notes
    -----
    The loop runs without the GIL on 8 threads with a guided schedule.
    ``s`` is updated only through ``+=`` inside ``prange``, which Cython
    treats as a reduction variable.  Bounds checking is disabled for the
    indexed reads.
    """
    # Py_ssize_t is the index type NumPy shapes use; a C `int` would truncate
    # the length of arrays with 2**31 or more elements.
    cdef Py_ssize_t i
    # Accumulate in at least 64 bits: a C `int` (typically 32-bit) silently
    # wraps around once the running total exceeds 2**31 - 1.
    cdef long long s = 0
    cdef Py_ssize_t n = a.shape[0]
    for i in prange(n, schedule='guided', num_threads=8, nogil=True):
        s += a[i]
    return s

In [9]:
# Time the parallel typed loop on Y (7 runs, 10 calls per run), then display
# the sum so it can be checked against the serial typed implementation.
%timeit -r7 -n10 c_type_parallel_list_summation(Y)
c_type_parallel_list_summation(Y)

139 ms ± 19.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


1350018003