# Cython

http://docs.cython.org/en/latest/index.html

In [18]:
%load_ext cython

## Trivial Example

In [4]:
%%cython -a

cdef int a = 0
for i in range(10):
    a += i
print(a)

45


## Speeding up a Function

### Python Version

In [5]:
def get_primes(nmax):
    primes = []
    for i in range(2,nmax+1):
        is_prime = True
        for p in primes:
            if i % p ==0:
                is_prime = False
                break
            if i < p**2: # little speedup
                break
        if is_prime:
            primes.append(i)
    return primes

In [6]:
%timeit get_primes(100)

120 µs ± 121 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


### First Try: just add Cython Command

In [7]:
%%cython -a
def get_primes_cython(nmax):
    primes = []
    for i in range(2,nmax+1):
        is_prime = True
        for p in primes:
            if i % p ==0:
                is_prime = False
                break
            if i < p**2: # little speedup
                break
        if is_prime:
            primes.append(i)
    return primes

In [8]:
%timeit get_primes_cython(100)

75.2 µs ± 65.1 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


already a speedup of 40%

### Using C-Type Variables

In [62]:
%%cython -a
def get_primes_cython2(int nmax): #!!!
    cdef int i, p #!!!
    primes = []
    for i in range(2,nmax+1):
        is_prime = True
        for p in primes:
            if i % p ==0:
                is_prime = False
                break
            if i < p**2: # little speedup
                break
        if is_prime:
            primes.append(i)
    return primes

In [63]:
%timeit get_primes_cython2(100)

4.03 µs ± 19.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Now we have a speedup by a factor of 30 compared to the Python implementation by just explicitly defining the integer types.  

In [64]:
%%cython
cdef list[int] get_primes_cython3(int nmax): #!!!
    cdef int i, p
    cdef list[int] primes = [] #!!!
    for i in range(2,nmax+1):
        is_prime = True
        for p in primes:
            if i % p ==0:
                is_prime = False
                break
            if i < p**2: # little speedup
                break
        if is_prime:
            primes.append(i)
    return primes

In [65]:
%timeit get_primes_cython3(100)

3.97 µs ± 15.8 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Defining the list types in function call and function itself does not give a significant speedup anymore.

In [48]:
%%cython
import cython
def get_primes_cython4(nmax: cython.int) -> list:
    i: cython.int
    p: cython.int
    primes: list = []
    for i in range(2,nmax+1):
        is_prime: bool = True
        for p in primes:
            if i % p ==0:
                is_prime = False
                break
            if i < p**2: # little speedup
                break
        if is_prime:
            primes.append(i)
    return primes

In [49]:
%timeit get_primes_cython4(100)

3.95 µs ± 18.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Alternatively to *cdef*, Python 3 type annotations could be used to specify variable types for C compilation.
The advantage is that the code above runs also in pure Python.

Note that *cython.int* instead of *int* should be used because the latter (as unlimited integer) has no equivalent C type and therefore gives less performance gains.

### Comparison to Numba

In [3]:
from numba import njit, int32

In [2]:
@njit
def get_primes_numba(nmax):
    primes = []
    for i in range(2,nmax+1):
        is_prime = True
        for p in primes:
            if i % p ==0:
                is_prime = False
                break
            if i < p**2: # little speedup
                break
        if is_prime:
            primes.append(i)
    return primes

In [3]:
%timeit get_primes_numba(100)

The slowest run took 4.37 times longer than the fastest. This could mean that an intermediate result is being cached.
12.5 µs ± 9.05 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


Numba is about 10 times faster than pure-Python, but 3 times slower than the typed Cython version.

In [12]:
@njit('(int32,)')
def get_primes_numba2(nmax):
    primes = []
    for i in range(2,nmax+1):
        is_prime = True
        for p in primes:
            if i % p ==0:
                is_prime = False
                break
            if i < p**2: # little speedup
                break
        if is_prime:
            primes.append(i)
    return primes

In [13]:
%timeit get_primes_numba2(100)

4.51 µs ± 15.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


Defining the type of input variable, Numba is only slightly slower than Cython.

## More Complex Examples involving Pandas

In [14]:
import pandas as pd
import numpy as np

In [15]:
def calc_on_df():
    df_rdn = pd.DataFrame(np.random.randn(100000, 5), columns=['a', 'b', 'c', 'd', 'e'])
    df_rdn['f'] = df_rdn['a'] + df_rdn.b * df_rdn.c * np.exp(df_rdn.d) / df_rdn.e

In [16]:
%timeit calc_on_df()

50.8 ms ± 68.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [21]:
%%cython
import pandas as pd
import numpy as np
def calc_on_df_cython():
    df_rdn = pd.DataFrame(np.random.randn(100000, 5), columns=['a', 'b', 'c', 'd', 'e'])
    df_rdn['f'] = df_rdn['a'] + df_rdn.b * df_rdn.c * np.exp(df_rdn.d) / df_rdn.e

In [22]:
%timeit calc_on_df_cython()

50.9 ms ± 79.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


No speedup here - this is not surprising because the Python example is already vectorized.

In [31]:
def df_apply():
    df_rdn = pd.DataFrame(np.random.randn(10000, 5), columns=['a', 'b', 'c', 'd', 'e'])
    df_rdn['f'] = df_rdn.apply(lambda x: x.a + x.b - x.c * x.d / x.e, axis=1)

In [32]:
%timeit df_apply()

2.15 s ± 3.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [33]:
%%cython
import pandas as pd
import numpy as np
def df_apply_cython():
    df_rdn = pd.DataFrame(np.random.randn(10000, 5), columns=['a', 'b', 'c', 'd', 'e'])
    df_rdn['f'] = df_rdn.apply(lambda x: x.a + x.b - x.c * x.d / x.e, axis=1)

In [34]:
%timeit df_apply_cython()

2.15 s ± 7.93 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [35]:
%%cython
import pandas as pd
import numpy as np

cdef float apply_func(float a, float b, float c, float d, float e):
    return a + b - c * d / e

def df_apply_cython2():
    df_rdn = pd.DataFrame(np.random.randn(10000, 5), columns=['a', 'b', 'c', 'd', 'e'])
    df_rdn['f'] = df_rdn.apply(lambda x: apply_func(x.a, x.b, x.c, x.d, x.e), axis=1)

In [36]:
%timeit df_apply_cython2()

2.13 s ± 8.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [37]:
%%cython
import pandas as pd
import numpy as np

def df_apply_cython3():
    df_rdn = pd.DataFrame(np.random.randn(10000, 5), columns=['a', 'b', 'c', 'd', 'e'])
    
    cdef int n=len(df_rdn)
    cdef int i
    cdef float a, b, c, d, e, x
    res = np.zeros(n)
    for i in range(n):
        a, b, c, d, e = df_rdn.iloc[i, :5]
        x = a + b - c * d / e
        res[i] = x
    df_rdn['f'] = res

In [38]:
%timeit df_apply_cython3()

5.25 s ± 15.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
