Pure Python code is slow.

The time taken for discretization and ODE solving is shown below.

In [5]:
%run solve_ode.py

discretizing...
discretization takes 283.452857 seconds
Solving ODE...
solving ODE takes 42.851461 seconds


# Optimizing breakage and selection functions with Cython

The code for the breakage function (with a lognormal distribution) and the selection function in `lognormal.py` is shown below.

In [None]:
def lnpdf(x, m, sg):
    """Evaluate the lognormal probability density function at ``x``.

    Args:
        x: Point at which the density is evaluated.
        m: Location parameter subtracted from ``log(x)``.
        sg: Scale parameter (sigma) of the distribution.

    Returns:
        ``exp(-(log(x) - m)**2 / (2 * sg**2)) / (x * sg * sqrt(2 * pi))``.
    """
    log_offset = np.log(x) - m
    gaussian = np.exp(-log_offset**2 / (2 * sg**2))
    normalizer = x * sg * np.sqrt(2 * np.pi)
    return gaussian / normalizer

def lognorm_b(x, y, m, sg):
    """Lognormal density at ``x`` normalised over (0, y], scaled by (y/x)**3.

    Args:
        x: Point at which the density is evaluated.
        y: Upper limit used for the normalisation.
        m: Location parameter of the lognormal distribution.
        sg: Scale parameter (sigma); must be positive.

    Returns:
        ``(y / x)**3 * lnpdf(x, m, sg) / F(y)`` where
        ``F(y) = erfc(-(log(y) - m) / (sqrt(2) * sg)) / 2``.

    Raises:
        AssertionError: If ``sg`` is not larger than 0.
    """
    assert sg > 0, "sigma must be larger than 0"

    density = lnpdf(x, m, sg)
    z = (np.log(y) - m) / (np.sqrt(2) * sg)
    truncation = erfc(-z) / 2

    # When 'y' is very small compared to 'm', the normalising factor can
    # underflow to exactly zero, which is not correct theoretically.
    # Substitute the machine epsilon so the division stays finite.
    if truncation == 0:
        truncation = np.finfo(float).eps

    # The (y / x)**3 factor converts volume to number.
    return (y / x)**3 * density / truncation

def breakagefunc(x, y, k, *args):
    """Breakage function as a weighted sum of three ``lognorm_b`` terms.

    Args:
        x: First argument forwarded to ``lognorm_b``.
        y: Second argument forwarded to ``lognorm_b``.
        k: Indexable parameters; ``k[1]`` and ``k[2]`` weight the first two
            terms and ``1 - k[1] - k[2]`` weights the third.
        *args: ``args[0]`` holds the three ``mu`` values and ``args[1]`` the
            three ``sigma`` values.

    Returns:
        The weighted sum of the three lognormal terms.
    """
    mu, sigma = args[0], args[1]
    first = k[1] * lognorm_b(x, y, mu[0], sigma[0])
    second = k[2] * lognorm_b(x, y, mu[1], sigma[1])
    remainder = (1 - k[1] - k[2]) * lognorm_b(x, y, mu[2], sigma[2])
    return first + second + remainder

def selectionfunc(y, k, *args):
    """Selection function: ``k[0]`` times the cube of ``y``.

    Args:
        y: Value whose cube is scaled.
        k: Indexable parameters; only ``k[0]`` is used.
        *args: Accepted for signature compatibility; ignored.

    Returns:
        ``k[0] * y**3``.
    """
    rate_coefficient = k[0]
    return rate_coefficient * y**3

Benchmark result is

In [1]:
import benchmark

benchmark.breakage('python')

breakage function takes 26.58 μs.


In [2]:
benchmark.selection('python')

breakage function takes  0.60 μs.


## Simply cythonize without modification

Compiling was done with `setup.py`:

`$ python setup.py build_ext -i`

In [1]:
import benchmark

benchmark.breakage('cython')

breakage function takes 20.80 μs.


In [2]:
benchmark.selection('cython')

breakage function takes  0.37 μs.


## Use C library for math functions

Cythonized code in `lognormal_cy.pyx` is shown below 

In [None]:
import numpy as np
from libc.math cimport exp, log, sqrt, erfc

def lnpdf(x, m, sg):
    """Evaluate the lognormal probability density function at ``x``.

    Uses the ``exp``, ``log`` and ``sqrt`` names imported at the top of this
    cell instead of their NumPy counterparts.

    Args:
        x: Point at which the density is evaluated.
        m: Location parameter subtracted from ``log(x)``.
        sg: Scale parameter (sigma) of the distribution.

    Returns:
        ``exp(-(log(x) - m)**2 / (2 * sg**2)) / (x * sg * sqrt(2 * pi))``.
    """
    log_offset = log(x) - m
    gaussian = exp(-log_offset**2 / (2 * sg**2))
    normalizer = x * sg * sqrt(2 * np.pi)
    return gaussian / normalizer

def lognorm_b(x, y, m, sg):
    """Lognormal density at ``x`` normalised over (0, y], scaled by (y/x)**3.

    Args:
        x: Point at which the density is evaluated.
        y: Upper limit used for the normalisation.
        m: Location parameter of the lognormal distribution.
        sg: Scale parameter (sigma); must be positive.

    Returns:
        ``(y / x)**3 * lnpdf(x, m, sg) / F(y)`` where
        ``F(y) = erfc(-(log(y) - m) / (sqrt(2) * sg)) / 2``.

    Raises:
        AssertionError: If ``sg`` is not larger than 0.
    """
    assert sg > 0, "sigma must be larger than 0"

    density = lnpdf(x, m, sg)
    z = (log(y) - m) / (sqrt(2) * sg)
    truncation = erfc(-z) / 2

    # Guard against the normalising factor underflowing to exactly zero.
    if truncation == 0:
        truncation = np.finfo(float).eps

    return (y / x)**3 * density / truncation

In [1]:
import benchmark

benchmark.breakage('cython')

breakage function takes  5.48 μs.


In [2]:
benchmark.selection('cython')

breakage function takes  0.42 μs.


## Static types

In [None]:
cdef double lnpdf(double x, double m, double sg):
    # Lognormal probability density at x with location m and scale sg:
    #   exp(-(log(x) - m)**2 / (2 * sg**2)) / (x * sg * sqrt(2 * pi))
    # pi is kept as a C double literal so no Python object is touched here.
    cdef double pi = 3.141592653589793115997963468544185161590576171875
    cdef double log_offset = log(x) - m
    cdef double gaussian = exp(-log_offset ** 2 / (2 * sg**2))
    cdef double normalizer = x * sg * sqrt(2 * pi)
    return gaussian / normalizer

cdef double lognorm_b(double x, double y, double m, double sg):
    # Lognormal density at x normalised over (0, y] and scaled by (y / x)**3.
    # NOTE(review): this function has a C return type and no `except` clause;
    # whether the AssertionError below propagates to the caller depends on the
    # Cython version in use -- confirm.
    assert sg > 0, "sigma must be larger than 0"
    cdef double density = lnpdf(x, m, sg)
    cdef double z = (log(y) - m) / (sqrt(2) * sg)
    cdef double truncation = erfc(-z) / 2
    # Guard against the normalising factor underflowing to exactly zero.
    if truncation == 0:
        truncation = np.finfo(float).eps
    return (y / x)**3 * density / truncation

cpdef double breakagefunc(double x, double y, double[:] k, args):
    # Weighted sum of three lognorm_b terms.
    #   k     -- typed memoryview; k[1] and k[2] weight the first two terms,
    #            1 - k[1] - k[2] weights the third.
    #   args  -- sequence whose first two items (mu values, sigma values)
    #            are read as double memoryviews.
    cdef double[:] mu = args[0]
    cdef double[:] sigma = args[1]
    cdef double first = k[1] * lognorm_b(x, y, mu[0], sigma[0])
    cdef double second = k[2] * lognorm_b(x, y, mu[1], sigma[1])
    cdef double remainder = (1 - k[1] - k[2]) * lognorm_b(x, y, mu[2], sigma[2])
    return first + second + remainder

cpdef double selectionfunc(double y, double[:] k, args):
    # Selection function: k[0] * y**3.  `args` is accepted only so the
    # signature matches breakagefunc; it is not used.
    cdef double rate_coefficient = k[0]
    return rate_coefficient * y**3

In [1]:
import benchmark

benchmark.breakage('cython')

breakage function takes  1.65 μs.


In [2]:
benchmark.selection('cython')

breakage function takes  0.52 μs.


Cythonization makes the breakage function more than 10 times faster, but there is no significant improvement for the selection function.

# Optimizing discretization with Cython

Code for discretization in `discretize.py` is shown below

In [None]:
def den_integrand(x, k, *args):
    """Integrand of the denominator: ``x**3 * selectionfunc(x, k, *args)``."""
    selection_rate = selectionfunc(x, k, *args)
    return x**3 * selection_rate

def num_integrand(x, y, k, *args):
    """Integrand of the numerator.

    Returns ``x**3 * selectionfunc(y, k, *args) * breakagefunc(x, y, k, *args)``.
    """
    weighted_selection = x**3 * selectionfunc(y, k, *args)
    return weighted_selection * breakagefunc(x, y, k, *args)

def breakage_discretize(L, n, k, *args):
    """Build the ``n`` x ``n`` discretized breakage matrix.

    A 0 is prepended to ``L`` so that consecutive entries give the limits of
    each interval. For column ``i`` the denominator is the integral of
    ``den_integrand`` over interval ``i``; entry ``[j, i]`` (``j < i``) is the
    double integral of ``num_integrand`` over intervals ``i`` and ``j``,
    scaled by the cubed ratio of the interval midpoints. The diagonal entry
    integrates the inner variable from the lower limit up to the outer one.

    Args:
        L: Grid points (without the leading 0).
        n: Number of intervals.
        k: Parameters forwarded to the integrands.
        *args: Extra arguments forwarded to the integrands.

    Returns:
        Array of shape ``(n, n)``; entries below the diagonal stay 0.

    Raises:
        AssertionError: If a denominator integral is exactly 0.
    """
    edges = np.insert(L, 0, 0)
    quad_args = (k, *args)
    matrix = np.zeros((n, n))

    for col in range(n):
        col_lo, col_hi = edges[col], edges[col + 1]
        den = quad(den_integrand, col_lo, col_hi, args=quad_args)[0]
        assert den != 0, 'breakage_discretize: division by zero'
        col_mid = (col_lo + col_hi) / 2

        for row in range(col):
            row_lo, row_hi = edges[row], edges[row + 1]
            num = dblquad(num_integrand, col_lo, col_hi,
                          lambda x: row_lo, lambda x: row_hi,
                          args=quad_args)[0]
            row_mid = (row_lo + row_hi) / 2
            matrix[row, col] = (col_mid / row_mid)**3 * num / den

        num = dblquad(num_integrand, col_lo, col_hi,
                      lambda x: col_lo, lambda x: x,
                      args=quad_args)[0]
        matrix[col, col] = num / den

    return matrix

def particle_number(x, k, *args):
    """Integrate ``breakagefunc(a, x, k, *args)`` over ``a`` from 0 to ``x``.

    Returns only the integral value reported by ``quad``.
    """
    def integrand(a):
        return breakagefunc(a, x, k, *args)

    return quad(integrand, 0, x)[0]

def selection_integrand(x, k, *args):
    """Return ``(particle_number(x) - 1) * selectionfunc(x)``."""
    excess_particles = particle_number(x, k, *args) - 1
    return excess_particles * selectionfunc(x, k, *args)

def selection_discretize(L, n, k, breakage_mat, *args):
    """Build the length-``n`` discretized selection vector.

    A 0 is prepended to ``L`` so that consecutive entries give the limits of
    each interval. For ``i >= 1`` the entry is the interval average of
    ``selection_integrand`` divided by ``sum(breakage_mat[:i+1, i]) - 1``.
    Entry 0 is fixed at 0.

    Args:
        L: Grid points (without the leading 0).
        n: Number of intervals.
        k: Parameters forwarded to ``selection_integrand``.
        breakage_mat: Discretized breakage matrix, indexed ``[row, column]``.
        *args: Extra arguments forwarded to ``selection_integrand``.

    Returns:
        Array of length ``n`` (empty when ``n`` is 0).

    Raises:
        AssertionError: If a column sum of ``breakage_mat`` equals exactly 1.
    """
    edges = np.insert(L, 0, 0)
    # Start from zeros: entry 0 is defined to be 0, and n == 0 now yields an
    # empty array instead of an IndexError from assigning res[0].
    res = np.zeros(n)

    for i in range(1, n):
        lower, upper = edges[i], edges[i + 1]
        integ = quad(selection_integrand, lower, upper, args=(k, *args))[0]
        num = integ / (upper - lower)
        # Named column_total so the builtin sum() is not shadowed.
        column_total = np.sum(breakage_mat[:i+1, i])
        den = column_total - 1
        assert den != 0, 'selection_discretize: division by zero'
        res[i] = num / den

    return res

Benchmark result is

In [1]:
import benchmark

benchmark.discretize_check()

No error


In [2]:
benchmark.discretize('python')

discretization of breakage takes 23.956 s.
discretization of selection takes 11.992 s.


## Simply inserting cythonized lognormal function

A `cdef` function cannot take a starred argument (`*args`) for a variable number of arguments as a Python function can, so `*args` must be converted to a plain `args` parameter. With this modification, simply inserting the cythonized functions into `discretize.py` results in

In [None]:
def den_integrand(x, k, *args):
    """Integrand of the denominator: ``x**3 * selectionfunc(x, k, args)``.

    The collected ``args`` tuple is handed to the cythonized
    ``selectionfunc`` as a single positional argument.
    """
    selection_rate = selectionfunc(x, k, args)
    return x**3 * selection_rate

def num_integrand(x, y, k, *args):
    """Integrand of the numerator.

    Returns ``x**3 * selectionfunc(y, k, args) * breakagefunc(x, y, k, args)``;
    the collected ``args`` tuple is passed on as a single argument.
    """
    weighted_selection = x**3 * selectionfunc(y, k, args)
    return weighted_selection * breakagefunc(x, y, k, args)

def breakage_discretize(L, n, k, *args):
    """Build the ``n`` x ``n`` discretized breakage matrix.

    A 0 is prepended to ``L`` so that consecutive entries give the limits of
    each interval. Column ``i`` uses the integral of ``den_integrand`` over
    interval ``i`` as denominator; entry ``[j, i]`` (``j < i``) is the double
    integral of ``num_integrand`` over intervals ``i`` and ``j`` scaled by the
    cubed ratio of the interval midpoints, and the diagonal entry integrates
    the inner variable from the lower limit up to the outer variable.

    Args:
        L: Grid points (without the leading 0).
        n: Number of intervals.
        k: Parameters forwarded to the integrands.
        *args: Extra arguments forwarded to the integrands.

    Returns:
        Array of shape ``(n, n)``; entries below the diagonal stay 0.

    Raises:
        AssertionError: If a denominator integral is exactly 0.
    """
    edges = np.insert(L, 0, 0)
    extra = (k, *args)
    matrix = np.zeros((n, n))

    for col in range(n):
        col_lo, col_hi = edges[col], edges[col + 1]
        den = quad(den_integrand, col_lo, col_hi, args=extra)[0]
        assert den != 0, 'breakage_discretize: division by zero'
        col_mid = (col_lo + col_hi) / 2

        for row in range(col):
            row_lo, row_hi = edges[row], edges[row + 1]
            num = dblquad(num_integrand, col_lo, col_hi,
                          lambda x: row_lo, lambda x: row_hi,
                          args=extra)[0]
            row_mid = (row_lo + row_hi) / 2
            matrix[row, col] = (col_mid / row_mid)**3 * num / den

        num = dblquad(num_integrand, col_lo, col_hi,
                      lambda x: col_lo, lambda x: x,
                      args=extra)[0]
        matrix[col, col] = num / den

    return matrix

def particle_number(x, k, *args):
    """Integrate ``breakagefunc(a, x, k, args)`` over ``a`` from 0 to ``x``.

    The collected ``args`` tuple is passed to ``breakagefunc`` as a single
    argument. Only the integral value reported by ``quad`` is returned.
    """
    def integrand(a):
        return breakagefunc(a, x, k, args)

    return quad(integrand, 0, x)[0]

def selection_integrand(x, k, *args):
    """Return ``(particle_number(x) - 1) * selectionfunc(x)``.

    ``particle_number`` receives the extra arguments unpacked, while the
    cythonized ``selectionfunc`` receives them as one tuple.
    """
    excess_particles = particle_number(x, k, *args) - 1
    return excess_particles * selectionfunc(x, k, args)

def selection_discretize(L, n, k, breakage_mat, *args):
    """Build the length-``n`` discretized selection vector.

    A 0 is prepended to ``L`` so that consecutive entries give the limits of
    each interval. For ``i >= 1`` the entry is the interval average of
    ``selection_integrand`` divided by ``sum(breakage_mat[:i+1, i]) - 1``.
    Entry 0 is fixed at 0.

    Args:
        L: Grid points (without the leading 0).
        n: Number of intervals.
        k: Parameters forwarded to ``selection_integrand``.
        breakage_mat: Discretized breakage matrix, indexed ``[row, column]``.
        *args: Extra arguments forwarded to ``selection_integrand``.

    Returns:
        Array of length ``n`` (empty when ``n`` is 0).

    Raises:
        AssertionError: If a column sum of ``breakage_mat`` equals exactly 1.
    """
    edges = np.insert(L, 0, 0)
    # Zero-initialise: entry 0 is defined to be 0, and n == 0 returns an
    # empty array rather than failing on the res[0] assignment.
    res = np.zeros(n)

    for i in range(1, n):
        lower, upper = edges[i], edges[i + 1]
        integ = quad(selection_integrand, lower, upper, args=(k, *args))[0]
        num = integ / (upper - lower)
        # Named column_total so the builtin sum() is not shadowed.
        column_total = np.sum(breakage_mat[:i+1, i])
        den = column_total - 1
        assert den != 0, 'selection_discretize: division by zero'
        res[i] = num / den

    return res

In [1]:
import benchmark

benchmark.discretize_check()

No error


In [2]:
benchmark.discretize('cython')

discretization of breakage takes  2.018 s.
discretization of selection takes  0.684 s.


## Add static types

Closures inside `cpdef` functions are not supported, so a function that uses a `lambda` cannot be converted to a `cpdef` function.

In [None]:
cpdef double den_integrand(double x, double[:] k, args):
    # Integrand of the denominator: x**3 * selectionfunc(x, k, args).
    cdef double selection_rate = selectionfunc(x, k, args)
    return x**3 * selection_rate

cpdef double num_integrand(double x, double y, double[:] k, args):
    # Integrand of the numerator:
    #   x**3 * selectionfunc(y, k, args) * breakagefunc(x, y, k, args)
    cdef double weighted_selection = x**3 * selectionfunc(y, k, args)
    return weighted_selection * breakagefunc(x, y, k, args)

def breakage_discretize(L, n, k, *args):
    """Build the ``n`` x ``n`` discretized breakage matrix.

    A 0 is prepended to ``L`` so that consecutive entries give the limits of
    each interval. The integrands take the extra arguments as one object, so
    the collected ``args`` tuple is forwarded as a single item. Entry
    ``[j, i]`` (``j < i``) is the double integral of ``num_integrand`` over
    intervals ``i`` and ``j`` divided by the integral of ``den_integrand``
    over interval ``i`` and scaled by the cubed ratio of the interval
    midpoints; the diagonal entry integrates the inner variable from the
    lower limit up to the outer variable.

    Args:
        L: Grid points (without the leading 0).
        n: Number of intervals.
        k: Parameters forwarded to the integrands.
        *args: Extra arguments, forwarded to the integrands as one tuple.

    Returns:
        Array of shape ``(n, n)``; entries below the diagonal stay 0.

    Raises:
        AssertionError: If a denominator integral is exactly 0.
    """
    edges = np.insert(L, 0, 0)
    extra = (k, args)
    matrix = np.zeros((n, n))

    for col in range(n):
        col_lo, col_hi = edges[col], edges[col + 1]
        den = quad(den_integrand, col_lo, col_hi, args=extra)[0]
        assert den != 0, 'breakage_discretize: division by zero'
        col_mid = (col_lo + col_hi) / 2

        for row in range(col):
            row_lo, row_hi = edges[row], edges[row + 1]
            num = dblquad(num_integrand, col_lo, col_hi,
                          lambda x: row_lo, lambda x: row_hi,
                          args=extra)[0]
            row_mid = (row_lo + row_hi) / 2
            matrix[row, col] = (col_mid / row_mid)**3 * num / den

        num = dblquad(num_integrand, col_lo, col_hi,
                      lambda x: col_lo, lambda x: x,
                      args=extra)[0]
        matrix[col, col] = num / den

    return matrix

def particle_number(double x, double[:] k, args):
    """Integrate ``breakagefunc(a, x, k, args)`` over ``a`` from 0 to ``x``.

    Kept as a plain ``def`` because it builds a ``lambda`` closure. Only the
    integral value reported by ``quad`` is returned.
    """
    integral = quad(lambda a: breakagefunc(a, x, k, args), 0, x)[0]
    return integral

cpdef double selection_integrand(double x, double[:] k, args):
    # Integrand used for the selection vector:
    #   (particle_number(x, k, args) - 1) * selectionfunc(x, k, args)
    # Declared cpdef (not cdef) because selection_discretize hands this
    # function to scipy's quad, which needs a Python-callable object; a
    # cdef-only function cannot be converted to a Python object.
    return (particle_number(x, k, args) - 1) * selectionfunc(x, k, args)

def selection_discretize(L, n, k, breakage_mat, *args):
    """Build the length-``n`` discretized selection vector.

    A 0 is prepended to ``L`` so that consecutive entries give the limits of
    each interval. For ``i >= 1`` the entry is the interval average of
    ``selection_integrand`` divided by ``sum(breakage_mat[:i+1, i]) - 1``.
    Entry 0 is fixed at 0. The collected ``args`` tuple is forwarded to the
    integrand as a single item.

    Args:
        L: Grid points (without the leading 0).
        n: Number of intervals.
        k: Parameters forwarded to ``selection_integrand``.
        breakage_mat: Discretized breakage matrix, indexed ``[row, column]``.
        *args: Extra arguments, forwarded to the integrand as one tuple.

    Returns:
        Array of length ``n`` (empty when ``n`` is 0).

    Raises:
        AssertionError: If a column sum of ``breakage_mat`` equals exactly 1.
    """
    edges = np.insert(L, 0, 0)
    # Zero-initialise: entry 0 is defined to be 0, and n == 0 returns an
    # empty array rather than failing on the res[0] assignment.
    res = np.zeros(n)

    for i in range(1, n):
        lower, upper = edges[i], edges[i + 1]
        integ = quad(selection_integrand, lower, upper, args=(k, args))[0]
        num = integ / (upper - lower)
        # Named column_total so the builtin sum() is not shadowed.
        column_total = np.sum(breakage_mat[:i+1, i])
        den = column_total - 1
        assert den != 0, 'selection_discretize: division by zero'
        res[i] = num / den

    return res

In [1]:
import benchmark

benchmark.discretize_check()

No error


In [2]:
benchmark.discretize('cython')

discretization of breakage takes  2.152 s.
discretization of selection takes  0.632 s.


There is no performance improvement from the statically typed (`cdef`/`cpdef`) integrands. This is probably because the `quad` function takes a Python function as its argument.

## Static types for loops

In [None]:
def den_integrand(x, k, *args):
    """Integrand of the denominator: ``x**3 * selectionfunc(x, k, args)``.

    The collected ``args`` tuple is passed to ``selectionfunc`` as one
    positional argument.
    """
    selection_rate = selectionfunc(x, k, args)
    return x**3 * selection_rate

def num_integrand(x, y, k, *args):
    """Integrand of the numerator.

    Returns ``x**3 * selectionfunc(y, k, args) * breakagefunc(x, y, k, args)``;
    the collected ``args`` tuple is passed on as a single argument.
    """
    weighted_selection = x**3 * selectionfunc(y, k, args)
    return weighted_selection * breakagefunc(x, y, k, args)

def breakage_discretize(L, Py_ssize_t n, k, *args):
    """Build the ``n`` x ``n`` discretized breakage matrix.

    Same computation as the untyped version, with the loop counters declared
    as C integers. A 0 is prepended to ``L`` so that consecutive entries give
    the limits of each interval. Entry ``[j, i]`` (``j < i``) is the double
    integral of ``num_integrand`` over intervals ``i`` and ``j`` divided by
    the integral of ``den_integrand`` over interval ``i`` and scaled by the
    cubed ratio of the interval midpoints; the diagonal entry integrates the
    inner variable from the lower limit up to the outer variable.

    Raises:
        AssertionError: If a denominator integral is exactly 0.
    """
    cdef Py_ssize_t col, row

    edges = np.insert(L, 0, 0)
    extra = (k, *args)
    matrix = np.zeros((n, n))

    for col in range(n):
        col_lo, col_hi = edges[col], edges[col + 1]
        den = quad(den_integrand, col_lo, col_hi, args=extra)[0]
        assert den != 0, 'breakage_discretize: division by zero'
        col_mid = (col_lo + col_hi) / 2

        for row in range(col):
            row_lo, row_hi = edges[row], edges[row + 1]
            num = dblquad(num_integrand, col_lo, col_hi,
                          lambda x: row_lo, lambda x: row_hi,
                          args=extra)[0]
            row_mid = (row_lo + row_hi) / 2
            matrix[row, col] = (col_mid / row_mid)**3 * num / den

        num = dblquad(num_integrand, col_lo, col_hi,
                      lambda x: col_lo, lambda x: x,
                      args=extra)[0]
        matrix[col, col] = num / den

    return matrix

def particle_number(x, k, *args):
    """Integrate ``breakagefunc(a, x, k, args)`` over ``a`` from 0 to ``x``.

    The collected ``args`` tuple is passed to ``breakagefunc`` as a single
    argument. Only the integral value reported by ``quad`` is returned.
    """
    integral = quad(lambda a: breakagefunc(a, x, k, args), 0, x)[0]
    return integral

def selection_integrand(x, k, *args):
    """Return ``(particle_number(x) - 1) * selectionfunc(x)``.

    ``particle_number`` receives the extra arguments unpacked, while
    ``selectionfunc`` receives them as one tuple.
    """
    excess_particles = particle_number(x, k, *args) - 1
    return excess_particles * selectionfunc(x, k, args)

def selection_discretize(L, Py_ssize_t n, k, breakage_mat, *args):
    """Build the length-``n`` discretized selection vector.

    Same computation as the untyped version, with the loop counter declared
    as a C integer. A 0 is prepended to ``L`` so that consecutive entries give
    the limits of each interval. For ``i >= 1`` the entry is the interval
    average of ``selection_integrand`` divided by
    ``sum(breakage_mat[:i+1, i]) - 1``. Entry 0 is fixed at 0.

    Returns:
        Array of length ``n`` (empty when ``n`` is 0).

    Raises:
        AssertionError: If a column sum of ``breakage_mat`` equals exactly 1.
    """
    cdef Py_ssize_t i

    edges = np.insert(L, 0, 0)
    # Zero-initialise: entry 0 is defined to be 0, and n == 0 returns an
    # empty array rather than failing on the res[0] assignment.
    res = np.zeros(n)

    for i in range(1, n):
        lower, upper = edges[i], edges[i + 1]
        integ = quad(selection_integrand, lower, upper, args=(k, *args))[0]
        num = integ / (upper - lower)
        # Named column_total so the builtin sum() is not shadowed.
        column_total = np.sum(breakage_mat[:i+1, i])
        den = column_total - 1
        assert den != 0, 'selection_discretize: division by zero'
        res[i] = num / den

    return res

In [1]:
import benchmark

benchmark.discretize_check()

No error


In [2]:
benchmark.discretize('cython')

discretization of breakage takes  1.919 s.
discretization of selection takes  0.693 s.


# Parallelize for-loop using Joblib

Since most of the time is spent calling the `quad` and `dblquad` functions of the `scipy.integrate` library, Cython has a limited effect on performance. For further optimization, the `for` loop needs to be parallelized.

In [None]:
from joblib import Parallel, delayed

def breakage_discretize(L, n, k, *args):
    """Build the ``n`` x ``n`` discretized breakage matrix in parallel.

    A 0 is prepended to ``L`` so that consecutive entries give the limits of
    each interval. Each column is computed by an independent joblib task and
    the columns are then stacked into the matrix.

    Args:
        L: Grid points (without the leading 0).
        n: Number of intervals.
        k: Parameters forwarded to the integrands.
        *args: Extra arguments forwarded to the integrands.

    Returns:
        Array of shape ``(n, n)``; entries below the diagonal stay 0.

    Raises:
        AssertionError: If a denominator integral is exactly 0.
    """
    # np.stack() rejects an empty list, so return the empty matrix directly;
    # this matches what the serial implementation yields for n == 0.
    if n == 0:
        return np.zeros((0, 0))

    edges = np.insert(L, 0, 0)
    quad_args = (k, *args)

    def compute_column(i):
        """Return column ``i`` of the matrix as a length-``n`` array."""
        column = np.zeros(n)
        lo_i, hi_i = edges[i], edges[i + 1]
        den = quad(den_integrand, lo_i, hi_i, args=quad_args)[0]
        assert den != 0, 'breakage_discretize: division by zero'
        mid_i = (lo_i + hi_i) / 2  # invariant over the inner loop

        for j in range(i):
            lo_j, hi_j = edges[j], edges[j + 1]
            num = dblquad(num_integrand, lo_i, hi_i,
                          lambda x: lo_j, lambda x: hi_j,
                          args=quad_args)[0]
            mid_j = (lo_j + hi_j) / 2
            column[j] = (mid_i / mid_j)**3 * num / den

        # Diagonal entry: the inner variable runs from the lower limit of
        # the interval up to the outer variable.
        num = dblquad(num_integrand, lo_i, hi_i,
                      lambda x: lo_i, lambda x: x,
                      args=quad_args)[0]
        column[i] = num / den
        return column

    columns = Parallel(n_jobs=-1)(delayed(compute_column)(i) for i in range(n))

    # Each task result is one column; stacking yields rows, so transpose.
    return np.stack(columns).T

def selection_discretize(L, n, k, breakage_mat, *args):
    """Build the length-``n`` discretized selection vector in parallel.

    A 0 is prepended to ``L`` so that consecutive entries give the limits of
    each interval. Entries 1..n-1 are computed by independent joblib tasks;
    entry 0 stays 0.

    Args:
        L: Grid points (without the leading 0).
        n: Number of intervals.
        k: Parameters forwarded to ``selection_integrand``.
        breakage_mat: Discretized breakage matrix, indexed ``[row, column]``.
        *args: Extra arguments forwarded to ``selection_integrand``.

    Returns:
        Array of length ``n``.

    Raises:
        AssertionError: If a column sum of ``breakage_mat`` equals exactly 1.
    """
    edges = np.insert(L, 0, 0)

    def compute_entry(i):
        """Return entry ``i`` of the selection vector."""
        lower, upper = edges[i], edges[i + 1]
        integ = quad(selection_integrand, lower, upper, args=(k, *args))[0]
        num = integ / (upper - lower)
        column_total = np.sum(breakage_mat[:i+1, i])
        den = column_total - 1
        assert den != 0, 'selection_discretize: division by zero'
        return num / den

    entries = Parallel(n_jobs=-1)(
        delayed(compute_entry)(i) for i in range(1, n)
    )

    res = np.zeros(n)
    res[1:] = entries
    return res

In [1]:
import benchmark

benchmark.parallel_check()

No error


In [2]:
benchmark.discretize('python')

discretization of breakage takes  0.581 s.
discretization of selection takes  0.210 s.


The execution time of the breakage-function discretization was reduced from 23.956 s to 0.581 s, and that of the selection function from 11.992 s to 0.210 s.