In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

from glow import wrapper
from glow import lenses
from glow import time_domain, time_domain_c
from glow import freq_domain, freq_domain_c
from glow import tools

Temporary notebook to store previous performance tests, before passing them to a script.

# Python

### Performance tests

In [2]:
# Analytic SIS time-domain object (y = 2, 'tmax' raised to 1e7); it is the
# input of the Fw_FFT_OldReg timings in the following cells.
It = time_domain.It_AnalyticSIS(
    y=2,
    p_prec=dict(tmax=1e7),
)

The default parameters, which we will use as a baseline, are

In [3]:
# Baseline precision settings for the FFT-based F(w) computation.
# 'N_below_discard' / 'N_above_discard' count the decades thrown away and
# 'N_keep' controls the portion of each transformed chunk that is kept
# (see the discussion in the surrounding text).
p_default = dict([
    ('wmin', 1e-3),
    ('wmax', 1e3),
    ('FFT method', 'multigrid'),
    ('N_below_discard', 8),
    ('N_above_discard', 4),
    ('N_keep', 2),
])

with reference computation time:

In [4]:
# Reference timing: F(w) from the analytic SIS `It` with the baseline
# settings in `p_default`. Later cells are compared against this number.
%timeit F = freq_domain.Fw_FFT_OldReg(It, p_default)

The slowest run took 4.07 times longer than the fastest. This could mean that an intermediate result is being cached.
19.6 ms ± 7.97 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


The performance can be boosted (without affecting the precision) just by reducing the portion of the transformed signal that we keep, thus increasing the number of frequency chunks that we must use. There is a trade-off between reducing the number of points computed in each chunk (reducing 'N_keep') and the additional number of FFTs that we must compute (because the number of chunks increases), i.e. if we reduce 'N_keep' too much, the computation time increases.

In [5]:
p_prec = p_default.copy()
p_prec['N_keep'] = 1

%timeit F = freq_domain.Fw_FFT_OldReg(It, p_prec)

18.5 ms ± 11.1 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


Increasing the number of decades thrown away by 1 in the low-frequency regime increases the precision by a factor of (around) 2, while also increasing the computation time by a factor of 2.

In [6]:
p_prec = p_default.copy()
p_prec['N_above_discard'] = 9
p_prec['N_below_discard'] = 5

%timeit F = freq_domain.Fw_FFT_OldReg(It, p_prec)

28 ms ± 761 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


# C

## Testing performance

### Analytic SIS

In [7]:
# Inputs for the analytic SIS timing: impact parameter and lens normalisation.
y, psi0 = 0.3, 1

# One scalar time and a log-spaced grid of 1000 times between 1e-2 and 10.
tau_ini = 0.8
taus = np.geomspace(start=1e-2, stop=10, num=1000)

Performance for a single point, numpy arrays and numpy arrays in parallel

In [8]:
# Three timings of the same wrapper call:
#   1) a single scalar time,
#   2) the 1000-point array with parallel=False,
#   3) the 1000-point array with the default settings (the parallel case,
#      according to the description above this cell).
%timeit wrapper.pyIt_SIS(tau_ini, y, psi0)
%timeit wrapper.pyIt_SIS(taus, y, psi0, parallel=False)
%timeit wrapper.pyIt_SIS(taus, y, psi0)

8.6 µs ± 261 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
602 µs ± 8.78 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
239 µs ± 22.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### Single contour

Benchmark values (RK(8,9) integrator with tol=1e-5, compilation with `-Ofast`)

In [9]:
# Impact parameter and SIS normalisation for the contour timing.
y, psi0 = 1.1, 1

# Point (x1_min, x2_min) = (psi0 + y, 0) handed to pyContour below
# (presumably the location of the minimum -- confirm against the wrapper).
x1_min, x2_min = psi0 + y, 0

Psi = lenses.Psi_SIS(dict(psi0=psi0))

# One scalar time and a log-spaced grid of 1000 times between 0.1 and 100.
tau_ini = 0.8
taus = np.geomspace(start=0.1, stop=100, num=1000)

Performance for a single point, numpy arrays and numpy arrays in parallel

In [10]:
# Contour computation with the default method, timed for:
#   1) a single scalar time,
#   2) the 1000-point array with parallel=False,
#   3) the 1000-point array with the default settings (the parallel case,
#      according to the description above this cell).
%timeit wrapper.pyContour(tau_ini, x1_min, x2_min, y, Psi)
%timeit wrapper.pyContour(taus, x1_min, x2_min, y, Psi, parallel=False)
%timeit wrapper.pyContour(taus, x1_min, x2_min, y, Psi)

36.8 µs ± 1.15 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
22.6 ms ± 809 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
7.87 ms ± 310 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Same but now with the `'robust'` method (i.e. parametric contours)

In [11]:
# Same three timings as the previous cell (scalar, serial array, default
# array call), but selecting method='robust'.
%timeit wrapper.pyContour(tau_ini, x1_min, x2_min, y, Psi, method='robust')
%timeit wrapper.pyContour(taus, x1_min, x2_min, y, Psi, parallel=False, method='robust')
%timeit wrapper.pyContour(taus, x1_min, x2_min, y, Psi, method='robust')

79.3 µs ± 4.38 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
68.4 ms ± 2.22 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
21.7 ms ± 396 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Single integral

Benchmark values (`qags` integrator with rtol=1e-4, atol=0, sub_int_limit=1000, compilation with `-Ofast`)

In [12]:
y = 1.3

# One scalar time and a log-spaced grid of 1000 times between 1e-2 and 10.
tau_ini = 0.8
taus = np.geomspace(start=1e-2, stop=10, num=1000)

# SIS lens built with its default parameters.
Psi = lenses.Psi_SIS()

# pySingleIntegral takes p_crits as an argument, so read it off an It object
# created with eval_mode='exact' before timing the wrapper.
It = time_domain_c.It_SingleIntegral_C(Psi, y, dict(eval_mode='exact'))
p_crits = It.p_crits

Performance for a single point, numpy arrays and numpy arrays in parallel

In [13]:
# Single-integral wrapper with method='qag15', timed for:
#   1) a single scalar time,
#   2) the 1000-point array with parallel=False,
#   3) the 1000-point array with the default settings (the parallel case,
#      according to the description above this cell).
%timeit wrapper.pySingleIntegral(tau_ini, y, Psi, p_crits, method='qag15')
%timeit wrapper.pySingleIntegral(taus, y, Psi, p_crits, method='qag15', parallel=False)
%timeit wrapper.pySingleIntegral(taus, y, Psi, p_crits, method='qag15')

10.8 µs ± 302 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
2.01 ms ± 45.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
682 µs ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### Pseudo-analytic SIS

Semi-analytical formula for the SIS

In [14]:
# Frequency grid for the F(w) timings: 1000 log-spaced points in [1e-2, 1e2].
ws = np.geomspace(start=1e-2, stop=1e2, num=1000)

Strong and weak lensing tests. The `'direct'` method usually outperforms the default `'osc'` method, but it cannot be used for high frequencies.

In [15]:
y = 0.3
Fw1 = freq_domain_c.Fw_SemiAnalyticSIS_C(y)
Fw2 = freq_domain_c.Fw_SemiAnalyticSIS_C(y, {'method':'direct'})

%timeit Fw1.eval_Fw(ws)
%timeit Fw2.eval_Fw(ws)

3.21 ms ± 269 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.93 ms ± 42.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
y = 1.3
Fw1 = freq_domain_c.Fw_SemiAnalyticSIS_C(y)
Fw2 = freq_domain_c.Fw_SemiAnalyticSIS_C(y, {'method':'direct'})

%timeit Fw1.eval_Fw(ws)
%timeit Fw2.eval_Fw(ws)

13 ms ± 398 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
9.12 ms ± 90.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### General F(w)

Weak-lensing speed

In [17]:
y = 1.3
It = time_domain_c.It_AnalyticSIS_C(y, {'Nt':5000})

%timeit Fw = freq_domain_c.Fw_FFT_C(It)

740 µs ± 144 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


Strong-lensing speed

In [18]:
y = 0.3
It = time_domain_c.It_AnalyticSIS_C(y, {'Nt':5000})

%timeit Fw = freq_domain_c.Fw_FFT_C(It)

959 µs ± 38.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
