# `pyeumap` raster sampling benchmarks

In [1]:
import multiprocessing as mp
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd
import rasterio as rio

import warnings
warnings.filterwarnings('ignore')

from pyeumap.overlay import SpaceOverlay

# Worker count handed to `mp.Pool` and to `SpaceOverlay` in the benchmarks.
max_workers = 8

# Pilot tile directory: raster layers are searched under `images/`,
# the sample points are read from a GeoPackage next to it.
data_dir = Path('/data/work/geoharmonizer/pilot_tiles/croatia_9529')
layers_dir = data_dir / 'images'
points = gpd.read_file(data_dir / 'training_samples.gpkg')

print('sample size:', len(points.index))

sample size: 759


Serial sampling with `rasterio`:

In [2]:
def serial_sampling(points, layers_dir):
    """Sample every GeoTIFF under ``layers_dir`` at the point locations, one layer at a time.

    Parameters
    ----------
    points : geopandas.GeoDataFrame
        Point geometries; their x/y coordinates are used as sampling locations.
    layers_dir : pathlib.Path
        Directory searched recursively for ``*.tif`` files.

    Returns
    -------
    geopandas.GeoDataFrame
        Copy of ``points`` with one extra column per raster, named after the
        file stem. ``points`` itself is not modified.
    """
    coordinates = np.c_[
        points.geometry.x,
        points.geometry.y,
    ]

    results = points.copy()
    for fn in sorted(layers_dir.glob('**/*.tif')):
        # Open one dataset at a time and close it as soon as it has been
        # sampled, so the number of open file handles stays constant no
        # matter how many layers there are.
        with rio.open(fn) as src:
            # NOTE(review): ravel() flattens the per-point arrays; this lines
            # up with the rows only for single-band rasters -- confirm.
            results[fn.stem] = np.stack(src.sample(coordinates)).ravel()

    return results

# Single timed execution (1 loop, 1 run) of the serial baseline.
%timeit -n 1 -r 1 serial_sampling(points, layers_dir)

1min 48s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Parallel sampling with `rasterio`:

In [3]:
def sample_one_layer(args):
    """Sample a single raster file at the given coordinates.

    Takes one packed argument so it can be used directly with ``Pool.map``.

    Parameters
    ----------
    args : tuple
        ``(raster_path, xy)`` where ``raster_path`` is a ``pathlib.Path`` to
        the raster and ``xy`` is the coordinate array passed to
        ``DatasetReader.sample``.

    Returns
    -------
    tuple
        ``(layer_name, values)``: the file stem and the flattened samples.
    """
    raster_path, xy = args
    name = raster_path.stem
    # The context manager closes the dataset once sampling is done.
    with rio.open(raster_path) as dataset:
        sampled = dataset.sample(xy)
        values = np.stack(sampled).ravel()
    return name, values

def parallel_sampling(points, layers_dir):
    """Sample every GeoTIFF under ``layers_dir`` using a process pool, one task per layer.

    Parameters
    ----------
    points : geopandas.GeoDataFrame
        Point geometries; their x/y coordinates are used as sampling locations.
    layers_dir : pathlib.Path
        Directory searched recursively for ``*.tif`` files.

    Returns
    -------
    geopandas.GeoDataFrame
        Copy of ``points`` with one extra column per raster, named after the
        file stem. ``points`` itself is not modified.
    """
    files = sorted(layers_dir.glob('**/*.tif'))

    coordinates = np.c_[
        points.geometry.x,
        points.geometry.y,
    ]

    results = points.copy()

    # One (path, coordinates) task per raster; every task carries the full
    # coordinate array because each worker opens its own dataset.
    tasks = [(fn, coordinates) for fn in files]

    # The pool size comes from the notebook-level `max_workers` setting.
    with mp.Pool(max_workers) as pool:
        # pool.map preserves task order, so columns are added in sorted
        # file order.
        for layer_name, data in pool.map(sample_one_layer, tasks):
            results[layer_name] = data

    return results

# Single timed execution (1 loop, 1 run) of the multiprocessing variant.
%timeit -n 1 -r 1 parallel_sampling(points, layers_dir)

15.2 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Parallel sampling with `pyeumap`:

In [4]:
def pyeumap_sampling(points, layers_dir):
    """Overlay ``points`` on the rasters in ``layers_dir`` with ``pyeumap``'s ``SpaceOverlay``.

    Parameters
    ----------
    points : geopandas.GeoDataFrame
        Sample points handed to ``SpaceOverlay``.
    layers_dir : pathlib.Path
        Directory with the raster layers handed to ``SpaceOverlay``.

    Returns
    -------
    object
        Whatever ``SpaceOverlay.run()`` returns (presumably the points with
        the sampled layer values attached -- confirm against pyeumap docs).
    """
    ovr = SpaceOverlay(
        points,
        layers_dir,
        max_workers=max_workers,  # notebook-level worker count
        verbose=False,
    )
    # Hand the overlay result back instead of dropping it in an unused local.
    return ovr.run()

# Single timed execution (1 loop, 1 run) of the pyeumap overlay.
%timeit -n 1 -r 1 pyeumap_sampling(points, layers_dir)

27.9 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


`pyeumap`'s sampling optimizations incur some fixed overhead, which makes it slower than plain parallel `rasterio` sampling on small datasets (27.9 s vs. 15.2 s for the 759 points above). However, if we quadruple the sample size:

In [2]:
# Quadruple the sample by stacking four copies of the current points.
# `DataFrame.append` was removed in pandas 2.0; a single `pd.concat` yields
# the same rows in the same order as doubling twice.
# NOTE: this cell rebinds `points`, so re-running it quadruples the sample
# again -- re-run the loading cell first to get back to the original size.
points = pd.concat([points] * 4, ignore_index=True)

print('sample size:', points.index.size)

sample size: 3036


Parallel sampling with `rasterio`:

In [6]:
# Same single-run timing of the multiprocessing variant, now on the enlarged sample.
%timeit -n 1 -r 1 parallel_sampling(points, layers_dir)

53.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Parallel sampling with `pyeumap`:

In [7]:
# Same single-run timing of the pyeumap overlay, now on the enlarged sample.
%timeit -n 1 -r 1 pyeumap_sampling(points, layers_dir)


36.8 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
