## Zonal Module -- Stats method

### Details

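This notebook benchmarks `xrspatial.zonal.stats` on NumPy-backed (CPU) and CuPy-backed (GPU) rasters. It builds an `H x W` values raster and a zones raster divided into a `zH x zW` grid of rectangular zones, then times `stats` on both backends twice: first with the default statistics, then with custom statistic functions. Each run ends with a table comparing CPU time, GPU time and the resulting speedup.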

In [2]:
# importing modules
import os
import xarray as xr
import time
import cupy
import numpy as np
from xrspatial.utils import ngjit

from xrspatial.zonal import stats
from xrspatial.utils import has_cuda
from xrspatial.utils import doesnt_have_cuda

In [3]:
# function to allocate array to the requested backend
def create_arr(data=None, H=10, W=10, backend='numpy', chunks=(10, 10)):
    """Wrap ``data`` in a 2-D ``xr.DataArray`` held by the requested backend.

    Parameters
    ----------
    data : array-like, optional
        Raster values. When ``None``, an ``H`` x ``W`` array of float32
        zeros is allocated instead.
    H, W : int, optional
        Height and width of the zero-filled raster. Only used when ``data``
        is ``None``.
    backend : {'numpy', 'cupy', 'dask'}, optional
        Array library that should hold the raster data.
    chunks : tuple of int, optional
        Chunk shape handed to ``dask.array.from_array`` for the 'dask'
        backend. The default keeps the previously hard-coded (10, 10).

    Returns
    -------
    xr.DataArray
        Raster with dims ``('y', 'x')``.

    Raises
    ------
    AssertionError
        If ``backend`` is not one of the supported names.
    """
    supported_backends = ('numpy', 'cupy', 'dask')
    assert backend in supported_backends, (
        'backend must be one of {}, got {!r}'.format(supported_backends, backend))

    if data is None:
        data = np.zeros((H, W), dtype=np.float32)
    raster = xr.DataArray(data, dims=['y', 'x'])

    if 'cupy' in backend:
        if has_cuda():
            import cupy
            raster.data = cupy.asarray(raster.data)
        else:
            # The raster stays NumPy-backed in this case. Say so explicitly,
            # otherwise a "GPU" benchmark would silently measure the CPU.
            import warnings
            warnings.warn(
                "backend='cupy' requested but CUDA is not available; "
                "returning a NumPy-backed raster instead",
                RuntimeWarning, stacklevel=2)

    if 'dask' in backend:
        import dask.array as da
        raster.data = da.from_array(raster.data, chunks=chunks)

    return raster

In [4]:
# Configurable input parameters.
#   W:  raster width, in cells
#   H:  raster height, in cells
#   zH: number of zones along the height (rows of the zone grid)
#   zW: number of zones along the width (columns of the zone grid)
# Each zone spans (H // zH) x (W // zW) cells, so pick H and W as multiples of
# zH and zW; any leftover rows/columns would otherwise stay in zone 0.
W = 3000
H = 3000
zH = 4
zW = 4


In [5]:
# values rasters: cells numbered 0 .. H*W-1 row by row, one raster per backend
cell_values = np.arange(H * W, dtype=float).reshape(H, W)
values = xr.DataArray(cell_values)
values_numpy = create_arr(values, backend='numpy')
values_cupy = create_arr(values, backend='cupy')

# zones rasters: a zH x zW grid of rectangular zones, each hstep x wstep cells,
# labelled 0 .. zH*zW-1 in row-major order
hstep = H // zH
wstep = W // zW
zone_ids = np.zeros((H, W))
for zone_id in range(zH * zW):
    # position of this zone inside the zone grid
    zone_row, zone_col = divmod(zone_id, zW)
    row_span = slice(zone_row * hstep, (zone_row + 1) * hstep)
    col_span = slice(zone_col * wstep, (zone_col + 1) * wstep)
    zone_ids[row_span, col_span] = zone_id
zones = xr.DataArray(zone_ids)
print('Zones: ', zones)

zones_numpy = create_arr(zones, backend='numpy')
zones_cupy = create_arr(zones, backend='cupy')

Zones:  <xarray.DataArray (dim_0: 3000, dim_1: 3000)>
array([[ 0.,  0.,  0., ...,  3.,  3.,  3.],
       [ 0.,  0.,  0., ...,  3.,  3.,  3.],
       [ 0.,  0.,  0., ...,  3.,  3.,  3.],
       ...,
       [12., 12., 12., ..., 15., 15., 15.],
       [12., 12., 12., ..., 15., 15., 15.],
       [12., 12., 12., ..., 15., 15., 15.]])
Dimensions without coordinates: dim_0, dim_1


### First Run: default stat funcs

In [6]:
# Run numpy stats, using the default stat functions (mean, max, min, sum, std, var, count)
numpy_time = %timeit -o numpy_stats = stats(zones=zones_numpy, values=values_numpy)

1.64 s ± 21.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
#print('Numpy results:', numpy_stats)

In [8]:
# Run cupy stats, using the default stat functions (mean, max, min, sum, std, var, count)
cupy_time = %timeit -o cupy_stats = stats(zones=zones_cupy, values=values_cupy)

220 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
#print('Cupy results:', cupy_stats)

In [10]:
# Prepare the results for reporting: per-loop mean and standard deviation
# (in seconds) of the CPU and GPU timings collected by %timeit above.
(mean_numpy_time, std_numpy_time), (mean_cupy_time, std_cupy_time) = [
    (np.mean(timing.all_runs) / timing.loops,
     np.std(timing.all_runs) / timing.loops)
    for timing in (numpy_time, cupy_time)
]

# how many times faster the GPU run was than the CPU run
speedup = mean_numpy_time / mean_cupy_time

report_header = 'HxW      zHxzW    CPU Time (sec)     GPU Time (sec)    Speedup GPU'
report_row = '{}x{}  {}x{}  {:.3f} ± {:.3f}    {:.3f} ± {:.3f}    {:.2f}x'
print(report_header)
print(report_row.format(
    H, W, zH, zW,
    mean_numpy_time, std_numpy_time,
    mean_cupy_time, std_cupy_time,
    speedup))

HxW      zHxzW    CPU Time (sec)     GPU Time (sec)    Speedup GPU
3000x3000  4x4  1.642 ± 0.021    0.220 ± 0.000    7.48x


### Second Run: custom stat funcs

In [11]:
# define the custom functions for the numpy call
# this function is optimized with numba
@ngjit
def numpyl2normKernel(arr):
    """Return the L2 norm of ``arr``: the square root of the sum of its squared elements.

    Written as an explicit element-by-element loop and decorated with
    ``ngjit`` rather than expressed through NumPy reductions.
    """
    sum_of_squares = 0
    for element in arr:
        sum_of_squares = sum_of_squares + element * element
    return np.sqrt(sum_of_squares)

# Custom statistics for the NumPy run: statistic name -> function of an array.
numpy_custom_stats = dict(
    # twice the plain sum
    double_sum=lambda val: 2 * val.sum(),
    # L2 norm written with NumPy reductions
    l2norm=lambda val: np.sqrt((val * val).sum()),
    # the same norm computed by the jitted loop defined in this cell
    l2normKernel=lambda val: numpyl2normKernel(val),
)

In [12]:
# define the custom functions for the cupy call
# L2 norm written as a cupy.ReductionKernel, a semi-manual way to define a CUDA kernel:
# each element is squared (map_expr), the squares are added up (reduce_expr), and the
# square root of the total is written to the output (post_map_expr).
cupyl2normKernel = cupy.ReductionKernel(
    in_params='T x',
    out_params='float64 y',
    map_expr='x*x',
    reduce_expr='a+b',
    post_map_expr='y = sqrt(a)',
    identity='0',
    name='l2normKernel',
)

# Custom statistics for the CuPy run (same statistic names as the NumPy versions):
# statistic name -> function of an array.
cupy_custom_stats = {
    'double_sum': lambda val: val.sum()*2,
    # the same kernel can be implemented in a much more compact way too;
    # cupy.sqrt is called directly instead of relying on np.sqrt being
    # dispatched to CuPy for a CuPy array
    'l2norm': lambda val: cupy.sqrt(cupy.sum(val * val)),
    # L2 norm computed by the hand-written ReductionKernel
    'l2normKernel': lambda val: cupyl2normKernel(val)
}

In [13]:
# Run numpy stats, using the the custom stat functions
numpy_time = %timeit -o numpy_stats = stats(zones=zones_numpy, values=values_numpy, stats_funcs=numpy_custom_stats)

1.6 s ± 2.83 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
#print('Numpy results:', numpy_stats)

In [15]:
# Run cupy stats, using the the custom stat functions
cupy_time = %timeit -o cupy_stats = stats(zones=zones_cupy, values=values_cupy, stats_funcs=cupy_custom_stats)

31.4 ms ± 7.58 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
#print('Cupy results:', cupy_stats)

In [17]:
# Prepare the results for reporting: per-loop mean and standard deviation
# (in seconds) of the CPU and GPU timings collected by %timeit above.
(mean_numpy_time, std_numpy_time), (mean_cupy_time, std_cupy_time) = [
    (np.mean(timing.all_runs) / timing.loops,
     np.std(timing.all_runs) / timing.loops)
    for timing in (numpy_time, cupy_time)
]

# how many times faster the GPU run was than the CPU run
speedup = mean_numpy_time / mean_cupy_time

report_header = 'HxW      zHxzW    CPU Time (sec)     GPU Time (sec)    Speedup GPU'
report_row = '{}x{}  {}x{}  {:.3f} ± {:.3f}    {:.3f} ± {:.3f}    {:.2f}x'
print(report_header)
print(report_row.format(
    H, W, zH, zW,
    mean_numpy_time, std_numpy_time,
    mean_cupy_time, std_cupy_time,
    speedup))

HxW      zHxzW    CPU Time (sec)     GPU Time (sec)    Speedup GPU
3000x3000  4x4  1.604 ± 0.003    0.031 ± 0.000    51.07x


In [18]:
# Show GPU platform used for benchmarking, so the timings above can be put in context.
# `nvidia-smi -L` prints one line per visible GPU (model name and UUID).
!nvidia-smi -L

GPU 0: Tesla V100S-PCIE-32GB (UUID: GPU-85a4a9af-34fa-d34e-ff49-a574a53d2c85)
