# GCM Filters Scaling Benchmark

Run on Casper full node (36 cores)

In [1]:
import xarray as xr
xr.__version__

'0.18.2'

In [2]:
#from coiled import performance_report
from dask.distributed import performance_report

In [3]:
import dask
dask.__version__

'2021.06.2'

In [4]:
import numpy as np
np.__version__

'1.18.2'

In [6]:
import cupy as cp
cp.__version__

ImportError: CuPy is not correctly installed.

If you are using wheel distribution (cupy-cudaXX), make sure that the version of CuPy you installed matches with the version of CUDA on your host.
Also, confirm that only one CuPy package is installed:
  $ pip freeze

If you are building CuPy from source, please check your environment, uninstall CuPy and reinstall it with:
  $ pip install cupy --no-cache-dir -vvvv

Check the Installation Guide for details:
  https://docs-cupy.chainer.org/en/latest/install.html

original error: libcublas.so.10: cannot open shared object file: No such file or directory

In [5]:
import dask.array as dsa

In [6]:
import gcm_filters as gf
gf.__version__

'0.1'

In [7]:
from contextlib import contextmanager
import time
import pandas as pd

class DiagnosticTimer:
    def __init__(self):
        self.diagnostics = []

    @contextmanager
    def time(self, **kwargs):
        tic = time.time()
        yield
        toc = time.time()
        kwargs["runtime"] = toc - tic
        self.diagnostics.append(kwargs)

    @property
    def df(self):
        return pd.DataFrame(self.diagnostics)

In [8]:
def make_data(shape, chunks, gpu=False):
    nt, ny, nx = shape
    da = xr.DataArray(dsa.random.random(shape, chunks=chunks), dims=['time', 'y', 'x'])
    mask_data = dsa.ones((ny, nx))
    mask_data[(ny // 4):(3 * ny // 4), (nx // 4):(3 * nx // 4)] = 0
    wet_mask = xr.DataArray(mask_data, dims=['y', 'x'])
    
    da_masked = da.where(wet_mask)
    
    if gpu:
        raise NotImplementedError("Can't get cupy working :(")

    filter = gf.Filter(
        filter_scale=4,
        dx_min=1,
        filter_shape=gf.FilterShape.TAPER,
        grid_type=gf.GridType.REGULAR_WITH_LAND,
        grid_vars={'wet_mask': wet_mask}
    )
    filter

    da_filtered = filter.apply(da_masked, dims=['y', 'x'])
    return da_masked, da_filtered

In [9]:
from dask.distributed import Client, LocalCluster

In [17]:
# strong scaling - problem size stays the same

shape = 360, 1024, 1024
chunks = (10,) + shape[1:]

unfiltered, filtered = make_data(shape, chunks)

diag_timer_strong = DiagnosticTimer()

for threads_per_worker in [3, 9, 36]:
    max_workers = 36 // threads_per_worker
    worker_step = max(max_workers // 4, 1)
    cluster = LocalCluster(threads_per_worker=threads_per_worker, n_workers=1)
    client = Client(cluster)
    for nworkers in [1] + list(range(worker_step, max_workers + 1, worker_step)):
        cluster.scale(nworkers)
        client.wait_for_workers(nworkers)
        assert len(client.ncores()) == nworkers
        ncores = sum(client.ncores().values())
        details = dict(ncores=ncores, nworkers=nworkers, shape=shape, chunks=chunks,
                       nbytes=filtered.data.nbytes, dtype=str(filtered.dtype))
        with diag_timer_strong.time(operation='unfiltered_mean', **details):
            unfiltered.data.mean().compute()
        with diag_timer_strong.time(operation='filtered_mean', **details):
            filtered.data.mean().compute()
        print(diag_timer_strong.df.iloc[-2:])
    client.close()
    cluster.close()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 33998 instead
  expected, actual


         operation  ncores  nworkers              shape            chunks  \
0  unfiltered_mean       3         1  (360, 1024, 1024)  (10, 1024, 1024)   
1    filtered_mean       3         1  (360, 1024, 1024)  (10, 1024, 1024)   

       nbytes    dtype    runtime  
0  3019898880  float64   1.928494  
1  3019898880  float64  86.083285  
         operation  ncores  nworkers              shape            chunks  \
2  unfiltered_mean       9         3  (360, 1024, 1024)  (10, 1024, 1024)   
3    filtered_mean       9         3  (360, 1024, 1024)  (10, 1024, 1024)   

       nbytes    dtype    runtime  
2  3019898880  float64   0.806149  
3  3019898880  float64  31.352030  
         operation  ncores  nworkers              shape            chunks  \
4  unfiltered_mean      18         6  (360, 1024, 1024)  (10, 1024, 1024)   
5    filtered_mean      18         6  (360, 1024, 1024)  (10, 1024, 1024)   

       nbytes    dtype    runtime  
4  3019898880  float64   0.563238  
5  3019898880  f

Perhaps you already have a cluster running?
Hosting the HTTP server on port 39942 instead
  expected, actual


          operation  ncores  nworkers              shape            chunks  \
10  unfiltered_mean       9         1  (360, 1024, 1024)  (10, 1024, 1024)   
11    filtered_mean       9         1  (360, 1024, 1024)  (10, 1024, 1024)   

        nbytes    dtype    runtime  
10  3019898880  float64   0.826083  
11  3019898880  float64  33.627329  
          operation  ncores  nworkers              shape            chunks  \
12  unfiltered_mean       9         1  (360, 1024, 1024)  (10, 1024, 1024)   
13    filtered_mean       9         1  (360, 1024, 1024)  (10, 1024, 1024)   

        nbytes    dtype    runtime  
12  3019898880  float64   0.701733  
13  3019898880  float64  33.094337  
          operation  ncores  nworkers              shape            chunks  \
14  unfiltered_mean      18         2  (360, 1024, 1024)  (10, 1024, 1024)   
15    filtered_mean      18         2  (360, 1024, 1024)  (10, 1024, 1024)   

        nbytes    dtype    runtime  
14  3019898880  float64   0.491663  

Perhaps you already have a cluster running?
Hosting the HTTP server on port 37744 instead
  expected, actual


          operation  ncores  nworkers              shape            chunks  \
20  unfiltered_mean      36         1  (360, 1024, 1024)  (10, 1024, 1024)   
21    filtered_mean      36         1  (360, 1024, 1024)  (10, 1024, 1024)   

        nbytes    dtype    runtime  
20  3019898880  float64   0.527528  
21  3019898880  float64  18.852935  
          operation  ncores  nworkers              shape            chunks  \
22  unfiltered_mean      36         1  (360, 1024, 1024)  (10, 1024, 1024)   
23    filtered_mean      36         1  (360, 1024, 1024)  (10, 1024, 1024)   

        nbytes    dtype    runtime  
22  3019898880  float64   0.420205  
23  3019898880  float64  18.078998  


In [19]:
# weak scaling - problem size scales with number of cores

diag_timer_weak = DiagnosticTimer()

for threads_per_worker in [3, 9, 36]:
    max_workers = 36 // threads_per_worker
    worker_step = max(max_workers // 4, 1)
    cluster = LocalCluster(threads_per_worker=threads_per_worker, n_workers=1)
    client = Client(cluster)
    for nworkers in [1] + list(range(worker_step, max_workers + 1, worker_step)):
        cluster.scale(nworkers)
        client.wait_for_workers(nworkers)
        assert len(client.ncores()) == nworkers
        ncores = sum(client.ncores().values())
        
        shape = ncores * 10 * 4, 1024, 1024
        chunks = (10,) + shape[1:]
        unfiltered, filtered = make_data(shape, chunks)

        details = dict(ncores=ncores, nworkers=nworkers, shape=shape, chunks=chunks,
                       nbytes=filtered.data.nbytes, dtype=str(filtered.dtype))
        with diag_timer_weak.time(operation='unfiltered_mean', **details):
            unfiltered.data.mean().compute()
        with diag_timer_weak.time(operation='filtered_mean', **details):
            filtered.data.mean().compute()
        print(diag_timer_weak.df.iloc[-2:])
    client.close()
    cluster.close()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 41355 instead
  expected, actual


         operation  ncores  nworkers              shape            chunks  \
0  unfiltered_mean       3         1  (120, 1024, 1024)  (10, 1024, 1024)   
1    filtered_mean       3         1  (120, 1024, 1024)  (10, 1024, 1024)   

       nbytes    dtype    runtime  
0  1006632960  float64   0.755714  
1  1006632960  float64  29.135221  
         operation  ncores  nworkers              shape            chunks  \
2  unfiltered_mean       9         3  (360, 1024, 1024)  (10, 1024, 1024)   
3    filtered_mean       9         3  (360, 1024, 1024)  (10, 1024, 1024)   

       nbytes    dtype    runtime  
2  3019898880  float64   0.811971  
3  3019898880  float64  30.836951  
         operation  ncores  nworkers              shape            chunks  \
4  unfiltered_mean      18         6  (720, 1024, 1024)  (10, 1024, 1024)   
5    filtered_mean      18         6  (720, 1024, 1024)  (10, 1024, 1024)   

       nbytes    dtype    runtime  
4  6039797760  float64   0.823065  
5  6039797760  f

Perhaps you already have a cluster running?
Hosting the HTTP server on port 34211 instead
  expected, actual


          operation  ncores  nworkers              shape            chunks  \
10  unfiltered_mean       9         1  (360, 1024, 1024)  (10, 1024, 1024)   
11    filtered_mean       9         1  (360, 1024, 1024)  (10, 1024, 1024)   

        nbytes    dtype    runtime  
10  3019898880  float64   0.842147  
11  3019898880  float64  33.507577  
          operation  ncores  nworkers              shape            chunks  \
12  unfiltered_mean       9         1  (360, 1024, 1024)  (10, 1024, 1024)   
13    filtered_mean       9         1  (360, 1024, 1024)  (10, 1024, 1024)   

        nbytes    dtype    runtime  
12  3019898880  float64   0.714265  
13  3019898880  float64  32.872067  
          operation  ncores  nworkers              shape            chunks  \
14  unfiltered_mean      18         2  (720, 1024, 1024)  (10, 1024, 1024)   
15    filtered_mean      18         2  (720, 1024, 1024)  (10, 1024, 1024)   

        nbytes    dtype    runtime  
14  6039797760  float64   0.857916  

Perhaps you already have a cluster running?
Hosting the HTTP server on port 45430 instead
  expected, actual


          operation  ncores  nworkers               shape            chunks  \
20  unfiltered_mean      36         1  (1440, 1024, 1024)  (10, 1024, 1024)   
21    filtered_mean      36         1  (1440, 1024, 1024)  (10, 1024, 1024)   

         nbytes    dtype    runtime  
20  12079595520  float64   1.381825  
21  12079595520  float64  72.642258  
          operation  ncores  nworkers               shape            chunks  \
22  unfiltered_mean      36         1  (1440, 1024, 1024)  (10, 1024, 1024)   
23    filtered_mean      36         1  (1440, 1024, 1024)  (10, 1024, 1024)   

         nbytes    dtype    runtime  
22  12079595520  float64   1.128636  
23  12079595520  float64  72.032173  


In [20]:
from datetime import datetime

now = datetime.now().isoformat()[:19]
diag_timer_weak.df.to_csv(f'data/scaling_weak_cpu_{now}.csv', index=False)
diag_timer_strong.df.to_csv(f'data/scaling_strong_cpu_{now}.csv', index=False)

In [10]:
# performance report

threads_per_worker = 9
nworkers = 4

cluster = LocalCluster(threads_per_worker=threads_per_worker, n_workers=1)
client = Client(cluster)
cluster.scale(nworkers)
client.wait_for_workers(nworkers)
assert len(client.ncores()) == nworkers
ncores = sum(client.ncores().values())

In [11]:
shape = ncores * 10 * 4, 1024, 1024
chunks = (10,) + shape[1:]
unfiltered, filtered = make_data(shape, chunks)

In [12]:
with performance_report("performance_reports/unfiltered_mean_36_cores_4_workers.html"):
    unfiltered.data.mean().compute()

In [13]:
with performance_report("performance_reports/filtered_mean_36_cores_4_workers.html"):
    filtered.data.mean().compute()