In [1]:
from sim import simulation as sim
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import dask.array as da

In [2]:
# Uniform initial state: 32 coordinates all equal to 8, at time 0.
point = sim.SystemState([8] * 32, 0)
# Nudge the coordinates away from the uniform state (see the repr below).
point.perturbate()
point

coordinates: [7.96915195 8.01221088 7.99377277 8.02853586 8.02799758 7.97725926
 7.97764643 8.03018722 8.04581394 8.03759326 7.98578173 8.00009951
 8.01834629 8.0212702  7.98702508 8.00611962 8.00030832 7.95137684
 8.02728266 8.03826412 7.9864886  8.01153962 7.95753812 7.9868824
 8.04331401 8.01513781 7.98972026 8.02887301 7.98168361 8.00680987
 8.03691274 7.99361734]
time: 0

In [3]:
# Build the simulator from the initial state `point`, with the forcing set to 8.
simulator = sim.Simulator(system_state=point, forcing=8)

In [4]:
# Configure the run: total integration time, chunk length and output file.
# NOTE(review): the output path is an absolute, machine-specific location.
runner = sim.SimulationRunner(
    simulator,
    integration_time=10000,
    chunk_length=100,
    out_file='/home/cucchi/phd/prova.nc',
)

In [5]:
# Execute the configured simulation.
# NOTE(review): presumably this is what writes the `out_file` opened further down — confirm in `sim`.
runner.run()

In [15]:
# Open the simulation output as a dask-backed DataArray, chunked along 'time'.
data = xr.open_dataarray(
    '/home/cucchi/phd/prova.nc',
    chunks={'time': 100},
)

In [16]:
# Show the array summary (dimensions, chunking, coordinates).
data

<xarray.DataArray 'var' (time: 1000000, node: 32)>
dask.array<shape=(1000000, 32), dtype=float32, chunksize=(100, 32)>
Coordinates:
  * time     (time) float64 0.0 0.01 0.02 0.03 0.04 ... 1e+04 1e+04 1e+04 1e+04
  * node     (node) int32 0 1 2 3 4 5 6 7 8 9 ... 22 23 24 25 26 27 28 29 30 31

This works, but does it load the whole dataset into memory before computing the percentile?

In [17]:
%%time
xr.apply_ufunc(np.percentile, data, dask='allowed', input_core_dims=[['time']], vectorize=True, kwargs={'q':95})

CPU times: user 24 s, sys: 1.74 s, total: 25.7 s
Wall time: 12.7 s


<xarray.DataArray 'var' (node: 32)>
array([8.446218, 8.45586 , 8.387377, 8.446781, 8.490216, 8.435039, 8.481731,
       8.531218, 8.470455, 8.473259, 8.482784, 8.418666, 8.459051, 8.424546,
       8.444879, 8.445518, 8.424179, 8.470468, 8.400392, 8.403275, 8.521829,
       8.50205 , 8.442485, 8.436823, 8.538619, 8.391727, 8.433995, 8.454539,
       8.46402 , 8.409094, 8.461784, 8.424083])
Coordinates:
  * node     (node) int32 0 1 2 3 4 5 6 7 8 9 ... 22 23 24 25 26 27 28 29 30 31

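This should work, but does it load the whole dataset into memory first? EDIT: it does not work when the core dimension (`time`) is split across multiple chunks; `time` has to be rechunked into a single chunk first.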

In [18]:
%%time
quantiles = xr.apply_ufunc(np.percentile, data, dask='parallelized', vectorize=True, input_core_dims=[['time']], output_dtypes=['float'], kwargs={'q':95})
quantiles

ValueError: dimension 'time' on 0th function argument to apply_ufunc with dask='parallelized' consists of multiple chunks, but is also a core dimension. To fix, rechunk into a single dask array chunk along this dimension, i.e., ``.rechunk({'time': -1})``, but beware that this may significantly increase memory usage.

In [12]:
%%time
# Materialize `quantiles` as a NumPy array; for a dask-backed result this is
# the step that actually runs the computation.
quantiles.values

KeyboardInterrupt: 

This works, but does it load the whole dataset into memory before reducing?

In [10]:
%%time
# 95th percentile along 'time' for each node, via DataArray.reduce.
data.reduce(np.percentile, dim='time', q=95)

CPU times: user 9.69 s, sys: 744 ms, total: 10.4 s
Wall time: 7.35 s


<xarray.DataArray 'var' (node: 32)>
array([8.446218, 8.45586 , 8.387377, 8.446781, 8.490216, 8.435039, 8.481731,
       8.531218, 8.470455, 8.473259, 8.482784, 8.418666, 8.459051, 8.424546,
       8.444879, 8.445518, 8.424179, 8.470468, 8.400392, 8.403275, 8.521829,
       8.50205 , 8.442485, 8.436823, 8.538619, 8.391727, 8.433995, 8.454539,
       8.46402 , 8.409094, 8.461784, 8.424083])
Coordinates:
  * node     (node) int32 0 1 2 3 4 5 6 7 8 9 ... 22 23 24 25 26 27 28 29 30 31

This works, but it may not be needed (I can load the time series of a single node without memory issues).

In [None]:
# Time series of node 0 only, with the leftover scalar 'node' coordinate removed.
data1d = data.sel(node=0).drop('node')

In [None]:
# Inspect the underlying array object that backs the node-0 series.
data1d.data

In [None]:
# 95th percentile of the node-0 series computed by dask itself.
# NOTE(review): dask's percentile on a multi-chunk array may be approximate —
# confirm against the dask documentation before comparing with np.percentile.
da.percentile(data1d.data, 95).compute()

In [None]:
# Draw the node-0 time series on explicit axes so the figure stands alone;
# the trailing ';' keeps the list-of-Line2D repr out of the cell output.
fig, ax = plt.subplots()
data1d.plot(ax=ax)
ax.set_title('Node 0 time series');