In [None]:
import numpy as np
import xarray as xr
import awkward as ak
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

from pySD.sdmout_src import pysetuptxt, supersdata

In [None]:
def sizeof_fmt(num, suffix="B"):
    """Format a number of bytes as a human-readable string using binary prefixes.

    Adapted from https://stackoverflow.com/a/1094933/16372843

    Parameters
    ----------
    num : int or float
        Size to format (negative values are supported).
    suffix : str, optional
        Text appended after the binary prefix. Default is ``"B"``.

    Returns
    -------
    str
        ``num`` scaled by powers of 1024 with one decimal place, followed by
        the matching prefix (``Ki`` ... ``Zi``, or ``Yi`` beyond that) and ``suffix``.
    """
    prefixes = ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi")
    value = num
    position = 0
    # scale down until the value drops below 1024 or the prefixes are exhausted;
    # ``not (... < ...)`` keeps the comparison semantics for NaN input
    while position < len(prefixes) and not (abs(value) < 1024.0):
        value = value / 1024.0
        position += 1
    if position < len(prefixes):
        return f"{value:3.1f}{prefixes[position]}{suffix}"
    return f"{value:.1f}Yi{suffix}"

## Load zarr dataset

In [None]:
# root directory of the simulation output analysed in this notebook
data_dir = Path(
    "/home/m/m301096/CLEO/data/output_v4.2/coalbure_condensation_large/cluster_384/"
)

# the setup and stats text files live in the ``config`` sub-directory,
# the zarr store sits directly in the output directory
config_dir = data_dir / "config"
setupfile_path = config_dir / "eurec4a1d_setup.txt"
statsfile_path = config_dir / "eurec4a1d_stats.txt"
zarr_path = data_dir / "eurec4a1d_sol.zarr"

# parse the configuration and the constants from the setup .txt file (without printing them)
setupfile_name = str(setupfile_path)
config = pysetuptxt.get_config(setupfile_name, nattrs=3, isprint=False)
consts = pysetuptxt.get_consts(setupfile_name, isprint=False)
# display(consts)
# display(config)

In [None]:
# open the zarr store (without consolidated metadata) and rename ``gbxindex`` to ``gridbox``
ds_zarr = xr.open_zarr(zarr_path, consolidated=False).rename(
    {"gbxindex": "gridbox"}
)
# round the time values to one decimal place
ds_zarr["time"] = np.round(ds_zarr["time"], 1)
ds_zarr

## Example without explanation

Goal: Extract maximum radius of superdroplets within a gridbox.

**NOTE:** The name *index* should really be thought of as **dimension** or **coordinate**

We can create a new coordinate, by converting the attribute into a coordinate (here still named index).
Only this class instance can be used as a coordinate in the dataset. 

In [None]:
# create the dataset class from the zarr data and the constants
example_dataset = supersdata.SupersDataNew(dataset=ds_zarr, consts=consts)

# attributes which shall become coordinates, in the order in which they are applied
coordinate_names = ("time", "sdgbxindex")

# first convert each of them into the data format which can be used as a coordinate ...
for coordinate_name in coordinate_names:
    example_dataset.set_attribute(
        example_dataset[coordinate_name].attribute_to_indexer_unique()
    )

# ... then make them the actual coordinates, one after the other
for coordinate_name in coordinate_names:
    example_dataset.index_by_indexer(index=example_dataset[coordinate_name])

---- Superdrop Properties -----
RHO_L = 998.203 Kg/m^3
RHO_SOL = 2016.5 Kg/m^3
MR_SOL = 0.05844277 Kg/mol
IONIC = 2.0
-------------------------------


Attribute coord1 not found in dataset
Attribute coord2 not found in dataset


The dataset will have coordinates
1. ``time``
2. ``sdgbxindex``
3. A ragged coordinate without a name. You have to know what data it contains.
   In this case, for time T and gridbox G, it contains data of all superdroplets which are within gridbox G at timestep T. 

Below an example for the ``radius`` attribute

In [None]:
# print the summary of the ``radius`` attribute of the indexed example dataset
print(example_dataset["radius"])

radius (micro-m)
1801 * 39 * var * float64


Now we can extract the maximum radius of superdroplets with the ``ak.nanmax`` function.

The output coordinates will be ``time``, ``sdgbxindex``. 

In [None]:
# apply an awkward array reduction to the last dimension in the dataset ('time', 'sdgbxindex', ...)
# NOTE: the result is the *maximum* radius (``ak.nanmax``), so the variable is named accordingly
maximum_radius_gridbox = example_dataset.attribute_to_DataArray_reduction(
    attribute_name="radius", reduction_func=ak.nanmax
)
# transpose before plotting and use a logarithmic colour scale
maximum_radius_gridbox.T.plot(norm=mcolors.LogNorm())
plt.title("Maximum radius of superdroplets")

Text(0.5, 1.0, 'Maximum radius of superdroplets')

## Background

You can see that the dataset contains a number of ``attributes``.
A better name would be ``variables``, as in xarray Datasets.

- The original variables from the zarr dataset were: ``['coord3', 'msol', 'radius', 'sdId', 'sdgbxindex', 'xi']``.
- New variables are added for convenience to this data vector of shape [6 x N], creating a vector of shape [9 x N], where N is the length of the ``superdroplets`` dimension.
    - These are ``['mass', 'mass_represented', 'time']``
    - ``mass`` ($m$) is simply the mass of each superdroplet, assuming it is a sphere.
    - ``mass_represented`` is $\xi \cdot m$

### How to deal with ``time``

The original dimension ``time`` is not very different from the other values of the given data vector.
We can add the corresponding value of ``time`` to each of the N entries along the ``superdroplets`` dimension.
Now, each value in the underlying awkward arrays contains related data.

In [None]:
# build a fresh dataset wrapper from the zarr data and the constants;
# no indexer is applied to it in this cell
dataset = supersdata.SupersDataNew(
    dataset=ds_zarr,
    consts=consts,
)

---- Superdrop Properties -----
RHO_L = 998.203 Kg/m^3
RHO_SOL = 2016.5 Kg/m^3
MR_SOL = 0.05844277 Kg/mol
IONIC = 2.0
-------------------------------


Attribute coord1 not found in dataset
Attribute coord2 not found in dataset


In [None]:
# print the summary of the dataset (its attributes and its indexes)
print(dataset)

Attributes:
--------------
sdId ()
41082535 * uint32
sdgbxindex ()
41082535 * uint32
xi ()
41082535 * float64
radius (micro-m)
41082535 * float64
msol (g)
41082535 * float64
coord3 (m)
41082535 * float64
time (s)
41082535 * float64
mass (kg)
41082535 * float64
mass_represented (kg)
41082535 * float64

Indexes:
--------------



We can also take a look at the attributes, and will see that they are all SupersAttribute.

In [None]:
# mapping of attribute name -> ``SupersAttribute`` instance
dataset.attributes

{'sdId': <pySD.sdmout_src.supersdata.SupersAttribute at 0x7ffb64766bd0>,
 'sdgbxindex': <pySD.sdmout_src.supersdata.SupersAttribute at 0x7ffb64427020>,
 'xi': <pySD.sdmout_src.supersdata.SupersAttribute at 0x7ffb40210ef0>,
 'radius': <pySD.sdmout_src.supersdata.SupersAttribute at 0x7ffb4057cd70>,
 'msol': <pySD.sdmout_src.supersdata.SupersAttribute at 0x7ffb403b0da0>,
 'coord3': <pySD.sdmout_src.supersdata.SupersAttribute at 0x7ffb647ab2c0>,
 'time': <pySD.sdmout_src.supersdata.SupersAttribute at 0x7ffb405e8b60>,
 'mass': <pySD.sdmout_src.supersdata.SupersAttribute at 0x7ffb40212300>,
 'mass_represented': <pySD.sdmout_src.supersdata.SupersAttribute at 0x7ffb4024e3f0>}

Each ``SupersAttribute`` has its data stored as an awkward array.

In [None]:
# print the summary of the ``sdId`` attribute ...
print(dataset["sdId"])
# ... and show the underlying data of the attribute with the rich notebook display
display(dataset["sdId"].data)

sdId ()
41082535 * uint32


## How to track superdroplets


If we want to track all the superdroplets, we can think of the ``sdId`` data as becoming a new dimension of the dataset, similar to a dimension of an xarray Dataset.

In [None]:
# build a separate dataset wrapper for the tracking example
dataset1 = supersdata.SupersDataNew(
    dataset=ds_zarr,
    consts=consts,
)
# the method creates a new dimension, which contains all unique values in the ``sdId`` data as coordinate values
superdroplet_dimension = dataset1["sdId"].attribute_to_indexer_unique()
print(superdroplet_dimension)

---- Superdrop Properties -----
RHO_L = 998.203 Kg/m^3
RHO_SOL = 2016.5 Kg/m^3
MR_SOL = 0.05844277 Kg/mol
IONIC = 2.0
-------------------------------


Attribute coord1 not found in dataset
Attribute coord2 not found in dataset
sdId ()
coord: [0, 1, 2, 3, 4, 5, 6, ..., 1844218, 1844219, 1844220, 1844221, 1844222, 1844223]
41082535 * uint32
41082535 * int64


This data class has an attribute ``coord``.
It is smaller than the original data, because it only contains the unique superdroplet IDs.

In [None]:
# coordinate values of the new ``sdId`` indexer
print(superdroplet_dimension.coord)

[0, 1, 2, 3, 4, 5, 6, ..., 1844218, 1844219, 1844220, 1844221, 1844222, 1844223]


We have to add this new instance to the dataset. I know it's not super clean :/

In [None]:
# hand the indexer version of ``sdId`` to the dataset
# (presumably this replaces the plain attribute of the same name -- TODO confirm)
dataset1.set_attribute(superdroplet_dimension)

After this, we can transform the whole dataset, to have the ``sdId`` as a first dimension:

In [None]:
# index the whole dataset by ``sdId``; the printed summary then lists ``sdId`` under "Indexes"
dataset1.index_by_indexer(dataset1["sdId"])
print(dataset1)

Attributes:
--------------
sdId ()
coord: [0, 1, 2, 3, 4, 5, 6, ..., 1844218, 1844219, 1844220, 1844221, 1844222, 1844223]
1844224 * var * uint32
1844224 * var * int64
sdgbxindex ()
1844224 * var * uint32
xi ()
1844224 * var * float64
radius (micro-m)
1844224 * var * float64
msol (g)
1844224 * var * float64
coord3 (m)
1844224 * var * float64
time (s)
1844224 * var * float64
mass (kg)
1844224 * var * float64
mass_represented (kg)
1844224 * var * float64

Indexes:
--------------
sdId
[0, 1, 2, 3, 4, 5, 6, ..., 1844218, 1844219, 1844220, 1844221, 1844222, 1844223]



Let's say we want to extract the trajectories of the superdroplets. We can create an xarray Dataset which contains all the information.

The ragged dimension in this case will be the ``timestep`` relative to the spawning of the superdroplet.

The output DataArray needs to be big because it is really sparse, due to the sheer number of superdroplets.
The main reason is that some of them will be present during the whole simulation, while others will only be present for a single timestep.

In [None]:
# vertical = dataset1.attribute_to_DataArray(attribute_name = 'coord3')

# print(f"The dataset has a size of {sizeof_fmt(vertical.nbytes)}")

The dataset has a size of 17.9GiB


In [None]:
# filled_precentage = (np.isfinite(vertical).sum() / np.size(vertical) * 100).data
# print(f"Only {filled_precentage} % are filled with data")

Only 1.7135635367504165 % are filled with data


In [None]:
# # get bytes in human readable form
# timestepping = 2
# nplot = 1000
# np.random.seed(234)
# plt.plot(
#     vertical['ragged_dimension_1'] * timestepping,
#     vertical.sel(sdId = np.random.choice(vertical['sdId'], nplot)).T,
#     marker = '',
# );

### Slow, so the better option would be to extract the awkward-array

In [None]:
# ``coord3`` data of the dataset indexed by ``sdId``: one ragged entry per superdroplet
trajectories = dataset1["coord3"].data

# only use superdroplets which are in the simulation for at least 10 timesteps

individual_existence_timesteps = ak.num(trajectories)

mask = individual_existence_timesteps >= 10
trajectories = trajectories[mask]

# get the number of superdroplets which fulfil this
N = ak.num(trajectories, axis=0)

# draw 100 random superdroplets (with replacement);
# the fixed seed makes the selection reproducible on "Restart & Run All"
ids = np.random.default_rng(234).integers(0, N, 100)
trajectories = trajectories[ids]

# get the longest existence time among the selected superdroplets
length = ak.max(ak.num(trajectories, axis=1))

# pad the ragged axis to a common length, so that the data can be converted to numpy
np_trajectories = ak.pad_none(trajectories, length, axis=1).to_numpy()

In [None]:
# one line per selected superdroplet (the array is transposed so that each trajectory becomes a column);
# the trailing semicolon suppresses the repr of the returned line objects
plt.plot(np_trajectories.T);

The part above could be solved, by adding a ``.sel`` and ``.where`` method to the ``SupersDataNew`` class

### Slow, so what is the good thing then in the end?

The advantage comes, when you want to perform reduction calculations on the data.

For instance, you may want to know the minimum radius which a superdroplet has during its lifetime in the simulation.

You can pass the ``ak.nanmin`` reduction function and get a DataArray containing the data.


In [None]:
# use ``dataset1``, which is indexed by ``sdId``, so the reduction is applied per superdroplet;
# ``dataset`` from the Background section never had an index applied
minimum_radius_lagrange = dataset1.attribute_to_DataArray_reduction(
    attribute_name="radius", reduction_func=ak.nanmin
)
minimum_radius_lagrange

# More powerful: Eulerian transformation with a reduction applied

Let's say we want to know what the minimum radius within each gridbox at each timestep is.

We need to have dimensions/coordinates
- time
- sdgbxindex

Then we select the radius and extract the minimum values using the ``ak.nanmin`` reduction function.

Setting the dimension/coordinate takes a while.
But getting the reduced dataset is pretty quick.

In [None]:
# separate dataset wrapper for the Eulerian (time, gridbox) view
dataset2 = supersdata.SupersDataNew(dataset=ds_zarr, consts=consts)

# attributes which shall become coordinates, in the order in which they are applied
eulerian_coordinates = ("time", "sdgbxindex")

# convert both attributes into indexers first ...
for eulerian_coordinate in eulerian_coordinates:
    dataset2.set_attribute(
        dataset2[eulerian_coordinate].attribute_to_indexer_unique()
    )

# ... and index the dataset by them afterwards, ``time`` before ``sdgbxindex``
for eulerian_coordinate in eulerian_coordinates:
    dataset2.index_by_indexer(index=dataset2[eulerian_coordinate])

---- Superdrop Properties -----
RHO_L = 998.203 Kg/m^3
RHO_SOL = 2016.5 Kg/m^3
MR_SOL = 0.05844277 Kg/mol
IONIC = 2.0
-------------------------------


Attribute coord1 not found in dataset
Attribute coord2 not found in dataset


In [None]:
# print the summary of the dataset after indexing by ``time`` and ``sdgbxindex``
print(dataset2)

Attributes:
--------------
sdId ()
1801 * 39 * var * uint32
sdgbxindex ()
coord: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ..., 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]
1801 * 39 * var * uint32
1801 * 39 * var * int64
xi ()
1801 * 39 * var * float64
radius (micro-m)
1801 * 39 * var * float64
msol (g)
1801 * 39 * var * float64
coord3 (m)
1801 * 39 * var * float64
time (s)
coord: [0, 2, 4, 6, 8, 10, ..., 3.59e+03, 3.59e+03, 3.6e+03, 3.6e+03, 3.6e+03]
1801 * 39 * var * float64
1801 * 39 * var * int64
mass (kg)
1801 * 39 * var * float64
mass_represented (kg)
1801 * 39 * var * float64

Indexes:
--------------
time
[0, 2, 4, 6, 8, 10, ..., 3.59e+03, 3.59e+03, 3.6e+03, 3.6e+03, 3.6e+03]
sdgbxindex
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ..., 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]



In [None]:
# reduce the ``radius`` attribute of the (time, sdgbxindex)-indexed dataset with ``ak.nanmin``
minimum_radius_gridbox = dataset2.attribute_to_DataArray_reduction(
    attribute_name="radius", reduction_func=ak.nanmin
)
minimum_radius_gridbox

In [None]:
# plot the transposed DataArray with a logarithmic colour scale
minimum_radius_gridbox.T.plot(norm=mcolors.LogNorm())

<matplotlib.collections.QuadMesh at 0x7ffb4025c4a0>

If we want to extract all radii present in each gridbox for each timestep, there will be a ragged dimension, which we fill up with ``nan``.

The ``ragged_dimension`` in this case will be a simple count for superdroplets in a gridbox.

The output DataArray has the dimensions:
1. time,
2. sdgbxindex,
3. ragged dimension (see above)

In [None]:
# convert ``radius`` to a DataArray without applying a reduction
# (per the notebook text, the ragged dimension is filled up with nan)
radius_gridbox = dataset2.attribute_to_DataArray(attribute_name="radius")
radius_gridbox