### Problem:

- The precipitation values calculated with the Eulerian dataset approach and with the box model approach differ.
- We need to investigate why this is the case.

In [None]:
import numpy as np
import xarray as xr
from pathlib import Path
import awkward as ak
from typing import Callable, Union, Tuple

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
from sdm_eurec4a.visulization import set_custom_rcParams


from pySD.sdmout_src import sdtracing
from pySD.sdmout_src import supersdata
from pySD.sdmout_src import pygbxsdat, pysetuptxt, supersdata

# Presumably applies the project's matplotlib rcParams (helper from sdm_eurec4a.visulization).
set_custom_rcParams()
# Continuous cubehelix colormap from seaborn (as_cmap=True returns a colormap, not a colour list).
strength_cmap = sns.cubehelix_palette(start=0.5, rot=-0.5, as_cmap=True)

In [None]:
# Microphysics setup to analyse; the name is inserted into the data directory path.
# Names previously toggled in this cell: "null_microphysics", "condensation",
# "collision_condensation", "coalbure_condensation_small".
microphysic = "coalbure_condensation_large"

In [None]:
# Id of the cluster whose simulation output is analysed.
cloud_id = 141

# NOTE(review): absolute, user-specific path - the notebook only runs where this directory exists.
data_dir = Path(f"/home/m/m301096/CLEO/data/output_v4.0/{microphysic}/cluster_{cloud_id}")

# output_dir = data_dir / "processed"
# output_dir.mkdir(exist_ok=True, parents=False)

# output_path = output_dir / "eulerian_dataset.nc"
# output_path.parent.mkdir(exist_ok=True)

# Input files of this run: setup/stats text files, the zarr output and the gridbox boundaries.
setupfile_path = data_dir / "config" / "eurec4a1d_setup.txt"
statsfile_path = data_dir / "config" / "eurec4a1d_stats.txt"
zarr_path = data_dir / "eurec4a1d_sol.zarr"
gridfile_path = data_dir / "share/eurec4a1d_ddimlessGBxboundaries.dat"


# read in constants and initial setup from setup .txt file
config = pysetuptxt.get_config(str(setupfile_path), nattrs=3, isprint=False)
consts = pysetuptxt.get_consts(str(setupfile_path), isprint=False)
gridbox_dict = pygbxsdat.get_gridboxes(str(gridfile_path), consts["COORD0"], isprint=False)

# Raw model output: rename the gridbox dimension, round time to one decimal and load into memory.
ds_zarr = xr.open_zarr(zarr_path, consolidated=False)
ds_zarr = ds_zarr.rename({"gbxindex": "gridbox"})
ds_zarr["time"] = np.round(ds_zarr["time"], 1)
ds_zarr = ds_zarr.compute()


# Pre-processed Eulerian dataset. Time is rounded the same way as above
# (presumably so that the time coordinates of the datasets match on merge - TODO confirm).
ds_eulerian = xr.open_dataset(data_dir / "processed/eulerian_dataset.nc")
ds_eulerian["time"] = np.round(ds_eulerian["time"], 1)
# Replace non-positive radius bin values by 1e-3 (presumably to allow log axes - TODO confirm).
ds_eulerian["radius_bins"] = ds_eulerian["radius_bins"].where(ds_eulerian["radius_bins"] > 0, 1e-3)

# Pre-processed conservation (box model) dataset, merged with the Eulerian one into `ds`.
ds_conservation = xr.open_dataset(data_dir / "processed/conservation_dataset.nc")
ds_conservation["time"] = np.round(ds_conservation["time"], 1)
ds = xr.merge([ds_eulerian, ds_conservation])

Reading binary file:
 /home/m/m301096/CLEO/data/output_v4.0/coalbure_condensation_large/cluster_141/share/eurec4a1d_ddimlessGBxboundaries.dat


In [None]:
# Unit conversions and plot labels for `ds`.
# NOTE: the variables are rescaled in place, so this cell must run exactly once after
# the loading cell; re-running it alone applies the conversion factors a second time.

rho_water = 1000  # kg / m^3
seconds_per_hour = 3600
# Divisor for the budget terms, which are given per dT (dT = 2 s per the original comment).
budget_timestep = 2

# Liquid water content is scaled by 1e3 and labelled g/m^3.
ds["liquid_water_content"] = 1e3 * ds["liquid_water_content"]
ds["liquid_water_content"].attrs.update(
    {"units": "g/m^3", "long_name": "Liquid water content"}
)

# The last gridbox is taken as the cloud gridbox.
ds["cloud_liquid_water_content"] = ds["liquid_water_content"].isel(gridbox=-1)
ds["cloud_liquid_water_content"].attrs["long_name"] = "Cloud liquid water content"

# Human-readable names for the budget terms.
budget_labels = {
    "source": "Evaporation",
    "inflow": "Cloud Base Precipitation Flux",
    "outflow": "Surface Precipitation Flux",
}
for budget_name, budget_label in budget_labels.items():
    ds[budget_name].attrs["long_name"] = budget_label

# Evaporation rate: from kg/m^3/s to mm/h per metre (sign flipped so evaporation is positive).
ds["evaporation_rate"] = -1e3 / rho_water * ds["massdelta_condensation"] * seconds_per_hour
ds["evaporation_rate"].attrs.update({"units": "mm/h/m", "long_name": "Evaporation rate"})

# Budget terms: from kg per dT per domain area to mm/h.
for var in ("source", "inflow", "outflow", "reservoir_change"):
    # arithmetic drops the attributes, so they are saved and restored afterwards
    saved_attrs = ds[var].attrs.copy()
    mass_flux = ds[var] / budget_timestep * seconds_per_hour / ds["surface_area"]  # kg / m^2 / h
    ds[var] = 1e3 * mass_flux / rho_water  # mm / h
    ds[var].attrs.update(saved_attrs)
    ds[var].attrs["units"] = "mm/h"


# Share of the cloud-base flux that evaporates before reaching the surface, in percent.
ds["evaporation_fraction"] = -100 * ds["source"] / ds["inflow"]
ds["evaporation_fraction"].attrs.update({"units": "\\%", "long_name": "Evaporation fraction"})
ds

In [None]:
# Ratio of the mass that left the domain (summed over radius bins, gridbox 0)
# to the precipitation summed over radius bins.
mass_left_gridbox0 = ds["mass_left"].sum("radius_bins", skipna=True).sel(gridbox=0)
precipitation_all_bins = ds["precipitation"].sum("radius_bins", skipna=True)
(mass_left_gridbox0 / precipitation_all_bins).plot()

[<matplotlib.lines.Line2D at 0x7ffe3c0dca10>]

# Estimation of the precipitation and outflow with the two methods


### Eulerian

In [None]:
def ak_differentiate(sa: supersdata.SupersAttribute) -> supersdata.SupersAttribute:
    """
    Forward difference of an attribute along its last axis.

    Every entry of the result is ``next value - current value``. A NaN is appended to
    each row before differencing, so the final entry of every row is NaN and each
    difference is stored at the position of the value it starts from.

    Notes
    -----
    - The function is designed to work with awkward arrays.
    - It is meant for fairly regular arrays with at least one value per row along the
      last axis.
    - If any row is empty along the last axis, a second NaN is appended to ALL rows
      before differencing. Empty rows then yield a single NaN and every other row is
      one element longer than its input, which increases memory usage.

    Parameters
    ----------
    sa : supersdata.SupersAttribute
        The attribute to differentiate.
        Assuming it has the shape (N, M, var), the difference is taken along the last axis.

    Returns
    -------
    supersdata.SupersAttribute
        New attribute named ``<name>_difference`` with the units of ``sa``.
        The metadata is a copy of the input metadata; if it contains ``long_name``,
        " difference" is appended to it.
    """

    # The NaN has to go at the END of each row: the difference is then aligned with
    # the timestep of the original value and the evaporation fraction cannot exceed 1.
    padded = ak.concatenate([sa.data, np.nan], axis=-1)

    # After padding only originally empty rows are shorter than two entries.
    shortest_row = ak.min(ak.num(padded, axis=-1))
    if shortest_row < 2:
        padded = ak.concatenate([padded, np.nan], axis=-1)

    forward_difference = padded[..., 1:] - padded[..., :-1]

    differentiated = supersdata.SupersAttribute(
        name=sa.name + "_difference",
        data=forward_difference,
        units=sa.units,
        metadata=sa.metadata.copy(),
    )

    # Extend the long name if there is one, otherwise keep the metadata as copied.
    new_metadata = sa.metadata.copy()
    try:
        long_name = new_metadata["long_name"]
    except KeyError:
        pass
    else:
        new_metadata["long_name"] = long_name + " difference"
    differentiated.set_metadata(metadata=new_metadata)

    return differentiated


def ak_last(sa: supersdata.SupersAttribute) -> supersdata.SupersAttribute:
    """
    Keep only the final value along the last axis and replace all other values by NaN.

    Notes
    -----
    - The function is designed to work with awkward arrays.
    - The final element of every row is selected by index, so presumably every row
      needs at least one value along the last axis.

    Parameters
    ----------
    sa : supersdata.SupersAttribute
        The attribute to reduce to its last values.
        Assuming it has the shape (N, M, var), the last value is kept along the last axis.

    Returns
    -------
    supersdata.SupersAttribute
        New attribute named ``<name>_last`` with the units of ``sa``.
        Each row has the same length as the input row and is NaN everywhere except at
        the final position, which holds the last value of that row.
        The metadata is a copy of the input metadata; if it contains ``long_name``,
        " last" is appended to it.
    """

    values = sa.get_data()

    # Worked example for values = [[1, 2, 3, 4], [5, 6, 7, 8]] (n = NaN):
    #   1. NaN copy with a trailing 1      -> [[n, n, n, n, 1], [n, n, n, n, 1]]
    #   2. times the last value of the row -> [[n, n, n, n, 4], [n, n, n, n, 8]]
    #   3. drop the first entry            -> [[n, n, n, 4],    [n, n, n, 8]]
    nan_row_with_one = ak.concatenate([values * np.nan, 1], axis=-1)
    scaled_by_last = nan_row_with_one * values[..., -1]
    last_only = scaled_by_last[..., 1:]

    lasted = supersdata.SupersAttribute(
        name=sa.name + "_last",
        data=last_only,
        units=sa.units,
        metadata=sa.metadata.copy(),
    )

    # Extend the long name if there is one, otherwise keep the metadata as copied.
    new_metadata = sa.metadata.copy()
    try:
        long_name = new_metadata["long_name"]
    except KeyError:
        pass
    else:
        new_metadata["long_name"] = long_name + " last"
    lasted.set_metadata(metadata=new_metadata)

    return lasted


def create_inflow_outflow_reservoir_dataset(
    dataset: supersdata.SupersDataNew,
    dim0_name: str = "time",
    dim1_name: str = "sdgbxindex",
    attribute_names: Union[Tuple[str], None, str] = None,
) -> Tuple[
    supersdata.SupersDataSimple,
    supersdata.SupersDataSimple,
    supersdata.SupersDataSimple,
]:
    """
    Split per-superdroplet records into inflow, outflow and reservoir datasets.

    Each row of the attribute data is treated as the record sequence of one
    superdroplet (the notebook indexes ``dataset`` by ``sdId`` before calling this).
    Only rows with more than one entry are used, for example::

        [
            [0, 1, 2, 3],   -> usable
            [0, 1],         -> usable
            [0],            -> UNUSABLE
            [3, 4, 5],      -> usable
        ]

    Parameters
    ----------
    dataset : supersdata.SupersDataNew
        Dataset whose attributes hold one row per superdroplet.
    dim0_name : str, optional
        Name of the first dimension attribute. Its row lengths define which rows are
        usable, and it is always included in the output. Default is "time".
    dim1_name : str, optional
        Name of the second dimension attribute, always included in the output.
        Default is "sdgbxindex".
    attribute_names : tuple of str, str or None, optional
        Attributes to process in addition to the two dimension attributes. A single
        string is treated as one name, any other iterable of names is accepted, and
        ``None`` (default) selects all attributes of ``dataset``.

    Returns
    -------
    tuple of supersdata.SupersDataSimple
        ``(inflow, outflow, reservoir)``:

        - inflow: the SECOND entry of every usable row (the first entry is the
          initialisation, which would be in the cloud gridbox),
        - outflow: the last entry of every usable row,
        - reservoir: entries ``1:-1`` of every usable row, flattened along the last axis.

        The datasets are not indexed; indexing is left to the caller.
        Attributes are added in a deterministic order: ``attribute_names`` first, then
        ``dim0_name`` and ``dim1_name``, without duplicates.
    """

    # Normalise attribute_names to a tuple of names.
    if attribute_names is None:
        attribute_names = tuple(dataset.attributes.keys())
    elif isinstance(attribute_names, str):
        attribute_names = (attribute_names,)
    else:
        # also accepts lists, which the former `attribute_names + (...)` did not
        attribute_names = tuple(attribute_names)

    # De-duplicate while keeping insertion order. Iterating a set made the attribute
    # order of the returned datasets vary between interpreter runs.
    keys = tuple(dict.fromkeys((*attribute_names, dim0_name, dim1_name)))

    # use only the superdroplets which are present in more than one timestep
    mask = ak.num(dataset[dim0_name].data, axis=-1) > 1

    dataset_inflow = supersdata.SupersDataSimple([])
    dataset_outflow = supersdata.SupersDataSimple([])
    dataset_reservoir = supersdata.SupersDataSimple([])

    for key in keys:
        attribute = dataset[key]
        rows = attribute.data[mask]

        # second entry: the first one is the initialisation
        inflow_array = rows[:, 1]
        # last entry of every row
        outflow_array = rows[:, -1]
        # everything between the first and the last entry
        reservoir_data = ak.flatten(rows[:, 1:-1], axis=-1)

        for target, values in (
            (dataset_inflow, inflow_array),
            (dataset_outflow, outflow_array),
            (dataset_reservoir, reservoir_data),
        ):
            target.set_attribute(
                supersdata.SupersAttribute(
                    name=key, data=values, units=attribute.units, metadata=attribute.metadata
                )
            )

    return dataset_inflow, dataset_outflow, dataset_reservoir

In [None]:
sd_eulerian = supersdata.SupersDataNew(
    dataset=zarr_path,
    consts=consts,
)
sd_eulerian.flatten()

# ============
# 1. Create the necessary indexes and pass if they already exist
# ============
# make time and sdId indexers which correspond to the unique values of the attributes
# NOTE(review): a KeyError is silently ignored here; it is taken to mean "indexer
# already exists" - confirm that no other KeyError can be hidden by this.
try:
    sd_eulerian.set_attribute(sd_eulerian["time"].attribute_to_indexer_unique())
except KeyError:
    pass
try:
    sd_eulerian.set_attribute(sd_eulerian["sdId"].attribute_to_indexer_unique())
except KeyError:
    pass

# ============
# 2. Create the Lagrangian view to calculate the mass change
# ============

# bin by the superdroplet id, so that every row holds the records of one superdroplet
sd_eulerian.index_by_indexer(index=sd_eulerian["sdId"])

# time difference between consecutive records of each superdroplet
time_diff = ak_differentiate(sd_eulerian["time"])
time_diff.set_metadata(
    metadata={
        "long_name": "Time difference per timestep",
    }
)
time_diff.set_name("time_difference")
time_diff.set_units("s")

# ============
# 3. Calculate the total mass which left the domain
# ============
# Keep only the last represented mass of every superdroplet; all earlier records become NaN.
# This calls ak_last instead of repeating its body inline.
mass_left = ak_last(sd_eulerian["mass_represented"])
mass_left.set_name("mass_left")
mass_left.set_metadata(
    metadata={
        "long_name": "mass which left domain",
        "note": r"this is the last represented mass which a super droplet has during the simulation.\nMass represented by a superdroplet: $m = \xi \cdot m_{sd}$",
    }
)
mass_left.set_units("kg")
sd_eulerian.set_attribute(mass_left)

---- Superdrop Properties -----
RHO_L = 998.203 Kg/m^3
RHO_SOL = 2016.5 Kg/m^3
MR_SOL = 0.05844277 Kg/mol
IONIC = 2.0
-------------------------------
supers dataset path:  /home/m/m301096/CLEO/data/output_v4.0/coalbure_condensation_large/cluster_141/eurec4a1d_sol.zarr
Attribute coord1 not found in dataset
Attribute coord2 not found in dataset


In [None]:
# Build indexers from the unique values of gridbox index, time and superdroplet id.
# NOTE(review): the time and sdId indexers were already created in the previous cell
# (there guarded by try/except KeyError); here they are set again without a guard.
sd_eulerian.set_attribute(sd_eulerian["sdgbxindex"].attribute_to_indexer_unique())
sd_eulerian.set_attribute(sd_eulerian["time"].attribute_to_indexer_unique())
sd_eulerian.set_attribute(sd_eulerian["sdId"].attribute_to_indexer_unique())
# Bin the radius with 151 geometrically spaced values between 10 and 4e3
# (presumably bin edges in micrometres - TODO confirm).
sd_eulerian.set_attribute(
    sd_eulerian["radius"].attribute_to_indexer_binned(
        bins=np.geomspace(10, 4e3, 151), new_name="radius_bins"
    )
)

# Switch from the per-superdroplet view to the Eulerian view: time x gridbox x radius bin.
sd_eulerian.flatten()
sd_eulerian.index_by_indexer(sd_eulerian["time"])
sd_eulerian.index_by_indexer(sd_eulerian["sdgbxindex"])
sd_eulerian.index_by_indexer(sd_eulerian["radius_bins"])

# Mass which left the domain per cell, reduced with ak.nansum.
da_sd_eulerian = sd_eulerian.attribute_to_DataArray_reduction(
    attribute_name="mass_left",
    reduction_func=ak.nansum,
)
# Same reduction with ak.sum, for comparison with the NaN-ignoring version above
# (mass_left is NaN for every record except the last one of each superdroplet).
da_sd_eulerian_nan = sd_eulerian.attribute_to_DataArray_reduction(
    attribute_name="mass_left",
    reduction_func=ak.sum,
)
# Restore the per-superdroplet view (rows indexed by sdId), which later cells rely on.
sd_eulerian.flatten()
sd_eulerian.index_by_indexer(sd_eulerian["sdId"])

In [None]:
# Use the SupersDataNew class to read the dataset a second time, for the box model approach
sd_conservation = supersdata.SupersDataNew(
    dataset=zarr_path,
    consts=consts,
)
# Indexers from the unique values of superdroplet id and time.
sd_conservation.set_attribute(sd_conservation["sdId"].attribute_to_indexer_unique())
sd_conservation.set_attribute(sd_conservation["time"].attribute_to_indexer_unique())

# Index by sdId, so that every row holds the records of one superdroplet.
sd_conservation.index_by_indexer(sd_conservation["sdId"])

# Split the records into inflow, outflow and reservoir parts.
# "time" and "sdgbxindex" are added to the processed attributes by the function itself.
inflow, outflow, dt_reservoir = create_inflow_outflow_reservoir_dataset(
    dataset=sd_conservation,
    dim0_name="time",
    dim1_name="sdgbxindex",
    attribute_names=("mass_represented", "sdId"),
)
print(outflow)

---- Superdrop Properties -----
RHO_L = 998.203 Kg/m^3
RHO_SOL = 2016.5 Kg/m^3
MR_SOL = 0.05844277 Kg/mol
IONIC = 2.0
-------------------------------
supers dataset path:  /home/m/m301096/CLEO/data/output_v4.0/coalbure_condensation_large/cluster_141/eurec4a1d_sol.zarr
Attribute coord1 not found in dataset
Attribute coord2 not found in dataset
Attributes:
--------------
time (s)
144225 * float64
sdgbxindex ()
144225 * uint32
mass_represented (kg)
144225 * float64
sdId ()
144225 * uint32

Indexes:
--------------



In [None]:
# The outflow dataset is returned without indexers: create them from the unique values
# of sdId, time and gridbox index.
outflow.set_attribute(outflow["sdId"].attribute_to_indexer_unique())
outflow.set_attribute(outflow["time"].attribute_to_indexer_unique())
outflow.set_attribute(outflow["sdgbxindex"].attribute_to_indexer_unique())
# Index by sdId so that the rows can be compared with the sdId-indexed Eulerian data.
outflow.flatten()
outflow.index_by_indexer(outflow["sdId"])
print(outflow)

Attributes:
--------------
time (s)
coord: [64, 66, 68, 70, 72, 74, ..., 3.59e+03, 3.59e+03, 3.6e+03, 3.6e+03, 3.6e+03]
144225 * var * float64
144225 * var * int64
sdgbxindex ()
coord: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ..., 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
144225 * var * uint32
144225 * var * int64
mass_represented (kg)
144225 * var * float64
sdId ()
coord: [169, 216, 314, 352, 401, 425, ..., 1843164, 1843182, 1843190, 1843196, 1843198]
144225 * var * uint32
144225 * var * int64

Indexes:
--------------
sdId
[169, 216, 314, 352, 401, 425, ..., 1843164, 1843182, 1843190, 1843196, 1843198]



At first glance, the data appear to be the same.

In [None]:
# Superdroplets with more than one record - the same selection which
# create_inflow_outflow_reservoir_dataset applies before building `outflow`.
mask = ak.num(sd_eulerian["sdId"].data) > 1

# mass_left is NaN for all records of a superdroplet except its final one.
mass_left_rows = sd_eulerian["mass_left"].data[mask]
# 1 at the record which carries the final mass, NaN everywhere else
final_record_marker = mass_left_rows * 0 + 1

# Each print shows the NUMBER OF MISMATCHES between the Eulerian data and `outflow`;
# 0 means that the two agree for every superdroplet.

# 1. superdroplet ids (constant along a row, so the row mean is the id itself)
print(ak.sum(ak.mean(sd_eulerian["sdId"].data[mask], axis=1) != ak.flatten(outflow["sdId"].data)))

# 2. final mass, obtained as the NaN-ignoring sum of the row
print(
    ak.sum(
        ak.nansum(mass_left_rows, axis=1)
        != ak.flatten(outflow["mass_represented"].data)
    )
)

# 3. gridbox of the final record.
#    The mean gridbox index along the whole trajectory was compared here before, which
#    is a different quantity than the gridbox of the last record stored in `outflow`.
print(
    ak.sum(
        ak.nansum(sd_eulerian["sdgbxindex"].data[mask] * final_record_marker, axis=1)
        != ak.flatten(outflow["sdgbxindex"].data)
    )
)

# 4. final mass, obtained as the NaN-ignoring mean of the row.
#    ak.mean does not skip NaN: every row contains NaN, so the mean was NaN and the
#    `!=` comparison was true for every row, whatever the data.
print(
    ak.sum(
        ak.nanmean(mass_left_rows, axis=1)
        != ak.flatten(outflow["mass_represented"].data)
    )
)

0
0
144225
144225


In [None]:
# Split every attribute of sd_eulerian into
#   "same": superdroplets selected by `mask` (more than one record, as used for `outflow`)
#   "diff": the remaining superdroplets
not_mask = ~mask

mass_left_all = sd_eulerian["mass_left"].data

# mass_left of both groups, with boolean masks marking the entries which are not NaN
same_data = mass_left_all[mask]
same_mask = ~ak.is_none(ak.nan_to_none(same_data), axis=-1)

diff_data = mass_left_all[not_mask]
diff_mask = ~ak.is_none(ak.nan_to_none(diff_data), axis=-1)

same_sdid_attrs_list, diff_sdid_attrs_list = [], []

for key in sd_eulerian.attributes.keys():
    print(key)
    sa = sd_eulerian[key]
    for row_selection, collected in (
        (mask, same_sdid_attrs_list),
        (not_mask, diff_sdid_attrs_list),
    ):
        collected.append(
            supersdata.SupersAttribute(
                name=key,
                data=sa.data[row_selection],
                units=sa.units,
                metadata=sa.metadata,
            )
        )

same_sd_eulerian = supersdata.SupersDataSimple(same_sdid_attrs_list)
diff_sd_eulerian = supersdata.SupersDataSimple(diff_sdid_attrs_list)

sdId
sdgbxindex
xi
radius
msol
coord3
time
mass
mass_represented
mass_left
radius_bins


In [None]:
# Bring the three datasets into the same Eulerian layout (time x gridbox).
labelled_datasets = {
    "diff_sd_eulerian": diff_sd_eulerian,
    "same_sd_eulerian": same_sd_eulerian,
    "outflow": outflow,
}

# drop any existing indexing first
for sd in labelled_datasets.values():
    sd.flatten()

for name, sd in labelled_datasets.items():
    print(name)
    print(sd)
    # create the indexers from the unique values ...
    for indexer_source in ("sdId", "time", "sdgbxindex"):
        sd.set_attribute(sd[indexer_source].attribute_to_indexer_unique())
    # ... and index by time first, then by gridbox
    for indexer_name in ("time", "sdgbxindex"):
        sd.index_by_indexer(sd[indexer_name])
    print(sd)

In [None]:
def _reduce_to_named_dataarray(sd, attribute_name, reduction_func, name):
    """Reduce one attribute of ``sd`` to a DataArray and give the result a name."""
    reduced = sd.attribute_to_DataArray_reduction(
        attribute_name=attribute_name,
        reduction_func=reduction_func,
    )
    reduced.name = name
    return reduced


# Mass which left the domain, per group and per reduction function.
# The "*_nan" variant uses ak.sum instead of the NaN-ignoring ak.nansum.
da_diff_sd_eulerian = _reduce_to_named_dataarray(diff_sd_eulerian, "mass_left", ak.nansum, "diff")
da_same_sd_eulerian = _reduce_to_named_dataarray(same_sd_eulerian, "mass_left", ak.nansum, "same")
da_same_sd_eulerian_nan = _reduce_to_named_dataarray(
    same_sd_eulerian, "mass_left", ak.sum, "same_nan"
)
da_outflow = _reduce_to_named_dataarray(outflow, "mass_represented", ak.nansum, "outflow")

In [None]:
# Collect all estimates of the mass which left the domain in one dataset.
named_estimates = [
    da_diff_sd_eulerian,
    da_same_sd_eulerian,
    da_same_sd_eulerian_nan,
    da_outflow,
]
total = xr.merge(named_estimates)

# Estimates from the full Eulerian dataset (additionally resolved by radius bin).
total["eulerian"] = da_sd_eulerian
total["eulerian_nan"] = da_sd_eulerian_nan

# Missing values count as zero mass; use "gridbox" as dimension and round time to one decimal.
total = total.fillna(0).swap_dims({"sdgbxindex": "gridbox"})
total["time"] = np.round(total["time"], 1)

In [None]:
# The "same" group of the Eulerian data has to reproduce the box model outflow.
xr.testing.assert_allclose(total["same"], total["outflow"])

In [None]:
# Difference between the NaN-ignoring (ak.nansum) and the plain (ak.sum) reduction,
# summed over the radius bins. The y-range is restricted to 500 - 550.
reduction_difference = total["eulerian"] - total["eulerian_nan"]
reduction_difference.sum("radius_bins", skipna=False).plot()
plt.ylim(500, 550)

(500.0, 550.0)

In [None]:
# Overlay all estimates of the mass which left the domain for gridbox 0.
# Plot order: eulerian, outflow, same, eulerian_nan and finally mass_left from `ds`.
estimates_from_total = [
    total["eulerian"].sum("radius_bins"),
    total["outflow"],
    total["same"],
    total["eulerian_nan"].sum("radius_bins"),
]
for estimate in estimates_from_total:
    estimate.sel(gridbox=0).plot()
ds["mass_left"].sum("radius_bins").sel(gridbox=0).plot()

[<matplotlib.lines.Line2D at 0x7ffe2b97dac0>]

In [None]:
# Compare the box model outflow (sign flipped) with the outflow summed over all gridboxes.
# The two sums are accumulated independently, so exact `==` on floats can fail because of
# rounding alone. A relative tolerance is used instead, in line with the
# xr.testing.assert_allclose check earlier in this notebook.
box_model_outflow = -ds_conservation["outflow"]
summed_outflow = total["outflow"].sum("gridbox")
relative_tolerance = 1e-5
abs(box_model_outflow - summed_outflow) <= relative_tolerance * abs(summed_outflow)