In [None]:
import warnings
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import xarray as xr
from numpy.random import default_rng
from scipy.optimize import least_squares, Bounds
from typing import List, Tuple, Dict, TypedDict, Union, Callable

from sdm_eurec4a.reductions import mean_and_stderror_of_mean
from sdm_eurec4a.visulization import set_custom_rcParams, adjust_lightness_array, label_from_attrs
from sdm_eurec4a.identifications import match_clouds_and_cloudcomposite, select_individual_cloud_by_id
from sdm_eurec4a.conversions import msd_from_psd_dataarray, psd_from_msd_dataarray

warnings.filterwarnings("ignore")

default_colors = set_custom_rcParams()
dark_colors = adjust_lightness_array(default_colors, 0.7)

In [None]:
def create_variance_field(
    y: np.ndarray,
    variance: Union[None, float, int, np.ndarray] = None,
    variance_scale: float = 0.01,
    variance_minimal: float = 1e30,
) -> Union[np.ndarray, float]:
    """
    Create the variance used to weight the residuals of a fit.

    Parameters:
    -----------
    y : np.ndarray
        The data the fit is compared against.
    variance : Union[None, float, int, np.ndarray], optional
        If None (default), the variance is derived from the data as
        ``|variance_scale * y / nanstd(y)|``; entries that come out as exactly
        zero are replaced by the smallest variance found where ``y != 0``.
        If a number or an array, it is returned unchanged.
    variance_scale : float, optional
        Scaling factor applied to the normalised data when ``variance`` is None.
        Default is 0.01.
    variance_minimal : float, optional
        Currently unused; kept so existing callers that pass it keep working.

    Returns:
    --------
    Union[np.ndarray, float]
        The variance, either computed from ``y`` or the value passed in.

    Raises:
    -------
    TypeError
        If ``variance`` is neither None, a number nor a numpy array.
    """

    # ``is None`` (not ``== None``): comparing an array with ``==`` is
    # element-wise and cannot be used as an ``if`` condition.
    if variance is None:
        # scale the variance with the magnitude of the (normalised) data
        var = np.abs(variance_scale * (y / np.nanstd(y)))
        # zero data would give zero variance and thus a division by zero in
        # the cost function; use the smallest non-zero-data variance instead
        var_nozero = np.where(y != 0, var, np.nan)
        var_truemin = np.nanmin(var_nozero)
        return np.where(var != 0, var, var_truemin)

    # numpy scalar types are accepted alongside the builtin numbers
    if isinstance(variance, (np.ndarray, int, float, np.integer, np.floating)):
        return variance

    raise TypeError(
        f"The variance parameter must be either None, a float or a numpy array.\nBut is of type: {type(variance)}."
    )

## Distributions to be used for fit
### LnNormal

In [None]:
class LnParams(TypedDict):
    """Keyword arguments of ``ln_normal_distribution`` (everything except ``t``)."""

    # centre of the distribution; its logarithm is the mean in log space
    mu1: float
    # width parameter; its logarithm is the standard deviation in log space
    sigma1: float
    # multiplicative factor applied to the normalised Gaussian
    scale_factor1: float


def ln_normal_distribution(
    t: np.ndarray,
    mu1: float,
    sigma1: float,
    scale_factor1: float,
) -> np.ndarray:
    """
    Evaluate a single log-normal mode at the positions ``t``.

    Parameters:
    -----------
    t : np.ndarray
        Positions (e.g. radii) at which the distribution is evaluated.
    mu1 : float
        Centre of the mode; ``log(mu1)`` is the mean in log space.
    sigma1 : float
        Width of the mode; ``log(sigma1)`` is the standard deviation in log space.
    scale_factor1 : float
        Factor multiplying the normalised Gaussian.

    Returns:
    --------
    np.ndarray
        The distribution values, with the same shape (and container type) as ``t``.
    """

    sigtilda = np.log(sigma1)
    mutilda = np.log(mu1)

    norm = scale_factor1 / (np.sqrt(2 * np.pi) * sigtilda)
    exponent = -((np.log(t) - mutilda) ** 2) / (2 * sigtilda**2)

    dn_dlnr = norm * np.exp(exponent)  # eq.5.8 [lohmann intro 2 clouds]

    # ``t * 0`` keeps the shape/container of ``t``. The sum is built out of
    # place because an in-place ``+=`` cannot store floats in an integer array.
    return t * 0 + dn_dlnr


def ln_normal_distribution_cost(
    x: Tuple[float, float, float],
    t: np.ndarray,
    y: np.ndarray,
    variance: Union[None, np.ndarray, float] = None,
    variance_scale: float = 0.01,
    variance_minimal: float = 1e-12,
) -> np.ndarray:
    """
    Residual vector of the single log-normal fit, as expected by ``least_squares``.

    ``x`` holds ``(mu1, sigma1, scale_factor1)``. The difference between the
    model evaluated at ``t`` and the data ``y`` is divided by the square root
    of the variance returned by ``create_variance_field`` and flattened.
    """

    residual = ln_normal_distribution(t, *x) - y
    sigma = np.sqrt(create_variance_field(y, variance, variance_scale, variance_minimal))

    return np.ravel(residual / sigma)

### Double LnNormal

In [None]:
class DoubleLnParams(TypedDict):
    """Keyword arguments of ``double_ln_normal_distribution`` (everything except ``t``)."""

    # first mode: centre, width and scale (logarithms of mu/sigma are used in log space)
    mu1: float
    sigma1: float
    scale_factor1: float
    # second mode: same meaning as the first three entries
    mu2: float
    sigma2: float
    scale_factor2: float


def double_ln_normal_distribution(
    t: np.ndarray,
    mu1: float,
    sigma1: float,
    scale_factor1: float,
    mu2: float,
    sigma2: float,
    scale_factor2: float,
) -> np.ndarray:
    """
    Evaluate the sum of two log-normal modes at the positions ``t``.

    Parameters:
    -----------
    t : np.ndarray
        Positions (e.g. radii) at which the distribution is evaluated.
    mu1, mu2 : float
        Centres of the modes; their logarithms are the means in log space.
    sigma1, sigma2 : float
        Widths of the modes; their logarithms are the standard deviations in log space.
    scale_factor1, scale_factor2 : float
        Factors multiplying the normalised Gaussian of each mode.

    Returns:
    --------
    np.ndarray
        The summed distribution, with the same shape (and container type) as ``t``.
    """

    # ``t * 0`` keeps the shape/container of ``t``
    result = t * 0

    modes = (
        (mu1, sigma1, scale_factor1),
        (mu2, sigma2, scale_factor2),
    )
    for mu, sigma, scale_factor in modes:
        sigtilda = np.log(sigma)
        mutilda = np.log(mu)

        norm = scale_factor / (np.sqrt(2 * np.pi) * sigtilda)
        exponent = -((np.log(t) - mutilda) ** 2) / (2 * sigtilda**2)

        dn_dlnr = norm * np.exp(exponent)  # eq.5.8 [lohmann intro 2 clouds]

        # accumulate out of place: an in-place ``+=`` cannot store floats in
        # an integer-dtype array and would raise for integer ``t``
        result = result + dn_dlnr

    return result


def double_ln_normal_distribution_cost(
    x: Tuple[float, float, float, float, float, float],
    t: np.ndarray,
    y: np.ndarray,
    variance: Union[None, float, int, np.ndarray] = None,
    variance_scale: float = 0.01,
    variance_minimal: float = 1e-12,
) -> np.ndarray:
    """
    Residual vector of the double log-normal fit, as expected by ``least_squares``.

    ``x`` holds the six parameters in the order taken by
    ``double_ln_normal_distribution``. The model-minus-data difference is
    divided by the square root of the variance from ``create_variance_field``
    and flattened.
    """

    residual = double_ln_normal_distribution(t, *x) - y
    sigma = np.sqrt(create_variance_field(y, variance, variance_scale, variance_minimal))

    return np.ravel(residual / sigma)

### Gamma distribution

In [None]:
class GammaParams(TypedDict):
    """Keyword arguments of ``gamma_distribution`` (everything except ``radius``)."""

    # exponent applied to the diameter (2 * radius)
    shape: float
    # used both as prefactor and as exponential decay rate in the diameter
    slope: float
    # constant offset added to the result
    intercept: float


def gamma_distribution(
    radius: np.ndarray,  # radius
    shape: float,
    slope: float,
    intercept: float,
) -> np.ndarray:
    """
    Gamma-shaped size distribution expressed in the diameter ``D = 2 * radius``.

    Returns ``intercept + slope * D**shape * exp(-slope * D)``.
    """

    diameter = 2 * radius
    decay = np.exp(-slope * diameter)

    return intercept + slope * diameter**shape * decay


def gamma_distribution_cost(
    x: Tuple[float, float, float],
    t: np.ndarray,  # radius,
    y: np.ndarray,
    variance: Union[None, float, int, np.ndarray] = None,
    variance_scale: float = 0.01,
    variance_minimal: float = 1e-12,
) -> np.ndarray:
    """
    Residual vector of the gamma fit, as expected by ``least_squares``.

    ``x`` holds ``(shape, slope, intercept)``; ``t`` is the radius. The
    model-minus-data difference is divided by the square root of the variance
    from ``create_variance_field`` and flattened.
    """

    residual = gamma_distribution(t, *x) - y
    sigma = np.sqrt(create_variance_field(y, variance, variance_scale, variance_minimal))

    return np.ravel(residual / sigma)


def gamma_distribution_stats(
    t: Union[np.ndarray, xr.DataArray],
    a: float,
    loc: float,
    scale: float,
    N0: float,
) -> np.ndarray:
    """
    Gamma pdf from ``scipy.stats`` rescaled so that its maximum over ``t`` equals ``N0``.

    ``a``, ``loc`` and ``scale`` are passed straight to ``stats.gamma.pdf``.
    The result is added to ``t * 0`` so it takes the container of ``t``.
    """

    pdf = stats.gamma.pdf(t, a=a, loc=loc, scale=scale)
    # rescale so the peak value over the sampled positions is N0
    rescaled = N0 / np.max(pdf) * pdf

    return t * 0 + rescaled


def gamma_distribution_stats_cost(
    x: np.ndarray,
    t: np.ndarray,
    y: np.ndarray,
    variance: Union[None, float, int, np.ndarray] = None,
    variance_scale: float = 0.01,
    variance_minimal: float = 1e-12,
) -> np.ndarray:
    """
    Residual vector of the scipy-gamma fit, as expected by ``least_squares``.

    ``x`` must unpack into exactly ``(a, loc, scale, N0)``. The
    model-minus-data difference is divided by the square root of the variance
    from ``create_variance_field`` and flattened.
    """

    a, loc, scale, N0 = x
    residual = gamma_distribution_stats(t, a, loc, scale, N0) - y
    sigma = np.sqrt(create_variance_field(y, variance, variance_scale, variance_minimal))

    return np.ravel(residual / sigma)

## Generate example for the fitting of the 2LnNormal

In [None]:
# Module-level random generator; later cells call ``rng.choice`` on it to pick clouds.
# NOTE(review): created without a seed, so those picks differ between kernel runs.
rng = default_rng()


def gen_data(
    t: np.ndarray,
    func: Callable,
    func_args: Tuple = (),
    func_kwargs: Union[dict, TypedDict] = dict(),
    noise=0.0,
    n_outliers=0,
    seed=None,
):
    """
    Evaluate ``func`` on ``t`` and add Gaussian noise with a few amplified outliers.

    The noise is ``noise`` times standard-normal samples. At ``n_outliers``
    randomly drawn positions (drawn with replacement) the noise is additionally
    multiplied by ``sqrt(t)``. ``seed`` is forwarded to ``default_rng``.
    """
    generator = default_rng(seed)

    clean = func(t, *func_args, **func_kwargs)

    # draw order matters for reproducibility: first the noise, then the outlier positions
    perturbation = noise * generator.standard_normal(t.size)
    picked = generator.integers(0, t.size, n_outliers)
    perturbation[picked] = np.sqrt(t[picked]) * perturbation[picked]

    return clean + perturbation

In [None]:
# Ground-truth parameters of the synthetic double log-normal distribution.
params = DoubleLnParams(
    mu1=1e-2,
    sigma1=2,
    scale_factor1=5,
    mu2=0.5e1,
    sigma2=3,
    scale_factor2=2,
)

# NOTE: t_min and t_max are not used below; the grids are built with np.logspace.
t_min = 0.1
t_max = 10
n_points = 40
n_outliers = 5

# Noisy training samples of the true distribution.
t_train = np.logspace(-3, 2, n_points)
m_train = gen_data(
    t=t_train,
    func=double_ln_normal_distribution,
    func_kwargs=params,
    noise=0.2,
    n_outliers=n_outliers,
    seed=42,
)


# Initial guess, ordered as (mu1, sigma1, scale1, mu2, sigma2, scale2).
x0 = np.array([1e-1, 2.0, 1.0, 10.0, 2.0, 1.0])


bounds = Bounds(
    lb=[1e-10, 1e-10, -np.inf, 2e-2, 1e-10, -np.inf],
    ub=[5e-1, np.inf, np.inf, np.inf, np.inf, np.inf],
    keep_feasible=[True, True, True, False, True, True],
)


# Same fit with three loss functions, to compare their sensitivity to outliers.
res_lsq = least_squares(double_ln_normal_distribution_cost, x0, bounds=bounds, args=(t_train, m_train))

res_soft_l1 = least_squares(
    double_ln_normal_distribution_cost,
    x0,
    loss="soft_l1",
    f_scale=0.1,
    bounds=bounds,
    args=(t_train, m_train),
)

res_log = least_squares(
    double_ln_normal_distribution_cost,
    x0,
    loss="cauchy",
    f_scale=0.1,
    bounds=bounds,
    args=(t_train, m_train),
)


# Noise-free curves on a finer grid: the truth and the three fitted models.
t_test = np.logspace(-5, 2, n_points * 10)
m_true = gen_data(
    t=t_test,
    func=double_ln_normal_distribution,
    func_kwargs=params,
)

m_lsq = gen_data(t=t_test, func=double_ln_normal_distribution, func_args=res_lsq.x)
m_soft_l1 = gen_data(t=t_test, func=double_ln_normal_distribution, func_args=res_soft_l1.x)
m_log = gen_data(t=t_test, func=double_ln_normal_distribution, func_args=res_log.x)
plt.plot(t_train, m_train, "o")
plt.plot(t_test, m_true, "k", linewidth=2, label="true", linestyle="-")
plt.plot(t_test, m_lsq, label="linear loss", linestyle="--")
plt.plot(t_test, m_soft_l1, label="soft_l1 loss", linestyle="-.")
# an empty linestyle draws nothing; use a dotted line so the cauchy fit is visible
plt.plot(t_test, m_log, label="cauchy loss", linestyle=":")
plt.xlabel("t")
plt.ylabel("y")
plt.legend()
plt.xscale("log")

# Applying the ``least_squares`` method to the cloud composite dataset

In [None]:
from sdm_eurec4a import RepositoryPath
from pathlib import Path

data_dir = RepositoryPath("nils_private").get_data_dir()

# Cloud composite measurements and the table of identified clouds.
cloud_composite = xr.open_dataset(
    data_dir / Path("observation/cloud_composite/processed/cloud_composite_SI_units_20241025.nc")
)
identified_clouds = xr.open_dataset(
    data_dir
    / Path(
        "observation/cloud_composite/processed/identified_clusters/identified_clusters_rain_mask_5.nc"
    )
)
# index the clouds by their id instead of by time
identified_clouds = identified_clouds.swap_dims({"time": "cloud_id"})

# ``radius`` itself is left untouched; ``radius_micro`` is the same axis scaled
# by 1e6. The "µm" unit therefore belongs on ``radius_micro`` — attaching it to
# the unscaled ``radius`` would mislabel that axis.
attrs = cloud_composite["radius"].attrs.copy()
attrs.update({"units": "µm"})
cloud_composite["radius_micro"] = 1e6 * cloud_composite["radius"]
cloud_composite["radius_micro"].attrs = attrs

# radius broadcast along time, so it can be paired element-wise with the 2D distributions
cloud_composite["radius2D"] = cloud_composite["radius"].expand_dims(time=cloud_composite["time"])
cloud_composite = cloud_composite.transpose("radius", ...)


# cloud_composite = cloud_composite.sel(radius = slice(10, None))

# keep clouds lasting at least 3 s with an altitude strictly between 500 and 1200
identified_clouds = identified_clouds.where(
    (
        (identified_clouds.duration.dt.seconds >= 3)
        & (identified_clouds.altitude < 1200)
        & (identified_clouds.altitude > 500)
    ),
    drop=True,
)

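If we want to coarsen the results, we need to make sure to apply the coarsening to the **non**-normalized data (i.e. multiplied by the bin width), so that the sums are conserved.
Afterwards, we can normalize the data again by dividing by the new bin width.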

In [None]:
radius_split = 95e-6  # 95 µm: radii from here upwards are coarsened
coarsen_factor = 3  # number of neighbouring radius bins merged into one


coarse_composite = cloud_composite.sel(radius=slice(radius_split, None)).copy()

# make sure to have non normalized data to be coarsened
# otherwise, the sum will not be conserved
coarse_composite["particle_size_distribution"] = (
    coarse_composite["particle_size_distribution"] * coarse_composite["bin_width"]
)
coarse_composite["mass_size_distribution"] = (
    coarse_composite["mass_size_distribution"] * coarse_composite["bin_width"]
)

# use mean for radius and radius2D
coarse_composite_radius = coarse_composite["radius"].coarsen(radius=coarsen_factor).mean()
coarse_composite_radius2D = coarse_composite["radius2D"].coarsen(radius=coarsen_factor).mean()
# use the sum for the rest (this also sums bin_width, giving the width of the merged bins)
# NOTE(review): every other radius-dependent variable (e.g. radius_micro) is summed here too — confirm intended
coarse_composite = coarse_composite.coarsen(radius=coarsen_factor).sum()

coarse_composite["radius"] = coarse_composite_radius
coarse_composite["radius2D"] = coarse_composite_radius2D
coarse_composite["diameter"] = 2 * coarse_composite["radius"]

# make sure to have normalized data again
coarse_composite["particle_size_distribution"] = (
    coarse_composite["particle_size_distribution"] / coarse_composite["bin_width"]
)
coarse_composite["mass_size_distribution"] = (
    coarse_composite["mass_size_distribution"] / coarse_composite["bin_width"]
)

# the attributes are set again explicitly after the arithmetic above
coarse_composite["particle_size_distribution"].attrs = dict(
    long_name="Number concentration",
    unit=cloud_composite["particle_size_distribution"].attrs["unit"],
)
coarse_composite["mass_size_distribution"].attrs = dict(
    long_name="Mass concentration",
    unit=cloud_composite["mass_size_distribution"].attrs["unit"],
)

# merge the two composites with higher resolution at small radii
# and lower resolution at large radii
# NOTE(review): both slices use radius_split as a bound — confirm no bin sits exactly on it
coarse_composite = xr.merge(
    [
        coarse_composite.sel(radius=slice(radius_split, None)),
        cloud_composite.sel(radius=slice(None, radius_split)),
    ]
)


# Test liquid water content is conserved
np.testing.assert_allclose(
    (coarse_composite["bin_width"] * coarse_composite["mass_size_distribution"]).sum("radius"),
    (cloud_composite["bin_width"] * cloud_composite["mass_size_distribution"]).sum("radius"),
    rtol=0.001,
)
# Test particle concentration is conserved
np.testing.assert_allclose(
    (coarse_composite["bin_width"] * coarse_composite["particle_size_distribution"]).sum("radius"),
    (cloud_composite["bin_width"] * cloud_composite["particle_size_distribution"]).sum("radius"),
    rtol=0.001,
)

In [None]:
def plot_distributions(dataset, axs):
    """
    Plot the particle and mass size distributions of ``dataset`` on ``axs[0]`` and ``axs[1]``.

    For every id in the global ``cloud_ids`` the time mean of the matching
    measurements is drawn with error bars of twice the second value returned
    by ``mean_and_stderror_of_mean``. The time mean over the whole dataset is
    drawn in black with a shaded band of the same width. All axes in ``axs``
    are set to log-log scale.
    """
    panels = (
        (axs[0], "particle_size_distribution"),
        (axs[1], "mass_size_distribution"),
    )

    # per-cloud means, one colour per cloud
    for i, cloud_id in enumerate(cloud_ids):
        cloud = identified_clouds.sel(cloud_id=cloud_id)
        ds = match_clouds_and_cloudcomposite(cloud, dataset)
        for panel, variable in panels:
            m, v = mean_and_stderror_of_mean(ds[variable], dims=("time",))
            panel.errorbar(
                x=m["radius"],
                xerr=0,
                y=m,
                yerr=2 * v,
                label=f"cloud {cloud_id}",
                color=default_colors[i],
                marker=".",
                linestyle="None",
            )

    # mean over the full dataset with its uncertainty band
    for panel, variable in panels:
        m, v = mean_and_stderror_of_mean(dataset[variable], dims=("time",))
        panel.plot(m.radius, m, label="mean", color="k", zorder=10)
        panel.fill_between(
            m.radius,
            m - 2 * v,
            m + 2 * v,
            alpha=0.5,
            color="k",
            label="2 std",
            zorder=10,
        )

    for _ax in axs:
        _ax.set_xscale("log")
        _ax.set_yscale("log")

    axs[0].set_title("particle size distribution")
    axs[1].set_title("mass size distribution")

In [None]:
# np.random.seed only seeds NumPy's legacy global generator. ``rng`` is a
# separate ``Generator`` and ignores it, so it is re-created with a fixed seed
# here to make the selected clouds reproducible.
np.random.seed(42)
rng = default_rng(42)
cloud_ids = rng.choice(identified_clouds["cloud_id"], 2, replace=False)


# top row: original resolution, bottom row: coarsened composite
fig, axss = plt.subplots(nrows=2, ncols=2, figsize=(10, 6), sharex=True)

plot_distributions(cloud_composite, axss[0])
plot_distributions(coarse_composite, axss[1])

axss[0, 0].legend(loc="upper right", fontsize="small")

# radius grid of both composites, to show where the coarsening applies
fig, ax = plt.subplots()

ax.scatter(
    cloud_composite["radius"],
    cloud_composite["radius"],
    label="original",
    color="k",
    marker=".",
)
ax.scatter(
    coarse_composite["radius"],
    coarse_composite["radius"],
    label="coarse",
    color="r",
    marker=".",
)
# the scatter labels are only shown once a legend is drawn
ax.legend();

<matplotlib.collections.PathCollection at 0x246801c3c70>

In [None]:
class LeastSquareFit:
    """
    Bundle a model, its cost function and training data for ``scipy.optimize.least_squares``.

    Parameters:
    -----------
    name : str
        Label of the setup (used in plot legends).
    func : Callable
        Model called as ``func(t, *params)``.
    cost_func : Callable
        Residual function called by ``least_squares`` as ``cost_func(params, t, y, ...)``.
    x0 : np.ndarray
        Initial parameter guess.
    bounds : Bounds
        Parameter bounds forwarded to ``least_squares``.
    t_train, y_train : Union[np.ndarray, xr.DataArray]
        Training positions and values; they are flattened before fitting.
    fit_kwargs : Dict, optional
        Extra keyword arguments for ``least_squares``. Defaults to no extra arguments.
    plot_kwargs : Dict, optional
        Keyword arguments intended for plotting the fit. Defaults to none.
    """

    def __init__(
        self,
        name: str,
        func: Callable,
        cost_func: Callable,
        x0: np.ndarray,
        bounds: Bounds,
        t_train: Union[np.ndarray, xr.DataArray],
        y_train: Union[np.ndarray, xr.DataArray],
        fit_kwargs: Union[Dict, None] = None,
        plot_kwargs: Union[Dict, None] = None,
    ):

        self.name = name
        self.func = func
        self.cost_func = cost_func
        self.x0 = x0
        self.bounds = bounds
        self.t_train = t_train
        self.y_train = y_train
        # Each instance gets its own dict: a ``dict()`` default would be one
        # object shared by all instances, and copying also decouples the
        # instance from the caller's dict.
        self.fit_kwargs = dict(fit_kwargs) if fit_kwargs is not None else {}
        self.plot_kwargs = dict(plot_kwargs) if plot_kwargs is not None else {}
        self.fit_result = None

    def fit(self, repetitions: int = 1):
        """
        Run ``least_squares``; with ``repetitions > 1`` each run restarts from the previous solution.

        Returns the result of the last run, which is also stored in ``self.fit_result``.

        Raises:
        -------
        ValueError
            If ``repetitions`` is smaller than 1 (no fit would be performed).
        """

        if repetitions < 1:
            raise ValueError(f"repetitions must be at least 1, but is {repetitions}.")

        # the flattened training data does not change between repetitions
        t_flat = np.ravel(self.t_train)
        y_flat = np.ravel(self.y_train)

        x0 = self.x0
        for _ in range(repetitions):
            self.fit_result = least_squares(
                self.cost_func,
                x0=x0,
                bounds=self.bounds,
                args=(t_flat, y_flat),
                **self.fit_kwargs,
            )
            # the next repetition starts from the solution just found
            x0 = self.fit_result.x

        return self.fit_result

    def predict(
        self, t_test: Union[np.ndarray, xr.DataArray]
    ) -> Tuple[Union[np.ndarray, xr.DataArray], Union[np.ndarray, xr.DataArray]]:
        """
        Evaluate the fitted model at ``t_test``.

        Returns ``(t_test, y_test)``; both are also stored on the instance.

        Raises:
        -------
        RuntimeError
            If ``fit`` has not been called yet.
        """
        if self.fit_result is None:
            raise RuntimeError(f"The fit '{self.name}' has no result yet. Call fit() before predict().")

        self.t_test = t_test
        self.y_test = self.func(self.t_test, *self.fit_result.x)

        return self.t_test, self.y_test

## Apply the fitting to the dataset 

### Adapt start to end radii range

##### Double LnNormal

In [None]:
# Pick one identified cloud at random and select its matching measurements.
cloud_id = rng.choice(identified_clouds["cloud_id"])

train_data = match_clouds_and_cloudcomposite(
    identified_clouds.sel(cloud_id=cloud_id),
    coarse_composite,
)

t_train = train_data["radius2D"]
y_train = train_data["particle_size_distribution"]
m_train = train_data["mass_size_distribution"]
w_train = train_data["bin_width"]
lwc_train = train_data["liquid_water_content"].mean("time")

# restrict the evaluation grid to the radii where we have data
radii_measured = train_data.max("time")["particle_size_distribution"] > 0
end = train_data["radius"].where(radii_measured).max("radius")
start = train_data["radius"].where(radii_measured).min("radius")

# create a log spaced array of radii
r = np.geomspace(start, end, 1000)
t_test = xr.DataArray(data=r, coords={"radius": r}, dims=["radius"])
# Bin width by central differences: (r[i+1] - r[i-1]) / 2. Without the factor
# 0.5 every interval is counted twice and sums over ``m * w`` come out doubled.
w_test = 0.5 * (t_test - t_test.shift(radius=2)).shift(radius=-1)
# the two shifts leave NaN at both ends of the grid; fill them by extrapolation
w_test = w_test.interpolate_na("radius", method="linear", fill_value="extrapolate")

# Time-mean training data shared by all setups below.
radius_mean = t_train.mean("time")
psd_mean = y_train.mean("time")
msd_mean = m_train.mean("time")

# --- fits to the particle size distribution (PSD) ---
x0_psd = np.array([3e-6, 2, 1e10, 200e-6, 2, 1e6])
bounds_psd = Bounds(
    # mu1, sig1, sc1, mu2, sig2, sc2
    lb=[1e-6, 1.1, 1e7, 200e-6, 1.1, 1e0],
    ub=[10e-6, 3.0, 1e13, 0.5e-3, 3.0, 1e8],
)
double_ln_psd = dict(
    func=double_ln_normal_distribution,
    cost_func=double_ln_normal_distribution_cost,
    x0=x0_psd,
    bounds=bounds_psd,
    t_train=radius_mean,
    y_train=psd_mean,
)

# data-derived variance ("vari") versus constant variance of 1 ("nova")
lqs_psd = LeastSquareFit(
    name="PSD vari",
    fit_kwargs=dict(loss="linear"),
    plot_kwargs=dict(color=default_colors[0], linestyle="-"),
    **double_ln_psd,
)
lqs_psd_var1 = LeastSquareFit(
    name="PSD nova",
    fit_kwargs=dict(loss="linear", kwargs=dict(variance=1)),
    plot_kwargs=dict(color=dark_colors[0], linestyle="--"),
    **double_ln_psd,
)
lsq_gamma = LeastSquareFit(
    name="Gamma",
    fit_kwargs=dict(loss="linear"),
    func=gamma_distribution,
    cost_func=gamma_distribution_cost,
    x0=[1, 1, 1],
    bounds=Bounds(
        lb=[0, 0, 0],
        ub=[np.inf, np.inf, np.inf],
    ),
    t_train=radius_mean,
    y_train=psd_mean,
    plot_kwargs=dict(color=default_colors[3], linestyle="-"),
)

setups_psd = [lqs_psd, lqs_psd_var1, lsq_gamma]

# --- fits to the mass size distribution (MSD) ---
x0_msd = np.array([3e-6, 2, 1e-1, 300e-6, 2, 1e0])
bounds_msd = Bounds(
    lb=[1e-6, 1.1, 1e-3, 200e-6, 1.3, 1e-3],
    ub=[10e-6, 4.0, 1e2, 0.5e-3, 3.0, 1e1],
)
double_ln_msd = dict(
    func=double_ln_normal_distribution,
    cost_func=double_ln_normal_distribution_cost,
    x0=x0_msd,
    bounds=bounds_msd,
    t_train=radius_mean,
    y_train=msd_mean,
)

lqs_msd = LeastSquareFit(
    name="MSD vari",
    fit_kwargs=dict(loss="linear"),
    plot_kwargs=dict(color=default_colors[1], linestyle="-"),
    **double_ln_msd,
)
lqs_msd_var1 = LeastSquareFit(
    name="MSD nova",
    fit_kwargs=dict(loss="linear", kwargs=dict(variance=1)),
    plot_kwargs=dict(color=dark_colors[1], linestyle="-"),
    **double_ln_msd,
)
softl1_msd = LeastSquareFit(
    name="MSD Soft",
    fit_kwargs=dict(loss="soft_l1", f_scale=0.1, kwargs=dict(variance=1)),
    plot_kwargs=dict(color=default_colors[2], linestyle="-"),
    **double_ln_msd,
)

setups_msd = [lqs_msd, lqs_msd_var1, softl1_msd]
setups = setups_psd + setups_msd

# run every setup once
for doublefit in setups:
    doublefit.fit()


# --- Figure: measurements and fitted curves (top: PSD, bottom: MSD) ---
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8, 4), sharex=True)
ax0, ax1 = axs

# individual samples in grey, their time mean in black
for panel, values in ((ax0, y_train), (ax1, m_train)):
    panel.scatter(t_train, values, marker="o", color="grey")
    panel.scatter(t_train.mean("time"), values.mean("time"), marker="o", color="k")

# PSD fits: the prediction is the PSD, the MSD is derived from it
for fit in setups_psd:
    t, w = t_test, w_test

    t, y = fit.predict(t)
    m = msd_from_psd_dataarray(y)
    lwc_psd = 1e3 * (m * w).sum("radius").values

    curve_label = f"{fit.name} LWC:{lwc_psd:.2f}"
    ax0.plot(t, y, label=curve_label, **fit.plot_kwargs)
    ax1.plot(t, m, label=curve_label, **fit.plot_kwargs)

# MSD fits: the prediction is the MSD, the PSD is derived from it
for fit in setups_msd:
    t, w = t_test, w_test

    t, m = fit.predict(t)
    y = psd_from_msd_dataarray(m)
    lwc_msd = 1e3 * (m * w).sum("radius").values

    curve_label = f"{fit.name} LWC:{lwc_msd:.2f}"
    ax0.plot(t, y, label=curve_label, **fit.plot_kwargs)
    ax1.plot(t, m, label=curve_label, **fit.plot_kwargs)


fig.suptitle(f"Fits for cloud {cloud_id}: Desired={lwc_train.values:.2f}", fontsize="medium")

for _ax in axs:
    _ax.set_xscale("log")
    _ax.set_xlim(1e-6, 0.8e-1)

# symlog keeps zero values visible on an otherwise logarithmic axis
ax0.set_yscale("symlog", linthresh=1.1e3, linscale=0.1)
ax1.set_yscale("symlog", linthresh=1.1e-7, linscale=0.1)

ax0.set_ylim(1e-12, 1e12)
ax1.set_ylim(1e-12, 0.5e2)
ax0.legend(fontsize="small", loc="center right")
ax1.legend(fontsize="small", loc="center right")

<matplotlib.legend.Legend at 0x24683418160>

##### Easy single log normal diff

In [None]:
# liquid water content of every identified cloud divided by its duration in seconds
lwc_per_second = identified_clouds["liquid_water_content"] / identified_clouds["duration"].dt.seconds
lwc_per_second.plot()

[<matplotlib.lines.Line2D at 0x246858c3e50>]

# Meeting
## Show the comparison of the fit to PSD and MSD

In [None]:
# Draw a random cloud among those whose liquid water content per second of
# duration exceeds 0.4.
# (alternative criteria tried before: threshold > 1, or duration > 40 s)
lwc_per_second = identified_clouds["liquid_water_content"] / identified_clouds["duration"].dt.seconds
candidate_clouds = identified_clouds.where(lwc_per_second > 0.4, drop=True)
cloud_id = rng.choice(candidate_clouds["cloud_id"])
# cloud_id = 362 # weird fit
# cloud_id = 193  # good fit
# # cloud_id = 216 # sparse data
# cloud_id = 230  # low cutoff range
# cloud_id = 361 # low LWC range
# cloud_id = 220 # high LWC and good MSD fit
# cloud_id = 58  # very high LWC
# cloud_id = 21

In [None]:
# Select the measurements matching the cloud chosen above.
train_data = match_clouds_and_cloudcomposite(
    identified_clouds.sel(cloud_id=cloud_id),
    coarse_composite,
)

t_train = train_data["radius2D"]
y_train = train_data["particle_size_distribution"]
m_train = train_data["mass_size_distribution"]
w_train = train_data["bin_width"]
lwc_train = train_data["liquid_water_content"].mean("time")

# restrict the evaluation grid to the radii where we have data ...
radii_measured = train_data.max("time")["particle_size_distribution"] > 0
end = train_data["radius"].where(radii_measured).max("radius")
start = train_data["radius"].where(radii_measured).min("radius")
# ... and extend it by 50 % beyond the largest measured radius
end = 1.5 * end
# create a log spaced array of radii
r = np.geomspace(start, end, 1000)
t_test = xr.DataArray(data=r, coords={"radius": r}, dims=["radius"])
# Bin width by central differences: (r[i+1] - r[i-1]) / 2. Without the factor
# 0.5 every interval is counted twice and sums over ``m * w`` come out doubled.
w_test = 0.5 * (t_test - t_test.shift(radius=2)).shift(radius=-1)
# the two shifts leave NaN at both ends of the grid; fill them by extrapolation
w_test = w_test.interpolate_na("radius", method="linear", fill_value="extrapolate")

# Time-mean training data shared by both setups below.
radius_mean = t_train.mean("time")
psd_mean = y_train.mean("time")
msd_mean = m_train.mean("time")

# Only the constant-variance (variance=1) double log-normal fits are compared
# here. The other variants tried earlier (PSD "nova" duplicate, variance-weighted
# MSD "vari", soft-L1 MSD) are disabled for this comparison.

# --- fit to the particle size distribution (PSD) ---
x0_psd = np.array([3e-6, 2, 1e10, 200e-6, 2, 1e6])
bounds_psd = Bounds(
    # mu1, sig1, sc1, mu2, sig2, sc2
    lb=[1e-6, 1.1, 1e7, 200e-6, 1.1, 1e0],
    ub=[10e-6, 3.0, 1e13, 0.5e-3, 3.0, 1e8],
)
lqs_psd = LeastSquareFit(
    name="PSD",
    func=double_ln_normal_distribution,
    cost_func=double_ln_normal_distribution_cost,
    x0=x0_psd,
    bounds=bounds_psd,
    t_train=radius_mean,
    y_train=psd_mean,
    fit_kwargs=dict(loss="linear", kwargs=dict(variance=1)),
    plot_kwargs=dict(color=default_colors[0], linestyle="-"),
)

setups_psd = [lqs_psd]

# --- fit to the mass size distribution (MSD) ---
x0_msd = np.array([3e-6, 2, 1e-1, 300e-6, 2, 1e0])
bounds_msd = Bounds(
    lb=[1e-6, 1.1, 1e-3, 200e-6, 1.3, 1e-3],
    ub=[10e-6, 3.0, 1e2, 0.5e-3, 3.0, 1e1],
)
lqs_msd_var1 = LeastSquareFit(
    name="MSD nova",
    func=double_ln_normal_distribution,
    cost_func=double_ln_normal_distribution_cost,
    x0=x0_msd,
    bounds=bounds_msd,
    t_train=radius_mean,
    y_train=msd_mean,
    fit_kwargs=dict(loss="linear", kwargs=dict(variance=1)),
    plot_kwargs=dict(color=dark_colors[1], linestyle="-"),
)

setups_msd = [
    lqs_msd_var1,
]
setups = setups_psd + setups_msd

# each fit is restarted 10 times from its previous solution
for doublefit in setups:
    doublefit.fit(repetitions=10)


# --- Figure: measurements and fitted curves (top: PSD, bottom: MSD) ---
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 6), sharex=True)
ax0, ax1 = axs

# individual samples in grey, their time mean in black
for panel, values in ((ax0, y_train), (ax1, m_train)):
    panel.scatter(t_train, values, marker=".", color="grey", alpha=0.5)
    panel.scatter(t_train.mean("time"), values.mean("time"), marker="o", color="k")

# PSD fits: the prediction is the PSD, the MSD is derived from it
for fit in setups_psd:
    t, w = t_test, w_test

    t, y = fit.predict(t)
    m = msd_from_psd_dataarray(y)
    lwc_psd = 1e3 * (m * w).sum("radius").values

    curve_label = f"{fit.name} LWC:{lwc_psd:.2f}"
    ax0.plot(t, y, label=curve_label, linewidth=2, **fit.plot_kwargs)
    ax1.plot(t, m, label=curve_label, linewidth=2, **fit.plot_kwargs)

# MSD fits: the prediction is the MSD, the PSD is derived from it
for fit in setups_msd:
    t, w = t_test, w_test

    t, m = fit.predict(t)
    y = psd_from_msd_dataarray(m)
    lwc_msd = 1e3 * (m * w).sum("radius").values

    curve_label = f"{fit.name} LWC:{lwc_msd:.2f}"
    ax0.plot(t, y, label=curve_label, linewidth=2, **fit.plot_kwargs)
    ax1.plot(t, m, label=curve_label, linewidth=2, **fit.plot_kwargs)


fig.suptitle(f"Fits for cloud {cloud_id}: Desired={lwc_train.values:.2f}", fontsize="medium")

for _ax in axs:
    _ax.set_xscale("log")
    _ax.set_xlim(1e-6, 3.5e-3)

# symlog keeps zero values visible on an otherwise logarithmic axis
ax0.set_yscale("symlog", linthresh=1.1e3, linscale=0.1)
ax1.set_yscale("symlog", linthresh=1.1e-7, linscale=0.1)

ax0.set_ylim(1e-12, 1e14)
ax1.set_ylim(1e-12, 0.5e2)
ax0.legend(fontsize="small", loc="upper center")
ax1.legend(fontsize="small", loc="upper center")

ax0.set_ylabel(label_from_attrs(y_train, linebreak=True), fontsize="small")
ax1.set_ylabel(label_from_attrs(m_train, linebreak=True), fontsize="small")

Text(0, 0.5, 'Mass concentration\n$\\left[  kg m^{-3} m^{-1}  \\right]$')

In [None]:
# Select the measurements matching the currently chosen cloud.
train_data = match_clouds_and_cloudcomposite(
    identified_clouds.sel(cloud_id=cloud_id),
    coarse_composite,
)

t_train = train_data["radius2D"]
y_train = train_data["particle_size_distribution"]
m_train = train_data["mass_size_distribution"]
w_train = train_data["bin_width"]
lwc_train = train_data["liquid_water_content"].mean("time")

# restrict the evaluation grid to the radii where we have data ...
radii_measured = train_data.max("time")["particle_size_distribution"] > 0
end = train_data["radius"].where(radii_measured).max("radius")
start = train_data["radius"].where(radii_measured).min("radius")
# ... and extend it by 50 % beyond the largest measured radius
end = 1.5 * end
# create a log spaced array of radii
r = np.geomspace(start, end, 1000)
t_test = xr.DataArray(data=r, coords={"radius": r}, dims=["radius"])
# Bin width by central differences: (r[i+1] - r[i-1]) / 2. Without the factor
# 0.5 every interval is counted twice and sums over ``m * w`` come out doubled.
w_test = 0.5 * (t_test - t_test.shift(radius=2)).shift(radius=-1)
# the two shifts leave NaN at both ends of the grid; fill them by extrapolation
w_test = w_test.interpolate_na("radius", method="linear", fill_value="extrapolate")

# Time-mean training data shared by both setups below.
radius_mean = t_train.mean("time")
psd_mean = y_train.mean("time")
msd_mean = m_train.mean("time")

# Only the constant-variance (variance=1) double log-normal fits are compared
# here. The other variants tried earlier (PSD "nova" duplicate, variance-weighted
# MSD "vari", soft-L1 MSD) are disabled for this comparison.

# --- fit to the particle size distribution (PSD) ---
x0_psd = np.array([3e-6, 2, 1e10, 200e-6, 2, 1e6])
bounds_psd = Bounds(
    # mu1, sig1, sc1, mu2, sig2, sc2
    lb=[1e-6, 1.1, 1e7, 200e-6, 1.1, 1e0],
    ub=[10e-6, 3.0, 1e13, 0.5e-3, 3.0, 1e8],
)
lqs_psd = LeastSquareFit(
    name="PSD",
    func=double_ln_normal_distribution,
    cost_func=double_ln_normal_distribution_cost,
    x0=x0_psd,
    bounds=bounds_psd,
    t_train=radius_mean,
    y_train=psd_mean,
    fit_kwargs=dict(loss="linear", kwargs=dict(variance=1)),
    plot_kwargs=dict(color=default_colors[0], linestyle="-"),
)

setups_psd = [lqs_psd]

# --- fit to the mass size distribution (MSD) ---
x0_msd = np.array([3e-6, 2, 1e-1, 300e-6, 2, 1e0])
bounds_msd = Bounds(
    lb=[1e-6, 1.1, 1e-3, 200e-6, 1.3, 1e-3],
    ub=[10e-6, 3.0, 1e2, 0.5e-3, 3.0, 1e1],
)
lqs_msd_var1 = LeastSquareFit(
    name="MSD nova",
    func=double_ln_normal_distribution,
    cost_func=double_ln_normal_distribution_cost,
    x0=x0_msd,
    bounds=bounds_msd,
    t_train=radius_mean,
    y_train=msd_mean,
    fit_kwargs=dict(loss="linear", kwargs=dict(variance=1)),
    plot_kwargs=dict(color=dark_colors[1], linestyle="-"),
)

setups_msd = [
    lqs_msd_var1,
]
setups = setups_psd + setups_msd

# each fit is restarted 10 times from its previous solution
for doublefit in setups:
    doublefit.fit(repetitions=10)


# --- Plot the results ---
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 6), sharex=True)
ax0, ax1 = axs

# plot the particle size distribution
ax0.scatter(t_train, y_train, marker=".", color="grey", alpha=0.5)
ax0.scatter(t_train.mean("time"), y_train.mean("time"), marker="o", color="k")

# plot the mass size distribution
ax1.scatter(t_train, m_train, marker=".", color="grey", alpha=0.5)
ax1.scatter(t_train.mean("time"), m_train.mean("time"), marker="o", color="k")

# --- Plot the results ---
for fit in setups_psd:
    # t, w = t_train.mean('time'), w_train
    t, w = t_test, w_test

    t, y = fit.predict(t)
    m = msd_from_psd_dataarray(y)
    lwc_psd = 1e3 * (m * w).sum("radius").values

    ax0.plot(t, y, label=f"{fit.name} LWC:{lwc_psd:.2f}", linewidth=2, **fit.plot_kwargs)
    ax1.plot(t, m, label=f"{fit.name} LWC:{lwc_psd:.2f}", linewidth=2, **fit.plot_kwargs)

for fit in setups_msd:
    # t, w = t_train.mean('time'), w_train
    t, w = t_test, w_test

    t, m = fit.predict(t)
    y = psd_from_msd_dataarray(m)
    lwc_msd = 1e3 * (m * w).sum("radius").values
    ax0.plot(t, y, label=f"{fit.name} LWC:{lwc_msd:.2f}", linewidth=2, **fit.plot_kwargs)
    ax1.plot(t, m, label=f"{fit.name} LWC:{lwc_msd:.2f}", linewidth=2, **fit.plot_kwargs)


fig.suptitle(f"Fits for cloud {cloud_id}: Desired={lwc_train.values:.2f}", fontsize="medium")

for _ax in axs:
    _ax.set_xscale("log")
    _ax.set_xlim(1e-6, 3.5e-3)
axs[0].set_yscale("linear")
axs[1].set_yscale("linear")

axs[0].set_ylim(0, 4e11)
axs[1].set_ylim(0, 1.5)
axs[0].legend(fontsize="small", loc="upper center")
axs[1].legend(fontsize="small", loc="upper center")

axs[0].set_ylabel(label_from_attrs(y_train, linebreak=True), fontsize="small")
axs[1].set_ylabel(label_from_attrs(m_train, linebreak=True), fontsize="small")

Text(0, 0.5, 'Mass concentration\n$\\left[  kg m^{-3} m^{-1}  \\right]$')

##### Single LnNormal

In [None]:
# Fit single log-normal and gamma distributions to one randomly chosen cloud and
# compare the integrated number concentration (N) and liquid water content (LWC)
# of every fit with the values integrated from the observations.
# np.random.seed(42)
cloud_id = rng.choice(identified_clouds["cloud_id"])

train_data = match_clouds_and_cloudcomposite(
    identified_clouds.sel(cloud_id=cloud_id), coarse_composite  # .sel(radius=slice(50e-6, None)),
)

t_train = train_data["radius2D"]
y_train = train_data["particle_size_distribution"]
m_train = train_data["mass_size_distribution"]
w_train = train_data["bin_width"]

# Evaluation grid: fixed lower limit, upper limit is the largest radius with data.
radii_measured = train_data.max("time")["particle_size_distribution"] > 0
end = train_data["radius"].where(radii_measured).max("radius")
start = 10e-6
# create a log spaced array of radii
r = np.geomspace(start, end, 1000)
t_test = xr.DataArray(data=r, coords={"radius": r}, dims=["radius"])
# Central-difference bin width 0.5 * (r[i+1] - r[i-1]). Without the factor 0.5 every
# integral over the test grid (N and LWC below) comes out twice as large.
# The two end points have only one neighbour and are filled by linear extrapolation.
w_test = 0.5 * (t_test.shift(radius=-1) - t_test.shift(radius=1))
w_test = w_test.interpolate_na("radius", method="linear", fill_value="extrapolate")


# --- Fits on the particle size distribution (PSD) ---
x0_psd = np.array([200e-6, 2, 1e6])
bounds_psd = Bounds(
    # mu, sig, sc
    lb=[200e-6, 1.1, 1e0],
    ub=[0.5e-3, 3.0, 1e8],
)
lqs_psd = LeastSquareFit(
    name="PSD vari",
    fit_kwargs=dict(loss="linear"),
    func=ln_normal_distribution,
    cost_func=ln_normal_distribution_cost,
    x0=x0_psd,
    bounds=bounds_psd,
    t_train=t_train.mean("time"),
    y_train=y_train.mean("time"),
    plot_kwargs=dict(color=default_colors[0], linestyle="-"),
)

lqs_psd_var1 = LeastSquareFit(
    name="PSD nova",
    fit_kwargs=dict(loss="linear", kwargs=dict(variance=1)),
    func=ln_normal_distribution,
    cost_func=ln_normal_distribution_cost,
    x0=x0_psd,
    bounds=bounds_psd,
    t_train=t_train.mean("time"),
    y_train=y_train.mean("time"),
    plot_kwargs=dict(color=dark_colors[0], linestyle="--"),
)

x0_gamma = np.array([2.5, 0, 0.5e-4, 4e6])
bounds_gamma = Bounds(
    lb=[0, 0, 0, 0.1e7],
    ub=[np.inf, np.inf, np.inf, 2e7],
)

gamma_lsq = LeastSquareFit(
    name="Gamma",
    fit_kwargs=dict(loss="linear", kwargs=dict(variance=1)),
    func=gamma_distribution_stats,
    cost_func=gamma_distribution_stats_cost,
    x0=x0_gamma,
    bounds=bounds_gamma,
    t_train=t_train.mean("time"),
    y_train=y_train.mean("time"),
    plot_kwargs=dict(color=default_colors[3], linestyle="-"),
)


setups_psd = [lqs_psd, lqs_psd_var1, gamma_lsq]


# --- Fits on the mass size distribution (MSD) ---
x0_msd = np.array([300e-6, 2, 1e0])
bounds_msd = Bounds(
    # mu, sig, sc
    lb=[200e-6, 1.3, 1e-3],
    ub=[0.5e-3, 3.0, 1e1],
)
lqs_msd = LeastSquareFit(
    name="MSD vari",
    fit_kwargs=dict(loss="linear"),
    func=ln_normal_distribution,
    cost_func=ln_normal_distribution_cost,
    x0=x0_msd,
    bounds=bounds_msd,
    t_train=t_train.mean("time"),
    y_train=m_train.mean("time"),
    plot_kwargs=dict(color=default_colors[1], linestyle="-"),
)

lqs_msd_var1 = LeastSquareFit(
    name="MSD nova",
    fit_kwargs=dict(loss="linear", kwargs=dict(variance=1)),
    func=ln_normal_distribution,
    cost_func=ln_normal_distribution_cost,
    x0=x0_msd,
    bounds=bounds_msd,
    t_train=t_train.mean("time"),
    y_train=m_train.mean("time"),
    plot_kwargs=dict(color=dark_colors[1], linestyle="-"),
)


softl1_msd = LeastSquareFit(
    name="MSD Soft",
    fit_kwargs=dict(loss="soft_l1", f_scale=0.1, kwargs=dict(variance=1)),
    func=ln_normal_distribution,
    cost_func=ln_normal_distribution_cost,
    x0=x0_msd,
    bounds=bounds_msd,
    t_train=t_train.mean("time"),
    y_train=m_train.mean("time"),
    plot_kwargs=dict(color=default_colors[2], linestyle="-"),
)


setups_msd = [lqs_msd, lqs_msd_var1, softl1_msd]
setups = setups_psd + setups_msd

for doublefit in setups:
    doublefit.fit(repetitions=3)


# --- Plot the observations ---
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8, 4), sharex=True)
ax0, ax1 = axs

# plot the particle size distribution
ax0.scatter(t_train, y_train, marker="o", color="grey")
ax0.scatter(t_train.mean("time"), y_train.mean("time"), marker="o", color="k")

# plot the mass size distribution
ax1.scatter(t_train, m_train, marker="o", color="grey")
ax1.scatter(t_train.mean("time"), m_train.mean("time"), marker="o", color="k")

# --- Plot the fits ---
# PSD fits predict the number distribution and MSD fits the mass distribution;
# the respective other quantity is obtained by conversion.
for fit_group, predicts_mass in ((setups_psd, False), (setups_msd, True)):
    for fit in fit_group:
        t, w = t_test, w_test

        t, prediction = fit.predict(t)
        if predicts_mass:
            m = prediction
            y = psd_from_msd_dataarray(m)
        else:
            y = prediction
            m = msd_from_psd_dataarray(y)

        total_number_concentration = (y * w).sum("radius").values
        lwc = 1e3 * (m * w).sum("radius").values

        ax0.plot(t, y, label=f"{fit.name} N: {total_number_concentration:.2f}", **fit.plot_kwargs)
        ax1.plot(t, m, label=f"{fit.name} LWC: {lwc:.2f}", **fit.plot_kwargs)


# Reference values integrated from the observations with the measured bin widths.
lwc_train = 1e3 * (m_train * w_train).sum("radius").mean("time")
total_number_concentration_train = (y_train * w_train).sum("radius").mean("time")
fig.suptitle(
    f"Fits for cloud {cloud_id}: N: {total_number_concentration_train.values:.2f}, LWC: {lwc_train.values:.2f}",
    fontsize="medium",
)

for _ax in axs:
    _ax.set_xscale("log")
    _ax.set_xlim(1e-6, 0.8e-1)
# axs[0].set_yscale("symlog", linthresh=1.1e3, linscale=0.1)
# axs[1].set_yscale("symlog", linthresh=1.1e-7, linscale=0.1)

axs[0].set_ylim(1e-12, 1e7)
axs[1].set_ylim(1e-12, 0.25)
axs[0].legend(fontsize="small", loc="center right")
axs[1].legend(fontsize="small", loc="center right")

<matplotlib.legend.Legend at 0x2468b93c0a0>

In [None]:
# Fit a gamma distribution to the individual (not time-averaged) measurements of the
# cloud selected in the previous cell, with all parameters only bounded from below.
x0 = np.array([2.5, 0, 0.5e-4, 3e7])

gamma_lsq = LeastSquareFit(
    name="Gamma",
    fit_kwargs=dict(loss="linear", kwargs=dict(variance=1)),
    func=gamma_distribution_stats,
    cost_func=gamma_distribution_stats_cost,
    x0=x0,
    bounds=Bounds(
        lb=[0, 0, 0, 0],
        ub=[np.inf, np.inf, np.inf, np.inf],
    ),
    t_train=t_train,
    y_train=y_train,
    plot_kwargs=dict(color=default_colors[3], linestyle="-"),
)

gamma_lsq.fit(10)

# grey: individual measurements, black dots: time mean, black line: fit
plt.scatter(t_train, y_train, marker=".", color="grey")
plt.scatter(t_train.mean("time"), y_train.mean("time"), marker="o", color="k")
t, y = gamma_lsq.predict(t_test)

plt.plot(t, y, label=f"{gamma_lsq.name}", color="k")

plt.xscale("log")
plt.yscale("log")
plt.ylabel(label_from_attrs(y_train, linebreak=True), fontsize="small")
# The fit line carries a label, so a legend is needed for it to show up.
plt.legend(fontsize="small")

### Whole cloud set

In [None]:
def calc_Ln_fit(
    cloud_id: int, identified_clouds: xr.Dataset, coarse_composite: xr.Dataset
) -> Tuple[dict, dict]:
    """
    Fit single log-normal and gamma distributions to the time-mean spectra of one cloud.

    Three setups are fitted to the particle size distribution (``"PSD vari"``,
    ``"PSD nova"``, ``"Gamma"``) and three to the mass size distribution
    (``"MSD vari"``, ``"MSD nova"``, ``"MSD Soft"``). Each fit is evaluated on a
    log-spaced radius grid and integrated over that grid.

    Parameters:
    -----------
    cloud_id : int
        Label used to select the cloud from ``identified_clouds``.
    identified_clouds : xr.Dataset
        Dataset of identified clouds, indexed by ``cloud_id``.
    coarse_composite : xr.Dataset
        Cloud composite that is matched to the selected cloud. It has to provide
        ``radius2D``, ``particle_size_distribution``, ``mass_size_distribution``
        and the ``radius`` coordinate.

    Returns:
    --------
    results : dict
        ``{fit name: {"total_number_concentration": ..., "liquid_water_content": ...}}``
        with the integrals of each fit over the test grid.
    fits : dict
        ``{fit name: fit.fit_result}`` for every setup.

    Notes:
    ------
    Any exception (e.g. no radius bin with data, or a failing fit) propagates to
    the caller.
    """
    train_data = match_clouds_and_cloudcomposite(
        identified_clouds.sel(cloud_id=cloud_id),
        coarse_composite,
    )

    # Time-averaged training data shared by all setups.
    t_mean = train_data["radius2D"].mean("time")
    psd_mean = train_data["particle_size_distribution"].mean("time")
    msd_mean = train_data["mass_size_distribution"].mean("time")

    # Test grid: fixed lower limit, upper limit 1.5 x the largest radius with data.
    radii_measured = psd_mean > 0
    end = 1.5 * train_data["radius"].sel(radius=radii_measured).max().values
    start = 10e-6

    # create a log spaced array of radii
    r = np.logspace(np.log10(start), np.log10(end), 1000)
    t_test = xr.DataArray(data=r, coords={"radius": r}, dims=["radius"])
    # Central-difference bin width 0.5 * (r[i+1] - r[i-1]). The factor 0.5 is required:
    # without it every integral over the grid is twice as large as it should be.
    w_test = 0.5 * (t_test.shift(radius=-1) - t_test.shift(radius=1))
    # The end points have no width (only one neighbour) and are dropped.
    has_width = np.isfinite(w_test)
    t_test = t_test.where(has_width, drop=True)
    w_test = w_test.where(has_width, drop=True)

    def make_ln_fit(name, y_mean, x0, bounds, fit_kwargs, color, linestyle):
        # All log-normal setups share the model, the cost function and the radii.
        return LeastSquareFit(
            name=name,
            fit_kwargs=fit_kwargs,
            func=ln_normal_distribution,
            cost_func=ln_normal_distribution_cost,
            x0=x0,
            bounds=bounds,
            t_train=t_mean,
            y_train=y_mean,
            plot_kwargs=dict(color=color, linestyle=linestyle),
        )

    # --- setups fitted to the particle size distribution ---
    x0_psd = np.array([200e-6, 2, 1e6])
    bounds_psd = Bounds(
        # mu, sig, sc
        lb=[200e-6, 1.1, 1e0],
        ub=[0.5e-3, 3.0, 1e8],
    )
    lqs_psd = make_ln_fit(
        "PSD vari", psd_mean, x0_psd, bounds_psd, dict(loss="linear"), default_colors[0], "-"
    )
    lqs_psd_var1 = make_ln_fit(
        "PSD nova",
        psd_mean,
        x0_psd,
        bounds_psd,
        dict(loss="linear", kwargs=dict(variance=1)),
        dark_colors[0],
        "--",
    )

    x0_gamma = np.array([2.5, 0, 0.5e-4, 1.5e7])
    bounds_gamma = Bounds(
        lb=[0, 0, 0, 0.05e7],
        ub=[np.inf, np.inf, np.inf, 3e7],
    )
    gamma_lsq = LeastSquareFit(
        name="Gamma",
        fit_kwargs=dict(loss="linear", kwargs=dict(variance=1)),
        func=gamma_distribution_stats,
        cost_func=gamma_distribution_stats_cost,
        x0=x0_gamma,
        bounds=bounds_gamma,
        t_train=t_mean,
        y_train=psd_mean,
        plot_kwargs=dict(color=default_colors[3], linestyle="-"),
    )

    setups_psd = [lqs_psd, lqs_psd_var1, gamma_lsq]

    # --- setups fitted to the mass size distribution ---
    x0_msd = np.array([300e-6, 2, 1e0])
    bounds_msd = Bounds(
        # mu, sig, sc
        lb=[200e-6, 1.3, 1e-3],
        ub=[0.5e-3, 3.0, 1e1],
    )
    lqs_msd = make_ln_fit(
        "MSD vari", msd_mean, x0_msd, bounds_msd, dict(loss="linear"), default_colors[1], "-"
    )
    lqs_msd_var1 = make_ln_fit(
        "MSD nova",
        msd_mean,
        x0_msd,
        bounds_msd,
        dict(loss="linear", kwargs=dict(variance=1)),
        dark_colors[1],
        "-",
    )
    softl1_msd = make_ln_fit(
        "MSD Soft",
        msd_mean,
        x0_msd,
        bounds_msd,
        dict(loss="soft_l1", f_scale=0.1, kwargs=dict(variance=1)),
        default_colors[2],
        "-",
    )

    setups_msd = [lqs_msd, lqs_msd_var1, softl1_msd]

    for setup in setups_psd + setups_msd:
        setup.fit(5)

    results = dict()
    fits = dict()

    # PSD setups predict the number distribution, MSD setups the mass distribution;
    # the respective other quantity is obtained by conversion.
    for fit_group, predicts_mass in ((setups_psd, False), (setups_msd, True)):
        for fit in fit_group:
            _, prediction = fit.predict(t_test)
            if predicts_mass:
                m = prediction
                y = psd_from_msd_dataarray(m)
            else:
                y = prediction
                m = msd_from_psd_dataarray(y)

            total_number_concentration = (y * w_test).sum("radius")
            # NOTE(review): the factor 1e3 presumably converts kg to g - confirm units.
            liquid_water_content = 1e3 * (m * w_test).sum("radius")

            results[fit.name] = dict(
                total_number_concentration=total_number_concentration.values,
                liquid_water_content=liquid_water_content.values,
            )
            fits[fit.name] = fit.fit_result

    return results, fits


def calc_doubleLn_fit(
    cloud_id: int, identified_clouds: xr.Dataset, coarse_composite: xr.Dataset
) -> Tuple[dict, dict]:
    """
    Fit double log-normal distributions to the time-mean spectra of one cloud.

    Two setups are fitted to the particle size distribution (``"PSD vari"``,
    ``"PSD nova"``) and three to the mass size distribution (``"MSD vari"``,
    ``"MSD nova"``, ``"MSD Soft"``). Each fit is evaluated on a log-spaced radius
    grid and integrated over that grid.

    Parameters:
    -----------
    cloud_id : int
        Label used to select the cloud from ``identified_clouds``.
    identified_clouds : xr.Dataset
        Dataset of identified clouds, indexed by ``cloud_id``.
    coarse_composite : xr.Dataset
        Cloud composite that is matched to the selected cloud. It has to provide
        ``radius2D``, ``particle_size_distribution``, ``mass_size_distribution``
        and the ``radius`` coordinate.

    Returns:
    --------
    results : dict
        ``{fit name: {"total_number_concentration": ..., "liquid_water_content": ...}}``
        with the integrals of each fit over the test grid.
    fits : dict
        ``{fit name: fit.fit_result}`` for every setup (PSD and MSD alike).

    Notes:
    ------
    Any exception (e.g. no radius bin with data, or a failing fit) propagates to
    the caller.
    """
    train_data = match_clouds_and_cloudcomposite(
        identified_clouds.sel(cloud_id=cloud_id),
        coarse_composite,
    )

    # Time-averaged training data shared by all setups.
    t_mean = train_data["radius2D"].mean("time")
    psd_mean = train_data["particle_size_distribution"].mean("time")
    msd_mean = train_data["mass_size_distribution"].mean("time")

    # Test grid: from the smallest radius with data to 1.5 x the largest one.
    radii_measured = psd_mean > 0
    measured_radii = train_data["radius"].sel(radius=radii_measured)
    end = 1.5 * measured_radii.max().values
    start = measured_radii.min().values

    # create a log spaced array of radii
    r = np.logspace(np.log10(start), np.log10(end), 1000)
    t_test = xr.DataArray(data=r, coords={"radius": r}, dims=["radius"])
    # Central-difference bin width 0.5 * (r[i+1] - r[i-1]).
    w_test = 0.5 * (t_test.shift(radius=-1) - t_test.shift(radius=1))
    # The end points have no width (only one neighbour) and are dropped.
    has_width = np.isfinite(w_test)
    t_test = t_test.where(has_width, drop=True)
    w_test = w_test.where(has_width, drop=True)

    def make_double_ln_fit(name, y_mean, x0, bounds, fit_kwargs, color, linestyle):
        # All setups share the model, the cost function and the radii.
        return LeastSquareFit(
            name=name,
            fit_kwargs=fit_kwargs,
            func=double_ln_normal_distribution,
            cost_func=double_ln_normal_distribution_cost,
            x0=x0,
            bounds=bounds,
            t_train=t_mean,
            y_train=y_mean,
            plot_kwargs=dict(color=color, linestyle=linestyle),
        )

    # --- setups fitted to the particle size distribution ---
    x0_psd = np.array([3e-6, 2, 1e10, 200e-6, 2, 1e6])
    bounds_psd = Bounds(
        # mu1, sig1, sc1, mu2, sig2, sc2
        lb=[1e-6, 1.1, 1e7, 200e-6, 1.1, 1e0],
        ub=[10e-6, 3.0, 1e13, 0.5e-3, 3.0, 1e8],
    )
    lqs_psd = make_double_ln_fit(
        "PSD vari", psd_mean, x0_psd, bounds_psd, dict(loss="linear"), default_colors[0], "-"
    )
    lqs_psd_var1 = make_double_ln_fit(
        "PSD nova",
        psd_mean,
        x0_psd,
        bounds_psd,
        dict(loss="linear", kwargs=dict(variance=1)),
        dark_colors[0],
        "--",
    )

    setups_psd = [lqs_psd, lqs_psd_var1]

    # --- setups fitted to the mass size distribution ---
    x0_msd = np.array([3e-6, 2, 1e-1, 300e-6, 2, 1e0])
    bounds_msd = Bounds(
        # mu1, sig1, sc1, mu2, sig2, sc2
        lb=[1e-6, 1.1, 1e-3, 200e-6, 1.3, 1e-3],
        ub=[10e-6, 4.0, 1e2, 0.5e-3, 3.0, 1e1],
    )
    lqs_msd = make_double_ln_fit(
        "MSD vari", msd_mean, x0_msd, bounds_msd, dict(loss="linear"), default_colors[1], "-"
    )
    lqs_msd_var1 = make_double_ln_fit(
        "MSD nova",
        msd_mean,
        x0_msd,
        bounds_msd,
        dict(loss="linear", kwargs=dict(variance=1)),
        dark_colors[1],
        "-",
    )
    softl1_msd = make_double_ln_fit(
        "MSD Soft",
        msd_mean,
        x0_msd,
        bounds_msd,
        dict(loss="soft_l1", f_scale=0.1, kwargs=dict(variance=1)),
        default_colors[2],
        "-",
    )

    setups_msd = [lqs_msd, lqs_msd_var1, softl1_msd]

    for setup in setups_psd + setups_msd:
        setup.fit()

    results = dict()
    fits = dict()

    # PSD setups predict the number distribution, MSD setups the mass distribution;
    # the respective other quantity is obtained by conversion.
    for fit_group, predicts_mass in ((setups_psd, False), (setups_msd, True)):
        for fit in fit_group:
            _, prediction = fit.predict(t_test)
            if predicts_mass:
                m = prediction
                y = psd_from_msd_dataarray(m)
            else:
                y = prediction
                m = msd_from_psd_dataarray(y)

            total_number_concentration = (y * w_test).sum("radius")
            # NOTE(review): the factor 1e3 presumably converts kg to g - confirm units.
            liquid_water_content = 1e3 * (m * w_test).sum("radius")

            results[fit.name] = dict(
                total_number_concentration=total_number_concentration.values,
                liquid_water_content=liquid_water_content.values,
            )
            # Store the fit result for every setup, so that all entries of `fits`
            # have the same type (previously MSD entries held the fit object itself).
            fits[fit.name] = fit.fit_result

    return results, fits

In [None]:
# Transposing function
def transpose_dict(d):
    """
    Reorder a nested ``{cloud: {fit: {metric: value}}}`` mapping.

    The result is nested as ``{fit: {metric: {cloud: float(value)}}}``.
    Fits without any metric do not appear in the result.
    """
    by_fit = {}
    for cloud_key, per_cloud in d.items():
        for fit_name, metrics in per_cloud.items():
            for metric_name, value in metrics.items():
                per_metric = by_fit.setdefault(fit_name, {}).setdefault(metric_name, {})
                per_metric[cloud_key] = float(value)

    return by_fit

##### whole radii range

In [None]:
# Fit the double log-normal setups to every identified cloud (whole radius range)
# and collect observed and fitted liquid water content in one dataset.
all_results = {}
fit_results = {}
error_clouds = []
for cloud_id in identified_clouds["cloud_id"]:
    cloud_id_str = str(cloud_id.values)
    try:
        results, fits = calc_doubleLn_fit(
            cloud_id=cloud_id,
            identified_clouds=identified_clouds,
            coarse_composite=coarse_composite,
        )
    except Exception as err:
        # Best effort: skip clouds that cannot be fitted, but report why.
        # `Exception` (not a bare except) keeps KeyboardInterrupt working.
        error_clouds.append(cloud_id.item())
        print(f"error at {cloud_id_str}: {type(err).__name__}: {err}")
        continue
    all_results[cloud_id_str] = results
    fit_results[cloud_id_str] = fits

transposed_data = transpose_dict(all_results)
cloud_ids = identified_clouds["cloud_id"].data
cloud_ids = cloud_ids[~np.isin(cloud_ids, error_clouds)]

# Observed LWC (time mean and its standard error) for every fitted cloud.
lwc_observations = []
lwc_observations_var = []
for cloud_id in cloud_ids:
    cc = match_clouds_and_cloudcomposite(
        identified_clouds.sel(cloud_id=cloud_id),
        cloud_composite,
    )
    m, v = mean_and_stderror_of_mean(cc["liquid_water_content"], dims=("time",))

    lwc_observations.append(m)
    lwc_observations_var.append(v)

lwc_fits = dict()
for fit_type, fit_dict in transposed_data.items():
    metric_dict = fit_dict["liquid_water_content"]
    keys = np.array(list(metric_dict.keys())).astype(int)
    # The fitted values are attached to `cloud_ids` by position only,
    # so make sure both really are in the same order.
    assert np.array_equal(keys, cloud_ids), f"cloud ids of '{fit_type}' do not match"
    lwc_fit = np.array(list(metric_dict.values()))
    lwc_fits[fit_type] = (["cloud_id"], lwc_fit)


dataset_lwc = xr.Dataset(
    data_vars=dict(
        observations=(["cloud_id"], lwc_observations),
        observations_var=(["cloud_id"], lwc_observations_var),
        **lwc_fits,
    ),
    coords=dict(cloud_id=cloud_ids),
    attrs=dict(
        setup="Coarse",
    ),
)
fits_allradii = fit_results

In [None]:
import itertools

# One panel per fit type: fitted against observed LWC, title shows the correlation.
fig, axs = plt.subplots(ncols=2, nrows=3, figsize=(6, 7), sharex=True, sharey=True)
axs = axs.flatten()

marker_iter = itertools.cycle(("x", "+"))
color_iter = itertools.cycle(default_colors)

# the first two variables hold the observations and their uncertainty
fit_types = list(dataset_lwc.data_vars)[2:]

for i, fit_type in enumerate(fit_types):
    marker = next(marker_iter)
    color = next(color_iter)

    observations = dataset_lwc["observations"]
    fit = dataset_lwc[fit_type]
    corr = np.corrcoef(observations, fit)[0, 1]

    panel = axs[i]
    panel.scatter(observations, fit, label=f"{fit_type}", marker=marker, color=color)
    panel.set_xlabel(r"LWC obs [$g m^{-3}$]")
    panel.set_ylabel(r"LWC fit [$g m^{-3}$]")
    panel.set_title(f"{fit_type} {corr:.2f}")

# identical limits on every panel plus the 1:1 line
for _ax in axs:
    _ax.set_xlim(0, 0.6)
    _ax.set_ylim(0, 0.6)
    _ax.set_xscale("linear")
    _ax.set_yscale("linear")
    _ax.plot(_ax.get_xlim(), _ax.get_xlim(), "k--")

fig.tight_layout()

##### radii above 50um

In [None]:
# Fit the single log-normal / gamma setups to every identified cloud, using only
# the radius bins from 50e-6 upwards of the coarse composite.
all_results = {}
fit_results = {}
error_clouds = []

for cloud_id in identified_clouds["cloud_id"]:
    cloud_id_str = str(cloud_id.values)
    try:
        results, fits = calc_Ln_fit(
            cloud_id=cloud_id,
            identified_clouds=identified_clouds,
            coarse_composite=coarse_composite.sel(radius=slice(50e-6, None)),
        )
    except Exception as err:
        # Best effort: skip clouds that cannot be fitted, but report why.
        # `Exception` (not a bare except) keeps KeyboardInterrupt working.
        error_clouds.append(cloud_id.item())
        print(f"error at {cloud_id_str}: {type(err).__name__}: {err}")
        continue
    all_results[cloud_id_str] = results
    fit_results[cloud_id_str] = fits

transposed_data = transpose_dict(all_results)

lwc_observations = []
# NOTE(review): never filled in this cell; kept in case a later cell refers to it.
lwc_rain_observations = []

cloud_ids = identified_clouds["cloud_id"].data
cloud_ids = cloud_ids[~np.isin(cloud_ids, error_clouds)]

# NOTE(review): the observed LWC is taken from the full coarse composite, while the
# fits only see radii from 50e-6 upwards - confirm that this comparison is intended.
for cloud_id in cloud_ids:
    cc = match_clouds_and_cloudcomposite(
        identified_clouds.sel(cloud_id=cloud_id),
        coarse_composite,
    )
    lwc_observations.append(cc["liquid_water_content"].mean("time"))

lwc_fits = dict()
for fit_type, fit_dict in transposed_data.items():
    metric_dict = fit_dict["liquid_water_content"]
    keys = np.array(list(metric_dict.keys())).astype(int)
    # The fitted values are attached to `cloud_ids` by position only,
    # so make sure both really are in the same order.
    assert np.array_equal(keys, cloud_ids), f"cloud ids of '{fit_type}' do not match"
    lwc_fit = np.array(list(metric_dict.values()))
    lwc_fits[fit_type] = (["cloud_id"], lwc_fit)


dataset_lwc_split = xr.Dataset(
    data_vars=dict(
        observations=(["cloud_id"], lwc_observations),
        **lwc_fits,
    ),
    coords=dict(cloud_id=cloud_ids),
    attrs=dict(setup="Coarse Rain Only"),
)

fits_50radii = fit_results

error at 8
error at 13
error at 14
error at 22
error at 23
error at 36
error at 79
error at 100
error at 115
error at 116
error at 147
error at 183
error at 184
error at 189
error at 229
error at 269
error at 299
error at 305
error at 308
error at 310
error at 317
error at 380
error at 390
error at 409
error at 445
error at 565
error at 571


In [None]:
import itertools

# One panel per fit type: fitted against observed LWC, title shows the correlation.
fig, axs = plt.subplots(ncols=2, nrows=3, figsize=(6, 7), sharex=True, sharey=True)
axs = axs.flatten()

marker_iter = itertools.cycle(("x", "+"))
color_iter = itertools.cycle(default_colors)

# the first variable holds the observations, all others are fits
fit_types = list(dataset_lwc_split.data_vars)[1:]

for i, fit_type in enumerate(fit_types):
    marker = next(marker_iter)
    color = next(color_iter)

    observations = dataset_lwc_split["observations"]
    fit = dataset_lwc_split[fit_type]
    corr = np.corrcoef(observations, fit)[0, 1]

    panel = axs[i]
    panel.scatter(observations, fit, label=f"{fit_type}", marker=marker, color=color)
    panel.set_title(f"{fit_type} {corr:.2f}")

# identical limits on every panel plus the 1:1 line
for _ax in axs:
    _ax.set_xlim(0, 0.6)
    _ax.set_ylim(0, 0.6)
    _ax.set_xscale("linear")
    _ax.set_yscale("linear")
    _ax.plot(_ax.get_xlim(), _ax.get_xlim(), "k--")

fig.supxlabel(r"LWC obs [$g m^{-3}$]")
fig.supylabel(r"LWC fit [$g m^{-3}$]")
fig.tight_layout()

##### Radii above 50um and not coarsened

In [None]:
# Fit the single log-normal / gamma setups to every identified cloud, using only
# the radius bins from 50e-6 upwards of the original (not coarsened) composite.
all_results = {}
fit_results = {}
error_clouds = []
for cloud_id in identified_clouds["cloud_id"]:
    cloud_id_str = str(cloud_id.values)
    try:
        results, fits = calc_Ln_fit(
            cloud_id=cloud_id,
            identified_clouds=identified_clouds,
            coarse_composite=cloud_composite.sel(radius=slice(50e-6, None)),
        )
    except Exception as err:
        # Best effort: skip clouds that cannot be fitted, but report why.
        # `Exception` (not a bare except) keeps KeyboardInterrupt working.
        error_clouds.append(cloud_id.item())
        print(f"error at {cloud_id_str}: {type(err).__name__}: {err}")
        continue
    all_results[cloud_id_str] = results
    fit_results[cloud_id_str] = fits

transposed_data = transpose_dict(all_results)

lwc_observations = []

cloud_ids = identified_clouds["cloud_id"].data
cloud_ids = cloud_ids[~np.isin(cloud_ids, error_clouds)]

# NOTE(review): the fits above use `cloud_composite`, but the observed LWC is read
# from `coarse_composite` - confirm that this is intended.
for cloud_id in cloud_ids:
    cc = match_clouds_and_cloudcomposite(
        identified_clouds.sel(cloud_id=cloud_id),
        coarse_composite,
    )
    lwc_observations.append(cc["liquid_water_content"].mean("time"))

lwc_fits = dict()
for fit_type, fit_dict in transposed_data.items():
    metric_dict = fit_dict["liquid_water_content"]
    keys = np.array(list(metric_dict.keys())).astype(int)
    # The fitted values are attached to `cloud_ids` by position only,
    # so make sure both really are in the same order.
    assert np.array_equal(keys, cloud_ids), f"cloud ids of '{fit_type}' do not match"
    lwc_fit = np.array(list(metric_dict.values()))
    lwc_fits[fit_type] = (["cloud_id"], lwc_fit)


dataset_lwc_split_nocoarse = xr.Dataset(
    data_vars=dict(
        observations=(["cloud_id"], lwc_observations),
        **lwc_fits,
    ),
    coords=dict(cloud_id=cloud_ids),
    attrs=dict(
        setup="Original Rain Only",
    ),
)

error at 8
error at 13
error at 14
error at 22
error at 23
error at 36
error at 79
error at 100
error at 115
error at 116
error at 147
error at 183
error at 184
error at 189
error at 229
error at 269
error at 299
error at 305
error at 308
error at 310
error at 317
error at 380
error at 404
error at 409
error at 445
error at 565
error at 571


In [None]:
import itertools

# One panel per fit type: fitted against observed LWC, title shows the correlation.
fig, axs = plt.subplots(ncols=2, nrows=3, figsize=(6, 7), sharex=True, sharey=True)
axs = axs.flatten()

marker_iter = itertools.cycle(("x", "+"))
color_iter = itertools.cycle(["k"] + default_colors)

# Select the fits by name instead of by position: this dataset has no
# "observations_var" variable, so skipping the first two variables would
# silently drop the first fit type.
non_fit_variables = ("observations", "observations_var")
fit_types = [
    name for name in dataset_lwc_split_nocoarse.data_vars if name not in non_fit_variables
]

# `zip` stops at the last panel if there are ever more fit types than panels.
for panel, fit_type in zip(axs, fit_types):
    marker = next(marker_iter)
    color = next(color_iter)

    observations = dataset_lwc_split_nocoarse["observations"]
    fit = dataset_lwc_split_nocoarse[fit_type]

    corr = np.corrcoef(observations, fit)[0, 1]
    panel.scatter(observations, fit, label=f"{fit_type}", marker=marker, color=color)
    panel.set_title(f"{fit_type} {corr:.2f}")

# identical limits on every panel plus the 1:1 line
for _ax in axs:
    _ax.set_xlim(0, 0.6)
    _ax.set_ylim(0, 0.6)
    _ax.set_xscale("linear")
    _ax.set_yscale("linear")
    _ax.plot(_ax.get_xlim(), _ax.get_xlim(), "k--")

fig.supxlabel(r"LWC obs [$g m^{-3}$]")
fig.supylabel(r"LWC fit [$g m^{-3}$]")
fig.tight_layout()

### Compare all input setups

In [None]:
# Compare the three input setups (one column each).
# Top row: fit to the particle size distribution ("PSD vari").
# Bottom row: fit to the mass size distribution ("MSD nova").
fig, axs = plt.subplots(ncols=3, nrows=2, figsize=(10, 6), sharex=True, sharey=True)

axs_psd = axs[0]
axs_msd = axs[1]

setup_datasets = [dataset_lwc, dataset_lwc_split, dataset_lwc_split_nocoarse]

# Explicit style instead of the `marker` / `color` values that happened to be
# left over from the loop of the previous cell, so that this cell gives the
# same figure regardless of what ran before it.
scatter_style = dict(marker="x", color="k")

for row_axes, fit_name in ((axs_psd, "PSD vari"), (axs_msd, "MSD nova")):
    for ax, ds in zip(row_axes, setup_datasets):
        observations = ds["observations"]
        fit = ds[fit_name]

        # Summary statistics shown in the panel title.
        corr = np.corrcoef(observations, fit)[0, 1]
        total_diff = np.abs(observations - fit).mean()
        relative_diff = np.abs((observations - fit) / observations).mean()

        # Label the points with the fit that is actually plotted.
        ax.scatter(observations, fit, label=fit_name, **scatter_style)
        ax.set_title(
            f"{ds.attrs['setup']}\ncorr: {corr:.2f} diff: {total_diff:.2f} rel.diff: {relative_diff:.2f}"
        )


# Identical linear axes and a 1:1 reference line in every panel.
for _ax in axs.flatten():

    _ax.set_xlim(0, 1)
    _ax.set_ylim(0, 1)
    _ax.set_xscale("linear")
    _ax.set_yscale("linear")
    _ax.plot(_ax.get_xlim(), _ax.get_xlim(), "k--")

fig.tight_layout()

### Best setup: whole radii range and MSD fit

In [None]:
# Fitted ("MSD nova") against observed liquid water content for `dataset_lwc`.
# Left panel: full range, right panel: zoom on small values.
fig, axs = plt.subplots(ncols=2, figsize=(8, 4))

observations = dataset_lwc["observations"]
observations_var = dataset_lwc["observations_var"]
fit = dataset_lwc["MSD nova"]
fit.name = r"Fit to MSD with radius limited and coarsening above 50$\mu m$"

# Observations below 0.1 are masked (NaN) and therefore ignored in the
# statistics and not drawn.
observations = observations.where(observations >= 0.1)

# The correlation is computed only where both values are finite.
args = np.isfinite(observations) & np.isfinite(fit)
corr = np.corrcoef(observations[args], fit[args])[0, 1]
diff = observations - fit
relative_diff = (observations - fit) / observations
total_diff_mean, total_diff_std = diff.mean("cloud_id"), diff.std("cloud_id")
relative_diff_mean, relative_diff_std = relative_diff.mean("cloud_id"), relative_diff.std("cloud_id")

for _ax in axs:
    # Marker and colour are given explicitly; previously they were whatever
    # the loop of an earlier cell had left in `marker` / `color`.
    # NOTE(review): `observations_var` is used directly as the x error bar -
    # if it holds a variance, confirm whether its square root is wanted.
    _ax.errorbar(
        x=observations,
        xerr=observations_var,
        y=fit,
        yerr=0,
        linestyle="",
        label=f"{fit.name}",
        marker="x",
        color=dark_colors[1],
    )
    _ax.plot((0, 2), (0, 2), "k--")
    _ax.set_xlabel(r"LWC obs [$g m^{-3}$]")
    _ax.set_ylabel(r"LWC fit [$g m^{-3}$]")

axs[0].set_xlim(0, 2)
axs[0].set_ylim(0, 2)
axs[1].set_xlim(0, 0.6)
axs[1].set_ylim(0, 0.6)

str_corr = f"{corr:.2f}"
str_total_diff = rf"{total_diff_mean:.2f}$\pm${total_diff_std:.2f}" + r"$g m^{-3}$"
str_relative_diff = rf"{100* relative_diff_mean:.0f}$\pm${100* relative_diff_std:.0f}" + r"$\%$"

fig.suptitle(f"{fit.name}\ncorr: {str_corr} diff: {str_total_diff} rel.diff: {str_relative_diff}")
fig.tight_layout()

In [None]:
# Fitted ("PSD vari") against observed liquid water content for `dataset_lwc`.
# Left panel: full range, right panel: zoom on small values.
fig, axs = plt.subplots(ncols=2, figsize=(8, 4))

observations = dataset_lwc["observations"]
observations_var = dataset_lwc["observations_var"]
fit = dataset_lwc["PSD vari"]
fit.name = r"Fit to PSD with radius limited and coarsening above 50$\mu m$"

# Observations below 0.1 are masked (NaN) and therefore ignored in the
# statistics and not drawn.
observations = observations.where(observations >= 0.1)

# The correlation is computed only where both values are finite.
args = np.isfinite(observations) & np.isfinite(fit)
corr = np.corrcoef(observations[args], fit[args])[0, 1]
diff = observations - fit
relative_diff = (observations - fit) / observations
total_diff_mean, total_diff_std = diff.mean("cloud_id"), diff.std("cloud_id")
relative_diff_mean, relative_diff_std = relative_diff.mean("cloud_id"), relative_diff.std("cloud_id")


for _ax in axs:
    # Marker and colour are given explicitly; previously they were whatever
    # the loop of an earlier cell had left in `marker` / `color`.
    # NOTE(review): `observations_var` is used directly as the x error bar -
    # if it holds a variance, confirm whether its square root is wanted.
    _ax.errorbar(
        x=observations,
        xerr=observations_var,
        y=fit,
        yerr=0,
        linestyle="",
        label=f"{fit.name}",
        marker="x",
        color=default_colors[0],
    )
    _ax.plot((0, 2), (0, 2), "k--")
    _ax.set_xlabel(r"LWC obs [$g m^{-3}$]")
    _ax.set_ylabel(r"LWC fit [$g m^{-3}$]")

axs[0].set_xlim(0, 2)
axs[0].set_ylim(0, 2)
axs[1].set_xlim(0, 0.6)
axs[1].set_ylim(0, 0.6)

str_corr = f"{corr:.2f}"
str_total_diff = rf"{total_diff_mean:.2f}$\pm${total_diff_std:.2f}" + r"$g m^{-3}$"
str_relative_diff = rf"{100* relative_diff_mean:.0f}$\pm${100* relative_diff_std:.0f}" + r"$\%$"

fig.suptitle(f"{fit.name}\ncorr: {str_corr} diff: {str_total_diff} rel.diff: {str_relative_diff}")
fig.tight_layout()

### Plot all fits

In [None]:
# Log-spaced evaluation grid (100 radii between 0.1e-6 and 3e-3) on which all
# fits are evaluated.
r = np.geomspace(0.1e-6, 3e-3, 100)
t_test = xr.DataArray(data=r, coords={"radius": r}, dims=["radius"])
# Centred bin width w[i] = (t[i+1] - t[i-1]) / 2.
# The shift construction alone yields t[i+1] - t[i-1], i.e. the sum of BOTH
# neighbouring spacings; without the factor 0.5 the widths add up to twice the
# covered radius range and every sum(m * w) is twice the intended integral.
w_test = 0.5 * (t_test - t_test.shift(radius=2)).shift(radius=-1)
# The first and last width are undefined (NaN) after shifting; fill them by
# linear extrapolation.
w_test = w_test.interpolate_na("radius", method="linear", fill_value="extrapolate")

In [None]:
# Evaluate the "MSD nova" fit of every cloud in `fits_allradii` on the common
# test grid and gather the curves together with the optimiser diagnostics.
x, m, cost, opt = [], [], [], []
for cloud_id in fits_allradii:
    msd_fit = fits_allradii[cloud_id]["MSD nova"]

    xx, mm = msd_fit.predict(t_test)
    # scale the predicted curve with the width of each test bin
    mm = w_test * mm

    x.append(xx)
    m.append(mm)
    cost.append(msd_fit.fit_result.cost)
    opt.append(msd_fit.fit_result.optimality)

# Stack the per-cloud curves along a new "cloud_id" dimension.
x = xr.concat(x, dim="cloud_id")
m = xr.concat(m, dim="cloud_id")
y = psd_from_msd_dataarray(m)

# NOTE(review): the `cloud_id` coordinate is taken from `dataset_lwc`; this
# presumes `fits_allradii` holds the same clouds in the same order - confirm.
ds = xr.Dataset(
    data_vars={
        "radius2D": x,
        "mass_size_distribution": m,
        "particle_size_distribution": y,
        "bin_width": w_test,
    },
    coords={
        "radius": t_test,
        "cloud_id": dataset_lwc["cloud_id"],
    },
)

In [None]:
# Cloud ids to highlight (only used by the commented-out block below).
ids = [220]

# All fitted particle size distributions as thin grey lines, radius in µm.
radius_um = 1e6 * ds["radius2D"].T
psd_curves = ds["particle_size_distribution"].T
plt.plot(radius_um, psd_curves, color="grey", alpha=0.2)

# plt.plot(
#     ds["radius2D"].sel(cloud_id=ids).T,
#     ds["particle_size_distribution"].sel(cloud_id=ids).T,
#     alpha=1,
#     linewidth=2,
# )

# Log-log axes with fixed limits.
plt.xscale("log")
plt.yscale("log")
plt.xlim(7e-2, 2e3)
plt.ylim(1e-1, 1e9)

# Labelled major ticks every three decades, unlabelled minor ticks every decade.
plt.yticks((1e0, 1e3, 1e6), minor=False)
plt.yticks(10.0 ** np.arange(-1, 8, 1), minor=True, labels="")

plt.title("Particle Size Distribution")
plt.xlabel("Radius [µm]")
plt.ylabel("PSD [m$^{-3}$ m$^{-1}$]")

Text(0, 0.5, 'PSD [m$^{-3}$ m$^{-1}$]')

# Meeting 14.11.2024 

In [None]:
# Draw a random cloud whose liquid water content per second of duration
# exceeds 0.4 ...
lwc_per_second = identified_clouds["liquid_water_content"] / identified_clouds["duration"].dt.seconds
candidate_clouds = identified_clouds.where(lwc_per_second > 0.4, drop=True)
# other criteria that were tried:
#   lwc_per_second > 1
#   identified_clouds['duration'].dt.seconds > 40
cloud_id = rng.choice(candidate_clouds["cloud_id"])

# ... but use a fixed, hand-picked cloud for the plots that follow.
# cloud_id = 217 # both good
cloud_id = 412  # both good but MSD much higher LWC

In [None]:
# np.random.seed(42)

# Observations of the selected cloud.
train_data = match_clouds_and_cloudcomposite(
    identified_clouds.sel(cloud_id=cloud_id),
    coarse_composite,
)

t_train = train_data["radius2D"]
y_train = train_data["particle_size_distribution"]
m_train = train_data["mass_size_distribution"]
w_train = train_data["bin_width"]
lwc_train = train_data["liquid_water_content"].mean("time")

# Restrict the evaluation grid to the radii where particles were measured,
# extended by 50 % at the upper end.
radii_measured = train_data.max("time")["particle_size_distribution"] > 0
end = train_data["radius"].where(radii_measured).max("radius")
start = train_data["radius"].where(radii_measured).min("radius")
end = 1.5 * end
# create a log spaced array of radii
r = np.geomspace(start, end, 1000)
t_test = xr.DataArray(data=r, coords={"radius": r}, dims=["radius"])
# Centred bin width w[i] = (t[i+1] - t[i-1]) / 2.
# The shift construction alone yields t[i+1] - t[i-1], i.e. the sum of BOTH
# neighbouring spacings; without the factor 0.5 the widths add up to twice the
# covered radius range and every LWC computed as sum(m * w) is doubled.
w_test = 0.5 * (t_test - t_test.shift(radius=2)).shift(radius=-1)
# The first and last width are undefined (NaN) after shifting; fill them by
# linear extrapolation.
w_test = w_test.interpolate_na("radius", method="linear", fill_value="extrapolate")
# t_test = t_test.where(np.isfinite(w_test), drop=True)
# w_test = w_test.where(np.isfinite(w_test), drop=True)

# --- fit to the particle size distribution (PSD) ---
x0_psd = np.array([8e-6, 2, 1e10, 200e-6, 2, 1e6])
bounds_psd = Bounds(
    # mu1, sig1, sc1, mu2, sig2, sc2
    lb=[1e-6, 1.1, 1e7, 200e-6, 1.1, 1e0],
    ub=[10e-6, 3.0, 1e13, 0.5e-3, 3.0, 1e8],
    # keep_feasible = [True, True, True, False, True, True]
)
lqs_psd = LeastSquareFit(
    name="PSD",
    fit_kwargs=dict(loss="linear", kwargs=dict(variance=1)),
    func=double_ln_normal_distribution,
    cost_func=double_ln_normal_distribution_cost,
    x0=x0_psd,
    bounds=bounds_psd,
    t_train=t_train.mean("time"),
    y_train=y_train.mean("time"),
    plot_kwargs=dict(color=default_colors[0], linestyle="-"),
)

setups_psd = [lqs_psd]


# --- fit to the mass size distribution (MSD) ---
# same parameter order as above: mu1, sig1, sc1, mu2, sig2, sc2
x0_msd = np.array([8e-6, 2, 1e-1, 300e-6, 2, 1e0])
bounds_msd = Bounds(
    lb=[1e-6, 1.1, 1e-3, 200e-6, 1.3, 1e-3],
    ub=[10e-6, 3.0, 1e2, 0.5e-3, 3.0, 1e1],
    # keep_feasible = [True, True, True, False, True, True]
)

lqs_msd_var1 = LeastSquareFit(
    name="MSD nova",
    fit_kwargs=dict(loss="linear", kwargs=dict(variance=1)),
    func=double_ln_normal_distribution,
    cost_func=double_ln_normal_distribution_cost,
    x0=x0_msd,
    bounds=bounds_msd,
    t_train=t_train.mean("time"),
    y_train=m_train.mean("time"),
    plot_kwargs=dict(color=dark_colors[1], linestyle="-"),
)


# softl1_msd = LeastSquareFit(
#     name = "MSD Soft",
#     fit_kwargs=dict(loss="soft_l1", f_scale=0.1, kwargs=dict(variance=1)),
#     func = double_ln_normal_distribution,
#     cost_func = double_ln_normal_distribution_cost,
#     x0 = x0_msd,
#     bounds= bounds_msd,
#     t_train= t_train.mean('time'),
#     y_train= m_train.mean('time'),
#     plot_kwargs=dict(color=default_colors[2], linestyle="-")
#     )


setups_msd = [
    lqs_msd_var1,
]  # , soft_l1, soft_l1_mean]
setups = setups_psd + setups_msd

# Run every configured fit.
# NOTE(review): presumably `repetitions` restarts the optimisation several
# times - confirm against `LeastSquareFit.fit`.
for doublefit in setups:
    doublefit.fit(repetitions=10)

In [None]:
# --- Fits against observations: log radius axis, symlog value axes ---
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 6), sharex=True)
ax0, ax1 = axs

# Observations: every time step in grey, the time mean in black.
# upper panel: particle size distribution, lower panel: mass size distribution
for panel, observed in ((ax0, y_train), (ax1, m_train)):
    panel.scatter(t_train, observed, marker=".", color="grey", alpha=0.5)
    panel.scatter(t_train.mean("time"), observed.mean("time"), marker="o", color="k")

# Fits to the PSD: the MSD is derived from the predicted PSD.
for fit in setups_psd:
    w = w_test
    t, y = fit.predict(t_test)
    m = msd_from_psd_dataarray(y)
    lwc_psd = 1e3 * (m * w).sum("radius").values

    legend_label = f"{fit.name} LWC:{lwc_psd:.2f}"
    ax0.plot(t, y, label=legend_label, linewidth=2, **fit.plot_kwargs)
    ax1.plot(t, m, label=legend_label, linewidth=2, **fit.plot_kwargs)

# Fits to the MSD: the PSD is derived from the predicted MSD.
for fit in setups_msd:
    w = w_test
    t, m = fit.predict(t_test)
    y = psd_from_msd_dataarray(m)
    lwc_msd = 1e3 * (m * w).sum("radius").values

    legend_label = f"{fit.name} LWC:{lwc_msd:.2f}"
    ax0.plot(t, y, label=legend_label, linewidth=2, **fit.plot_kwargs)
    ax1.plot(t, m, label=legend_label, linewidth=2, **fit.plot_kwargs)


fig.suptitle(f"Fits for cloud {cloud_id}: Desired={lwc_train.values:.2f}", fontsize="medium")

# Axis scaling and limits.
for _ax in axs:
    _ax.set_xscale("log")
    _ax.set_xlim(1e-6, 3.5e-3)
ax0.set_yscale("symlog", linthresh=1.1e3, linscale=0.1)
ax1.set_yscale("symlog", linthresh=1.1e-7, linscale=0.1)

ax0.set_ylim(1e-12, 1e14)
ax1.set_ylim(1e-12, 0.5e2)

ax0.legend(fontsize="small", loc="upper center")
ax1.legend(fontsize="small", loc="upper center")

ax0.set_ylabel(label_from_attrs(y_train, linebreak=True), fontsize="small")
ax1.set_ylabel(label_from_attrs(m_train, linebreak=True), fontsize="small")

Text(0, 0.5, 'Mass concentration\n$\\left[  kg m^{-3} m^{-1}  \\right]$')

In [None]:
# --- Fits against observations: log radius axis, linear value axes ---
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 6), sharex=True)
ax0, ax1 = axs

# Observations: every time step in grey, the time mean in black.
# upper panel: particle size distribution, lower panel: mass size distribution
for panel, observed in ((ax0, y_train), (ax1, m_train)):
    panel.scatter(t_train, observed, marker=".", color="grey", alpha=0.5)
    panel.scatter(t_train.mean("time"), observed.mean("time"), marker="o", color="k")

# Fits to the PSD: the MSD is derived from the predicted PSD.
for fit in setups_psd:
    w = w_test
    t, y = fit.predict(t_test)
    m = msd_from_psd_dataarray(y)
    lwc_psd = 1e3 * (m * w).sum("radius").values

    legend_label = f"{fit.name} LWC:{lwc_psd:.2f}"
    ax0.plot(t, y, label=legend_label, linewidth=2, **fit.plot_kwargs)
    ax1.plot(t, m, label=legend_label, linewidth=2, **fit.plot_kwargs)

# Fits to the MSD: the PSD is derived from the predicted MSD.
for fit in setups_msd:
    w = w_test
    t, m = fit.predict(t_test)
    y = psd_from_msd_dataarray(m)
    lwc_msd = 1e3 * (m * w).sum("radius").values

    legend_label = f"{fit.name} LWC:{lwc_msd:.2f}"
    ax0.plot(t, y, label=legend_label, linewidth=2, **fit.plot_kwargs)
    ax1.plot(t, m, label=legend_label, linewidth=2, **fit.plot_kwargs)


fig.suptitle(f"Fits for cloud {cloud_id}: Desired={lwc_train.values:.2f}", fontsize="medium")

# Axis scaling and limits.
for _ax in axs:
    _ax.set_xscale("log")
    _ax.set_xlim(1e-6, 3.5e-3)
ax0.set_yscale("linear")
ax1.set_yscale("linear")

ax0.set_ylim(0, 4e11)
ax1.set_ylim(0, 1.5)

ax0.legend(fontsize="small", loc="upper center")
ax1.legend(fontsize="small", loc="upper center")

ax0.set_ylabel(label_from_attrs(y_train, linebreak=True), fontsize="small")
ax1.set_ylabel(label_from_attrs(m_train, linebreak=True), fontsize="small")

Text(0, 0.5, 'Mass concentration\n$\\left[  kg m^{-3} m^{-1}  \\right]$')

In [None]:
# --- Fits against observations: linear radius axis, linear value axes ---
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 6), sharex=True)
ax0, ax1 = axs

# Observations: every time step in grey, the time mean in black.
# upper panel: particle size distribution, lower panel: mass size distribution
for panel, observed in ((ax0, y_train), (ax1, m_train)):
    panel.scatter(t_train, observed, marker=".", color="grey", alpha=0.5)
    panel.scatter(t_train.mean("time"), observed.mean("time"), marker="o", color="k")

# Fits to the PSD: the MSD is derived from the predicted PSD.
for fit in setups_psd:
    w = w_test
    t, y = fit.predict(t_test)
    m = msd_from_psd_dataarray(y)
    lwc_psd = 1e3 * (m * w).sum("radius").values

    legend_label = f"{fit.name} LWC:{lwc_psd:.2f}"
    ax0.plot(t, y, label=legend_label, linewidth=2, **fit.plot_kwargs)
    ax1.plot(t, m, label=legend_label, linewidth=2, **fit.plot_kwargs)

# Fits to the MSD: the PSD is derived from the predicted MSD.
for fit in setups_msd:
    w = w_test
    t, m = fit.predict(t_test)
    y = psd_from_msd_dataarray(m)
    lwc_msd = 1e3 * (m * w).sum("radius").values

    legend_label = f"{fit.name} LWC:{lwc_msd:.2f}"
    ax0.plot(t, y, label=legend_label, linewidth=2, **fit.plot_kwargs)
    ax1.plot(t, m, label=legend_label, linewidth=2, **fit.plot_kwargs)


fig.suptitle(f"Fits for cloud {cloud_id}: Desired={lwc_train.values:.2f}", fontsize="medium")

# Axis scaling and limits.
for _ax in axs:
    _ax.set_xscale("linear")
    _ax.set_xlim(1e-6, 3.5e-3)
ax0.set_yscale("linear")
ax1.set_yscale("linear")

ax0.set_ylim(0, 4e11)
ax1.set_ylim(0, 1.5)

ax0.legend(fontsize="small", loc="upper center")
ax1.legend(fontsize="small", loc="upper center")

ax0.set_ylabel(label_from_attrs(y_train, linebreak=True), fontsize="small")
ax1.set_ylabel(label_from_attrs(m_train, linebreak=True), fontsize="small")

Text(0, 0.5, 'Mass concentration\n$\\left[  kg m^{-3} m^{-1}  \\right]$')

In [None]:
# --- Plot the fits as per-bin quantities (distribution times bin width) ---
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(12, 6), sharex=True)
ax0, ax1 = axs

# # plot the particle size distribution
# ax0.scatter(t_train, w_train * y_train, marker=".", color="grey", alpha=0.5)
# ax0.scatter(t_train.mean("time"), w_train * y_train.mean("time"), marker="o", color="k")

# # plot the mass size distribution
# ax1.scatter(t_train, w_train * m_train, marker=".", color="grey", alpha=0.5)
# ax1.scatter(t_train.mean("time"), w_train * m_train.mean("time"), marker="o", color="k")


# create a linearly spaced array of radii (np.linspace, not log spaced)
r = np.linspace(start, end, 10000)
t_test = xr.DataArray(data=r, coords={"radius": r}, dims=["radius"])
# Centred bin width w[i] = (t[i+1] - t[i-1]) / 2.
# The shift construction alone yields t[i+1] - t[i-1], i.e. the sum of BOTH
# neighbouring spacings; without the factor 0.5 the widths add up to twice the
# covered radius range and the per-bin values and LWC below are doubled.
w_test = 0.5 * (t_test - t_test.shift(radius=2)).shift(radius=-1)
# The first and last width are undefined (NaN) after shifting; fill them by
# linear extrapolation.
w_test = w_test.interpolate_na("radius", method="linear", fill_value="extrapolate")
# t_test = t_test.where(np.isfinite(w_test), drop=True)
# w_test = w_test.where(np.isfinite(w_test), drop=True)


# --- Fits to the PSD ---
for fit in setups_psd:
    # t, w = t_train.mean('time'), w_train
    t, w = t_test, w_test

    t, y = fit.predict(t)
    # convert the per-bin number (y * w) to mass, then back to a density
    m = msd_from_psd_dataarray(y * w)
    m = m / w
    lwc_psd = 1e3 * (m * w).sum("radius").values

    ax0.plot(t, w * y, label=f"{fit.name} LWC:{lwc_psd:.2f}", linewidth=2, **fit.plot_kwargs)
    ax1.plot(t, w * m, label=f"{fit.name} LWC:{lwc_psd:.2f}", linewidth=2, **fit.plot_kwargs)

# --- Fits to the MSD ---
for fit in setups_msd:
    # t, w = t_train.mean('time'), w_train
    t, w = t_test, w_test

    t, m = fit.predict(t)
    # convert the per-bin mass (m * w) to number, then back to a density
    y = psd_from_msd_dataarray(m * w)
    y = y / w
    lwc_msd = 1e3 * (m * w).sum("radius").values
    ax0.plot(t, w * y, label=f"{fit.name} LWC:{lwc_msd:.2f}", linewidth=2, **fit.plot_kwargs)
    ax1.plot(t, w * m, label=f"{fit.name} LWC:{lwc_msd:.2f}", linewidth=2, **fit.plot_kwargs)


fig.suptitle(f"Fits for cloud {cloud_id}: Desired={lwc_train.values:.2f}", fontsize="medium")

for _ax in axs:
    # _ax.set_xscale("log")
    _ax.set_xlim(1e-6, 2e-3)
axs[0].set_yscale("log")
axs[1].set_yscale("linear")

# axs[0].set_ylim(0, 4e11)
# axs[1].set_ylim(0, 0.0001)
axs[0].legend(fontsize="small", loc="upper center")
axs[1].legend(fontsize="small", loc="upper center")

# Per-bin quantities: the "per metre radius" part of the units drops out.
axs[0].set_ylabel(label_from_attrs(y_train, return_units=False) + r"[$m^{-3}$]", fontsize="small")
axs[1].set_ylabel(label_from_attrs(m_train, return_units=False) + r"[$kg m^{-3}$]", fontsize="small")
axs[1].set_xlabel("Radius [m]", fontsize="small")

Text(0.5, 0, 'Radius [m]')