# Figure 4

Histograms of the brightness temperature (TB) departure between clear-sky observations and forward simulations.

In [None]:
from string import ascii_lowercase as abc

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import xarray as xr
from lizard.readers.band_pass import read_band_pass
from lizard.writers.figure_to_file import write_figure
from sklearn.metrics import r2_score

from si_clouds.io.readers.ancillary import read_ancillary_data
from si_clouds.io.readers.oem_result import read_oem_result_concat

In [None]:
# ancillary data; provides the retrieval validity flag (ix_retrieval_valid)
# that is used to filter the retrieval results
ds_anc = read_ancillary_data()
# band pass information for "HAMP"; provides the channel labels shown in the
# figure panels
ds_bp = read_band_pass("HAMP")

In [None]:
# R1 results ("un" suffix): a priori (ds_a_*) and optimal (ds_op_*) datasets;
# the remaining two return values are not needed here
ds_a_un, ds_op_un, _, _ = read_oem_result_concat(
    version="pub_r1_clearsky_v1", test_id="", write=False
)

# R2 results ("fu" suffix), read the same way
ds_a_fu, ds_op_fu, _, _ = read_oem_result_concat(
    version="pub_r2_clearsky_v1", test_id="", write=False
)

# remove times where the retrieval was not valid: the validity flag is looked
# up at each dataset's own time stamps and then used to select along time
ds_a_un, ds_op_un, ds_a_fu, ds_op_fu = [
    ds_oem.sel(time=ds_anc.ix_retrieval_valid.sel(time=ds_oem.time))
    for ds_oem in (ds_a_un, ds_op_un, ds_a_fu, ds_op_fu)
]

In [None]:
# Align the uncalibrated (R1) and full (R2) retrieval data so that all four
# datasets share the same index labels. join="inner" is xarray's default and
# keeps only labels that are present in every dataset.
(
    ds_a_un_aligned,
    ds_a_fu_aligned,
    ds_op_un_aligned,
    ds_op_fu_aligned,
) = xr.align(ds_a_un, ds_a_fu, ds_op_un, ds_op_fu, join="inner")

In [None]:
# time-mean of the `conv` variable for R1 and R2 (before alignment)
# NOTE(review): presumably `conv` is a convergence flag, so this is the
# fraction of converged retrievals — confirm against the OEM result reader
print(ds_a_un.conv.mean("time").item())
print(ds_a_fu.conv.mean("time").item())

In [None]:
# number of time steps per run before alignment (R2 first, then R1)
print(len(ds_a_fu.time))
print(len(ds_a_un.time))

In [None]:
# fraction of the R1 time steps that are retained after aligning the runs
print(len(ds_a_un_aligned.time) / len(ds_a_un.time))

In [None]:
# line/fill colours per retrieval run: "UNC" is used for the R1 histograms,
# "FUL" for the R2 histograms
colors = dict(UNC="C0", FUL="C1")

In [None]:
# Print bias, MAE, RMSE and R^2 of simulated vs. observed TB for every
# combination of run/state (R1/R2, a priori/optimal) and channel.
data_names = ["R1 (a priori)", "R2 (a priori)", "R1 (optimal)", "R2 (optimal)"]
for name, ds in zip(
    data_names,
    [ds_a_un_aligned, ds_a_fu_aligned, ds_op_un_aligned, ds_op_fu_aligned],
):
    # departure (observation minus simulation), computed once per dataset
    departure = ds.y_obs - ds.y_sim

    for channel in ds_a_un_aligned.channel.values:
        departure_ch = departure.sel(channel=channel)

        bias = departure_ch.mean().values
        mae = np.abs(departure_ch).mean().values
        rmse = np.sqrt((departure_ch**2).mean().values)
        # r2_score(y_true, y_pred): observations are the reference
        r2 = r2_score(
            ds.y_obs.sel(channel=channel), ds.y_sim.sel(channel=channel)
        )

        print(
            f"{name} C{channel} | Bias: {bias:.1f} K | MAE: {mae:.1f} K | RMSE: {rmse:.1f} K | R$^2$: {r2:.2f}",
        )

In [None]:
# Figure 4: one panel per channel with histograms of the TB departure
# (observation minus simulation) for both runs, using the a priori state
# (outlined histograms) and the optimal state (filled histograms).
tb_bins = np.arange(-50, 50.1, 1)  # bin edges from -50 to 50 in steps of 1
x_label = "$T_{b,obs} - T_{b,sim}$ [K]"

# Histograms in drawing order: (dataset, colour key, legend label, style).
filled_style = {
    "histtype": "stepfilled",
    "linewidth": plt.rcParams["lines.linewidth"],
}
hist_specs = [
    (ds_a_un_aligned, "UNC", "R1 (a priori)", {"histtype": "step"}),
    (ds_a_fu_aligned, "FUL", "R2 (a priori)", {"histtype": "step"}),
    (ds_op_un_aligned, "UNC", "R1 (opt.)", {**filled_style, "alpha": 0.75}),
    (ds_op_fu_aligned, "FUL", "R2 (opt.)", {**filled_style, "alpha": 0.5}),
]

# NOTE: the 2 x 3 grid has room for at most six channels.
fig, axes = plt.subplots(
    2, 3, figsize=(7, 4.5), sharex=True, sharey=True, layout="constrained"
)

# Iterate over the channels of the aligned data, since that is what is plotted
# (and what the statistics printout uses as well).
for i, channel in enumerate(ds_a_un_aligned.channel.values):
    ax = axes.flat[i]

    # panel title: "(a) <channel label>", placed just above the upper left corner
    ax.annotate(
        f"({abc[i]}) {ds_bp.label.sel(channel=channel).values}",
        xy=(0, 1),
        xycoords="axes fraction",
        ha="left",
        va="bottom",
    )

    # With density=True and 1-wide bins, the bar height equals the relative
    # frequency of the samples that fall inside the bin range.
    for ds_run, color_key, label, style in hist_specs:
        ax.hist(
            (ds_run.y_obs - ds_run.y_sim).sel(channel=channel),
            bins=tb_bins,
            color=colors[color_key],
            label=label,
            density=True,
            **style,
        )

    # Effective measurement uncertainty: square root of the
    # (channel, channel) element of unc_meas_eff, taken at the update index
    # conv_i of each time step and averaged over time. Only R1 is shown
    # because there is no big difference among the retrievals.
    unc_un_a = float(
        np.sqrt(
            ds_a_un_aligned.unc_meas_eff.sel(
                channel1=channel, channel2=channel, update=ds_a_un_aligned.conv_i
            )
        ).mean("time")
    )

    ax.axvline(
        -unc_un_a,
        color="k",
        linestyle="--",
        label="Unc.",
        linewidth=0.75,
    )
    # no label on the positive line, so that "Unc." appears once in the legend
    ax.axvline(unc_un_a, color="k", linestyle="--", linewidth=0.75)

    ax.set_xlim(-20, 20)
    ax.axvline(0, color="black", linewidth=0.75)
    ax.xaxis.set_minor_locator(mticker.MultipleLocator(2))

# axis labels only on the outer panels (bottom row / left column)
for ax_bottom in axes[-1, :]:
    ax_bottom.set_xlabel(x_label)
for ax_left in axes[:, 0]:
    ax_left.set_ylabel("Frequency")

# All panels carry the same legend entries, so take them from the first panel
# explicitly instead of relying on pyplot's "current axes".
handles, labels = axes.flat[0].get_legend_handles_labels()
by_label = dict(zip(labels, handles))
fig.legend(
    by_label.values(),
    by_label.keys(),
    ncol=5,
    frameon=True,
    loc="lower center",
    bbox_to_anchor=(0.5, 1),  # legend sits on top of the figure
)

write_figure(
    fig,
    "paper/fig04.png",
    dpi=300,
    bbox_inches="tight",
)

plt.show()