# Notebook for testing the statistical significance of differences in MCS-associated precipitation (from the MCS tracking results) between the preindustrial (PI) and Pliocene (Plio) simulations, for Fig. 2

In [1]:
import glob
import os

import numpy as np
import xarray as xr
from scipy.stats import mannwhitneyu

# Input directories for the two simulations (PI = preindustrial, Plio = Pliocene).
# The analysis cell globs these for mcs_day_prect_PI_*.nc / mcs_day_prect_Plio_*.nc.
PI_DIR = "/glade/work/malbright/final_nam_manuscript_files/mcs/preindustrial/processing/"
PLIO_DIR = "/glade/work/malbright/final_nam_manuscript_files/mcs/pliocene/processing/"
# Destination netCDF file for the per-grid-cell Mann-Whitney U results.
OUTFILE = "/glade/work/malbright/final_nam_manuscript_files/significance/mcs_precip_mannwhitney_utest.nc"

# Name of the data variable selected from every input file.
# NOTE(review): this looks like the placeholder name xarray uses when an unnamed
# DataArray is written to netCDF -- confirm against the files.
VAR_NAME = "__xarray_dataarray_variable__"
# Significance level. Not referenced by any cell visible in this notebook;
# presumably intended for thresholding p-values downstream -- TODO confirm or remove.
ALPHA_DEFAULT = 0.05

In [2]:
def _mw_u_and_p(a, b):
    """
    a, b: 1D numpy arrays for one grid cell across years (may contain NaNs)
    returns: (u_statistic, p_value, n_a, n_b)
    """
    a = np.asarray(a)
    b = np.asarray(b)

    a = a[np.isfinite(a)]
    b = b[np.isfinite(b)]

    n_a = a.size
    n_b = b.size

    # Need at least 1 sample in each group
    if (n_a < 1) or (n_b < 1):
        return np.nan, np.nan, n_a, n_b

    # Mann–Whitney U (two-sided). method="auto" handles ties/large samples appropriately.
    res = mannwhitneyu(a, b, alternative="two-sided", method="auto")
    return float(res.statistic), float(res.pvalue), n_a, n_b

In [None]:
def _load_stacked(files, concat_dim):
    """Open ``files``, stack them along a new ``concat_dim`` and load VAR_NAME.

    The multi-file dataset is opened in a ``with`` block so its file handles
    are released once the values are in memory. Returns the in-memory
    DataArray for VAR_NAME.
    """
    with xr.open_mfdataset(
        files, combine="nested", concat_dim=concat_dim, parallel=False
    ) as ds:
        return ds[VAR_NAME].load()


# Collect input files in a deterministic (sorted) order.
pi_files = sorted(glob.glob(os.path.join(PI_DIR, "mcs_day_prect_PI_*.nc")))
plio_files = sorted(
    glob.glob(os.path.join(PLIO_DIR, "mcs_day_prect_Plio_*.nc"))
)

# Fail early with a clear message instead of an opaque open_mfdataset error.
if not pi_files:
    raise FileNotFoundError(f"No PI files found in {PI_DIR}")
if not plio_files:
    raise FileNotFoundError(f"No Plio files found in {PLIO_DIR}")

# Load and concatenate along separate per-simulation dimensions so the two
# samples are never aligned against each other.
print("Loading PI data...")
pi = _load_stacked(pi_files, "year_pi")

print("Loading Plio data...")
plio = _load_stacked(plio_files, "year_plio")

# Ensure we have lat/lon coordinates in the output
lat = pi["lat"]
lon = pi["lon"]

# Apply Mann-Whitney per grid cell. Both the concat dimension and "time" are
# core dims, so each call receives every value of that cell from one
# simulation; "time" is excluded from alignment because its length/labels may
# differ between the two inputs.
print("Running Mann-Whitney U test...")
u_stat, p_val, n_pi, n_plio = xr.apply_ufunc(
    _mw_u_and_p,
    pi,
    plio,
    input_core_dims=[["year_pi", "time"], ["year_plio", "time"]],
    output_core_dims=[[], [], [], []],
    vectorize=True,
    dask="forbidden",
    output_dtypes=[float, float, int, int],
    exclude_dims={"time"},
)
print("Calculated!")

# The count variables keep their historical names for downstream readers, but
# they hold the number of finite samples pooled over the stacking and time
# dimensions, which is only a number of years when "time" has length 1.
n_pi = n_pi.assign_attrs(
    long_name="number of finite PI samples used (pooled over year_pi and time)"
)
n_plio = n_plio.assign_attrs(
    long_name="number of finite Plio samples used (pooled over year_plio and time)"
)

ds_out = xr.Dataset(
    data_vars={
        "u_statistic": u_stat,
        "p_value": p_val,
        "n_years_pi_used": n_pi,
        "n_years_plio_used": n_plio,
    },
    coords={"lat": lat, "lon": lon},
    attrs={
        "test": "Mann-Whitney U",
        "alternative": "two-sided",
        "nan_handling": "NaNs removed independently per grid cell",
        "pi_source_dir": PI_DIR,
        "plio_source_dir": PLIO_DIR,
        "note": "Significance for differences in MCS-associated precipitation between PI and Plio at each lat/lon",
    },
)

# Create the output directory; skip when OUTFILE has no directory component
# because os.makedirs("") raises.
outdir = os.path.dirname(OUTFILE)
if outdir:
    os.makedirs(outdir, exist_ok=True)

# Encodings: float32 is ample for the statistics; the counts use int32 because
# pooled year x time sample counts can exceed the int16 maximum (32767) and
# would otherwise wrap silently on write.
enc = {
    "u_statistic": {"dtype": "float32", "_FillValue": np.float32(np.nan)},
    "p_value": {"dtype": "float32", "_FillValue": np.float32(np.nan)},
    "n_years_pi_used": {"dtype": "int32"},
    "n_years_plio_used": {"dtype": "int32"},
}

print("Saving output...")
ds_out.to_netcdf(OUTFILE, encoding=enc)
print(f"Wrote: {OUTFILE}")

Loading PI data...
Loading Plio data...
Running Mann-Whitney U test...
Calculated!
Saving output...
Wrote: /glade/work/malbright/final_nam_manuscript_files/significance/mcs_precip_mannwhitney_utest.nc
