Splits the inputs into training and test sets and normalizes them (reads from and writes to /glade/work/milesep/convective_outlook_ml)

In [1]:
import xarray as xr
import numpy as np

In [2]:
# Tag naming the input dataset variant to process; it is spliced into every
# input and output file name used in this notebook.
detail = 'slgt_small'

In [9]:
# Open the raw (un-normalized) inputs Zarr store for the selected variant.
ds = xr.open_zarr("/glade/work/milesep/convective_outlook_ml/inputs_raw_" + detail + ".zarr")

In [10]:
# Load the dataset's data into memory. xarray's load() works in place and
# returns the same dataset, which is why the cell also displays it.
ds.load()

In [5]:
def estimate_dataset_size_bytes(ds):
    """Return the total uncompressed size, in bytes, of all data variables.

    Variables that report chunks contribute their ``nbytes``; all others
    contribute ``number of elements * dtype item size``.
    """

    def _variable_bytes(variable):
        if variable.chunks is not None:
            return variable.nbytes
        # Multiply the shape out in 64-bit integers so large arrays cannot
        # overflow a narrower default integer type.
        element_count = np.prod(variable.shape, dtype=np.int64)
        item_size = np.dtype(variable.dtype).itemsize
        return int(element_count * item_size)

    return sum(_variable_bytes(variable) for variable in ds.data_vars.values())


# Report the estimated footprint of the raw inputs in decimal gigabytes
# (bytes / 1e9), rounded to two decimals.
size_bytes = estimate_dataset_size_bytes(ds)
print(f"Estimated uncompressed size: {size_bytes / 1e9:.2f} GB")


Estimated uncompressed size: 4.17 GB


In [7]:
# Chronological split: 2002-2019 is used for training, 2020-2024 is held out
# for testing.
train_period = slice('2002-01-01', '2019-12-31')
test_period = slice('2020-01-01', '2024-12-31')

train_ds = ds.sel(day=train_period)
# Only the test split is rechunked (10 days per chunk); the training split
# keeps whatever chunking ``ds`` already has.
test_ds = ds.sel(day=test_period).chunk({'day': 10})

In [8]:
# Per-day mean and std of every input variable
def compute_daily_stats(ds):
    """Compute the per-day mean and standard deviation of each data variable.

    Every variable is grouped by ``day`` and reduced over whichever of the
    ``latitude``, ``longitude`` and ``tod`` dimensions it actually carries.
    Dimensions outside that set (for example ``level``) are not reduced, so
    they remain on the resulting statistics.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset whose data variables can be grouped by ``day``.

    Returns
    -------
    xarray.Dataset
        Two variables per input variable, named ``<var>_mean`` and
        ``<var>_std``.

    Raises
    ------
    ValueError
        If a variable has none of the reducible dimensions, since there
        would be nothing to average over within a day.
    """
    reducible = ('latitude', 'longitude', 'tod')
    day_stats = {}

    for var in ds.data_vars:
        da = ds[var]

        # Reduce only over dimensions the variable really has. Unconditionally
        # adding 'tod' makes xarray raise for variables without that dimension.
        reduce_dims = [dim for dim in da.dims if dim in reducible]
        if not reduce_dims:
            raise ValueError(
                f"Variable {var!r} has none of the reducible dimensions "
                f"{reducible}; cannot compute daily statistics."
            )

        # Build the groupby once and reuse it for both statistics.
        grouped = da.groupby('day')
        mean = grouped.mean(dim=reduce_dims, skipna=True)
        std = grouped.std(dim=reduce_dims, skipna=True)

        day_stats[f'{var}_mean'] = mean.rename(f'{var}_mean')
        day_stats[f'{var}_std'] = std.rename(f'{var}_std')

    return xr.Dataset(day_stats)


# Compute the per-day statistics from the training split only, so that no
# test-period data enters the normalization constants.
daily_stats_ds = compute_daily_stats(train_ds)

In [9]:
# Inspect the per-day statistics (one <var>_mean and one <var>_std array per
# input variable).
daily_stats_ds

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.30 kiB 4 B Shape (332,) (1,) Dask graph 332 chunks in 5 graph layers Data type float32 numpy.ndarray",332  1,

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.30 kiB 4 B Shape (332,) (1,) Dask graph 332 chunks in 6 graph layers Data type float32 numpy.ndarray",332  1,

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.30 kiB 4 B Shape (332,) (1,) Dask graph 332 chunks in 5 graph layers Data type float32 numpy.ndarray",332  1,

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.30 kiB 4 B Shape (332,) (1,) Dask graph 332 chunks in 6 graph layers Data type float32 numpy.ndarray",332  1,

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.30 kiB 4 B Shape (332,) (1,) Dask graph 332 chunks in 5 graph layers Data type float32 numpy.ndarray",332  1,

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.30 kiB 4 B Shape (332,) (1,) Dask graph 332 chunks in 6 graph layers Data type float32 numpy.ndarray",332  1,

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.30 kiB 4 B Shape (332,) (1,) Dask graph 332 chunks in 5 graph layers Data type float32 numpy.ndarray",332  1,

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.30 kiB 4 B Shape (332,) (1,) Dask graph 332 chunks in 6 graph layers Data type float32 numpy.ndarray",332  1,

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 6.48 kiB 20 B Shape (5, 332) (5, 1) Dask graph 332 chunks in 5 graph layers Data type float32 numpy.ndarray",332  5,

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 6.48 kiB 20 B Shape (5, 332) (5, 1) Dask graph 332 chunks in 6 graph layers Data type float32 numpy.ndarray",332  5,

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.30 kiB 4 B Shape (332,) (1,) Dask graph 332 chunks in 5 graph layers Data type float32 numpy.ndarray",332  1,

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.30 kiB 4 B Shape (332,) (1,) Dask graph 332 chunks in 6 graph layers Data type float32 numpy.ndarray",332  1,

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 6.48 kiB 20 B Shape (5, 332) (5, 1) Dask graph 332 chunks in 5 graph layers Data type float32 numpy.ndarray",332  5,

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 6.48 kiB 20 B Shape (5, 332) (5, 1) Dask graph 332 chunks in 6 graph layers Data type float32 numpy.ndarray",332  5,

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 6.48 kiB 20 B Shape (5, 332) (5, 1) Dask graph 332 chunks in 5 graph layers Data type float32 numpy.ndarray",332  5,

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 6.48 kiB 20 B Shape (5, 332) (5, 1) Dask graph 332 chunks in 6 graph layers Data type float32 numpy.ndarray",332  5,

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 6.48 kiB 20 B Shape (5, 332) (5, 1) Dask graph 332 chunks in 5 graph layers Data type float32 numpy.ndarray",332  5,

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 6.48 kiB 20 B Shape (5, 332) (5, 1) Dask graph 332 chunks in 6 graph layers Data type float32 numpy.ndarray",332  5,

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.30 kiB 4 B Shape (332,) (1,) Dask graph 332 chunks in 5 graph layers Data type float32 numpy.ndarray",332  1,

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.30 kiB 4 B Shape (332,) (1,) Dask graph 332 chunks in 6 graph layers Data type float32 numpy.ndarray",332  1,

Unnamed: 0,Array,Chunk
Bytes,1.30 kiB,4 B
Shape,"(332,)","(1,)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 6.48 kiB 20 B Shape (5, 332) (5, 1) Dask graph 332 chunks in 5 graph layers Data type float32 numpy.ndarray",332  5,

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 6.48 kiB 20 B Shape (5, 332) (5, 1) Dask graph 332 chunks in 6 graph layers Data type float32 numpy.ndarray",332  5,

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 6.48 kiB 20 B Shape (5, 332) (5, 1) Dask graph 332 chunks in 5 graph layers Data type float32 numpy.ndarray",332  5,

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 6.48 kiB 20 B Shape (5, 332) (5, 1) Dask graph 332 chunks in 6 graph layers Data type float32 numpy.ndarray",332  5,

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 6.48 kiB 20 B Shape (5, 332) (5, 1) Dask graph 332 chunks in 5 graph layers Data type float32 numpy.ndarray",332  5,

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 5 graph layers,332 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 6.48 kiB 20 B Shape (5, 332) (5, 1) Dask graph 332 chunks in 6 graph layers Data type float32 numpy.ndarray",332  5,

Unnamed: 0,Array,Chunk
Bytes,6.48 kiB,20 B
Shape,"(5, 332)","(5, 1)"
Dask graph,332 chunks in 6 graph layers,332 chunks in 6 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [10]:
# Materialize the daily statistics, then save them to NetCDF.
# NOTE(review): this output path is relative to the notebook's working
# directory, unlike the absolute /glade/work/... paths used for the Zarr
# stores — confirm data/processed_data/ exists wherever this is run.
daily_stats_ds = daily_stats_ds.compute()
daily_stats_ds.to_netcdf("data/processed_data/daily_input_stats_" + detail + ".nc")

In [11]:
def compute_overall_from_daily_stats(daily_stats_ds):
    """Collapse per-day statistics into one mean and std per variable.

    For every ``<name>_mean`` / ``<name>_std`` pair in ``daily_stats_ds``:

    * the overall mean is the mean of the daily means along ``day``;
    * the overall std follows the law of total variance,
      ``sqrt(mean(std_i**2 + (mu_i - mu_total)**2))`` along ``day``.

    NaNs are skipped in both reductions. Returns an ``xr.Dataset`` that
    reuses the ``<name>_mean`` / ``<name>_std`` naming.
    """
    suffix = "_mean"
    pooled = {}

    for name in daily_stats_ds.data_vars:
        # Each pair is handled once, when its "_mean" member is reached.
        if not name.endswith(suffix):
            continue
        stem = name[:-len(suffix)]

        daily_mean = daily_stats_ds[f"{stem}_mean"]
        daily_std = daily_stats_ds[f"{stem}_std"]

        grand_mean = daily_mean.mean(dim="day", skipna=True)

        # Within-day variance plus the spread of the daily means around the
        # overall mean.
        within_day = daily_std**2
        between_days = (daily_mean - grand_mean)**2
        grand_std = ((within_day + between_days).mean(dim="day", skipna=True))**0.5

        pooled[f"{stem}_mean"] = grand_mean
        pooled[f"{stem}_std"] = grand_std

    return xr.Dataset(pooled)

In [12]:
def standardize_with_stats(ds, stats):
    """Standardize every data variable of ``ds`` to z-scores using ``stats``.

    For each variable ``v`` the result is
    ``(ds[v] - stats["v_mean"]) / stats["v_std"]``. Where the supplied
    standard deviation is exactly zero (a constant variable) the divisor is
    replaced by one, so the output is finite instead of NaN or inf. NaN
    standard deviations are left as they are and still propagate.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset to standardize.
    stats : mapping or xarray.Dataset
        Must provide ``<var>_mean`` and ``<var>_std`` for every data
        variable of ``ds``.

    Returns
    -------
    xarray.Dataset
        Standardized variables under their original names.

    Raises
    ------
    KeyError
        Propagated from the lookup when ``stats`` lacks an entry for a
        variable.
    """
    standardized = {}
    for var in ds.data_vars:
        mean = stats[f"{var}_mean"]
        std = stats[f"{var}_std"]
        # ``std == 0`` is a boolean that counts as 1 when added, so zero
        # entries become 1 while non-zero and NaN entries are unchanged.
        # Adding a boolean does not widen the floating-point dtype.
        safe_std = std + (std == 0)
        standardized[var] = (ds[var] - mean) / safe_std
    return xr.Dataset(standardized)

In [13]:
# Collapse the per-day training statistics into one mean/std per variable.
train_stats = compute_overall_from_daily_stats(daily_stats_ds)

In [14]:
# Both splits are scaled with the training statistics; the test split is
# never used to derive normalization constants.
train_ds_std = standardize_with_stats(train_ds, train_stats)
test_ds_std = standardize_with_stats(test_ds, train_stats)

In [15]:
# Write the standardized training inputs; mode="w" replaces any existing
# store at this path.
train_ds_std.to_zarr("/glade/work/milesep/convective_outlook_ml/train_inputs_" + detail + ".zarr", mode="w")

<xarray.backends.zarr.ZarrStore at 0x1524cb190d40>

In [16]:
# Write the standardized test inputs; mode="w" replaces any existing store
# at this path.
test_ds_std.to_zarr("/glade/work/milesep/convective_outlook_ml/test_inputs_" + detail + ".zarr", mode="w")

<xarray.backends.zarr.ZarrStore at 0x1524cb13a640>