### Packages + Configuration

In [1]:
# 1. Packages
import runpy
import numpy as np
import netCDF4
from pathlib import Path

# 2. Configuration: years to process, variables to extract, and the name of
#    the normalisation scheme stored alongside every output file
YEARS = list(range(2001, 2011))
VARIABLES = [
    "2m_temperature",
    "10m_u_component_of_wind",
    "10m_v_component_of_wind",
    "mean_surface_downward_long_wave_radiation_flux",
    "mean_surface_downward_short_wave_radiation_flux",
    "specific_humidity",
    "surface_pressure",
    "total_precipitation",
]
NORM_NAME = "zero_one"

# 3. Input and output paths, resolved relative to the current working directory
BASE = Path.cwd()
IN_DIR = BASE / "input_raw"
OUT_DIR = BASE / "input_labels"
OUT_DIR.mkdir(exist_ok=True)

# 4. File for saving ranges of values (useful for normalising and denormalising).
#    When the file already exists it is executed and its RANGES dictionary is
#    reloaded with every bound coerced to a plain float; otherwise we start empty.
RANGES_PATH = BASE / "ranges.py"

RANGES = {}
if RANGES_PATH.exists():
    ns = runpy.run_path(str(RANGES_PATH))
    RANGES = {
        name: [float(bounds[0]), float(bounds[1])]
        for name, bounds in ns.get("RANGES", {}).items()
    }

### Helper Functions

In [2]:
# 5. If masked values exist, turn to NaN, then cast values to float32
def to_float_nan(a):
    """Return ``a`` as a float32 ndarray with masked entries replaced by NaN.

    Parameters
    ----------
    a : array-like or numpy.ma.MaskedArray
        Input values; plain (unmasked) inputs are only cast to float32.

    Returns
    -------
    numpy.ndarray
        A float32 array; positions that were masked in ``a`` hold NaN.
    """
    if np.ma.isMaskedArray(a):
        # NaN is not a valid fill value for integer/boolean dtypes, so
        # ``filled(np.nan)`` would raise TypeError on such arrays. Promote
        # non-floating data to float32 first; floating data is filled as-is.
        if not np.issubdtype(a.dtype, np.floating):
            a = a.astype(np.float32)
        a = a.filled(np.nan)
    return np.asarray(a, dtype=np.float32)

# 6. NaN values will be filled using mean of non-NaN values
def fill_nan_mean(a: np.ndarray) -> np.ndarray:
    """Replace NaNs in ``a`` with the mean of its non-NaN entries.

    An array without any NaN is returned unchanged (same object). When every
    entry is NaN the fill value falls back to 0.0. Arrays that needed filling
    are returned as float32.
    """
    if np.isnan(a).any():
        fill = np.nanmean(a)
        # nanmean yields NaN when there is nothing left to average
        fill = 0.0 if np.isnan(fill) else float(fill)
        patched = np.nan_to_num(a, nan=fill)
        return patched.astype(np.float32, copy=False)
    return a

# 7. Generate label
def hr_to_lr_up(HR: np.ndarray, factor: int = 2) -> np.ndarray:
    """Coarsen a 2-D field by block averaging, then upsample it back.

    The field is split into ``factor x factor`` blocks, each block is replaced
    by its NaN-ignoring mean, and that mean is repeated over the block so the
    result has the resolution of the (cropped) input.

    Parameters
    ----------
    HR : numpy.ndarray
        2-D high-resolution field of shape (H, W).
    factor : int, optional
        Side length of the averaging block. Defaults to 2.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``((H // factor) * factor, (W // factor) * factor)``.
        Trailing rows/columns that do not fill a complete block are cropped,
        so the output matches the input shape only when both H and W are
        multiples of ``factor``.

    Raises
    ------
    ValueError
        If ``factor`` is smaller than 1, or if ``HR`` is not 2-D.
    """
    if factor < 1:
        raise ValueError(f"factor must be a positive integer, got {factor}")

    H, W = HR.shape
    # Crop to the largest extent that is evenly divisible into blocks
    H2, W2 = (H // factor) * factor, (W // factor) * factor
    HRc = HR[:H2, :W2]

    # Axes 1 and 3 enumerate the rows/columns inside each block
    blocks = np.ascontiguousarray(HRc).reshape(H2 // factor, factor, W2 // factor, factor)
    LR_small = np.nanmean(blocks, axis=(1, 3)).astype(np.float32)
    # Nearest-neighbour upsampling back to the cropped high-resolution grid
    LR_up = np.repeat(np.repeat(LR_small, factor, axis=0), factor, axis=1).astype(np.float32)
    return LR_up

# 8. Normalization helper
def minmax_stats(y: np.ndarray) -> tuple[float, float]:
    """Return ``(vmin, vmax)`` of ``y`` as Python floats.

    When the spread between the extremes is below 1e-12 (an effectively
    constant array), ``vmax`` is replaced by ``vmin + 1.0`` so that a later
    division by ``vmax - vmin`` cannot blow up.
    """
    lowest, highest = float(np.min(y)), float(np.max(y))
    degenerate = highest - lowest < 1e-12
    return (lowest, lowest + 1.0) if degenerate else (lowest, highest)

# 9. Normalizer
def to_zero_one(a: np.ndarray, vmin: float, vmax: float) -> np.ndarray:
    """Linearly rescale ``a`` so that ``vmin`` maps to 0 and ``vmax`` to 1.

    The result is returned as float32. Values outside ``[vmin, vmax]`` are
    not clipped and therefore land outside ``[0, 1]``.
    """
    span = vmax - vmin
    shifted = a - vmin
    scaled = shifted / span
    return scaled.astype(np.float32, copy=False)

### Main loop

In [3]:
# 10. Loop over all years and variables: load each field, clean it, update the
#     stored value ranges, build normalised (input, label) pairs and save them
for year in YEARS:
    nc_path = IN_DIR / f"{year}.nc"
    with netCDF4.Dataset(nc_path, "r") as ds:
        for variable in VARIABLES:
            var = ds.variables[variable]
            T, H, W = int(var.shape[0]), int(var.shape[1]), int(var.shape[2])

            # This specific variable has small negative noise, so we clamp it
            clamp_negative = variable == "mean_surface_downward_short_wave_radiation_flux"

            # Read one time step at a time, filling gaps with that step's mean
            HR = np.empty((T, 1, H, W), dtype=np.float32)
            for t in range(T):
                hr = fill_nan_mean(to_float_nan(var[t, :, :]))
                if clamp_negative:
                    hr = np.maximum(hr, 0.0).astype(np.float32, copy=False)
                HR[t, 0] = hr

            # Normalisation bounds come from this year's data only
            vmin, vmax = minmax_stats(HR)

            # 11. Turn current year's range into floats
            vmin_f, vmax_f = float(vmin), float(vmax)

            # 12./13. Register the variable on first sight; otherwise widen the
            #         stored range in place whenever this year exceeds it
            stored = RANGES.get(variable)
            if stored is None:
                RANGES[variable] = [vmin_f, vmax_f]
                changed = True
            else:
                widened = [min(stored[0], vmin_f), max(stored[1], vmax_f)]
                changed = widened[0] != stored[0] or widened[1] != stored[1]
                stored[:] = widened

            # 14. Create if doesn't exist or save if changes were made; the
            #     file is a small Python module that defines RANGES
            if changed or not RANGES_PATH.exists():
                entries = "".join(
                    f'    "{name}": [{RANGES[name][0]}, {RANGES[name][1]}],\n'
                    for name in sorted(RANGES)
                )
                RANGES_PATH.write_text("RANGES = {\n" + entries + "}\n", encoding="utf-8")

            # X holds the block-averaged (degraded) fields, Y the original ones,
            # both scaled with the same per-year bounds
            Y = np.empty_like(HR)
            X = np.empty_like(HR)
            for t in range(T):
                hr = HR[t, 0]
                lr_up = hr_to_lr_up(hr)
                Y[t, 0] = to_zero_one(hr, vmin, vmax)
                X[t, 0] = to_zero_one(lr_up, vmin, vmax)

            # One compressed archive per (year, variable), grouped by variable
            OUT_DIR_VAR = OUT_DIR / variable
            OUT_DIR_VAR.mkdir(exist_ok=True)
            out_path = OUT_DIR_VAR / f"{year}_{variable}.npz"
            np.savez_compressed(
                out_path,
                X=X,
                Y=Y,
                norm=NORM_NAME,
                vmin=np.float32(vmin),
                vmax=np.float32(vmax),
            )
            print(f"Saved {out_path.name} | X,Y={X.shape} | range = {NORM_NAME} [{vmin}, {vmax}]")

Saved 2001_2m_temperature.npz | X,Y=(365, 1, 180, 360) | range = zero_one [193.45382690429688, 318.35064697265625]
Saved 2001_10m_u_component_of_wind.npz | X,Y=(365, 1, 180, 360) | range = zero_one [-34.052146911621094, 30.364177703857422]
Saved 2001_10m_v_component_of_wind.npz | X,Y=(365, 1, 180, 360) | range = zero_one [-30.46213150024414, 31.31575584411621]
Saved 2001_mean_surface_downward_long_wave_radiation_flux.npz | X,Y=(365, 1, 180, 360) | range = zero_one [44.725563049316406, 488.61151123046875]
Saved 2001_mean_surface_downward_short_wave_radiation_flux.npz | X,Y=(365, 1, 180, 360) | range = zero_one [0.0, 485.0629577636719]
Saved 2001_specific_humidity.npz | X,Y=(365, 1, 180, 360) | range = zero_one [6.416160545086314e-07, 0.02964499592781067]
Saved 2001_surface_pressure.npz | X,Y=(365, 1, 180, 360) | range = zero_one [49037.703125, 106520.15625]
Saved 2001_total_precipitation.npz | X,Y=(365, 1, 180, 360) | range = zero_one [0.0, 0.02144767716526985]
Saved 2002_2m_temperature