# Glacier grids from SGI or GLAMOS:

Creates monthly grid files for the MBM to make PMB predictions over the whole glacier grid. The files come from the SGI grid and use OGGM topography. Computing takes a long time because of the conversion to monthly format.
## Setting up:

In [None]:
import os, sys
sys.path.append(os.path.join(os.getcwd(), '../../')) # Add root of repo to import MBM

import pandas as pd
import warnings
from tqdm.notebook import tqdm
import re
import massbalancemachine as mbm
import geopandas as gpd
import matplotlib.pyplot as plt
import geopandas as gpd
import geopandas as gpd
import rasterio 
import rioxarray
# scripts
from scripts.helpers import *
from scripts.glamos_preprocess import *
from scripts.plots import *
from scripts.geodata import *
from scripts.xgb_helpers import *
from scripts.config_CH import *

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# IPython magics: enable the autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2

# Switzerland-specific MBM configuration; the cells below read `dataPath`,
# `seed` and `metaData` from it.
cfg = mbm.SwitzerlandConfig()

In [None]:
# Seed with the value stored in the config, then call the CUDA cleanup helper
# (both come from the project scripts).
seed_all(cfg.seed)
free_up_cuda()

# Matplotlib style sheet used by every figure in this notebook.
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

# Names of the climate columns.
vois_climate = [
    't2m',
    'tp',
    'slhf',
    'sshf',
    'ssrd',
    'fal',
    'str',
    'u10',
    'v10',
]

# Names of the topographical columns.
voi_topographical = [
    "aspect", "slope", "hugonnet_dhdt", "consensus_ice_thickness",
    "millan_v", "topo"
]

In [None]:
# Entries of the GLAMOS DEM folder (one per glacier, in the lv95 sub-folder).
glaciers_glamos_dem = os.listdir(
    os.path.join(cfg.dataPath, path_GLAMOS_topo, 'lv95/'))

# Glacier outlines:
glacier_outline_sgi = gpd.read_file(
    os.path.join(cfg.dataPath, path_SGI_topo, 'inventory_sgi2016_r2020',
                 'SGI_2016_glaciers_copy.shp'))  # Load the shapefile
# NOTE(review): this path is built by plain string concatenation, so it
# presumably relies on cfg.dataPath ending with a separator — confirm.
glacier_outline_rgi = gpd.read_file(cfg.dataPath + path_rgi_outlines)

# Glacier areas.
# NOTE(review): the original comment said "sort glaciers by area"; whether
# get_gl_area sorts is not visible from this notebook.
gl_area = get_gl_area(cfg)
# Make the 'claridenL' entry also reachable under the name 'clariden'.
gl_area['clariden'] = gl_area['claridenL']

In [None]:
# Geodetic mass-balance table; it is grouped by 'glacier_name' below and
# provides the 'Astart' and 'Aend' columns.
geodetic_mb = get_geodetic_MB(cfg)

# get years per glacier
# Both dicts map glacier_name -> list of the unique values of the column.
years_start_per_gl = geodetic_mb.groupby(
    'glacier_name')['Astart'].unique().apply(list).to_dict()
years_end_per_gl = geodetic_mb.groupby('glacier_name')['Aend'].unique().apply(
    list).to_dict()

periods_per_glacier, geoMB_per_glacier = build_periods_per_glacier(geodetic_mb)
# Last expression of the cell: displays the entry for Silvretta.
periods_per_glacier['silvretta']

# GLAMOS grids:

GLAMOS did not use the SGI grids (from 2015) for its geodetic MB and gridded MB products, but its own yearly DEMs. These DEMs are not available for all years, but we still compute monthly grids for the glaciers and years that are available, in order to make the comparison with geodetic MB fairer.

In [None]:
# gdirs, rgidf = initialize_oggm_glacier_directories(
#     cfg,
#     rgi_region="11",
#     rgi_version="62",
#     base_url=
#     "https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L1-L2_files/2025.6/elev_bands_w_data/",
#     log_level='WARNING',
#     task_list=None,
# )
# df_missing = export_oggm_grids(cfg, gdirs)

## Example of one glacier:

In [None]:
# Walk through the full grid preparation for a single glacier and DEM year.
glacier_name = 'gietro'
sgi_id, rgi_id, rgi_shp = get_rgi_sgi_ids(cfg, glacier_name)

folder_path = os.path.join(cfg.dataPath, path_GLAMOS_topo, 'lv95',
                           glacier_name)

# Example file
fileName = 'gl_2023_lv95.grid'
metadata, grid_data = load_grid_file(folder_path + '/' + fileName)

# Convert to xarray
dem_y = convert_to_xarray_geodata(grid_data, metadata)

# Transform the coordinates to WGS84
dem_wgs84_y = transform_xarray_coords_lv95_to_wgs84(dem_y)

# Create a mask where 'elevation' is not NaN (1 if not NaN, 0 if NaN)
ds_gl = xr.Dataset({'dem': dem_wgs84_y})
ds_gl["glacier_mask"] = ds_gl["dem"].notnull().astype(np.uint8)

# Spacing between the first two grid coordinates along each axis.
# NOTE(review): the DEM was transformed to WGS84 just above, so these spacings
# are presumably in degrees rather than the "meters" printed below — confirm.
dx = abs(ds_gl.x[1] - ds_gl.x[0]).values
dy = abs(ds_gl.y[1] - ds_gl.y[0]).values
print(f"Cell size of GLAMOS DEM: {dx} x {dy} meters")

# Extract SGI topo and aspect over GLAMOS DEM
ds = xr_GLAMOS_masked_topo(cfg, sgi_id, ds_gl)

# Coarsen to 50 m resolution if needed
ds = coarsenDS(ds)
dx_m, dy_m = get_res_from_degrees(ds)
print(f"Coarsened ds resolution: {dx_m} x {dy_m} meters")

# Plot the masked data
# One panel per variable, side by side, without colorbars.
fig, axs = plt.subplots(1, 4, figsize=(15, 6))
ds.masked_aspect.plot(ax=axs[0], cmap='twilight_shifted', add_colorbar=False)
ds.masked_slope.plot(ax=axs[1], cmap='cividis', add_colorbar=False)
ds.masked_elev.plot(ax=axs[2], cmap='terrain', add_colorbar=False)
ds.glacier_mask.plot(ax=axs[3], cmap='binary', add_colorbar=False)

axs[0].set_title("Aspect")
axs[1].set_title("Slope")
axs[2].set_title("DEM")
axs[3].set_title("Glacier mask")
plt.tight_layout()

### GeoTIFFs of DEMs:

In [None]:
# Set RUN = True to export every GLAMOS `.grid` DEM as a GeoTIFF.
# The output folder is emptied first, so a run always starts from scratch.
RUN = False
if RUN:
    dem_root = os.path.join(cfg.dataPath, path_GLAMOS_topo, 'lv95')
    glaciers_glamos_dems = os.listdir(dem_root)

    path_out_tiff = os.path.join(cfg.dataPath,
                                 "GLAMOS/topo/GLAMOS_DEM/DEMs_geotiff_lv95/")
    os.makedirs(path_out_tiff, exist_ok=True)
    emptyfolder(path_out_tiff)

    # Regular expression to extract years from filenames; it is the same for
    # every glacier, so it is compiled once outside the loop.
    pattern = re.compile(r'gl_(\d{4})_lv95\.grid')

    for glacier_name in tqdm(glaciers_glamos_dems, desc="Processing glaciers"):

        # The DEM folder of 'sanktanna' is named 'stanna'.
        folder_path = os.path.join(
            dem_root,
            'stanna' if glacier_name == 'sanktanna' else glacier_name)

        # The DEM root may contain stray files; only folders hold grids.
        if not os.path.isdir(folder_path):
            continue

        # Extract available years from filenames
        years = sorted({
            int(match.group(1))
            for filename in os.listdir(folder_path)
            if (match := pattern.match(filename))
        })
        for year in years:
            file_name = f'gl_{year}_lv95.grid'
            file_path = os.path.join(folder_path, file_name)

            # Load grid file
            metadata, grid_data = load_grid_file(file_path)

            # Convert to xarray
            masked_dem = convert_to_xarray_geodata(grid_data, metadata)

            # --- Attach CRS and write GeoTIFF ---
            masked_dem = masked_dem.rio.write_crs("EPSG:2056", inplace=True)

            # One GeoTIFF per glacier and year
            out_tif = os.path.join(path_out_tiff, f"{glacier_name}_{year}.tif")
            masked_dem.rio.to_raster(
                out_tif,
                dtype="float32",
                compress="LZW",
                BIGTIFF="IF_SAFER",
                tiled=True,
                predictor=3,  # better compression for float rasters
            )

## Yearly masked grids - xarrays:
Save one .zarr xarray dataset per glacier and year (not in monthly format); these are needed by the MBM later.

In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

# ---- pull any non-picklable cfg bits NOW into plain strings/ints ----
# The worker functions below read these module-level constants instead of
# building the paths themselves.
DATA_ROOT = cfg.dataPath  # assume this is just a string
PATH_GLAMOS_TOPO = path_GLAMOS_topo  # e.g. "GLAMOS/topo/..."
# Folder holding the per-glacier, per-year sky-view-factor NetCDF files.
PATH_XR_SVF = os.path.join(DATA_ROOT, "GLAMOS/topo/GLAMOS_DEM",
                           "svf_nc_latlon")
# Output folder for the yearly masked grids (one .zarr per glacier and year).
PATH_XR_GRIDS = os.path.join(DATA_ROOT, PATH_GLAMOS_TOPO, "xr_masked_grids")


def _open_and_merge_svf(ds_latlon: xr.Dataset, glacier_name: str,
                        year: int) -> xr.Dataset:
    """Attach sky-view-factor (SVF) variables to a lat/lon glacier grid.

    The SVF file ``{glacier_name}_{year}_svf_latlon.nc`` is looked up under
    ``PATH_XR_SVF``. Of the variables ``svf``, ``asvf`` and ``opns``, those
    present in the file are added to ``ds_latlon``: merged directly when both
    grids share identical ``lon``/``lat`` values, otherwise linearly
    interpolated onto the ``ds_latlon`` grid. If ``ds_latlon`` has a
    ``glacier_mask`` variable, a ``masked_<name>`` copy (NaN where the mask is
    not 1) is added for each SVF variable.

    Args:
        ds_latlon: Grid dataset with ``lon`` and ``lat`` coordinates.
        glacier_name: Glacier name used in the SVF file name.
        year: Year used in the SVF file name.

    Returns:
        The enriched dataset. ``ds_latlon`` is returned as passed in (after
        printing a message) when the SVF file is missing, has no ``lon``/``lat``
        coordinates, or contains none of the expected variables. When SVF
        variables are added, the result is sorted ascending along ``lon`` and
        ``lat``.
    """
    svf_path = os.path.join(PATH_XR_SVF,
                            f"{glacier_name}_{year}_svf_latlon.nc")
    if not os.path.exists(svf_path):
        print(f"SVF not found: {svf_path}")
        return ds_latlon

    with xr.open_dataset(svf_path, decode_cf=True) as ds_svf_raw:
        ds_svf = ds_svf_raw

        # normalize coordinate names
        ren = {}
        if "longitude" in ds_svf.coords and "lon" not in ds_svf.coords:
            ren["longitude"] = "lon"
        if "latitude" in ds_svf.coords and "lat" not in ds_svf.coords:
            ren["latitude"] = "lat"
        if ren:
            ds_svf = ds_svf.rename(ren)

        if not ({"lon", "lat"} <= set(ds_svf.coords)):
            print(f"SVF lacks lon/lat: {svf_path}")
            return ds_latlon

        # longitude range normalization (0–360 -> -180–180) if needed
        if float(ds_svf.lon.max()) > 180 and float(ds_latlon.lon.min()) < 0:
            ds_svf = ds_svf.assign_coords(lon=((ds_svf.lon + 180) % 360) - 180)

        # sort ascending for interp stability
        if ds_svf.lon[0] > ds_svf.lon[-1]:
            ds_svf = ds_svf.sortby("lon")
        if ds_svf.lat[0] > ds_svf.lat[-1]:
            ds_svf = ds_svf.sortby("lat")
        if ds_latlon.lon[0] > ds_latlon.lon[-1]:
            ds_latlon = ds_latlon.sortby("lon")
        if ds_latlon.lat[0] > ds_latlon.lat[-1]:
            ds_latlon = ds_latlon.sortby("lat")

        svf_vars = [
            v for v in ["svf", "asvf", "opns"] if v in ds_svf.data_vars
        ]
        if not svf_vars:
            print(f"No SVF vars in {svf_path}")
            return ds_latlon

        # open_dataset reads lazily. Load the selected variables into memory
        # now, so that nothing returned from this function still depends on
        # the file that the `with` block closes on exit.
        svf_data = ds_svf[svf_vars].load()

        same_lon = np.array_equal(ds_latlon.lon.values, svf_data.lon.values)
        same_lat = np.array_equal(ds_latlon.lat.values, svf_data.lat.values)

        if same_lon and same_lat:
            merged = xr.merge([ds_latlon, svf_data])
        else:
            svf_on_grid = svf_data.interp(lon=ds_latlon.lon,
                                          lat=ds_latlon.lat,
                                          method="linear")
            for v in svf_vars:
                svf_on_grid[v] = svf_on_grid[v].astype("float32")
            merged = ds_latlon.assign(**{v: svf_on_grid[v] for v in svf_vars})

        # add masked versions
        if "glacier_mask" in merged:
            gmask = xr.where(merged["glacier_mask"] == 1, 1.0,
                             np.nan).astype("float32")
            for v in svf_vars:
                merged[f"masked_{v}"] = (gmask * merged[v]).astype("float32")
        return merged


def process_glacier_year(glacier_name: str,
                         year: int,
                         make_plot: bool = True) -> str:
    """Build and save the yearly masked grid of one (glacier, year) pair.

    Loads the GLAMOS ``gl_{year}_lv95.grid`` DEM, converts it to a WGS84
    dataset with a ``glacier_mask``, enriches it through
    ``xr_GLAMOS_masked_topo`` and ``_open_and_merge_svf``, and writes the
    result to ``PATH_XR_GRIDS/{glacier_name}_{year}.zarr`` (overwriting any
    existing store).

    Args:
        glacier_name: Glacier name; ``'sanktanna'`` is read from the
            ``'stanna'`` DEM folder.
        year: DEM year; years before 1951 are skipped.
        make_plot: Accepted for call compatibility; not used by this function.

    Returns:
        A status string starting with ``"OK "``, ``"SKIP "`` or ``"ERROR "``.
        The function never raises: any exception is reported as an ``ERROR``
        string that includes the exception type and message.
    """
    try:
        # resolve SGI/RGI (the RGI id itself is not needed here)
        sgi_id, _, rgi_shp = get_rgi_sgi_ids(cfg, glacier_name)
        if not sgi_id or not rgi_shp:
            return f"SKIP {glacier_name} {year}: missing SGI or shapefile."

        # folder with .grid files
        folder_path = os.path.join(
            DATA_ROOT, PATH_GLAMOS_TOPO, 'lv95',
            'stanna' if glacier_name == 'sanktanna' else glacier_name)
        if not os.path.exists(folder_path):
            return f"SKIP {glacier_name} {year}: folder missing."

        if year < 1951:
            return f"SKIP {glacier_name} {year}: <1951."

        file_name = f'gl_{year}_lv95.grid'
        file_path = os.path.join(folder_path, file_name)
        if not os.path.exists(file_path):
            return f"SKIP {glacier_name} {year}: grid file missing."

        # load grid → xarray → lat/lon
        metadata, grid_data = load_grid_file(file_path)
        dem_y = convert_to_xarray_geodata(grid_data, metadata)
        dem_wgs84_y = transform_xarray_coords_lv95_to_wgs84(dem_y)

        # mask is 1 where the DEM has a value, 0 where it is NaN
        ds_gl = xr.Dataset({'dem': dem_wgs84_y})
        ds_gl["glacier_mask"] = ds_gl["dem"].notnull().astype(np.uint8)

        # your topo enrichment (aspect/slope/etc) in WGS84
        ds_latlon = xr_GLAMOS_masked_topo(cfg, sgi_id, ds_gl)

        # # resolution and optional coarsening
        # dx_m, dy_m = get_res_from_degrees(ds_latlon)
        # if dx_m > 20:
        #     ds_latlon = coarsenDS(ds_latlon, target_res_m=50)

        # merge SVF
        ds_latlon = _open_and_merge_svf(ds_latlon, glacier_name, year)

        # save zarr
        save_path = os.path.join(PATH_XR_GRIDS, f"{glacier_name}_{year}.zarr")
        ds_latlon.to_zarr(save_path, mode="w", consolidated=True)
        return f"OK {glacier_name} {year} → {save_path}"
    except Exception as e:
        # Worker boundary: report the failure instead of raising. The type
        # name is included because some messages are meaningless without it
        # (e.g. a KeyError prints only the quoted key).
        return f"ERROR {glacier_name} {year}: {type(e).__name__}: {e}"

In [None]:
# Set RUN = True to rebuild every yearly masked grid in parallel.
RUN = False
if RUN:
    # start from an empty output folder
    emptyfolder(PATH_XR_GRIDS)

    # ---- Collect the (glacier, year) pairs that have a DEM grid file ----
    glaciers_root = os.path.join(DATA_ROOT, PATH_GLAMOS_TOPO, 'lv95')
    glacier_names = os.listdir(glaciers_root)

    pattern = re.compile(r'gl_(\d{4})_lv95\.grid')

    tasks = []
    for glacier_name in glacier_names:
        # the DEM folder of 'sanktanna' is named 'stanna'
        if glacier_name == 'sanktanna':
            folder_path = os.path.join(glaciers_root, 'stanna')
        else:
            folder_path = os.path.join(glaciers_root, glacier_name)
        if os.path.isdir(folder_path):
            years = sorted({
                int(m.group(1))
                for fn in os.listdir(folder_path) if (m := pattern.match(fn))
            })
            # years before 1951 are not processed
            tasks.extend((glacier_name, year) for year in years
                         if year >= 1951)

    print(f"Submitting {len(tasks)} tasks...")

    # ---- Fan out over all cores but one (at least one worker) ----
    max_workers = max(1, (os.cpu_count() or 4) - 1)
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        futs = [ex.submit(process_glacier_year, *task, True) for task in tasks]
        # report each status string as soon as its task finishes
        for fut in as_completed(futs):
            res = fut.result()
            results.append(res)
            print(res)

    # ---- Summary by status prefix ----
    n_ok, n_err, n_skip = (sum(r.startswith(prefix) for r in results)
                           for prefix in ("OK ", "ERROR ", "SKIP "))
    print(f"Done. OK={n_ok}, SKIP={n_skip}, ERROR={n_err}")

In [None]:
# Plot the masked data
# Opens the yearly grid store named 'aletsch_2016.zarr' in PATH_XR_GRIDS.
ds = xr.open_zarr(os.path.join(PATH_XR_GRIDS, 'aletsch_2016.zarr'))

# One row of five panels: aspect, slope, elevation, sky-view factor and mask.
fig, axs = plt.subplots(1, 5, figsize=(20, 6))
ds.masked_aspect.plot(ax=axs[0], cmap='twilight_shifted', add_colorbar=True)
ds.masked_slope.plot(ax=axs[1], cmap='cividis', add_colorbar=True)
ds.masked_elev.plot(ax=axs[2], cmap='terrain', add_colorbar=True)
ds.svf.plot(ax=axs[3], cmap='binary', add_colorbar=False)
ds.glacier_mask.plot(ax=axs[4], cmap='binary', add_colorbar=False)

axs[0].set_title("Aspect")
axs[1].set_title("Slope")
axs[2].set_title("DEM")
axs[3].set_title("Skyview factor")
axs[4].set_title("Glacier mask")
plt.tight_layout()
fig.show()

In [None]:
# Plot the masked data
# Opens the yearly grid store named 'gietro_2016.zarr' in PATH_XR_GRIDS.
ds = xr.open_zarr(os.path.join(PATH_XR_GRIDS, 'gietro_2016.zarr'))

# 2 x 3 layout; only five panels are filled, axs[1, 2] stays empty.
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
ds.masked_aspect.plot(ax=axs[0, 0], cmap='twilight_shifted', add_colorbar=True)
ds.masked_slope.plot(ax=axs[0, 1], cmap='cividis', add_colorbar=True)
ds.masked_elev.plot(ax=axs[0, 2], cmap='terrain', add_colorbar=True)
ds.svf.plot(ax=axs[1, 0], cmap='binary', add_colorbar=False)
ds.glacier_mask.plot(ax=axs[1, 1], cmap='binary', add_colorbar=False)

axs[0, 0].set_title("Aspect")
axs[0, 1].set_title("Slope")
axs[0, 2].set_title("DEM")
axs[1, 0].set_title("Skyview factor")
axs[1, 1].set_title("Glacier mask")
plt.tight_layout()
fig.show()

## Monthly masked grids - dataframes:

In [None]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

# ------------- plain, picklable values handed to the worker tasks -------------
DATA_ROOT = cfg.dataPath

# Input: yearly masked grids (.zarr). Output root: monthly parquet files.
PATH_XR_GRIDS = os.path.join(DATA_ROOT, path_GLAMOS_topo, 'xr_masked_grids')
OUT_FOLDER_ROOT = os.path.join(DATA_ROOT, path_glacier_grid_glamos)

# ERA5 inputs and the potential clear-sky radiation store.
ERA5_MONTHLY = os.path.join(DATA_ROOT, path_ERA5_raw,
                            'era5_monthly_averaged_data.nc')
ERA5_GEOPOT = os.path.join(DATA_ROOT, path_ERA5_raw,
                           'era5_geopotential_pressure.nc')
PCSR_ZARR = os.path.join(DATA_ROOT, path_pcsr, 'zarr/')

# RGI outlines file; each worker process loads it once in its initializer.
RGI_OUTLINES_PATH = DATA_ROOT + path_rgi_outlines

# Climate and topographical variable names used for the monthly conversion.
VOIS_CLIMATE = [
    't2m',
    'tp',
    'slhf',
    'sshf',
    'ssrd',
    'fal',
    'str',
    'u10',
    'v10',
]
VOIS_TOPO = [
    "aspect",
    "slope",
    "hugonnet_dhdt",
    "consensus_ice_thickness",
    "millan_v",
    "topo",
    "svf",
]

# Glaciers excluded from processing, and the switch that restricts the years
# to each glacier's geodetic period.
TOO_SMALL = {'vorab', 'blauschnee', 'joeri'}
ONLY_GEODETIC_YEARS = False

# Metadata column names copied out of the config, and the OGGM data folder.
META_COLS = [*cfg.metaData]
OGGM_PATH = os.path.join(DATA_ROOT, path_OGGM)

# ============== per-process state and its initializer ==============

# Filled by _init_worker; holds the resources shared by all tasks of a process.
_GLOBALS = {}


def _init_worker(outlines_path):
    """Reset ``_GLOBALS`` and load the RGI outlines for this process.

    ``_GLOBALS['RGI_OUTLINES']`` is set to the table read from
    ``outlines_path``, or to ``None`` when the path is empty or does not exist.
    """
    global _GLOBALS
    _GLOBALS = state = {}

    if not outlines_path or not os.path.exists(outlines_path):
        state['RGI_OUTLINES'] = None
        return

    # imported here, inside the function, as in the original
    import geopandas as gpd
    state['RGI_OUTLINES'] = gpd.read_file(outlines_path)


# ============== worker function ==============


def _process_glacier_year(glacier_name: str, year: int, *, data_root: str,
                          path_xr_grids: str, out_folder_root: str,
                          vois_climate: list, vois_topo: list, meta_cols: list,
                          era5_monthly_path: str, era5_geopot_path: str,
                          pcsr_zarr_root: str, oggm_path: str) -> str:
    """Convert one yearly glacier grid to monthly format and save it as parquet.

    Steps: open ``{glacier_name}_{year}.zarr``, build the glacier grid
    dataframe, add climate features and potential clear-sky radiation, attach
    RGI ids from the outlines loaded by ``_init_worker``, add OGGM features,
    convert to monthly format, and write
    ``{out_folder_root}/{glacier_name}/{glacier_name}_grid_{year}.parquet``.

    Besides its arguments, the function reads the module-level ``cfg``,
    ``TOO_SMALL`` and ``_GLOBALS``.

    Args:
        glacier_name: Glacier to process; names in ``TOO_SMALL`` are skipped.
        year: Year of the yearly grid.
        data_root: Root folder that contains ``path_PMB_GLAMOS_csv``.
        path_xr_grids: Folder containing the yearly ``.zarr`` grids.
        out_folder_root: Root folder of the parquet output.
        vois_climate: Climate variable names.
        vois_topo: Topographical variable names.
        meta_cols: Metadata column names passed to the monthly conversion.
        era5_monthly_path: Path of the ERA5 monthly climate file.
        era5_geopot_path: Path of the ERA5 geopotential file.
        pcsr_zarr_root: Folder of the potential clear-sky radiation data.
        oggm_path: Folder of the OGGM data.

    Returns:
        A status string starting with ``"OK "``, ``"SKIP "`` or ``"ERROR "``.
        The function never raises: any exception is reported as an ``ERROR``
        string that includes the exception type and message.
    """
    try:
        if glacier_name in TOO_SMALL:
            return f"SKIP {glacier_name} {year}: too small"

        # find the Zarr file
        file_name = f"{glacier_name}_{year}.zarr"
        zarr_path = os.path.join(path_xr_grids, file_name)
        if not os.path.exists(zarr_path):
            return f"SKIP {glacier_name} {year}: zarr not found"

        # open masked grid (WGS84 lat/lon)
        ds = xr.open_zarr(zarr_path)

        # resolve IDs
        sgi_id, rgi_id, rgi_shp = get_rgi_sgi_ids(
            cfg, glacier_name)  # uses global cfg; OK if it’s lightweight
        if not sgi_id or not rgi_id or not rgi_shp:
            return f"SKIP {glacier_name} {year}: missing SGI/RGI"

        # create glacier grid (user function)
        df_grid = create_glacier_grid_SGI(glacier_name, year, rgi_id, ds)
        df_grid = df_grid.reset_index(drop=True)

        # construct the Dataset wrapper
        dataset_grid = mbm.data_processing.Dataset(
            cfg=
            cfg,  # if cfg is heavy/unpicklable, pass a lightweight surrogate instead
            data=df_grid,
            region_name='CH',
            region_id=11,
            data_path=os.path.join(data_root, path_PMB_GLAMOS_csv))

        # climate
        dataset_grid.get_climate_features(climate_data=era5_monthly_path,
                                          geopotential_data=era5_geopot_path,
                                          change_units=True,
                                          smoothing_vois={
                                              'vois_climate': vois_climate,
                                              'vois_other':
                                              ['ALTITUDE_CLIMATE']
                                          })

        # potential clear-sky radiation
        dataset_grid.get_potential_rad(pcsr_zarr_root)

        # get data and attach RGI ids via outlines (loaded once per worker);
        # the existing 'RGIId' column is kept under the name 'RGIId_old'
        df_y_gl = dataset_grid.data
        df_y_gl.rename(columns={'RGIId': 'RGIId_old'}, inplace=True)

        outlines = _GLOBALS.get('RGI_OUTLINES', None)
        if outlines is None:
            return f"SKIP {glacier_name} {year}: outlines not loaded in worker"
        df_y_gl = mbm.data_processing.utils.get_rgi(data=df_y_gl,
                                                    glacier_outlines=outlines)

        # drop rows without RGI
        df_y_gl = df_y_gl.dropna(subset=['RGIId'])

        # OGGM features
        df_y_gl = add_OGGM_features(
            df_y_gl, ["hugonnet_dhdt", "consensus_ice_thickness", "millan_v"],
            oggm_path)

        # GLWD_ID: hash of "<GLACIER>_<YEAR>" for every row
        df_y_gl['GLWD_ID'] = df_y_gl.apply(lambda x: mbm.data_processing.utils.
                                           get_hash(f"{x.GLACIER}_{x.YEAR}"),
                                           axis=1).astype(str)

        # wrap again for monthly conversion
        dataset_grid_oggm = mbm.data_processing.Dataset(
            cfg=cfg,
            data=df_y_gl,
            region_name='CH',
            region_id=11,
            data_path=os.path.join(data_root, path_PMB_GLAMOS_csv))

        dataset_grid_oggm.convert_to_monthly(meta_data_columns=meta_cols,
                                             vois_climate=vois_climate +
                                             ['pcsr'],
                                             vois_topographical=vois_topo)

        df_oggm = dataset_grid_oggm.data

        expected_months = [
            "oct", "nov", "dec", "jan", "feb", "mar", "apr", "may", "jun",
            "jul", "aug", "sep"
        ]

        # Normalize months to lowercase (in case of mixed case)
        months_present = df_oggm["MONTHS"].astype(
            str).str.lower().unique().tolist()
        missing = [m for m in expected_months if m not in months_present]

        # Only a warning: the file is still written. The glacier and year are
        # named because several tasks print at the same time.
        if missing:
            print(f"❌ Missing months for {glacier_name} {year}: {missing}")

        # sanity
        if 'svf' not in df_oggm.columns:
            return f"ERROR {glacier_name} {year}: 'svf' missing after conversion"
        if 'pcsr' not in df_oggm.columns:
            return f"ERROR {glacier_name} {year}: 'pcsr' missing after conversion"

        # rename columns
        df_oggm.rename(columns={
            'aspect': 'aspect_sgi',
            'slope': 'slope_sgi'
        },
                       inplace=True)
        if 'POINT_ELEVATION' not in df_oggm.columns:
            return f"ERROR {glacier_name} {year}: 'POINT_ELEVATION' missing"

        # write parquet
        out_folder = os.path.join(out_folder_root, glacier_name)
        os.makedirs(out_folder, exist_ok=True)
        out_path = os.path.join(out_folder,
                                f"{glacier_name}_grid_{year}.parquet")
        df_oggm.to_parquet(out_path, engine="pyarrow", compression="snappy")
        return f"OK {glacier_name} {year} -> {out_path}"
    except Exception as e:
        # Worker boundary: report the failure instead of raising. The type
        # name is included because some messages are meaningless without it
        # (e.g. a KeyError prints only the quoted key).
        return f"ERROR {glacier_name} {year}: {type(e).__name__}: {e}"

In [None]:
# Set RUN = False to skip the (long) monthly conversion.
RUN = True
# Emptying the output folder discards every parquet file written earlier, so
# the "skip existing years" logic below only has an effect when this is False.
CLEAN_OUTPUT = True
if RUN:
    # ============== build task list (glacier, year) ==============

    # ensure output root exists and (optionally) clean it
    os.makedirs(OUT_FOLDER_ROOT, exist_ok=True)
    if CLEAN_OUTPUT:
        emptyfolder(OUT_FOLDER_ROOT)

    # glaciers to consider
    all_glaciers = [g for g in years_start_per_gl if g not in TOO_SMALL]

    # map each glacier to missing years
    tasks = []
    pattern = re.compile(r'_(\d{4})\.zarr$')
    parquet_pattern = re.compile(r'_grid_(\d{4})\.parquet$')

    # The grid folder is listed once, not once per glacier.
    all_zarr_files = sorted(f for f in os.listdir(PATH_XR_GRIDS)
                            if f.endswith(".zarr"))

    for glacier_name in all_glaciers:
        # zarr files present for this glacier
        zarr_files = [
            f for f in all_zarr_files if f.startswith(f"{glacier_name}_")
        ]
        if not zarr_files:
            print(f"No GLAMOS DEM for {glacier_name}, skipping.")
            continue

        # parquet folder
        out_folder = os.path.join(OUT_FOLDER_ROOT, glacier_name)
        os.makedirs(out_folder, exist_ok=True)

        # years that already have a parquet file
        existing = {
            int(m.group(1))
            for f in os.listdir(out_folder) if (m := parquet_pattern.search(f))
        }

        # geodetic period: first listed start year to last listed end year
        if glacier_name not in years_start_per_gl or glacier_name not in years_end_per_gl:
            print(f"Skipping {glacier_name}: missing start/end years")
            continue
        geodetic_start = years_start_per_gl[glacier_name][0]
        geodetic_end = years_end_per_gl[glacier_name][-1]

        # choose years
        for f in zarr_files:
            m = pattern.search(f)
            if not m:
                continue
            year = int(m.group(1))
            if year < 1951 or year in existing:
                continue
            if ONLY_GEODETIC_YEARS and not (geodetic_start <= year <=
                                            geodetic_end):
                continue
            tasks.append((glacier_name, year))

    print(f"Submitting {len(tasks)} tasks…")

    # ============== run in parallel ==============

    # all cores but one, at least one worker
    max_workers = max(1, (os.cpu_count() or 4) - 1)
    results = []
    n_ok = n_err = n_skip = 0

    # _init_worker loads the RGI outlines once per worker process.
    with ProcessPoolExecutor(max_workers=max_workers,
                             initializer=_init_worker,
                             initargs=(RGI_OUTLINES_PATH, )) as ex:
        futs = [
            ex.submit(
                _process_glacier_year,
                glacier_name,
                year,
                data_root=DATA_ROOT,
                path_xr_grids=PATH_XR_GRIDS,
                out_folder_root=OUT_FOLDER_ROOT,
                vois_climate=VOIS_CLIMATE,
                vois_topo=VOIS_TOPO,
                meta_cols=META_COLS,
                era5_monthly_path=ERA5_MONTHLY,
                era5_geopot_path=ERA5_GEOPOT,
                pcsr_zarr_root=PCSR_ZARR,
                oggm_path=OGGM_PATH,
            ) for glacier_name, year in tasks
        ]

        with tqdm(total=len(futs), desc="Processing gl-years",
                  unit="task") as pbar:
            for fut in as_completed(futs):
                res = fut.result()
                results.append(res)
                # tally by the status prefix of the returned string
                if res.startswith("OK "):
                    n_ok += 1
                elif res.startswith("ERROR "):
                    n_err += 1
                elif res.startswith("SKIP "):
                    n_skip += 1

                pbar.set_postfix(ok=n_ok, skip=n_skip, err=n_err)
                pbar.update(1)

    print(f"Done. OK={n_ok}, SKIP={n_skip}, ERROR={n_err}")

##### Example:

In [None]:
# Load the stake measurements
stake_file = os.path.join(cfg.dataPath, path_PMB_GLAMOS_csv,
                          "CH_wgms_dataset_all.csv")
df_stakes = pd.read_csv(stake_file)

# Load GLAMOS masked grid
glacier_name = 'aletsch'
year = 2016

month = 'sep'  # Example month, adjust as needed

folder_path = os.path.join(cfg.dataPath, path_glacier_grid_glamos,
                           glacier_name)
# load the monthly grid and keep only the rows of the chosen month
df = pd.read_parquet(
    os.path.join(folder_path, f"{glacier_name}_grid_{year}.parquet"))
df = df[df.MONTHS == month]

stake_locs = df_stakes[df_stakes.GLACIER == glacier_name]

# Variables of interest (one panel each)
voi = [
    't2m', 'tp', 'ELEVATION_DIFFERENCE', 'hugonnet_dhdt',
    'consensus_ice_thickness', 'millan_v', 'aspect_sgi', 'slope_sgi', 'pcsr',
    'svf'
]
fig, axs = plt.subplots(3, 4, figsize=(15, 10))
axs = axs.flatten()
for i, var in enumerate(voi):
    sns.scatterplot(df,
                    x='POINT_LON',
                    y='POINT_LAT',
                    hue=var,
                    s=5,
                    alpha=0.5,
                    palette='twilight_shifted',
                    ax=axs[i])
    axs[i].set_title(var)

    # scatter stake location
    sns.scatterplot(stake_locs,
                    x='POINT_LON',
                    y='POINT_LAT',
                    color='red',
                    s=10,
                    alpha=0.5,
                    ax=axs[i])

# The 3 x 4 layout has more panels than variables; hide the unused ones
# instead of drawing empty frames.
for ax in axs[len(voi):]:
    ax.set_axis_off()

plt.tight_layout()

In [None]:
# Load the stake measurements
stake_file = os.path.join(cfg.dataPath, path_PMB_GLAMOS_csv,
                          "CH_wgms_dataset_all.csv")
df_stakes = pd.read_csv(stake_file)

# Load GLAMOS masked grid
glacier_name = 'gietro'
year = 2016

month = 'sep'  # Example month, adjust as needed

folder_path = os.path.join(cfg.dataPath, path_glacier_grid_glamos,
                           glacier_name)
# load the monthly grid and keep only the rows of the chosen month
df = pd.read_parquet(
    os.path.join(folder_path, f"{glacier_name}_grid_{year}.parquet"))
df = df[df.MONTHS == month]

stake_locs = df_stakes[df_stakes.GLACIER == glacier_name]

# Variables of interest (one panel each)
voi = [
    't2m', 'tp', 'ELEVATION_DIFFERENCE', 'hugonnet_dhdt',
    'consensus_ice_thickness', 'millan_v', 'aspect_sgi', 'slope_sgi', 'pcsr',
    'svf'
]
fig, axs = plt.subplots(3, 4, figsize=(15, 10))
axs = axs.flatten()
for i, var in enumerate(voi):
    sns.scatterplot(df,
                    x='POINT_LON',
                    y='POINT_LAT',
                    hue=var,
                    s=5,
                    alpha=0.5,
                    palette='twilight_shifted',
                    ax=axs[i])
    axs[i].set_title(var)

    # scatter stake location
    sns.scatterplot(stake_locs,
                    x='POINT_LON',
                    y='POINT_LAT',
                    color='red',
                    s=10,
                    alpha=0.5,
                    ax=axs[i])

# The 3 x 4 layout has more panels than variables; hide the unused ones
# instead of drawing empty frames.
for ax in axs[len(voi):]:
    ax.set_axis_off()

plt.tight_layout()

In [None]:
# Second element of the returned tuple is the RGI id (see the unpacking
# `sgi_id, rgi_id, rgi_shp = get_rgi_sgi_ids(...)` used elsewhere).
rgi_id = get_rgi_sgi_ids(cfg, 'gietro')[1]
# NOTE(review): built by string concatenation, so this presumably relies on
# cfg.dataPath and path_OGGM both ending with a path separator — confirm.
path_to_data = cfg.dataPath + path_OGGM + "xr_grids/"
file_path = f"{path_to_data}{rgi_id}.zarr"
# NOTE(review): the other cells open .zarr stores with xr.open_zarr; here the
# engine is left to xr.open_dataset to pick — confirm this is intended.
ds_oggm = xr.open_dataset(file_path)

In [None]:
# Quick look at the `millan_v` variable of the OGGM grid.
ds_oggm.millan_v.plot()

In [None]:
# Quick look at the `hugonnet_dhdt` variable of the OGGM grid.
ds_oggm.hugonnet_dhdt.plot()

In [None]:
# `millan_v` at every point of `df`, plotted by longitude and latitude.
sns.scatterplot(
    df,
    x='POINT_LON',
    y='POINT_LAT',
    hue='millan_v',
    s=5,
    alpha=0.5,
    palette='twilight_shifted',
)

In [None]:
# `svf` at every point of `df`, plotted by longitude and latitude.
sns.scatterplot(
    df,
    x='POINT_LON',
    y='POINT_LAT',
    hue='svf',
    s=5,
    alpha=0.5,
    palette='twilight_shifted',
)