# Glacier grids from RGI:

Creates monthly grid files that the MassBalanceMachine (MBM) uses to make point mass balance (PMB) predictions over the whole glacier grid. The files are built from the RGI grid with OGGM topography. Computing them takes a long time because every grid has to be converted to monthly format.
## Setting up:

In [None]:
import sys, os

sys.path.append(os.path.join(os.getcwd(),
                             '../../'))  # Add root of repo to import MBM
import csv
from functools import partial

import pandas as pd
import warnings
from tqdm.notebook import tqdm
import re
import matplotlib.pyplot as plt
import seaborn as sns
from cmcrameri import cm
import xarray as xr
import massbalancemachine as mbm
from collections import defaultdict
import logging
from skorch.helper import SliceDataset
from datetime import datetime
from skorch.callbacks import EarlyStopping, LRScheduler, Checkpoint
import itertools
import random
import pickle
from collections import Counter
import ast
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
from scripts.helpers import *
from scripts.glamos_preprocess import *
from scripts.plots import *
from scripts.config_CH import *
from scripts.nn_helpers import *
from scripts.xgb_helpers import *
from scripts.geodata import *
from scripts.NN_networks import *
from scripts.geodata_plots import *

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# IPython magics: enable the autoreload extension in mode 2, so that edited
# modules are reloaded without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Configuration object for the Swiss setup; the cells below read
# cfg.dataPath, cfg.seed and cfg.metaData from it.
cfg = mbm.SwitzerlandConfig()

In [None]:
# Seed first so that everything below runs with a fixed seed.
seed_all(cfg.seed)
print("Using seed:", cfg.seed)

# `torch` is imported explicitly: this cell calls `torch.cuda` / `torch.device`
# directly and should not depend on a wildcard import from `scripts.*`
# happening to leak the name.
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import (
    DataLoader,
    Dataset,
    Subset,
    SubsetRandomSampler,
    WeightedRandomSampler,
)

# Query CUDA once and reuse the answer for the message and the device choice.
cuda_available = torch.cuda.is_available()
if cuda_available:
    print("CUDA is available")
    free_up_cuda()
else:
    print("CUDA is NOT available")

# Device handed to the model builders and to torch.load further down.
device = torch.device("cuda" if cuda_available else "cpu")

In [None]:
# Re-seed at the top of the cell and call the project helper free_up_cuda().
seed_all(cfg.seed)
free_up_cuda()  # in case no memory

# Plot styles:
# Relative path, so it is resolved against the current working directory.
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

# Climate columns
# Passed below to get_climate_features (smoothing) and convert_to_monthly.
vois_climate = [
    't2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str', 'u10', 'v10'
]
# Topographical columns
# Only the entries that exist as columns of a glacier grid are used when the
# monthly files are built (see the `vois_topographical_sub` filter).
vois_topographical = [
    "aspect",
    "slope",
    "hugonnet_dhdt",
    "consensus_ice_thickness",
    "millan_v",
    "topo",
]

# RGI glacier outlines; drawn on the overview map near the end of this section.
glacier_outline_rgi = gpd.read_file(cfg.dataPath + path_rgi_outlines)


In [None]:
# Set up the OGGM glacier directories for RGI region 11, RGI version 6.
# The commented-out base_url is an alternative source that is kept for reference.
gdirs, rgidf = initialize_oggm_glacier_directories(
    cfg,
    rgi_region="11",
    rgi_version="6",
    # base_url=
    # "https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L3-L5_files/2023.1/elev_bands/W5E5_w_data/",
    base_url=
    "https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L1-L2_files/2025.6/elev_bands_w_data/",
    log_level='WARNING',
    task_list=None,
)

# Save the OGGM xarray grids for all needed glaciers (RGI region 11, version 6).
# The returned frame is used below through its `rgi_id` and `missing_vars`
# columns (`missing_vars` holds a list per glacier, see the explode step).
# NOTE(review): presumably only glaciers with at least one missing variable
# are listed — confirm in export_oggm_grids.
df_missing = export_oggm_grids(cfg, gdirs)

path_rgi = cfg.dataPath + 'GLAMOS/RGI/nsidc0770_11.rgi60.CentralEurope/11_rgi60_CentralEurope.shp'

# load RGI shapefile
gdf = gpd.read_file(path_rgi)
# reproject to a local equal-area projection (example: EPSG:3035 for Europe)
gdf_proj = gdf.to_crs(3035)
# Align the id column name with the one used in df_missing.
gdf_proj.rename(columns={"RGIId": "rgi_id"}, inplace=True)
# gdf_proj.set_index('rgi_id', inplace=True)
# Polygon areas in the projected CRS, then converted from m² to km².
gdf_proj["area_m2"] = gdf_proj.geometry.area
gdf_proj["area_km2"] = gdf_proj["area_m2"] / 1e6

# Attach the area to each glacier of df_missing. No `how=` is given, so the
# pandas default join applies; glaciers absent from the shapefile drop out.
df_missing = df_missing.merge(gdf_proj[['area_km2', 'rgi_id']], on="rgi_id")

# total glacier area
# Summed over every glacier of the shapefile, not only those in df_missing.
total_area = gdf_proj["area_km2"].sum()

# explode the list of missing vars into rows (one var per row)
df_exploded = df_missing.explode("missing_vars")

# 1) COUNT: number of glaciers missing each variable
counts_missing_per_var = (
    df_exploded.groupby("missing_vars")["rgi_id"].nunique().sort_values(
        ascending=False))

# 2) TOTAL % AREA with ANY missing var
# Uses the un-exploded frame so that each glacier's area is counted once.
total_missing_area_km2 = df_missing["area_km2"].sum()
total_missing_area_pct = (total_missing_area_km2 / total_area) * 100

print(f"Total glacier area with ANY missing variable: "
      f"{total_missing_area_km2:,.2f} km² "
      f"({total_missing_area_pct:.2f}%)")

# Optional: also show % area per variable (kept from your earlier logic)
# A glacier missing several variables contributes its area to each of them,
# so these percentages can add up to more than the total above.
area_missing_per_var = (
    df_exploded.groupby("missing_vars")["area_km2"].sum().sort_values(
        ascending=False))
perc_missing_per_var = (area_missing_per_var / total_area) * 100

print("\n% of total glacier area missing per variable:")
for var, pct in perc_missing_per_var.items():
    print(f"  - {var}: {pct:.2f}%")

# ---- barplot: number of glaciers missing each variable ----
plt.figure(figsize=(7, 4))
plt.bar(counts_missing_per_var.index, counts_missing_per_var.values)
plt.xlabel("Missing variable")
plt.ylabel("Number of glaciers")
plt.title("Count of glaciers missing each variable")
plt.tight_layout()
plt.show()

In [None]:
# RGI ids:
# Load the glacier-id table, strip stray whitespace from the header names,
# then sort by and index on the glacier short name.
rgi_df = (
    pd.read_csv(cfg.dataPath + path_glacier_ids, sep=',')
    .rename(columns=lambda col: col.strip())
    .sort_values(by='short_name')
    .set_index('short_name')
)
# Display one entry as a quick sanity check.
rgi_df.loc['rhone']

## Create RGI grids for all glaciers:

In [None]:
# # Open an example
# # rgi_gl = gdirs[0].rgi_id
# rgi_gl = 'RGI60-11.01238'

# ds = xr.open_dataset(path_RGIs + rgi_gl + '.zarr')
# glacier_mask = np.where(ds['glacier_mask'].values == 0, np.nan,
#                         ds['glacier_mask'].values)

# # Create glacier mask
# ds = ds.assign(masked_slope=glacier_mask * ds['slope'])
# ds = ds.assign(masked_elev=glacier_mask * ds['topo'])
# ds = ds.assign(masked_aspect=glacier_mask * ds['aspect'])
# ds = ds.assign(masked_dis=glacier_mask * ds['dis_from_border'])

# # Assign other variables only if available
# if 'hugonnet_dhdt' in ds:
#     ds = ds.assign(masked_hug=glacier_mask * ds['hugonnet_dhdt'])
# if 'consensus_ice_thickness' in ds:
#     ds = ds.assign(masked_cit=glacier_mask * ds['consensus_ice_thickness'])
# if 'millan_v' in ds:
#     ds = ds.assign(masked_miv=glacier_mask * ds['millan_v'])

# glacier_indices = np.where(ds['glacier_mask'].values == 1)

# fig, axs = plt.subplots(1, 4, figsize=(16, 8), sharey=True)

# ds.masked_aspect.plot(ax=axs[0], cmap='twilight_shifted', add_colorbar=False)
# ds.masked_slope.plot(ax=axs[1], cmap='cividis', add_colorbar=False)
# ds.masked_elev.plot(ax=axs[2], cmap='terrain', add_colorbar=False)
# ds.glacier_mask.plot(ax=axs[3], cmap='binary', add_colorbar=False)

# axs[0].set_title("Aspect OGGM")
# axs[1].set_title("Slope OGGM")
# axs[2].set_title("DEM OGGM")
# axs[3].set_title("Glacier mask OGGM")

### Create masked xarray grids:

In [None]:
def create_masked_glacier(path_RGIs, rgi_gl):
    """Open one glacier's grid and add glacier-masked copies of its fields.

    Args:
        path_RGIs: Folder prefix; it is joined to ``rgi_gl`` and ``'.zarr'``
            by plain string concatenation to form the store path.
        rgi_gl: Glacier identifier, used as the base name of the store and
            in the error message.

    Returns:
        A ``(ds, glacier_indices)`` tuple: the dataset extended with the
        ``masked_*`` variables, and the ``np.where`` result for the cells
        whose ``glacier_mask`` equals 1.

    Raises:
        ValueError: If the dataset has no ``glacier_mask`` variable.
    """
    ds = xr.open_dataset(path_RGIs + rgi_gl + '.zarr')

    if 'glacier_mask' not in ds:
        raise ValueError(
            f"'glacier_mask' variable not found in dataset {rgi_gl}")

    mask_values = ds['glacier_mask'].values
    # Cells with mask value 0 become NaN, every other cell keeps its mask
    # value; multiplying a field by this array blanks out the zero cells.
    nan_outside = np.where(mask_values == 0, np.nan, mask_values)

    # Fields that are accessed unconditionally.
    core_fields = (
        ('masked_slope', 'slope'),
        ('masked_elev', 'topo'),
        ('masked_aspect', 'aspect'),
        ('masked_dis', 'dis_from_border'),
    )
    for masked_name, source_name in core_fields:
        ds = ds.assign(**{masked_name: nan_outside * ds[source_name]})

    # Fields that are masked only when the dataset provides them.
    optional_fields = (
        ('masked_hug', 'hugonnet_dhdt'),
        ('masked_cit', 'consensus_ice_thickness'),
        ('masked_miv', 'millan_v'),
    )
    for masked_name, source_name in optional_fields:
        if source_name in ds:
            ds = ds.assign(**{masked_name: nan_outside * ds[source_name]})

    # Positions of the cells whose mask value is exactly 1.
    glacier_indices = np.where(mask_values == 1)
    return ds, glacier_indices

In [None]:
# Output folder for the masked lat/lon grids, and input folder with the OGGM grids.
path_xr_grids = os.path.join(cfg.dataPath, 'GLAMOS/topo/RGI_v6_11/',
                             'xr_masked_grids/')
path_RGIs = cfg.dataPath + path_OGGM + 'xr_grids/'
glaciers = os.listdir(path_RGIs)

print(f"Found {len(glaciers)} glaciers in RGI region 11.6")

# Set RUN to True to rebuild every masked grid; the output folder is passed
# to emptyfolder() first.
RUN = False
if RUN:
    emptyfolder(path_xr_grids)

    for gdir in tqdm(gdirs):
        rgi_gl = gdir.rgi_id

        try:
            # Create masked glacier dataset
            ds, glacier_indices = create_masked_glacier(path_RGIs, rgi_gl)
        except ValueError as e:
            # Raised when the grid has no 'glacier_mask' variable.
            print(f"Skipping {rgi_gl}: {e}")
            continue  # Skip to next glacier

        dx_m, dy_m = get_res_from_projected(ds)

        # Coarsen to 50 m resolution if needed; grids outside the
        # 20 m < dx < 50 m window are kept unchanged.
        if 20 < dx_m < 50:
            ds = coarsenDS_mercator(ds, target_res_m=50)
            dx_m, dy_m = get_res_from_projected(ds)

        # Change coordinates to Lat/Lon projection
        original_proj = ds.pyproj_srs
        ds = ds.rio.write_crs(original_proj)
        ds_latlon = ds.rio.reproject("EPSG:4326")
        ds_latlon = ds_latlon.rename({'x': 'lon', 'y': 'lat'})

        # Save xarray dataset
        save_path = os.path.join(path_xr_grids, f"{rgi_gl}.zarr")
        ds_latlon.to_zarr(save_path)

# open example
# Look the example glacier up explicitly, so that a missing id fails with a
# clear message instead of a NameError or a stale `gdir_rhone` from an
# earlier run of the notebook.
RHONE_RGI_ID = 'RGI60-11.01238'
gdir_rhone = next((g for g in gdirs if g.rgi_id == RHONE_RGI_ID), None)
if gdir_rhone is None:
    raise LookupError(f"{RHONE_RGI_ID} not found in gdirs")

rgi_gl_rhone = gdir_rhone.rgi_id
ds = xr.open_dataset(os.path.join(path_xr_grids, f"{rgi_gl_rhone}.zarr"))
fig, axs = plt.subplots(1, 4, figsize=(15, 6))
ds.masked_aspect.plot(ax=axs[0], cmap='twilight_shifted', add_colorbar=True)
ds.masked_slope.plot(ax=axs[1], cmap='cividis', add_colorbar=True)
ds.masked_elev.plot(ax=axs[2], cmap='terrain', add_colorbar=True)
ds.glacier_mask.plot(ax=axs[3], cmap='binary', add_colorbar=False)

axs[0].set_title("Aspect")
axs[1].set_title("Slope")
axs[2].set_title("DEM")
axs[3].set_title("Glacier mask")
plt.tight_layout()

In [None]:
# paths
path_xr_grids = os.path.join(cfg.dataPath, "GLAMOS/topo/RGI_v6_11", "xr_masked_grids/")
path_geotiff  = os.path.join(cfg.dataPath, "GLAMOS/topo/RGI_v6_11", "geotiff_meters_lv95/")
os.makedirs(path_geotiff, exist_ok=True)
# NOTE(review): if emptyfolder() clears the folder, as its name suggests, the
# "already exists" skip inside the loop can never trigger. Drop one of the
# two, depending on whether a full rebuild or a resumable run is wanted.
emptyfolder(path_geotiff)

# target projected CRS in meters (Switzerland)
TARGET_CRS = "EPSG:2056"  # LV95
# optionally set a target pixel size in meters (None keeps native res after reprojection)
TARGET_RES = None  # e.g., 10.0 for ~10 m pixels

# (rgi_id, projected DEM) of the most recently written raster. It is tracked
# explicitly so that the sanity plot below never uses an undefined name, nor
# a title and a DEM that belong to different glaciers.
last_export = None

zarr_stores = [f for f in os.listdir(path_xr_grids) if f.endswith(".zarr")]
for rgi_gl in tqdm(zarr_stores):
    rgi_id = rgi_gl[:-5]  # strip ".zarr"
    out_tif = os.path.join(path_geotiff, f"{rgi_id}.tif")
    if os.path.exists(out_tif):
        continue  # idempotent

    # open Zarr (try consolidated first)
    zarr_path = os.path.join(path_xr_grids, rgi_gl)
    try:
        ds = xr.open_zarr(zarr_path, consolidated=True)
    except Exception:
        # Fall back to a plain open when the consolidated open fails.
        ds = xr.open_zarr(zarr_path)

    dem = ds["masked_elev"]

    # ensure a CRS is set for the source (WGS84 if lat/lon)
    if not dem.rio.crs:
        dem = dem.rio.write_crs("EPSG:4326")

    # propagate nodata if present; otherwise use NaN
    if "_FillValue" in dem.attrs:
        dem = dem.rio.write_nodata(dem.attrs["_FillValue"])
    elif "nodata" in dem.attrs:
        dem = dem.rio.write_nodata(dem.attrs["nodata"])
    else:
        dem = dem.rio.write_nodata(np.nan)

    # reproject to meters (LV95). You can pass resolution=TARGET_RES to enforce a grid size.
    if TARGET_RES is None:
        dem_m = dem.rio.reproject(TARGET_CRS)
    else:
        dem_m = dem.rio.reproject(TARGET_CRS, resolution=TARGET_RES)

    # save a compact, tiled, compressed GeoTIFF
    dem_m.rio.to_raster(
        out_tif,
        dtype="float32",
        compress="LZW",
        BIGTIFF="IF_SAFER",
        tiled=True,
        predictor=3  # better compression for float rasters
    )
    last_export = (rgi_id, dem_m)

# quick sanity plot of the last DEM (projected)
if last_export is None:
    print("No DEM was exported; nothing to plot.")
else:
    plotted_id, plotted_dem = last_export
    plt.figure(figsize=(7,6))
    plotted_dem.plot(cmap="terrain")
    plt.title(f"Projected DEM (meters) — {plotted_id}")
    plt.xlabel("x (m, LV95)")
    plt.ylabel("y (m, LV95)")
    plt.tight_layout()
    plt.show()

### Create monthly dataframes:

In [None]:
# Cell-local re-imports, so this long-running cell has its names at hand.
import os
import re
# NOTE: this rebinds `tqdm` from the notebook variant imported at the top of
# the file (`tqdm.notebook`) to the plain one for all cells that follow.
from tqdm import tqdm
import xarray as xr

# Master switch for the monthly-grid generation below.
RUN = True
# Root output folder; the code below creates one sub-folder per glacier in it.
path_rgi_alps = os.path.join(cfg.dataPath, 'GLAMOS/topo/gridded_topo_inputs/RGI_v6_11/')

# ---- helpers ----
def expected_fname(rgi_gl: str, year: int) -> str:
    """Return the parquet file name of one glacier-year grid.

    Example: ``RGI60-11.00001_grid_1999.parquet``.
    """
    fname = f"{rgi_gl}_grid_{year}.parquet"
    return fname

def years_present_for_glacier(folder_path: str, rgi_gl: str) -> set:
    """Collect the years for which this glacier already has a grid file.

    Only entries directly inside ``folder_path`` whose name is exactly
    ``<rgi_gl>_grid_<4 digits>.parquet`` count. A folder that does not
    exist yields an empty set.
    """
    if not os.path.isdir(folder_path):
        return set()
    # The glacier id is escaped so that its dots match literally.
    pattern = re.compile(rf"^{re.escape(rgi_gl)}_grid_(\d{{4}})\.parquet$")
    hits = (pattern.match(entry) for entry in os.listdir(folder_path))
    return {int(hit.group(1)) for hit in hits if hit}

def glacier_is_complete(rgi_gl: str, years: range) -> bool:
    """Tell whether every requested year already has a grid file.

    The files are looked up in the glacier's own sub-folder of the
    module-level ``path_rgi_alps``.
    """
    available = years_present_for_glacier(
        os.path.join(path_rgi_alps, rgi_gl), rgi_gl)
    return all(year in available for year in years)

# ---- main ----
# Writes one monthly parquet file per glacier and year. Only the years without
# an output file are processed, so the cell can be re-run after an interruption.
# Failures are reported and skipped so that one glacier or year cannot abort
# the whole run.
if RUN:
    # inclusive 1999..2024
    years = range(1999, 2025)

    os.makedirs(path_rgi_alps, exist_ok=True)

    valid_rgis = [
        f.replace('.zarr', '') for f in os.listdir(path_xr_grids)
        if f.endswith('.zarr')
    ]
    # Set copy for constant-time membership tests inside the glacier loop.
    valid_rgi_set = set(valid_rgis)

    # Glaciers that are already complete (all yearly files exist)
    complete_rgis = [r for r in valid_rgis if glacier_is_complete(r, years)]
    # Glaciers that still need work
    rest_rgis = list(valid_rgi_set - set(complete_rgis))

    print(f"Glaciers already complete: {len(complete_rgis)}")
    print(f"Number of glaciers to process: {len(rest_rgis)}")

    # Inputs that are the same for every glacier and year: build them once
    # instead of once per glacier-year.
    pmb_data_path = os.path.join(cfg.dataPath, path_PMB_GLAMOS_csv)
    era5_climate_data = os.path.join(
        cfg.dataPath, path_ERA5_raw, 'era5_monthly_averaged_data_Alps.nc'
    )
    geopotential_data = os.path.join(
        cfg.dataPath, path_ERA5_raw, 'era5_geopotential_pressure_Alps.nc'
    )

    for gdir in tqdm(gdirs, desc="Processing glaciers"):
        rgi_gl = gdir.rgi_id

        if rgi_gl not in valid_rgi_set:
            print(f"Skipping {rgi_gl}: not found in valid RGI glaciers")
            continue

        # Skip if already fully complete (re-checked on disk on purpose, the
        # state may have changed since `complete_rgis` was computed)
        if glacier_is_complete(rgi_gl, years):
            continue

        try:
            file_path = os.path.join(path_xr_grids, f"{rgi_gl}.zarr")
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"Missing file: {file_path}")

            # Open Zarr (try consolidated first, then a plain open)
            try:
                ds = xr.open_zarr(file_path, consolidated=True)
            except Exception:
                ds = xr.open_zarr(file_path)

            # Build grid for all years once
            try:
                df_grid = create_glacier_grid_RGI(ds, years, rgi_gl)
            except Exception as e:
                print(f"Failed creating glacier grid for {rgi_gl}: {e}")
                continue

            df_grid.reset_index(drop=True, inplace=True)

            # Add GLWD_ID and GLACIER columns
            df_grid['GLWD_ID'] = [
                mbm.data_processing.utils.get_hash(f"{r}_{y}")
                for r, y in zip(df_grid['RGIId'].astype(str),
                                df_grid['YEAR'].astype(str))
            ]
            df_grid['GLWD_ID'] = df_grid['GLWD_ID'].astype(str)
            df_grid['GLACIER'] = df_grid['RGIId']

            # Output folder (created only once the grid was built successfully)
            folder_path = os.path.join(path_rgi_alps, rgi_gl)
            os.makedirs(folder_path, exist_ok=True)

            # Determine missing years for this glacier (idempotent)
            existing_years = years_present_for_glacier(folder_path, rgi_gl)
            missing_years = [y for y in years if y not in existing_years]

            if not missing_years:
                # Another process may have finished meanwhile
                continue

            for year in missing_years:
                try:
                    df_grid_y = df_grid[df_grid.YEAR == year].copy()
                    if df_grid_y.empty:
                        # No data for that year; keep going
                        continue

                    # Build dataset & add climate features
                    try:
                        dataset_grid_yearly = mbm.data_processing.Dataset(
                            cfg=cfg,
                            data=df_grid_y,
                            region_name='CH',
                            region_id=11,
                            data_path=pmb_data_path
                        )

                        dataset_grid_yearly.get_climate_features(
                            climate_data=era5_climate_data,
                            geopotential_data=geopotential_data,
                            change_units=True,
                            smoothing_vois={'vois_climate': vois_climate,
                                            'vois_other': ['ALTITUDE_CLIMATE']}
                        )
                    except Exception as e:
                        print(f"Failed adding climate features for {rgi_gl} (year {year}): {e}")
                        continue

                    # Keep only the topographical variables this glacier has.
                    vois_topographical_sub = [voi for voi in vois_topographical
                                              if voi in df_grid_y.columns]

                    dataset_grid_yearly.convert_to_monthly(
                        meta_data_columns=cfg.metaData,
                        vois_climate=vois_climate,
                        vois_topographical=vois_topographical_sub
                    )

                    save_path = os.path.join(folder_path, expected_fname(rgi_gl, year))
                    # If a stale/partial file exists, overwrite it
                    dataset_grid_yearly.data.to_parquet(
                        save_path, engine="pyarrow", compression="snappy"
                    )

                except Exception as e:
                    print(f"Failed processing {rgi_gl} for year {year}: {e}")
                    continue

        except Exception as e:
            print(f"Error with glacier {rgi_gl}: {e}")
            continue

In [None]:
# Look at one example
for gdir in gdirs:
    if gdir.rgi_id == 'RGI60-11.01238':
        gdir_rhone = gdir

rgi_gl = gdir_rhone.rgi_id

# Print the distinct September t2m values for two different years.
for year in (2000, 2004):
    df = pd.read_parquet(
        os.path.join(path_rgi_alps, rgi_gl, f"{rgi_gl}_grid_{year}.parquet"))
    df = df[df.MONTHS == 'sep']
    print(df['t2m'].unique())

In [None]:
# Look at one example
# The loop keeps the last glacier directory whose id matches.
# NOTE(review): `gdir_rhone` is left untouched if no id matches.
for gdir in gdirs:
    if gdir.rgi_id == 'RGI60-11.01238':
        gdir_rhone = gdir

year = 2000
rgi_gl = gdir_rhone.rgi_id

# Read that glacier-year's monthly grid and keep the rows labelled 'sep'.
df = pd.read_parquet(
    os.path.join(path_rgi_alps, rgi_gl, f"{rgi_gl}_grid_{year}.parquet"))
df = df[df.MONTHS == 'sep']
# One panel per variable: 2 x 3 axes for the six entries of `voi`.
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
# NOTE(review): 'hugonnet_dhdt' and 'consensus_ice_thickness' are treated as
# optional elsewhere in this notebook; for a glacier without them the
# corresponding column may be missing here.
voi = [
    't2m', 'tp', 'ALTITUDE_CLIMATE', 'ELEVATION_DIFFERENCE', 'hugonnet_dhdt',
    'consensus_ice_thickness'
]
axs = axs.flatten()
# Scatter the grid points by longitude/latitude, coloured by each variable.
for i, var in enumerate(voi):
    sns.scatterplot(df,
                    x='POINT_LON',
                    y='POINT_LAT',
                    hue=var,
                    s=5,
                    alpha=0.5,
                    palette='twilight_shifted',
                    ax=axs[i])

### Location of all glaciers:

In [None]:
# Reference year whose grid file is read for every glacier. It is set here
# explicitly so that the cell does not depend on whatever value `year` was
# left at by a previously executed cell.
year = 2000

# Mean position of every glacier that has a grid file for the reference year.
# The monthly-grid generation skips years without data and years that failed,
# so a glacier folder is not guaranteed to contain this file; such entries
# (and any stray non-glacier entry of the folder) are reported and skipped
# instead of aborting the loop.
rgi_ids = []
pos_gl = []
for rgi_gl in tqdm(os.listdir(path_rgi_alps)):
    grid_file = os.path.join(path_rgi_alps, rgi_gl,
                             f"{rgi_gl}_grid_{year}.parquet")
    if not os.path.isfile(grid_file):
        print(f"Skipping {rgi_gl}: no grid file for {year}")
        continue
    df = pd.read_parquet(grid_file)
    pos_gl.append((df.POINT_LAT.mean(), df.POINT_LON.mean()))
    rgi_ids.append(rgi_gl)

# `rgi_ids` and `pos_gl` are filled together, so the id column always lines up
# with the positions.
df_pos_all = pd.DataFrame(pos_gl, columns=['lat', 'lon'])
df_pos_all['rgi_id'] = rgi_ids

In [None]:
print('Number of glaciers in RGI region 11.6:', len(df_pos_all))

# ---- Create figure and base map ----
fig = plt.figure(figsize=(18, 10))

# Map window: latitude 44..48 and longitude 4..14.
latN, latS = 48, 44
lonW, lonE = 4, 14
projPC = ccrs.PlateCarree()
ax2 = plt.axes(projection=projPC)
ax2.set_extent([lonW, lonE, latS, latN], crs=ccrs.Geodetic())

# Background features.
ax2.add_feature(cfeature.COASTLINE)
ax2.add_feature(cfeature.LAKES)
ax2.add_feature(cfeature.RIVERS)
ax2.add_feature(cfeature.BORDERS, linestyle='-', linewidth=1)

# ---- One marker per glacier at its mean grid position ----
g = sns.scatterplot(
    data=df_pos_all,
    x='lon',
    y='lat',
    alpha=0.6,
    transform=projPC,
    ax=ax2,
    zorder=10,
    legend=True  # NOTE(review): no hue/size/style is passed and no custom legend is built in this cell
)

# ---- RGI glacier outlines in black ----
glacier_outline_rgi.plot(ax=ax2, transform=projPC, color='black')

# ---- Gridlines ----
gl = ax2.gridlines(draw_labels=True,
                   linewidth=1,
                   color='gray',
                   alpha=0.5,
                   linestyle='--')
gl.xformatter = LONGITUDE_FORMATTER
gl.yformatter = LATITUDE_FORMATTER
gl.xlabel_style = {'size': 16, 'color': 'black'}
gl.ylabel_style = {'size': 16, 'color': 'black'}
# Labels are switched off on the top and right edges.
gl.top_labels = gl.right_labels = False

## Train LSTM model:

In [None]:
# Stake measurements, loaded through the project helper.
data_glamos = getStakesData(cfg)

# Head/tail padding derived from the stake data; both values are passed to
# MBSequenceDataset.from_dataframe in the model cells below.
# NOTE(review): this calls a private (underscore-prefixed) helper of mbm.
months_head_pad, months_tail_pad = mbm.data_processing.utils._compute_head_tail_pads_from_df(
    data_glamos)

# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Transform data to monthly format (run or load data):
# NOTE: these ERA5 file names differ from the '*_Alps.nc' files used for the
# glacier grids above.
paths = {
    'csv_path': cfg.dataPath + path_PMB_GLAMOS_csv,
    'era5_climate_data':
    cfg.dataPath + path_ERA5_raw + 'era5_monthly_averaged_data.nc',
    'geopotential_data':
    cfg.dataPath + path_ERA5_raw + 'era5_geopotential_pressure.nc',
    'radiation_save_path': cfg.dataPath + path_pcsr + 'zarr/'
}
# RUN is passed on as run_flag; per the comment above it selects between
# recomputing the monthly data and loading it — confirm in process_or_load_data.
RUN = False
data_monthly = process_or_load_data(
    run_flag=RUN,
    data_glamos=data_glamos,
    paths=paths,
    cfg=cfg,
    vois_climate=vois_climate,
    vois_topographical=vois_topographical,
    output_file='CH_wgms_dataset_monthly_LSTM_CA.csv')

# Create DataLoader
# This is the mbm data loader (used for the glacier-wise split below), not
# torch's DataLoader.
dataloader_gl = mbm.dataloader.DataLoader(cfg,
                                          data=data_monthly,
                                          random_seed=cfg.seed,
                                          meta_data_columns=cfg.metaData)

In [None]:
# Ensure all test glaciers exist in the dataset
existing_glaciers = set(data_monthly.GLACIER.unique())
missing_glaciers = [g for g in TEST_GLACIERS if g not in existing_glaciers]

# Only a warning is printed: missing test glaciers do not stop the run.
if missing_glaciers:
    print(
        f"Warning: The following test glaciers are not in the dataset: {missing_glaciers}"
    )

# Define training glaciers correctly
# Everything present in the data that is not a test glacier.
train_glaciers = [i for i in existing_glaciers if i not in TEST_GLACIERS]

data_test = data_monthly[data_monthly.GLACIER.isin(TEST_GLACIERS)]
print('Size of monthly test data:', len(data_test))

data_train = data_monthly[data_monthly.GLACIER.isin(train_glaciers)]
print('Size of monthly train data:', len(data_train))

# NOTE: the printed percentage is the test size relative to the train size,
# not the test share of the whole dataset.
if len(data_train) == 0:
    print("Warning: No training data available!")
else:
    test_perc = (len(data_test) / len(data_train)) * 100
    print('Percentage of test size: {:.2f}%'.format(test_perc))

# Split produced by the project helper, by glacier and with the same seed.
splits, test_set, train_set = get_CV_splits(dataloader_gl,
                                            test_split_on='GLACIER',
                                            test_splits=TEST_GLACIERS,
                                            random_state=cfg.seed)

print('Test glaciers: ({}) {}'.format(len(test_set['splits_vals']),
                                      test_set['splits_vals']))
# NOTE(review): unlike the check above, this division is not guarded against
# an empty train set.
test_perc = (len(test_set['df_X']) / len(train_set['df_X'])) * 100
print('Percentage of test size: {:.2f}%'.format(test_perc))
print('Size of test set:', len(test_set['df_X']))
print('Train glaciers: ({}) {}'.format(len(train_set['splits_vals']),
                                       train_set['splits_vals']))
print('Size of train set:', len(train_set['df_X']))

# Validation and train split:
# data_train / data_test are rebound to the frames of the helper's split and
# replace the manual selection above. No copy is taken, so the 'y' column is
# written into the same objects that train_set / test_set hold.
data_train = train_set['df_X']
data_train['y'] = train_set['y']

data_test = test_set['df_X']
data_test['y'] = test_set['y']

### Load trained model:

#### Simple model:

In [None]:
# Simple model: monthly climate features plus two static topographical
# features (aspect, slope).
MONTHLY_COLS = [
    't2m',
    'tp',
    'slhf',
    'sshf',
    'ssrd',
    'fal',
    'str',
    'ELEVATION_DIFFERENCE',
]
STATIC_COLS = ['aspect', 'slope']

feature_columns = MONTHLY_COLS + STATIC_COLS
seed_all(cfg.seed)

# Work on copies and normalise the PERIOD labels (trimmed, lower case).
df_train = data_train.copy()
df_train['PERIOD'] = df_train['PERIOD'].str.strip().str.lower()

df_test = data_test.copy()
df_test['PERIOD'] = df_test['PERIOD'].str.strip().str.lower()

# --- build train dataset from dataframe ---
ds_train_simple = mbm.data_processing.MBSequenceDataset.from_dataframe(
    df_train,
    MONTHLY_COLS,
    STATIC_COLS,
    months_tail_pad=months_tail_pad,
    months_head_pad=months_head_pad,
    expect_target=True)

ds_test_simple = mbm.data_processing.MBSequenceDataset.from_dataframe(
    df_test,
    MONTHLY_COLS,
    STATIC_COLS,
    months_tail_pad=months_tail_pad,
    months_head_pad=months_head_pad,
    expect_target=True)

train_idx_simple, val_idx_simple = mbm.data_processing.MBSequenceDataset.split_indices(
    len(ds_train_simple), val_ratio=0.2, seed=cfg.seed)

# The feature counts are derived from the column lists, so the model input
# sizes cannot drift from the features actually fed in.
custom_params = {
    'Fm': len(MONTHLY_COLS),
    'Fs': len(STATIC_COLS),
    'hidden_size': 128,
    'num_layers': 1,
    'bidirectional': False,
    'dropout': 0.0,
    'static_layers': 0,
    'static_hidden': None,
    'static_dropout': None,
    'lr': 0.001,
    'weight_decay': 0.0,
    'loss_name': 'neutral',
    'loss_spec': None
}

custom_params['two_heads'] = True
custom_params['head_dropout'] = 0.0

params_simple_model = custom_params.copy()

# --- checkpoint name used when a new model is trained ---
current_date = datetime.now().strftime("%Y-%m-%d")
model_filename = f"models/lstm_model_{current_date}_CA_simple.pt"

# --- loaders (fit scalers on TRAIN, apply to whole ds_train_simple) ---
ds_train_simple_copy = mbm.data_processing.MBSequenceDataset._clone_untransformed_dataset(
    ds_train_simple)

ds_test_simple_copy = mbm.data_processing.MBSequenceDataset._clone_untransformed_dataset(
    ds_test_simple)

train_dl, val_dl = ds_train_simple_copy.make_loaders(
    train_idx=train_idx_simple,
    val_idx=val_idx_simple,
    batch_size_train=64,
    batch_size_val=128,
    seed=cfg.seed,
    fit_and_transform=
    True,  # fit scalers on TRAIN and transform Xm/Xs/y in-place
    shuffle_train=True,
    use_weighted_sampler=True  # use weighted sampler for training
)

# --- test loader (copies TRAIN scalers into ds_test_simple and transforms it) ---
test_dl_simple = mbm.data_processing.MBSequenceDataset.make_test_loader(
    ds_test_simple_copy, ds_train_simple_copy, batch_size=128, seed=cfg.seed)

# --- build model, resolve loss, train, reload best ---
model = mbm.models.LSTM_MB.build_model_from_params(cfg, custom_params, device)
loss_fn = mbm.models.LSTM_MB.resolve_loss_fn(custom_params)

TRAIN = False
if TRAIN:
    # Start from a clean checkpoint file for today's run.
    if os.path.exists(model_filename):
        os.remove(model_filename)

    history, best_val, best_state = model.train_loop(
        device=device,
        train_dl=train_dl,
        val_dl=val_dl,
        epochs=150,
        lr=custom_params['lr'],
        weight_decay=custom_params['weight_decay'],
        clip_val=1,
        # scheduler
        sched_factor=0.5,
        sched_patience=6,
        sched_threshold=0.01,
        sched_threshold_mode="rel",
        sched_cooldown=1,
        sched_min_lr=1e-6,
        # early stopping
        es_patience=15,
        es_min_delta=1e-4,
        # logging
        log_every=5,
        verbose=True,
        # checkpoint
        save_best_path=model_filename,
        loss_fn=loss_fn,
    )
    plot_history_lstm(history)
else:
    # Not retraining: evaluate the pinned, previously trained checkpoint.
    model_filename = "models/lstm_model_2025-09-30_CA_simple.pt"

# Load the weights that are evaluated below. After training this is the
# checkpoint passed as `save_best_path` (presumably the best epoch — confirm
# in train_loop), rather than silently falling back to the pinned file.
state = torch.load(model_filename, map_location=device)
model.load_state_dict(state)

# Evaluate on test
test_metrics, test_df_preds = model.evaluate_with_preds(
    device, test_dl_simple, ds_test_simple_copy)
test_rmse_a, test_rmse_w = test_metrics['RMSE_annual'], test_metrics[
    'RMSE_winter']

print('Test RMSE annual: {:.3f} | winter: {:.3f}'.format(
    test_rmse_a, test_rmse_w))

scores_annual, scores_winter = compute_seasonal_scores(test_df_preds,
                                                       target_col='target',
                                                       pred_col='pred')

fig = plot_predictions_summary(grouped_ids=test_df_preds,
                               scores_annual=scores_annual,
                               scores_winter=scores_winter,
                               ax_xlim=(-8, 6),
                               ax_ylim=(-8, 6))

#### Partial models:

In [None]:
# Partial models: the simple feature set plus one or two of the three optional
# OGGM variables as extra static features. One model is trained per
# combination, and only when TRAIN is switched on.
MONTHLY_COLS = [
    't2m',
    'tp',
    'slhf',
    'sshf',
    'ssrd',
    'fal',
    'str',
    'ELEVATION_DIFFERENCE',
]
# NOTE(review): `vars` shadows the Python builtin of the same name for the
# rest of the session and is not used in this cell; consider renaming it.
vars = ['hugonnet_dhdt', 'consensus_ice_thickness', 'millan_v']

# All one- and two-variable combinations of the three optional variables
# (the full three-variable set is handled by the full model).
all_combos = [('hugonnet_dhdt', ), ('consensus_ice_thickness', ),
              ('millan_v', ), ('hugonnet_dhdt', 'consensus_ice_thickness'),
              ('hugonnet_dhdt', 'millan_v'),
              ('consensus_ice_thickness', 'millan_v')]

TRAIN = False
if TRAIN:
    for combo in all_combos:
        print(combo)

        # Static features: the two base columns plus this combination.
        STATIC_COLS = ['aspect', 'slope', *combo]
        feature_columns = MONTHLY_COLS + STATIC_COLS
        # Re-seed for every combination so that each model starts from the same seed.
        seed_all(cfg.seed)

        # prepare train/test data
        df_train = data_train.copy()
        df_train['PERIOD'] = df_train['PERIOD'].str.strip().str.lower()
        df_test = data_test.copy()
        df_test['PERIOD'] = df_test['PERIOD'].str.strip().str.lower()

        # datasets
        ds_train_full = mbm.data_processing.MBSequenceDataset.from_dataframe(
            df_train,
            MONTHLY_COLS,
            STATIC_COLS,
            months_tail_pad=months_tail_pad,
            months_head_pad=months_head_pad,
            expect_target=True)
        ds_test_full = mbm.data_processing.MBSequenceDataset.from_dataframe(
            df_test,
            MONTHLY_COLS,
            STATIC_COLS,
            months_tail_pad=months_tail_pad,
            months_head_pad=months_head_pad,
            expect_target=True)

        # split train/val
        train_idx_full, val_idx_full = mbm.data_processing.MBSequenceDataset.split_indices(
            len(ds_train_full), val_ratio=0.2, seed=cfg.seed)

        # params
        # 'Fs' follows the number of static columns of this combination;
        # 'Fm' is hard-coded to 8, the current length of MONTHLY_COLS.
        custom_params = {
            'Fm': 8,
            'Fs': len(STATIC_COLS),
            'hidden_size': 128,
            'num_layers': 1,
            'bidirectional': False,
            'dropout': 0.0,
            'static_layers': 0,
            'static_hidden': None,
            'static_dropout': None,
            'lr': 0.001,
            'weight_decay': 0.0,
            'loss_name': 'neutral',
            'loss_spec': None,
            'two_heads': True,
            'head_dropout': 0.0
        }
        params_full_model = custom_params.copy()

        # --- model filename ---
        # One checkpoint per combination and calendar day.
        current_date = datetime.now().strftime("%Y-%m-%d")
        combo_str = "_".join(combo)
        model_filename = f"models/lstm_model_{current_date}_CA_{combo_str}.pt"

        # loaders
        ds_train_full_copy = mbm.data_processing.MBSequenceDataset._clone_untransformed_dataset(
            ds_train_full)
        ds_test_full_copy = mbm.data_processing.MBSequenceDataset._clone_untransformed_dataset(
            ds_test_full)

        train_dl, val_dl = ds_train_full_copy.make_loaders(
            train_idx=train_idx_full,
            val_idx=val_idx_full,
            batch_size_train=64,
            batch_size_val=128,
            seed=cfg.seed,
            fit_and_transform=True,
            shuffle_train=True,
            use_weighted_sampler=True)

        # NOTE: the test loader is built but not evaluated inside this loop.
        test_dl_full = mbm.data_processing.MBSequenceDataset.make_test_loader(
            ds_test_full_copy,
            ds_train_full_copy,
            batch_size=128,
            seed=cfg.seed)

        # model
        model = mbm.models.LSTM_MB.build_model_from_params(
            cfg, custom_params, device)
        loss_fn = mbm.models.LSTM_MB.resolve_loss_fn(custom_params)

        # Remove a checkpoint from an earlier run on the same day.
        if os.path.exists(model_filename):
            os.remove(model_filename)

        # Same training settings as for the simple model.
        history, best_val, best_state = model.train_loop(
            device=device,
            train_dl=train_dl,
            val_dl=val_dl,
            epochs=150,
            lr=custom_params['lr'],
            weight_decay=custom_params['weight_decay'],
            clip_val=1,
            sched_factor=0.5,
            sched_patience=6,
            sched_threshold=0.01,
            sched_threshold_mode="rel",
            sched_cooldown=1,
            sched_min_lr=1e-6,
            es_patience=15,
            es_min_delta=1e-4,
            log_every=5,
            verbose=True,
            save_best_path=model_filename,
            loss_fn=loss_fn)

#### Full model (with OGGM variables):

In [None]:
# Feature set for the "full" model (the variant described by this cell's
# heading as including the OGGM variables): monthly inputs first, then the
# static per-sample inputs.
MONTHLY_COLS = [
    't2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str', 'ELEVATION_DIFFERENCE'
]
STATIC_COLS = [
    'aspect',
    'slope',
    'hugonnet_dhdt',
    'consensus_ice_thickness',
    'millan_v',
]
feature_columns = [*MONTHLY_COLS, *STATIC_COLS]

seed_all(cfg.seed)

# Work on copies of the train/test frames with a normalised PERIOD label
# (surrounding whitespace stripped, lower-cased).
df_train = data_train.assign(
    PERIOD=data_train['PERIOD'].str.strip().str.lower())
df_test = data_test.assign(PERIOD=data_test['PERIOD'].str.strip().str.lower())

# Both sequence datasets are built with identical padding/target settings.
_pad_kwargs = dict(months_tail_pad=months_tail_pad,
                   months_head_pad=months_head_pad,
                   expect_target=True)

# --- train dataset ---
ds_train_full = mbm.data_processing.MBSequenceDataset.from_dataframe(
    df_train, MONTHLY_COLS, STATIC_COLS, **_pad_kwargs)

# --- test dataset ---
ds_test_full = mbm.data_processing.MBSequenceDataset.from_dataframe(
    df_test, MONTHLY_COLS, STATIC_COLS, **_pad_kwargs)

# Seeded 80/20 split of the training dataset indices into train / validation.
train_idx_full, val_idx_full = (
    mbm.data_processing.MBSequenceDataset.split_indices(len(ds_train_full),
                                                        val_ratio=0.2,
                                                        seed=cfg.seed))

# Hyper-parameters handed to `LSTM_MB.build_model_from_params` and
# `LSTM_MB.resolve_loss_fn` for the full model.
custom_params = {
    # Feature counts are derived from the column lists rather than hard-coded
    # (they evaluate to 8 and 5 for the lists defined in this cell), so they
    # cannot drift when a feature is added or removed. This matches the
    # `'Fs': len(STATIC_COLS)` convention used for the other model variants.
    # NOTE(review): 'Fm' is presumably the number of monthly features — confirm.
    'Fm': len(MONTHLY_COLS),
    'Fs': len(STATIC_COLS),
    'hidden_size': 128,
    'num_layers': 1,
    'bidirectional': False,
    'dropout': 0.0,
    'static_layers': 0,
    'static_hidden': None,
    'static_dropout': None,
    'lr': 0.001,
    'weight_decay': 0.0,
    'loss_name': 'neutral',
    'loss_spec': None,
    'two_heads': True,
    'head_dropout': 0.0,
}

# Independent (shallow) copy kept under a separate name for the full model.
params_full_model = custom_params.copy()

# --- checkpoint path for this run, stamped with today's date ---
current_date = datetime.now().strftime("%Y-%m-%d")
model_filename = f"models/lstm_model_{current_date}_CA_full.pt"

# --- untransformed clones, so the datasets built above stay untouched ---
ds_train_full_copy = (
    mbm.data_processing.MBSequenceDataset._clone_untransformed_dataset(
        ds_train_full))
ds_test_full_copy = (
    mbm.data_processing.MBSequenceDataset._clone_untransformed_dataset(
        ds_test_full))

# --- train / validation loaders ---
# Per the original notes: fit_and_transform=True fits the scalers on the TRAIN
# split and transforms Xm/Xs/y in place; training batches are drawn with a
# weighted sampler.
train_dl, val_dl = ds_train_full_copy.make_loaders(
    train_idx=train_idx_full,
    val_idx=val_idx_full,
    batch_size_train=64,
    batch_size_val=128,
    seed=cfg.seed,
    fit_and_transform=True,
    shuffle_train=True,
    use_weighted_sampler=True,
)

# --- test loader ---
# Per the original notes: the TRAIN scalers are copied into the test dataset,
# which is then transformed with them.
test_dl_full = mbm.data_processing.MBSequenceDataset.make_test_loader(
    ds_test_full_copy,
    ds_train_full_copy,
    batch_size=128,
    seed=cfg.seed,
)

# --- model and loss, both configured from custom_params ---
model = mbm.models.LSTM_MB.build_model_from_params(cfg, custom_params, device)
loss_fn = mbm.models.LSTM_MB.resolve_loss_fn(custom_params)

# Toggle: True trains a new model and checkpoints it to `model_filename`
# (today's date); False reuses the previously trained 2025-09-30 checkpoint.
TRAIN = False
if TRAIN:
    # Remove any checkpoint left over from an earlier run on the same day.
    if os.path.exists(model_filename):
        os.remove(model_filename)

    history, best_val, best_state = model.train_loop(
        device=device,
        train_dl=train_dl,
        val_dl=val_dl,
        epochs=150,
        lr=custom_params['lr'],
        weight_decay=custom_params['weight_decay'],
        clip_val=1,
        # scheduler
        sched_factor=0.5,
        sched_patience=6,
        sched_threshold=0.01,
        sched_threshold_mode="rel",
        sched_cooldown=1,
        sched_min_lr=1e-6,
        # early stopping
        es_patience=15,
        es_min_delta=1e-4,
        # logging
        log_every=5,
        verbose=True,
        # checkpoint
        # NOTE(review): assumes train_loop writes the best state dict to this
        # path, since it is loaded from there below — confirm.
        save_best_path=model_filename,
        loss_fn=loss_fn,
    )
    plot_history_lstm(history)
else:
    # Only fall back to the stored checkpoint when no training was run.
    # Previously this override was unconditional, so a freshly trained model
    # was silently ignored in favour of the old file.
    model_filename = "models/lstm_model_2025-09-30_CA_full.pt"

# Load the selected checkpoint's weights into the model.
state = torch.load(model_filename, map_location=device)
model.load_state_dict(state)

# --- evaluation on the held-out test set ---
# Returns a metrics mapping together with a dataframe of predictions.
test_metrics, test_df_preds = model.evaluate_with_preds(
    device,
    test_dl_full,
    ds_test_full_copy,
)
test_rmse_a = test_metrics['RMSE_annual']
test_rmse_w = test_metrics['RMSE_winter']

print('Test RMSE annual: {:.3f} | winter: {:.3f}'.format(
    test_rmse_a, test_rmse_w))

# Separate annual / winter scores computed from the prediction dataframe.
scores_annual, scores_winter = compute_seasonal_scores(
    test_df_preds,
    target_col='target',
    pred_col='pred',
)

# Summary figure of the predictions, with fixed axis limits.
fig = plot_predictions_summary(
    grouped_ids=test_df_preds,
    scores_annual=scores_annual,
    scores_winter=scores_winter,
    ax_xlim=(-8, 6),
    ax_ylim=(-8, 6),
)

## Results:

In [None]:
# RGI outlines for Central Europe (RGI 6.0 shapefile). The RGI 7.0 alternative
# is kept here for reference:
#path_rgi = cfg.dataPath+'GLAMOS/RGI/RGI2000-v7.0-G-11_central_europe/RGI2000-v7.0-G-11_central_europe.shp'
path_rgi = cfg.dataPath + 'GLAMOS/RGI/nsidc0770_11.rgi60.CentralEurope/11_rgi60_CentralEurope.shp'

# Read the outlines and report the CRS they were stored in.
gdf = gpd.read_file(path_rgi)
print(gdf.crs)

# Areas are measured in an equal-area projection (EPSG:3035, Europe), with the
# reprojected outlines keyed by RGI id for later lookups.
gdf_proj = gdf.to_crs(3035).set_index('RGIId', drop=True)

# Polygon area in the projected CRS, and the same value converted to km².
gdf_proj["area_m2"] = gdf_proj.geometry.area
gdf_proj["area_km2"] = gdf_proj["area_m2"] / 1e6

In [None]:
# open output file
output_df = pd.read_csv("logs/glacier_mean_MB_old.csv").drop(['Index'], axis=1)

output_df['area_gl'] = output_df['RGIId'].map(
    lambda x: gdf_proj.loc[x, 'area_km2'])

# yearly_mean_mb_CA = output_df.groupby('Year',
#                                       as_index=False).agg({'Mean_MB': 'mean'})
# yearly_cum_mb_CA = output_df.groupby('Year',
#                                      as_index=False).agg({'Mean_MB': 'sum'})
# yearly_cum_mb_CA['Cum_MB'] = yearly_cum_mb_CA['Mean_MB'].cumsum()
# yearly_cum_mb_CA['Mean_MB'] = yearly_mean_mb_CA['Mean_MB']
# # yearly_cum_mb_CA['Mean_MB'] = yearly_cum_mb_CA['Mean_MB'] / total_area
# yearly_cum_mb_CA.head()

df = output_df.copy()

# annual change per glacier in Gt
df["annual_change_gt"] = (df["Mean_MB"] * df["area_gl"]) / 1e9

# total annual change in Gt (sum across glaciers)
annual_gt = df.groupby("Year")["annual_change_gt"].sum().reset_index(
    name="Annual_MB_Gt")

# cumulative MB in Gt
annual_gt["Cumulative_MB_Gt"] = annual_gt["Annual_MB_Gt"].cumsum()

# compute weighted mean MB per year
yearly_weighted = (output_df.groupby("Year").apply(lambda g: (g["Mean_MB"] * g[
    "area_gl"]).sum() / g["area_gl"].sum()).reset_index(name="Weighted_MB"))

print(yearly_weighted.head())

In [None]:
# GLAMBIE reference values for Central Europe.
glambie_df = pd.read_csv('glambie_values.csv')

# The three date columns are rounded to whole numbers and shifted back by one.
# NOTE(review): presumably this aligns them with the year labels used for the
# LSTM results — confirm.
date_columns = [
    'central_europe_dates',
    'central_europe_start_dates',
    'central_europe_end_dates',
]
glambie_df[date_columns] = glambie_df[date_columns].round() - 1

glambie_df.head()

In [None]:
# --- plotting ---
fig, axs = plt.subplots(1, 2, figsize=(15, 6), sharey=True)

# --------------------
# Left: LSTM results
# --------------------
ax1 = axs[0]
years = yearly_weighted['Year']

# barplot: annual weighted MB (m w.e.)
ax1.bar(years,
        yearly_weighted['Weighted_MB'],
        color="skyblue",
        label="Area-weighted annual MB")
ax1.set_ylabel("Annual MB (m w.e.)", color="skyblue")

# lineplot: cumulative MB in Gt (secondary axis)
ax2 = ax1.twinx()
ax2.plot(annual_gt['Year'],
         annual_gt['Cumulative_MB_Gt'],
         color="red",
         marker="o",
         label="Cumulative MB")
ax2.set_ylabel("Cumulative MB (Gt)", color="red")

ax1.set_title("Central Alps annual MB (LSTM)")
ax1.legend(loc="upper left")
ax2.legend(loc="upper right")

# --------------------
# Right: GLAMBIE results
# --------------------
ax3 = axs[1]

# annual MB (bars)
ax3.bar(glambie_df['central_europe_end_dates'],
        glambie_df['central_europe_annual_change_mwe'],
        color="lightgreen",
        label="Annual MB (GLAMBIE)")
ax3.set_ylabel("Annual MB (m w.e.)", color="lightgreen")

# cumulative MB (line, secondary axis)
ax4 = ax3.twinx()
ax4.plot(glambie_df['central_europe_dates'],
         glambie_df['central_europe_cumulative_change_gt'],
         color="darkgreen",
         marker="s",
         label="Cumulative MB (GLAMBIE)")
ax4.set_ylabel("Cumulative MB (Gt)", color="darkgreen")

ax3.set_title("Central Europe MB (GLAMBIE)")
ax3.legend(loc="upper left")
ax4.legend(loc="upper right")

# --------------------
# Formatting
# --------------------
for ax in axs:
    ax.tick_params(axis="x", rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# make sure both datasets use the same x-axis type
years_lstm = yearly_weighted['Year']

years_glambie = glambie_df['central_europe_end_dates']

fig, ax = plt.subplots(figsize=(12, 6))

# bar width
width = 0.4

# LSTM bars (slightly shifted left)
ax.bar(years_lstm - 0.2,
       yearly_weighted['Weighted_MB'],
       width=width,
       color="skyblue",
       label="LSTM Annual MB")

# GLAMBIE bars (slightly shifted right)
ax.bar(years_glambie + 0.2,
       glambie_df['central_europe_annual_change_mwe'],
       width=width,
       color="lightgreen",
       label="GLAMBIE Annual MB")

# formatting
ax.set_ylabel("Annual MB (m w.e.)")
ax.set_title("Annual Mass Balance: LSTM vs GLAMBIE")
ax.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()