# Glacier grids from RGI:

Creates monthly grid files so that the MassBalanceMachine (MBM) can make PMB predictions over the whole glacier grid. The files are built from the RGI grid with OGGM topography. The computation takes a long time because each grid has to be converted to monthly format.
## Setting up:

In [None]:
import sys, os

sys.path.append(os.path.join(os.getcwd(),
                             '../../'))  # Add root of repo to import MBM
import csv
from functools import partial

import pandas as pd
import warnings
from tqdm.notebook import tqdm
import re
import matplotlib.pyplot as plt
import seaborn as sns
from cmcrameri import cm
import xarray as xr
import massbalancemachine as mbm
from collections import defaultdict
import logging
from skorch.helper import SliceDataset
from datetime import datetime
from skorch.callbacks import EarlyStopping, LRScheduler, Checkpoint
import itertools
import random
import pickle
from collections import Counter
import ast
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
from scripts.helpers import *
from scripts.glamos_preprocess import *
from scripts.plots import *
from scripts.config_CH import *
from scripts.nn_helpers import *
from scripts.xgb_helpers import *
from scripts.geodata import *
from scripts.NN_networks import *
from scripts.geodata_plots import *

# Silence every Python warning for the remainder of the notebook.
warnings.filterwarnings('ignore')
# IPython magics: load the autoreload extension and reload modules before
# each cell execution (mode 2).
%load_ext autoreload
%autoreload 2

# Switzerland configuration object from MBM; its `seed` is used right below and
# its `dataPath` is the root of all data paths in later cells.
cfg = mbm.SwitzerlandConfig()

# `seed_all` is not defined in this notebook (presumably it comes from one of
# the `scripts.*` star imports — confirm).
seed_all(cfg.seed)
print("Using seed:", cfg.seed)

from torch.utils.data import Subset
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset
from torch.utils.data import WeightedRandomSampler, SubsetRandomSampler
import torch.nn as nn

# NOTE(review): `torch` itself is never imported explicitly in this cell; it is
# presumably exposed by one of the `scripts.*` star imports — confirm.
if torch.cuda.is_available():
    print("CUDA is available")
    free_up_cuda()
else:
    print("CUDA is NOT available")

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Apply the project's matplotlib style sheet to all figures of this notebook.
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

# Names of the climate feature columns.
vois_climate = [
    't2m',
    'tp',
    'slhf',
    'sshf',
    'ssrd',
    'fal',
    'str',
    'u10',
    'v10',
]

# Names of the topographical feature columns.
vois_topographical = [
    "aspect",
    "slope",
    "hugonnet_dhdt",
    "consensus_ice_thickness",
    "millan_v",
    "topo",
    'svf',
]

# RGI glacier outlines, read from the path configured in `path_rgi_outlines`.
glacier_outline_rgi = gpd.read_file(cfg.dataPath + path_rgi_outlines)

In [None]:
# Initialise the OGGM glacier directories for RGI region "11", RGI version
# "62", pointing OGGM at the glacier-directory files under `base_url`.
# `task_list=None` and the WARNING log level are passed through to the project
# helper (their exact effect is defined there, not in this notebook).
gdirs, rgidf = initialize_oggm_glacier_directories(
    cfg,
    rgi_region="11",
    rgi_version="62",
    base_url=
    "https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L1-L2_files/2025.6/elev_bands_w_data/",
    log_level='WARNING',
    task_list=None,
)

# Save OGGM xr for all needed glaciers in RGI region 11.6:
# The returned frame is used below through its `rgi_id` and `missing_vars`
# columns (`missing_vars` holds list-like values that get exploded).
df_missing = export_oggm_grids(cfg, gdirs)

# RGI v6 Central Europe outlines shapefile.
path_rgi = cfg.dataPath + 'GLAMOS/RGI/nsidc0770_11.rgi60.CentralEurope/11_rgi60_CentralEurope.shp'
gdf = gpd.read_file(path_rgi)

# Areas are measured in an equal-area projection (EPSG:3035 for Europe); the
# id column is renamed so it matches the key used in `df_missing`.
gdf_proj = gdf.to_crs(3035)
gdf_proj = gdf_proj.rename(columns={"RGIId": "rgi_id"})
gdf_proj["area_m2"] = gdf_proj.geometry.area
gdf_proj["area_km2"] = gdf_proj["area_m2"] / 1e6

# Attach the glacier area to every entry of the missing-variable table.
df_missing = pd.merge(df_missing,
                      gdf_proj[['area_km2', 'rgi_id']],
                      on="rgi_id")

# Reference: summed area of all RGI outlines.
total_area = gdf_proj["area_km2"].sum()

# One row per (glacier, missing variable) pair.
df_exploded = df_missing.explode("missing_vars")
grouped_by_var = df_exploded.groupby("missing_vars")

# 1) COUNT: distinct glaciers missing each variable, most frequent first.
counts_missing_per_var = grouped_by_var["rgi_id"].nunique()
counts_missing_per_var = counts_missing_per_var.sort_values(ascending=False)

# 2) TOTAL area (km2 and % of all outlines) of the glaciers in `df_missing`.
total_missing_area_km2 = df_missing["area_km2"].sum()
total_missing_area_pct = (total_missing_area_km2 / total_area) * 100

print(
    f"Total glacier area with ANY missing variable: {total_missing_area_km2:,.2f} km² ({total_missing_area_pct:.2f}%)"
)

# 3) Per-variable share of the total area, largest first.
area_missing_per_var = grouped_by_var["area_km2"].sum()
area_missing_per_var = area_missing_per_var.sort_values(ascending=False)
perc_missing_per_var = (area_missing_per_var / total_area) * 100

print("\n% of total glacier area missing per variable:")
for var, pct in zip(perc_missing_per_var.index, perc_missing_per_var.values):
    print(f"  - {var}: {pct:.2f}%")

# ---- bar chart: how many glaciers lack each variable ----
fig_missing, ax_missing = plt.subplots(figsize=(7, 4))
ax_missing.bar(counts_missing_per_var.index, counts_missing_per_var.values)
ax_missing.set_xlabel("Missing variable")
ax_missing.set_ylabel("Number of glaciers")
ax_missing.set_title("Count of glaciers missing each variable")
fig_missing.tight_layout()
plt.show()

In [None]:
# Glacier id table: strip whitespace from the column headers, then order and
# index the rows by the glacier short name.
rgi_df = (pd.read_csv(cfg.dataPath + path_glacier_ids, sep=',')
          .rename(columns=lambda col: col.strip())
          .sort_values(by='short_name')
          .set_index('short_name'))

# RGI v6 ids of the two glaciers referenced by name.
rhone_rgiid = rgi_df.loc['rhone']['rgi_id.v6']
aletsch_rgiid = rgi_df.loc['aletsch']['rgi_id.v6']

# Show the Rhone entry as the cell output.
rgi_df.loc['rhone']

In [None]:
# Folder holding one sub-directory per RGI glacier with its yearly grid files.
path_rgi_alps = os.path.join(cfg.dataPath,
                             'GLAMOS/topo/gridded_topo_inputs/RGI_v6_11_svf/')
# Sorted so the row order of `df_pos_all` does not depend on the file system.
rgi_ids = sorted(os.listdir(path_rgi_alps))
year = 2023

# Mean grid position (lat, lon) of every glacier that has a grid for `year`.
pos_gl, rgis = [], []
for rgi_gl in tqdm(rgi_ids):
    grid_file = os.path.join(path_rgi_alps, rgi_gl,
                             f"{rgi_gl}_grid_{year}.parquet")
    if not os.path.exists(grid_file):
        # No grid exported for this glacier/year: skip it.
        continue
    # Only the two position columns are needed, so avoid loading the full grid.
    df = pd.read_parquet(grid_file, columns=['POINT_LAT', 'POINT_LON'])
    pos_gl.append((df.POINT_LAT.mean(), df.POINT_LON.mean()))
    rgis.append(rgi_gl)

df_pos_all = pd.DataFrame(pos_gl, columns=['lat', 'lon'])
df_pos_all['rgi_id'] = rgis

print('Number of glaciers in RGI region 11.6:', len(df_pos_all))

# ---- Figure and base map (PlateCarree) ----
fig = plt.figure(figsize=(18, 10))

latN, latS = 48, 44
lonW, lonE = 4, 14
projPC = ccrs.PlateCarree()
ax2 = plt.axes(projection=projPC)
map_extent = [lonW, lonE, latS, latN]
ax2.set_extent(map_extent, crs=ccrs.Geodetic())

# Background features, default styling first, then the borders with an
# explicit line style.
for background_feature in (cfeature.COASTLINE, cfeature.LAKES,
                           cfeature.RIVERS):
    ax2.add_feature(background_feature)
ax2.add_feature(cfeature.BORDERS, linestyle='-', linewidth=1)

# ---- One marker per glacier at its mean grid position ----
scatter_options = dict(
    x='lon',
    y='lat',
    alpha=0.6,
    transform=projPC,
    ax=ax2,
    zorder=10,
    legend=True,
)
g = sns.scatterplot(data=df_pos_all, **scatter_options)

# ---- RGI outlines drawn in black on the same axes ----
glacier_outline_rgi.plot(ax=ax2, transform=projPC, color='black')

# ---- Gridlines with labels on the left and bottom only ----
gl = ax2.gridlines(
    draw_labels=True,
    linewidth=1,
    color='gray',
    alpha=0.5,
    linestyle='--',
)
gl.xformatter = LONGITUDE_FORMATTER
gl.yformatter = LATITUDE_FORMATTER
gl.xlabel_style = dict(size=16, color='black')
gl.ylabel_style = dict(size=16, color='black')
gl.right_labels = False
gl.top_labels = False

In [None]:
# Folders with the masked glacier grids (zarr) and the SVF files (netCDF).
path_xr_grids = os.path.join(cfg.dataPath,
                             "GLAMOS/topo/RGI_v6_11/xr_masked_grids/")
path_xr_svf = os.path.join(cfg.dataPath,
                           "GLAMOS/topo/RGI_v6_11/svf_nc_latlon/")

# Glacier used for this check: Rhone.
rgi_gl = rhone_rgiid

# --- open both datasets for that glacier
zarr_store = os.path.join(path_xr_grids, f"{rgi_gl}.zarr")
svf_file = os.path.join(path_xr_svf, f"{rgi_gl}_svf_latlon.nc")
ds = xr.open_zarr(zarr_store)
ds_svf = xr.open_dataset(svf_file)


# --- make coord names consistent (lat/lon)
def standardize_coords(d):
    """Rename the spatial dimensions of `d` to `lat` / `lon`.

    `latitude` / `longitude` are always renamed when present; `y` / `x` are
    only renamed when `d` has no `lat` / `lon` dimension already. The result
    of `d.rename(...)` with the collected mapping is returned.
    """
    dims = d.dims
    # (current name, target name, whether this rename is permitted)
    candidates = (
        ("latitude", "lat", True),
        ("longitude", "lon", True),
        ("y", "lat", "lat" not in dims),
        ("x", "lon", "lon" not in dims),
    )
    mapping = {
        old: new
        for old, new, permitted in candidates if permitted and old in dims
    }
    return d.rename(mapping)


# Use the same dimension names (lat/lon) in both datasets.
ds = standardize_coords(ds)
ds_svf = standardize_coords(ds_svf)

# --- ensure coords are ascending for interp (lon first, then lat)
for axis in ("lon", "lat"):
    if ds[axis][0] > ds[axis][-1]:
        ds = ds.sortby(axis)
for axis in ("lon", "lat"):
    if ds_svf[axis][0] > ds_svf[axis][-1]:
        ds_svf = ds_svf.sortby(axis)

# SVF-related variables that are actually present in the SVF dataset.
svf_vars = [
    name for name in ("svf", "asvf", "opns") if name in ds_svf.data_vars
]

# --- identical grids are merged directly; otherwise the SVF variables are
# --- interpolated onto the coordinates of `ds`
grids_match = all(
    np.array_equal(ds[axis].values, ds_svf[axis].values)
    for axis in ("lon", "lat"))

if grids_match:
    ds_out = xr.merge([ds, ds_svf[svf_vars]])
else:
    # interpolation method: "linear" (smooth) or "nearest" (preserve edges)
    svf_on_ds = ds_svf[svf_vars].interp(lon=ds.lon,
                                        lat=ds.lat,
                                        method="linear")
    # store the interpolated fields as float32 to save space
    svf_on_ds = svf_on_ds.assign(
        {name: svf_on_ds[name].astype("float32") for name in svf_vars})
    ds_out = ds.assign({name: svf_on_ds[name] for name in svf_vars})

# --- visual sanity check: masked elevation next to the aligned SVF
fig, panels = plt.subplots(1, 2, figsize=(12, 6))
panel_specs = (
    (ds_out.masked_elev, {"cmap": "terrain"}, "masked_elev (grid)"),
    (ds_out.svf, {"vmin": 0, "vmax": 1, "cmap": "viridis"},
     "SVF (aligned to grid)"),
)
for ax, (field, plot_kwargs, panel_title) in zip(panels, panel_specs):
    field.plot(ax=ax, **plot_kwargs)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## Results:

In [None]:
# RGI v6 Central Europe outlines shapefile.
path_rgi = cfg.dataPath + 'GLAMOS/RGI/nsidc0770_11.rgi60.CentralEurope/11_rgi60_CentralEurope.shp'
gdf = gpd.read_file(path_rgi)

# Show the CRS the shapefile was delivered in.
print(gdf.crs)

# Areas are measured in an equal-area projection (EPSG:3035 for Europe); the
# frame is indexed by RGI id so areas can be looked up per glacier.
gdf_proj = gdf.to_crs(3035).set_index('RGIId')
gdf_proj["area_m2"] = gdf_proj.geometry.area
gdf_proj["area_km2"] = gdf_proj["area_m2"] / 1e6

n_glaciers = len(gdf_proj.index)
total_area_km2 = gdf_proj["area_km2"].sum()
print('Number of glaciers in RGI region 11.6:', n_glaciers)
print('Total area of glaciers in RGI region 11.6 (km2):', total_area_km2)

In [None]:
# GLAMBIE regional results (hydrological years) for Central Europe.
path_glambie = cfg.dataPath + 'GLAMBIE/glambie_results_20240716/hydrological_years/'
df_glambie = pd.read_csv(path_glambie + '11_central_europe.csv')

# Integer year key derived from the period end date (int() truncates).
df_glambie['hydr_year'] = df_glambie['end_dates'].apply(int)

# Bar chart of the GLAMBIE glacier area per year, then list the columns.
df_glambie.plot.bar(x='hydr_year', y='glacier_area', figsize=(10, 4))
print(df_glambie.columns)

In [None]:
# --- glacier-wide mean MB table written by an earlier run ---
output_df = pd.read_csv("logs/glacier_mean_MB_2025-10-08.csv")
output_df = output_df.drop(columns=['Index'])

# Look up each glacier's area (km2) in the RGI table indexed by RGI id; an id
# that is absent from `gdf_proj` raises a KeyError.
output_df['area_gl'] = [
    gdf_proj.loc[rgi_id, 'area_km2'] for rgi_id in output_df['RGIId']
]

# Number of distinct glaciers in the table (cell output).
output_df['RGIId'].nunique()

In [None]:
# 1) numerator per year: sum over glaciers of (mean MB * glacier area)
mb_times_area = output_df.assign(num=output_df["Mean_MB"] *
                                 output_df["area_gl"])
num_by_year = mb_times_area.groupby("Year", as_index=False)["num"].sum()

# 2) denominator A: area of the glaciers present in `output_df`, per year
area_by_year = output_df.groupby("Year", as_index=False)["area_gl"].sum()
area_by_year = area_by_year.rename(
    columns={"area_gl": "total_area_internal"})

# 3) denominator B: GLAMBIE glacier area per year
glambie_area = df_glambie[["hydr_year", "glacier_area"]]
glambie_area = glambie_area.rename(columns={"hydr_year": "Year"})

# 4) join on the year and derive both weighted means
yearly_weighted = num_by_year.merge(area_by_year, on="Year", how="left")
yearly_weighted = yearly_weighted.merge(glambie_area, on="Year", how="left")
# internal weighting: divide by the area covered by `output_df`
yearly_weighted["Weighted_MB_internal"] = (
    yearly_weighted["num"] / yearly_weighted["total_area_internal"])
# GLAMBIE weighting: divide by the GLAMBIE area of that year
yearly_weighted["Weighted_MB_external"] = (
    yearly_weighted["num"] / yearly_weighted["glacier_area"])

# 5) report the years for which GLAMBIE provides no area
no_glambie_area = yearly_weighted["glacier_area"].isna()
missing = yearly_weighted.loc[no_glambie_area, "Year"].tolist()
if missing:
    print("No GLAMBIE area for years:", missing)

# display result
yearly_weighted.head()

In [None]:
# # compare to GLAMBIE
# glambie_df = pd.read_csv('glambie_values.csv')
# date_columns = [
#     'central_europe_dates', 'central_europe_start_dates',
#     'central_europe_end_dates'
# ]

# glambie_df[date_columns] = glambie_df[date_columns].apply(
#     lambda x: x.round() - 1)
# glambie_df.head()

In [None]:
# --- plotting: model result (left) next to the GLAMBIE estimate (right) ---
# The original cell read `yearly_weighted_ext['Weighted_MB']`, but neither that
# frame nor that column is created in this notebook. `yearly_weighted` holds
# `Weighted_MB_internal` and `Weighted_MB_external`.
# NOTE(review): the GLAMBIE-area-weighted column is used here because of the
# `_ext` suffix of the old name — switch to "Weighted_MB_internal" if the
# internally weighted series was intended.
lstm_mb_col = "Weighted_MB_external"

fig, axs = plt.subplots(1, 2, figsize=(15, 6), sharey=True)

# --------------------
# Left: LSTM results
# --------------------
ax1 = axs[0]
years = yearly_weighted['Year']

# barplot: annual weighted MB (m w.e.)
ax1.bar(years,
        yearly_weighted[lstm_mb_col],
        color="skyblue",
        label="Area-weighted annual MB")
ax1.set_ylabel("Annual MB (m w.e.)", color="skyblue")
ax1.set_title("Central Alps annual MB (LSTM)")
ax1.legend(loc="upper left")

# --------------------
# Right: GLAMBIE results
# --------------------
ax3 = axs[1]

# annual MB (bars) + error bars from df_glambie['combined_mwe_errors']
ax3.bar(
    df_glambie['hydr_year'],
    df_glambie['combined_mwe'],
    yerr=df_glambie['combined_mwe_errors'],
    capsize=3,  # little caps on error bars
    error_kw={
        "elinewidth": 1,
        "alpha": 0.9
    },  # style of the error lines
    color="lightgreen",
    ecolor="black",  # error bar color
    label="Annual MB (GLAMBIE)")
ax3.set_ylabel("Annual MB (m w.e.)", color="lightgreen")
ax3.set_title("Central Europe MB (GLAMBIE)")
ax3.legend(loc="upper left")

# --------------------
# Formatting
# --------------------
for ax in axs:
    ax.tick_params(axis="x", rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Grouped bar chart: model annual MB next to the GLAMBIE annual MB.
# The original cell read `glambie_df` (only defined in commented-out code) and
# the column `yearly_weighted['Weighted_MB']` (never created), so it could not
# run in a fresh kernel. It now uses the frames built earlier in the notebook.
# NOTE(review): `Weighted_MB_external` / `combined_mwe` are assumed to be the
# intended series — confirm, or switch to `Weighted_MB_internal`.
lstm_mb_col = "Weighted_MB_external"

# make sure both datasets use the same x-axis type (integer years)
years_lstm = yearly_weighted['Year']
years_glambie = df_glambie['hydr_year']

fig, ax = plt.subplots(figsize=(12, 6))

# bar width; each series is shifted by half a width so the pairs sit side by side
width = 0.4
half_width = width / 2

# LSTM bars (slightly shifted left)
ax.bar(years_lstm - half_width,
       yearly_weighted[lstm_mb_col],
       width=width,
       color="skyblue",
       label="LSTM Annual MB")

# GLAMBIE bars (slightly shifted right)
ax.bar(years_glambie + half_width,
       df_glambie['combined_mwe'],
       width=width,
       color="lightgreen",
       label="GLAMBIE Annual MB")

# formatting
ax.set_ylabel("Annual MB (m w.e.)")
ax.set_title("Annual Mass Balance: LSTM vs GLAMBIE")
ax.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()