# Glacier grids from RGI:

Creates monthly grid files for the MBM to make PMB predictions over the whole glacier grid. The files come from the RGI grid with OGGM topography. Computing takes a long time because of the conversion to monthly format.
## Setting up:

In [None]:
# --- sys.path for local package resolution ---
import sys, os
sys.path.append(os.path.join(os.getcwd(), '../../'))  # repo root for MBM

# --- standard library ---
import ast
import csv
import itertools
import logging
import pickle
import random
import re
import warnings
from collections import Counter, defaultdict
from datetime import datetime
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor, as_completed

# --- third-party ---
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns
from cmcrameri import cm
from tqdm.auto import tqdm
import cartopy.crs as ccrs
import cartopy.feature as cfeature
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
import torch
import torch.nn as nn
from torch.utils.data import (
    Dataset, DataLoader, Subset, WeightedRandomSampler, SubsetRandomSampler
)
from torch.optim.lr_scheduler import ReduceLROnPlateau
from skorch.helper import SliceDataset
from skorch.callbacks import EarlyStopping, LRScheduler, Checkpoint
import rioxarray
import rasterio

# --- project/local ---
import massbalancemachine as mbm
from scripts.helpers import *
from scripts.glamos_preprocess import *
from scripts.plots import *
from scripts.config_CH import *
from scripts.nn_helpers import *
from scripts.xgb_helpers import *
from scripts.geodata import *
from scripts.NN_networks import *
from scripts.geodata_plots import *

# --- notebook conveniences (keep if you're in a notebook) ---
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

# --- config / seed / device ---
# Build the Switzerland configuration and seed with the value it carries.
cfg = mbm.SwitzerlandConfig()
seed_all(cfg.seed)
print("Using seed:", cfg.seed)

# Choose the torch device once, then report on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print("CUDA is available")
    free_up_cuda()  # from scripts.helpers
else:
    print("CUDA is NOT available")

In [None]:
# Matplotlib style sheet applied to the figures of this notebook.
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

# Climate columns
vois_climate = [
    't2m',
    'tp',
    'slhf',
    'sshf',
    'ssrd',
    'fal',
    'str',
    'u10',
    'v10',
]

# Topographical columns
vois_topographical = [
    "aspect", "slope", "hugonnet_dhdt", "consensus_ice_thickness", "millan_v",
    "topo"
]

In [None]:
# # Glacier outlines:
# glacier_outline_sgi = gpd.read_file(
#     os.path.join(cfg.dataPath, path_SGI_topo, 'inventory_sgi2016_r2020',
#                  'SGI_2016_glaciers_copy.shp'))  # Load the shapefile
# glacier_outline_rgi = gpd.read_file(cfg.dataPath + path_rgi_outlines)

# gdirs, rgidf = initialize_oggm_glacier_directories(
#     cfg,
#     rgi_region="11",
#     rgi_version="62",
#     base_url=
#     "https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L1-L2_files/2025.6/elev_bands_w_data/",
#     log_level='WARNING',
#     task_list=None,
# )

# # Save OGGM xr for all needed glaciers in RGI region 11.6:
# df_missing = export_oggm_grids(cfg, gdirs)

## Save DEMs to geotiff:

In [None]:
# SGI ids come from the aspect grid file names: take the name up to the
# ".grid" suffix, then its second "_"-separated field. The dot is escaped so
# that only a literal ".grid" is treated as the suffix.
sgi_list = [
    re.split('_',
             re.split(r'\.grid', f)[0])[1]
    for f in os.listdir(os.path.join(cfg.dataPath, path_SGI_topo, 'aspect'))
]

# Output folder for the masked DEM GeoTIFFs (created once, up front).
path_out_tiff = os.path.join(cfg.dataPath,
                             "GLAMOS/topo/SGI2020/DEMs_geotiff_lv95/")
os.makedirs(path_out_tiff, exist_ok=True)

# Glacier outlines: read once and reused for every glacier in the loop below.
shp_path = os.path.join(cfg.dataPath, path_SGI_topo,
                        "inventory_sgi2016_r2020/SGI_2016_glaciers.shp")
glacier_outline_sgi = gpd.read_file(shp_path)

RUN = False
if RUN:
    # Loop invariants: the DEM folder and its listing do not change per glacier.
    path_DEM = os.path.join(cfg.dataPath, path_SGI_topo, "dem_HR")
    dem_files = os.listdir(path_DEM)

    for sgi_id in tqdm(sgi_list):
        # First DEM file whose name contains the SGI id.
        matching_dems = [f for f in dem_files if sgi_id in f]
        if not matching_dems:
            raise FileNotFoundError(
                f"No DEM file containing '{sgi_id}' found in {path_DEM}")
        dem_gl = matching_dems[0]

        metadata_dem, grid_data_dem = load_grid_file(
            os.path.join(path_DEM, dem_gl))
        dem = convert_to_xarray_geodata(grid_data_dem, metadata_dem)
        gdf_mask_gl = glacier_outline_sgi[glacier_outline_sgi["sgi-id"] ==
                                          sgi_id]

        mask, masked_dem = extract_topo_over_outline(dem,
                                                     gdf_mask_gl,
                                                     target_crs=2056)

        # --- Attach CRS and write GeoTIFF ---
        masked_dem = masked_dem.rio.write_crs("EPSG:2056", inplace=True)

        out_tif = os.path.join(path_out_tiff, f"{sgi_id}.tif")
        masked_dem.rio.to_raster(
            out_tif,
            dtype="float32",
            compress="LZW",
            BIGTIFF="IF_SAFER",
            tiled=True,
            predictor=3,  # better compression for float rasters
        )

## Masked xarrays:

In [None]:
# SGI ids parsed from the aspect grid file names (literal ".grid" suffix,
# second "_"-separated field of the remaining name).
sgi_list = [
    re.split('_',
             re.split(r'\.grid', f)[0])[1]
    for f in os.listdir(os.path.join(cfg.dataPath, path_SGI_topo, 'aspect'))
]

# unique SGI IDs
sgi_list = list(set(sgi_list))
print('Number of unique SGI IDs:', len(sgi_list))

glaciers_glamos_dems = os.listdir(
    os.path.join(cfg.dataPath, path_GLAMOS_topo, 'lv95'))

path_xr_svf = os.path.join(cfg.dataPath, "GLAMOS/topo/SGI2020/svf_nc_latlon/")
os.makedirs(path_xr_svf, exist_ok=True)

# Folder of the masked grids (one "<sgi_id>.zarr" per glacier). Defined once
# so the writer below and the example plot read the same location.
path = os.path.join(cfg.dataPath, path_SGI_topo, 'xr_masked_grids_sgi/')

RUN = False
if RUN:
    # Create SGI topographical masks
    # Note: This function will take a while to run
    # It creates a mask for each glacier in the SGI list
    # and saves them in the specified directory.
    path_save = path
    emptyfolder(path_save)
    create_sgi_topo_masks_parallel(cfg,
                                   path_xr_svf,
                                   sgi_list,
                                   type='sgi_id',
                                   path_save=path_save)

# Quick visual check on one example glacier.
xr.open_dataset(path + 'A10g-02.zarr').masked_aspect.plot()

In [None]:
# Same example glacier as above, this time its `svf` variable.
ds_example = xr.open_dataset(path + 'A10g-02.zarr')
ds_example.svf.plot()

### Data exploration:

In [None]:
# Stake dataset and the per-glacier area lookup.
data_glamos = pd.read_csv(cfg.dataPath + path_PMB_GLAMOS_csv +
                          'CH_wgms_dataset_all.csv')
gl_area = get_gl_area(cfg)

# Areas of the glaciers that have stakes and an entry in the area lookup.
areas_train_set = []
for gl in data_glamos['GLACIER'].unique():
    if gl in gl_area.keys():
        areas_train_set.append(gl_area[gl])

# histogram
plt.hist(areas_train_set, bins=50)
plt.xlabel('Area (km2)')
plt.title('Histogram of glacier areas with stakes')

In [None]:
# SGI 2016 glacier outlines.
shapefile_path = os.path.join(cfg.dataPath, path_SGI_topo,
                              'inventory_sgi2016_r2020',
                              'SGI_2016_glaciers.shp')
gdf_shapefiles = gpd.read_file(shapefile_path)

# Polygon areas scaled by 1e-6 (presumably m2 -> km2; depends on the
# shapefile CRS).
area_km2 = gdf_shapefiles.area / (10**6)

# Left panel: histogram with KDE. Right panel: boxplot.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))
ax_hist, ax_box = axs
sns.histplot(area_km2, color='blue', kde=True, bins=50, ax=ax_hist)
sns.boxplot(x=area_km2, color='blue', ax=ax_box)

# set x label to km2
for ax in (ax_hist, ax_box):
    ax.set_xlabel('Area (km2)')

plt.suptitle('Histogram and Boxplot of all glaciers in SGI 2016')

## Create monthly grids:

### 2016 - 2022:

In [None]:
# --- QUIET parallel run: tqdm in notebook, detailed logs to file ---

import os, io, sys, logging, warnings, multiprocessing as mp
from contextlib import redirect_stdout, redirect_stderr
from concurrent.futures import ProcessPoolExecutor, as_completed
from datetime import datetime
from tqdm import tqdm

warnings.filterwarnings("ignore")

# Climate columns
vois_climate = [
    't2m',
    'tp',
    'slhf',
    'sshf',
    'ssrd',
    'fal',
    'str',
    'u10',
    'v10',
]

# Topographical columns (this list additionally carries "svf")
vois_topographical = [
    "aspect",
    "slope",
    "hugonnet_dhdt",
    "consensus_ice_thickness",
    "millan_v",
    "topo",
    "svf",
]

In [None]:
# ---------- per-process caches for heavy I/O ----------
_GDF_SGI = None
_GDF_RGI = None


def _load_outlines_once(cfg):
    """Return the (SGI, RGI) outline GeoDataFrames, reading each at most once.

    Both frames are kept in module-level globals, so within one process only
    the first call reads from disk; later calls return the cached objects.

    Args:
        cfg: Configuration object; only ``cfg.dataPath`` is read.

    Returns:
        Tuple ``(sgi_outlines, rgi_outlines)``.
    """
    global _GDF_SGI, _GDF_RGI

    if _GDF_SGI is None or _GDF_RGI is None:
        # geopandas is only imported when something still has to be read.
        import geopandas as gpd

        if _GDF_SGI is None:
            sgi_shapefile = os.path.join(cfg.dataPath, path_SGI_topo,
                                         'inventory_sgi2016_r2020',
                                         'SGI_2016_glaciers_copy.shp')
            _GDF_SGI = gpd.read_file(sgi_shapefile)

        if _GDF_RGI is None:
            _GDF_RGI = gpd.read_file(cfg.dataPath + path_rgi_outlines)

    return _GDF_SGI, _GDF_RGI


# ---------- worker init: cap threads + silence worker stdout/stderr ----------
def _worker_init_quiet():
    """Silence this process's stdout/stderr and request single-threaded math.

    Used as the ``initializer`` of the process pool. Thread-count environment
    variables are set with ``setdefault``, so values already present in the
    environment are kept.
    """
    # silence prints in workers
    sys.stdout = open(os.devnull, "w")
    sys.stderr = open(os.devnull, "w")

    # cap BLAS threads to avoid oversubscription
    for env_name in ("OMP_NUM_THREADS", "MKL_NUM_THREADS",
                     "OPENBLAS_NUM_THREADS", "NUMEXPR_MAX_THREADS"):
        os.environ.setdefault(env_name, "1")

    # Best effort: a missing or failing torch must not break worker start-up.
    try:
        import torch
        torch.set_num_threads(1)
    except Exception:
        pass


# ---------- one task (sgi_id, year) ----------
def _process_glacier_year(sgi_id, year, cfg, year_out_dir):
    """Build the monthly grid of one glacier for one year and save it as parquet.

    Stages (each reports failures under its own message prefix): open the
    masked topography zarr, build the grid dataframe, add ERA5 climate
    features, match RGI ids and add OGGM features, convert to monthly format,
    write ``<sgi_id>_grid_<year>.parquet`` into ``year_out_dir``.

    Returns: (status, sgi_id, year, message)
      status in {"success","skipped","error"}; message is "" on success.
    """

    def _error(message):
        # All failures share the same tuple layout.
        return ("error", sgi_id, year, message)

    try:
        import xarray as xr
        import pandas as pd

        # load outlines once per worker
        glacier_outline_sgi, glacier_outline_rgi = _load_outlines_once(cfg)

        # --- load coarsened SGI topo (zarr) ---
        zarr_dir = os.path.join(cfg.dataPath, path_SGI_topo,
                                'xr_masked_grids_sgi/')
        zarr_store = os.path.join(zarr_dir, f"{sgi_id}.zarr")
        try:
            ds_topo = xr.open_dataset(zarr_store)
        except Exception as exc:
            return _error(f"load zarr error: {exc}")

        # --- build grid dataframe ---
        try:
            rgi_id = None
            grid_df = create_glacier_grid_SGI(sgi_id, year, rgi_id, ds_topo)
            grid_df = grid_df.reset_index(drop=True)
            dataset_grid = mbm.data_processing.Dataset(
                cfg=cfg,
                data=grid_df,
                region_name='CH',
                region_id=11,
                data_path=cfg.dataPath + path_PMB_GLAMOS_csv)
        except Exception as exc:
            return _error(f"create grid error: {exc}")

        # --- add climate features ---
        try:
            era5_monthly_nc = os.path.join(cfg.dataPath, path_ERA5_raw,
                                           'era5_monthly_averaged_data.nc')
            era5_geopot_nc = os.path.join(cfg.dataPath, path_ERA5_raw,
                                          'era5_geopotential_pressure.nc')
            smoothing = {
                'vois_climate': vois_climate,
                'vois_other': ['ALTITUDE_CLIMATE']
            }
            dataset_grid.get_climate_features(
                climate_data=era5_monthly_nc,
                geopotential_data=era5_geopot_nc,
                change_units=True,
                smoothing_vois=smoothing)
            if dataset_grid.data.empty:
                return _error("no climate rows")
        except Exception as exc:
            return _error(f"climate features error: {exc}")

        # --- intersect with RGI + add OGGM features ---
        try:
            points = dataset_grid.data
            # Keep the previous id column under another name before get_rgi
            # fills 'RGIId'.
            points = points.rename(columns={'RGIId': 'RGIId_old'})
            points = mbm.data_processing.utils.get_rgi(
                data=points, glacier_outlines=glacier_outline_rgi)
            points = points.dropna(subset=['RGIId'])
            if points.empty:
                return ("skipped", sgi_id, year, "no RGI intersection")

            oggm_vars = [
                "hugonnet_dhdt", "consensus_ice_thickness", "millan_v"
            ]
            points = add_OGGM_features(points, oggm_vars,
                                       cfg.dataPath + path_OGGM)
            # One hash per row, derived from "<GLACIER>_<YEAR>".
            points['GLWD_ID'] = points.apply(
                lambda row: mbm.data_processing.utils.get_hash(
                    f"{row.GLACIER}_{row.YEAR}"),
                axis=1).astype(str)

            dataset_grid = mbm.data_processing.Dataset(
                cfg=cfg,
                data=points,
                region_name='CH',
                region_id=11,
                data_path=cfg.dataPath + path_PMB_GLAMOS_csv)
        except Exception as exc:
            return _error(f"OGGM features error: {exc}")

        # --- to monthly ---
        try:
            dataset_grid.convert_to_monthly(
                meta_data_columns=cfg.metaData,
                vois_climate=vois_climate,
                vois_topographical=vois_topographical)
        except Exception as exc:
            return _error(f"convert monthly error: {exc}")

        # --- final save ---
        try:
            monthly_df = dataset_grid.data.copy()
            monthly_df = monthly_df.rename(columns={
                'aspect': 'aspect_sgi',
                'slope': 'slope_sgi'
            })
            monthly_df['POINT_ELEVATION'] = monthly_df['topo']
            parquet_file = os.path.join(year_out_dir,
                                        f"{sgi_id}_grid_{year}.parquet")
            monthly_df.to_parquet(parquet_file,
                                  engine="pyarrow",
                                  compression="snappy")
        except Exception as exc:
            return _error(f"save error: {exc}")

        return ("success", sgi_id, year, "")

    except Exception as exc:
        return _error(f"unexpected: {exc}")

In [None]:
# ---------- logging (single text log, no console spam) ----------
os.makedirs("logs", exist_ok=True)
LOG_PATH = "logs/process_log_SGI_full.log"
# force=True: without it basicConfig silently does nothing when the root
# logger already has handlers (e.g. a library configured logging first),
# and nothing would be written to LOG_PATH.
logging.basicConfig(
    filename=LOG_PATH,
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    force=True,
)
log = logging.getLogger("sgi")

# ---------- run (shows only tqdm) ----------
years = range(2016, 2023)
RUN = False

# Root output dir; it is wiped once before the loop when RUN is True.
root_out = os.path.join(cfg.dataPath,
                        'GLAMOS/topo/gridded_topo_inputs/SGI2020_all')


# helper to silence main-process stdout/stderr within the run
class _Devnull(io.StringIO):
    """Text sink that discards everything written to it."""

    def write(self, *args, **kwargs):
        return 0


if RUN:
    emptyfolder(root_out)
    ctx = mp.get_context("fork")  # Linux
    max_workers = min(max(1, (os.cpu_count() or 2) - 1), 32)

    with redirect_stdout(_Devnull()):  # keep stderr open for tqdm
        for year in years:
            year_out_dir = os.path.join(root_out, str(year))
            if not os.path.exists(year_out_dir):
                os.makedirs(year_out_dir, exist_ok=True)
                log.info("Created directory %s", year_out_dir)
            else:
                emptyfolder(year_out_dir)
                log.info("Emptied directory %s", year_out_dir)

            ok = skip = err = 0
            with ProcessPoolExecutor(
                    max_workers=max_workers,
                    initializer=_worker_init_quiet,
                    mp_context=ctx,
            ) as ex:
                # Map each future to its glacier so that a crashed task can
                # still be attributed in the log.
                futures = {
                    ex.submit(_process_glacier_year, sgi_id, year, cfg,
                              year_out_dir): sgi_id
                    for sgi_id in sgi_list
                }

                for fut in tqdm(as_completed(futures),
                                total=len(futures),
                                desc=f"Year {year} ({max_workers} workers)"):
                    try:
                        status, sgi_id, y, message = fut.result()
                    except Exception as e:
                        status, sgi_id, y, message = ("error", futures[fut],
                                                      year, str(e))

                    if status == "success":
                        ok += 1
                    elif status == "skipped":
                        skip += 1
                        log.warning("SKIP    %s %s: %s", sgi_id, y, message)
                    else:
                        # Workers send stderr to devnull, so this log line is
                        # the only record of why a glacier-year failed.
                        err += 1
                        log.error("ERROR   %s %s: %s", sgi_id, y, message)

            log.info("SUMMARY %s: ok=%s skip=%s err=%s total=%s", year, ok,
                     skip, err, len(futures))

    print(f"Run complete. See log: {LOG_PATH}")
else:
    print("RUN is False: no grids were generated.")

In [None]:
# Example glacier / year to inspect.
year = 2016
sgi_id = 'B36-26'  # Aletsch
path_save_monthly = os.path.join(root_out, f'{year}')

# Monthly grid of that glacier, restricted to the rows labelled 'sep'.
grid_file = os.path.join(path_save_monthly, f"{sgi_id}_grid_{year}.parquet")
df = pd.read_parquet(grid_file)
df = df[df.MONTHS == 'sep']

# One scatter panel (lon/lat coloured by the variable) per entry of `voi`.
voi = [
    't2m', 'tp', 'svf', 'ELEVATION_DIFFERENCE', 'hugonnet_dhdt',
    'consensus_ice_thickness'
]
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
axs = axs.flatten()
for ax, var in zip(axs, voi):
    sns.scatterplot(df,
                    x='POINT_LON',
                    y='POINT_LAT',
                    hue=var,
                    s=5,
                    alpha=0.5,
                    palette='twilight_shifted',
                    ax=ax)

## Train LSTM model:

### Set up model:

In [None]:
# Stake measurements and the month padding derived from them.
data_glamos = getStakesData(cfg)

months_head_pad, months_tail_pad = mbm.data_processing.utils._compute_head_tail_pads_from_df(
    data_glamos)

# Monthly and static input columns used for the LSTM.
MONTHLY_COLS = [
    't2m',
    'tp',
    'slhf',
    'sshf',
    'ssrd',
    'fal',
    'str',
    'ELEVATION_DIFFERENCE',
]
STATIC_COLS = [
    'aspect', 'slope', 'hugonnet_dhdt', 'consensus_ice_thickness', 'millan_v',
    'svf'
]

feature_columns = MONTHLY_COLS + STATIC_COLS

# Initialize logging.
# force=True: an earlier cell of this notebook already configures the root
# logger with a file handler, and basicConfig is a no-op once handlers exist;
# forcing makes this console configuration actually take effect.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(message)s',
                    force=True)

# Transform data to monthly format (run or load data):
paths = {
    'csv_path': cfg.dataPath + path_PMB_GLAMOS_csv,
    'era5_climate_data':
    cfg.dataPath + path_ERA5_raw + 'era5_monthly_averaged_data.nc',
    'geopotential_data':
    cfg.dataPath + path_ERA5_raw + 'era5_geopotential_pressure.nc',
    'radiation_save_path': cfg.dataPath + path_pcsr + 'zarr/'
}
RUN = False
data_monthly = process_or_load_data(
    run_flag=RUN,
    data_glamos=data_glamos,
    paths=paths,
    cfg=cfg,
    vois_climate=vois_climate,
    vois_topographical=vois_topographical,
    output_file='CH_wgms_dataset_monthly_LSTM_all_SGI.csv')

# Create DataLoader
dataloader_gl = mbm.dataloader.DataLoader(cfg,
                                          data=data_monthly,
                                          random_seed=cfg.seed,
                                          meta_data_columns=cfg.metaData)

### Dataloaders:

In [None]:
# Glaciers present in the monthly data vs. the configured test glaciers.
existing_glaciers = set(data_monthly.GLACIER.unique())
missing_glaciers = [g for g in TEST_GLACIERS if g not in existing_glaciers]

if missing_glaciers:
    print(
        f"Warning: The following test glaciers are not in the dataset: {missing_glaciers}"
    )

# Every glacier that is not a test glacier is used for training.
train_glaciers = [gl for gl in existing_glaciers if gl not in TEST_GLACIERS]

data_test = data_monthly[data_monthly.GLACIER.isin(TEST_GLACIERS)]
print('Size of monthly test data:', len(data_test))

data_train = data_monthly[data_monthly.GLACIER.isin(train_glaciers)]
print('Size of monthly train data:', len(data_train))

if len(data_train) > 0:
    test_perc = (len(data_test) / len(data_train)) * 100
    print('Percentage of test size: {:.2f}%'.format(test_perc))
else:
    print("Warning: No training data available!")

# Train/test split by glacier.
splits, test_set, train_set = get_CV_splits(dataloader_gl,
                                            test_split_on='GLACIER',
                                            test_splits=TEST_GLACIERS,
                                            random_state=cfg.seed)

print('Test glaciers: ({}) {}'.format(len(test_set['splits_vals']),
                                      test_set['splits_vals']))
test_perc = (len(test_set['df_X']) / len(train_set['df_X'])) * 100
print('Percentage of test size: {:.2f}%'.format(test_perc))
print('Size of test set:', len(test_set['df_X']))
print('Train glaciers: ({}) {}'.format(len(train_set['splits_vals']),
                                       train_set['splits_vals']))
print('Size of train set:', len(train_set['df_X']))

# Attach the targets to the feature frames (the frames held by the split
# dicts are modified in place, as before).
data_train = train_set['df_X']
data_train['y'] = train_set['y']

data_test = test_set['df_X']
data_test['y'] = test_set['y']

seed_all(cfg.seed)


def _with_normalised_period(frame):
    """Return a copy of `frame` whose PERIOD labels are stripped and lower-cased."""
    out = frame.copy()
    out['PERIOD'] = out['PERIOD'].str.strip().str.lower()
    return out


def _sequence_dataset(frame):
    """Build an MBSequenceDataset with targets from a monthly dataframe."""
    return mbm.data_processing.MBSequenceDataset.from_dataframe(
        frame,
        MONTHLY_COLS,
        STATIC_COLS,
        months_tail_pad=months_tail_pad,
        months_head_pad=months_head_pad,
        expect_target=True)


df_train = _with_normalised_period(data_train)
df_test = _with_normalised_period(data_test)

# --- build train / test datasets from the dataframes ---
ds_train = _sequence_dataset(df_train)
ds_test = _sequence_dataset(df_test)

# Index split of the training dataset (val_ratio=0.2).
train_idx, val_idx = mbm.data_processing.MBSequenceDataset.split_indices(
    len(ds_train), val_ratio=0.2, seed=cfg.seed)

### Model:

In [None]:
# Model / optimiser hyper-parameters handed to the LSTM builder.
custom_params = {
    'Fm': len(MONTHLY_COLS),
    'Fs': len(STATIC_COLS),
    'hidden_size': 64,
    'num_layers': 2,
    'bidirectional': False,
    'dropout': 0.2,
    'static_layers': 2,
    'static_hidden': [128, 64],
    'static_dropout': 0.1,
    'lr': 0.001,
    'weight_decay': 0.0001,
    'loss_name': 'neutral',
    'loss_spec': None,
    'two_heads': True,
    'head_dropout': 0.0
}

# Checkpoint path; the name carries today's date.
current_date = datetime.now().strftime("%Y-%m-%d")
model_filename = f"models/lstm_model_{current_date}_SGI2020.pt"

# --- loaders (fit scalers on TRAIN, apply to whole ds_train) ---
ds_train_copy = mbm.data_processing.MBSequenceDataset._clone_untransformed_dataset(
    ds_train)

ds_test_copy = mbm.data_processing.MBSequenceDataset._clone_untransformed_dataset(
    ds_test)

train_dl, val_dl = ds_train_copy.make_loaders(
    train_idx=train_idx,
    val_idx=val_idx,
    batch_size_train=64,
    batch_size_val=128,
    seed=cfg.seed,
    fit_and_transform=
    True,  # fit scalers on TRAIN and transform Xm/Xs/y in-place
    shuffle_train=True,
    use_weighted_sampler=True  # use weighted sampler for training
)

# --- test loader (copies TRAIN scalers into ds_test and transforms it) ---
test_dl = mbm.data_processing.MBSequenceDataset.make_test_loader(
    ds_test_copy, ds_train_copy, batch_size=128, seed=cfg.seed)

# --- build model, resolve loss, train, reload best ---
model = mbm.models.LSTM_MB.build_model_from_params(cfg, custom_params, device)
loss_fn = mbm.models.LSTM_MB.resolve_loss_fn(custom_params)

TRAIN = False
if TRAIN:
    # Start from a clean checkpoint file for today's run.
    if os.path.exists(model_filename):
        os.remove(model_filename)

    history, best_val, best_state = model.train_loop(
        device=device,
        train_dl=train_dl,
        val_dl=val_dl,
        epochs=150,
        lr=custom_params['lr'],
        weight_decay=custom_params['weight_decay'],
        clip_val=1,
        # scheduler
        sched_factor=0.5,
        sched_patience=6,
        sched_threshold=0.01,
        sched_threshold_mode="rel",
        sched_cooldown=1,
        sched_min_lr=1e-6,
        # early stopping
        es_patience=15,
        es_min_delta=1e-4,
        # logging
        log_every=5,
        verbose=True,
        # checkpoint
        save_best_path=model_filename,
        loss_fn=loss_fn,
    )
    plot_history_lstm(history)

# Evaluate on test
# The checkpoint name is date-stamped, so with TRAIN=False on any day other
# than the training day today's file does not exist. Fall back to the newest
# checkpoint of the same naming scheme (ISO dates sort chronologically).
if not os.path.exists(model_filename):
    import glob
    available = sorted(glob.glob("models/lstm_model_*_SGI2020.pt"))
    if not available:
        raise FileNotFoundError(
            f"No checkpoint found: {model_filename} is missing and no "
            "'models/lstm_model_*_SGI2020.pt' exists. Set TRAIN = True or "
            "point model_filename to an existing checkpoint.")
    model_filename = available[-1]
    print(f"No checkpoint for {current_date}; using {model_filename}")

state = torch.load(model_filename, map_location=device)
model.load_state_dict(state)
test_metrics, test_df_preds = model.evaluate_with_preds(
    device, test_dl, ds_test_copy)
test_rmse_a, test_rmse_w = test_metrics['RMSE_annual'], test_metrics[
    'RMSE_winter']

print('Test RMSE annual: {:.3f} | winter: {:.3f}'.format(
    test_rmse_a, test_rmse_w))

In [None]:
# Scores of the test predictions, returned as an (annual, winter) pair.
scores_annual, scores_winter = compute_seasonal_scores(
    test_df_preds,
    target_col='target',
    pred_col='pred',
)

print("Annual scores:", scores_annual)
print("Winter scores:", scores_winter)

# Same limits on both axes of the summary plot.
axis_limits = (-8, 6)
fig = plot_predictions_summary(grouped_ids=test_df_preds,
                               scores_annual=scores_annual,
                               scores_winter=scores_winter,
                               ax_xlim=axis_limits,
                               ax_ylim=axis_limits)

### Predictions on SGI 2020:

In [None]:
# ----------------- quiet main logging (optional) -----------------
os.makedirs("logs", exist_ok=True)
LOG_PATH = f"logs/predict_glaciers_{datetime.now():%Y%m%d_%H%M%S}.log"
# force=True: earlier cells already configure the root logger, and
# basicConfig is a no-op once handlers exist. Without it the time-stamped
# LOG_PATH reported at the end of the run would never be written.
logging.basicConfig(filename=LOG_PATH,
                    level=logging.INFO,
                    format="%(asctime)s [%(levelname)s] %(message)s",
                    datefmt="%Y-%m-%d %H:%M:%S",
                    force=True)
log = logging.getLogger("predict")

# ----------------- constants & paths -----------------
# Columns that must survive the column filtering in the worker.
REQUIRED = ['GLACIER', 'YEAR', 'ID', 'PERIOD', 'MONTHS']
all_columns = MONTHLY_COLS + STATIC_COLS + cfg.fieldsNotFeatures

# Output root of the predictions. NOTE: it is emptied every time this cell
# runs.
path_save_glw = os.path.join(cfg.dataPath, 'GLAMOS', 'distributed_MB_grids',
                             'MBM/swisswide')
os.makedirs(path_save_glw, exist_ok=True)
emptyfolder(path_save_glw)

# Masked grids (one "<sgi_id>.zarr" per glacier).
path_xr_grids = os.path.join(cfg.dataPath, path_SGI_topo,
                             'xr_masked_grids_sgi/')

# Monthly gridded inputs, one sub-folder per year.
path_gridded_inputs = os.path.join(
    cfg.dataPath, 'GLAMOS/topo/gridded_topo_inputs/SGI2020_all')


# ----------------- worker init (quiet + CPU threads cap) -----------------
def _worker_init_quiet():
    """Silence this process and request CPU-only, single-threaded execution.

    Used as the process-pool ``initializer``. Both stdout and stderr of the
    calling process are redirected to the null device. Environment variables
    are set with ``setdefault``, so values already present are kept.
    """
    # silence worker prints and errors
    sys.stdout = open(os.devnull, "w")
    sys.stderr = open(os.devnull, "w")

    env_defaults = (
        ("CUDA_VISIBLE_DEVICES", ""),  # CPU only
        ("OMP_NUM_THREADS", "1"),
        ("MKL_NUM_THREADS", "1"),
        ("OPENBLAS_NUM_THREADS", "1"),
        ("NUMEXPR_MAX_THREADS", "1"),
    )
    for env_name, env_value in env_defaults:
        os.environ.setdefault(env_name, env_value)

    # Best effort: failing to cap torch threads must not stop the worker.
    try:
        torch.set_num_threads(1)
    except Exception:
        pass


# ----------------- per-process model cache -----------------
_MODEL = None


def _get_model_cpu(cfg, params_used, model_filename):
    """Return the CPU model, building and loading it on the first call only.

    Args:
        cfg: Configuration passed to the model builder.
        params_used: Hyper-parameter dict passed to the model builder.
        model_filename: Path of the saved state dict to load.

    Returns:
        The model in eval mode; the same object on every later call in this
        process.
    """
    global _MODEL
    if _MODEL is not None:
        return _MODEL

    cpu = torch.device("cpu")
    net = mbm.models.LSTM_MB.build_model_from_params(cfg,
                                                     params_used,
                                                     cpu,
                                                     verbose=False)
    weights = torch.load(model_filename, map_location=cpu)
    net.load_state_dict(weights)
    net.eval()

    _MODEL = net
    return _MODEL


# ----------------- per-process fitted-scaler cache -----------------
_TRAIN_DS_FITTED = None


def _get_fitted_train_dataset():
    """Clone `ds_train` and fit its scalers once per worker (cached).

    The clone and the fit depend only on the globals `ds_train` and
    `train_idx`, not on the glacier-year being processed, so repeating them
    for every task is wasted work.
    NOTE(review): assumes `make_test_loader` only reads the scalers of the
    dataset returned here and does not modify it - confirm.
    """
    global _TRAIN_DS_FITTED
    if _TRAIN_DS_FITTED is None:
        ds_fit = mbm.data_processing.MBSequenceDataset._clone_untransformed_dataset(
            ds_train)
        ds_fit.fit_scalers(train_idx)
        _TRAIN_DS_FITTED = ds_fit
    return _TRAIN_DS_FITTED


# ----------------- one glacier-year task -----------------
def _process_glacier_year(args):
    """Predict the annual values of one glacier for one year and save them.

    Args:
        args: Tuple ``(glacier_name, year)``; ``year`` may be a str or an int.
            Everything else is read from module globals (inherited via fork).

    Returns:
        ``(status, glacier_name, year, message)`` with status in
        {"ok", "skip", "err"}. Exceptions raised inside the task are returned
        as "err" with the exception text as message.
    """
    glacier_name, year = args  # everything else taken from globals via fork
    try:
        # Seed for reproducibility if you wish
        seed_all(cfg.seed)

        # str(): year folders are named by year; also accept an int year.
        glacier_path = os.path.join(path_gridded_inputs, str(year))
        if not os.path.exists(glacier_path):
            return ("skip", glacier_name, year, "glacier folder missing")

        file_name = f"{glacier_name}_grid_{year}.parquet"
        parquet_path = os.path.join(glacier_path, file_name)
        if not os.path.exists(parquet_path):
            return ("skip", glacier_name, year, "parquet missing")

        # Cheap check done before inference: without the glacier's grid the
        # prediction cannot be saved, so skip before doing the expensive work.
        path_glacier_dem = os.path.join(path_xr_grids, f"{glacier_name}.zarr")
        if not os.path.exists(path_glacier_dem):
            return ("skip", glacier_name, year, "DEM zarr missing")

        df_grid_monthly = pd.read_parquet(parquet_path).copy()
        df_grid_monthly.drop_duplicates(inplace=True)

        # rename columns
        df_grid_monthly.rename(columns={
            'aspect_sgi': 'aspect',
            'slope_sgi': 'slope'
        },
                               inplace=True)

        # Keep required + feature columns; preserve order
        needed = set(all_columns) | set(REQUIRED)
        keep = [c for c in df_grid_monthly.columns if c in needed]
        df_grid_monthly = df_grid_monthly[keep]

        # Minimal NaN cleanup for annual
        df_grid_monthly_a = df_grid_monthly.dropna(subset=['ID', 'MONTHS'])

        # Scalers fitted on TRAIN only; fitted once per worker and reused.
        ds_train_copy = _get_fitted_train_dataset()

        # Annual dataset/loader
        ds_gl_a = mbm.data_processing.MBSequenceDataset.from_dataframe(
            df_grid_monthly_a,
            MONTHLY_COLS,
            STATIC_COLS,
            months_tail_pad=months_tail_pad,
            months_head_pad=months_head_pad,
            expect_target=False,
            show_progress=False)
        test_gl_dl_a = mbm.data_processing.MBSequenceDataset.make_test_loader(
            ds_gl_a, ds_train_copy, seed=cfg.seed, batch_size=128)

        # Model (cached per worker)
        model = _get_model_cpu(cfg, custom_params, model_filename)
        device = torch.device("cpu")

        # Predict annual
        df_preds_a = model.predict_with_keys(device, test_gl_dl_a, ds_gl_a)

        # Aggregate annual: one row per ID with its metadata, prediction and
        # the list of months it covers.
        data_a = df_preds_a[['ID', 'pred']].set_index('ID')
        meta_cols = [
            c for c in ['YEAR', 'POINT_LAT', 'POINT_LON', 'GLWD_ID']
            if c in df_grid_monthly_a.columns
        ]
        grouped_ids_a = (
            df_grid_monthly_a.groupby('ID')[meta_cols].first().merge(
                data_a, left_index=True, right_index=True, how='left'))
        months_per_id_a = df_grid_monthly_a.groupby('ID')['MONTHS'].unique()
        grouped_ids_a = grouped_ids_a.merge(months_per_id_a,
                                            left_index=True,
                                            right_index=True)
        grouped_ids_a.reset_index(inplace=True)
        grouped_ids_a.sort_values(by='ID', inplace=True)

        pred_y_annual = grouped_ids_a.copy()
        pred_y_annual['PERIOD'] = 'annual'
        pred_y_annual = pred_y_annual.drop(columns=['YEAR'], errors='ignore')

        # Load the glacier grid and save annual
        ds = xr.open_zarr(path_glacier_dem)

        geoData = mbm.geodata.GeoData(df_grid_monthly_a,
                                      months_head_pad=months_head_pad,
                                      months_tail_pad=months_tail_pad)

        # save to yearly subfolders
        path_save_y = os.path.join(path_save_glw, str(year))
        os.makedirs(path_save_y, exist_ok=True)
        geoData._save_prediction(ds, pred_y_annual, glacier_name, year,
                                 path_save_y, "annual")

        # # Winter branch (disabled; the winter subset is only needed here)
        # # Build winter subset (Sep–Apr)
        # winter_months = [
        #     'sep', 'oct', 'nov', 'dec', 'jan', 'feb', 'mar', 'apr'
        # ]
        # df_grid_monthly_w = (
        #     df_grid_monthly[df_grid_monthly['MONTHS'].str.lower().isin(
        #         winter_months)].copy().dropna(subset=['ID', 'MONTHS']))
        # df_grid_monthly_w['PERIOD'] = 'winter'

        # if len(df_grid_monthly_w) == 0:
        #     return ("ok", glacier_name, year, "no winter months")

        # ds_gl_w = mbm.data_processing.MBSequenceDataset.from_dataframe(
        #     df_grid_monthly_w,
        #     MONTHLY_COLS,
        #     STATIC_COLS,
        #     months_tail_pad=months_tail_pad,
        #     months_head_pad=months_head_pad,
        #     expect_target=False,
        #     show_progress=False)
        # test_gl_dl_w = mbm.data_processing.MBSequenceDataset.make_test_loader(
        #     ds_gl_w, ds_train_copy, seed=cfg.seed, batch_size=128)

        # df_preds_w = model.predict_with_keys(device, test_gl_dl_w, ds_gl_w)

        # data_w = df_preds_w[['ID', 'pred']].set_index('ID')
        # grouped_ids_w = (
        #     df_grid_monthly_w.groupby('ID')[meta_cols].first().merge(
        #         data_w, left_index=True, right_index=True, how='left'))
        # months_per_id_w = df_grid_monthly_w.groupby('ID')['MONTHS'].unique()
        # grouped_ids_w = grouped_ids_w.merge(months_per_id_w,
        #                                     left_index=True,
        #                                     right_index=True)
        # grouped_ids_w.reset_index(inplace=True)
        # grouped_ids_w.sort_values(by='ID', inplace=True)

        # pred_y_winter = grouped_ids_w.copy()
        # pred_y_winter['PERIOD'] = 'winter'
        # pred_y_winter = pred_y_winter.drop(columns=['YEAR'], errors='ignore')

        # geoData_w = mbm.geodata.GeoData(df_grid_monthly_w,
        #                                 months_head_pad=months_head_pad,
        #                                 months_tail_pad=months_tail_pad)
        # geoData_w._save_prediction(ds, pred_y_winter, glacier_name, year,
        #                            path_save_glw, "winter")

        return ("ok", glacier_name, year, "")

    except Exception as e:
        return ("err", glacier_name, year, str(e))

In [None]:
# ----------------- build tasks -----------------
root = Path(path_gridded_inputs)

# years = ['2016', ..., '2022'] – digits-only dirs, sorted numerically
years = sorted(
    [p.name for p in root.iterdir() if p.is_dir() and p.name.isdigit()],
    key=int)

# Regex: <SGI>_grid_<YEAR>.parquet
pat = re.compile(r'^(?P<sgi_id>.+?)_grid_(?P<year>\d{4})\.parquet$')

# Build SGI list from a reference year (2016). Fall back to first available year if 2016 missing.
ref_year = '2016' if (root /
                      '2016').is_dir() else (years[0] if years else None)
if ref_year is None:
    raise RuntimeError(f"No year folders found in {root}")

sgi_list = []
for f in (root / ref_year).glob("*.parquet"):
    m = pat.match(f.name)
    if m:
        sgi_list.append(m['sgi_id'])

# Unique & sorted (optional)
sgi_list = sorted(set(sgi_list))

# Build tasks as (sgi_id, year) — worker will skip if parquet missing in that year
tasks = []
for y in years:
    year_dir = root / y
    for sgi_id in sgi_list:
        if (year_dir / f"{sgi_id}_grid_{y}.parquet").exists():
            tasks.append((sgi_id, y))


# ----------------- run in parallel (quiet stdout, keep tqdm) -----------------
class _Devnull(io.StringIO):

    def write(self, *args, **kwargs):
        return 0


# Run every (sgi_id, year) task in a process pool and tally the outcomes.
ctx = mp.get_context("fork")  # Linux
# Leave one CPU free, use at least one worker and never more than 32
max_workers = min(max(1, (os.cpu_count() or 2) - 1), 32)

with redirect_stdout(_Devnull()):  # keep stderr so tqdm is visible
    ok = skip = err = 0
    with ProcessPoolExecutor(max_workers=max_workers,
                             initializer=_worker_init_quiet,
                             mp_context=ctx) as ex:
        # Map each future back to its (sgi_id, year) task so that a failure
        # raised by fut.result() can still be attributed to a glacier/year.
        futures = {ex.submit(_process_glacier_year, t): t for t in tasks}
        for fut in tqdm(as_completed(futures),
                        total=len(futures),
                        desc=f"Predicting ({max_workers} workers)"):
            try:
                status, g, y, msg = fut.result()
            except Exception as exc:
                # result() re-raises anything that went wrong outside the
                # worker's own error handling (e.g. the worker process died).
                # Record it as an error instead of aborting the whole run,
                # so the summary below is always written.
                g, y = futures[fut]
                status, msg = "err", f"worker failed: {exc!r}"

            if status == "ok":
                ok += 1
                if msg:  # no winter months, non-fatal
                    log.info(f"OK {g} {y}: {msg}")
            elif status == "skip":
                skip += 1
                log.warning(f"SKIP {g} {y}: {msg}")
            else:
                err += 1
                log.error(f"ERR {g} {y}: {msg}")

log.info(f"SUMMARY: ok={ok} skip={skip} err={err} total={len(tasks)}")
print(f"Done. Logs → {LOG_PATH}")

In [None]:
# Example: open the annual prediction grid of one glacier and plot it
sgi_id = 'B36-26'  # Aletsch
year = 2016
path_save_glw = f'{cfg.dataPath}/GLAMOS/distributed_MB_grids/MBM/swisswide/{year}/'
path = os.path.join(path_save_glw, f"{sgi_id}/{sgi_id}_{year}_annual.zarr")

ds_example = xr.open_dataset(path)
ds_example.pred_masked.plot()

## Analyze results:

### Look at 2016:

#### Mean predicted MB:

In [None]:
# Glacier ids are the entry names found in the 2016 output folder
year = 2016
path_save_glw = f'{cfg.dataPath}/GLAMOS/distributed_MB_grids/MBM/swisswide/{year}/'
# os.listdir returns entries in arbitrary order; sort them so the glacier
# order (and the row order of the tables built from it) is reproducible.
sgi_id_list = sorted(os.listdir(path_save_glw))


def get_mean_mb_year(year):
    """Return the mean predicted annual mass balance of each glacier for one year.

    Iterates over the module-level ``sgi_id_list`` and, for every glacier,
    opens ``<sgi_id>/<sgi_id>_<year>_annual.zarr`` under that year's output
    folder. Glaciers without such a file are reported on stdout and left out.

    Parameters
    ----------
    year : int or str
        Year used to build the folder and file names.

    Returns
    -------
    pandas.DataFrame
        One row per glacier whose file exists, with columns ``sgi_id`` and
        ``mean_mb`` (mean of the ``pred_masked`` variable). The two columns
        are present even when no file was found.
    """
    path_save_glw = f'{cfg.dataPath}/GLAMOS/distributed_MB_grids/MBM/swisswide/{year}/'

    rows = []
    for sgi_id in tqdm(sgi_id_list):
        path_file = os.path.join(path_save_glw,
                                 f"{sgi_id}/{sgi_id}_{year}_annual.zarr")
        if not os.path.exists(path_file):
            print(f"File not found: {path_file}")
            continue

        # Close each dataset once its mean is computed: this loop opens one
        # store per glacier and would otherwise keep all of them open.
        with xr.open_dataset(path_file) as gridd_mb:
            mean_value = gridd_mb.pred_masked.mean().values.item()
        rows.append({'sgi_id': sgi_id, 'mean_mb': mean_value})

    # Fix the columns so that an empty result still exposes `mean_mb`
    return pd.DataFrame(rows, columns=['sgi_id', 'mean_mb'])


# One table per year, computed in chronological order (2016 ... 2022)
(mean_mb_2016, mean_mb_2017, mean_mb_2018, mean_mb_2019, mean_mb_2020,
 mean_mb_2021, mean_mb_2022) = map(get_mean_mb_year, range(2016, 2023))

In [None]:
# Bar plot of the yearly mean mass balance: MBM predictions vs GLAMOS reference.

# GLAMOS reference table (the first data row is dropped)
glamos_csv = f'{cfg.dataPath}/GLAMOS/massbalance_swisswide_2024_r2024_clean.csv'
df_reference = pd.read_csv(glamos_csv).iloc[1:]

years = list(range(2016, 2023))

# GLAMOS value of the 'Switzerland' catchment for each year; the year column
# is compared as a string
is_switzerland = df_reference.catchment == 'Switzerland'
ref_MB_glamos = [
    df_reference[is_switzerland & (df_reference.year == str(y))]
    ['massbalance evolution'].values[0] for y in years
]

# MBM value for each year: mean over glaciers of the per-glacier mean
yearly_tables = (mean_mb_2016, mean_mb_2017, mean_mb_2018, mean_mb_2019,
                 mean_mb_2020, mean_mb_2021, mean_mb_2022)
mbm_mb = [table.mean_mb.mean() for table in yearly_tables]

# One row per year, one column per source, both columns cast to float
df = pd.DataFrame({'MBM MB': mbm_mb, 'GLAMOS MB': ref_MB_glamos}, index=years)
for col in ('MBM MB', 'GLAMOS MB'):
    df[col] = df[col].astype(float)

df.plot(kind='bar', figsize=(8, 5))

plt.xlabel('Year')
plt.ylabel('Mean Mass Balance (m w.e.)')
plt.title('Comparison of Mean Mass Balance: MBM vs GLAMOS')
plt.xticks(rotation=0)
plt.legend()
plt.tight_layout()
plt.show()

#### Get volumes and areas:

In [None]:
def convert_id(id_str):
    """Return ``id_str`` with every '/' replaced by '-'."""
    return '-'.join(id_str.split('/'))

# Build `glacier_info`: 2016 volume and area per glacier, plus one column of
# mean mass balance for every year that has a `mean_mb_<year>` table.

# Input folders
path_volumes = f'{cfg.dataPath}/GLAMOS/volumes/'
path_areas = f'{cfg.dataPath}/GLAMOS/topo/SGI2020/inventory_sgi2016_r2020'

# Volumes: ids get '-' instead of '/', V_2016 is scaled by 1e9
volgdf = gpd.read_file(os.path.join(path_volumes, 'Summary.shp'))
volgdf['sgi-id'] = volgdf['pk_sgi'].apply(convert_id)
volgdf['V_2016'] *= 10**9  # convert to m³

# Areas from the SGI 2016 inventory: km² scaled by 1e6
areagdf = gpd.read_file(os.path.join(path_areas, 'SGI_2016_glaciers.shp'))
areagdf['area_2016'] = areagdf['area_km2'] * 10**6  # convert to m²

# Keep only the glaciers present in both tables
volumes = volgdf[['sgi-id', 'V_2016']]
areas = areagdf[['sgi-id', 'area_2016']]
glacier_info = volumes.merge(areas, on='sgi-id', how='inner')

# Years to process
years = range(2016, 2023)  # includes 2022

# Attach the yearly mean mass balance, one column per year
for year in years:
    yearly = globals().get(f"mean_mb_{year}", None)
    if yearly is None:
        print(f"Warning: mean_mb_{year} not found in globals.")
        continue

    # Work on a copy so the original yearly table keeps its columns
    yearly = yearly.copy()
    yearly['sgi-id'] = yearly['sgi_id'].apply(convert_id)
    yearly = yearly[['sgi-id', 'mean_mb']].rename(
        columns={'mean_mb': f'mean_mb_{year}'})

    # Left merge: a glacier missing in this year is kept (as NaN) for now
    glacier_info = glacier_info.merge(yearly, on='sgi-id', how='left')

# Drop every glacier with a missing value in any column, then index by id
glacier_info = glacier_info.dropna().set_index('sgi-id')
glacier_info.head()

#### Total volume change 2016:

In [None]:
# Swiss-wide totals for 2016 from MBM, printed next to the GLAMOS reference.
density_ice = 916.7  # or 917 kg/m³
density_water = 1000  # kg/m³

# GLAMOS reference row for Switzerland in 2016 (year is stored as a string)
is_switzerland = df_reference.catchment == 'Switzerland'
is_2016 = df_reference.year == '2016'
ref_CH_2016 = df_reference[is_switzerland & is_2016]

# Per-glacier volume change: area × mean mass balance × (ρ_water / ρ_ice)
we_to_ice = density_water / density_ice
glacier_info['vol_change_2016'] = (glacier_info['area_2016'] *
                                   glacier_info['mean_mb_2016'] * we_to_ice)

# Totals over all glaciers
vol_change_2016 = glacier_info['vol_change_2016'].sum() / 10**9  # convert to km3
volume_2016 = glacier_info['V_2016'].sum() / 10**9  # convert to km3
area_2016 = glacier_info['area_2016'].sum() / 10**6  # convert to km2

# Relative volume change and unweighted mean mass balance over the glaciers
volume_change_2016_perc = vol_change_2016 / volume_2016 * 100
mb_2016 = glacier_info['mean_mb_2016'].mean()

glamos_volume_change = ref_CH_2016['volume change'].values[0]
glamos_mass_balance = ref_CH_2016['massbalance evolution'].values[0]

print('Volume change from GLAMOS:', glamos_volume_change, '%')  # in %
print('Volume change from MBM:', np.round(volume_change_2016_perc, 2),
      '%')  # in %

print('Mean mass balance from GLAMOS:', glamos_mass_balance, 'm w.e.')
print('Mean mass balance from MBM:', np.round(mb_2016, 2), 'm w.e.')

### Volume area scaling:

In [None]:
def volume_area_scaling(
        glacier_info,
        t1,
        beta=1.36,
        density_ice=916.7,  # kg/m³
        density_water=1000  # kg/m³
):
    """
    Advance glacier volume and area by one year with volume-area scaling.

    Reads the columns ``V_{t1}``, ``area_{t1}`` and ``mean_mb_{t1}`` of
    ``glacier_info`` and writes ``V_{t1+1}`` and ``area_{t1+1}`` into the same
    frame. The frame is modified in place and nothing is returned.

    The scaling coefficient ``c = V / area**beta`` is computed only when the
    frame has no ``c`` column yet; later calls reuse the stored column,
    whatever ``beta`` they are given.

    Parameters
    ----------
    glacier_info : pandas.DataFrame
        Table holding the columns listed above, one row per glacier.
    t1 : int
        Year whose volume, area and mass balance are used as input.
    beta : float
        Exponent of the volume-area relation.
    density_ice, density_water : float
        Densities whose ratio ``density_water / density_ice`` scales the
        mass balance into an ice-volume change.
    """
    volume_col = f'V_{t1}'
    area_col = f'area_{t1}'

    # The coefficient is fixed the first time the function sees this frame
    if 'c' not in glacier_info.columns:
        glacier_info['c'] = glacier_info[volume_col] / (
            glacier_info[area_col]**beta)

    volume_now = glacier_info[volume_col]
    area_now = glacier_info[area_col]
    balance = glacier_info[f'mean_mb_{t1}']

    # New volume = old volume + balance × area × density ratio, floored at 0
    ice_per_water = density_water / density_ice
    volume_next = (volume_now + balance * area_now * ice_per_water).clip(
        lower=0)

    # Invert V = c * A**beta to get the new area, floored at 0
    area_next = ((volume_next / glacier_info['c'])**(1 / beta)).clip(lower=0)

    glacier_info[f'V_{t1+1}'] = volume_next
    glacier_info[f'area_{t1+1}'] = area_next

# Step the scaling forward one year at a time, starting from 2016; the last
# step (t1 = end_year - 1) writes the V_/area_ columns of `end_year`.
end_year = 2023
for year in range(2016, end_year):
    volume_area_scaling(glacier_info, year, 1.36)

glacier_info.head()

In [None]:
# Grouped bar chart of the 2016 and 2023 areas of the 20 glaciers with the
# largest 2016 area.
df_sub = glacier_info.sort_values(by='area_2016', ascending=False).head(20)
for col in ('area_2016', 'area_2023'):
    df_sub[col] = df_sub[col] / 10**6  # convert to km2

fig, ax = plt.subplots(figsize=(8, 5))
bar_width = 0.35
index = np.arange(len(df_sub))

# Two bars per glacier: 2016 on the left, 2023 shifted by one bar width
ax.bar(index, df_sub['area_2016'], bar_width, label='Area 2016')
ax.bar(index + bar_width, df_sub['area_2023'], bar_width, label='Area 2023')

# Ticks sit between the two bars and carry the glacier ids
ax.set_xticks(index + bar_width / 2)
ax.set_xticklabels(df_sub.index)
plt.xticks(rotation=45)

ax.set_title('Glacier Area Comparison: 2016 vs 2023')
ax.set_xlabel('Glacier Index')
ax.set_ylabel('Area (km²)')
ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# One box plot per year (2017-2022) of the per-glacier area loss relative to
# the 2016 area.
fig, axs = plt.subplots(2, 3, figsize=(12, 6))
ax = axs.flatten()
for i, year in enumerate(range(2017, 2023)):
    # Multiply by 100 so the values really are percentages, as the axis
    # label states (the plain ratio is a fraction).
    area_perc_loss_y = 100 * (
        glacier_info['area_2016'] -
        glacier_info[f'area_{year}']) / glacier_info['area_2016']

    sns.boxplot(area_perc_loss_y, ax=ax[i], color='blue')
    ax[i].set_title(f'Loss from 2016 to {year}')
    ax[i].set_ylabel('Area loss (%)')

plt.tight_layout()


### Calculate new volume changes:

#### Total volume change 2022:

In [None]:
# Swiss-wide totals for 2022 from MBM, printed next to the GLAMOS reference.

# GLAMOS reference row for Switzerland in 2022 (year is stored as a string)
is_switzerland = df_reference.catchment == 'Switzerland'
is_2022 = df_reference.year == '2022'
ref_CH_2022 = df_reference[is_switzerland & is_2022]

# Per-glacier volume change: area × mean mass balance × (ρ_water / ρ_ice)
we_to_ice = density_water / density_ice
glacier_info['vol_change_2022'] = (glacier_info['area_2022'] *
                                   glacier_info['mean_mb_2022'] * we_to_ice)

# Totals over all glaciers
vol_change_2022 = glacier_info['vol_change_2022'].sum() / 10**9  # convert to km3
volume_2022 = glacier_info['V_2022'].sum() / 10**9  # convert to km3

# Unweighted mean mass balance over the glaciers and relative volume change
mb_2022 = glacier_info['mean_mb_2022'].mean()
volume_change_2022_perc = vol_change_2022 / volume_2022 * 100

glamos_volume_change = ref_CH_2022['volume change'].values[0]
glamos_mass_balance = ref_CH_2022['massbalance evolution'].values[0]

print('Volume change from GLAMOS:', glamos_volume_change, '%')  # in %
print('Volume change from MBM:', np.round(volume_change_2022_perc, 2),
      '%')  # in %

print('Mean mass balance from GLAMOS:', glamos_mass_balance, 'm w.e.')
print('Mean mass balance from MBM:', np.round(mb_2022, 2), 'm w.e.')

#### Plot vol & area change all years:

In [None]:
# Yearly volume change in % of that year's total volume: MBM vs GLAMOS.
# The evaluated years are defined here so that the DataFrame index always
# matches the loop, whatever `years` was bound to before this cell ran.
years = range(2016, 2023)

volume_change_y_perc, ref_V_glamos = [], []
for year in years:
    # Per-glacier volume change: area × mean mass balance × (ρ_water / ρ_ice)
    glacier_info[f'vol_change_{year}'] = (glacier_info[f'area_{year}'] *
                                          glacier_info[f'mean_mb_{year}']) * (
                                              density_water / density_ice)

    vol_change_y = glacier_info[f'vol_change_{year}'].sum(
    ) / 10**9  # convert to km3
    volume_y = glacier_info[f'V_{year}'].sum() / 10**9  # convert to km3

    volume_change_y_perc.append(vol_change_y / volume_y * 100)

    # GLAMOS reference for Switzerland (year is stored as a string)
    ref_CH_y = df_reference[(df_reference.catchment == 'Switzerland')
                            & (df_reference.year == str(year))]

    ref_V_glamos.append(ref_CH_y['volume change'].values[0])

# One row per year, one column per source
df = pd.DataFrame({
    'MBM V': volume_change_y_perc,
    'GLAMOS V': ref_V_glamos
},
                  index=years)

# give same type to columns
df['MBM V'] = df['MBM V'].astype(float)
df['GLAMOS V'] = df['GLAMOS V'].astype(float)

df.plot(kind='bar', figsize=(8, 5))

plt.xlabel('Year')
plt.ylabel('% volume change')
plt.title('Volume change: MBM vs GLAMOS')
plt.xticks(rotation=0)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Yearly volume change in % of that year's total volume; only the MBM series
# is plotted, the GLAMOS values are still collected in `df`.
# The evaluated years are defined here so that the DataFrame index always
# matches the loop, whatever `years` was bound to before this cell ran.
years = range(2016, 2023)

volume_change_y_perc, ref_V_glamos = [], []
for year in years:
    # Per-glacier volume change: area × mean mass balance × (ρ_water / ρ_ice)
    glacier_info[f'vol_change_{year}'] = (glacier_info[f'area_{year}'] *
                                          glacier_info[f'mean_mb_{year}']) * (
                                              density_water / density_ice)

    vol_change_y = glacier_info[f'vol_change_{year}'].sum(
    ) / 10**9  # convert to km3
    volume_y = glacier_info[f'V_{year}'].sum() / 10**9  # convert to km3

    volume_change_y_perc.append(vol_change_y / volume_y * 100)

    # GLAMOS reference for Switzerland (year is stored as a string)
    ref_CH_y = df_reference[(df_reference.catchment == 'Switzerland')
                            & (df_reference.year == str(year))]

    ref_V_glamos.append(ref_CH_y['volume change'].values[0])

# One row per year, one column per source
df = pd.DataFrame({
    'MBM V': volume_change_y_perc,
    'GLAMOS V': ref_V_glamos
},
                  index=years)

# give same type to columns
df['MBM V'] = df['MBM V'].astype(float)
df['GLAMOS V'] = df['GLAMOS V'].astype(float)

df['MBM V'].plot(kind='bar', figsize=(8, 5))

plt.xlabel('Year')
plt.ylabel('% volume change')
plt.title('Volume change by Mass Balance Machine over the Swiss Alps',
          fontsize=16)
plt.xticks(rotation=0)
# Empty handles/labels: no legend entries are shown
plt.legend([], [], frameon=False)
plt.tight_layout()
plt.show()
