## Setting Up:

In [None]:
# --- Standard library
from concurrent.futures import ProcessPoolExecutor, as_completed
from contextlib import redirect_stdout
from datetime import datetime
import io
import logging
import multiprocessing as mp
import os
import sys
import warnings

# Make repo root importable (for MBM & scripts/*)
sys.path.append(os.path.join(os.getcwd(), '../../'))

# --- Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from cmcrameri import cm
import torch
from tqdm.auto import tqdm
import xarray as xr
from matplotlib.lines import Line2D

import massbalancemachine as mbm

# --- Project-local
from scripts.utils import *
from scripts.glamos import *
from scripts.models import *
from scripts.geo_data import *
from scripts.dataset import *
from scripts.geodetic import *
from scripts.plotting import *

# --- Notebook settings
# NOTE: no category is given, so this hides *every* warning for the rest of
# the session (including pandas' SettingWithCopyWarning). Narrow or remove the
# filter when debugging data-handling issues.
warnings.filterwarnings('ignore')
# IPython magics: load the autoreload extension and use mode 2, which reloads
# imported modules before each cell execution so edits to project code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Project configuration object; cfg.seed is read right below.
cfg = mbm.SwitzerlandConfig()

# Plot styles:
use_mbm_style()

# Seed everything up front (seed_all is a project helper -- presumably it
# seeds the Python / NumPy / torch RNGs; confirm in scripts).
seed_all(cfg.seed)
print("Using seed:", cfg.seed)

# Choose the compute device once and branch on the result, instead of
# querying CUDA availability a second time.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("CUDA is available")
    free_up_cuda()
else:
    print("CUDA is NOT available")

## Input data:

In [None]:
# Read GLAMOS stake data
data_glamos = get_stakes_data(cfg)

# Compute padding for monthly data
months_head_pad, months_tail_pad = mbm.data_processing.utils._compute_head_tail_pads_from_df(
    data_glamos)

# Initialize logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# Transform data to monthly format (run or load data):
paths = {
    'csv_path': cfg.dataPath + path_PMB_GLAMOS_csv,
    'era5_climate_data':
    cfg.dataPath + path_ERA5_raw + 'era5_monthly_averaged_data.nc',
    'geopotential_data':
    cfg.dataPath + path_ERA5_raw + 'era5_geopotential_pressure.nc',
    'radiation_save_path': cfg.dataPath + path_pcsr + 'zarr/'
}
# Forwarded as run_flag; presumably False loads the previously written
# output_file instead of re-running the processing -- confirm in
# process_or_load_data.
RUN = False
data_monthly = process_or_load_data(
    run_flag=RUN,
    data_glamos=data_glamos,
    paths=paths,
    cfg=cfg,
    vois_climate=VOIS_CLIMATE,
    vois_topographical=VOIS_TOPOGRAPHICAL,
    output_file='CH_wgms_dataset_monthly_LSTM.csv')

dataloader_gl = mbm.dataloader.DataLoader(cfg,
                                          data=data_monthly,
                                          random_seed=cfg.seed,
                                          meta_data_columns=cfg.metaData)

# Temporal hold-out: everything before 2025 is used for training, 2025 is the
# test year. train_glaciers currently contains every glacier in data_monthly,
# so the glacier mask does not exclude any rows; it is kept so a subset can be
# plugged in later.
existing_glaciers = set(data_monthly.GLACIER.unique())
train_glaciers = existing_glaciers

# .copy() makes each split an independent frame. Without it the column
# assignment below targets a slice of data_monthly (chained assignment), which
# pandas flags with SettingWithCopyWarning -- hidden here by the global
# warnings filter.
data_train = data_monthly[data_monthly.GLACIER.isin(train_glaciers)
                          & (data_monthly.YEAR < 2025)].copy()
print('Size of monthly train data:', len(data_train))

# Target column expected downstream:
data_train['y'] = data_train['POINT_BALANCE']

data_test = data_monthly[data_monthly.YEAR == 2025].copy()
data_test['y'] = data_test['POINT_BALANCE']

print('Size of monthly test data:', len(data_test))

In [None]:
# Convert to start of August instead:
# take the YYYY prefix of FROM_DATE, validate it as an integer year, then
# rebuild the date as the integer YYYY0801.
data_glamos_Aug_ = data_glamos.copy()
from_year_Aug_ = (data_glamos_Aug_["FROM_DATE"].astype(str).str.slice(
    0, 4)  # extract year YYYY
                  .astype(int))
data_glamos_Aug_["FROM_DATE"] = (
    from_year_Aug_.astype(str) + "0801"  # append "0801"
).astype(int)

# Same for full temporal resolution (run or load data):
# Compute padding for monthly data
months_head_pad_Aug_, months_tail_pad_Aug_ = mbm.data_processing.utils._compute_head_tail_pads_from_df(
    data_glamos_Aug_)

# Initialize logging. basicConfig does nothing when the root logger already
# has handlers, so this only takes effect if logging has not been configured
# earlier in the session.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

# `paths` is not defined in this cell; it must already exist in the session.
RUN = False
data_monthly_Aug_ = process_or_load_data(
    run_flag=RUN,
    data_glamos=data_glamos_Aug_,
    paths=paths,
    cfg=cfg,
    vois_climate=VOIS_CLIMATE,
    vois_topographical=VOIS_TOPOGRAPHICAL,
    output_file='CH_wgms_dataset_monthly_LSTM_Aug_.csv')

# Create DataLoader
dataloader_gl_Aug_ = mbm.dataloader.DataLoader(cfg,
                                               data=data_monthly_Aug_,
                                               random_seed=cfg.seed,
                                               meta_data_columns=cfg.metaData)

# Temporal hold-out: years before 2025 for training, 2025 as the test year.
# train_glaciers currently holds every glacier in data_monthly_Aug_, so the
# glacier mask does not exclude any rows.
existing_glaciers = set(data_monthly_Aug_.GLACIER.unique())
train_glaciers = existing_glaciers

# .copy() makes each split an independent frame so that adding the 'y' column
# below is not a chained assignment on a slice of data_monthly_Aug_ (pandas
# would raise SettingWithCopyWarning, which the global filter hides).
data_train_Aug_ = data_monthly_Aug_[
    data_monthly_Aug_.GLACIER.isin(train_glaciers)
    & (data_monthly_Aug_.YEAR < 2025)].copy()
print('Size of monthly train data:', len(data_train_Aug_))

# Target column expected downstream:
data_train_Aug_['y'] = data_train_Aug_['POINT_BALANCE']

data_test_Aug_ = data_monthly_Aug_[data_monthly_Aug_.YEAR == 2025].copy()
data_test_Aug_['y'] = data_test_Aug_['POINT_BALANCE']

print('Size of monthly test data:', len(data_test_Aug_))

## LSTM:

In [None]:
# Per-month input columns (one value per month of the sequence).
MONTHLY_COLS = [
    't2m', 'tp', 'slhf', 'sshf', 'ssrd', 'fal', 'str', 'pcsr',
    'ELEVATION_DIFFERENCE',
]

# Per-sample input columns that do not vary over the months.
STATIC_COLS = ['aspect_sgi', 'slope_sgi', 'svf']

# Full feature list: monthly columns first, static columns after.
feature_columns = [*MONTHLY_COLS, *STATIC_COLS]

### Build LSTM dataloaders:

In [None]:
seed_all(cfg.seed)

# Keyword arguments shared by the train and test dataset builds; only the two
# input frames differ between the calls.
_lstm_dataset_kwargs = dict(
    monthly_cols=MONTHLY_COLS,
    static_cols=STATIC_COLS,
    months_head_pad=months_head_pad_Aug_,
    months_tail_pad=months_tail_pad_Aug_,
    normalize_target=True,
    expect_target=True,
)

ds_train = build_combined_LSTM_dataset(df_loss=data_train,
                                       df_full=data_train_Aug_,
                                       **_lstm_dataset_kwargs)

ds_test = build_combined_LSTM_dataset(df_loss=data_test,
                                      df_full=data_test_Aug_,
                                      **_lstm_dataset_kwargs)

# 80/20 train/validation split over the training samples, seeded by cfg.seed.
train_idx, val_idx = mbm.data_processing.MBSequenceDataset.split_indices(
    len(ds_train), val_ratio=0.2, seed=cfg.seed)

# Month labels ordered by their position value in month_pos.
month_list, month_pos = mbm.data_processing.utils._rebuild_month_index(
    months_head_pad_Aug_, months_tail_pad_Aug_)
month_order = sorted(month_pos, key=month_pos.__getitem__)
print("Month order used in sequences:", month_order)

inspect_LSTM_sample(ds_train, 0, month_labels=month_order)

### Load model:

In [None]:
# --- loaders (fit scalers on TRAIN, apply to whole ds_train) ---
seed_all(cfg.seed)
# The loaders are built from clones of ds_train / ds_test (going by the
# helper's name, untransformed copies -- confirm in MBSequenceDataset).
ds_train_copy = mbm.data_processing.MBSequenceDataset._clone_untransformed_dataset(
    ds_train)
ds_test_copy = mbm.data_processing.MBSequenceDataset._clone_untransformed_dataset(
    ds_test)

train_dl, val_dl = ds_train_copy.make_loaders(
    train_idx=train_idx,
    val_idx=val_idx,
    batch_size_train=64,
    batch_size_val=128,
    seed=cfg.seed,
    fit_and_transform=
    True,  # fit scalers on TRAIN and transform Xm/Xs/y in-place
    shuffle_train=True,
    use_weighted_sampler=True  # use weighted sampler for training
)

# --- test loader (copies TRAIN scalers into ds_test and transforms it) ---
test_dl = mbm.data_processing.MBSequenceDataset.make_test_loader(
    ds_test_copy, ds_train_copy, batch_size=128, seed=cfg.seed)

# --- build model, resolve loss, load saved weights ---
# No training happens in this cell: the state dict stored at
# LSTM_IS_NORM_Y_PAST is loaded into a freshly built model.
# loss_fn is resolved here but not used again in this cell.
seed_all(cfg.seed)
model = mbm.models.LSTM_MB.build_model_from_params(cfg, PARAMS_LSTM_IS_past,
                                                   device)
loss_fn = mbm.models.LSTM_MB.resolve_loss_fn(PARAMS_LSTM_IS_past)
state = torch.load(LSTM_IS_NORM_Y_PAST, map_location=device)
model.load_state_dict(state)

# Evaluate on the 2025 test loader; returns a metrics dict and a per-sample
# prediction frame.
test_metrics, test_df_preds = model.evaluate_with_preds(
    device, test_dl, ds_test_copy)
test_rmse_a, test_rmse_w = test_metrics['RMSE_annual'], test_metrics[
    'RMSE_winter']

print('Test RMSE annual: {:.3f} | winter: {:.3f}'.format(
    test_rmse_a, test_rmse_w))

In [None]:
# Score the 2025 predictions separately for annual and winter measurements.
scores_annual, scores_winter = compute_seasonal_scores(test_df_preds,
                                                       target_col='target',
                                                       pred_col='pred')

print("Annual scores:", scores_annual)
print("Winter scores:", scores_winter)

# Single-panel predicted-vs-observed scatter, coloured by PERIOD.
fig, ax1 = plt.subplots(figsize=(10, 10))
pred_vs_truth(
    ax1,
    test_df_preds,
    scores_annual,
    hue="PERIOD",
    add_legend=False,
    palette=[COLOR_ANNUAL, COLOR_WINTER],
    ax_xlim=(-8, 6),
    ax_ylim=(-8, 6),
)

# Metrics box in the top-left corner (add_legend=False above, so this text box
# is the only annotation carrying the scores).
legend_NN = "\n".join([
    r"$\mathrm{RMSE_a}=%.3f$, $\mathrm{RMSE_w}=%.3f$" %
    (scores_annual["rmse"], scores_winter["rmse"]),
    r"$\mathrm{R^2_a}=%.3f$, $\mathrm{R^2_w}=%.3f$" %
    (scores_annual["R2"], scores_winter["R2"]),
    r"$\mathrm{Bias_a}=%.3f$, $\mathrm{Bias_w}=%.3f$" %
    (scores_annual["Bias"], scores_winter["Bias"]),
])
ax1.text(
    0.05,
    0.98,
    legend_NN,
    transform=ax1.transAxes,
    verticalalignment="top",
    fontsize=20,
    bbox=dict(boxstyle="round", facecolor="white", alpha=0.5),
)
ax1.set_title('Predictions on 2025', fontsize=22)
plt.tight_layout()

# save figure
# savefig does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout fails with FileNotFoundError.
fig_dir_appendix = 'figures/paper/appendix/'
os.makedirs(fig_dir_appendix, exist_ok=True)
fig.savefig(fig_dir_appendix + 'app_LSTM_IS_predictions_2025.png')