## Setting up:

In [None]:
import os, sys
sys.path.append(os.path.join(os.getcwd(), '../../')) # Add root of repo to import MBM

import pandas as pd
import os
import warnings
import massbalancemachine as mbm
import pyproj
import matplotlib.pyplot as plt
import seaborn as sns
import xarray as xr
import geopandas as gpd
import logging
import glob
from cmcrameri import cm
from oggm import utils

from regions.Iceland.scripts.iceland_preprocess import *
from regions.Iceland.scripts.config_ICE import *

from regions.Switzerland.scripts.oggm import initialize_oggm_glacier_directories, export_oggm_grids
from regions.Switzerland.scripts.glamos import merge_pmb_with_oggm_data, rename_stakes_by_elevation, check_point_ids_contain_glacier, remove_close_points, check_multiple_rgi_ids

from regions.French_Alps.scripts.glacioclim_preprocess import add_svf_from_rgi_zarr, plot_missing_svf_for_all_glaciers, add_svf_nearest_valid

warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

cfg = mbm.IcelandConfig()

# Module logger
log = logging.getLogger('.'.join(__name__.split('.')[:-1]))

%load_ext autoreload
%autoreload 2

mbm.utils.seed_all(cfg.seed)
mbm.utils.free_up_cuda()
mbm.plots.use_mbm_style()

## Load all stake csv files into 1 df
The data used in this notebook comes from the data scraping done in the 1.0 Iceland-data-acquisition notebook in June 2025. Only winter and annual measurements are used. The code might have to be adjusted if new data is added to https://joklavefsja.vedur.is/

In [None]:
# Collect the raw stake CSV files. glob returns its matches in arbitrary
# order, so sort them to keep the row order (and therefore the row indices)
# of the combined dataframe reproducible between runs and machines.
all_files = sorted(
    glob.glob(os.path.join(cfg.dataPath + path_PMB_WGMS_raw, "*.csv")))

# Fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" error.
if not all_files:
    raise FileNotFoundError(
        f"No CSV files found in {cfg.dataPath + path_PMB_WGMS_raw}")

# Read each CSV file into a dataframe
dfs = [pd.read_csv(file) for file in all_files]

# Concatenate all dataframes into one
combined_df = pd.concat(dfs, ignore_index=True)

# Print info
print(
    f"Combined {len(all_files)} CSV files into one dataframe with {len(combined_df)} rows"
)

# Add data modification column to keep track of manual changes
combined_df['DATA_MODIFICATION'] = ''

display(combined_df.head(2))

Split into annual and winter rows

In [None]:
# Split every stake record into separate annual and winter rows
# (split_stake_measurements is a project helper imported above).
df_stakes_split = split_stake_measurements(combined_df)

# Convert date columns to string in 'YYYYMMDD' format
for date_col in ('TO_DATE', 'FROM_DATE'):
    parsed_dates = pd.to_datetime(df_stakes_split[date_col])
    df_stakes_split[date_col] = parsed_dates.dt.strftime('%Y%m%d')

display(df_stakes_split)

## Date Fixes

Fix NaN dates by filling in hydrological-year dates. (It would be nicer if this code first checked whether a previous year of the same stake has a date, and then used that date instead of the hydrological-year default.)

In [None]:
# Masks of the rows with missing values, taken before anything is filled in.
missing_from = df_stakes_split['FROM_DATE'].isna()
missing_to = df_stakes_split['TO_DATE'].isna()
missing_year = df_stakes_split['YEAR'].isna()

# Show the affected rows (FROM_DATE, TO_DATE, YEAR - in that order)
for nan_mask in (missing_from, missing_to, missing_year):
    display(df_stakes_split[nan_mask])

# Change NaN year values to the year of the TO_DATE
year_from_to_date = df_stakes_split.loc[
    missing_year, 'TO_DATE'].astype(str).str[:4].astype(float)
df_stakes_split.loc[missing_year, 'YEAR'] = year_from_to_date

# Data modification column update
df_stakes_split.loc[
    missing_from | missing_to,
    'DATA_MODIFICATION'] = 'Dates filled in according to hydrological year'

# Set FROM_DATE from NaN to 01 Oct of previous year
previous_year = df_stakes_split.loc[missing_from, 'YEAR'].astype(int) - 1
df_stakes_split.loc[missing_from,
                    'FROM_DATE'] = previous_year.astype(str) + '1001'

# Set TO_DATE from NaN to 30 Sept of the year (as only annual rows have NaN,
# no need for period distinction)
measurement_year = df_stakes_split.loc[missing_to, 'YEAR'].astype(int)
df_stakes_split.loc[missing_to,
                    'TO_DATE'] = measurement_year.astype(str) + '0930'

Check for problematic date ranges

In [None]:
annual_inconsistent, winter_inconsistent = check_period_consistency(
    df_stakes_split)

# Display the inconsistent records
if len(annual_inconsistent) > 0:
    print("\nInconsistent annual periods:")
    display(annual_inconsistent)

if len(winter_inconsistent) > 0:
    print("\nInconsistent winter periods:")
    display(winter_inconsistent)

# Only one GL10a record is unreasonable (reported as -2, presumably a negative
# period length): its FROM_DATE year is probably wrong (1997 instead of 1996).
# Restrict the correction to that record -- GL10a rows whose FROM_DATE lies in
# 1997 AND after their TO_DATE -- instead of overwriting FROM_DATE for every
# measurement of the stake. 'YYYYMMDD' strings compare chronologically.
from_date_str = df_stakes_split['FROM_DATE'].astype(str)
to_date_str = df_stakes_split['TO_DATE'].astype(str)
gl10a_wrong_from_date = ((df_stakes_split['stake'] == 'GL10a')
                         & from_date_str.str.startswith('1997')
                         & (from_date_str > to_date_str))

# Report how many rows are touched; exactly one is expected on the first run
# (and zero on a re-run, because the corrected row no longer matches).
print(f"GL10a rows with corrected FROM_DATE: {int(gl10a_wrong_from_date.sum())}")

df_stakes_split.loc[gl10a_wrong_from_date, 'FROM_DATE'] = '19960825'
df_stakes_split.loc[
    gl10a_wrong_from_date,
    'DATA_MODIFICATION'] = 'FROM_DATE year corrected from 1997 to 1996'

Rename columns and do general data cleaning. We can skip the close-stake removal because, as seen from the leaflet map online, the stakes are spaced out.

In [None]:
# Map the raw column names onto the POINT_* / ID names used from here on.
stake_column_names = {
    'lat': 'POINT_LAT',
    'lon': 'POINT_LON',
    'elevation': 'POINT_ELEVATION',
    'stake': 'ID',
}
df_stakes_renamed = df_stakes_split.rename(columns=stake_column_names)

In [None]:
# NaN check: show every row that has at least one missing value
rows_with_nan = df_stakes_renamed.isna().any(axis=1)
display(df_stakes_renamed[rows_with_nan])

# Remove all rows with any NaN values
df_stakes_renamed = df_stakes_renamed.dropna(axis=0, how='any')

# Confirm removal - this should show 0 rows if all NaNs were removed
remaining_nan_rows = int(df_stakes_renamed.isna().any(axis=1).sum())
print(f"Rows with NaN values after removal: {remaining_nan_rows}")

##### Find RGIId

In [None]:
# initialize OGGM glacier directories
# Requests RGI region "06", RGI version "62", taking the pre-processed
# directories from base_url. The helper returns the glacier directories
# (gdirs) and a dataframe (rgidf) whose RGIId / Name columns are used further
# down to look up glacier names.
# NOTE(review): presumably downloads data on the first run -- confirm.
gdirs, rgidf = initialize_oggm_glacier_directories(
    cfg,
    rgi_region="06",
    rgi_version="62",
    base_url=
    "https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L1-L2_files/2025.6/elev_bands_w_data/",
    log_level='WARNING',
    task_list=None,
)

# NOTE(review): presumably writes the per-glacier grids that are opened later
# from path_OGGM_xrgrids -- verify against the helper.
export_oggm_grids(cfg, gdirs, rgi_region="06")

In [None]:
# Load glacier outlines
rgi_file = utils.get_rgi_region_file(region="06", version="62")
glacier_outline = gpd.read_file(rgi_file)

# Add RGI IDs through intersection
df_stakes_renamed_rgiid = mbm.data_processing.utils.get_rgi(
    data=df_stakes_renamed,
    glacier_outlines=glacier_outline,
)
n_without_rgi = int(df_stakes_renamed_rgiid['RGIId'].isna().sum())
print('Number of measurements without RGI:', n_without_rgi)

# Remove the stakes without RGIId (nine at the time of writing), as they
# won't have OGGM data anyway
df_stakes_renamed_rgiid = df_stakes_renamed_rgiid.dropna(subset=['RGIId'])

### Add OGGM data

In [None]:
# NOTE(review): unique_rgis is not referenced by any later cell visible in
# this notebook.
unique_rgis = df_stakes_renamed_rgiid['RGIId'].unique()

## Around 10% of all the measurements have no hugonnet_dhdt data, so I removed the entire variable from merge_pmb_with_oggm_data()
# Attach the OGGM variables to every stake measurement. The resulting frame is
# later expected to contain a 'within_glacier_shape' column (used for
# plotting and filtering below).
df_stakes_topo = merge_pmb_with_oggm_data(
    df_pmb=df_stakes_renamed_rgiid,
    gdirs=gdirs,
    rgi_region="06",  #06 iceland
    rgi_version="62")


Get Glacier names from RGIId

In [None]:
# Create a dictionary mapping from RGIId to glacier name
rgi_to_name_dict = {
    rgi_id_key: glacier_name
    for rgi_id_key, glacier_name in zip(rgidf.RGIId, rgidf.Name)
}

# Look up the name of every stake's glacier and show the rows without a name
glacier_names = df_stakes_topo['RGIId'].map(rgi_to_name_dict)
df_stakes_topo['GLACIER'] = glacier_names
display(df_stakes_topo[df_stakes_topo['GLACIER'].isna()])

Multiple RGIIds have no associated glacier name; for these, assign the 'RGIId' as the 'GLACIER' name.

In [None]:
# Rows whose RGIId had no entry in the name lookup
glacier_name_missing = df_stakes_topo['GLACIER'].isna()

missing_rgi_ids = df_stakes_topo.loc[glacier_name_missing, 'RGIId'].unique()
print(f"Number of unique RGI IDs without names: {len(missing_rgi_ids)}")
print("RGI IDs without names:", missing_rgi_ids)

# Just assign RGIId to 'GLACIER' as name for the ones that are missing
fallback_names = df_stakes_topo.loc[glacier_name_missing, 'RGIId']
df_stakes_topo.loc[glacier_name_missing, 'GLACIER'] = fallback_names

In [None]:
# Example: plot the stakes of one glacier on top of its OGGM glacier mask.
glacierName = 'Thjorsarjoekull (Hofsjoekull E)'

# stakes: select this glacier's rows first, then copy only that subset so the
# x/y columns added below do not touch df_stakes_topo
df_stakes_topo_1 = df_stakes_topo[df_stakes_topo['GLACIER'] ==
                                  glacierName].copy()
RGIId = df_stakes_topo_1['RGIId'].unique()[0]
print(RGIId)

# open OGGM xr for glacier
# Get oggm data for that RGI grid
ds_oggm = xr.open_dataset(f'{cfg.dataPath + path_OGGM_xrgrids}/{RGIId}.zarr')

# Define the coordinate transformation
transf = pyproj.Transformer.from_proj(
    pyproj.CRS.from_user_input("EPSG:4326"),  # Input CRS (WGS84)
    pyproj.CRS.from_user_input(ds_oggm.pyproj_srs),  # Output CRS from dataset
    always_xy=True)

# Transform all coordinates in the group (always_xy=True -> lon first)
lon, lat = df_stakes_topo_1["POINT_LON"].values, df_stakes_topo_1[
    "POINT_LAT"].values
x_stake, y_stake = transf.transform(lon, lat)
df_stakes_topo_1['x'] = x_stake
df_stakes_topo_1['y'] = y_stake

# plot stakes
fig, ax = plt.subplots(figsize=(8, 6))
ds_oggm.glacier_mask.plot(ax=ax, cmap='binary')
# Explicit colour per hue value: a plain list palette is assigned by level
# order, so red/blue would swap meaning when only one of the two values is
# present for a glacier.
# NOTE(review): assumes within_glacier_shape holds booleans -- confirm.
sns.scatterplot(data=df_stakes_topo_1,
                x='x',
                y='y',
                hue='within_glacier_shape',
                palette={
                    False: 'r',
                    True: 'b'
                },
                ax=ax)
ax.set_title(f'Stakes on {glacierName} (OGGM)')
fig.tight_layout()

In [None]:
# Restrict to within glacier shape, then drop the helper column
inside_glacier = df_stakes_topo['within_glacier_shape'].eq(True)
df_stakes_topo = df_stakes_topo[inside_glacier].drop(
    columns=['within_glacier_shape'])

# Display rows that have any NaN values
rows_with_nan = df_stakes_topo.isna().any(axis=1)
display(df_stakes_topo[rows_with_nan])

# Disabled step: drop 3 rows where consensus_ice_thickness is NaN
#df_stakes_topo_dropped = df_stakes_topo.dropna(subset=['consensus_ice_thickness'])

In [None]:
# Create new POINT_ID column: <GLACIER>_<YEAR>_<PERIOD>_<ID>
# NOTE(review): YEAR is cast with astype(str) as-is; if the column is float
# the ID will contain e.g. '1997.0' -- confirm this is acceptable.
point_id_parts = [
    df_stakes_topo['GLACIER'],
    df_stakes_topo['YEAR'].astype(str),
    df_stakes_topo['PERIOD'].astype(str),
    df_stakes_topo['ID'].astype(str),
]
point_id = point_id_parts[0]
for part in point_id_parts[1:]:
    point_id = point_id + '_' + part
df_stakes_topo['POINT_ID'] = point_id

# The stake ID is now part of POINT_ID, so the separate column is dropped
df_stakes_topo = df_stakes_topo.drop(columns=['ID'])

display(df_stakes_topo.head(2))

In [None]:
# Check for NaN: show every row with at least one missing value
rows_with_nan = df_stakes_topo.isna().any(axis=1)
display(df_stakes_topo[rows_with_nan])

### Merge close stakes:

In [None]:
from tqdm import tqdm

# remove_close_points is applied per glacier rather than on the whole frame.
# The cleaned frames are collected in a list and concatenated once, which
# avoids re-copying the growing result on every iteration.
cleaned_per_glacier = []
for gl in tqdm(df_stakes_topo.GLACIER.unique(), desc='Merging stakes'):
    print(f'-- {gl.capitalize()}:')
    df_gl = df_stakes_topo[df_stakes_topo.GLACIER == gl]
    cleaned_per_glacier.append(remove_close_points(df_gl))

# Drop the 'x' / 'y' columns (expected in the helper's output) and renumber
# the rows from 0.
df_pmb_topo = (pd.concat(cleaned_per_glacier).drop(
    columns=['x', 'y']).reset_index(drop=True))

### Check for wrong elevation:

In [None]:
# Run the project helper that flags elevation mismatches, with threshold=400.
# NOTE(review): presumably compares the stake elevation against a DEM-based
# elevation; the unit of the threshold is not visible here -- confirm.
# NOTE(review): df_checked and df_bad are not used by any later cell visible
# in this notebook; the pipeline continues with df_pmb_topo.
df_checked, df_bad = flag_elevation_mismatch(df_pmb_topo, threshold=400)

### Add Skyview factor:

In [None]:
# Example of one svf file: plot the sky-view-factor grid of a single,
# hard-coded glacier from RGI region 06.
example_rgi = "RGI60-06.00002"

# read ds with svf
path_masked_xr = os.path.join(cfg.dataPath,
                              'RGI_v6/RGI_06_Iceland/xr_masked_grids/')

# os.path.join keeps the path valid whether or not path_masked_xr ends with a
# path separator.
xr.open_zarr(os.path.join(path_masked_xr, f'{example_rgi}.zarr')).svf.plot()

In [None]:
# Folder holding the masked per-glacier zarr grids
path_masked_xr = os.path.join(cfg.dataPath,
                              "RGI_v6/RGI_06_Iceland/xr_masked_grids")

# Column / variable names handed to the SVF helper
svf_lookup_options = {
    "rgi_col": "RGIId",
    "lon_col": "POINT_LON",
    "lat_col": "POINT_LAT",
    "svf_var": "svf",
    "out_col": "svf",
}
df_pmb_topo_svf = add_svf_from_rgi_zarr(df_pmb_topo, path_masked_xr,
                                        **svf_lookup_options)

# Report the stakes that did not get an SVF value
svf_is_missing = df_pmb_topo_svf["svf"].isna()
df_missing = df_pmb_topo_svf[svf_is_missing].copy()
print("Missing SVF points:", len(df_missing))
print("Glaciers affected:", sorted(df_missing["RGIId"].unique()))

In [None]:
# Diagnostic plot for the stakes that still have no SVF value after the
# direct lookup. With save_dir=None no output directory is passed.
# NOTE(review): the exact figures produced depend on the helper, which is not
# visible here -- presumably one map per affected glacier; confirm.
plot_missing_svf_for_all_glaciers(
    df_with_svf=df_pmb_topo_svf,
    path_masked_xr=path_masked_xr,
    plot_valid_points=True,
    save_dir=
    None  # or e.g. os.path.join(cfg.dataPath, "diagnostics/svf_missing")
)

In [None]:
# Second SVF pass with the nearest-valid helper.
# NOTE(review): the input is df_pmb_topo (without the 'svf' column from the
# first pass), not df_pmb_topo_svf -- presumably the helper computes the
# column from scratch; confirm.
nearest_valid_options = dict(
    rgi_col="RGIId",
    lon_col="POINT_LON",
    lat_col="POINT_LAT",
    svf_var="svf",
    out_col="svf",
    max_radius=30,  # ~30 grid cells search; adjust if needed
)
df_pmb_topo_svf_new = add_svf_nearest_valid(df_pmb_topo, path_masked_xr,
                                            **nearest_valid_options)

n_still_missing = df_pmb_topo_svf_new["svf"].isna().sum()
print("Missing SVF points after nearest-valid fill:", n_still_missing)

# Same diagnostic plot as before, now on the filled dataframe
plot_missing_svf_for_all_glaciers(
    df_with_svf=df_pmb_topo_svf_new,
    path_masked_xr=path_masked_xr,
    plot_valid_points=True,
    save_dir=None,  # or e.g. os.path.join(cfg.dataPath, "diagnostics/svf_missing")
)

### Give new stake IDs:

In [None]:
# Give the stakes new IDs (project helper; named after elevation ordering)
df_pmb_new_ids = rename_stakes_by_elevation(df_pmb_topo_svf_new)

# Check the condition
check_point_ids_contain_glacier(df_pmb_new_ids)

# Sample counts in total and per period
is_annual = df_pmb_new_ids.PERIOD == 'annual'
is_winter = df_pmb_new_ids.PERIOD == 'winter'
print('Number of winter and annual samples:', len(df_pmb_new_ids))
print('Number of annual samples:', int(is_annual.sum()))
print('Number of winter samples:', int(is_winter.sum()))

# Histogram of mass balance
balance_ax = df_pmb_new_ids['POINT_BALANCE'].hist(bins=20)
balance_ax.set_xlabel('Mass balance [m w.e.]')

### Final cleaning:

In [None]:
# Work on a copy so df_pmb_new_ids stays untouched
df_pmb_clean = df_pmb_new_ids.copy()

# For both date columns: ensure YYYYMMDD format (left-pad with zeros to 8
# characters), then extract the month (characters 4-5) into its own column
date_to_month_column = (("FROM_DATE", "MONTH_START"), ("TO_DATE", "MONTH_END"))
for date_col, month_col in date_to_month_column:
    padded_dates = df_pmb_clean[date_col].astype(str).str.zfill(8)
    df_pmb_clean[date_col] = padded_dates
    df_pmb_clean[month_col] = df_pmb_clean[date_col].str[4:6]

def print_months(df, label):
    """Print the unique start and end months of winter and annual rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``PERIOD``, ``MONTH_START`` and ``MONTH_END``.
        Rows are selected by exact match of ``PERIOD`` with ``"winter"`` or
        ``"annual"``.
    label : str
        Heading printed (after a blank line) before the month listing.

    Returns
    -------
    None
        The listing is written to standard output only.
    """
    sections = (
        ("Winter measurement months:", "winter"),
        ("\nAnnual measurement months:", "annual"),
    )

    print(f"\n{label}")
    for heading, period in sections:
        rows = df[df.PERIOD == period]
        start_months = sorted(rows["MONTH_START"].unique())
        end_months = sorted(rows["MONTH_END"].unique())
        print(heading)
        print("  Unique start months:", start_months)
        print("  Unique end months:  ", end_months)

# --- Before filtering ---
print_months(df_pmb_clean, "Before filtering")


def _is_winter_row(frame):
    """Boolean mask of rows whose PERIOD is 'winter' (case/space-insensitive)."""
    return frame["PERIOD"].astype(str).str.strip().str.lower().eq("winter")


# -----------------------
# Filtering masks (define + count BEFORE filtering)
# -----------------------
# Months that are rejected as start or end month, regardless of the period
bad_months = {"07", "12", "01"}

starts_in_bad_month = df_pmb_clean["MONTH_START"].isin(bad_months)
ends_in_bad_month = df_pmb_clean["MONTH_END"].isin(bad_months)
mask_bad_months_all = starts_in_bad_month | ends_in_bad_month

# Winter rows that end in August
mask_bad_winter_aug = _is_winter_row(df_pmb_clean) & df_pmb_clean[
    "MONTH_END"].eq("08")

mask_remove = mask_bad_months_all | mask_bad_winter_aug

# counts (on original df)
n_total_removed = int(mask_remove.sum())
n_bad_months = int(mask_bad_months_all.sum())
n_winter_aug = int(mask_bad_winter_aug.sum())
n_overlap = int((mask_bad_months_all & mask_bad_winter_aug).sum())
n_bad_months_only = int((mask_bad_months_all & ~mask_bad_winter_aug).sum())
n_winter_aug_only = int((mask_bad_winter_aug & ~mask_bad_months_all).sum())

# Apply removal
df_pmb_clean = df_pmb_clean.loc[~mask_remove].copy()

# --- Correct mislabeled winter MB ---
# Winter rows that end in June with a negative balance are relabeled as
# annual; the mask is built on the already filtered dataframe.
mask_fix = (_is_winter_row(df_pmb_clean)
            & df_pmb_clean["MONTH_END"].eq("06")
            & df_pmb_clean["POINT_BALANCE"].lt(0))
n_relabel = int(mask_fix.sum())
df_pmb_clean.loc[mask_fix, "PERIOD"] = "annual"

summary = (f"\nRemoved {n_total_removed} rows total.\n"
           f"  - bad-month rows removed: {n_bad_months}\n"
           f"  - winter-end-08 rows removed: {n_winter_aug}\n"
           f"  - overlap (counted in both above): {n_overlap}\n"
           f"  - bad-month only: {n_bad_months_only}\n"
           f"  - winter-end-08 only: {n_winter_aug_only}\n"
           f"Relabeled winter -> annual: {n_relabel}")
print(summary)

print_months(df_pmb_clean, "After filtering + relabeling")

In [None]:
# Save to csv (row index is not written):
output_csv = os.path.join(cfg.dataPath, path_PMB_WGMS_csv,
                          'ICE_wgms_dataset_all.csv')
df_pmb_clean.to_csv(output_csv, index=False)

# Histogram of mass balance of the cleaned dataset
balance_ax = df_pmb_clean['POINT_BALANCE'].hist(bins=20)
balance_ax.set_xlabel('Mass balance [m w.e.]')