# Pre-processing of GLAMOS MB data:

Does the pre-processing of the point MB measurements from GLAMOS (winter and summer).

# Point Mass Balance data:

## Setting up:

In [None]:
import os, sys
sys.path.append(os.path.join(os.getcwd(), '../../')) # Add root of repo to import MBM
import matplotlib as mpl

import pandas as pd
import warnings
from shapely.geometry import Point
import pyproj
import matplotlib.pyplot as plt
import seaborn as sns
import xarray as xr
from cmcrameri import cm
from pathlib import Path

# NOTE: this silences every warning category for the rest of the session
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

# Imported after the repository root has been appended to sys.path
import massbalancemachine as mbm
# Configuration object used throughout the notebook (cfg.dataPath, cfg.seed)
cfg = mbm.SwitzerlandConfig()

In [None]:
# Project helpers are star-imported, so the path constants and processing
# functions used in the cells below come from these modules.
from scripts.utils import *
from scripts.glamos import *
from scripts.geo_data import *
from scripts.oggm import *
from scripts.config_CH import *
from scripts.plotting import *

# Seed with the configured value and switch to the MBM plotting style
mbm.utils.seed_all(cfg.seed)
mbm.plots.use_mbm_style()
# NOTE(review): `torch` is not imported explicitly in this notebook;
# presumably it is re-exported by one of the star-imports above — confirm.
if torch.cuda.is_available():
    print("CUDA is available")
    mbm.utils.free_up_cuda()
else:
    print("CUDA is NOT available")

## Transform .dat files to .csv:

Transform the seasonal and winter PMB .dat files to .csv for simplicity. 

In [None]:
# Convert the GLAMOS point mass balance .dat files to .csv
process_pmb_dat_files(cfg)

##  Assemble measurement periods:
### Annual measurements: 
Process annual measurements and put all stakes into one csv file

In [None]:
# Process the annual stake measurements found under the annual csv path
df_annual_raw = process_annual_stake_data(cfg.dataPath + path_PMB_GLAMOS_csv_a)
# Display the first two rows
df_annual_raw.head(2)

### Winter measurements:
For each point in the annual measurements, take the winter measurement that was taken closest to it:

In [None]:
# Process the winter measurements using the annual stakes as reference.
# Arguments: annual DataFrame, winter csv path, and the "clean" winter csv
# path (presumably the output location — confirm against the helper).
process_winter_stake_data(df_annual_raw, cfg.dataPath + path_PMB_GLAMOS_csv_w,
                          cfg.dataPath + path_PMB_GLAMOS_csv_w_clean)

### Assemble both periods:

In [None]:
# Combine the annual stakes with the cleaned winter measurements
df_all_raw = assemble_all_stake_data(
    df_annual_raw,
    cfg.dataPath + path_PMB_GLAMOS_csv_w_clean,
    cfg.dataPath + path_PMB_GLAMOS_csv,
)

# Plot: number of measurements per year, stacked by period
df_measurements_per_year = (
    df_all_raw.groupby(['YEAR', 'PERIOD']).size().unstack())
ax = df_measurements_per_year.plot(
    kind='bar',
    stacked=True,
    figsize=(20, 5),
    color=[mbm.plots.COLOR_ANNUAL, mbm.plots.COLOR_WINTER],
)
ax.set_title('Number of measurements per year for all glaciers')
ax.set_ylabel('Number of Measurements')
ax.set_xlabel('Year')
ax.legend(title='Period')
plt.tight_layout()
plt.show()

## Add RGI IDs:

For each PMB measurement, we want to add the RGI ID (v6) of the shapefile it belongs to. 

In [None]:
# Attach to every measurement the RGIId found via the RGI outlines
df_pmb = add_rgi_ids_to_df(df_all_raw, cfg.dataPath + path_rgi_outlines)

msg_single_rgi = "-- All glaciers are correctly associated with a single RGIId."

# One row per (glacier, RGIId) pair; more than one row per glacier is a problem
rgiids6 = df_pmb[['GLACIER', 'RGIId']].drop_duplicates()
if not check_multiple_rgi_ids(rgiids6):
    print(msg_single_rgi)
    df_pmb_clean = df_pmb
else:
    print(
        "-- Alert: The following glaciers have more than one RGIId. Cleaning up."
    )
    df_pmb_clean = clean_rgi_ids(df_pmb.copy()).reset_index(drop=True)

    # Re-check after the clean-up
    rgiids6_clean = df_pmb_clean[['GLACIER', 'RGIId']].drop_duplicates()
    still_ambiguous = check_multiple_rgi_ids(rgiids6_clean)
    print("-- Error: Some glaciers still have more than one RGIId."
          if still_ambiguous else msg_single_rgi)

## Cut from 1951:

In [None]:
# Keep measurements after 1950 (MS data start in 1951, ERA5-Land in 1950),
# ordered by glacier and then by year.
df_pmb_50s = df_pmb_clean.loc[df_pmb_clean['YEAR'] > 1950].sort_values(
    by=['GLACIER', 'YEAR'], ascending=True)

# Convert the point balance from mm w.e. to m w.e.
df_pmb_50s['POINT_BALANCE'] = df_pmb_50s['POINT_BALANCE'].div(1000)

# ClaridenL and ClaridenU are merged into one glacier:
is_clariden_part = df_pmb_50s['GLACIER'].isin(['claridenU', 'claridenL'])
df_pmb_50s.loc[is_clariden_part, 'GLACIER'] = 'clariden'

print('Number of winter and annual samples:', len(df_pmb_50s))
print('Number of annual samples:', (df_pmb_50s['PERIOD'] == 'annual').sum())
print('Number of winter samples:', (df_pmb_50s['PERIOD'] == 'winter').sum())

fig, axs = plt.subplots(2, 1, figsize=(20, 15))

# Top panel: measurements per year, stacked by period
ax = axs[0]
counts_per_year = df_pmb_50s.groupby(['YEAR', 'PERIOD']).size().unstack()
counts_per_year.plot(
    kind='bar',
    stacked=True,
    color=[mbm.plots.COLOR_ANNUAL, mbm.plots.COLOR_WINTER],
    ax=ax,
)
ax.set_title('Number of measurements per year for all glaciers')

# Bottom panel: total measurements per glacier, smallest first
ax = axs[1]
num_gl = df_pmb_50s.groupby(['GLACIER']).size().sort_values()
num_gl.plot(kind='bar', ax=ax)
ax.set_title('Number of total measurements per glacier since 1951')
plt.tight_layout()

### Merge stakes that are close: 
Especially with winter probes, a lot of measurements were done at the same place in the raw data and this leads to noise. We merge the stakes that are very close and keep the mean of the measurement.


In [None]:
# Merge stakes that lie very close to each other, one glacier at a time.
# The per-glacier results are collected in a list and concatenated once:
# concatenating inside the loop re-copies the growing DataFrame on every
# iteration and starts from an empty DataFrame.
cleaned_per_glacier = []
for gl in tqdm(df_pmb_50s.GLACIER.unique(), desc='Merging stakes'):
    print(f'-- {gl.capitalize()}:')
    df_gl = df_pmb_50s[df_pmb_50s.GLACIER == gl]
    df_gl_cleaned = remove_close_points(df_gl)
    cleaned_per_glacier.append(df_gl_cleaned)
df_pmb_50s_clean_pts = pd.concat(cleaned_per_glacier)
# The 'x'/'y' columns are not needed downstream (presumably projected
# coordinates added by remove_close_points — confirm).
df_pmb_50s_clean_pts.drop(['x', 'y'], axis=1, inplace=True)

### Correct for wrong elevations:
Some PMB data are stored with wrong elevations. We compare them to the DEMs; points that deviate by more than a threshold are corrected (Aletsch) or dropped (all other glaciers).

In [None]:
# Make a unique-index working copy: the per-glacier concatenation may leave
# repeated index labels, which is unsafe for any index-based row selection.
df_clean = df_pmb_50s_clean_pts.reset_index(drop=True).copy()
print("Initial number of rows:", len(df_clean))

path_xr_grids = os.path.join(cfg.dataPath, path_GLAMOS_topo,
                             "xr_masked_grids/")

# Compare stake elevations against the yearly DEM grids. The unique-index
# copy built above is what gets reconciled (previously the un-reset frame
# was passed and the copy was never used).
df_clean, df_mismatch, summary = reconcile_points_by_year(
    df=df_clean,
    path_xr_grids=path_xr_grids,
    var_name="masked_elev",
    lon_name="lon",
    lat_name="lat",
    year_col="YEAR",
    glacier_col="GLACIER",
    point_elev_col="POINT_ELEVATION",
    threshold=400.0,
    file_pattern="{glacier}_{year}.zarr",
    replace_glaciers={"aletsch"},  # replace for Aletsch, drop for others
    strict=False,
    verbose=True,  # prints counts per glacier
)

print("Final number of rows:", len(df_clean))

# Save mismatches to CSV, largest elevation difference first
out_csv = os.path.join(cfg.dataPath, path_PMB_GLAMOS_csv,
                       "GLAMOS_elev_mismatch.csv")
df_mismatch.sort_values(by="elev_diff", ascending=False, inplace=True)
df_mismatch.to_csv(out_csv, index=False)
print("Saved mismatches to:", out_csv)

# df_clean is the final cleaned dataframe (all glaciers, mismatches handled)
df_pmb_50s_clean_elv = df_clean

# reset_index
df_pmb_50s_clean_elv.reset_index(drop=True, inplace=True)

# Save intermediate output (f-string so that the target folder is printed)
print(f'Saving intermediate output df_pmb_50s.csv to {path_PMB_GLAMOS_csv}')
df_pmb_50s_clean_elv.to_csv(os.path.join(cfg.dataPath, path_PMB_GLAMOS_csv,
                                         'df_pmb_50s.csv'),
                            index=False)
# Coordinates-only export of the same rows
df_pmb_50s_clean_elv[[
    'GLACIER', 'POINT_ID', 'POINT_LAT', 'POINT_LON', 'PERIOD'
]].to_csv(os.path.join(cfg.dataPath, path_PMB_GLAMOS_csv,
                       'coordinate_50s.csv'),
          index=False)

In [None]:
# Example: inspect the elevation mismatches of a single glacier
glacier_name = 'rhone'
df_clean = df_pmb_50s_clean_pts.reset_index(drop=True).copy()
df_first = first_year_per_glacier(path_xr_grids)

df_gl = df_clean[(df_clean.GLACIER == glacier_name)]
# Grid stored under `first_year_path` for this glacier
ds = xr.open_zarr(
    df_first[df_first.glacier == glacier_name].first_year_path.values[0])

# Plain float: a trailing comma here would turn the threshold into a 1-tuple
threshold = 400.0
mismatch_idx, df_with_diffs = find_mismatch_by_year(
    df_gl=df_gl,  # must include GLACIER and YEAR columns
    path_xr_grids=path_xr_grids,
    var_name="masked_elev",
    lon_name="lon",
    lat_name="lat",
    year_col="YEAR",  # change if your year column is named differently
    glacier_col="GLACIER",
    threshold=threshold,  # meters
    file_pattern="{glacier}_{year}.zarr",  # e.g. "aletsch_1951.zarr"
    strict=False,
)
print(
    f"Number of POINT indices with >={threshold} m mismatch: {len(mismatch_idx)}"
)


def pick_ann_file(cfg, glacier_name, year, period="annual"):
    """Locate the fixed-date GLAMOS grid file of a glacier for one year.

    Looks in ``<cfg.dataPath>/<path_distributed_MB_glamos>/GLAMOS/<glacier_name>``
    for ``<year>_<suffix>_fix_lv95.grid`` first and
    ``<year>_<suffix>_fix_lv03.grid`` second, where the suffix is ``ann`` for
    the annual period and ``win`` for the winter period.

    Args:
        cfg: Configuration object exposing ``dataPath``.
        glacier_name: Name of the glacier folder.
        year: Year used as the file-name prefix.
        period: Either ``"annual"`` or ``"winter"``.

    Returns:
        ``(path, coord_system)`` with ``coord_system`` being ``"lv95"`` or
        ``"lv03"``, or ``(None, None)`` when neither file exists.

    Raises:
        ValueError: If ``period`` is neither ``"annual"`` nor ``"winter"``.
    """
    suffixes = {"annual": "ann", "winter": "win"}
    # Validate first: an unknown period used to leave `suffix` unbound
    if period not in suffixes:
        raise ValueError(
            f"period must be 'annual' or 'winter', got {period!r}")
    suffix = suffixes[period]

    base = os.path.join(cfg.dataPath, path_distributed_MB_glamos, "GLAMOS",
                        glacier_name)
    # lv95 takes precedence over lv03 when both files exist
    for coord_system in ("lv95", "lv03"):
        candidate = os.path.join(base,
                                 f"{year}_{suffix}_fix_{coord_system}.grid")
        if os.path.exists(candidate):
            return candidate, coord_system
    return None, None


# First year listed for the example glacier
year = df_first[df_first.glacier == glacier_name].first_year.values[0]
period = 'annual'
file_ann, coord_system = pick_ann_file(cfg, glacier_name, year, period)
if file_ann is None:
    raise FileNotFoundError(
        f"No GLAMOS {period} grid found for {glacier_name} in {year}")
# pick_ann_file already returns the complete path. Joining it onto the base
# directory again only works when cfg.dataPath is absolute.
grid_path_ann = file_ann

# Load GLAMOS data and convert to WGS84
metadata_ann, grid_data_ann = load_grid_file(grid_path_ann)
ds_glamos_ann = convert_to_xarray_geodata(grid_data_ann, metadata_ann)
if coord_system == "lv03":
    ds_glamos_wgs84_ann = transform_xarray_coords_lv03_to_wgs84(ds_glamos_ann)
else:  # "lv95", the only other system pick_ann_file returns with a path
    ds_glamos_wgs84_ann = transform_xarray_coords_lv95_to_wgs84(ds_glamos_ann)

figure = plt.figure(figsize=(20, 6))

# Shared normalization across both plots
vmin = min(df_with_diffs["POINT_ELEVATION"].min(), float(ds.masked_elev.min()))
vmax = max(df_with_diffs["POINT_ELEVATION"].max(), float(ds.masked_elev.max()))
norm = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
cmap = plt.cm.terrain

# ---- First subplot ----
ax1 = plt.subplot(1, 2, 1)
ds_glamos_wgs84_ann.plot.imshow(
    ax=ax1,
    cmap="Greys",
    cbar_kwargs={"label": "Mass Balance [m w.e.]"},
)

# scatter using same cmap + norm
sc = ax1.scatter(
    df_with_diffs["POINT_LON"],
    df_with_diffs["POINT_LAT"],
    c=df_with_diffs["POINT_ELEVATION"],
    cmap=cmap,
    norm=norm,
    s=25,
)
ax1.set_title(f"{glacier_name.capitalize()} {year} GLAMOS glacier-wide MB")

# ---- Second subplot ----
ax2 = plt.subplot(1, 2, 2)
im = ds.masked_elev.plot(
    ax=ax2,
    cmap=cmap,
    norm=norm,
    add_colorbar=False,  # don't add duplicate colorbar
)
ax2.set_title(f"{glacier_name.capitalize()} {year} DEM")

# ---- Shared colorbar for elevation ----
cbar = figure.colorbar(
    mpl.cm.ScalarMappable(norm=norm, cmap=cmap),
    ax=ax2,
    orientation="vertical",
    fraction=0.02,
    pad=0.02,
)
cbar.set_label("Elevation [m a.s.l.]")

plt.tight_layout()

df_with_diffs.head(10)

### Barplots:

In [None]:
fig, axs = plt.subplots(2, 1, figsize=(20, 15))

# Top panel: number of measurements per year, stacked by period
ax = axs[0]
counts_per_year = (
    df_pmb_50s_clean_elv.groupby(['YEAR', 'PERIOD']).size().unstack())
counts_per_year.plot(
    kind='bar',
    stacked=True,
    color=[mbm.plots.COLOR_ANNUAL, mbm.plots.COLOR_WINTER],
    ax=ax,
)
ax.set_title('Number of measurements per year for all glaciers')

# Bottom panel: total number of measurements per glacier, smallest first
ax = axs[1]
num_gl = df_pmb_50s_clean_elv.groupby(['GLACIER']).size().sort_values()
num_gl.plot(kind='bar', ax=ax)
ax.set_title('Number of total measurements per glacier since 1951')
plt.tight_layout()

In [None]:
# Alphabetically sorted names of the glaciers left after the elevation check
glacier_list = sorted(df_pmb_50s_clean_elv.GLACIER.unique())
print('Number of glaciers:', len(glacier_list))
glacier_list

In [None]:
# Number of measurements per glacier per year (one column per period):
num_gl_yr = df_pmb_50s_clean_elv.groupby(['GLACIER', 'YEAR', 'PERIOD'
                                          ]).size().unstack().reset_index()

# Number of annual measurements per glacier
num_gl_annual = df_pmb_50s_clean_elv[
    df_pmb_50s_clean_elv.PERIOD == 'annual'].groupby(['GLACIER'
                                                      ]).size().sort_values()

# Plot one glacier per row, limited to glaciers with more than 250 annual
# measurements:
big_gl = num_gl_annual[num_gl_annual > 250].index.sort_values()
num_glaciers = len(big_gl)
# squeeze=False always yields an array of Axes, so indexing also works when
# only a single glacier passes the filter.
fig, ax = plt.subplots(num_glaciers,
                       1,
                       figsize=(15, 5 * num_glaciers),
                       squeeze=False)
ax = ax.ravel()
for i, gl in enumerate(big_gl):
    num_gl_yr[num_gl_yr.GLACIER == gl].plot(x='YEAR',
                                            kind='bar',
                                            stacked=True,
                                            ax=ax[i],
                                            title=gl)
    ax[i].set_ylabel('Number of measurements')

In [None]:
# Sample counts in total and per period
is_annual = df_pmb_50s_clean_elv['PERIOD'] == 'annual'
is_winter = df_pmb_50s_clean_elv['PERIOD'] == 'winter'
print('Number of winter and annual samples:', len(df_pmb_50s_clean_elv))
print('Number of annual samples:', is_annual.sum())
print('Number of winter samples:', is_winter.sum())
# Unique glaciers, sorted
glacier_list = sorted(df_pmb_50s_clean_elv['GLACIER'].unique())
print(f"Number of glaciers: {len(glacier_list)}")
print(f"Glaciers: {glacier_list}")

## Add topographical information from OGGM & SGI:

### OGGM data:

In [None]:
# Reload the intermediate stake table written after the elevation check
path_pmb_50s_csv = cfg.dataPath + path_PMB_GLAMOS_csv + 'df_pmb_50s.csv'
df_pmb_50s_clean = pd.read_csv(path_pmb_50s_csv)

# Glacier directories for RGI region 11, version 62, from the OGGM server
oggm_base_url = "https://cluster.klima.uni-bremen.de/~oggm/gdirs/oggm_v1.6/L1-L2_files/2025.6/elev_bands_w_data/"
gdirs, rgidf = initialize_oggm_glacier_directories(
    cfg,
    rgi_region="11",
    rgi_version="62",
    base_url=oggm_base_url,
    log_level='WARNING',
    task_list=None,
)
# RGI IDs present in the stake data (not used further in this cell)
unique_rgis = df_pmb_50s_clean['RGIId'].unique()

export_oggm_grids(cfg, gdirs)

# Merge the stake table with the OGGM data of the glacier directories
df_pmb_topo = merge_pmb_with_oggm_data(
    df_pmb=df_pmb_50s_clean,
    gdirs=gdirs,
    rgi_region="11",
    rgi_version="62",
)

In [None]:
# Keep only stakes flagged as lying within the glacier shape, then drop the
# flag column itself.
inside_shape = df_pmb_topo['within_glacier_shape']
df_pmb_topo = df_pmb_topo.loc[inside_shape].drop(
    columns=['within_glacier_shape'])

print('Number of winter and annual samples:', len(df_pmb_topo))
print('Number of annual samples:', (df_pmb_topo['PERIOD'] == 'annual').sum())
print('Number of winter samples:', (df_pmb_topo['PERIOD'] == 'winter').sum())
# Unique glaciers, sorted
glacier_list = sorted(df_pmb_topo['GLACIER'].unique())
print(f"Number of glaciers: {len(glacier_list)}")
print(f"Glaciers: {glacier_list}")

In [None]:
# Save the OGGM-merged table. The message is an f-string that names the file
# actually written, so the printed text matches what happens on disk.
print(
    f'Saving intermediate output df_pmb_oggm_intermediate.csv to {path_PMB_GLAMOS_csv}'
)
df_pmb_topo.to_csv(os.path.join(cfg.dataPath, path_PMB_GLAMOS_csv,
                                'df_pmb_oggm_intermediate.csv'),
                   index=False)

### SGI data:

In [None]:
# Reload the OGGM-merged stake table from the intermediate csv, so the SGI
# steps below can be run without repeating the OGGM merge.
df_pmb_topo = pd.read_csv(
    os.path.join(cfg.dataPath, path_PMB_GLAMOS_csv,
                 'df_pmb_oggm_intermediate.csv'))

In [None]:
# First create the masked topographical arrays per glacier:
glacier_list = sorted(df_pmb_topo.GLACIER.unique())
create_sgi_topo_masks(cfg,
                      glacier_list,
                      type='glacier_name',
                      path_save=os.path.join(cfg.dataPath, path_SGI_topo,
                                             'xr_masked_grids/'),
                      path_xr_svf=os.path.join(
                          cfg.dataPath, "GLAMOS/topo/SGI2020/svf_nc_latlon/"))

In [None]:
# Example: masked SGI grids of one glacier with its stakes on top
glacier_name = 'adler'
df_pmb_gl = df_pmb_50s_clean[df_pmb_50s_clean.GLACIER == glacier_name]

# (lon, lat) pairs of the stakes
stake_coordinates = df_pmb_gl[['POINT_LON', 'POINT_LAT']].values

# Open SGI grid:
ds_sgi = xr.open_dataset(
    os.path.join(cfg.dataPath, path_SGI_topo, 'xr_masked_grids/',
                 f'{glacier_name}.zarr'))

# Plot the masked data
fig, axs = plt.subplots(1, 4, figsize=(15, 6))
ds_sgi.masked_aspect.plot(ax=axs[0], cmap='twilight_shifted')
ds_sgi.masked_slope.plot(ax=axs[1], cmap='cividis')
ds_sgi.masked_elev.plot(ax=axs[2], cmap='terrain')
ds_sgi.glacier_mask.plot(ax=axs[3], cmap='binary')
axs[3].scatter(stake_coordinates[:, 0], stake_coordinates[:, 1], c='r', s=10)
axs[0].set_title("Aspect")
axs[1].set_title("Slope")
# This panel shows masked_elev, so it is titled accordingly (was "SVF")
axs[2].set_title("Elevation")
axs[3].set_title("Glacier mask")
plt.tight_layout()

In [None]:
# Folder holding the masked SGI grids
path_masked_grids = os.path.join(cfg.dataPath, path_SGI_topo,
                                 'xr_masked_grids/')

# Variables of interest taken from the SGI grids
sgi_variables = [
    "masked_aspect",
    "masked_slope",
    "masked_elev",
    "masked_svf",
    "masked_asvf",
    "masked_opns",
]

# Merge PMB (OGGM-merged stake table) with SGI data
df_pmb_sgi = merge_pmb_with_sgi_data(df_pmb_topo,
                                     path_masked_grids,
                                     voi=sgi_variables)

# Strip the 'masked_' prefix from three of the columns
short_names = {
    'masked_svf': 'svf',
    'masked_asvf': 'asvf',
    'masked_opns': 'opns',
}
df_pmb_sgi.rename(columns=short_names, inplace=True)

# Drop points that have no intersection with SGI mask (have NaN values).
# NOTE(review): dropna() removes rows with a NaN in *any* column, not only
# the SGI ones — confirm this is intended.
df_pmb_sgi = df_pmb_sgi.dropna()
df_pmb_sgi.head(2)

In [None]:
# Total number of samples
n_samples = len(df_pmb_sgi)
print(f"Total number of winter and annual samples: {n_samples}")

# Samples per period (0 when a period is absent)
period_counts = df_pmb_sgi['PERIOD'].value_counts()
n_annual = period_counts.get('annual', 0)
n_winter = period_counts.get('winter', 0)
print(f"Number of annual samples: {n_annual}")
print(f"Number of winter samples: {n_winter}")

# Years covered, ascending
unique_years = np.sort(df_pmb_sgi['YEAR'].unique())
print(f"Unique years: {unique_years}")

# Glaciers covered, alphabetical
glacier_list = sorted(df_pmb_sgi['GLACIER'].unique())
print(f"Number of glaciers: {len(glacier_list)}")
print(f"Glaciers: {glacier_list}")

In [None]:
# Example: stakes of one glacier on top of the OGGM and SGI glacier masks
glacierName = 'clariden'
# Stakes of that glacier; explicit copy because columns are added below
df_stakes = df_pmb_topo[df_pmb_topo['GLACIER'] == glacierName].copy()
RGIId = df_stakes.RGIId.unique()[0]
print(RGIId)
# Open the OGGM grid stored for that RGI ID
ds_oggm = xr.open_dataset(f'{cfg.dataPath}/OGGM/xr_grids/{RGIId}.zarr')

# Define the coordinate transformation
transf = pyproj.Transformer.from_proj(
    pyproj.CRS.from_user_input("EPSG:4326"),  # Input CRS (WGS84)
    pyproj.CRS.from_user_input(ds_oggm.pyproj_srs),  # Output CRS from dataset
    always_xy=True)

# Transform all stake coordinates to the CRS of the OGGM grid
lon, lat = df_stakes["POINT_LON"].values, df_stakes["POINT_LAT"].values
x_stake, y_stake = transf.transform(lon, lat)
df_stakes['x'] = x_stake
df_stakes['y'] = y_stake

# plot stakes (no `palette`: it has no effect without a `hue` variable)
plt.figure(figsize=(10, 5))
ax = plt.subplot(121)
ds_oggm.glacier_mask.plot(cmap='binary', ax=ax)
sns.scatterplot(data=df_stakes, x='x', y='y', ax=ax)
ax.set_title('Stakes on glacier OGGM')

ax = plt.subplot(122)
# Local name on purpose: rebinding `path_SGI_topo` to a path that already
# contains cfg.dataPath would break the cells that join it with cfg.dataPath
# when they are re-run.
sgi_topo_dir = f'{cfg.dataPath}/GLAMOS/topo/SGI2020/'
sgi_grid = xr.open_dataset(sgi_topo_dir +
                           f'xr_masked_grids/{glacierName}.zarr')
sgi_grid.glacier_mask.plot(cmap='binary', ax=ax)
sns.scatterplot(data=df_stakes, x='POINT_LON', y='POINT_LAT', ax=ax)
ax.set_title('Stakes on glacier SGI')

In [None]:
fig, axs = plt.subplots(2, 1, figsize=(20, 15))

# Top panel: number of measurements per year, stacked by period
ax = axs[0]
counts_per_year = df_pmb_sgi.groupby(['YEAR', 'PERIOD']).size().unstack()
counts_per_year.plot(
    kind='bar',
    stacked=True,
    color=[mbm.plots.COLOR_ANNUAL, mbm.plots.COLOR_WINTER],
    ax=ax,
)
ax.set_title('Number of measurements per year for all glaciers')

# Bottom panel: total number of measurements per glacier, smallest first
ax = axs[1]
num_gl = df_pmb_sgi.groupby(['GLACIER']).size().sort_values()
num_gl.plot(kind='bar', ax=ax)
ax.set_title('Number of total measurements per glacier since 1951')
plt.tight_layout()

### Example:


In [None]:
glacierName = 'clariden'
df_pmb_gl = df_pmb_sgi[df_pmb_sgi['GLACIER'] == glacierName]

# One panel per variable: OGGM value on x, SGI value on y
panels = [
    ('aspect', 'Aspect'),
    ('slope', 'Slope'),
    ('topo', 'Topo'),
]
fig, axs = plt.subplots(1, 3, figsize=(15, 6))
for ax, (var, title) in zip(axs, panels):
    ax.scatter(df_pmb_gl[var], df_pmb_gl[f'{var}_sgi'])
    ax.set_xlabel(f'{var} oggm')
    ax.set_ylabel(f'{var} sgi')
    ax.set_title(title)
    # 1:1 line spanning the current x-limits
    ax.plot(ax.get_xlim(), ax.get_xlim(), ls="--", c=".3")

plt.tight_layout()

## Give new stake IDs:
Give new stake IDs made of the glacier name followed by a number assigned according to elevation. This is needed because some stakes share the same ID across glaciers, which is not practical.

In [None]:
# Glaciers excluded from the dataset:
#   taelliboden - only one measurement
#   plainemorte - big outlier
excluded_glaciers = ['taelliboden', 'plainemorte']
df_pmb_sgi = df_pmb_sgi[~df_pmb_sgi['GLACIER'].isin(excluded_glaciers)]

# Assign the new stake IDs
df_pmb_sgi = rename_stakes_by_elevation(df_pmb_sgi)

# Check that the new POINT_IDs contain the glacier name
check_point_ids_contain_glacier(df_pmb_sgi)

print('Number of winter and annual samples:', len(df_pmb_sgi))
print('Number of annual samples:', (df_pmb_sgi['PERIOD'] == 'annual').sum())
print('Number of winter samples:', (df_pmb_sgi['PERIOD'] == 'winter').sum())

# Histogram of mass balance
df_pmb_sgi['POINT_BALANCE'].hist(bins=20)
plt.xlabel('Mass balance [m w.e.]')

## Final cleaning:

In [None]:
# Two-character month: characters 4-5 of the stringified start and end dates
df_pmb_sgi['MONTH_START'] = [str(date)[4:6] for date in df_pmb_sgi.FROM_DATE]
df_pmb_sgi['MONTH_END'] = [str(date)[4:6] for date in df_pmb_sgi.TO_DATE]

# Drop rows whose period starts in July
df_pmb_sgi = df_pmb_sgi[df_pmb_sgi['MONTH_START'] != '07']

# Drop annual rows whose period ends in June or November
annual_bad_end = ((df_pmb_sgi['PERIOD'] == 'annual') &
                  df_pmb_sgi['MONTH_END'].isin(['06', '11']))
df_pmb_sgi = df_pmb_sgi.loc[~annual_bad_end]

# Annual rows ending in May are relabelled as 'winter'
ends_in_may = df_pmb_sgi['MONTH_END'] == '05'
df_pmb_sgi.loc[ends_in_may & (df_pmb_sgi['PERIOD'] == 'annual'),
               'PERIOD'] = 'winter'

# Winter rows ending in August are relabelled as 'annual'
ends_in_august = df_pmb_sgi['MONTH_END'] == '08'
df_pmb_sgi.loc[ends_in_august & (df_pmb_sgi['PERIOD'] == 'winter'),
               'PERIOD'] = 'annual'

# Stakes with too much missing data.
# NOTE(review): 'schwarzberg_12' was listed twice originally — confirm that
# no other ID was intended for the second entry.
POINT_ID_to_drop = [
    'schwarzberg_12',
    'schwarzberg_13',
    'schwarzberg_6',
    'schwarzberg_5',
    'plattalva_7',
    'plattalva_11',
    'plattalva_10',
    'plattalva_1',
]
df_pmb_sgi = df_pmb_sgi[~df_pmb_sgi['POINT_ID'].isin(POINT_ID_to_drop)]

# Save to csv:
path_final_csv = cfg.dataPath + path_PMB_GLAMOS_csv + 'CH_wgms_dataset_all.csv'
df_pmb_sgi.to_csv(path_final_csv, index=False)


In [None]:
# Read the final dataset back and list the glaciers it contains
path_final_csv = cfg.dataPath + path_PMB_GLAMOS_csv + 'CH_wgms_dataset_all.csv'
df = pd.read_csv(path_final_csv)
df['GLACIER'].unique()

# Glacier wide MB:
Pre-processing of glacier-wide SMB data from GLAMOS. Transform .dat files to .csv.

In [None]:
# Convert the glacier-wide SMB .dat files from GLAMOS to .csv
process_SMB_GLAMOS(cfg)

In [None]:
# "obs" files: no fixed dates, the observed measurement periods are used.
# Example: read the Aletsch file and show its first two rows.
fileName = 'aletsch_obs.csv'
aletsch_csv = pd.read_csv(cfg.dataPath + path_SMB_GLAMOS_csv + 'obs/' +
                          fileName,
                          sep=',',
                          header=0,
                          encoding='latin-1')
aletsch_csv.head(2)

In [None]:
# "fix" files: fixed periods (hydrological year).
# Example: read the Aletsch file and show its first two rows.
fileName = 'aletsch_fix.csv'
aletsch_csv = pd.read_csv(cfg.dataPath + path_SMB_GLAMOS_csv + 'fix/' +
                          fileName,
                          sep=',',
                          header=0,
                          encoding='latin-1')
aletsch_csv.head(2)

# Potential incoming clear sky solar radiation:

Pre-process the GLAMOS data of "potential incoming clear sky solar radiation (pcsr)", which is used as a topographical variable. There is one grid per day and per glacier, for a single year that depends on the glacier.

In [None]:
# Set RUN to True to (re)process the raw pcsr data; skipped by default
RUN = False
if RUN:
    # Sorted entries of the raw pcsr folder
    glDirect = np.sort(os.listdir(cfg.dataPath + path_pcsr +
                                  'raw/'))  # Glaciers with data

    print('Number of glacier with clear sky radiation data:', len(glDirect))
    print('Glaciers with clear sky radiation data:', glDirect)

    process_pcsr(cfg)

In [None]:
# Read and plot one file: the Aletsch pcsr grids, one panel per time step
xr_file = xr.open_dataset(cfg.dataPath + path_pcsr + 'zarr/' +
                          'xr_direct_aletsch.zarr')
xr_file['grid_data'].plot(x='x', y='y', col='time', col_wrap=3)

In [None]:
# Number of entries in the raw pcsr folder
pcsr_glaciers = os.listdir(cfg.dataPath + path_pcsr + 'raw/')
len(pcsr_glaciers)

In [None]:
# Explicit import: `re` is not imported at the top of the notebook
import re

# Years available per glacier
geod_glaciers = [
    'schwarzbach', 'joeri', 'sanktanna', 'corvatsch', 'sexrouge', 'murtel',
    'plattalva', 'tortin', 'basodino', 'limmern', 'adler', 'hohlaub',
    'albigna', 'tsanfleuron', 'silvretta', 'oberaar', 'gries', 'clariden',
    'gietro', 'schwarzberg', 'forno', 'allalin', 'otemma', 'findelen', 'rhone',
    'morteratsch', 'corbassiere', 'gorner', 'aletsch'
]

base_dir = os.path.join(cfg.dataPath, path_pcsr, 'raw')

# glacier name -> sorted list of distinct years found in its file names
glacier_years = {}

for glacier_name in geod_glaciers:
    glacier_path = os.path.join(base_dir, glacier_name)
    if os.path.isdir(glacier_path):
        years = []
        for fname in os.listdir(glacier_path):
            match = re.search(r'(\d{4})', fname)  # look for a 4-digit year
            if match:
                years.append(int(match.group(1)))
        glacier_years[glacier_name] = sorted(set(years))

# One row per glacier. orient='index' pads shorter year lists with NaN, so
# glaciers with different numbers of years no longer raise a ValueError
# (padded columns become float).
pd.DataFrame.from_dict(
    glacier_years,
    orient='index').sort_values(by=0).reset_index().rename(columns={
        'index': 'glacier_name',
        0: 'pcsr year'
    }).to_csv('pcsr.csv')

In [None]:
# Show the year table with one row per glacier. orient='index' pads shorter
# year lists with NaN instead of failing when glaciers have different
# numbers of years.
pd.DataFrame.from_dict(
    glacier_years,
    orient='index').sort_values(by=0).reset_index().rename(columns={
        'index': 'glacier_name',
        0: 'pcsr year'
    })