## Setting Up:

In [None]:
import os, sys
sys.path.append(os.path.join(os.getcwd(), '../../')) # Add root of repo to import MBM

import pandas as pd
import os
import warnings
from tqdm.notebook import tqdm
import zipfile
import cdsapi
import numpy as np
import glob
import xarray as xr
import massbalancemachine as mbm

from regions.Svalbard.scripts.config_SVA import *

# Project configuration object; the cells below read cfg.dataPath from it.
# NOTE(review): the star-import above pulls from the Svalbard config module
# while the config class instantiated here is IcelandConfig — confirm that
# this pairing is intended for the combined download area.
cfg = mbm.IcelandConfig()

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# IPython autoreload extension; mode 2 reloads modules before running code.
%load_ext autoreload
%autoreload 2

## Download monthly ERA5-Land variables:

To download ERA5-Land data you need to configure your API client first. For this, follow these instructions: https://forum.ecmwf.int/t/step-by-step-instructions-on-how-to-download-data-using-new-climate-data-store-beta-cds-beta/3743. 

To summarize the instructions:
- Your existing CDS credentials will not work in CDS-Beta; you need an ECMWF account, which you can register here: https://www.ecmwf.int/.
- Once you have set up an account, make sure that you have accepted the licences in the Download form of the product page: https://cds.climate.copernicus.eu/datasets/reanalysis-era5-land-monthly-means?tab=download.
- Make sure that you are logged in to ECMWF, then navigate to the CDSAPI setup page and copy the content for your .cdsapirc file: https://cds.climate.copernicus.eu/how-to-api.

Once this is set up, you should be able to download the ERA5-Land data using our script. 

In [None]:
# Switch for the download cell: its body only executes when this is True.
RUN = True

# --------------------------------------------------
# Region settings -- define them here, and only here
# --------------------------------------------------
# Label embedded in the output filenames/tags.
AREA_NAME = "ICE_Alps"

# Bounding box, ordered [North, West, South, East]:
#   North 82  -> covers Svalbard safely
#   West -30  -> covers Iceland
#   South 42  -> covers the Alps
#   East  40  -> covers the east side of Svalbard
AREA = [82, -30, 42, 40]

# Optional: create a deterministic tag from the numeric area (nice for debugging)
def area_tag(area):
    """Return a deterministic, filename-friendly tag for a bounding box.

    Args:
        area: Sequence of exactly four values ordered North, West, South,
            East.

    Returns:
        A string shaped like ``N<n>_W<w>_S<s>_E<e>`` in which every ``-`` is
        replaced by ``m`` and every ``.`` by ``p``.

    Raises:
        ValueError: If ``area`` does not unpack into exactly four values.
    """
    north, west, south, east = area
    raw_tag = "N{}_W{}_S{}_E{}".format(north, west, south, east)
    # Single pass swapping the two characters that are awkward in filenames.
    return raw_tag.translate(str.maketrans({"-": "m", ".": "p"}))

# Helper: find the geopotential file after extraction (filename varies with area)
def find_geopot_nc(folder):
    """Locate the extracted geopotential netCDF file inside ``folder``.

    Glob patterns are tried in priority order: the specific
    ``geo.area-subset*.nc`` first, then the broader ``geo*.nc``. The first
    pattern that matches anything decides the result.

    Args:
        folder: Directory to search.

    Returns:
        Path of the matching file with the most recent modification time.

    Raises:
        FileNotFoundError: If neither pattern matches any file.
    """
    for pattern in ("geo.area-subset*.nc", "geo*.nc"):
        matches = sorted(glob.glob(os.path.join(folder, pattern)))
        if matches:
            # Several matches: keep the most recently modified one.
            return max(matches, key=os.path.getmtime)
    raise FileNotFoundError(
        f"No geopotential subset netCDF found in {folder}. "
        f"Looked for geo.area-subset*.nc / geo*.nc")


# Download the monthly ERA5-Land climate variables and the geopotential for
# AREA, convert both to the project's file names inside out_dir, then delete
# the intermediate zip / raw netCDF files. Skipped entirely when RUN is False.
if RUN:
    out_dir = os.path.join(cfg.dataPath, path_ERA5_raw)
    os.makedirs(out_dir, exist_ok=True)

    c = cdsapi.Client()

    # ----------------------------
    # Download climate variables
    # ----------------------------
    climate_zip = os.path.join(out_dir, "download.netcdf.zip")
    c.retrieve(
        "reanalysis-era5-land-monthly-means",
        {
            "product_type": ["monthly_averaged_reanalysis"],
            "variable": [
                "10m_u_component_of_wind",
                "10m_v_component_of_wind",
                "2m_temperature",
                "forecast_albedo",
                "snow_cover",
                "snow_density",
                "snow_depth_water_equivalent",
                "snowfall",
                "snowmelt",
                "surface_latent_heat_flux",
                "surface_net_thermal_radiation",
                "surface_sensible_heat_flux",
                "surface_solar_radiation_downwards",
                "total_precipitation",
            ],
            # "1950" ... "2025" inclusive, as strings.
            "year": [str(year) for year in range(1950, 2026)],
            # "01" ... "12", zero-padded.
            "month": [f"{month:02d}" for month in range(1, 13)],
            "time": ["00:00"],
            "data_format": "netcdf",
            "download_format": "zip",
            "area": AREA,
        },
        climate_zip,
    )
    with zipfile.ZipFile(climate_zip, "r") as z:
        z.extractall(out_dir)

    # ----------------------------
    # Download geopotential separately
    # ----------------------------
    geopot_zip = os.path.join(out_dir, "download_geopot.netcdf.zip")
    c.retrieve(
        "reanalysis-era5-land-monthly-means",
        {
            "variable": ["geopotential"],
            "data_format": "netcdf",
            "download_format": "zip",
            "area": AREA,
        },
        geopot_zip,
    )
    with zipfile.ZipFile(geopot_zip, "r") as z:
        z.extractall(out_dir)

    # ----------------------------
    # Correct / rename climate file
    # ----------------------------
    moda_path = os.path.join(out_dir, "data_stream-moda.nc")
    climate_out = os.path.join(out_dir,
                               f"era5_monthly_averaged_data_{AREA_NAME}.nc")
    # The context manager closes the raw file once the copy is written, so
    # the cleanup below never tries to delete a file that is still open.
    with xr.open_dataset(moda_path) as dc:
        # Rename only when the coordinate is present, so a file that already
        # uses "time" does not make the rename fail.
        if "valid_time" in dc.coords or "valid_time" in dc.dims:
            dc2 = dc.rename({"valid_time": "time"})  # keep compatibility
        else:
            dc2 = dc
        if os.path.exists(climate_out):
            os.remove(climate_out)
        dc2.to_netcdf(climate_out)

    # ----------------------------
    # Rename geopotential file (flexible input name)
    # ----------------------------
    geopot_in = find_geopot_nc(out_dir)
    geopot_out = os.path.join(out_dir,
                              f"era5_geopotential_pressure_{AREA_NAME}.nc")
    with xr.open_dataset(geopot_in) as dcg:
        if os.path.exists(geopot_out):
            os.remove(geopot_out)
        dcg.to_netcdf(geopot_out)

    # ----------------------------
    # Cleanup
    # ----------------------------
    # Only intermediates are removed; the two renamed outputs stay in out_dir.
    for p in [moda_path, climate_zip, geopot_zip, geopot_in]:
        if os.path.exists(p):
            os.remove(p)


In [None]:
# assumes you defined these once at the top:
# AREA_NAME = "ICE_Alps"
# out_dir = os.path.join(cfg.dataPath, path_ERA5_raw)

def find_geopot_nc(folder: str) -> str:
    """Return the newest extracted geopotential netCDF file in ``folder``.

    Files matching ``geo.area-subset*.nc`` are preferred; only when none
    exist is the broader ``geo*.nc`` pattern consulted.

    Args:
        folder: Directory to search.

    Returns:
        Path of the matching file with the latest modification time.

    Raises:
        FileNotFoundError: If no file matches either pattern.
    """
    preferred = sorted(glob.glob(os.path.join(folder, "geo.area-subset*.nc")))
    # An empty list is falsy, so `or` falls through to the broader pattern.
    matches = preferred or sorted(glob.glob(os.path.join(folder, "geo*.nc")))
    if matches:
        return max(matches, key=os.path.getmtime)
    raise FileNotFoundError(
        f"No geopotential subset netCDF found in {folder}. "
        f"Looked for geo.area-subset*.nc / geo*.nc"
    )

# Standalone post-processing of already-extracted ERA5-Land files: rename the
# time coordinate of the climate file, copy both files to the project's file
# names, then delete the intermediates. Raw inputs that are already gone are
# tolerated as long as the corresponding output file exists.

# Defined here as well so this cell also works when the download cell was
# skipped (RUN = False) and therefore never created out_dir.
out_dir = os.path.join(cfg.dataPath, path_ERA5_raw)

# ----------------------------
# Correct ERA5 climate data
# ----------------------------
moda_path = os.path.join(out_dir, "data_stream-moda.nc")
climate_out = os.path.join(out_dir, f"era5_monthly_averaged_data_{AREA_NAME}.nc")

if os.path.exists(moda_path):
    # The context manager closes the raw file once the copy is written, so
    # the cleanup below never tries to delete a file that is still open.
    with xr.open_dataset(moda_path) as dc:
        # Only rename if needed (avoids crashing if CDS changes again)
        if "valid_time" in dc.coords or "valid_time" in dc.dims:
            dc2 = dc.rename({"valid_time": "time"})
        else:
            dc2 = dc  # already compatible
        if os.path.exists(climate_out):
            os.remove(climate_out)
        dc2.to_netcdf(climate_out)
elif os.path.exists(climate_out):
    # Raw file already converted and cleaned up: nothing left to do.
    print(f"{moda_path} not found; keeping existing {climate_out}")
else:
    raise FileNotFoundError(
        f"Neither {moda_path} nor {climate_out} exists; "
        f"download the ERA5-Land data first."
    )

# ----------------------------
# Rename geopotential file
# ----------------------------
geopot_out = os.path.join(out_dir, f"era5_geopotential_pressure_{AREA_NAME}.nc")
try:
    geopot_in = find_geopot_nc(out_dir)
except FileNotFoundError:
    # A missing raw file is acceptable only when the output already exists.
    if not os.path.exists(geopot_out):
        raise
    geopot_in = None
    print(f"No raw geopotential file in {out_dir}; keeping existing {geopot_out}")
else:
    with xr.open_dataset(geopot_in) as dcg:
        if os.path.exists(geopot_out):
            os.remove(geopot_out)
        dcg.to_netcdf(geopot_out)

# ----------------------------
# Cleanup (safe)
# ----------------------------
for p in [
    os.path.join(out_dir, "download.netcdf.zip"),
    moda_path,
    os.path.join(out_dir, "download_geopot.netcdf.zip"),
    geopot_in,
]:
    # geopot_in is None when the raw geopotential file was already removed.
    if p is not None and os.path.exists(p):
        os.remove(p)

### Check geopotential data:

In [None]:
# Open the renamed geopotential file and inspect it. The path is built with
# os.path.join, the same way the file was written, so reading and writing
# agree even if one of the path parts lacks a trailing separator.
geopot_era5 = xr.open_dataset(
    os.path.join(cfg.dataPath, path_ERA5_raw,
                 f'era5_geopotential_pressure_{AREA_NAME}.nc'))
display(geopot_era5)

# Sort by longitude before plotting the geopotential variable `z`.
geopot_era5_sorted = geopot_era5.sortby('longitude')
geopot_era5_sorted.z.plot()