## Setting Up:

In [None]:
import os, sys
sys.path.append(os.path.join(os.getcwd(), '../../')) # Add root of repo to import MBM

import pandas as pd
import os
import warnings
from tqdm.notebook import tqdm
import zipfile
import cdsapi
import numpy as np
import glob
import xarray as xr
import massbalancemachine as mbm

from regions.TF_Europe.scripts.config_TF_Europe import *

# Configuration object for the Europe region; its `dataPath` attribute is
# used later in this notebook to locate the ERA5-Land files.
cfg = mbm.EuropeConfig()

# Suppress all Python warnings for the remainder of the notebook session.
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

## Download monthly ERA5-Land variables:

To download ERA5-Land data you need to configure your API client first. For this, follow these instructions: https://forum.ecmwf.int/t/step-by-step-instructions-on-how-to-download-data-using-new-climate-data-store-beta-cds-beta/3743. 

To summarize the instructions: 
- Your existing CDS credentials will not work in CDS-Beta; you need an ECMWF account, which you can register here: https://www.ecmwf.int/. 
- Once you have set up an account, make sure that you have accepted the licences in the Download form of the product page: https://cds.climate.copernicus.eu/datasets/reanalysis-era5-land-monthly-means?tab=download. 
- Make sure that you are logged in to ECMWF, then navigate to the CDSAPI setup page to copy the content for your .cdsapirc file: https://cds.climate.copernicus.eu/how-to-api. 

Once this is set up, you should be able to download the ERA5-Land data using our script. 

In [None]:
import os
import glob
import zipfile
import cdsapi
import xarray as xr

# Guard for the download/post-processing step below (`if RUN:`); set to
# False to skip it when re-running the notebook.
RUN = True

# Label embedded in the output file names
# (era5_monthly_averaged_data_<AREA_NAME>.nc, era5_geopotential_pressure_<AREA_NAME>.nc).
AREA_NAME = "Europe"
# Bounding box sent as the "area" field of both CDS requests.
AREA = [82, -30, 42, 40]  # N, W, S, E

def find_geopot_nc(folder: str) -> str:
    """Locate the extracted geopotential netCDF inside ``folder``.

    The specific pattern ``geo.area-subset*.nc`` is tried first (the exact
    file name varies with the requested area); if it matches nothing, the
    broader ``geo*.nc`` pattern is tried. Among the matches of the first
    pattern that hits, the most recently modified file is returned.

    Args:
        folder: Directory to search.

    Returns:
        Path of the newest matching file.

    Raises:
        FileNotFoundError: If neither pattern matches any file.
    """
    for pattern in ("geo.area-subset*.nc", "geo*.nc"):
        matches = sorted(glob.glob(os.path.join(folder, pattern)))
        if matches:
            # Newest by modification time; sorting first keeps ties stable.
            return max(matches, key=os.path.getmtime)

    raise FileNotFoundError(
        f"No geopotential subset netCDF found in {folder}. "
        f"Looked for geo.area-subset*.nc / geo*.nc"
    )

def find_climate_nc(folder: str, area_name: str) -> str:
    """Find the ERA5-Land climate netCDF inside ``folder``.

    Lookup order:
      1. the raw extracted ``data_stream-moda.nc``;
      2. the renamed output ``era5_monthly_averaged_data_<area_name>.nc``;
      3. the most recently modified ``*.nc`` file that is not a
         geopotential file.

    Args:
        folder: Directory to search.
        area_name: Area label used in the renamed output file name.

    Returns:
        Path of the selected climate netCDF file.

    Raises:
        FileNotFoundError: If no suitable netCDF file exists in ``folder``.
    """
    moda = os.path.join(folder, "data_stream-moda.nc")
    renamed = os.path.join(folder, f"era5_monthly_averaged_data_{area_name}.nc")

    for known in (moda, renamed):
        if os.path.exists(known):
            return known

    def _is_geopotential(path: str) -> bool:
        # Geopotential files live in the same folder: the raw extraction
        # (geo.area-subset*.nc or, more broadly, geo*.nc) and the renamed
        # era5_geopotential_pressure_*.nc output. None of them may be
        # mistaken for the climate stream.
        name = os.path.basename(path)
        return "geo.area-subset" in name or name.startswith(
            ("geo", "era5_geopotential_pressure_")
        )

    # Fallback: newest .nc file that is not a geopotential file.
    candidates = [
        p
        for p in sorted(glob.glob(os.path.join(folder, "*.nc")))
        if not _is_geopotential(p)
    ]
    if not candidates:
        raise FileNotFoundError(
            f"No climate netCDF found in {folder}. Expected {moda} or {renamed}."
        )
    return max(candidates, key=os.path.getmtime)

# Download the monthly ERA5-Land climate variables and the geopotential for
# AREA from the CDS, extract the archives, normalise the climate file (time
# coordinate named "time") and store both under stable, area-specific names.
if RUN:
    out_dir = os.path.join(cfg.dataPath, path_ERA5_raw)
    os.makedirs(out_dir, exist_ok=True)

    c = cdsapi.Client()

    # ----------------------------
    # Download climate variables
    # ----------------------------
    climate_zip = os.path.join(out_dir, "download.netcdf.zip")
    c.retrieve(
        "reanalysis-era5-land-monthly-means",
        {
            "product_type": ["monthly_averaged_reanalysis"],
            "variable": [
                "10m_u_component_of_wind",
                "10m_v_component_of_wind",
                "2m_temperature",
                "forecast_albedo",
                "snow_cover",
                "snow_density",
                "snow_depth_water_equivalent",
                "snowfall",
                "snowmelt",
                "surface_latent_heat_flux",
                "surface_net_thermal_radiation",
                "surface_sensible_heat_flux",
                "surface_solar_radiation_downwards",
                "total_precipitation",
            ],
            "year": [str(y) for y in range(1950, 2026)],
            "month": [f"{m:02d}" for m in range(1, 13)],
            "time": ["00:00"],
            "data_format": "netcdf",
            "download_format": "zip",
            "area": AREA,
        },
        climate_zip,
    )
    with zipfile.ZipFile(climate_zip, "r") as z:
        z.extractall(out_dir)

    # ----------------------------
    # Download geopotential separately
    # ----------------------------
    geopot_zip = os.path.join(out_dir, "download_geopot.netcdf.zip")
    c.retrieve(
        "reanalysis-era5-land-monthly-means",
        {
            "variable": ["geopotential"],
            "data_format": "netcdf",
            "download_format": "zip",
            "area": AREA,
        },
        geopot_zip,
    )
    with zipfile.ZipFile(geopot_zip, "r") as z:
        z.extractall(out_dir)

    # ----------------------------
    # Post-process climate file (works whether moda exists or not)
    # ----------------------------
    climate_in = find_climate_nc(out_dir, AREA_NAME)
    climate_out = os.path.join(out_dir, f"era5_monthly_averaged_data_{AREA_NAME}.nc")
    # find_climate_nc may return climate_out itself. The dataset is read
    # lazily, so the input must not be removed or overwritten while it is
    # still being read: write to a temporary file, close the input, then
    # swap the temporary file into place.
    climate_tmp = climate_out + ".tmp"
    with xr.open_dataset(climate_in) as raw_climate:
        # Rename only if needed
        if "valid_time" in raw_climate.coords or "valid_time" in raw_climate.dims:
            dc = raw_climate.rename({"valid_time": "time"})
        else:
            dc = raw_climate
        dc.to_netcdf(climate_tmp)
    os.replace(climate_tmp, climate_out)

    # ----------------------------
    # Post-process geopotential file
    # ----------------------------
    geopot_in = find_geopot_nc(out_dir)
    geopot_out = os.path.join(out_dir, f"era5_geopotential_pressure_{AREA_NAME}.nc")
    if os.path.exists(geopot_out):
        os.remove(geopot_out)
    # Close the input before the cleanup step deletes it.
    with xr.open_dataset(geopot_in) as dcg:
        dcg.to_netcdf(geopot_out)

    # ----------------------------
    # Cleanup
    # ----------------------------
    # Remove the downloaded archives and the raw extracted files; the renamed
    # outputs (climate_out, geopot_out) are kept.
    moda_path = os.path.join(out_dir, "data_stream-moda.nc")
    for p in [climate_zip, geopot_zip, geopot_in, moda_path]:
        if os.path.exists(p):
            os.remove(p)

### Check geopotential data:

In [None]:
# Open the renamed geopotential file for a visual sanity check. The path is
# built with os.path.join so it resolves correctly whether or not the path
# components carry trailing separators.
geopot_path = os.path.join(cfg.dataPath, path_ERA5_raw,
                           f'era5_geopotential_pressure_{AREA_NAME}.nc')
geopot_era5 = xr.open_dataset(geopot_path)
display(geopot_era5)

# Sort along longitude, then plot the geopotential variable `z`.
geopot_era5_sorted = geopot_era5.sortby('longitude')
geopot_era5_sorted.z.plot()