In [15]:
import cdsapi
import xarray as xr
import pandas as pd
import numpy as np
from pathlib import Path
import time

In [4]:
# Shared CDS API client. safe_retrieve() reads this module-level name `c`
# and calls c.retrieve(...) on it, so this cell must run before any download.
# NOTE(review): created with no arguments — presumably credentials come from
# the user's cdsapi configuration; confirm for the environment this runs in.
c = cdsapi.Client()

In [5]:
# specifying sizes and thinnings
#
# Every *_dict below is keyed by a preset name ('full', 'small', 'slgt_small')
# and holds one setting for that preset. Values shared by several presets are
# written once and copied per key, so no two presets share a mutable list.

_PRESETS = ("full", "small", "slgt_small")

# Latitude / longitude windows (longitudes on a 0-360 axis).
_WIDE_LAT, _WIDE_LON = slice(50, 25), slice(360 - 125, 360 - 66)
_NARROW_LAT, _NARROW_LON = slice(45, 30), slice(360 - 105, 360 - 85)

_LEVELS = (925, 850, 700, 500, 300)

_PRESSURE_VARS = (
    "geopotential",
    "potential_vorticity",
    "specific_humidity",
    "temperature",
    "u_component_of_wind",
    "v_component_of_wind",
    "vertical_velocity",
)

_SURFACE_VARS = (
    "10m_u_component_of_wind",
    "10m_v_component_of_wind",
    "2m_dewpoint_temperature",
    "2m_temperature",
    "geopotential_at_surface",
    "toa_incident_solar_radiation",
)

lat_dict = dict(full=_WIDE_LAT, small=_NARROW_LAT, slgt_small=_WIDE_LAT)

lon_dict = dict(full=_WIDE_LON, small=_NARROW_LON, slgt_small=_WIDE_LON)

levels_dict = {preset: list(_LEVELS) for preset in _PRESETS}

# Keep every n-th hour of the day.
time_thin_dict = dict(full=1, small=6, slgt_small=6)

# Keep every n-th grid point along latitude and longitude.
space_thin_dict = dict(full=1, small=4, slgt_small=4)

# Category labels that mark a day as selected.
risk_level_dict = dict(
    full=["MDT", "HIGH"],
    small=["MDT", "HIGH"],
    slgt_small=["SLGT", "ENH", "MDT", "HIGH"],
)

pressure_var_dict = {preset: list(_PRESSURE_VARS) for preset in _PRESETS}

surface_var_dict = {preset: list(_SURFACE_VARS) for preset in _PRESETS}

In [6]:
# Preset selected for this run. It is used as the lookup key into every
# *_dict configuration mapping, so it must be one of their keys.
detail = 'slgt_small'

In [7]:
# --- risk days
# Read the labelled dataset fully into memory.
pph = xr.load_dataset("data/raw_data/labelled_pph.nc")
# NOTE(review): `[...]` is a one-element list holding the Ellipsis object —
# this looks like a placeholder for the real list of dates; confirm before use.
missing_dates = [...]
# Keep the time entries whose MAX_CAT value is one of the categories
# configured for the active preset.
dates_of_interest = pph["time"][pph["MAX_CAT"].isin(risk_level_dict[detail])]
# Strict comparison: the cutoff value itself is excluded.
# NOTE(review): presumably `time` holds YYYYmmddHHMM strings (inferred from the
# format= argument below) — verify against the file.
dates_of_interest = dates_of_interest[dates_of_interest > "200203310000"]
# Remove any entry listed in missing_dates.
dates_of_interest = dates_of_interest[~(dates_of_interest.isin(missing_dates))]
# Parse to timestamps and reset the time of day to midnight, so each entry
# identifies a calendar day.
selected_days = pd.to_datetime(dates_of_interest.values, format="%Y%m%d%H%M").normalize()

# Sorted distinct years that contain at least one selected day; the download
# loop iterates over these.
years = np.unique(selected_days.year)

In [8]:
# Directory that receives the raw CDS downloads; it is created together with
# any missing parent directories and left alone if it already exists.
out_dir = Path("/glade/work/milesep/era5_cds")
out_dir.mkdir(exist_ok=True, parents=True)

# Paths of the monthly files; the download loop appends to this list.
all_files = list()

# --- derive requested hours directly from thin factor
# One "HH:00" label every time_thin_dict[detail] hours, starting at 00:00.
hours = list(map("{:02d}:00".format, range(0, 24, time_thin_dict[detail])))

In [16]:
def safe_retrieve(dataset, request, target, max_retries=5, wait=30):
    """
    Download one CDS request to ``target``, retrying on failure.

    The data are written to a sibling ``<target name>.part`` file and moved
    onto ``target`` only after the module-level client ``c`` has finished
    ``c.retrieve(...)``, so ``target`` never holds a half-written download.
    The ``.part`` file is removed whenever an attempt does not complete,
    including when the attempt is interrupted with ``KeyboardInterrupt``.

    Parameters
    ----------
    dataset : str
        Dataset name, passed through to ``c.retrieve``.
    request : dict
        Request body, passed through to ``c.retrieve``.
    target : str or pathlib.Path
        Final location of the downloaded file.
    max_retries : int
        Maximum number of attempts; must be at least 1.
    wait : int or float
        Base delay in seconds. The pause after failed attempt ``k`` is
        ``wait * k`` (linear back-off: 30 s, 60 s, 90 s, ... by default).

    Returns
    -------
    pathlib.Path
        ``target`` as a ``Path``.

    Raises
    ------
    ValueError
        If ``max_retries`` is smaller than 1 (nothing would be attempted).
    RuntimeError
        If every attempt failed; the last error is attached as ``__cause__``.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be >= 1, got {max_retries}")

    target = Path(target)
    # Append ".part" to the full file name so the temporary file sits next to
    # the target regardless of the target's own extension.
    tmp_target = target.with_name(target.name + ".part")

    for attempt in range(1, max_retries + 1):
        try:
            try:
                c.retrieve(dataset, request, str(tmp_target))
                # Move into place only after the download finished;
                # replace() overwrites an existing target on every platform.
                tmp_target.replace(target)
            finally:
                # Runs for ordinary errors and for KeyboardInterrupt alike;
                # after a successful replace() there is nothing left to delete.
                if tmp_target.exists():
                    tmp_target.unlink()
        except Exception as e:
            print(f"⚠️ Attempt {attempt} failed for {target}: {e}")
            if attempt >= max_retries:
                raise RuntimeError(
                    f"Failed to download {target} after {max_retries} attempts."
                ) from e
            sleep_time = wait * attempt
            print(f"Retrying in {sleep_time}s...")
            time.sleep(sleep_time)
        else:
            print(f"✅ Downloaded: {target}")
            return target

In [19]:
# Start from an empty list on every execution of this cell. Re-running it
# (for example after an interrupted download) would otherwise append the same
# paths a second time and hand duplicates to the combine step.
all_files = []


def _era5_request(variables, year, month_str, days, levels=None):
    """Build the CDS request body for one month of the active preset.

    variables : list of CDS variable names.
    year, month_str : year (any value accepted by str()) and zero-padded month.
    days : list of zero-padded day-of-month strings.
    levels : pressure levels to request, or None for a single-level request.
    """
    lat_window = lat_dict[detail]
    lon_window = lon_dict[detail]

    request = {
        "product_type": "reanalysis",
        "format": "netcdf",
        "variable": variables,
    }
    if levels is not None:
        request["pressure_level"] = [str(lev) for lev in levels]
    request.update({
        "year": str(year),
        "month": month_str,
        "day": days,
        "time": hours,
        # Order is [start lat, start lon, stop lat, stop lon]; the configured
        # longitudes are on a 0-360 axis and are shifted by -360 here.
        "area": [
            lat_window.start,
            lon_window.start - 360,
            lat_window.stop,
            lon_window.stop - 360,
        ],
    })
    return request


def _fetch_month(dataset, request, target):
    """Return ``target``, downloading it unless a readable copy already exists.

    An existing file that xarray cannot open is treated as corrupt, deleted
    and downloaded again.
    """
    if target.exists():
        try:
            xr.open_dataset(target).close()
        except Exception:
            # Deliberately broad: any failure to open means the file is unusable.
            print(f"Corrupt file detected, redownloading: {target}")
            target.unlink()
        else:
            print(f"Skipping (exists): {target}")
            return target

    safe_retrieve(dataset, request, target)
    return target


for year in years:
    days_this_year = selected_days[selected_days.year == year]

    for month in sorted(set(days_this_year.month)):
        days_this_month = days_this_year[days_this_year.month == month]
        days = sorted({f"{d.day:02d}" for d in days_this_month})
        month_str = f"{month:02d}"

        print(year, month, len(days_this_month))

        # ------------------ Pressure levels ------------------
        pl_file = out_dir / f"era5_pl_{year}_{month_str}.nc"
        all_files.append(_fetch_month(
            "reanalysis-era5-pressure-levels",
            _era5_request(pressure_var_dict[detail], year, month_str, days,
                          levels=levels_dict[detail]),
            pl_file,
        ))

        # ------------------ Single levels ------------------
        # NOTE(review): the run output recorded with this notebook shows a
        # single-level download named "*.zip". If CDS delivers an archive, the
        # file stored as era5_sfc_*.nc is not a NetCDF file: it would be
        # reported as corrupt and re-downloaded on every run, and the combine
        # step could not open it. Confirm and unpack the archive if needed.
        sfc_file = out_dir / f"era5_sfc_{year}_{month_str}.nc"
        all_files.append(_fetch_month(
            "reanalysis-era5-single-levels",
            _era5_request(surface_var_dict[detail], year, month_str, days),
            sfc_file,
        ))

2002 4 24
Skipping (exists): /glade/work/milesep/era5_cds/era5_pl_2002_04.nc


2025-08-16 23:35:29,011 INFO Request ID is eb856a28-80af-4184-bb0d-6e5d1eaadc1c
2025-08-16 23:35:29,185 INFO status has been updated to accepted
2025-08-16 23:35:36,832 INFO status has been updated to successful


56006aa4c03175722c7aeac41708f861.zip:   0%|          | 0.00/18.4M [00:00<?, ?B/s]

✅ Downloaded: /glade/work/milesep/era5_cds/era5_sfc_2002_04.nc
2002 5 28


2025-08-16 23:35:40,778 INFO Request ID is 29c9d3ee-587d-457a-b2a3-229b90b0dab7
2025-08-16 23:35:40,927 INFO status has been updated to accepted
2025-08-16 23:35:53,724 INFO status has been updated to running
2025-08-16 23:40:01,178 INFO status has been updated to successful


33ee66b280c709916ced967738c7ce0a.nc:   0%|          | 0.00/183M [00:00<?, ?B/s]

✅ Downloaded: /glade/work/milesep/era5_cds/era5_pl_2002_05.nc


2025-08-16 23:42:05,092 INFO Request ID is 154f70b4-4874-4a70-8744-a73145f8dae6
2025-08-16 23:42:05,260 INFO status has been updated to accepted


KeyboardInterrupt: 

In [None]:
# --- combine downloaded files and finish pipeline
# Drop repeated paths while keeping their order: the download cell appends to
# all_files each time it runs, so a re-run can list the same file twice.
unique_files = list(dict.fromkeys(all_files))
ds = xr.open_mfdataset(unique_files, combine="by_coords")

# NetCDF files produced by the current CDS backend name the time axis
# "valid_time"; everything below expects "time", so normalise the name when
# only the new one is present.
if "valid_time" in ds.dims and "time" not in ds.dims and "time" not in ds.variables:
    ds = ds.rename({"valid_time": "time"})

# subset exactly the selected days again (to be safe)
# Positional boolean selection keeps exactly the time steps whose calendar day
# was selected, without a label lookup.
time_days = ds.time.dt.floor("D")
keep = np.isin(time_days.values, selected_days.values)
ds = ds.isel(time=keep)

# add day/tod index, thin spatially
# Replace the single time axis by a (day, tod) pair: "day" is the calendar day
# and "tod" the hour of day of each time step.
ds = ds.assign_coords(day=ds.time.dt.floor("D"), tod=ds.time.dt.hour)
ds = ds.set_index(time=["day", "tod"]).unstack("time")
# Keep every n-th grid point along both horizontal axes.
ds = ds.thin({"latitude": space_thin_dict[detail], "longitude": space_thin_dict[detail]})

# mode="w" overwrites an existing store at this path.
ds.to_zarr(f"/glade/work/milesep/convective_outlook_ml/inputs_raw_{detail}_cds.zarr",
           mode="w", consolidated=True)