In [None]:
import xarray as xr
import pandas as pd
import os
import numpy as np

# Directory that is scanned for the raw HRRR NetCDF (".nc") files.
HRRR_DATA_PATH = "../data/raw/hrrr/"
# Directory that receives the NetCDF files written by the processing loop.
HRRR_PROCESSED_DATA_PATH = "../data/processed/hrrr/"

In [2]:
# Names of every NetCDF file in the raw directory, in lexicographic order.
file_list = [name for name in os.listdir(HRRR_DATA_PATH) if name.endswith(".nc")]
file_list.sort()

# The second "_"-separated token of each file name is parsed as a YYYYMMDD
# date, giving one timestamp per file in the same order as `file_list`.
file_list_dates = pd.to_datetime(
    [name.split("_")[1] for name in file_list], format="%Y%m%d"
)

In [None]:
# Get all valid times: stack every raw file along a new "init_time" dimension
# and read the "time" coordinate of the combined dataset.
# The dataset is only needed for that coordinate, so it is opened in a context
# manager; this closes the underlying files instead of leaving their handles
# open for the rest of the session.
with xr.open_mfdataset(
    os.path.join(HRRR_DATA_PATH, "*.nc"), concat_dim="init_time", combine="nested"
) as ds:
    time_list = ds.time.values

In [None]:
# For every valid time, gather the (up to) three most recent model
# initialisations that precede it, re-index them by lead time and write the
# result to one NetCDF file per valid time.

# Make sure the output directory exists before the first write.
os.makedirs(HRRR_PROCESSED_DATA_PATH, exist_ok=True)

# Template lead-time axis (6 h to 2 d in 6 h steps). It does not depend on the
# valid time, so it is built once; its length follows the range instead of a
# hand-counted list of NaNs.
target_lead_times = pd.timedelta_range(start="6h", end="2d", freq="6h")
broadcast_array = xr.DataArray(
    np.full(len(target_lead_times), np.nan),
    dims=["lead_time"],
    coords={"lead_time": target_lead_times.values},
)

for raw_time in time_list:
    valid_time = pd.to_datetime(raw_time)

    # Initialisation dates strictly before the valid time; keep the last three.
    init_times = file_list_dates[file_list_dates < valid_time][-3:]
    if len(init_times) == 0:
        print(f"No data for {valid_time}")
        continue

    # File names are rebuilt from the dates using the 00z naming pattern.
    ncs = [
        os.path.join(HRRR_DATA_PATH, f"hrrr_{date}_00z.nc")
        for date in init_times.strftime("%Y%m%d")
    ]
    out_path = os.path.join(
        HRRR_PROCESSED_DATA_PATH,
        f"hrrr_{valid_time.strftime('%Y%m%d_%H')}_00.nc",
    )

    # The context manager closes the source files after each write, so file
    # handles do not pile up across iterations.
    with xr.open_mfdataset(ncs, concat_dim="init_time", combine="nested") as runs:
        # Lead time of each run = valid time - initialisation time. It replaces
        # the positional "init_time" axis, which is then renamed accordingly.
        lead_times = (valid_time - init_times).values
        at_valid_time = (
            runs.sel(time=valid_time)
            .assign_coords(init_time=lead_times)
            .rename({"init_time": "lead_time"})
        )

        # Expand onto the template lead-time axis; template lead times with no
        # matching run are filled with NaN.
        # NOTE(review): broadcast_like appears to align with an outer join, so
        # lead times outside the template (e.g. > 2 d) would be kept as well.
        # Confirm this is intended; reindex would give a fixed axis instead.
        processed = at_valid_time.broadcast_like(broadcast_array)

        # Written inside the `with` block: the data is read from the source
        # files during the write, so they must still be open here.
        processed.to_netcdf(out_path)

    print(f"Saved data for {valid_time}")