In [None]:
import s3fs
import xarray as xr
import fsspec
import pandas as pd
import os

In [None]:
# Anonymous (unsigned) S3 client for publicly readable buckets. It is not used
# by the other cells visible here (the download loop reads over HTTPS through
# fsspec); it is kept in case other cells rely on it.
s3 = s3fs.S3FileSystem(anon=True)  # anon=True for public access

# Directory whose listing is used to recognise forecasts that were already
# downloaded, so that a re-run can skip them.
directory = "../data/graphcast"
# On a fresh checkout this directory may not exist yet. Treat that as "nothing
# downloaded so far" instead of failing with FileNotFoundError.
files = os.listdir(directory) if os.path.isdir(directory) else []

In [None]:
# Bounding box of the HRRR domain: northern/southern latitude limits and
# western/eastern longitude limits.
# NOTE(review): the longitudes appear to use a 0-360 degrees-east convention
# (234 -> 126W, 294 -> 66W) -- confirm against the dataset's longitude coordinate.
latN, latS = 50.4, 24.25
lonW, lonE = 234, 294

# One timestamp per calendar day in the download window, both endpoints included.
dates = pd.date_range("2022-01-01", "2024-08-30")

In [None]:
# For every date: open the remote GRAP_v100 forecast file for that day, keep
# only the "apcp" variable inside the lat/lon box defined above, and save the
# subset as a local NetCDF file. Dates that are already on disk are skipped.
output_dir = "../data/raw/graphcast"
# to_netcdf does not create missing parent directories.
os.makedirs(output_dir, exist_ok=True)


def _open_forecast(url):
    """Open ``url`` over fsspec and return ``(handle, dataset)``.

    The caller owns both objects and must close them. If xarray cannot read
    the file, the handle is closed here before the error propagates, so a
    failed attempt does not leave an open connection behind.
    """
    handle = fsspec.open(url).open()
    try:
        dataset = xr.open_dataset(handle)
    except BaseException:
        handle.close()
        raise
    return handle, dataset


for date in dates:
    year = date.year
    # Zero-padded two-digit month/day, as used in the remote and local names.
    month = f"{date.month:02d}"
    day = f"{date.day:02d}"
    stamp = f"{year}{month}{day}"

    out_name = f"GRAP_v100_GFS_{stamp}_00.nc"
    out_path = f"{output_dir}/{out_name}"
    # Skip when the file is listed in `files` (the original check) or already
    # exists where this loop writes. `files` is a listing of a different
    # directory, so on its own it does not see files produced by this loop.
    if out_name in files or os.path.exists(out_path):
        continue

    # The remote file name is tried with upper-case "GFS" first and with
    # lower-case "gfs" as a fallback.
    url = (
        f"https://noaa-oar-mlwp-data.s3.amazonaws.com/GRAP_v100/{year}/"
        f"{month}{day}/GRAP_v100_GFS_{stamp}00_f000_f240_06.nc"
    )
    try:
        remote, ds = _open_forecast(url)
    except Exception:
        # `Exception` rather than a bare `except`: any ordinary failure still
        # triggers the fallback, but KeyboardInterrupt / SystemExit are no
        # longer swallowed, so the loop can be interrupted.
        url = (
            f"https://noaa-oar-mlwp-data.s3.amazonaws.com/GRAP_v100/{year}/"
            f"{month}{day}/GRAP_v100_gfs_{stamp}00_f000_f240_06.nc"
        )
        try:
            remote, ds = _open_forecast(url)
        except FileNotFoundError:
            print(f"no data for {stamp}")
            continue

    print(f"fetched {date}")
    try:
        # Label-based slice from latN down to latS; this assumes the latitude
        # coordinate is stored north-to-south -- TODO confirm.
        # isel keeps the first 10 entries along `time` (presumably 6-hourly
        # steps, judging by the "_06" in the file name -- confirm).
        da = (
            ds["apcp"]
            .sel(latitude=slice(latN, latS), longitude=slice(lonW, lonE))
            .isel(time=slice(0, 10))
        )
        # Write to a temporary name and move it into place afterwards, so an
        # interrupted write never leaves a truncated file that a later run
        # would mistake for a finished download.
        tmp_path = f"{out_path}.tmp"
        da.to_netcdf(tmp_path)
        os.replace(tmp_path, out_path)
    finally:
        # Release the dataset and the remote handle on every iteration;
        # otherwise they accumulate over the whole date range.
        ds.close()
        remote.close()
    print(f"saved {date}")