## Download retrospective data from the National Water Model (NWM) Amazon S3 bucket
- Inputs: An Excel file or shapefile containing the COMIDs for which you want to download NWM retrospective data
- Output: Streamflows from NWM retrospective analysis

In [None]:
import xarray as xr
import fsspec
import pandas as pd
import dask

# Path to the Excel workbook listing the stations; its 'COMID' column holds the
# NWM reach identifiers to download.
comid_stn = pd.read_excel('usgs_with_comid.xlsx')

# Blank cells are read as NaN (which also turns the column into floats) and
# stray text cannot be matched against the integer feature_id coordinate, so
# either would make the feature_id selection further down fail. Keep only
# usable, unique integer COMIDs, preserving their order of first appearance.
raw_comids = comid_stn['COMID']
all_sites = (
    pd.to_numeric(raw_comids, errors='coerce')
    .dropna()
    .astype('int64')
    .drop_duplicates()
)
ignored_count = len(raw_comids) - len(all_sites)
if ignored_count:
    print(f"Ignored {ignored_count} blank, non-numeric or duplicate COMID entries.")

# URI for NWM 3.0 data source
nwm_loc = 's3://noaa-nwm-retrospective-3-0-pds/CONUS/zarr/chrtout.zarr'
# URI for NWM 2.1 data source
# nwm_loc = 's3://noaa-nwm-retrospective-2-1-zarr-pds/chrtout.zarr'

# Open the NWM dataset lazily through fsspec; nothing is downloaded until
# a computation is requested.
# NOTE(review): requester_pays=True next to anon=True looks redundant for a
# public bucket -- confirm before removing.
ds = xr.open_zarr(fsspec.get_mapper(nwm_loc, anon=True, requester_pays=True), chunks='auto')

# Time range to download. NWM 2.1 has data from 1979-2020 and NWM 3.0 has data
# from 1979-2023.
start_time = '2018-03-01T00:00:00'
end_time = '2019-01-31T00:00:00'

# Work out which requested COMIDs actually exist in the NWM dataset *before*
# selecting: ds.sel() raises for the whole batch if a single label is missing,
# so missing reaches have to be filtered out (and reported) up front.
requested_ids = pd.Index(pd.to_numeric(all_sites, errors='coerce'))
found_mask = requested_ids.isin(ds.indexes['feature_id'])
for missing_id in all_sites[~found_mask]:
    print(f"COMID {missing_id} not found in NWM dataset.")

# Matched values are guaranteed to be finite, so the integer cast is safe;
# unique() drops repeated COMIDs while keeping first-appearance order.
valid_ids = requested_ids[found_mask].astype('int64').unique()

# Select the data for the available COMIDs and the time range
sub_ds = ds.sel(feature_id=valid_ids.values, time=slice(start_time, end_time))

# Pull the streamflow into memory, forcing a (time, feature_id) layout so that
# rows are time steps and columns are reaches regardless of the stored order.
streamflow_data = sub_ds['streamflow'].transpose('time', 'feature_id').compute()

# Build the whole table in one call (one column per COMID) instead of inserting
# columns one by one, which fragments the DataFrame when there are many reaches.
nwm = pd.DataFrame(
    streamflow_data.values,
    columns=[str(comid) for comid in valid_ids],
)

# Put the time stamps first, as the leading column of the output table.
nwm.insert(0, 'timestamp', pd.to_datetime(sub_ds['time'].values))
# Save the DataFrame to a CSV file (or any other format you want to work with)
output_file = 'nwm_discharge_test2.csv'
nwm.to_csv(output_file)

# Confirm where the data went and show the first rows as a quick sanity check.
print(f"Data saved to {output_file}")
print(nwm.head())
