In [1]:
import pandas as pd
import io
import time
import json
from urllib.request import urlopen
import numpy as np
import os

MAX_ATTEMPTS = 6
MIN_VALID_MINS = 5 * 60
INCH_2_MM = 25.4
MAX_1_MIN = 38
MAX_6_HOUR = 840
OBS_DATA_PATH = "/g/data/wa46/user/nl5316/tw_spatial/obs"

In [2]:
def download_data(uri):
    """Fetch the data from the IEM

    The IEM download service has some protections in place to keep the number
    of inbound requests in check.  This function implements an exponential
    backoff to keep individual downloads from erroring.

    Args:
      uri (string): URL to fetch

    Returns:
      string data
    """
    attempt = 0
    while attempt < MAX_ATTEMPTS:
        try:
            data = urlopen(uri, timeout=300).read().decode("utf-8")
            if data is not None and not data.startswith("ERROR"):
                return data
        except Exception as exp:
            print(f"download_data({uri}) failed with {exp}")
            time.sleep(5)
        attempt += 1

    print("Exhausted attempts to download, returning empty data")


states = (
    "AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN "
    "MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT "
    "WA WI WV WY"
)
networks = [f"{state}_ASOS" for state in states.split()]
stations = []
for network in networks:
    # Get metadata
    uri = "https://mesonet.agron.iastate.edu/" f"geojson/network/{network}.geojson"
    data = urlopen(uri)
    jdict = json.load(data)
    for site in jdict["features"]:
        stations.append(site["properties"]["sid"])

In [3]:
start_year = 2022
end_year = 2024
start_month = 1
end_month = 9
start_day = 1
end_day = 1

In [4]:
file_names = os.listdir(OBS_DATA_PATH)
file_names = [name.replace(".nc", "") for name in file_names]
print(file_names)

['RBG', 'HUF', 'MTP', 'PAWI', 'JXN', 'JAX', 'GAG', 'RDD', 'EWB', 'AKO', 'RNO', 'GLR', 'SBP', 'PAOT', 'VPZ', 'CAE', 'HGR', 'GED', 'LPR', 'RVS', 'PASI', 'WMC', 'VLD', 'LWT', '1V4', 'HDO', 'AZO', 'ART', 'RFD', 'GEY', 'ELZ', 'MGY', 'UAO', 'BFM', 'WAL', 'MPV', 'HEI', 'RNM', 'HUL', 'ABQ', 'FCM', 'BGD', 'BED', 'HLN', 'FSD', 'RMG', 'SJC', 'XWA', 'BIL', 'ABI', 'TTN', 'COU', 'LAN', 'DYL', 'HRL', 'AGS', 'BPK', 'CSV', 'EET', 'IZG', 'NBC', 'PAKN', 'BOS', 'HSI', 'SWO', 'EEO', 'PHLI', 'OQT', 'UTS', 'LNK', 'NZY', 'FIT', 'MOD', 'HWD', 'IAD', 'BJJ', 'LXT', 'AQV', 'PABR', 'BZN', 'LAS', 'CEC', 'FXE', 'PNE', 'MTO', 'MSY', 'BUY', 'LWM', 'FZY', 'DVN', 'SLN', 'LVK', 'SMO', 'SNY', 'BTL', 'ELP', 'OJC', 'GFL', 'ORE', 'ROC', 'PAKT', 'DRO', 'SSI', 'HRO', 'OSU', 'GTF', 'DEQ', 'DTO', 'SDF', 'MSL', 'DAG', 'MIC', 'LUK', 'IXD', 'DMO', 'MKE', 'P69', 'AUS', 'FTW', 'EST', 'MEM', 'CDS', 'RIC', 'RWF', 'EUF', 'BPT', 'ROW', 'SIY', 'IMT', 'SNT', 'SNS', 'NRS', 'NFL', 'RSL', 'HKS', 'NSE', 'NYC', 'BIS', 'EWR', 'MIA', 'TTD', 'DIK'

In [5]:
for station in stations:
    if station in file_names:
        continue
    uri = f"https://mesonet.agron.iastate.edu/cgi-bin/request/asos1min.py?station={station}&tz=UTC&year1={start_year}&month1={start_month}&day1={start_day}&hour1=0&minute1=0&year2={end_year}&month2={end_month}&day2={end_day}&hour2=23&minute2=59&vars=ptype&vars=precip&sample=1min&what=view&delim=comma&gis=yes"
    try:
        data = download_data(uri)
    except:
        # print(f"No data for {station}")
        continue
    try:
        df = pd.read_csv(io.StringIO(data))
    except Exception as exp:
        print(f"An error occurred: {exp} for station {station}")
        continue
    if len(df) == 0:
        continue
    df["precip"] = df["precip"].replace("M", np.nan)
    df["precip"] = df["precip"].astype(float)
    df["precip"] = df["precip"] * INCH_2_MM
    df["valid(UTC)"] = pd.to_datetime(df["valid(UTC)"])
    df.set_index("valid(UTC)", inplace=True)

    df.loc[df["precip"] >= MAX_1_MIN, "precip"] = np.nan

    precip_df = df["precip"]
    df_resampled = precip_df.resample("6h", label="right", closed="right").agg(pd.Series.sum, min_count=MIN_VALID_MINS)
    df_resampled = df_resampled
    df_resampled.loc[df_resampled >= MAX_6_HOUR] = np.nan

    df_resampled.attrs = {
        "station": df.iloc[0]["station"],
        "lat": df.iloc[0]["lat"],
        "lon": df.iloc[0]["lon"],
    }
    da = df_resampled.to_xarray()
    da.attrs = {
        "station": df.iloc[0]["station"],
        "lat": df.iloc[0]["lat"],
        "lon": df.iloc[0]["lon"],
    }
    da.to_netcdf(f"{OBS_DATA_PATH}/{station}.nc")
    print(f"got data for {station}")

An error occurred: Error tokenizing data. C error: Expected 7 fields in line 737171, saw 8
 for station PALH
An error occurred: Error tokenizing data. C error: Expected 7 fields in line 823172, saw 8
 for station PAWD
An error occurred: Error tokenizing data. C error: Expected 7 fields in line 879073, saw 8
 for station BLH
An error occurred: Error tokenizing data. C error: Expected 7 fields in line 6197, saw 8
 for station NID
An error occurred: Error tokenizing data. C error: Expected 7 fields in line 943761, saw 8
 for station PSP
An error occurred: Error tokenizing data. C error: Expected 7 fields in line 48066, saw 8
 for station SDB
An error occurred: Error tokenizing data. C error: Expected 7 fields in line 195960, saw 8
 for station TRM
An error occurred: Error tokenizing data. C error: Expected 7 fields in line 1018158, saw 8
 for station RSW
An error occurred: Error tokenizing data. C error: Expected 7 fields in line 979585, saw 8
 for station HWO
An error occurred: Error tok