In [5]:
import pandas as pd
import io
import time
import json
from urllib.request import urlopen
import numpy as np
import os

# --- Download settings -------------------------------------------------------
# Number of tries download_data() makes for one URL before giving up.
MAX_ATTEMPTS = 6

# --- Unit conversion and quality-control thresholds --------------------------
# Factor the raw "precip" column is multiplied by (name suggests inches -> mm).
INCH_2_MM = 25.4
# 1-minute values at or above this (after the INCH_2_MM scaling) are set to NaN.
MAX_1_MIN = 38
# 6-hour totals at or above this are set to NaN.
MAX_6_HOUR = 840
# Minimum number of non-missing 1-minute values a 6-hour bin needs to get a sum
# (5 hours' worth of minutes); bins with fewer become NaN.
MIN_VALID_MINS = 5 * 60

# --- Output ------------------------------------------------------------------
# Directory the per-station NetCDF files are written to.
OBS_DATA_PATH = "/g/data/wa46/user/nl5316/tw_spatial/obs"

In [2]:
def download_data(uri, max_attempts=None, initial_delay=5):
    """Fetch the data from the IEM

    The IEM download service has some protections in place to keep the number
    of inbound requests in check.  This function implements an exponential
    backoff to keep individual downloads from erroring: after a failed request
    it waits ``initial_delay`` seconds, then twice that, and so on.

    A response whose body starts with "ERROR" counts as a failed attempt and
    is retried immediately (no wait), as before.

    Args:
      uri (string): URL to fetch
      max_attempts (int, optional): how many times to try before giving up.
        Defaults to the module-level ``MAX_ATTEMPTS``.
      initial_delay (float, optional): seconds to wait after the first request
        that raised; the wait doubles after every further one.

    Returns:
      string data, or an empty string when every attempt failed
    """
    if max_attempts is None:
        max_attempts = MAX_ATTEMPTS

    delay = initial_delay
    for attempt in range(1, max_attempts + 1):
        try:
            # The context manager closes the connection even if read() fails.
            with urlopen(uri, timeout=300) as response:
                data = response.read().decode("utf-8")
            if not data.startswith("ERROR"):
                return data
        except Exception as exp:
            print(f"download_data({uri}) failed with {exp}")
            # No point in waiting once there is no attempt left to wait for.
            if attempt < max_attempts:
                time.sleep(delay)
                delay *= 2

    print("Exhausted attempts to download, returning empty data")
    # Return a real (empty) string so callers always get text, never None.
    return ""


# Two-letter codes of the 50 US states; each one names an IEM "<STATE>_ASOS"
# network whose station list is fetched below.
states = (
    "AK AL AR AZ CA CO CT DE FL GA HI IA ID IL IN KS KY LA MA MD ME MI MN "
    "MO MS MT NC ND NE NH NJ NM NV NY OH OK OR PA RI SC SD TN TX UT VA VT "
    "WA WI WV WY"
)
networks = [f"{state}_ASOS" for state in states.split()]

# Collect the station identifier ("sid") of every site in every network.
stations = []
for network in networks:
    # Get metadata
    uri = f"https://mesonet.agron.iastate.edu/geojson/network/{network}.geojson"
    # Close each connection as soon as it is parsed, and bound the wait so a
    # single stalled request cannot hang the whole cell.
    with urlopen(uri, timeout=300) as response:
        jdict = json.load(response)
    stations.extend(site["properties"]["sid"] for site in jdict["features"])

In [3]:
# Request window sent to the IEM 1-minute service. The download URL pairs the
# start date with 00:00 and the end date with 23:59.
start_year, start_month, start_day = 2022, 1, 1
end_year, end_month, end_day = 2024, 9, 1

In [7]:
# Station IDs that already have a NetCDF file on disk; the download loop skips
# these. Only "*.nc" entries count, and only the trailing extension is removed
# (str.replace would also strip ".nc" from the middle of a name).
# NOTE(review): this lists "../data/obs" while the download loop writes to
# OBS_DATA_PATH -- confirm both refer to the same directory.
file_names = [
    os.path.splitext(name)[0]
    for name in os.listdir("../data/obs")
    if name.endswith(".nc")
]
print(file_names)

['PASC', 'PAEG', 'PAHN', 'PADQ', 'PABT', 'PAKN', 'PAVL', 'PAJN', 'PAKV', 'PABI', 'PAKW', 'PAHO', 'PAGK', 'PABR', 'PAKT', 'PACV', 'PAIL', 'PAOT', 'PAEN', 'PAFA', 'PANC', 'PACD', 'PABE']


In [8]:
# For every station without a file on disk: download its 1-minute precip
# record, quality-control it, accumulate to 6-hourly totals and save as NetCDF.
for station in stations:
    if station in file_names:
        continue

    # Same query string as before, split across lines for readability.
    uri = (
        "https://mesonet.agron.iastate.edu/cgi-bin/request/asos1min.py"
        f"?station={station}&tz=UTC"
        f"&year1={start_year}&month1={start_month}&day1={start_day}&hour1=0&minute1=0"
        f"&year2={end_year}&month2={end_month}&day2={end_day}&hour2=23&minute2=59"
        "&vars=ptype&vars=precip&sample=1min&what=view&delim=comma&gis=yes"
    )

    # Catch Exception rather than using a bare except, so that interrupting the
    # kernel (KeyboardInterrupt) actually stops the loop instead of silently
    # moving on to the next station.
    try:
        data = download_data(uri)
    except Exception as exp:
        print(f"Download failed for station {station}: {exp}")
        continue
    # download_data yields nothing usable when all of its attempts failed.
    if not data:
        print(f"No data for {station}")
        continue

    try:
        df = pd.read_csv(io.StringIO(data))
    except Exception as exp:
        print(f"An error occurred: {exp} for station {station}")
        continue
    if len(df) == 0:
        continue

    # "M" marks a missing observation; the remaining values are scaled by
    # INCH_2_MM.
    df["precip"] = df["precip"].replace("M", np.nan).astype(float) * INCH_2_MM
    df["valid(UTC)"] = pd.to_datetime(df["valid(UTC)"])
    df = df.set_index("valid(UTC)")

    # Discard implausible 1-minute values before accumulating.
    precip = df["precip"].mask(df["precip"] >= MAX_1_MIN)
    # A 6-hour bin only gets a total when it holds at least MIN_VALID_MINS
    # non-missing minutes; otherwise it is NaN.
    six_hourly = precip.resample("6h").sum(min_count=MIN_VALID_MINS)
    # Discard implausible 6-hour totals.
    six_hourly = six_hourly.mask(six_hourly >= MAX_6_HOUR)

    # Station metadata is taken from the first row of the download.
    first_row = df.iloc[0]
    da = six_hourly.to_xarray()
    da.attrs = {
        "station": first_row["station"],
        "lat": first_row["lat"],
        "lon": first_row["lon"],
    }
    da.to_netcdf(f"{OBS_DATA_PATH}/{station}.nc")
    print(f"got data for {station}")

An error occurred: Error tokenizing data. C error: Expected 7 fields in line 737171, saw 8
 for station PALH
got data for PAMC
got data for PAMR
got data for PANN
got data for PAOM
got data for PAOR
got data for PAQT
