<a href="https://colab.research.google.com/github/m-wessler/nbm-verification/blob/main/get_obs_synopticAPI_streamline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import os
import time
import json
import requests
import itertools

import pandas as pd
import numpy as np

from glob import glob
from functools import partial
from datetime import datetime, timedelta
from multiprocessing import Pool, cpu_count

# Globals

In [None]:
# Multiprocess settings
# NOTE(review): 30 workers per core heavily oversubscribes the CPU; presumably
# chosen because the workers mostly wait on HTTP responses -- confirm.
process_pool_size = cpu_count()*30

# Synoptic API token
# A token supplied through the SYNOPTIC_API_TOKEN environment variable takes
# precedence so credentials do not have to live in the notebook. The literal
# below is only a fallback that keeps existing runs working.
# NOTE(review): a token committed to source control should be treated as
# leaked -- consider rotating it and dropping the fallback.
user_token = (os.environ.get('SYNOPTIC_API_TOKEN')
              or 'a2386b75ecbc4c2784db1270695dde73')

# Backend APIs
metadata_api = "https://api.synopticdata.com/v2/stations/metadata?"
qc_api = "https://api.synopticdata.com/v2/stations/qcsegments?"

# Data Query APIs
# NOTE(review): statistics_api points at synopticlabs.org while every other
# endpoint uses synopticdata.com -- confirm this host is intentional.
timeseries_api = "https://api.synopticdata.com/v2/stations/timeseries?"
statistics_api = "https://api.synopticlabs.org/v2/stations/statistics?"
precipitation_api = "https://api.synopticdata.com/v2/stations/precipitation?"

# Assign API to element name
synoptic_apis = {
    'qpf':precipitation_api,
    'maxt':statistics_api,
    'mint':statistics_api}

# Network selection label -> value sent as the `network` query parameter
synoptic_networks = {"NWS+RAWS+HADS":"1,2,106",
                     "NWS+RAWS":"1,2",
                     "NWS":"1",
                     "RAWS": "2",
                     "ALL":""}
                    #  "CUSTOM": "&network="+network_input,
                    #  "LIST": "&stid="+network_input}

# Assign synoptic variable to element name
synoptic_vars = {
    'qpf':None,
    'maxt':'air_temp',
    'mint':'air_temp'}

# Assign stat type to element name
stat_type = {
    'qpf':'total',
    'maxt':'maximum',
    'mint':'minimum'}

# [start hour, end hour] as 'HHMM' strings, appended to the start and end
# dates of each query for the element
ob_hours = {
    'qpf':['1200', '1200'],
    'maxt':['1200', '0600'],
    'mint':['0000', '1800']}

# Methods

In [None]:
def mkdir_p(path):
    """Create `path` (and any missing parents) and hand the same value back.

    An already existing directory is not an error. The argument is returned
    unchanged so the call can be used inline when building file names.
    """
    from pathlib import Path

    directory = Path(path)
    directory.mkdir(exist_ok=True, parents=True)
    return path

In [None]:
def cwa_list(input_region):
    """Return the CWA identifiers belonging to a region.

    Parameters
    ----------
    input_region : str
        One of "WR", "CR", "ER", "SR" or "CONUS" (case-insensitive).
        NOTE(review): presumably the NWS Western/Central/Eastern/Southern
        regions -- confirm.

    Returns
    -------
    list of str or numpy.ndarray
        The list stored for a single region, or for "CONUS" a flat numpy
        array holding every region's identifiers in dictionary order.

    Raises
    ------
    KeyError
        If `input_region` is not a known region name.
    """
    region_dict = {
        "WR": ["BYZ", "BOI", "LKN", "EKA", "FGZ", "GGW", "TFX", "VEF", "LOX", "MFR",
               "MSO", "PDT", "PSR", "PIH", "PQR", "REV", "STO", "SLC", "SGX", "MTR",
               "HNX", "SEW", "OTX", "TWC"],

        "CR": ["ABR", "BIS", "CYS", "LOT", "DVN", "BOU", "DMX", "DTX", "DDC", "DLH",
               "FGF", "GLD", "GJT", "GRR", "GRB", "GID", "IND", "JKL", "EAX", "ARX",
               "ILX", "LMK", "MQT", "MKX", "MPX", "LBF", "APX", "IWX", "OAX", "PAH",
               "PUB", "UNR", "RIW", "FSD", "SGF", "LSX", "TOP", "ICT"],

        "ER": ["ALY", "LWX", "BGM", "BOX", "BUF", "BTV", "CAR", "CTP", "RLX", "CHS",
               "ILN", "CLE", "CAE", "GSP", "MHX", "OKX", "PHI", "PBZ", "GYX", "RAH",
               "RNK", "AKQ", "ILM"],

        # "MRX" used to be listed twice here; the duplicate has been removed
        "SR": ["ABQ", "AMA", "FFC", "EWX", "BMX", "BRO", "CRP", "EPZ", "FWD", "HGX",
               "HUN", "JAN", "JAX", "KEY", "LCH", "LZK", "LUB", "MLB", "MEG",
               "MAF", "MFL", "MOB", "MRX", "OHX", "LIX", "OUN", "SJT", "SHV", "TAE",
               "TBW", "TSA"]}

    region_key = input_region.upper()

    if region_key == "CONUS":
        # Flatten every region into a single array
        return np.hstack(list(region_dict.values()))

    try:
        return region_dict[region_key]
    except KeyError:
        valid = ', '.join([*region_dict, 'CONUS'])
        raise KeyError(
            f'Unknown region {input_region!r}; expected one of: {valid}'
        ) from None

In [None]:
def _get_with_retries(api_query, label, max_retries=10, timeout=60):
    """GET `api_query`, retrying with a growing pause until HTTP 200.

    Returns the last response received, or None if every attempt raised a
    requests exception. `label` only tags the progress messages.
    """
    response, status_code, attempts = None, None, 0
    while status_code != 200 and attempts <= max_retries:
        print(f'{label}, HTTP:{status_code}, #:{attempts}')

        # Don't sleep first try, sleep increasing amount for each retry
        time.sleep(2*attempts)

        try:
            response = requests.get(api_query, timeout=timeout)
            status_code = response.status_code
        except requests.RequestException as err:
            # Network trouble counts as a failed attempt rather than a crash
            print(f'{label}, request failed: {err}')
            status_code = None

        attempts += 1

    return response


def fetch_obs_from_API(date, cwa='', output_type='csv', use_saved=True, **req):
    """Fetch station observations for one date and save them to disk.

    The raw API response is cached as JSON under ./obs_json/ and the
    flattened table is written under ./obs_<output_type>/.

    Parameters
    ----------
    date : str
        Start day of the query as 'YYYYMMDD'.
    cwa : str
        Value sent as the `cwa` query parameter; also used in the file names
        when req['region'] is missing or falsy.
    output_type : str
        'csv' or 'pickle'. Any other value still creates the output
        directory but writes no table.
    use_saved : bool
        When True, reuse an existing output file or cached JSON instead of
        querying the API again.
    **req
        Query settings. Keys read: 'element', 'ob_stat', 'days_offset',
        'api', 'network_query', 'obs_start_hour', 'obs_end_hour',
        'vars_query', and optionally 'region' and 'timeout' (seconds per
        request, default 60).

    Returns
    -------
    str or None
        The existing output file path when it is reused, None after a table
        has been processed, or `date` when no usable response was obtained
        (presumably so the caller can collect dates to retry).
    """
    cwa_filename = req.get('region') or cwa
    file_stem = (f'obs.{req["element"]}.{req["ob_stat"]}'
                 f'.{date}.{cwa_filename}')

    output_dir = mkdir_p(f'./obs_{output_type}/')
    output_file = output_dir + file_stem + f'.{output_type}'

    if use_saved and os.path.isfile(output_file):
        return output_file

    json_dir = mkdir_p('./obs_json/')
    json_file = json_dir + file_stem + '.json'

    adjusted_end_date = (datetime.strptime(date, '%Y%m%d') +
                         timedelta(days=req['days_offset'])
                         ).strftime('%Y%m%d')

    if use_saved and os.path.isfile(json_file):
        # Rebuild the table from the cached raw response
        with open(json_file, 'rb') as rfp:
            response_dataframe = pd.json_normalize(json.load(rfp)['STATION'])

    else:
        # Each fragment carries its own leading '&'; order matches the
        # original query string
        api_query = req['api'] + ''.join([
            f'&token={user_token}',
            f'&cwa={cwa}',
            f'&network={req["network_query"]}',
            f'&start={date}{req["obs_start_hour"]}',
            f'&end={adjusted_end_date}{req["obs_end_hour"]}',
            ('&pmode=totals' if req["element"] == 'qpf'
                else f'&vars={req["vars_query"]}'),
            f'&type={req["ob_stat"]}',
            '&obtimezone=utc',
            '&fields=name,status,latitude,longitude,elevation'])

        print(f'Polling API for: {date}\n{api_query}')

        response = _get_with_retries(
            api_query, date, timeout=req.get('timeout', 60))

        if response is None:
            return date

        try:
            response_dataframe = pd.json_normalize(
                response.json()['STATION'])
        except Exception as err:
            # Best effort: an unusable payload marks this date as failed.
            # Unlike a bare except this lets KeyboardInterrupt propagate.
            print(f'{date}, could not parse response: {err!r}')
            return date

        # Only cache payloads that parsed successfully
        with open(json_file, 'wb') as wfp:
            wfp.write(response.content)

    # Check ACTIVE flag; copy so the column assignment below works on an
    # independent frame rather than a slice
    response_dataframe = response_dataframe[
        response_dataframe['STATUS'] == "ACTIVE"].copy()

    # Un-nest the QPF totals
    if req['element'] == 'qpf':
        response_dataframe['TOTAL'] = [i[0]['total']
            for i in response_dataframe['OBSERVATIONS.precipitation']]

    if output_type == 'pickle':
        # Save out df as pickle
        response_dataframe.to_pickle(output_file)

    elif output_type == 'csv':
        # Save out df as csv
        response_dataframe.to_csv(output_file)

    return None

# User Input/Multiprocessing Inputs

In [None]:
# ---- User-editable settings ----
element = 'maxt'
region_selection = 'WR'
cwa_selection = 'SLC'
start_date = '20231101'
end_date = '20231115'

# Lower-case the element name so it matches the lookup-table keys
element = element.lower()

# Swap the 'YYYYMMDD' strings for datetime objects at 00:00 right away
start_date = datetime.strptime(start_date+'0000', '%Y%m%d%H%M')
end_date = datetime.strptime(end_date+'0000', '%Y%m%d%H%M')

# Main/Multiprocessing Call

In [None]:
# Keyword arguments shared by every fetch call for the selected element
synoptic_api_args = dict(
    obs_start_hour=ob_hours[element][0],
    obs_end_hour=ob_hours[element][1],
    ob_stat=stat_type[element],
    api=synoptic_apis[element],
    element=element,
    region=region_selection,
    network_query=synoptic_networks['NWS+RAWS'], # add config feature later
    vars_query=(None if element == 'qpf' else str(synoptic_vars[element])),
    days_offset=(0 if element == 'mint' else 1),
)

# One 'YYYYMMDD' string per day from start_date through end_date inclusive
day_count = (end_date - start_date) // timedelta(days=1)
date_selection_iterable = [
    (start_date + timedelta(days=offset)).strftime('%Y%m%d')
    for offset in range(day_count + 1)]

# Query every CWA of the selected region, or the single CWA when no region
# is selected
if region_selection is not None:
    cwa_query = ','.join(cwa_list(region_selection))
else:
    cwa_query = cwa_selection

# Bind the fixed arguments so only the date has to be supplied per call
multiprocess_function = partial(
    fetch_obs_from_API, cwa=cwa_query, **synoptic_api_args)

In [None]:
# Do the job
# Fetch each date one after another; the value returned by each call is
# discarded here.
# NOTE(review): keep the loop variable named `iter_item` unless it is
# confirmed that fetch_obs_from_API does not depend on a global of that name.
for iter_item in date_selection_iterable:
    multiprocess_function(iter_item)

# Disabled multiprocessing variant, kept for reference.
# NOTE(review): it refers to `multiprocess_iterable`, which is not defined in
# the visible code -- presumably `date_selection_iterable` is meant; confirm
# before re-enabling.
# with Pool(process_pool_size) as pool:
#     print(f'Spooling up process pool for {len(multiprocess_iterable)} tasks '
#           f'across {process_pool_size} workers')

#     retry = pool.map(multiprocess_function, multiprocess_iterable)
#     pool.terminate()

#     print('Multiprocessing Complete')

In [None]:
# Glob together csv files
# Need to filter by variable/region in case of region change or re-run!
import ast

searchstring = (f'*{element}*{region_selection}*.csv'
    if region_selection is not None else f'*{element}*{cwa_selection}*.csv')

# Sorted so the concatenation order is reproducible between runs
csv_files = sorted(glob(os.path.join('./obs_csv/', searchstring)))
if not csv_files:
    raise ValueError(
        f'No saved observation files match ./obs_csv/{searchstring}')

df = pd.concat(map(pd.read_csv, csv_files), ignore_index=True)

if element == 'qpf':
    # Un-nest precipitation observations. The CSV holds the Python repr of
    # the nested records, so parse it as a Python literal; rewriting quotes
    # and parsing as JSON breaks on None/True/False and on apostrophes.
    df_qpf = pd.concat([pd.DataFrame(ast.literal_eval(row))
            for row in df['OBSERVATIONS.precipitation']], ignore_index=True)

    df = df.drop(columns='OBSERVATIONS.precipitation').join(df_qpf)

# Identify the timestamp column (changes with variable); when several
# columns match, the last one wins
time_col = None
for k in df.keys():
    if (('date_time' in k) or ('last_report' in k)):
        time_col = k

if time_col is None:
    raise KeyError(
        "No column containing 'date_time' or 'last_report' was found")

df.rename(columns={time_col:'timestamp'}, inplace=True)
time_col = 'timestamp'

# Convert read strings to datetime object
df[time_col] = pd.to_datetime(df['timestamp']).round('60min')

if element == 'maxt':
    # Attribute to the day prior if UTC <= 06Z otherwise attribute as stamped.
    # Series.mask replaces values where the condition is True; Series.where
    # keeps them there, which shifted exactly the wrong rows.
    df['timestamp'] = df['timestamp'].mask(df['timestamp'].dt.hour <= 6,
                    df['timestamp']-pd.Timedelta(1, unit='D')).dt.date

elif element == 'mint':
    df['timestamp'] = df['timestamp'].dt.date

# Drop any NaNs and sort by date with station as secondary index
df.set_index(['timestamp'], inplace=True)
df = df[df.index.notnull()].reset_index().set_index(['timestamp', 'STID'])
df.sort_index(inplace=True)

# Bare expression so the notebook displays the final table
df