In [None]:
import os
import sys
import csv
import requests
import nbm_funcs

import numpy as np
import pandas as pd
import xarray as xr
import multiprocessing as mp

import seaborn as sns
import scipy.stats as scipy
import urllib.request as req
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

import warnings
warnings.filterwarnings('ignore')

os.environ["OMP_NUM_THREADS"] = "1"

***
***
# Configuration
Select 'site' to evaluate, modify 'vsite' if an alternate verification site is preferred<br>
Fixed 'date0' at the start of the NBM v3.2 period (2/20/2020)<br>
Full lead time is 263 hours. Note: if 'date1' is less than 263 hours before the present, verification data will be missing for the most recent forecasts because those observations do not exist yet!

In [None]:
# Station ID to evaluate; each setting is also mirrored onto nbm_funcs
# (as a module attribute) so the helper functions see the same value.
site = 'KMSO'
nbm_funcs._site = site

# Data range: last lead time and the init hours to include
lead_time_end = 263
init_hours = [13]  # alternative: [1, 7, 13, 19]
nbm_funcs._init_hours = init_hours

# First and last init dates
date0 = datetime(2020, 3, 1)
nbm_funcs._date0 = date0

date1 = datetime(2020, 8, 15)  # alternative: today
nbm_funcs._date1 = date1

In [None]:
# Shared data directory (per-site layout kept below for reference)
# datadir = nbm_funcs._datadir = '../archive/%s/data/'%site
datadir = '../archive/data/'
nbm_funcs._datadir = datadir
os.makedirs(datadir, exist_ok=True)

# Daily sequence from the first to the last init date
dates = pd.date_range(date0, date1, freq='1D')
nbm_funcs._dates = dates

# Last init date plus the full lead time = last verification time
date2 = date1 + timedelta(hours=lead_time_end)
nbm_funcs._date2 = date2

print(
    '\nForecast Site: {}\nInit Hours: {}\nFirst Init: {}\nLast Init: {}\nLast Verif: {}'.format(
        site, init_hours, date0, date1, date2))

***
***
# Obtain observation data from SynopticLabs (MesoWest) API
These are quality-controlled precipitation observations with adjustable accumulation periods<br>
See more at: https://developers.synopticdata.com/mesonet/v2/stations/precipitation/
<br><br>
If no observation file exists, one will be downloaded and saved for future use.

In [None]:
# Get metadata for the select point
meta_base = 'https://api.synopticdata.com/v2/stations/metadata?'

# SECURITY: credentials should not be committed with the notebook. Set the
# SYNOPTIC_API_TOKEN environment variable to supply your own token; the
# previously hard-coded token is kept only as a fallback so existing runs keep
# working, and should be rotated/removed.
api_token = '&token=%s'%os.environ.get(
    'SYNOPTIC_API_TOKEN', 'a2386b75ecbc4c2784db1270695dde73')
meta_site = '&stid=%s&complete=1'%site
url = meta_base + api_token + meta_site
# print(url)

# A timeout keeps a stalled connection from hanging the kernel indefinitely;
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON/KeyError further down.
site_meta_response = requests.get(url, timeout=60)
site_meta_response.raise_for_status()
site_meta_raw = site_meta_response.json()
# print(meta_raw['STATION'][0])

# Fail with a readable message when the API returns no station for this ID
site_stations = site_meta_raw.get('STATION') or []
if not site_stations:
    raise ValueError('No station metadata returned for site %s'%site)

# Forecast zone and County Warning Area codes of the selected station
zone = site_stations[0]['NWSZONE']
cwa = site_stations[0]['CWA']

print('Site: %s\nCWA: %s\nZone: %s'%(site, cwa, zone))

In [None]:
# Get a list of sites in the CWA that report precip
precip_base = 'https://api.synopticdata.com/v2/stations/precip?&complete=1&interval=6'
zone_query = '&nwszone=%s'%zone
cwa_query = '&cwa=%s'%cwa

# Only the first 6 hours after date0 are requested; this call is used for the
# station list, not for the observations themselves.
window_start = date0.strftime('%Y%m%d%H%M')
window_end = (date0+timedelta(hours=6)).strftime('%Y%m%d%H%M')
date_query = '&start=%s&end=%s'%(window_start, window_end)

# We could query for a list of relevant zones within a CWA here
# Then pass a list of zones to the zone query
# !Add later!

# Fix this later! Temp fix to expand the zone for more NWS/FAA/RAWS stations
# Due to 1D Viewer file limitation - Ask Chad for advice?
zone_query = cwa_query

url = precip_base + api_token + zone_query + date_query
zone_meta_raw = requests.get(url).json()

# We need to get back to a zone query - for now this will work with a CWA query
# to only hit sites that exist within a CWA.
#
# Keep only the string-valued metadata fields of each accepted station.
# NOTE(review): '&' binds tighter than '|', so the test below reads
# "NWS, or (RAWS with an STID starting with 'K')". The grouping is spelled out
# explicitly here without changing it - confirm this is the intended filter.
meta = [
    {key: value for key, value in station.items() if type(value) == str}
    for station in zone_meta_raw['STATION']
    if (('NWS' in station['SHORTNAME'])
        | (('RAWS' in station['SHORTNAME']) & (station['STID'][0] == 'K')))
]

meta = pd.DataFrame(meta).set_index('STID')

meta.shape

In [None]:
# Convert every metadata column that parses as numeric to float and leave the
# rest (names, IDs, ...) untouched. Only the conversion errors are caught: a
# bare `except:` would also swallow KeyboardInterrupt and unrelated bugs.
for k in meta:
    try:
        meta[k] = meta[k].astype(float)
    except (ValueError, TypeError):
        # Column holds non-numeric text; keep it as-is
        pass

<hr><hr>

## Plot a map of the stations, color by elevation, marker by network

In [None]:
import geopandas as gpd
from glob import glob

# Directory searched for the forecast-zone shapefile; the first '*.shp' match is used.
# NOTE(review): glob(...)[0] raises IndexError when no shapefile is present in geodir.
geodir = '../forecast-zones/'
zones_shapefile = glob(geodir + '*.shp')[0]

# Read the shapefile
zones = gpd.read_file(zones_shapefile)
# Prune to Western Region using TZ
# NOTE(review): .loc with a list of labels may raise KeyError if any of these
# TIME_ZONE codes is absent from the file (depends on the pandas version) - confirm.
zones = zones.set_index('TIME_ZONE').loc[['M', 'Mm', 'm', 'MP', 'P']].reset_index()
# Keep only the zones whose CWA matches the CWA of the selected site
zones = zones[zones['CWA'] == cwa]

# zones.to_file(geodir + 'forecast-zones.json', driver = 'GeoJSON')
zones

In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Map of the CWA forecast zones with the selected stations colored by elevation
fig, ax = plt.subplots(1, figsize=(16, 16), facecolor='w')
ax.set_title('CWA: %s'%cwa)

zones.plot(column='NAME', color='0.9', edgecolor='0.25', ax=ax, zorder=10)

cbd = ax.scatter(meta['LONGITUDE'], meta['LATITUDE'], c=meta['ELEVATION'], cmap='gist_earth',
                 s=150, marker='o', edgecolor='k', linewidth=1.5, zorder=20,)

divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="3%", pad=-1.5)
plt.colorbar(cbd, cax=cax)
ax.grid(True, zorder=-10)

# Derive the map extent from the data instead of hard-coding it: the previous
# fixed limits (lat 46.3-49.1, lon -124.9 to -120.6) only suit one region and
# leave the map empty for any CWA outside of it (including the configured KMSO).
# total_bounds is (minx, miny, maxx, maxy) and is all-NaN for an empty frame,
# hence the nan-aware min/max and the finiteness guard.
zone_bounds = zones.total_bounds
lon_lo = np.nanmin([zone_bounds[0], meta['LONGITUDE'].min()])
lat_lo = np.nanmin([zone_bounds[1], meta['LATITUDE'].min()])
lon_hi = np.nanmax([zone_bounds[2], meta['LONGITUDE'].max()])
lat_hi = np.nanmax([zone_bounds[3], meta['LATITUDE'].max()])

if np.all(np.isfinite([lon_lo, lat_lo, lon_hi, lat_hi])):
    map_pad = 0.1  # degrees of margin around the outermost zone/station
    ax.set_xlim(left=lon_lo - map_pad, right=lon_hi + map_pad)
    ax.set_ylim(bottom=lat_lo - map_pad, top=lat_hi + map_pad)

# plt.show() rather than fig.show(): the latter needs a GUI backend and only
# emits a warning under the inline notebook backend.
plt.show()

<hr><hr>

In [None]:
# Single-process
ob_files = [nbm_funcs.get_precip_obs_mp(i) for i in meta.index.values]
ob_files = [f for f in ob_files if f is not None]

# Multi-process (needs fixing...)
# with mp.get_context('fork').Pool(mp.cpu_count()) as p:    
#     ob_files = p.map_async(nbm_funcs.get_precip_obs_mp, meta.index.values, chunksize=1)
#     ob_files.wait()
# ob_files = [f for f in ob_files.get() if f is not None]

if not ob_files:
    # pd.concat([]) would fail below with a much less helpful message
    raise ValueError('No observation files were obtained for any station')

# Read each station file and tag its rows with the station ID so that all
# stations can be stacked on a (ValidTime, Site) index.
obs = []
for file in ob_files:
    # Station ID is the part of the file name before the first '_'.
    # A dedicated name is used so the configured `site` is not overwritten.
    stid = file.split('/')[-1].split('_')[0]
    # NOTE(review): pickle files can execute code when loaded; these are
    # presumably written by nbm_funcs.get_precip_obs_mp - only load trusted files.
    iobs = pd.read_pickle(file)
    iobs['Site'] = np.full(iobs.index.size, fill_value=stid, dtype='U10')
    iobs = iobs.reset_index().set_index(['ValidTime', 'Site'])
    obs.append(iobs)

obs = pd.concat(obs).sort_index()

# Convert mm to in, and relabel the columns to match
mm_in = 1/25.4
obs *= mm_in
obs = obs.rename(columns=lambda k: k.replace('mm', 'in'))

# OPTIONAL! Drop NaN rows... may help elim lower qual data
# obs = obs.dropna(how='all')

sites = np.unique(obs.index.get_level_values(1))
print(sites)

print(obs.shape)
obs.head(10)

***
***
# Obtain NBM forecast data from NBM 1D Viewer (csv file API)
These are the NBM 1D output files extracted from the viewer with 3 set accumulation periods<br>
See more at: https://hwp-viz.gsd.esrl.noaa.gov/wave1d/?location=KSLC&col=2&hgt=1&obs=true&fontsize=1&selectedgroup=Default
<br><br>
If no forecast file exists, one will be downloaded and saved for future use. This can take some time.

In [None]:
# Fetch (or locate) one NBM file per station in parallel worker processes
with mp.get_context('fork').Pool(mp.cpu_count()) as p:
    nbm_files = p.map_async(nbm_funcs.get_nbm_1d_mp, meta.index, chunksize=1)
    nbm_files.wait()

nbm_files = [f for f in nbm_files.get() if f is not None]

# Read each station file and tag its rows with the station ID so that all
# stations can be stacked on an (InitTime, ValidTime, Site) index.
nbm = []
for file in nbm_files:
    # Station ID is the part of the file name before the first '_'.
    # A dedicated name is used so the configured `site` is not overwritten.
    stid = file.split('/')[-1].split('_')[0]
    # NOTE(review): pickle files can execute code when loaded; these are
    # presumably written by nbm_funcs.get_nbm_1d_mp - only load trusted files.
    inbm = pd.read_pickle(file)
    inbm['Site'] = np.full(inbm.index.size, fill_value=stid, dtype='U10')
    inbm = inbm.reset_index().set_index(['InitTime', 'ValidTime', 'Site'])
    nbm.append(inbm)

nbm = pd.concat(nbm).sort_index()

# Convert mm to in
nbm *= mm_in

# Lead time in whole hours: (ValidTime - InitTime) plus one hour
lead = [row[1]-row[0] for row in nbm.index]
lead = np.array([1 + row.days*24 + (row.seconds/3600) for row in lead], dtype=int)
nbm.insert(0, 'LeadTime', lead)

# Nix values where lead time shorter than acc interval.
# Checked in the same order as before (24, 12, 6). The 6-h key previously read
# 'APCP6r' and therefore never matched any column. Assignment goes through
# .loc because chained indexing (nbm[k][mask] = ...) may write to a temporary
# copy and silently leave nbm unchanged.
min_lead_by_key = {'APCP24hr': 24, 'APCP12hr': 12, 'APCP6hr': 6}
for k in [k for k in nbm.keys() if k != 'LeadTime']:
    for acc_key, min_lead in min_lead_by_key.items():
        if acc_key in k:
            nbm.loc[nbm['LeadTime'] < min_lead, k] = np.nan
            break

nbm = nbm.dropna(subset=[k for k in nbm.keys() if k != 'LeadTime'], how='all')
nbm[25:50]

In [None]:
# Display some basic stats
# Deterministic, 70% and 50% columns for each of the 6/12/24-h accumulations
stat_columns = ['APCP%dhr_surface%s'%(hours, suffix)
                for hours in (6, 12, 24)
                for suffix in ('', '_70% level', '_50% level')]

nbm.loc[:, stat_columns].describe().T

#### Plot the distribution of precipitation observations vs forecasts for assessment of representativeness

In [None]:
# Label fix:
# From here on `site` (and nbm_funcs._site) no longer holds a station ID but a
# label built from the station query string: '&' removed, '=' replaced by '_',
# upper-cased (e.g. '&cwa=xyz' -> 'CWA_XYZ').
site = nbm_funcs._site = zone_query.replace('&', '').replace('=', '_').upper()

# Figure output directory for that label, created if missing.
# NOTE(review): the path contains a doubled '/' after 'archive' - presumably unintended.
figdir = nbm_funcs._figdir = '../archive//%s/figures/'%site
os.makedirs(figdir, exist_ok=True)

In [None]:
# For each event-size class: the pair of positions (lower, upper) used to pick
# its bounds out of an interval's threshold sequence
thresh_id = {
    'Small': [0, 1],
    'Medium': [1, 2],
    'Large': [2, 3],
    'All': [0, 3],
}
nbm_funcs._thresh_id = thresh_id

# 33rd, 67th percentile determined above
thresholds = dict(
    (hours, nbm_funcs.apcp_dist_plot(obs, nbm, hours, show=True))
    for hours in [6, 12, 24])
nbm_funcs._thresholds = thresholds

thresholds

***
***
# Reorganize the data for analysis:
#### Isolate the forecasts by accumulation interval and lead time

In [None]:
# Build (or load from cache) one row per matched forecast/observation pair:
# for every accumulation interval, lead time, event-size class, valid time and
# site, the observed amount is ranked within the forecast percentile columns.
plist = np.arange(1, 100)

matchfile = datadir + 'CWA_%s_%s_%s_verifMatched.NPremoved.pd'%(
# matchfile = datadir + 'CWA_%s_%s_%s_verifMatched.pd'%(
    cwa, date0.strftime('%Y%m%d'), date1.strftime('%Y%m%d'))

if os.path.isfile(matchfile):
    # NOTE(review): pickle files can execute code when loaded - only load
    # cache files that this notebook wrote itself.
    data = pd.read_pickle(matchfile)

else:
    # The 'NPremoved' tag in the cache file name doubles as the switch for
    # treating missing observations as zero precipitation.
    np_removed = 'NPremoved' in matchfile

    data = []
    for interval in [6, 12, 24]:

        # Observation column for this accumulation interval
        ob_key = '%dh_precip_in'%interval

        # Percentile forecast columns of this interval, ordered by percentile
        pkeys = np.array([k for k in nbm.keys() if '%dhr_'%interval in k])
        pkeys = np.array([k for k in pkeys if '%' in k])
        pkeys = pkeys[np.argsort([int(k.split('_')[-1].split('%')[0]) for k in pkeys])]

        for lead_time in np.arange(interval, lead_time_end, 6):

            for esize in ['Small', 'Medium', 'Large', 'NP']:

                try:
                    thresh = [thresholds[interval][thresh_id[esize][0]], 
                              thresholds[interval][thresh_id[esize][1]]]
                except (KeyError, IndexError, TypeError):
                    # No bounds available for this class (e.g. 'NP' has no
                    # entry in thresh_id); 'NP' is selected by iobs == 0 below.
                    thresh = [0, 0]

                print('\rProcessing interval %d lead %dh'%(interval, lead_time), end='')

                # We need to break out the verification to each lead time,
                # but within each lead time we have a number of valid times.
                # At each lead time, valid time, isolate the forecast verification

                # Combine the datasets to make it easier to work with
                idata = nbm[nbm['LeadTime'] == lead_time].merge(obs, on=['ValidTime', 'Site']).drop(columns='LeadTime')

                # Subset for event size using the observed precip
                iobs = idata[ob_key]
                iobs = iobs.replace(np.nan, 0.) if np_removed else iobs

                # Do the trimming of the selected dataset
                if esize != 'NP':
                    idata = idata[((iobs >= thresh[0]) & (iobs < thresh[1]))]
                else:
                    idata = idata[iobs == 0]

                del iobs

                idata[ob_key] = (idata[ob_key].replace(np.nan, 0) 
                    if np_removed else idata[ob_key])

                for itime in idata.index:

                    try:
                        prob_fx = idata.loc[itime, pkeys]
                        mean_fx = np.nanmean(prob_fx)
                        std_fx = np.nanstd(prob_fx)
                        med_fx = idata.loc[itime, 'APCP%dhr_surface_50%% level'%interval]
                        det_fx = idata.loc[itime, 'APCP%dhr_surface'%interval]

                        # Optional - leave as nan?
                        # `not` rather than `~`: `~` is bitwise and is only a
                        # logical negation for NumPy booleans.
                        det_fx = det_fx if not np.isnan(det_fx) else 0.

                        verif_ob = idata.loc[itime, ob_key]

                        # Position of the observation within the (assumed
                        # ascending) percentile forecasts
                        verif_rank = np.searchsorted(prob_fx, verif_ob, 'right')
                        # Explicit positional lookup: prob_fx is labeled by
                        # column name, and integer keys on such a Series are
                        # deprecated as positions in recent pandas.
                        # NOTE(review): when verif_rank is 0 this wraps around
                        # to the LAST (highest) percentile value, which is then
                        # stored as verif_rank_val / used in verif_rank_error.
                        # Left unchanged - confirm whether that is intended.
                        verif_rank_val = prob_fx.iloc[verif_rank-1]
                        verif_rank_error = verif_rank_val - verif_ob

                        # Flag observations beyond either end of the percentile range
                        verif_rank = 101 if ((verif_rank >= 99) & (verif_ob > verif_rank_val)) else verif_rank
                        verif_rank = -1 if ((verif_rank <= 1) & (verif_ob < verif_rank_val)) else verif_rank

                        det_rank = np.searchsorted(prob_fx, det_fx, 'right')
                        det_error = det_fx - verif_ob

                    except Exception:
                        # Best effort: skip rows that cannot be processed.
                        # `Exception` (not a bare except) so that interrupting
                        # this long loop from the keyboard still works.
                        pass
                        # print('failed', itime)

                    else:
                        if not np.isnan(verif_rank_val):

                            data.append([
                                # Indexers
                                interval, lead_time, itime[0], itime[1], esize,

                                # Verification and deterministic
                                verif_ob, det_fx, det_rank, det_error,

                                # Probabilistic
                                verif_rank, verif_rank_val, verif_rank_error, 
                                med_fx, mean_fx, std_fx])

    data = pd.DataFrame(data, columns=['Interval', 'LeadTime', 'ValidTime', 'Site', 'EventSize',
                    'verif_ob', 'det_fx', 'det_rank', 'det_error',
                    'verif_rank', 'verif_rank_val', 'verif_rank_error', 
                    'med_fx', 'mean_fx', 'std_fx'])

    # Cache the result so later runs can skip the matching loop
    data.to_pickle(matchfile)
    
print('\n\nAvailable keys:\n\t\t{}\nn rows: {}\n'.format('\n\t\t'.join(data.keys()), len(data)))

data

In [None]:
print('NPEs')

# Rows of the 'NP' event class and their deterministic forecast amounts
np_events = data['EventSize'] == 'NP'
np_det_fx = data.loc[np_events, 'det_fx']

fig = plt.figure(facecolor='w', figsize=(10, 6))

# Bins start at 0.01, so forecasts below 0.01 are not counted
plt.hist(np_det_fx, bins=np.arange(0.01, 2.1, .01), edgecolor='k')

plt.xlim(left=0)
plt.ylim(top=8000)
plt.grid()
plt.title('CWA: %s\nNon-Precipitating Forecast Events\n%s'%(cwa, ''))

plt.show()

# Summary of the non-zero deterministic forecasts among the 'NP' rows
data.loc[np_events & (data['det_fx'] > 0.), ['det_fx']].describe().T

***
***
# Create Bulk Temporal Stats Plots
#### Reliability diagrams, bias over time, rank over time, etc.

In [None]:
# Selection handed to the plotting helper (short/long presumably bound the
# lead-time window - confirm against nbm_funcs)
short = 0
long = 120
plot_type, plot_var = 'Verification', 'verif_rank'
esize = 'All'

# One figure per accumulation interval
for interval in (6, 12, 24):

    kwargs = dict(_interval=interval, _esize=esize,
                  _short=short, _long=long,
                  _plot_type=plot_type, _plot_var=plot_var)

    nbm_funcs.histograms_verif_rank(data, **kwargs, show=True)

#### Plot a reliability diagram style CDF to evaluate percentile rankings

In [None]:
# Selection handed to the plotting helper (short/long presumably bound the
# lead-time window - confirm against nbm_funcs)
short = 0
long = 120
plot_type, plot_var = 'Verification', 'verif_rank'
esize = 'All'

# One figure per accumulation interval
for interval in (6, 12, 24):

    kwargs = dict(_interval=interval, _esize=esize,
                  _short=short, _long=long,
                  _plot_type=plot_type, _plot_var=plot_var)

    nbm_funcs.reliability_verif_cdf_multistation(data, **kwargs, show=True)

#### Produce bias, ME, MAE, and percentile rank plots as they evolve over time
This helps illustrate at what leads a dry/wet bias may exist and how severe it may be<br>
Adds value in interpreting the CDF reliability diagrams

In [None]:
# Selection handed to the plotting helper (short/long presumably bound the
# lead-time window - confirm against nbm_funcs)
short = 0
long = 120
esize = 'All'

# One figure per accumulation interval
for interval in (6, 12, 24):

    kwargs = dict(_interval=interval, _esize=esize,
                  _short=short, _long=long)

    nbm_funcs.rank_over_leadtime(data, **kwargs, show=True)

In [None]:
# Create maps of rank at each station over the CWA with mean verif and mean det