### Run in Google Colab (open in a new window or tab)
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m-wessler/nbm-verify/blob/master/notebooks/verify_1Dqpf_dev.ipynb)

In [1]:
import os
import csv
import urllib.request as req

import scipy.stats as scipy
import numpy as np
import pandas as pd
import xarray as xr

import seaborn as sns
import matplotlib.pyplot as plt

from nbm_funcs import *
from datetime import datetime, timedelta

import warnings
warnings.filterwarnings('ignore')

***
***
# Configuration
Select the 'site' to evaluate; modify 'vsite' if an alternate verification site is preferred.<br>
'date0' is fixed at the start of the NBM v3.2 period (2/20/2020).<br>
The full lead time is 263 hours. Note: if fewer than 263 hours have passed since 'date1', some verification data will be missing because it does not exist yet!

In [10]:
# --- Sites ---
# 'site' is the NBM 1D Viewer station to evaluate; 'vsite' is the station
# used for verification and defaults to the same station.
site = 'KSEA'
vsite = site

# --- Forecast sampling ---
# Initialization hours requested for each date, and the lead time in hours
init_hours = [1, 7, 13, 19]
lead_time = 263

# --- First and last initialization dates ---
date0, date1 = datetime(2020, 2, 20), datetime(2020, 7, 8)

In [11]:
# Folder name for outputs: the site alone, or 'site_vsite' when they differ
if site == vsite:
    sitepath = site
else:
    sitepath = '_'.join([site, vsite])

# Data and figure directories live side by side under the site folder;
# create whichever of them does not exist yet
datadir = './%s/data/' % sitepath
figdir = './%s/figures/' % sitepath
for _outdir in (datadir, figdir):
    os.makedirs(_outdir, exist_ok=True)

# One entry per initialization day; date2 is the last init date plus the
# full lead time
dates = pd.date_range(date0, date1, freq='1D')
date2 = date1 + timedelta(hours=lead_time)

# Echo the configuration so it is stored with the notebook output
print(
    ('\nForecast Site: {}\nVerif Site: {}\nInit Hours: '
     '{}\nFirst Init: {}\nLast Init: {}\nLast Verif: {}').format(
        site, vsite, init_hours, date0, date1, date2))


Forecast Site: KSEA
Verif Site: KSEA
Init Hours: [1, 7, 13, 19]
First Init: 2020-02-20 00:00:00
Last Init: 2020-07-08 00:00:00
Last Verif: 2020-07-18 23:00:00


***
***
# Obtain observation data from SynopticLabs (MesoWest) API
These are quality-controlled precipitation observations with adjustable accumulation periods<br>
See more at: https://developers.synopticdata.com/mesonet/v2/stations/precipitation/
<br><br>
If no cached observation file exists, the observations are downloaded and saved for future use.

In [23]:
# Cache file for the observations, named by forecast site and init date range
obfile = datadir + '%s_obs_%s_%s.pd'%(site, date0.strftime('%Y%m%d'), date1.strftime('%Y%m%d'))

if os.path.isfile(obfile):
    # Load the cached file.
    # NOTE: unpickling can execute arbitrary code - only load cache files
    # that were written by this notebook.
    obs = pd.read_pickle(obfile)
    print('\nLoaded obs from file %s\n'%obfile)

else:
    # Get and save file. Observations are requested for the verification
    # site through date2 (last init + lead time).
    # The indexing below expects get_precip_obs to return three frames that
    # share a 'ValidTime' key - presumably one per accumulation period
    # (TODO confirm against nbm_funcs).
    obs = get_precip_obs(vsite, date0, date2)
    obs = obs[0].merge(obs[1], how='inner', on='ValidTime').merge(obs[2], how='inner', on='ValidTime')

    # Keep only the precipitation columns, ordered by index
    obs = obs[[k for k in obs.keys() if 'precip' in k]].sort_index()

    obs.to_pickle(obfile)
    print('\nSaved obs to file %s\n'%obfile)

# Scale the values by mm -> in and relabel the columns to match.
# A single rename replaces the former per-column inplace renames.
mm_in = 1/25.4
obs *= mm_in
obs = obs.rename(columns=lambda k: k.replace('mm', 'in'))

obs.describe().T


Loaded obs from file ./KSEA/data/KSEA_obs_20200220_20200708.pd



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
6h_precip_in,569.0,0.018369,0.065936,0.0,0.0,0.0,0.0,0.64
12h_precip_in,569.0,0.03633,0.106446,0.0,0.0,0.0,0.01,0.78
24h_precip_in,568.0,0.074192,0.16946,0.0,0.0,0.0,0.053199,1.251969


#### Plot the distribution of precipitation observations for assessment

In [141]:
plt.rcParams.update({'font.size': 14})
binsize = 0.05

# 33rd/67th percentiles of the observations above 0.01 in, per interval
thresholds = {}
for interval in [6, 12, 24]:

    # Work on a copy: masking the array returned by .values in place would
    # also overwrite the values stored in `obs` (and fails on the read-only
    # arrays returned under pandas Copy-on-Write)
    iobs = obs['%dh_precip_in'%interval].values.copy()
    iobs[iobs <= 0.01] = np.nan

    # Computed once and reused for the bins, ticks, limits and legend
    iobs_max = np.nanmax(iobs)
    thresholds[interval] = np.nanpercentile(iobs, (33, 67))

    fig, ax = plt.subplots(1, 1, figsize=(10, 6), facecolor='w')

    # Binned distribution on the left axis
    ax.hist(iobs, bins=np.arange(0, iobs_max, binsize),
            edgecolor='k', density=True, color='gray', alpha=0.75,
            label='PDF (%.2f in bins)'%binsize)

    # Cumulative distribution on a twin axis, using very fine bins so the
    # step line looks continuous
    axx = ax.twinx()
    axx.hist(iobs, bins=np.arange(0, iobs_max, 0.00001),
            density=True, cumulative=True, histtype='step', edgecolor='k', linewidth=2.5)
    # Single-point line that only serves as the legend entry for the CDF
    axx.plot(0, linewidth=2.5, color='k', label='CDF (Continuous)')

    # Mark the percentile thresholds; the values are in inches, so the
    # label unit is 'in' (it previously read 'mm')
    for p, c, pval in zip([33, 67], ['g', 'r'], thresholds[interval]):
        ax.axvline(pval, color=c, linewidth=3, zorder=-1,
                   label='%dth Percentile: %.2f in'%(p, pval))

    xticks = np.arange(0, iobs_max+1, binsize)
    ax.set_xticks(xticks)
    ax.set_xticklabels(['%.2f'%v for v in xticks], rotation=45)

    axx.set_ylabel('\nCumulative [%]')
    axx.set_yticks([0, .2, .4, .6, .8, 1.0])
    axx.set_yticklabels([0, 20, 40, 60, 80, 100])
    axx.set_ylim([0, 1.01])

    ax.set_xlim([0, iobs_max-0.05])

    ax.set_xlabel('\n%dh Observed Accumulated Precipitation [in]'%interval)
    ax.set_ylabel('Frequency [%]\n')
    ax.set_title('%s\n%dh Observed Accumulated Precipitation\nNBM v3.2 Period %s – %s\n'%(
        site, interval, date0.strftime('%Y-%m-%d'), date2.strftime('%Y-%m-%d')))

    ax.grid(True)

    # Merge the legend entries of both axes into one box
    lines, labels = ax.get_legend_handles_labels()
    lines2, labels2 = axx.get_legend_handles_labels()
    axx.legend(lines + lines2, labels + labels2, loc='center right')

    savestr = '{}_{}h.observedAPCP.png'.format(site, interval)
    print(savestr)
    plt.tight_layout()
    plt.savefig(figdir + savestr, dpi=150)

    # Figures are only written to disk; close each one to release memory
    # (use plt.show() here instead to display inline)
    plt.close(fig)

KSEA_6h.observedAPCP.png
KSEA_12h.observedAPCP.png
KSEA_24h.observedAPCP.png


***
***
# Obtain NBM forecast data from NBM 1D Viewer (csv file API)
These are the NBM 1D output files extracted from the viewer with 3 set accumulation periods<br>
See more at: https://hwp-viz.gsd.esrl.noaa.gov/wave1d/?location=KSLC&col=2&hgt=1&obs=true&fontsize=1&selectedgroup=Default
<br><br>
If no cached forecast file exists, the forecasts are downloaded and saved for future use. This can take some time.

In [187]:
# Cache file for the NBM forecasts, named by site and init date range
nbmfile = datadir + '%s_nbm_%s_%s.pd'%(site, date0.strftime('%Y%m%d'), date1.strftime('%Y%m%d'))

if os.path.isfile(nbmfile):
    # Load file.
    # NOTE: unpickling can execute arbitrary code - only load cache files
    # that were written by this notebook.
    nbm = pd.read_pickle(nbmfile)
    print('Loaded NBM from file %s'%nbmfile)

else:
    # Build one csv URL per (init date, init hour) pair
    url_list = []
    for date in dates:
        for init_hour in init_hours:
            # For now pull from the csv generator
            # Best to get API access or store locally later
            base = 'https://hwp-viz.gsd.esrl.noaa.gov/wave1d/data/archive/'
            datestr = '{:04d}/{:02d}/{:02d}'.format(date.year, date.month, date.day)
            sitestr = '/NBM/{:02d}/{:s}.csv'.format(init_hour, site)
            url_list.append([date, init_hour, base + datestr + sitestr])

    # Try multiprocessing this for speed?
    # Entries that come back as None are discarded
    nbm = np.array([get_1d_csv(url, this=i+1, total=len(url_list)) for i, url in enumerate(url_list)])
    nbm = np.array([line for line in nbm if line is not None])

    # Column names are taken from the first retrieved file
    header = nbm[0, 0]

    # This drops days with incomplete collections. There may be some use
    # to keeping this data, can fix in the future if need be
    # May also want to make the 100 value flexible!
    nbm = np.array([np.array(line[1]) for line in nbm if len(line[1]) == 100])

    # Stack all runs into one 2D table; empty strings become NaN
    nbm = nbm.reshape(-1, nbm.shape[-1])
    nbm[np.where(nbm == '')] = np.nan

    # Aggregate to a clean dataframe
    nbm = pd.DataFrame(nbm, columns=header).set_index(
        ['InitTime', 'ValidTime']).sort_index()

    # Drop trailing columns (misc metadata?)
    # NOTE(review): this comment originally said "last column", but the slice
    # removes the last TWO columns - confirm which is intended
    nbm = nbm.iloc[:, :-2].astype(float)
    header = nbm.columns

    # variables = np.unique([k.split('_')[0] for k in header])
    # levels = np.unique([k.split('_')[1] for k in header])

    init =  nbm.index.get_level_values(0)
    valid = nbm.index.get_level_values(1)

    # Note the 1h 'fudge factor' in the lead time here
    lead = pd.DataFrame(
        np.transpose([init, valid, ((valid - init).values/3600/1e9).astype(int)+1]), 
        columns=['InitTime', 'ValidTime', 'LeadTime']).set_index(['InitTime', 'ValidTime'])

    nbm.insert(0, 'LeadTime', lead)

    # Keep LeadTime plus the APCP columns, excluding the 1hr ones
    klist = np.array([k for k in np.unique([k for k in list(nbm.keys())]) if ('APCP' in k)&('1hr' not in k)])
    klist = klist[np.argsort(klist)]
    klist = np.append('LeadTime', klist)
    nbm = nbm.loc[:, klist]

    # Nix values where lead time shorter than acc interval.
    # .loc is used instead of chained indexing (nbm[k][mask] = ...) so the
    # assignment is guaranteed to be written into nbm itself
    for k in nbm.keys():
        if 'APCP24hr' in k:
            nbm.loc[nbm['LeadTime'] < 24, k] = np.nan
        elif 'APCP12hr' in k:
            nbm.loc[nbm['LeadTime'] < 12, k] = np.nan
        elif 'APCP6hr' in k:
            nbm.loc[nbm['LeadTime'] < 6, k] = np.nan
        else:
            pass

    nbm.to_pickle(nbmfile)
    # Report the NBM cache path (this message previously printed obfile)
    print('\nSaved NBM to file %s'%nbmfile)

# Convert mm to in
# NOTE(review): every column except LeadTime is scaled by mm_in. If the APCP
# columns include probability fields (e.g. names containing '>'), those are
# scaled too - confirm that is intended.
nbm = pd.DataFrame([nbm['LeadTime']] + [nbm[k] * mm_in for k in nbm.keys() if 'LeadTime' not in k]).T

# Display some basic stats
nbm.loc[:, ['APCP6hr_surface', 'APCP6hr_surface_70% level', 'APCP6hr_surface_50% level',
            'APCP12hr_surface', 'APCP12hr_surface_70% level', 'APCP12hr_surface_50% level',
            'APCP24hr_surface', 'APCP24hr_surface_70% level', 'APCP24hr_surface_50% level'
            ]].describe().T

Loaded NBM from file ./KSEA/data/KSEA_nbm_20200220_20200708.pd


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
APCP6hr_surface,21062.0,0.014014,0.037579,0.0,0.0,0.0,0.0,0.55
APCP6hr_surface_70% level,21472.0,0.014956,0.03777,0.0,0.0,0.0,0.008307,0.424961
APCP6hr_surface_50% level,21472.0,0.004676,0.020427,0.0,0.0,0.0,0.0,0.34252
APCP12hr_surface,20984.0,0.035931,0.075421,0.0,0.0,0.0,0.033553,0.686614
APCP12hr_surface_70% level,20984.0,0.037218,0.070633,0.0,0.0,0.000827,0.044656,0.676181
APCP12hr_surface_50% level,20984.0,0.013878,0.040987,0.0,0.0,0.0,0.001654,0.490551
APCP24hr_surface,9736.0,0.075651,0.127154,0.0,0.0,0.004587,0.104469,0.806142
APCP24hr_surface_70% level,9736.0,0.091582,0.127745,0.0,0.0,0.033071,0.141663,0.818543
APCP24hr_surface_50% level,9736.0,0.04492,0.085533,0.0,0.0,0.000354,0.052224,0.630118


#### Plot the distribution of precipitation forecasts for assessment

In [186]:
plt.rcParams.update({'font.size': 14})
binsize = 0.05

# 33rd/67th percentiles of the deterministic forecasts above 0.01 in.
# NOTE(review): this rebinds `thresholds`, so percentiles stored under that
# name by an earlier cell are replaced by these forecast-based values -
# confirm which set later cells expect.
thresholds = {}
for interval in [6, 12, 24]:

    # `iobs` holds forecast values here. Work on a copy: masking the array
    # returned by .values in place would also overwrite the values stored in
    # `nbm` (and fails on the read-only arrays returned under pandas
    # Copy-on-Write)
    iobs = nbm['APCP%dhr_surface'%interval].values.copy()
    iobs[iobs <= 0.01] = np.nan

    # Computed once and reused for the bins, ticks, limits and legend
    iobs_max = np.nanmax(iobs)
    thresholds[interval] = np.nanpercentile(iobs, (33, 67))

    fig, ax = plt.subplots(1, 1, figsize=(10, 6), facecolor='w')

    # Binned distribution on the left axis
    ax.hist(iobs, bins=np.arange(0, iobs_max, binsize),
            edgecolor='k', density=True, color='gray', alpha=0.75,
            label='PDF (%.2f in bins)'%binsize)

    # Cumulative distribution on a twin axis, using very fine bins so the
    # step line looks continuous
    axx = ax.twinx()
    axx.hist(iobs, bins=np.arange(0, iobs_max, 0.00001),
            density=True, cumulative=True, histtype='step', edgecolor='k', linewidth=2.5)
    # Single-point line that only serves as the legend entry for the CDF
    axx.plot(0, linewidth=2.5, color='k', label='CDF (Continuous)')

    # Mark the percentile thresholds; the values are in inches, so the
    # label unit is 'in' (it previously read 'mm')
    for p, c, pval in zip([33, 67], ['g', 'r'], thresholds[interval]):
        ax.axvline(pval, color=c, linewidth=3, zorder=-1,
                   label='%dth Percentile: %.2f in'%(p, pval))

    xticks = np.arange(0, iobs_max+1, binsize)
    ax.set_xticks(xticks)
    ax.set_xticklabels(['%.2f'%v for v in xticks], rotation=45)

    axx.set_ylabel('\nCumulative [%]')
    axx.set_yticks([0, .2, .4, .6, .8, 1.0])
    axx.set_yticklabels([0, 20, 40, 60, 80, 100])
    axx.set_ylim([0, 1.01])

    ax.set_xlim([0, iobs_max-0.05])

    ax.set_xlabel('\n%dh Forecast Precipitation [in]'%interval)
    ax.set_ylabel('Frequency [%]\n')
    ax.set_title('%s\n%dh Forecast Precipitation\nNBM v3.2 Period %s – %s\n'%(
        site, interval, date0.strftime('%Y-%m-%d'), date2.strftime('%Y-%m-%d')))

    ax.grid(True)

    # Merge the legend entries of both axes into one box
    lines, labels = ax.get_legend_handles_labels()
    lines2, labels2 = axx.get_legend_handles_labels()
    axx.legend(lines + lines2, labels + labels2, loc='center right')

    savestr = '{}_{}h.forecastAPCP.png'.format(site, interval)
    print(savestr)
    plt.tight_layout()
    plt.savefig(figdir + savestr, dpi=150)

    # Figures are only written to disk; close each one to release memory
    # (use plt.show() here instead to display inline)
    plt.close(fig)

KSEA_6h.forecastAPCP.png
KSEA_12h.forecastAPCP.png
KSEA_24h.forecastAPCP.png


***
***
# Reorganize the data for analysis:
#### Isolate the forecasts by accumulation interval and lead time

In [211]:
# Lead times present in the forecast table, restricted to multiples of 6
lead_times = np.unique(nbm['LeadTime']).astype(int)
lead_times = np.array([lt for lt in lead_times if lt%6 == 0])

intervals = [6, 12, 24]

plist = np.arange(1, 100)

# Deterministic forecasts
nbm_det = {}
# Percentile forecasts
nbm_perc = {}
# Probability of exceedance
nbm_probex = {}

# The rows belonging to a lead time do not depend on the accumulation
# interval, so filter them once instead of once per (interval, lead time)
nbm_by_lead = {lt: nbm[nbm['LeadTime'] == lt] for lt in lead_times}

for fi in intervals:

    nbm_det[fi] = {}
    nbm_perc[fi] = {}
    nbm_probex[fi] = {}

    # Column selection depends only on the interval, so it is done here
    # rather than inside the lead-time loop
    ikeys = [k for k in nbm.keys() if '%shr'%fi in k]

    # Deterministic columns carry neither a '%' nor a '>' in their name;
    # probability-of-exceedance columns carry a '>'.
    # NOTE(review): these two key lists are built but not used in this cell,
    # so nbm_det[fi] and nbm_probex[fi] are left as empty dicts here.
    ikeys_det = np.array([k for k in ikeys if ('%' not in k) and ('>' not in k)])
    ikeys_probex = np.array([k for k in ikeys if '>' in k])

    # Percentile columns, ordered by the integer that precedes '%' in the
    # last '_'-separated part of the column name
    ikeys_perc = np.array([k for k in ikeys if '%' in k])
    ikeys_perc_int = np.array([k.split('_')[-1].split('%')[0] for k in ikeys_perc]).astype(int)
    psort = np.argsort(ikeys_perc_int)
    perc_cols = ikeys_perc[psort]

    for lt in lead_times:

        # Percentile forecasts for this interval at this lead time
        idata_perc = nbm_by_lead[lt].loc[:, perc_cols]
        nbm_perc[fi][lt] = idata_perc

***
***
# Create Bulk Temporal Stats Plots
#### Reliability diagrams, bias over time, rank over time, etc.