In [56]:
# % cd /content/drive/My\ Drive/NBM
import os
import csv
import urllib.request as req

import numpy as np
import pandas as pd
import xarray as xr

import bokeh as bo
import seaborn as sns
import matplotlib.pyplot as plt

from multiprocessing import Pool, cpu_count, get_context

from datetime import datetime, timedelta

In [79]:
def get_1d_csv(get_req):
    """Download one 1D-viewer CSV file and return its parsed data rows.

    Parameters
    ----------
    get_req : sequence of (date, init_hour, url)
        ``date`` must expose ``year``/``month``/``day``, ``init_hour`` is the
        integer hour of the model initialization and ``url`` is the address of
        the CSV file to fetch.

    Returns
    -------
    list of numpy.ndarray or None
        One object array per data row, laid out as
        ``[init_time, valid_time, field, field, ...]`` where ``init_time`` is
        built from ``date`` + ``init_hour`` and ``valid_time`` is the first CSV
        field parsed with ``'%Y%m%d%H'``. The remaining fields are left as the
        raw strings from the file. Rows whose first field does not parse as a
        timestamp (e.g. the trailing empty line) are skipped.
        ``None`` is returned when the file cannot be downloaded or decoded.
    """
    _date, _init_hour, _url = get_req

    try:
        # The context manager guarantees the connection is released.
        with req.urlopen(_url) as resp:
            response = resp.read().decode('utf-8')

    # Best-effort download: any ordinary failure means "skip this run", but
    # KeyboardInterrupt/SystemExit must still be able to stop a long loop.
    except Exception:
        print('\nNOT FOUND: %s %s'%(_date, _init_hour), end='\n')
        return None

    else:
        print('\r%s %s'%(_date, _init_hour), end='')

    init = datetime(_date.year, _date.month, _date.day, _init_hour, 0)

    lines = []
    # The first line is the column header; only the data rows are returned.
    for line in response.split('\n')[1:]:
        line = line.split(',')

        try:
            line[0] = datetime.strptime(line[0], '%Y%m%d%H')
        except ValueError:
            # Not a data row (blank line, footer, malformed timestamp)
            continue

        lines.append(np.append(init, line))

    return lines

In [2]:
# Station identifier requested from the NBM 1D viewer; a folder of the same
# name is created next to the notebook.
site = 'KSLC'
os.makedirs('./%s/' % site, exist_ok=True)

# First and last model run dates, stepped one day at a time
date0 = datetime(2020, 5, 1)
date1 = datetime(2020, 5, 31)
dates = pd.date_range(start=date0, end=date1, freq='1D')

# Model init hours to request
# (other options noted: [1, 7, 13, 19] or [4, 16])
init_hours = [13]

In [80]:
# Build one [date, init_hour, url] request per model run.
# For now pull from the csv generator
# Best to get API access or store locally later
base = 'https://hwp-viz.gsd.esrl.noaa.gov/wave1d/data/archive/'

url_list = []
for date in dates:
    for init_hour in init_hours:
        datestr = '{:04d}/{:02d}/{:02d}'.format(date.year, date.month, date.day)
        sitestr = '/NBM/{:02d}/{:s}.csv'.format(init_hour, site)
        url_list.append([date, init_hour, base + datestr + sitestr])

# Column names are read from the first archive file that can actually be
# fetched, so a missing first run does not abort the whole cell.
sample = None
for _, _, sample_url in url_list:
    try:
        with req.urlopen(sample_url) as resp:
            sample = resp.read().decode('utf-8')
    except OSError:
        # urllib's URLError/HTTPError are OSError subclasses: try the next run
        continue
    break

if sample is None:
    raise RuntimeError('No CSV file could be downloaded for site %s' % site)

header = np.append('InitTime', sample.split('\n')[0].split(','))

# Try multiprocessing this for speed?
data = [get_1d_csv(i) for i in url_list]

# Flatten to one row per (init, valid) pair *before* building the array so
# that runs with different numbers of rows can be combined.
rows = [row for run in data if run is not None for row in run]
if not rows:
    raise RuntimeError('No data rows were downloaded for site %s' % site)

data = np.vstack(rows)
# Empty CSV fields become NaN so the table can later be cast to float
data[data == ''] = np.nan

2020-05-10 00:00:00 13
NOT FOUND: 2020-05-11 00:00:00 13
2020-05-31 00:00:00 13

In [81]:
# Aggregate to a clean dataframe indexed by (InitTime, ValidTime)
data = pd.DataFrame(data, columns=header).set_index(
    ['InitTime', 'ValidTime']).sort_index()

# Drop the last two columns (misc metadata?) and cast the rest to float
data = data.iloc[:, :-2].astype(float)
header = data.columns

# variables = np.unique([k.split('_')[0] for k in header])
# levels = np.unique([k.split('_')[1] for k in header])

init = data.index.get_level_values(0)
valid = data.index.get_level_values(1)

# Lead time in whole hours, computed row-for-row from the index itself so it
# stays an integer column and needs no index alignment when inserted.
lead_hours = np.asarray((valid - init) // pd.Timedelta(hours=1), dtype=int)
lead = pd.DataFrame({'LeadTime': lead_hours}, index=data.index)

data.insert(0, 'LeadTime', lead_hours)

# Keep LeadTime plus every accumulated-precip (APCP) column except the
# 1-hourly ones, in sorted column-name order.
apcp_keys = sorted({k for k in data.columns if 'APCP' in k and '1hr' not in k})
klist = np.array(['LeadTime'] + apcp_keys)
data = data.loc[:, klist]

In [82]:
# Show the filtered table (LeadTime + APCP columns); the rendered output is
# truncated to the first/last rows and columns.
data

Unnamed: 0_level_0,Unnamed: 1_level_0,LeadTime,APCP12hr_surface,APCP12hr_surface_1% level,APCP12hr_surface_10% level,APCP12hr_surface_11% level,APCP12hr_surface_12% level,APCP12hr_surface_13% level,APCP12hr_surface_14% level,APCP12hr_surface_15% level,APCP12hr_surface_16% level,...,APCP6hr_surface_91% level,APCP6hr_surface_92% level,APCP6hr_surface_93% level,APCP6hr_surface_94% level,APCP6hr_surface_95% level,APCP6hr_surface_96% level,APCP6hr_surface_97% level,APCP6hr_surface_98% level,APCP6hr_surface_99% level,APCP6hr_surface_prob >0.254
InitTime,ValidTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2020-05-01 13:00:00,2020-05-01 14:00:00,1,,,,,,,,,,...,,,,,,,,,,
2020-05-01 13:00:00,2020-05-01 15:00:00,2,,,,,,,,,,...,,,,,,,,,,
2020-05-01 13:00:00,2020-05-01 16:00:00,3,,,,,,,,,,...,,,,,,,,,,
2020-05-01 13:00:00,2020-05-01 17:00:00,4,,,,,,,,,,...,,,,,,,,,,
2020-05-01 13:00:00,2020-05-01 18:00:00,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.088,1.297,1.558,1.785,2.055,2.410,2.840,3.580,4.441,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-05-31 13:00:00,2020-06-10 12:00:00,239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.516,1.505,3.0
2020-05-31 13:00:00,2020-06-10 18:00:00,245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000,0.000,0.000,0.000,0.000,0.000,0.451,1.413,3.569,5.0
2020-05-31 13:00:00,2020-06-11 00:00:00,251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000,0.000,0.000,0.000,0.028,0.191,0.574,1.350,2.503,5.0
2020-05-31 13:00:00,2020-06-11 06:00:00,257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.395,1.535,3.0
