<a href="https://colab.research.google.com/github/m-wessler/nbm-verification/blob/main/get_nbm_aws_streamline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
!pip install boto3
!pip install pygrib

import os, pathlib
import boto3, pygrib

import numpy as np
import pandas as pd

from botocore import UNSIGNED
from botocore.client import Config
from multiprocessing import Pool, cpu_count

# Globals

In [None]:
# Notebook-wide configuration

# Name of the AWS S3 bucket the NBM grib2 data is requested from
aws_bucket = 'noaa-nbm-grib2-pds'

# Local destination for grib files (subdirs can be added in local) (not used)
# output_dir = './'

# Grib variable names associated with each element
element_var = dict(
    qpf=['APCP'],
    maxt=['TMP', 'TMAX', 'APTMP'],
    mint=['TMP', 'TMIN', 'APTMP'],
)

# Grib level associated with each element
element_lev = dict(
    qpf='surface',
    maxt='2 m above ground',
    mint='2 m above ground',
)

# A grib index entry containing any of these substrings is skipped
excludes = [
    'ens std dev',
    '% lev',
]

# Methods

In [None]:
def mkdir_p(path):
    """Create ``path`` together with any missing parents and return ``path``.

    An already-existing directory is not treated as an error.
    """
    target = pathlib.Path(path)
    target.mkdir(parents=True, exist_ok=True)
    return path

In [None]:
def cwa_list(input_region):
    """Return the three-letter CWA identifiers belonging to a region.

    Parameters
    ----------
    input_region : str
        One of the region codes ``"WR"``, ``"CR"``, ``"ER"``, ``"SR"``, or
        ``"CONUS"`` for all regions combined. Matching is case-insensitive.

    Returns
    -------
    list of str or numpy.ndarray
        The list for a single region, or a flat ``numpy`` array containing
        every region's identifiers (in WR, CR, ER, SR order) for ``"CONUS"``.

    Raises
    ------
    KeyError
        If ``input_region`` is not one of the codes above.
    """
    region_dict = {
        "WR": ["BYZ", "BOI", "LKN", "EKA", "FGZ", "GGW", "TFX", "VEF", "LOX", "MFR",
               "MSO", "PDT", "PSR", "PIH", "PQR", "REV", "STO", "SLC", "SGX", "MTR",
               "HNX", "SEW", "OTX", "TWC"],

        "CR": ["ABR", "BIS", "CYS", "LOT", "DVN", "BOU", "DMX", "DTX", "DDC", "DLH",
               "FGF", "GLD", "GJT", "GRR", "GRB", "GID", "IND", "JKL", "EAX", "ARX",
               "ILX", "LMK", "MQT", "MKX", "MPX", "LBF", "APX", "IWX", "OAX", "PAH",
               "PUB", "UNR", "RIW", "FSD", "SGF", "LSX", "TOP", "ICT"],

        "ER": ["ALY", "LWX", "BGM", "BOX", "BUF", "BTV", "CAR", "CTP", "RLX", "CHS",
               "ILN", "CLE", "CAE", "GSP", "MHX", "OKX", "PHI", "PBZ", "GYX", "RAH",
               "RNK", "AKQ", "ILM"],

        # "MRX" used to be listed twice here; each identifier now appears once
        "SR": ["ABQ", "AMA", "FFC", "EWX", "BMX", "BRO", "CRP", "EPZ", "FWD", "HGX",
               "HUN", "JAN", "JAX", "KEY", "MRX", "LCH", "LZK", "LUB", "MLB", "MEG",
               "MAF", "MFL", "MOB", "OHX", "LIX", "OUN", "SJT", "SHV", "TAE", "TBW",
               "TSA"]}

    region_key = input_region.upper()

    if region_key == "CONUS":
        return np.hstack(list(region_dict.values()))

    try:
        return region_dict[region_key]
    except KeyError:
        valid = ", ".join([*region_dict, "CONUS"])
        raise KeyError(
            f"Unknown region {input_region!r}; expected one of: {valid}") from None

In [None]:
def _parse_grib_index(index_text):
    """Parse the text of a grib ``.idx`` file into a DataFrame indexed by message number."""
    cols = ['num', 'byte', 'date', 'var', 'level',
            'forecast', 'fthresh', 'ftype', '']

    # Lines with an empty first field (e.g. after the trailing newline)
    # describe no message
    rows = [fields for fields in
            (line.split(':') for line in index_text.split('\n'))
            if fields[0] != '']

    # Index files differ in how many ':'-separated fields a line carries, so
    # name only as many columns as the widest line provides
    width = max((len(fields) for fields in rows), default=len(cols))
    index_data = pd.DataFrame(rows, columns=cols[:width])

    index_data['num'] = index_data['num'].astype(int)
    return index_data.set_index('num')


def _message_byte_ranges(byte_starts):
    """Turn consecutive message start offsets into HTTP ``Range`` header values."""
    starts = [int(start) for start in byte_starts]
    ranges = []
    for pos, start in enumerate(starts):
        if pos + 1 < len(starts):
            # HTTP byte ranges are inclusive: stop one byte before the next message
            ranges.append(f'bytes={start}-{starts[pos + 1] - 1}')
        else:
            # Last message in the file: an open-ended range reads through EOF
            ranges.append(f'bytes={start}-')
    return ranges


def fetch_grib_from_AWS(**req):
    """Download selected grib messages from the NBM S3 bucket into one local file.

    For every entry of ``req['nbm_set']`` the matching ``.idx`` file is read,
    the messages whose variable is in ``req['var']`` and whose level equals
    ``req['level']`` are selected (skipping entries that contain any of the
    module-level ``excludes`` substrings), and each one is fetched with a
    byte-range request and appended to the output file.

    Parameters (passed as keywords)
    -------------------------------
    yyyymmdd : str
        Init date used in the bucket path and the local file name.
    hh : int
        Init hour.
    fhr : int
        Forecast hour.
    nbm_set : iterable of str
        Bucket sub-directories / file sets to search (e.g. ``['core', 'qmd']``).
    nbm_area : str
        Area code used in the grib file name.
    var : iterable of str
        Grib variable names to extract.
    level : str
        Grib level string to match.
    element : str, optional
        Element name used in the local file name. Falls back to the
        module-level ``element`` when omitted.

    Returns
    -------
    str
        Path of the local grib file. If it already exists it is returned
        without downloading anything. If no message matched, the path is
        returned even though no file was written.
    """
    # Prefer the element supplied with the request; the global is only a
    # fallback for callers that do not pass one
    element_name = req['element'] if 'element' in req else element

    output_dir = mkdir_p('./nbm/')
    output_file = output_dir + (
        f'{req["yyyymmdd"]}.t{req["hh"]:02d}z.'
        f'fhr{req["fhr"]:03d}.{element_name}.grib2')

    if os.path.isfile(output_file):
        return output_file

    # Download into a side file so an interrupted run cannot leave a truncated
    # grib behind that the existence check above would later treat as complete
    partial_file = output_file + '.part'
    if os.path.isfile(partial_file):
        os.remove(partial_file)

    client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    try:
        for nbm_set in req['nbm_set']:

            bucket_dir = f'blend.{req["yyyymmdd"]}/{req["hh"]:02d}/{nbm_set}/'

            grib_file = (f'{bucket_dir}blend.t{req["hh"]:02d}z.'
                         f'{nbm_set}.f{req["fhr"]:03d}.{req["nbm_area"]}.grib2')

            index_file = f'{grib_file}.idx'

            print(index_file)

            index_text = client.get_object(
                Bucket=aws_bucket, Key=index_file)['Body'].read().decode()

            index_data = _parse_grib_index(index_text)
            byte_ranges = _message_byte_ranges(index_data['byte'])

            # Flag every index entry that mentions an excluded substring in
            # any of its fields (plain substring match, not a regex)
            excluded = np.zeros(len(index_data), dtype=bool)
            for col in index_data.columns:
                for ex in excludes:
                    excluded |= index_data[col].str.contains(
                        ex, na=False, regex=False).to_numpy(dtype=bool)

            for req_var in req['var']:

                print(req_var)

                wanted = ((index_data['var'] == req_var) &
                          (index_data['level'] == req['level'])).to_numpy()
                wanted &= ~excluded

                # Fetch the data by byte range, write from stream
                for byte_range, keep in zip(byte_ranges, wanted):
                    if not keep:
                        continue

                    output_bytes = client.get_object(
                        Bucket=aws_bucket, Key=grib_file, Range=byte_range)

                    with open(partial_file, 'ab') as wfp:
                        for chunk in output_bytes['Body'].iter_chunks(chunk_size=4096):
                            wfp.write(chunk)
    finally:
        # Release the client on both success and failure
        client.close()

    # Publish the finished download under its final name
    if os.path.isfile(partial_file):
        os.replace(partial_file, output_file)

    return output_file

# User Input/Multiprocessing Inputs

In [None]:
# Element to retrieve; used below as the key into element_var / element_lev
element = 'qpf' # input('Desired element? (QPF/MaxT/MinT)').lower()

# Main/Multiprocessing Call

In [None]:
# Keyword arguments handed to fetch_grib_from_AWS
request_args = dict(
    yyyymmdd='20231010',  # input('Desired init date (YYYYMMDD)? '),
    hh=12,  # int(input('Desired init hour int(HH)? ')),
    fhr=18,  # int(input('Desired forecast hour/lead time int(HHH)?')),
    # 'qmd' is searched in addition to 'core' only for qpf
    nbm_set=['core'] + (['qmd'] if element == 'qpf' else []),
    nbm_area='co',
    element=element,
    var=element_var[element],
    level=element_lev[element],
)

In [None]:
# Do the job: run the fetch with the request built above and keep the
# returned local grib file path
grib_output_file = fetch_grib_from_AWS(**request_args)

In [None]:
# Check the output: print every message in the downloaded grib file
grib_messages = pygrib.open(grib_output_file)
try:
    for item in grib_messages.read():
        print(item)
finally:
    # Close the grib file handle even if reading or printing fails
    grib_messages.close()