In [None]:
import shutil
import shlex
import pygrib
import subprocess
import sys, os, gc
import numpy as np
import xarray as xr
import pandas as pd
import multiprocessing as mp

from glob import glob
from datetime import datetime, timedelta

import warnings
warnings.filterwarnings('ignore')

# Pool size for the multiprocessing extraction run (that cell is currently
# commented out further down in this notebook).
core_limit = 60

# Percentile levels to retain from NBM percentile messages; percentile
# extraction itself is not implemented yet.
keep_percentiles = [
    1, 5, 10, 25,
    50,
    75, 90, 95, 99,
]

In [None]:
# nbm_dir: searched for the raw '*.grib2' files.
# out_dir: where the extracted NetCDF files are written; created if missing.
# Both keep their trailing '/' because later cells build paths by plain
# string concatenation.
nbm_dir, out_dir = (
    '/scratch/general/lustre/u1070830/nbm/',
    '/scratch/general/lustre/u1070830/nbm_new/',
)
os.makedirs(out_dir, exist_ok=True)

In [None]:
# Every GRIB2 file one directory level below nbm_dir, sorted by path, leaving
# out any path that contains 'extract'.
grib_candidates = glob(nbm_dir + '*/*.grib2')
nbm_raw = np.array(sorted(path for path in grib_candidates if 'extract' not in path))

In [None]:
# Forecast hour to extract and the length (hours) of the step window that
# extract_fhr_data compares against endStep - startStep.
fhr = 48
interval = 24

# Keep only the raw files whose path carries the matching 'fNNN' tag.
nbm_raw_fhr = np.array([path for path in nbm_raw if ('f%03d' % fhr) in path])
print('%d files to extract'%len(nbm_raw_fhr))

In [None]:
# TEST ONLY (remove later): uncomment to limit the run to the first 50 files from the 00Z/12Z cycles
# nbm_raw_fhr = [f for f in nbm_raw_fhr if (('t00z' in f) | ('t12z' in f))][:50]

In [None]:
def extract_fhr_data(f):
    """Extract the probability fields of one NBM GRIB2 file and save them as NetCDF.

    Every message in ``f`` is read. A message is kept when

    * its ``endStep`` equals the module-level ``fhr``,
    * its window ``endStep - startStep`` equals the module-level ``interval``
      (hours),
    * its string representation contains "probability", and
    * its ``upperLimit`` divided by 25.4 and rounded to 2 decimals is <= 4.0
      (presumably a mm -> inch conversion -- TODO confirm units).

    The kept fields are concatenated along a new ``threshold`` dimension and
    written to ``out_dir`` as ``blend.YYYYMMDD.tHHz.qmd.fNNN.WR.nc``, where the
    date/hour come from the message initialization time and ``NNN`` is ``fhr``.

    Parameters
    ----------
    f : str
        Path of the GRIB2 file to read.

    Returns
    -------
    None
        The result is written to disk. If no message qualifies, a notice is
        printed and nothing is written.

    Notes
    -----
    Reads the module-level names ``fhr``, ``interval`` and ``out_dir``.
    Percentile messages (``percentileValue`` / ``keep_percentiles``) are not
    extracted yet.
    """
    # Snapshot the module-level settings once so the whole file is processed
    # with a single, consistent forecast hour and window.
    target_fhr = fhr
    window = timedelta(hours=interval)

    fields = []
    init = None

    with pygrib.open(f) as grb:

        print(f)

        for msg in grb.read():

            lead = msg.endStep
            step = timedelta(hours=lead - msg.startStep)

            if lead != target_fhr or step != window:
                continue

            # TODO: percentile messages are skipped for now; only probability
            # messages are extracted.
            if 'probability' not in str(msg).lower():
                continue

            init = datetime.strptime(str(msg.dataDate) + '%04d'%msg.dataTime, '%Y%m%d%H%M')
            valid = datetime.strptime(str(msg.validityDate) + '%04d'%msg.validityTime, '%Y%m%d%H%M')

            threshold = msg.upperLimit
            threshold_in = round(threshold/25.4, 2)

            if threshold_in > 4.0:
                continue

            # Only build the coordinate grids for messages that are kept.
            lats, lons = msg.latlons()

            idata = xr.DataArray([msg.data()[0].astype(np.float32)], name='probx',
                                 dims=('valid', 'y', 'x'),
                                 coords={'valid': [valid],
                                         'lat': (('y', 'x'), lats),
                                         'lon': (('y', 'x'), lons)})
            idata['init'] = init
            idata['interval'] = interval
            idata['step'] = step
            idata['fhr'] = lead

            idata['threshold'] = threshold
            idata['threshold_in'] = threshold_in

            fields.append(idata)

    if not fields:
        # Previously this case crashed in xr.concat (and `init` was unbound);
        # report it and move on so one bad file does not abort a batch run.
        print('%s: no matching probability messages for f%03d, skipped'%(f, target_fhr))
        return None

    data = xr.concat(fields, dim='threshold')
    print(data)

    # Name the file after the forecast hour that was actually extracted, not
    # after whatever message happened to be read last.
    out_file = 'blend.%s.t%02dz.qmd.f%03d.WR.nc'%(init.strftime('%Y%m%d'), init.hour, target_fhr)

    data.to_netcdf(out_dir + out_file, unlimited_dims='valid')
    print(out_file, 'saved')

    # Release the arrays promptly; this function is meant to be run for many
    # files (optionally in a process pool).
    del data
    gc.collect()

    return None

# extract_fhr_data(nbm_raw_fhr[1])

In [None]:
# print('Extracting NBM data for FHR%03d using %d processes'%(fhr, core_limit))

# with mp.get_context('fork').Pool(core_limit) as p:
#     p.map(extract_fhr_data, nbm_raw_fhr, chunksize=1)

In [None]:
# Combine the per-initialization NetCDF files in out_dir into one file per
# month and forecast hour, written to out_dir + 'agg/'.
agg_dir = out_dir + 'agg/'
os.makedirs(agg_dir, exist_ok=True)  # to_netcdf does not create missing directories

# A dedicated loop name keeps the module-level `fhr` (read by
# extract_fhr_data) from being overwritten by this cell.
for agg_fhr in [24, 48]:

    indiv_flist = sorted(glob(out_dir + '*f%03d*.nc'%agg_fhr))

    # YYYYMM parsed from the date field of 'blend.YYYYMMDD.tHHz...' file names.
    months = np.array([os.path.basename(f).split('.')[1][:6] for f in indiv_flist]).astype(int)

    for month in np.unique(months):

        # Select by the parsed month instead of globbing again, so a file can
        # only land in the month its own date field names.
        month_flist = [f for f, m in zip(indiv_flist, months) if m == month]

        agg_file = 'blend.%d.qmd.f%03d.WR.nc'%(month, agg_fhr)

        # The context manager closes the underlying files once the monthly
        # aggregate has been written.
        with xr.open_mfdataset(
            month_flist,
            combine='nested', concat_dim='valid'
        ) as month_data:
            month_data.to_netcdf(agg_dir + agg_file)

        print('SAVED: ' + agg_file)

In [None]:
# TODO: determine the month of the last existing file; that is the month to append to.
# If the data have moved on to the next month, assume the previous month is complete (a completeness check could be added).