In [1]:
import logging
import multiprocessing as mp
import warnings
from glob import glob

import cfgrib
import numpy as np
import xarray as xr

# Silence every warning for the rest of the session. No category is given, so
# this also hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Root of the archive that is searched for GRIB2 files. The trailing slash is
# required: later code appends the glob pattern directly to this string.
archive_dir = '/uufs/chpc.utah.edu/common/home/steenburgh-group10/mewessler/archive/'

In [3]:
# Each row pairs a cfgrib ``filter_by_keys`` selection with the variables to
# keep from the dataset that selection yields. The rows are unzipped into the
# two parallel lists that open_grib walks with zip().
_grib_selections = (
    ({'typeOfLevel': 'isobaricInhPa'}, ['gh', 't', 'r', 'u', 'v']),
    ({'typeOfLevel': 'unknown', 'stepType': 'instant'}, ['pwat']),
    ({'typeOfLevel': 'surface'}, ['orog', 'tp']),
)

variable_filters = [selection for selection, _ in _grib_selections]
filter_keys = [kept for _, kept in _grib_selections]

def open_grib(grib_file):
    """Load the selected fields of one GRIB2 file into a single dataset.

    The file is opened once per entry of ``variable_filters`` (passed to cfgrib
    as ``filter_by_keys``), and each result is reduced to the variables named
    in the matching entry of ``filter_keys``. The pieces are then merged, the
    ``time``, ``level``, ``step`` and ``surface`` variables are dropped, and
    ``isobaricInhPa`` / ``valid_time`` are renamed to ``level`` / ``time``.

    This is best-effort: a selection that cannot be read is skipped, so the
    returned dataset may hold only some of the requested variables.

    Parameters
    ----------
    grib_file : str
        Path of the GRIB2 file to read.

    Returns
    -------
    xarray.Dataset or None
        The merged dataset, or ``None`` if no selection could be read or the
        merge / drop / rename step failed. Callers must filter out ``None``.
    """
    log = logging.getLogger(__name__)
    grib_data = []

    for variable_filter, keep_keys in zip(variable_filters, filter_keys):
        try:
            subset = xr.open_dataset(
                grib_file,
                engine='cfgrib',
                backend_kwargs={'errors': 'ignore',
                                'filter_by_keys': variable_filter},
            )[keep_keys]
        except Exception as err:
            # Catch Exception rather than everything, so KeyboardInterrupt and
            # SystemExit still stop a worker process. The reason is logged
            # because warnings are silenced globally in this notebook.
            log.warning('%s: skipping selection %s (%s: %s)',
                        grib_file, variable_filter, type(err).__name__, err)
            continue

        grib_data.append(subset)

    # Nothing readable in this file; there is nothing to merge.
    if not grib_data:
        log.warning('%s: no requested variables could be read', grib_file)
        return None

    try:
        merged = xr.merge(grib_data)

        # drop_vars raises if any name is absent, so a file lacking one of
        # these coordinates is rejected as a whole, exactly as before.
        return (
            merged
            .drop_vars(['time', 'level', 'step', 'surface'])
            .rename({'isobaricInhPa': 'level', 'valid_time': 'time'})
        )
    except Exception as err:
        log.warning('%s: could not merge selections (%s: %s)',
                    grib_file, type(err).__name__, err)
        return None

In [None]:
# Initialisation hours and forecast lead hours to collect, written as plain
# integers (previously generated as floats with np.arange and a 12.1 end stop).
init_hours = (0, 12)
forecast_hours = (3, 6, 9, 12)

matched_files = []

for date_dir in sorted(glob(archive_dir + '*')):

    # The last path component of each archive entry is reused inside the
    # file-name pattern (presumably a date stamp - confirm against the archive).
    date = date_dir.split('/')[-1]

    for init_hour in init_hours:

        for fhr in forecast_hours:

            pattern = date_dir + '/models/gfs0p25/*%s%02d*f%03d*.WE.grib2'%(
                date, init_hour, fhr)

            # glob() returns an empty list when nothing matches, so no
            # try/except is needed. Sorting makes the order reproducible.
            matched_files.extend(sorted(glob(pattern)))

# Build the array from one flat list. np.hstack on a list of lists raised when
# no directory was found, and turned every empty match into a float64 array
# that had to be promoted against the string paths.
init_hour_files = np.array(matched_files, dtype=str)

In [None]:
# Display how many GRIB2 file paths were collected.
len(init_hour_files)

In [None]:
# Upper bound on worker processes. The pool never starts more workers than
# there are files to read, and always at least one because Pool(0) raises.
max_workers = 128
n_workers = max(1, min(max_workers, len(init_hour_files)))

# 'fork' lets the workers inherit open_grib and the filter lists defined in
# this notebook session.
with mp.get_context('fork').Pool(n_workers) as p:
    # One result per input path, in input order; entries are None for files
    # that open_grib could not read.
    returned_grib_data = p.map(open_grib, init_hour_files, chunksize=None)

    # Let the workers exit cleanly before the context manager tears the pool down.
    p.close()
    p.join()

In [None]:
# Discard the None placeholders that open_grib returns for unreadable files.
returned_grib_data = list(
    filter(lambda dataset: dataset is not None, returned_grib_data)
)

In [None]:
# Stack the per-file datasets along 'time', the coordinate that open_grib
# renamed from 'valid_time'. The order follows the order of the input list.
data = xr.concat(returned_grib_data, dim='time')

In [None]:
# Write the combined dataset to a single NetCDF file. The next cell reads
# this same path back.
data.to_netcdf('/uufs/chpc.utah.edu/common/home/steenburgh-group10/mewessler/gfs_aggregate.nc')

In [5]:
# Load the aggregate from disk, replacing any in-memory `data` built above.
# NOTE(review): presumably this lets the notebook resume from the saved file
# without re-running the extraction cells - confirm.
data = xr.open_dataset('/uufs/chpc.utah.edu/common/home/steenburgh-group10/mewessler/gfs_aggregate.nc')
# A bare last expression displays the dataset summary as the cell output.
data