# By-Band _g_-Point Reduction

# Dependencies

`numpy` is installed in the Python environment at NERSC (`module load python`), but `xarray` is not, so users must install it themselves (e.g., `pip install xarray`). `PIPPATH` is the assumed installation location. This notebook depends heavily on `xarray`.

In [1]:
import os, sys, shutil

# "standard" install
import numpy as np

# site-packages directory underneath the user's home directory; it is 
# added to the module search path along with the `common` directory
PIPPATH = '/'.join([
    os.path.expanduser('~'), '.local', 'cori', '3.7-anaconda-2019.10', 
    'lib', 'python3.7', 'site-packages'])
PATHS = ['common', PIPPATH]
sys.path.extend(PATHS)

# user must do `pip install xarray` on cori (or other NERSC machines)
import xarray as XA

# common submodule
import utils


# Function and Class Definitions

This cell will eventually go into its own module and be imported in the previous cell, but first I want to finish developing and testing it.

In [2]:
def pathCheck(path, mkdir=False):
    """
    Verify that `path` is present on the file system

    Inputs
        path -- string, file or directory to look for

    Keywords
        mkdir -- boolean; when set, a missing `path` is created as a 
          directory tree (like `mkdir -p`) rather than triggering the 
          assertion

    Raises
        AssertionError if `path` does not exist and `mkdir` is not set
    """

    found = os.path.exists(path)

    if not mkdir:
        # strict mode: an absent path is an error
        assert found, 'Could not find {}'.format(path)
        return
    # endif strict

    # lenient mode: build whatever part of the tree is missing
    if not found: os.makedirs(path)
# end pathCheck

def kDistBandSplit(kFileNC, outDir='band_k_dist'):
    """
    Split a full k-distribution into separate files for each band

    Inputs
        kFileNC -- string, netCDF file with the full k-distribution; the 
          code reads its `bnd` and `gpt` dimensions and its 
          `bnd_limits_gpt` variable (unit-offset first and last g-point 
          of every band)

    Output
        bandFiles -- list of strings, paths to the by-band netCDF files 
          in the order in which the bands were processed

    Keywords
        outDir -- string, directory into which the by-band files are 
          written (created if it does not exist)
    """

    pathCheck(outDir, mkdir=True)

    # g-point weights that are written to every by-band file
    # NOTE(review): 16 weights are hard-coded, so every band is assumed 
    # to hold 16 g-points -- confirm before using another k-distribution
    weights = [
        0.1527534276, 0.1491729617, 0.1420961469, 0.1316886544, 
        0.1181945205, 0.1019300893, 0.0832767040, 0.0626720116, 
        0.0424925000, 0.0046269894, 0.0038279891, 0.0030260086, 
        0.0022199750, 0.0014140010, 0.0005330000, 0.0000750000
    ]

    # `dims` is a sequence of dimension names; a dictionary only worked 
    # by accident because iterating over it yields its keys
    xaWeights = XA.DataArray(weights, dims=('gpt',), name='gpt_weights')

    bandFiles = []
    with XA.open_dataset(kFileNC) as kAllObj:
        gLims = kAllObj.bnd_limits_gpt
        ncVars = list(kAllObj.keys())
        dimStr = 'gpt'

        for iBand in kAllObj.bnd.values:
            # make a separate netCDF for each band
            outNC = '{}/coefficients_lw_band{:02d}.nc'.format(outDir, iBand+1)

            # g-point limits of this band converted to zero-offset; they 
            # are the same for every variable, so compute them only once
            i1, i2 = gLims[iBand].values-1
            gSlice = slice(i1, i2+1)

            # Dataset that will be written to netCDF with new variables 
            # and unedited global attributes
            outDS = XA.Dataset(attrs=kAllObj.attrs)

            for ncVar in ncVars:
                ncDat = kAllObj[ncVar]

                # only variables with a g-point dimension are subset; 
                # everything else is copied in full
                if dimStr in ncDat.dims: ncDat = ncDat.isel(gpt=gSlice)

                # write variable to output dataset
                outDS[ncVar] = XA.DataArray(ncDat)
            # end ncVar loop

            # write weights to output file
            outDS['gpt_weights'] = xaWeights

            outDS.to_netcdf(outNC, mode='w')
            bandFiles.append(outNC)
        # end band loop
    # endwith

    return bandFiles
# end kDistBandSplit()

def costFuncComp(tst_file, ref_file, levs=[0, 10000, 102000], iRecord=0, 
                 ncVars=['net_flux', 'heating_rate', 'band_flux_net']):
    """
    Calculate the components of a flexible cost function: the mean 
    squared RRTMGP-LBLRTM error of any number of allowed parameters 
    (usually just flux or HR) over columns and pressure levels. The 
    square root is NOT taken here.

    Inputs
        tst_file -- string, RRTMGP (test model) netCDF file with fluxes
        ref_file -- string, LBLRTM (reference model) netCDF file with fluxes

    Output
        outParams -- list of cost function arrays (squared test-ref 
          differences averaged over columns and the pressure dimension); 
          1 element per input variable (ncVars)

    Keywords
        levs -- list of floats; pressure levels of interest in Pa
        iRecord -- int; index for forcing scenario (default 0 is no forcing)
        ncVars -- list of strings; netCDF variable names of the arrays to 
          include in the cost function
    """

    # NOTE: the list defaults are only read, never modified

    outParams = []

    # the module is imported as `XA` in this file (`xr` is not defined)
    with XA.open_dataset(tst_file) as tst, XA.open_dataset(ref_file) as ref:
        # Compute differences in all variables in datasets at levels 
        # closest to user-provided pressure levels
        # TODO: confirm this is doing what we expect it to -- only the 
        # `lev` dimension is subset here, `lay` is left untouched
        subsetErr = (tst-ref).sel(lev=levs, method='nearest')

        for ncVar in ncVars:
            # pressure dimension will depend on parameter
            # layer for HR, level for everything else
            pStr = 'lay' if 'heating_rate' in ncVar else 'lev'

            # look the variable up by key so that a name can never be 
            # mistaken for an attribute or method of the Dataset
            ncParam = subsetErr[ncVar]

            # squared test-ref difference averaged over all columns and 
            # the pressure dimension for the given forcing scenario
            outParams.append(
                (ncParam.isel(record=iRecord)**2).mean(dim=('col', pStr)))
        # end ncVar loop
    # endwith

    return outParams
# end costFuncComp
    
def normCost(tst_file, ref_file, norm, 
             ncVars=['net_flux', 'heating_rate', 'band_flux_net'], 
             levs=[0, 10000, 102000], iRecord=0):
    """
    Returns the summary terms in the cost function
      Each element in each term is normalized (normally by the error at 
      iteration 0)

    Inputs
        tst_file -- string, RRTMGP (test model) netCDF file with fluxes
        ref_file -- string, LBLRTM (reference model) netCDF file with fluxes
        norm -- list with the error of each cost function component that 
          is used for the normalization; 1 element per entry in `ncVars`, 
          in the same order

    Output
        list with 1 scalar per cost function component: the square root 
        of the mean of the component (as returned by costFuncComp()) 
        divided by the corresponding `norm` element

    Keywords
        ncVars -- list of strings; netCDF variable names of the arrays to 
          include in the cost function
        levs -- list of floats; pressure levels of interest in Pa
        iRecord -- int; index for whatever the 'record' dimension is in 
          the input netCDF files (default 0)
    """

    tst_cost = costFuncComp(
        tst_file, ref_file, levs=levs, iRecord=iRecord, ncVars=ncVars)

    # Each scalar term in the cost function is the RMS across the
    #   normalized error in each component. costFuncComp() returns
    #   the squared error, so the square root is taken here.
    # zip() stops at the shorter of the two lists
    return [np.sqrt((c/n).mean()) for (c, n) in zip(tst_cost, norm)]
# end normCost

def recordDimRename(inNC, outNC):
    """
    Rename "record" dimension in given netCDF file to "forcing"

    Inputs
        inNC -- string, netCDF file that is read
        outNC -- string, netCDF file that is written with the renamed 
          dimension; variables without a "record" dimension and the 
          global attributes are carried over unedited
    """

    # the module is imported as `XA` in this file (`xa` is not defined)
    with XA.open_dataset(inNC) as inObj:
        # new dataset starts off with the unedited global attributes
        outDS = XA.Dataset(attrs=inObj.attrs)

        for ncVar in list(inObj.keys()):
            ncDat = inObj[ncVar]

            # relabel the dimension where it is present; any variables 
            # without a record dimension are retained as they are
            if 'record' in ncDat.dims:
                ncDat = ncDat.rename({'record': 'forcing'})
            # endif record

            outDS[ncVar] = XA.DataArray(ncDat)
        # end ncVar loop
    # endwith

    # NOTE(review): as before, the file is written after the input has 
    # been closed -- presumably the DataArray copies keep the data 
    # available; confirm
    outDS.to_netcdf(outNC, mode='w')
    print('Completed {}'.format(outNC))
# end recordDimRename()

class kDistOptBand:
    def __init__(self, inFile, band, lw, idxForce, iCombine):
        """
        - For a given band, loop over possible g-point combinations within 
            each band, creating k-distribution and band-wise flux files for 
            each possible combination
        - Run a RRTMGP executable that performs computations for a single band
        - Compute broadband fluxes and heating rates
        - Compute cost function from broadband parameters and determine 
            optimal combination of g-points

        Only the g-point combination is implemented so far; the model 
        run and the cost function optimization are placeholders.

        Input
          inFile -- string, netCDF created with kDistBandSplit() method
          band -- int, band number that is being processed with object
          lw -- boolean, do longwave domain (otherwise shortwave)
          idxForce -- int, index of forcing scenario
          iCombine -- int, index for what iteration of g-point combining is 
              underway

        Side effects
          creates `workdir_band_<band>` and `band_<band>_opt` underneath 
          the current working directory if they do not exist
        """

        # see constructor doc
        self.inNC = str(inFile)
        self.iBand = int(band)
        self.doLW = bool(lw)
        self.domainStr = 'LW' if lw else 'SW'
        self.iForce = int(idxForce)
        self.iCombine = int(iCombine)

        # directory where model will be run for each g-point 
        # combination
        self.workDir = '{}/workdir_band_{}'.format(os.getcwd(), self.iBand)
        pathCheck(self.workDir, mkdir=True)

        # directory to store optimal netCDFs for each iteration and band
        self.optDir = '{}/band_{}_opt'.format(os.getcwd(), self.iBand)
        pathCheck(self.optDir, mkdir=True)

        # metadata for keeping track of how g-points were 
        # combined; keys are 'iterNN', values are the `g_combine` 
        # attribute of the optimal trial file
        self.gCombine = {}

        # what netCDF variables have a g-point dimension and will thus 
        # need to be modified in the combination iterations?
        self.gptVars = ['kmajor', 'gpt_weights']
        if self.doLW:
            self.gptVars.append('plank_fraction')
        else:
            self.gptVars += ['rayl_lower', 'rayl_upper', 
                            'solar_source_facular' , 
                            'solar_source_sunspot', 'solar_source_quiet']
        # endif doLW

        # ATTRIBUTES THAT WILL GET RE-ASSIGNED IN CLASS

        # list of netCDFs for each g-point combination in a given band 
        # and combination iteration
        self.trialNC = []

        # the trialNC that optimizes cost function for given comb iter
        # starts off as input file
        self.optNC = str(self.inNC)

        # the number of g-points in a given comb iter
        self.nGpt = 16

        # original g-point IDs for a given band
        # TO DO: have not started trying to preserve these guys
        self.gOrigID = range(1, self.nGpt+1)
    # end constructor

    def gPointCombine(self):
        """
        Combine each g-point in a given band with its adjacent g-point 
        and write one trial k-distribution netCDF per combination into 
        self.workDir

        For every pair of neighbors (g1, g2 = g1+1; zero-offset 
        positions along the `gpt` dimension), the variables listed in 
        self.gptVars are merged into the g1 slot -- weights are summed, 
        everything else is a weight-averaged -- and the g2 slot is 
        removed. All other variables are copied unaltered.

        Side effects
            self.nGpt -- set to the number of g-points in self.inNC
            self.trialNC -- replaced with the list of trial files that 
              were written (empty if there is only 1 g-point)

        TO DO: will probably have to modify other variables in 
        self.inNC like Ben does in combine_gpoints_fn.py
        """

        # start from scratch so that another call on the same object 
        # does not accumulate the trial files of an earlier call
        self.trialNC = []

        with XA.open_dataset(self.inNC) as kDS:
            weights = kDS.gpt_weights
            ncVars = list(kDS.keys())

            self.nGpt = kDS.dims['gpt']

            # zero-offset positions of all g-points in this file
            gIdx = np.arange(self.nGpt)

            # combine all nearest neighbor g-point indices 
            # and associated weights for given band
            for g1 in range(self.nGpt-1):
                g2 = g1 + 1
                w1, w2 = weights.isel(gpt=g1), weights.isel(gpt=g2)

                # a k-distribution netCDF for each combination
                outNC = '{}/coefficients_{}_g{:02d}-{:02d}_iter{:02d}.nc'
                outNC = outNC.format(
                    self.workDir, self.domainStr, g1, g2, self.iCombine)
                self.trialNC.append(outNC)

                # g1 and g2 are treated as positions everywhere (mask, 
                # slices, removal) rather than mixing positions and labels
                isG1 = XA.DataArray(gIdx == g1, dims=('gpt',))
                gKeep = gIdx[gIdx != g2]

                outDS = XA.Dataset()

                # each trial netCDF has its own set of g-points 
                # that we will save for metadata purposes -- 
                # the combination that optimizes the cost function
                # will have its `g_combine` attribute perpetuated
                outDS.attrs['g_combine'] = '{}+{}'.format(g1, g2)

                for ncVar in ncVars:
                    ncDat = kDS[ncVar]

                    if ncVar in self.gptVars:
                        if ncVar == 'gpt_weights':
                            # replace g1' weight with integrated weight at 
                            # g1 and g2
                            merged = XA.where(isG1, w1 + w2, ncDat)
                        else:
                            # replace g1' slice with weighted average of 
                            # g1 and g2; TO DO: make sure this is how 
                            # other params in addition to k are treated
                            kg1 = ncDat.isel(gpt=g1)
                            kg2 = ncDat.isel(gpt=g2)
                            merged = XA.where(
                                isG1, (kg1*w1 + kg2*w2) / (w1 + w2), ncDat)
                        # endif ncVar

                        # the mask is the first argument of XA.where, so 
                        # broadcasting can move `gpt` to the front; put 
                        # the dimensions back in their original order 
                        # before removing the g2 slice
                        ncDat = merged.transpose(*ncDat.dims)
                        ncDat = ncDat.isel(gpt=gKeep)
                    # endif ncVar (variables without a gpt dimension 
                    # are retained as they are)

                    # stuff new dataset with combined or unaltered data
                    outDS[ncVar] = XA.DataArray(ncDat)
                # end ncVar loop

                outDS.to_netcdf(outNC, mode='w')
            # end combination loop
        # endwith kDS
    # end gPointCombine()

    def findOptimal(self, iCombine):
        """
        Determine which g-point combination for a given iteration in a band
        optimized the cost function

        The optimization is not implemented yet: the first trial file is 
        always selected. self.optNC is set to that file and its 
        `g_combine` attribute is stored in self.gCombine and printed.

        Input
            iCombine -- int, iteration number for g-point combinations 
                in a given band

        Raises
            IndexError if self.trialNC is empty (gPointCombine() has not 
            been called or there was only 1 g-point)
        """

        # TO DO: loop through trial netCDFs, calculate their normalized 
        # cost function components, then determine what is the optimal solution
        iOpt = 0
        self.optNC = self.trialNC[iOpt]

        # determine optimal combination and grab g-point combination attribute
        with XA.open_dataset(self.optNC) as optDS:
            self.gCombine['iter{:02d}'.format(iCombine)] = \
              optDS.attrs['g_combine']

        for combo in self.gCombine.values(): print(combo)
    # end findOptimal()

    def runBandRRTMGP(self):
        """
        Run the RRTMGP executable for a single band

        Placeholder -- nothing is run yet
        """
    # end runBandRRTMGP()
# end kDistOptBand

def computeBB():
    """
    Placeholder for the broadband flux calculation that is supposed to 
    follow the g-point combination; nothing is computed yet
    """

    return None
# end computeBB()


# Static Inputs

In [3]:
# spectral domain switch: exactly one of longwave and shortwave is active
DOLW = True
DOSW = not DOLW

# the number of bands depends on the domain
if DOLW:
    NBANDS = 16
else:
    NBANDS = 14
# endif DOLW

# index of the forcing scenario (0 is no forcing...need a more 
# comprehensive list)
IFORCING = 0


# Paths

Robert: change the `EXE` and `GARAND` global variables to whatever path you have for your by-band executable and Garand profile specs, respectively.

In [4]:
# top-level project directory that holds the reference files and the 
# by-band executable
PROJECT = '/global/project/projectdirs/e3sm/pernak18/'

# full k-distribution that is split into by-band files
KFULLNC = '{}/reference_netCDF/g-point-reduce/'.format(PROJECT) + \
  'rrtmgp-data-lw-g256-2018-12-04.nc'

# RRTMGP executable that works with a single band
EXE = '{}/g-point-reduction/k-distribution-opt/rrtmgp_garand_atmos'.format(
    PROJECT)

# Garand profile specifications (LBLRTM inputs and outputs)
GARAND = '{}/reference_netCDF/g-point-reduce/'.format(PROJECT) + \
  'lblrtm-lw-flux-inputs-outputs-garandANDpreind.nc'

# all static inputs have to exist before anything is run; GARAND is 
# checked as well because it is copied when the executable is staged
PATHS = [KFULLNC, EXE, GARAND]
for PATH in PATHS: pathCheck(PATH)


# Executable Test

This cell is meant to do a simple staging of the by-band _k_-distribution files and RRTMGP inputs (i.e., Garand profile specifications) and run a modified RRTMGP executable that works with single bands.

In [5]:
import subprocess as SUB

# divide full k-distribution into subsets for each band
print('Band splitting commenced')
kFiles = kDistBandSplit(KFULLNC)
print('Band splitting completed')

testDir = 'exe_test'
pathCheck(testDir, mkdir=True)

topDir = os.getcwd()
os.chdir(testDir)

# always return to the starting directory, even if the staging fails; 
# otherwise re-running the cell would start from inside `testDir`
try:
    # so we don't overwrite the LBL results
    inRRTMGP = 'rrtmgp-inputs-outputs.nc'
    shutil.copyfile(GARAND, inRRTMGP)

    # only doing one band for now
    for kFile in kFiles:
        base = os.path.basename(kFile)

        # by-band file paths are relative to the starting directory 
        # (an absolute path is left as it is by os.path.join)
        kAbsPath = os.path.join(topDir, kFile)

        # replace a link that is left over from a previous run
        if os.path.islink(base): os.unlink(base)
        os.symlink(kAbsPath, base)

        #SUB.call([EXE, inRRTMGP, base])
        break
    # end kFile loop
finally:
    os.chdir(topDir)
# end try

Band splitting commenced
Band splitting completed


# Main Driver

In [17]:
# the full k-distribution is broken up into one file per band
print('Band splitting commenced')
kFiles = kDistBandSplit(KFULLNC)
print('Band splitting completed')

# unit-offset bands to process; an empty list selects every band
bands = [1]
if len(bands) == 0: bands = range(1, NBANDS+1)

# one optimization object per band and combination iteration
for iBand, kFile in enumerate(kFiles):
    band = iBand + 1

    if band in bands:
        iComb = 0
        combining = True

        while combining:
            # iteration counter is unit-offset
            iComb += 1
            print(kFile)

            # first pass uses the by-band file without any g-point 
            # combinations, later passes use the previous optimum
            kObj = kDistOptBand(kFile, band, DOLW, IFORCING, iComb)
            kObj.gPointCombine()

            if kObj.nGpt == 1:
                # nothing left to combine in this band
                combining = False
            else:
                # run RRTMGP on all files self.trialNC (each g-point 
                # combination)
                #kObj.runBandRRTMGP()

                # pick the best combination of this iteration
                kObj.findOptimal(kObj.iCombine)

                # the optimal netCDF is saved outside of the work directory
                optCopy = '/'.join(
                    (kObj.optDir, os.path.basename(kObj.optNC)))
                shutil.copy2(kObj.optNC, optCopy)

                # the next iteration starts from the netCDF of the 
                # g-point combination that minimizes the cost function
                kFile = kObj.optNC
            # endif nGpt
        # end while

        print('Band {} complete'.format(band))

        # cleanup
        shutil.rmtree(kObj.workDir)
    # endif band
# end kFile loop

# small edit to flux file -- rename the `record` dimension


Band splitting commenced
Band splitting completed
band_k_dist/coefficients_lw_band01.nc
0+1
/global/u1/p/pernak18/RRTMGP/g-point-reduction/workdir_band_1/coefficients_LW_g00-01_iter01.nc
0+1
/global/u1/p/pernak18/RRTMGP/g-point-reduction/workdir_band_1/coefficients_LW_g00-01_iter02.nc
0+1
/global/u1/p/pernak18/RRTMGP/g-point-reduction/workdir_band_1/coefficients_LW_g00-01_iter03.nc
0+1
/global/u1/p/pernak18/RRTMGP/g-point-reduction/workdir_band_1/coefficients_LW_g00-01_iter04.nc
0+1
/global/u1/p/pernak18/RRTMGP/g-point-reduction/workdir_band_1/coefficients_LW_g00-01_iter05.nc
0+1
/global/u1/p/pernak18/RRTMGP/g-point-reduction/workdir_band_1/coefficients_LW_g00-01_iter06.nc
0+1
/global/u1/p/pernak18/RRTMGP/g-point-reduction/workdir_band_1/coefficients_LW_g00-01_iter07.nc
0+1
/global/u1/p/pernak18/RRTMGP/g-point-reduction/workdir_band_1/coefficients_LW_g00-01_iter08.nc
0+1
/global/u1/p/pernak18/RRTMGP/g-point-reduction/workdir_band_1/coefficients_LW_g00-01_iter09.nc
0+1
/global/u1/p/pern