# By-Band _g_-Point Reduction

# Dependencies

`numpy` is installed in the Python environment at NERSC (`module load python`), but `xarray` is not, so the user must install the package on their own. `PIPPATH` is the assumed location. This notebook depends heavily on `xarray`.

In [1]:
import os, sys, shutil, glob

# "standard" install
import numpy as np

from multiprocessing import Pool

# user-specific site-packages directory that holds the libraries the
# user installed themselves (e.g., `xarray`)
# alternate location used by another user:
#     '/global/homes/e/emlawer/.local/cori/3.8-anaconda-2020.11/' + \
#         'lib/python3.8/site-packages'
PIPPATH = '/global/homes/k/kcadyper/.local/lib/python3.8/site-packages/'

# make the `common` directory and the user-installed packages importable
PATHS = ['common', PIPPATH]
for path in PATHS:
    sys.path.append(path)
# end path loop

# needed at AER unless `pandas` is updated: silence every warning
# category (a narrower `FutureWarning`-only filter was used previously)
import warnings
warnings.simplefilter('ignore', Warning)

# user must do `pip install xarray` on cori (or other NERSC machines)
import xarray as xa

# local module
import by_band_lib as BYBAND

# Static Inputs

In [2]:
# exactly one spectral domain is processed per run: longwave ('LW', 
# 16 bands) when `DOLW` is set, otherwise shortwave ('SW', 14 bands)
DOLW = True
DOMAIN, NBANDS = ('LW', 16) if DOLW else ('SW', 14)

# `True`: the full k-distribution still has to be divided into bands
# `False`: files with the divided k-distribution already exist
BANDSPLIT = True

# Paths

In [3]:
# project space (note the trailing slash), RRTMGP executable, and 
# directory with the reference netCDF files
PROJECT = '/global/project/projectdirs/e3sm/pernak18/'
EXE = PROJECT + '/g-point-reduction/garand_atmos/rrtmgp_garand_atmos'
REFDIR = PROJECT + '/reference_netCDF/g-point-reduce'

# test (RRTMGP) and reference (LBL) flux netCDF files, full 
# k-distribution, and by-band Garand input file for the chosen domain
fluxSuffix = 'flux-inputs-outputs-garandANDpreind.nc'
if not DOLW:
    GARAND = REFDIR + '/charts_multi_garand_template_single_band.nc'
    KFULLNC = REFDIR + '/rrtmgp-data-sw-g224-2018-12-04.nc'
    REFNC = REFDIR + '/charts-sw-' + fluxSuffix
    TESTNC = REFDIR + '/rrtmgp-sw-' + fluxSuffix
else:
    GARAND = REFDIR + '/multi_garand_template_single_band.nc'
    # superseded k-distribution: REFDIR + '/rrtmgp-data-lw-g256-2018-12-04.nc'
    KFULLNC = REFDIR + '/rrtmgp-data-lw-g256-jen-xs.nc'
    REFNC = REFDIR + '/lblrtm-lw-' + fluxSuffix
    TESTNC = REFDIR + '/rrtmgp-lw-' + fluxSuffix
    # alternate: TESTNC = 'rrtmgp-lw-flux-inputs-outputs-garand-all.nc'
# endif SW/LW

# working subdirectories for the per-band k-distributions and fluxes
BANDSPLITDIR = 'band_k_dist'
FULLBANDFLUXDIR = 'full_band_flux'

# NOTE(review): `PATHS` only holds the import directories ('common' 
# and `PIPPATH`), so the input files defined above are not checked 
# here -- confirm that this is intended
for PATH in PATHS:
    BYBAND.pathCheck(PATH)
# end PATH loop

CWD = os.getcwd()

# pickle files for the band dictionary and the cost optimization object
KPICKLE = DOMAIN + '_k-dist.pickle'
pickleCost = DOMAIN + '_cost-optimize.pickle'

# Band Splitting

Break up full _k_-distribution file into separate distributions for each band, then calculate the corresponding fluxes. This should only need to be run once.

After some clarifications from Robert (30-Nov-2020), I believe the plan of action is:

1. create Nbands k-distribution files
2. drive the Fortran executable Nbands times to produce Nbands flux results
3. the trial g-point combinations then loop over bands and the possible g-point combinations within each band, creating k-distribution and band-wise flux files for each possible combination
4. The Python code assembles broadband fluxes from the band-wise flux files in order to compute the cost functions

In [4]:
if not BANDSPLIT:
    # reuse the per-band k-distribution and flux files already on disk
    kFiles = sorted(glob.glob(
        '{}/coefficients_{}_band??.nc'.format(BANDSPLITDIR, DOMAIN)))
    fullBandFluxes = sorted(glob.glob(
        '{}/flux_{}_band??.nc'.format(FULLBANDFLUXDIR, DOMAIN)))

    # nothing to reuse if either search came back empty
    if not (kFiles and fullBandFluxes):
        print('WARNING: set `BANDSPLIT` to `True` and run this cell again')
else:
    print('Band splitting commenced')

    # output directories are created if they do not exist yet
    BYBAND.pathCheck(BANDSPLITDIR, mkdir=True)
    BYBAND.pathCheck(FULLBANDFLUXDIR, mkdir=True)

    kFiles = []
    fullBandFluxes = []
    for iBand in range(NBANDS):
        # divide full k-distribution into subsets for each band
        # NOTE(review): the meaning of the positional `1` is defined 
        # in `by_band_lib` -- confirm
        kObj = BYBAND.gCombine_kDist(
            KFULLNC, iBand, DOLW, 1, 
            fullBandKDir=BANDSPLITDIR, 
            fullBandFluxDir=FULLBANDFLUXDIR, 
            profilesNC=GARAND)
        kFiles.append(kObj.kBandNC)
        kObj.kDistBand()

        # quick, non-parallelized flux calculations (because the 
        # executable is run in one directory)
        # TO DO: HAVEN'T TESTED THIS SINCE IT HAS BEEN MOVED OUT OF THE CLASS
        BYBAND.fluxCompute(
            kObj.kBandNC, kObj.profiles, kObj.exe, 
            kObj.fullBandFluxDir, kObj.fluxBandNC)
        fullBandFluxes.append(kObj.fluxBandNC)
    # end band loop

    print('Band splitting completed')
# endif BANDSPLIT


Band splitting commenced
Band splitting completed


# Pressure Levels for Cost Function

Pressure levels [Pa] for the Garand atmospheres are printed to standard output with indices that can be used in the cost function:

In [None]:
# print "<level index> <pressure>" for every level in the first column 
# of the first record of the reference file
with xa.open_dataset(REFNC) as refDS:
    # xarray reads lazily, so pull the pressures into memory while 
    # the file is still open instead of touching them after it closes
    pLev = refDS['p_lev'].isel(record=0).load()
# endwith

# separate loop variable so `pLev` is not overwritten while iterating
for iLev, pLevel in enumerate(pLev.isel(col=0).values):
    print(iLev, pLevel)

# _g_-Point Combining

Combine the _g_-point-reduced fluxes for a given band with the full-band fluxes from the other bands, find the optimal _g_-point combination for the given iteration, then proceed to the next iteration.

First, find all _g_-point combinations for each band. Store the band object in a dictionary for use in flux computation. This cell only needs to be run once, and to save time in development, the dictionary is saved in a `pickle` file and can be loaded in the next cell.

In [19]:
# this should be parallelized; also is part of preprocessing so we 
# shouldn't have to run it multiple times
kBandDict = {}
for iBand, kFile in enumerate(kFiles):
    #if iBand != 0: continue
    band = iBand + 1
    kObj = BYBAND.gCombine_kDist(kFile, iBand, DOLW, 1, 
        fullBandKDir=BANDSPLITDIR, 
        fullBandFluxDir=FULLBANDFLUXDIR)
    kObj.gPointCombine()
    kBandDict['band{:02d}'.format(band)] = kObj

    print('Band {} complete'.format(band))
# end kFile loop

import pickle
with open(KPICKLE, 'wb') as fp: pickle.dump(kBandDict, fp)

Band 1 complete
Band 2 complete
Band 3 complete
Band 4 complete
Band 5 complete
Band 6 complete
Band 7 complete
Band 8 complete
Band 9 complete
Band 10 complete
Band 11 complete
Band 12 complete
Band 13 complete
Band 14 complete
Band 15 complete
Band 16 complete


Now compute fluxes in parallel for every _g_-point combination -- merging occurs in each band, and the combinations in a given band are used with broadband fluxes from the other bands. Each of these concatenations has an associated `xarray` dataset. Cost function components are then calculated for each dataset, and the one that minimizes the error in the cost function will have its associated netCDF saved to disk.

Uncomment pickling block to restore dictionary from previous cell.

# Reduction and Optimization

Test and reference netCDF files have flux and heating rate arrays of dimension `record` x `col` x `lay`/`lev` and `band` if the array is broken down by band. `record` represents atmospheric specifications that can be used in [forcing scenarios](https://github.com/pernak18/g-point-reduction/wiki/LW-Forcing-Number-Convention#g-point-reduction-convention-).

Alternatively, the atmospheric specifications from any scenario can also be used. "Bare" parameters like `heating_rate` and `flux_net` will be treated as PD specifications, so the user will have to specify explicitly if they want the fluxes or heating rates from other scenarios by using the `flux_*_N` and `heating_rate_N` convention, where `N` is the scenario index as given in the forcing-scenario convention linked above. The same convention applies to band fluxes and HRs. `N` = 0 will work just like `heating_rate` and `flux_net`.

Forcing for this exercise is defined as PI subtracted from scenario (2-6). The convention for these quantities is `*_forcing_N`, where `*` is the typical flux or heating rate (band or broadband) string, and `N` again is the forcing scenario (`N` of 2 would be forcing due to doubling methane).

In [None]:
# pickling for developement purposes so this dictionary doesn't need 
# to be regenerated for every code change.
import pickle
with open(KPICKLE, 'rb') as fp: kBandDict = pickle.load(fp)

# components used in cost function computation
# variable names in RRTMGP and LBL flux netCDF file, except for 
# forcing, which has to be specifed with "_forcing" appended to 
# the appropriate array. e.g., "flux_net_forcing" for net flux forcing
# netCDF arrays ('heating_rate', 'flux_net', 'band_flux_net', etc.)
# or forcing scenarios: convention is  ('flux_net_forcing_3') for 
CFCOMPS = ['flux_dif_net', 'flux_dir_dn', 'heating_rate']
CFCOMPS = ['flux_net', 'band_flux_net', 'heating_rate',
  'heating_rate_7', 'flux_net_forcing_5', 'flux_net_forcing_6',
  'flux_net_forcing_7', 'flux_net_forcing_9', 'flux_net_forcing_10',
  'flux_net_forcing_11', 'flux_net_forcing_12', 'flux_net_forcing_13',
  'flux_net_forcing_14', 'flux_net_forcing_15', 'flux_net_forcing_16',
  'flux_net_forcing_17', 'flux_net_forcing_18']

# level indices for each component 
# (e.g., 0 for surface, 41 for Garand TOA)
# one dictionary key per component so each component
# can have its own set of level indices
CFLEVS = {}
LEVELS = {}
LEVELS['flux_net'] = [0, 26, 42]
LEVELS['band_flux_net'] = [42]
LEVELS['heating_rate'] = range(42)
LEVELS['heating_rate_7'] = range(42)
LEVELS['flux_net_forcing_5'] = [0, 26, 42]
LEVELS['flux_net_forcing_6'] = [0, 26, 42]
LEVELS['flux_net_forcing_7'] = [0, 26, 42]
LEVELS['flux_net_forcing_9'] = [0, 26, 42]
LEVELS['flux_net_forcing_10'] = [0, 26, 42]
LEVELS['flux_net_forcing_11'] = [0, 26, 42]
LEVELS['flux_net_forcing_12'] = [0, 26, 42]
LEVELS['flux_net_forcing_13'] = [0, 26, 42]
LEVELS['flux_net_forcing_14'] = [0, 26, 42]
LEVELS['flux_net_forcing_15'] = [0, 26, 42]
LEVELS['flux_net_forcing_16'] = [0, 26, 42]
LEVELS['flux_net_forcing_17'] = [0, 26, 42]
LEVELS['flux_net_forcing_18'] = [0, 26, 42]
CFLEVS = dict(LEVELS)

# weights for each cost function component
CFWGT = [0.6, 0.04, 0.12, 0.12, 0.01, 0.02, 0.04, 0.005,
        0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005,
        0.005]

# directory under which to store k-distribution files that optimize 
# the cost function for each iteration and diagnistics (if necessary)
CFDIR = 'xsecs-test'
BYBAND.pathCheck(CFDIR, mkdir=True)

# write diagnostic netCDFs with cost function components
DIAGNOSTICS = True

RESTORE = True

if RESTORE:
    assert os.path.exists(pickleCost), 'Cannot find {}'.format(pickleCost)
    print('Restoring {}'.format(pickleCost))
    with open(pickleCost, 'rb') as fp: coObj = pickle.load(fp)
else:
    # instantiate object for computing cost
    coObj = BYBAND.gCombine_Cost(
        kBandDict, fullBandFluxes, REFNC, TESTNC, 1, 
        DOLW, profilesNC=GARAND, exeRRTMGP=EXE, 
        costFuncComp=CFCOMPS, costFuncLevs=CFLEVS, 
        costWeights=CFWGT, optDir='./{}'.format(CFDIR))
# endif RESTORE

# number of iterations for the optimization
NITER = 149

for i in range(coObj.iCombine, NITER+1):
    wgtInfo = ['{:.2f} ({})'.format(
        wgt, comp) for wgt, comp in zip(CFWGT, CFCOMPS)]
    wgtInfo = ' '.join(wgtInfo)

    print('Iteration {}'.format(i))
    coObj.kMap()
    coObj.fluxComputePool()
    coObj.fluxCombine()
    coObj.costFuncComp(init=True)
    coObj.costFuncComp()
    coObj.findOptimal()
    if coObj.optimized: break
    if DIAGNOSTICS: coObj.costDiagnostics()
    coObj.setupNextIter()
    with open(pickleCost, 'wb') as fp: pickle.dump(coObj, fp)
    coObj.calcOptFlux(
        fluxOutNC='optimized_{}_fluxes_iter{:03d}.nc'.format(DOMAIN, i))
# end iteration loop

KOUTNC = 'rrtmgp-data-{}-g-red.nc'.format(DOMAIN)
coObj.kDistOpt(KFULLNC, kOutNC=KOUTNC)
coObj.calcOptFlux(fluxOutNC='optimized_{}_fluxes.nc'.format(DOMAIN))

In [None]:
import glob
import matplotlib.pyplot as plt

# collect the minimum trial cost from the diagnostics file of each 
# iteration
Iter = []
CostMin = []
ncfiles = sorted(
    glob.glob("xsecs-test/diagnostics/cost_components_iter*.nc"))

for ncFile in ncfiles:
    # the iteration number is taken from the 3 characters that follow 
    # "iter"; it is stored as an integer so that sorting and the plot 
    # axes are numeric rather than categorical
    ipos = ncFile.find('iter')
    Iter.append(int(ncFile[ipos+4:ipos+7]))
    with xa.open_dataset(ncFile) as ds:
        cost = ds.trial_total_cost.values
        CostMin.append(min(cost))
    # endwith
# end file loop

# order both arrays by iteration number
npIter = np.array(Iter)
npCostMin = np.array(CostMin)
iSort = np.argsort(npIter)
npIter = npIter[iSort]
npCostMin = npCostMin[iSort]

# minimum cost as a function of iteration
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.set_ylim(80., 100.)
ax.plot(npIter, npCostMin, marker='.')
ax.set_xticks([0, 20, 40, 60, 80, 100, 120, 140, 160])
ax.set_xlabel("Iteration")
ax.set_ylabel("Cost Function")
plt.show()

# change in minimum cost from one iteration to the next
fig2 = plt.figure()
ax2 = fig2.add_axes([0, 0, 1, 1])

# number of iterations found; the builtin `len` is deliberately not 
# reassigned so that later cells can still call it
nIter = npCostMin.shape[0]
diff = np.diff(npCostMin)
ax2.plot(npIter[1:], diff, marker='.')
ax2.set_xticks([0, 20, 40, 60, 80, 100, 120, 140, 160])
ax2.set_xlabel("Iteration")
ax2.set_ylabel("Cost Function Delta")

# dashed reference line at zero change
zeroX = np.array([0., nIter])
zeroY = np.array([0., 0.])
ax2.plot(zeroX, zeroY, linestyle='dashed')
plt.show()

for ip, (iteration, delta) in enumerate(zip(npIter[1:], diff)):
    print(ip, iteration, delta)




In [None]:
# list the iteration tags of the diagnostics files that are available
Iter = []
CostMin = []
# sorted so the listing does not depend on the order `glob` returns
diagFiles = sorted(
    glob.glob("xsecs-test/diagnostics/cost_components_iter*.nc"))
for ncFile in diagFiles:
    # 3-character iteration tag that follows "iter" in the file name
    ipos = ncFile.find('iter')
    Iter.append(ncFile[ipos+4:ipos+7])
    with xa.open_dataset(ncFile) as ds:
        cost = ds.trial_total_cost.values
        CostMin.append(min(cost))
    # endwith
# end file loop

for ip in Iter:
    print(ip)

In [21]:
# pickling for development purposes so this dictionary doesn't need 
# to be regenerated for every code change.
# NOTE: unpickling can execute arbitrary code -- only load pickle 
# files that were written by this notebook
import pickle
with open(KPICKLE, 'rb') as fp:
    kBandDict = pickle.load(fp)

# components used in cost function computation: variable names in 
# the RRTMGP and LBL flux netCDF files ('heating_rate', 'flux_net', 
# 'band_flux_net', etc.). forcing has to be specified with "_forcing" 
# appended to the appropriate array (e.g., 'flux_net_forcing' for net 
# flux forcing), and a trailing index selects the forcing scenario 
# (e.g., 'flux_net_forcing_3')
# alternate component set: ['flux_dif_net', 'flux_dir_dn', 'heating_rate']
CFCOMPS = [
    'flux_net', 'band_flux_net', 'heating_rate', 'heating_rate_7',
    'flux_net_forcing_5', 'flux_net_forcing_6', 'flux_net_forcing_7',
    'flux_net_forcing_9', 'flux_net_forcing_10', 'flux_net_forcing_11',
    'flux_net_forcing_12', 'flux_net_forcing_13', 'flux_net_forcing_14',
    'flux_net_forcing_15', 'flux_net_forcing_16', 'flux_net_forcing_17',
    'flux_net_forcing_18']

# level indices for each component; one dictionary key per component 
# so each component can have its own set of level indices
# NOTE(review): the original comment gave 0 for surface and 41 for 
# Garand TOA, but index 42 is used below -- confirm which is intended
LEVELS = {
    'flux_net': [0, 26, 42],
    'band_flux_net': [42],
    'heating_rate': range(42),
    'heating_rate_7': range(42),
}
# every forcing component uses the same three levels; each one gets 
# its own list so that editing one entry does not affect the others
LEVELS.update(
    (comp, [0, 26, 42]) for comp in CFCOMPS if '_forcing_' in comp)
CFLEVS = dict(LEVELS)

# weights for each cost function component (same order as `CFCOMPS`)
CFWGT = [0.6, 0.04, 0.12, 0.12, 0.01, 0.02, 0.04, 0.005,
        0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005,
        0.005]

# directory under which to store k-distribution files that optimize 
# the cost function for each iteration and diagnostics (if necessary)
CFDIR = 'xsecs-test'
BYBAND.pathCheck(CFDIR, mkdir=True)

# write diagnostic netCDFs with cost function components
DIAGNOSTICS = True

# `True`: resume from the pickled cost object; the component, level, 
# and weight settings above are then not passed to anything in this 
# cell. `False`: start a new optimization with the settings above
RESTORE = True

if RESTORE:
    # explicit check instead of `assert`, which is stripped under `-O`
    if not os.path.exists(pickleCost):
        raise FileNotFoundError('Cannot find {}'.format(pickleCost))
    print('Restoring {}'.format(pickleCost))
    with open(pickleCost, 'rb') as fp:
        coObj = pickle.load(fp)
else:
    # each component needs exactly one weight and one set of levels
    if len(CFWGT) != len(CFCOMPS):
        raise ValueError('{} weights given for {} components'.format(
            len(CFWGT), len(CFCOMPS)))
    noLevels = [comp for comp in CFCOMPS if comp not in CFLEVS]
    if noLevels:
        raise ValueError(
            'No levels defined for {}'.format(', '.join(noLevels)))

    # instantiate object for computing cost
    coObj = BYBAND.gCombine_Cost(
        kBandDict, fullBandFluxes, REFNC, TESTNC, 1, 
        DOLW, profilesNC=GARAND, exeRRTMGP=EXE, 
        costFuncComp=CFCOMPS, costFuncLevs=CFLEVS, 
        costWeights=CFWGT, optDir='./{}'.format(CFDIR))
# endif RESTORE

# number of iterations for the optimization
NITER = 100

# "weight (component)" summary; it does not change between iterations 
# so it is built once (it is not used elsewhere in this cell)
wgtInfo = ' '.join(
    '{:.2f} ({})'.format(wgt, comp) for wgt, comp in zip(CFWGT, CFCOMPS))

# resume at the iteration stored in the cost object
for i in range(coObj.iCombine, NITER+1):
    print('Iteration {}'.format(i))
    coObj.kMap()
    coObj.fluxComputePool()
    coObj.fluxCombine()
    coObj.costFuncComp(init=True)
    coObj.costFuncComp()
    coObj.findOptimal()

    # NOTE(review): this cell originally mixed `coObj.dCost` and 
    # `coObj.dcost`, so one of the two would raise `AttributeError`. 
    # the camelCase spelling of the first reference is used throughout 
    # here, matching the other `coObj` names in this cell -- confirm 
    # against `by_band_lib`
    print(coObj.dCost)
    # flag iterations whose cost change exceeds 0.1
    if coObj.dCost > 0.1:
        print(i, coObj.dCost)

    if coObj.optimized: break
    if DIAGNOSTICS: coObj.costDiagnostics()
    coObj.setupNextIter()

    # checkpoint so the optimization can be resumed with `RESTORE`
    with open(pickleCost, 'wb') as fp:
        pickle.dump(coObj, fp)
    coObj.calcOptFlux(
        fluxOutNC='optimized_{}_fluxes_iter{:03d}.nc'.format(DOMAIN, i))
# end iteration loop

# final reduced k-distribution and the corresponding fluxes
KOUTNC = 'rrtmgp-data-{}-g-red.nc'.format(DOMAIN)
coObj.kDistOpt(KFULLNC, kOutNC=KOUTNC)
coObj.calcOptFlux(fluxOutNC='optimized_{}_fluxes.nc'.format(DOMAIN))

Restoring LW_cost-optimize.pickle
