### Importing packages and data_utils.py

In [1]:
from climsim_utils.data_utils import *

### Instantiating class

In [2]:
grid_path = '../grid_info/ClimSim_low-res_grid-info.nc'
norm_path = './normalizations/'

grid_info = xr.open_dataset(grid_path)
input_mean = xr.open_dataset(norm_path + 'inputs/input_mean.nc').astype(np.float32)
input_max = xr.open_dataset(norm_path + 'inputs/input_max.nc').astype(np.float32)
input_min = xr.open_dataset(norm_path + 'inputs/input_min.nc').astype(np.float32)
output_scale = xr.open_dataset(norm_path + 'outputs/output_scale.nc').astype(np.float32)

ml_backend = 'pytorch'
input_abbrev = 'mlexpand'
output_abbrev = 'mlo'
data = data_utils(grid_info = grid_info, 
                  input_mean = input_mean, 
                  input_max = input_max, 
                  input_min = input_min, 
                  output_scale = output_scale,
                  ml_backend = ml_backend,
                  normalize = True,
                  input_abbrev = input_abbrev,
                  output_abbrev = output_abbrev,
                  save_h5=True,
                  save_npy=False,
                  )



In [4]:
data.input_mean

### Create training data

In [2]:
from climsim_utils.data_utils import *

grid_path = '../grid_info/ClimSim_low-res_grid-info.nc'
norm_path = './normalizations/'



grid_info = xr.open_dataset(grid_path)
input_mean = xr.open_dataset(norm_path + 'inputs/input_mean_v4_pervar.nc').astype(np.float32)
input_max = xr.open_dataset(norm_path + 'inputs/input_max_v4_pervar.nc').astype(np.float32)
input_min = xr.open_dataset(norm_path + 'inputs/input_min_v4_pervar.nc').astype(np.float32)
output_scale = xr.open_dataset(norm_path + 'outputs/output_scale.nc').astype(np.float32)

ml_backend = 'pytorch'
input_abbrev = 'mlexpand'
output_abbrev = 'mlo'
data = data_utils(grid_info = grid_info, 
                  input_mean = input_mean, 
                  input_max = input_max, 
                  input_min = input_min, 
                  output_scale = output_scale,
                  ml_backend = ml_backend,
                  normalize = True,
                  input_abbrev = input_abbrev,
                  output_abbrev = output_abbrev,
                  save_h5=True,
                  save_npy=False,
                  )



# set data path
data.data_path = '/ocean/projects/atm200007p/jlin96/neurips_proj/e3sm_train/'
data.data_path = "/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/"

data_save_path =  "/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/preprocessed/"
# set inputs and outputs to V1 subset
#data.set_to_v1_vars()
#data.set_to_v2_vars()
#data.set_to_v4_vars()
data.set_to_v4_rnn_vars()

# v1 inputs (name :: description :: dimension :: units): 

# 'state_t' :: air temperature :: 60 :: K 
# 'state_q0001' :: specific humidity :: 60 :: kg/kg
# 'state_ps' :: surface pressure :: 1 :: Pa
# 'pbuf_SOLIN' :: solar insolation :: 1 :: W/m^2
# 'pbuf_LHFLX' :: surface latent heat flux :: 1 :: W/m^2
# 'pbuf_SHFLX' :: surface sensible heat flux :: 1 :: W/m^2

# v1 outputs (name :: description :: dimension :: units): 

# 'ptend_t' :: heating tendency :: 60 :: K/s 
# 'ptend_q0001' :: moistening tendency :: 60 :: kg/kg/s
# 'cam_out_NETSW' :: net shortwave flux at surface :: 1 :: W/m^2
# 'cam_out_FLWDS' :: downward longwave flux at surface :: 1 :: W/m^2 
# 'cam_out_PRECSC' :: snow rate (liquid water equivalent) :: 1 :: m/s 
# 'cam_out_PRECC' :: rain rate :: 1 :: m/s
# 'cam_out_SOLS' :: downward visible direct solar flux to surface :: 1 :: W/m^2
# 'cam_out_SOLL' :: downward near-infrared direct solar flux to surface :: 1 :: W/m^2
# 'cam_out_SOLSD' :: downward diffuse solar flux to surface :: 1 :: W/m^2
# 'cam_out_SOLLD' :: downward diffuse near-infrared solar flux to surface :: 1 :: W/m^2

# set regular expressions for selecting training data
#data.set_regexps(data_split = 'train', 
#                 regexps = ['E3SM-MMF.mli.000[1234567]-*-*-*.nc', # years 1 through 7
#                            'E3SM-MMF.mli.0008-01-*-*.nc']) # first month of year 8

regexp0 = 'E3SM-MMF.mlexpand.0001-02-01-*.nc'

regexp0 = 'E3SM-MMF.mlexpand.0001-02-01-0*.nc'


data.set_regexps(data_split = 'train',
                regexps = [regexp0]) # years 1   month 2
                 #regexps = ['E3SM-MMF.mlexpand.0001-02-*-*.nc']) # years 1   month 2

# set temporal subsampling
#data.set_stride_sample(data_split = 'train', stride_sample = 7)
data.set_stride_sample(data_split = 'train', stride_sample = 1)

# create list of files to extract data from
data.set_filelist(data_split = 'train')
                 
# save numpy files of training data
# data.save_as_npy(data_split = 'train', save_path = data_save_path)

2D Input variables: ['state_t', 'state_rh', 'state_q0002', 'state_q0003', 'state_u', 'state_v', 'state_t_dyn', 'state_q0_dyn', 'state_u_dyn', 'tm_state_t_dyn', 'tm_state_q0_dyn', 'tm_state_u_dyn', 'pbuf_ozone', 'pbuf_CH4', 'pbuf_N2O']
1D (scalar) Input variables: ['state_ps', 'pbuf_SOLIN', 'pbuf_LHFLX', 'pbuf_SHFLX', 'pbuf_TAUX', 'pbuf_TAUY', 'pbuf_COSZRS', 'cam_in_ALDIF', 'cam_in_ALDIR', 'cam_in_ASDIF', 'cam_in_ASDIR', 'cam_in_LWUP', 'cam_in_ICEFRAC', 'cam_in_LANDFRAC', 'cam_in_OCNFRAC', 'cam_in_SNOWHICE', 'cam_in_SNOWHLAND', 'tm_state_ps', 'tm_pbuf_SOLIN', 'tm_pbuf_LHFLX', 'tm_pbuf_SHFLX', 'tm_pbuf_COSZRS', 'clat', 'slat', 'lat', 'lon']
2D Output variables: ['ptend_t', 'ptend_q0001', 'ptend_q0002', 'ptend_q0003', 'ptend_u', 'ptend_v']
1D (scalar) Output variables: ['cam_out_NETSW', 'cam_out_FLWDS', 'cam_out_PRECSC', 'cam_out_PRECC', 'cam_out_SOLS', 'cam_out_SOLL', 'cam_out_SOLSD', 'cam_out_SOLLD']


In [3]:
# Test new v4-RNN datasaver
data.data_path = '/ocean/projects/atm200007p/jlin96/neurips_proj/e3sm_train/'
data.data_path = "/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/"

data_save_path =  "/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/preprocessed/"
# set inputs and outputs to V1 subset
#data.set_to_v1_vars()
#data.set_to_v2_vars()
#data.set_to_v4_vars()
data.set_to_v4_rnn_vars()

data.set_filelist(data_split = 'train')

#filelist = data.get_filelist('train')

filelist=["/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/0001-02/E3SM-MMF.mlexpand.0001-02-01-02400_sp.nc",
          "/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/0001-02/E3SM-MMF.mlexpand.0001-02-01-03600_sp.nc"]


def save_as_h5_keeplev_new(save_path = '', save_filename = ''):
    '''
    This function saves the training data as a .h5 file while keeping vertical structure
    '''
    self = data
    #filelist = self.get_filelist(data_split)
    i = 0

    vars_1D_inp = self.vars_1D_inp
    vars_2D_inp = self.vars_2D_inp
    vars_1D_outp = self.vars_1D_outp
    vars_2D_outp = self.vars_2D_outp

    if "lat" in vars_1D_inp:
        vars_1D_inp.remove('lat'); vars_1D_inp.remove('lon')   

    compression = "gzip"
    compression = "lzf"
    comp_level = 8
    comp_level = 5

    nrows = 0 # keep track of total number of rows, grows with each new file 
    nlev = 60 
    nx      = len(self.vars_2D_inp)
    nx_sfc  = len(self.vars_1D_inp)
    ny      = len(self.vars_2D_outp)
    ny_sfc  = len(self.vars_1D_outp)

    # if save_path not exist, create it
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    # add "/" to the end of save_path if it does not exist
    if save_path[-1] != '/':
        save_path = save_path + '/'

    if save_filename == '':
        save_filename = 'data.h5'

    if save_filename[-3:] !='.h5':
        save_filename = save_filename + '.h5'

    h5_path = save_path + "_" + save_filename
    print("Saving preprocessed input/output data to {}".format(h5_path), flush=True)
    print("Starting generator, len of filelist is {}".format(len(filelist)), flush=True)

    with h5py.File(save_filename, 'w') as hdf:

        for file in filelist:

            # print(file)
            # read inputs
            ds_input = self.get_input(file)
            # read targets
            ds_target = self.get_target(file)
            # print(ds_input)
            # print("shape ds input", ds_input.shape)

            # normalization, scaling
            if self.normalize:
                ds_input = (ds_input - self.input_mean)/(self.input_max - self.input_min)
                ds_target = ds_target*self.output_scale
                # for varname in ds_input:
                #     ds_input = (ds_input - self.input_mean)/(self.input_max - self.input_min)
                #     ds_target = ds_target*self.output_scale
            else:
                ds_input = ds_input.drop(['lat','lon'])

            if i==0:

                for inpvar in vars_2D_inp:
                    print("LEV-",inpvar, "min:", ds_input[inpvar].values.min(), "max:", ds_input[inpvar].values.max())

                for inpvar in vars_1D_inp:
                    print("SCA-",inpvar, "min:", ds_input[inpvar].values.min(), "max:", ds_input[inpvar].values.max())

                for outpvar in vars_2D_outp:
                    print("yLEV-",outpvar, "min:", ds_target[outpvar].values.min(), "max:", ds_target[outpvar].values.max())

                for outpvar in vars_1D_outp:
                    print("ySCA-",outpvar, "min:", ds_target[outpvar].values.min(), "max:", ds_target[outpvar].values.max())


            # stack
            # ds = ds.stack({'batch':{'sample','ncol'}})
            ds_input = ds_input.stack({'batch':{'ncol'}})

            ds_input_lev = ds_input[vars_2D_inp]
            ds_input_lev = ds_input_lev.to_dataarray(dim='features', name='inputs_lev')

            ds_input_scalar = ds_input[vars_1D_inp]
            ds_input_scalar = ds_input_scalar.to_dataarray(dim='features', name='inputs_scalar')
            ds_target = ds_target.stack({'batch':{'ncol'}})

            ds_target_lev = ds_target[vars_2D_outp]
            ds_target_lev = ds_target_lev.to_dataarray(dim='features', name='outputs_lev')

            ds_target_scalar = ds_target[vars_1D_outp]
            ds_target_scalar = ds_target_scalar.to_dataarray(dim='features', name='outputs_scalar')


            input_lev = np.transpose(ds_input_lev.values)
            input_sca = np.transpose(ds_input_scalar.values)
            output_lev = np.transpose(ds_target_lev.values)
            output_sca = np.transpose(ds_target_scalar.values)

            ds_input.close(); ds_target.close()
            ds_input_lev.close(); ds_input_scalar.close()
            ds_target_lev.close(); ds_target_scalar.close()

            # if i % 20 == 1: 
            #     print("i = {}, shape input lev {}; input_sca {}  ".format(i, npy_input_lev.shape, npy_input_sca.shape),flush=True)

            if self.normalize:
                # replace inf and nan with 0
                input_lev[np.isinf(input_lev)] = 0 ; input_lev[np.isnan(input_lev)] = 0
                input_sca[np.isinf(input_sca)] = 0 ; input_sca[np.isnan(input_sca)] = 0
                output_lev[np.isinf(output_lev)] = 0 ; output_lev[np.isnan(output_lev)] = 0
                output_sca[np.isinf(output_sca)] = 0 ; output_sca[np.isnan(output_sca)] = 0

            # Now save to the HDF5 file 
            nrows = nrows + input_lev.shape[0]

            if i==0:
                xlay = hdf.create_dataset("input_lev", (nrows, nlev, nx), maxshape=(None, nlev, nx),
                                                compression=compression, dtype='float32')#, compression_opts=comp_level)
                xsfc = hdf.create_dataset("input_sca", (nrows, nx_sfc), maxshape=(None, nx_sfc),
                                                compression=compression, dtype='float32')#, compression_opts=comp_level)

                ylay = hdf.create_dataset("output_lev", (nrows, nlev, ny), maxshape=(None, nlev, ny),
                                                compression=compression, dtype='float32')#, compression_opts=comp_level)
                ysfc = hdf.create_dataset("output_sca", (nrows, ny_sfc), maxshape=(None, ny_sfc),
                                                compression=compression, dtype='float32')#, compression_opts=comp_level)       

                xlay[:] = input_lev; xlay.attrs['varnames'] = vars_2D_inp
                xsfc[:] = input_sca; xsfc.attrs['varnames'] = vars_1D_inp

                ylay[:] = output_lev; ylay.attrs['varnames'] = vars_2D_outp
                ysfc[:] = output_sca; ysfc.attrs['varnames'] = vars_1D_outp

                print("Min, max xlay:", input_lev.min(), input_lev.max())
                print("Min, max xsfc:", input_sca.min(), input_sca.max())
                print("Min, max ylay:", output_lev.min(), output_lev.max())
                print("Min, max ysfc:", output_sca.min(), output_sca.max())

            else:
                xlay =  hdf['input_lev']
                xsfc =  hdf['input_sca']
                ylay =  hdf['output_lev']
                ysfc =  hdf['output_sca']

                xlay.resize(nrows, axis=0); xlay[i0:nrows, :] = input_lev
                xsfc.resize(nrows, axis=0); xsfc[i0:nrows, :] = input_sca
                ylay.resize(nrows, axis=0); ylay[i0:nrows, :] = output_lev
                ysfc.resize(nrows, axis=0); ysfc[i0:nrows, :] = output_sca

            i0 = nrows
            i = i +1

            gc.collect()

        # del npy_input_lev, npy_input_sca, npy_output_lev, npy_output_sca


save_path = data_save_path 
save_filename = 'testnew.h5'
save_as_h5_keeplev_new(save_path,save_filename )

2D Input variables: ['state_t', 'state_rh', 'state_q0002', 'state_q0003', 'state_u', 'state_v', 'state_t_dyn', 'state_q0_dyn', 'state_u_dyn', 'tm_state_t_dyn', 'tm_state_q0_dyn', 'tm_state_u_dyn', 'pbuf_ozone', 'pbuf_CH4', 'pbuf_N2O']
1D (scalar) Input variables: ['state_ps', 'pbuf_SOLIN', 'pbuf_LHFLX', 'pbuf_SHFLX', 'pbuf_TAUX', 'pbuf_TAUY', 'pbuf_COSZRS', 'cam_in_ALDIF', 'cam_in_ALDIR', 'cam_in_ASDIF', 'cam_in_ASDIR', 'cam_in_LWUP', 'cam_in_ICEFRAC', 'cam_in_LANDFRAC', 'cam_in_OCNFRAC', 'cam_in_SNOWHICE', 'cam_in_SNOWHLAND', 'tm_state_ps', 'tm_pbuf_SOLIN', 'tm_pbuf_LHFLX', 'tm_pbuf_SHFLX', 'tm_pbuf_COSZRS', 'clat', 'slat', 'lat', 'lon']
2D Output variables: ['ptend_t', 'ptend_q0001', 'ptend_q0002', 'ptend_q0003', 'ptend_u', 'ptend_v']
1D (scalar) Output variables: ['cam_out_NETSW', 'cam_out_FLWDS', 'cam_out_PRECSC', 'cam_out_PRECC', 'cam_out_SOLS', 'cam_out_SOLL', 'cam_out_SOLSD', 'cam_out_SOLLD']
Saving preprocessed input/output data to /network/group/aopp/predict/HMC009_UKKONEN_CLI

In [21]:
self= data
assert self.input_train is not None
state_ps = self.input_train[:,self.ps_index]
if self.normalize:
    state_ps = state_ps*(self.input_max['state_ps'].values - self.input_min['state_ps'].values) + self.input_mean['state_ps'].values
state_ps = np.reshape(state_ps, (-1, self.num_latlon))
pressure_grid_p1 = np.array(self.grid_info['P0']*self.grid_info['hyai'])[:,np.newaxis,np.newaxis]
pressure_grid_p2 = self.grid_info['hybi'].values[:, np.newaxis, np.newaxis] * state_ps[np.newaxis, :, :]
self.pressure_grid_train = pressure_grid_p1 + pressure_grid_p2
self.dp_train = self.pressure_grid_train[1:61,:,:] - self.pressure_grid_train[0:60,:,:]
self.dp_train = self.dp_train.transpose((1,2,0))

AssertionError: 

In [22]:
self.ps_index

360

In [5]:
print(data.vars_1D_inp)

['state_ps', 'pbuf_SOLIN', 'pbuf_LHFLX', 'pbuf_SHFLX', 'pbuf_TAUX', 'pbuf_TAUY', 'pbuf_COSZRS', 'cam_in_ALDIF', 'cam_in_ALDIR', 'cam_in_ASDIF', 'cam_in_ASDIR', 'cam_in_LWUP', 'cam_in_ICEFRAC', 'cam_in_LANDFRAC', 'cam_in_OCNFRAC', 'cam_in_SNOWHICE', 'cam_in_SNOWHLAND', 'lat', 'lon']


In [10]:
with h5py.File('testh5file', 'w') as hdf:
    hf = hdf.create_dataset('input_lev', data=np.zeros((30,3)), 
                    compression='gzip', compression_opts=3,
                    dtype='float32')
    hf.attrs['varnames'] =data.vars_2D_inp 


In [12]:
hf = h5py.File('testh5file', 'r')

In [19]:
hf['input_lev'].attrs['varnames']

array(['state_t', 'state_q0001', 'state_q0002', 'state_q0003', 'state_u',
       'state_v', 'pbuf_ozone', 'pbuf_CH4', 'pbuf_N2O'], dtype=object)

In [7]:
#data.save_as_h5_keeplev(data_split = 'train', save_path = data_save_path)
savename = regexp0.removesuffix('*.nc')    # Returns 'abcdc'
print(savename)
data.save_as_h5_keeplev(data_split = 'train', save_path = data_save_path, save_filename = savename)

E3SM-MMF.mlexpand.0001-02-01-
Saving daily data input/output file to /network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/preprocessed/train_E3SM-MMF.mlexpand.0001-02-01-.h5
Min max inputs_lev (-0.9132654666900635,0.9945988655090332)
Min max inputs_sca (-0.886635422706604,0.9597903490066528)
Min max output_lev (-2.0515353679656982,1.8390401601791382)
Min max output_sca (0.0,2.5923449993133545)


In [5]:
data.get_filelist('train')

['/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/0001-02/E3SM-MMF.mlexpand.0001-02-01-02400_sp.nc',
 '/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/0001-02/E3SM-MMF.mlexpand.0001-02-01-03600_sp.nc',
 '/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/0001-02/E3SM-MMF.mlexpand.0001-02-01-04800_sp.nc',
 '/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/0001-02/E3SM-MMF.mlexpand.0001-02-01-06000_sp.nc',
 '/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/0001-02/E3SM-MMF.mlexpand.0001-02-01-07200_sp.nc',
 '/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/0001-02/E3SM-MMF.mlexpand.0001-02-01-08400_sp.nc',
 '/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/0001-02/E3SM-MM

In [6]:
file="/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/0001-02/E3SM-MMF.mlexpand.0001-02-01-02400_sp.nc"
print(file)
ds = xr.open_dataset(file, engine = 'netcdf4')

/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/0001-02/E3SM-MMF.mlexpand.0001-02-01-02400_sp.nc


In [43]:
ds['state_v']

In [15]:
ch4 = ds['pbuf_CH4']

In [32]:
ch4[:,0]

In [18]:
ch4_sc = (ch4 - data.input_mean)/(data.input_max - data.input_min)

In [20]:
print(ch4.min(), ch4.max())
print(ch4_sc.min(), ch4_sc.max())

<xarray.DataArray 'pbuf_CH4' ()> Size: 4B
array(1.7876416e-10, dtype=float32) <xarray.DataArray 'pbuf_CH4' ()> Size: 4B
array(9.986059e-07, dtype=float32)
<xarray.Dataset> Size: 232B
Dimensions:                ()
Data variables: (12/58)
    cam_in_ALDIF           float32 4B -0.565
    cam_in_ALDIR           float32 4B -0.5815
    cam_in_ASDIF           float32 4B -0.5536
    cam_in_ASDIR           float32 4B -0.5705
    cam_in_ICEFRAC         float32 4B 1.788e-10
    cam_in_LANDFRAC        float32 4B 1.788e-10
    ...                     ...
    tm_pbuf_COSZRS         float32 4B -0.2504
    clat                   float32 4B 1.788e-10
    slat                   float32 4B 1.788e-10
    icol                   float32 4B 1.788e-10
    pbuf_SOLIN_pm          float32 4B 1.272e-13
    pbuf_COSZRS_pm         float32 4B -0.2504 <xarray.Dataset> Size: 232B
Dimensions:                ()
Data variables: (12/58)
    cam_in_ALDIF           float32 4B -0.565
    cam_in_ALDIR           float32 4B -0.

In [27]:

ch4_sc['state_ps'][:].values

array([[-2.7819579, -2.7819579, -2.7819579, ..., -2.7819579, -2.7819579,
        -2.7819579],
       [-2.7819579, -2.7819579, -2.7819579, ..., -2.7819579, -2.7819579,
        -2.7819579],
       [-2.7819579, -2.7819579, -2.7819579, ..., -2.7819579, -2.7819579,
        -2.7819579],
       ...,
       [-2.7819579, -2.7819579, -2.7819579, ..., -2.7819579, -2.7819579,
        -2.7819579],
       [-2.7819579, -2.7819579, -2.7819579, ..., -2.7819579, -2.7819579,
        -2.7819579],
       [-2.7819579, -2.7819579, -2.7819579, ..., -2.7819579, -2.7819579,
        -2.7819579]], dtype=float32)

In [30]:
ch4_sc['pbuf_CH4'][:,0].values

array([0.18617925, 0.18233107, 0.17807856, 0.1734049 , 0.16831806,
       0.16285957, 0.1571065 , 0.15115932, 0.14512093, 0.13907184,
       0.13305308, 0.12706527, 0.12108126, 0.11506306, 0.10897277,
       0.10277096, 0.09640224, 0.08977236, 0.0827229 , 0.08641438,
       0.11151074, 0.1278672 , 0.10257551, 0.07144294, 0.03778508,
       0.01205091, 0.00293423,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan],
      dtype=float32)

In [17]:
data.input_mean['pbuf_CH4']

In [31]:
data.input_max['pbuf_CH4']

In [17]:
len(data.get_filelist('train'))

69

In [6]:
data.save_as_npy(data_split = 'train', save_path = data_save_path)

In [7]:
                
fx = h5py.File(data_save_path+"train_input.h5", 'r')
fy = h5py.File(data_save_path+"train_target.h5", 'r')


In [15]:
fx["data"]

<HDF5 dataset "data": shape (26496, 557), type "<f4">

In [16]:
ncol = 384
print(fx["data"].shape[0]//ncol)

69


In [18]:
data.normalize

True

In [31]:
#filelist = data.get_filelist('train')

filelist=["/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/0001-02/E3SM-MMF.mlexpand.0001-02-01-02400_sp.nc",
          "/network/group/aopp/predict/HMC009_UKKONEN_CLIMSIM/ClimSim_data/ClimSim_low-res-expanded/train/0001-02/E3SM-MMF.mlexpand.0001-02-01-03600_sp.nc"]

def gen_keepvertical():
    i = 0
    for file in filelist:
       # print(file)
        # read inputs
        ds_input = data.get_input(file)
        # read targets
        ds_target = data.get_target(file)
       # print(ds_input)
       # print("shape ds input", ds_input.shape)

        # normalization, scaling
        if data.normalize:
            ds_input = (ds_input - data.input_mean)/(data.input_max - data.input_min)
            ds_target = ds_target*data.output_scale
        else:
            ds_input = ds_input.drop(['lat','lon'])

        # stack
        # ds = ds.stack({'batch':{'sample','ncol'}})
        ds_input = ds_input.stack({'batch':{'ncol'}})
        #print(ds_input)
        
        vars_1D = []; vars_2D = []
        
        all_vars = list(ds_input.keys())
        for var in all_vars:
            if 'lev' in ds_input[var].dims:
                vars_2D.append(var)
            else:
                vars_1D.append(var)  
                
        if i==0:
            
            for inpvar in vars_2D:
                print("LEV-",inpvar, "min:", ds_input[inpvar].values.min(), "max:", ds_input[inpvar].values.max())

            for inpvar in vars_1D:
                print("SCA-",inpvar, "min:", ds_input[inpvar].values.min(), "max:", ds_input[inpvar].values.max())

        #print(vars_2D)
       # vars_1D.remove('lat'); vars_1D.remove('lon')        
                
        ds_input_lev = ds_input[vars_2D]
        ds_input_lev = ds_input_lev.to_dataarray(dim='features', name='inputs_lev')
        
        ds_input_scalar = ds_input[vars_1D]
        ds_input_scalar = ds_input_scalar.to_dataarray(dim='features', name='inputs_scalar')
        #ds_input = ds_input.to_stacked_array('mlvar', sample_dims=['batch'], name=data.input_abbrev)
        #ds_input = ds_input.to_stacked_array('mlvar', sample_dims=['batch','lev'], name=data.input_abbrev)

        
        # dso = dso.stack({'batch':{'sample','ncol'}})
        ds_target = ds_target.stack({'batch':{'ncol'}})
        
        vars_1D = []; vars_2D = []
        
        all_vars = list(ds_target.keys())
        for var in all_vars:
            if 'lev' in ds_target[var].dims:
                vars_2D.append(var)
            else:
                vars_1D.append(var)  
                
        #print(vars_2D)

        ds_target_lev = ds_target[vars_2D]
        ds_target_lev = ds_target_lev.to_dataarray(dim='features', name='outputs_lev')
        
        ds_target_scalar = ds_target[vars_1D]
        ds_target_scalar = ds_target_scalar.to_dataarray(dim='features', name='outputs_scalar')
        i = i + 1
        
      #  ds_target = ds_target.to_stacked_array('mlvar', sample_dims=['batch'], name=data.output_abbrev)
        #print(ds_input_lev.values.shape)
        
        #yield (ds_input.values, ds_target.values)
        yield (ds_input_lev.values, ds_input_scalar.values, ds_target_lev.values, ds_target_scalar.values)





In [16]:
file=filelist[0]


In [32]:
i = 0
normalize = True 

for item in gen_keepvertical():
    npy_input_lev0 = item[0]
    npy_input_sca0 = item[1]
    npy_output_lev0 = item[2]
    npy_output_sca0 = item[3]    

    if normalize:
        # replace inf and nan with 0
        npy_input_lev0[np.isinf(npy_input_lev0)] = 0 ; npy_input_lev0[np.isnan(npy_input_lev0)] = 0
        npy_input_sca0[np.isinf(npy_input_sca0)] = 0 ; npy_input_sca0[np.isnan(npy_input_sca0)] = 0
        npy_output_lev0[np.isinf(npy_output_lev0)] = 0 ; npy_output_lev0[np.isnan(npy_output_lev0)] = 0
        npy_output_sca0[np.isinf(npy_output_sca0)] = 0 ; npy_output_sca0[np.isnan(npy_output_sca0)] = 0
    
        
    if i==0:
        npy_input_lev = npy_input_lev0
        npy_input_sca = npy_input_sca0
        npy_output_lev = npy_output_lev0
        npy_output_sca = npy_output_sca0
    else:
        npy_input_lev = np.concatenate((npy_input_lev, npy_input_lev0 ))
        npy_input_sca = np.concatenate((npy_input_sca, npy_input_sca0 ))
        npy_output_lev = np.concatenate((npy_output_lev,npy_output_lev0 ))
        npy_output_sca = np.concatenate((npy_output_sca, npy_output_sca0 ))
        
    #print(item[0].dtype, item[0].shape)
    #print(item[1].dtype, item[1].shape)
    #print(item[2].dtype, item[2].shape)
    #print(item[3].dtype, item[3].shape)
    print("xlev", npy_input_lev.shape,npy_input_lev.min(), npy_input_lev.max())
    print("xsfc" ,npy_input_sca.shape,npy_input_sca.min(), npy_input_sca.max())
    print("ylev", npy_output_lev.shape,npy_output_lev.min(), npy_output_lev.max())
    print("ysfc", npy_output_sca.shape,npy_output_sca.min(), npy_output_sca.max())
    i += 1 


LEV- state_t min: -0.47139478 max: 0.4257792
LEV- state_rh min: 1.686834099154969e-07 max: 1.2675894228430487
LEV- state_u min: -0.34757945 max: 0.47107396
LEV- state_v min: -0.3061085 max: 0.37708783
LEV- state_t_dyn min: -0.4582763 max: 0.32795382
LEV- state_q0_dyn min: -0.31840444 max: 0.30475682
LEV- state_u_dyn min: -0.31736884 max: 0.2741454
LEV- tm_state_t_dyn min: -0.43210196 max: 0.3193997
LEV- tm_state_q0_dyn min: -0.34382746 max: 0.39103693
LEV- tm_state_u_dyn min: -0.30808392 max: 0.2767687
LEV- state_t_prvphy min: -10.871728 max: 14.160386
LEV- state_q0001_prvphy min: -13.907437 max: 6.8541975
LEV- state_u_prvphy min: -8.720417 max: 8.947801
LEV- tm_state_t_prvphy min: -11.511813 max: 15.222005
LEV- tm_state_q0001_prvphy min: -15.063039 max: 9.174218
LEV- tm_state_u_prvphy min: -9.537982 max: 11.222265
LEV- pbuf_ozone min: -0.65362036 max: 0.7232802
LEV- pbuf_CH4 min: nan max: nan
LEV- pbuf_N2O min: nan max: nan
SCA- state_ps min: -0.65923727 max: 0.11903675
SCA- pbuf_SOLI

In [72]:
npy_input_lev.shape

(18, 60, 384)

In [65]:
npy_iterator = list(gen_keepvertical())

(9, 60, 384)
(9, 60, 384)


In [64]:
npy_input = np.concatenate([npy_iterator[x][0] for x in range(len(npy_iterator))])

TypeError: object of type 'function' has no len()

In [19]:

class IterablDataset_lev():
    def __init__(this_self, data_generator):
        this_self.data_generator = data_generator

    def as_numpy_iterator(this_self):
        for item in this_self.data_generator:

            # Convert item to numpy array
            input_lev_array = np.array(item[0])
            input_scalar_array = np.array(item[1])

            target_lev_array = np.array(item[2])
            target_scalar_array = np.array(item[3])

            # Assert final dimensions are correct.
            #assert input_array.shape[-1] == this_self.output_shapes[0][-1]
            #assert target_array.shape[-1] == this_self.output_shapes[1][-1]
            print(input_array)
            
            yield (input_array, target_array)
            
dataset = IterablDataset_lev(
    gen_keepvertical(),
)

In [63]:
filelist = data.get_filelist('train')

ds_input = data.get_input(filelist[0])

In [117]:
ds_target = data.get_target(filelist[0])
ds_target.keys()

KeysView(<xarray.Dataset> Size: 565kB
Dimensions:         (lev: 60, ncol: 384)
Dimensions without coordinates: lev, ncol
Data variables: (12/14)
    ptend_t         (lev, ncol) float32 92kB -1.434e-05 ... -9.506e-05
    ptend_q0001     (lev, ncol) float32 92kB 0.0 0.0 0.0 ... 2.325e-08 3.063e-08
    ptend_q0002     (lev, ncol) float32 92kB 0.0 0.0 ... -1.073e-09 -1.441e-09
    ptend_q0003     (lev, ncol) float32 92kB 0.0 0.0 ... -4.314e-09 -1.442e-09
    ptend_u         (lev, ncol) float32 92kB 0.0 0.0 ... 7.509e-06 -1.026e-05
    ptend_v         (lev, ncol) float32 92kB 0.0 0.0 ... -4.532e-05 -1.876e-05
    ...              ...
    cam_out_PRECSC  (ncol) float32 2kB 0.0 0.0 0.0 0.0 ... 0.0 4.41e-09 5.17e-09
    cam_out_PRECC   (ncol) float32 2kB 0.0 5.429e-09 ... 4.41e-09 5.791e-09
    cam_out_SOLS    (ncol) float32 2kB 0.0 0.0 0.0 0.0 ... 108.1 113.8 108.8
    cam_out_SOLL    (ncol) float32 2kB 0.0 0.0 0.0 0.0 ... 168.7 107.1 177.7
    cam_out_SOLSD   (ncol) float32 2kB 0.0 0.0 0.0 0

In [106]:
ds_input.keys()


KeysView(<xarray.Dataset> Size: 862kB
Dimensions:           (lev: 60, ncol: 384)
Dimensions without coordinates: lev, ncol
Data variables: (12/28)
    state_t           (lev, ncol) float32 92kB 213.7 213.4 216.8 ... 264.6 270.2
    state_q0001       (lev, ncol) float32 92kB 1.484e-06 1.486e-06 ... 0.002006
    state_q0002       (lev, ncol) float32 92kB 2.127e-33 1.705e-33 ... 1.73e-06
    state_q0003       (lev, ncol) float32 92kB 1.201e-25 1.228e-25 ... 1.731e-06
    state_u           (lev, ncol) float32 92kB -76.54 -66.56 ... 3.737 -1.25
    state_v           (lev, ncol) float32 92kB -2.411 -5.106 ... -8.319 -5.164
    ...                ...
    cam_in_SNOWHLAND  (ncol) float32 2kB 0.0 0.0 0.0 ... 0.01101 0.00382
    pbuf_ozone        (lev, ncol) float32 92kB 2.501e-07 2.456e-07 ... 7.145e-08
    pbuf_CH4          (lev, ncol) float32 92kB 1.705e-07 1.682e-07 ... 9.986e-07
    pbuf_N2O          (lev, ncol) float32 92kB 2.709e-08 2.573e-08 ... 4.909e-07
    lat               (ncol) flo

In [88]:
ds_input.keys()
#ds_input["state_t"].shape
vars_1D = []
vars_2D = []

all_vars = list(ds_input.keys())
for var in all_vars:
    print(var)
    if 'lev' in ds_input[var].dims:
        vars_2D.append(var)
    else:
        vars_1D.append(var)

state_t
state_q0001
state_q0002
state_q0003
state_u
state_v
state_ps
pbuf_SOLIN
pbuf_LHFLX
pbuf_SHFLX
pbuf_TAUX
pbuf_TAUY
pbuf_COSZRS
cam_in_ALDIF
cam_in_ALDIR
cam_in_ASDIF
cam_in_ASDIR
cam_in_LWUP
cam_in_ICEFRAC
cam_in_LANDFRAC
cam_in_OCNFRAC
cam_in_SNOWHICE
cam_in_SNOWHLAND
pbuf_ozone
pbuf_CH4
pbuf_N2O
lat
lon


In [102]:
vars_1D.remove('lat')

TypeError: 'str' object cannot be interpreted as an integer

In [103]:
vars_1D.remove('lat')
vars_1D.remove('lon')


In [113]:
ds = ds_input[vars_2D]


In [114]:
ds

In [115]:
#ds = ds.to_stacked_array('mlvar', sample_dims=['ncol'], name='inputs_lev')
ds = ds.to_dataarray(dim='features', name=None)

In [116]:
ds

In [86]:
ds_input[var].dims

('ncol',)

In [76]:
ds_input.variables

Frozen({'state_t': <xarray.Variable (lev: 60, ncol: 384)> Size: 92kB
array([[213.69246, 213.42026, 216.78186, ..., 223.19545, 222.76604,
        227.97432],
       [221.97739, 225.43901, 219.18047, ..., 230.10353, 236.63925,
        234.35434],
       [228.30034, 228.46686, 233.8395 , ..., 230.09628, 236.09723,
        241.05812],
       ...,
       [291.79578, 288.5246 , 294.99802, ..., 260.5572 , 262.12875,
        267.87933],
       [293.03784, 289.73593, 296.22693, ..., 260.48407, 263.24918,
        269.00827],
       [294.28735, 290.97095, 297.48245, ..., 259.69998, 264.55057,
        270.22974]], dtype=float32), 'state_q0001': <xarray.Variable (lev: 60, ncol: 384)> Size: 92kB
array([[1.4841227e-06, 1.4864689e-06, 1.4791285e-06, ..., 1.4471311e-06,
        1.4588319e-06, 1.4461871e-06],
       [1.4771180e-06, 1.4762566e-06, 1.4679080e-06, ..., 1.4125744e-06,
        1.4272078e-06, 1.4265547e-06],
       [1.4648356e-06, 1.4665736e-06, 1.4442642e-06, ..., 1.3399480e-06,
        1.33

In [None]:


def save_as_h5_keeplev_new(self,
             data_split, 
             save_path = '',
             save_filename = ''):
    '''
    This function saves the training data as a .h5 file while keeping vertical structure
    '''
    filelist = self.get_filelist(data_split)
    i = 0

    print("Starting generator, len of filelist is {}".format(len(filelist)), flush=True)
    for file in filelist:
       # print(file)
        # read inputs
        ds_input = self.get_input(file)
        # read targets
        ds_target = self.get_target(file)
       # print(ds_input)
       # print("shape ds input", ds_input.shape)

        # normalization, scaling
        if self.normalize:
            ds_input = (ds_input - self.input_mean)/(self.input_max - self.input_min)
            ds_target = ds_target*self.output_scale
        else:
            ds_input = ds_input.drop(['lat','lon'])

        # stack
        # ds = ds.stack({'batch':{'sample','ncol'}})
        ds_input = ds_input.stack({'batch':{'ncol'}})
        #print(ds_input)

        #vars_1D_inp = []; vars_2D_inp = []
        #all_vars = list(ds_input.keys())
        #for var in all_vars:
        #    if 'lev' in ds_input[var].dims:
        #        vars_2D_inp.append(var)
        #    else:
        #        vars_1D_inp.append(var)  
        vars_1D_inp = self.vars_1D_inp
        vars_2D_inp = self.vars_2D_inp

        #print(vars_2D)
        if "lat" in vars_1D_inp:
            vars_1D_inp.remove('lat'); vars_1D_inp.remove('lon')        

        ds_input_lev = ds_input[vars_2D_inp]
        ds_input_lev = ds_input_lev.to_dataarray(dim='features', name='inputs_lev')

        ds_input_scalar = ds_input[vars_1D_inp]
        ds_input_scalar = ds_input_scalar.to_dataarray(dim='features', name='inputs_scalar')
        #ds_input = ds_input.to_stacked_array('mlvar', sample_dims=['batch'], name=data.input_abbrev)
        #ds_input = ds_input.to_stacked_array('mlvar', sample_dims=['batch','lev'], name=data.input_abbrev)


        # dso = dso.stack({'batch':{'sample','ncol'}})
        ds_target = ds_target.stack({'batch':{'ncol'}})

        #vars_1D_outp = []; vars_2D_outp = []
        #all_vars = list(ds_target.keys())
        #for var in all_vars:
        #    if 'lev' in ds_target[var].dims:
        #        vars_2D_outp.append(var)
        #    else:
        #        vars_1D_outp.append(var)  
        vars_1D_outp = self.vars_1D_outp
        vars_2D_outp = self.vars_2D_outp

        #print(vars_2D)

        ds_target_lev = ds_target[vars_2D_outp]
        ds_target_lev = ds_target_lev.to_dataarray(dim='features', name='outputs_lev')

        ds_target_scalar = ds_target[vars_1D_outp]
        ds_target_scalar = ds_target_scalar.to_dataarray(dim='features', name='outputs_scalar')

      #  ds_target = ds_target.to_stacked_array('mlvar', sample_dims=['batch'], name=data.output_abbrev)
        #print(ds_input_lev.values.shape)

        #yield (ds_input.values, ds_target.values)
        input_lev = np.transpose(ds_input_lev.values)
        input_sca = np.transpose(ds_input_scalar.values)
        output_lev = np.transpose(ds_target_lev.values)
        output_sca = np.transpose(ds_target_scalar.values)
        ds_input.close(); ds_target.close()
        ds_input_lev.close(); ds_input_scalar.close()
        ds_target_lev.close(); ds_target_scalar.close()

        yield input_lev, input_sca, output_lev, output_sca


        npy_input_lev0 = item[0]
        npy_input_sca0 = item[1]
        npy_output_lev0 = item[2]
        npy_output_sca0 = item[3]    

        if i % 20 == 1: 
            print("i = {}, shape input lev {}; input_sca {}  ".format(i, npy_input_lev.shape, npy_input_sca.shape),flush=True)

        if self.normalize:
            # replace inf and nan with 0
            npy_input_lev0[np.isinf(npy_input_lev0)] = 0 ; npy_input_lev0[np.isnan(npy_input_lev0)] = 0
            npy_input_sca0[np.isinf(npy_input_sca0)] = 0 ; npy_input_sca0[np.isnan(npy_input_sca0)] = 0
            npy_output_lev0[np.isinf(npy_output_lev0)] = 0 ; npy_output_lev0[np.isnan(npy_output_lev0)] = 0
            npy_output_sca0[np.isinf(npy_output_sca0)] = 0 ; npy_output_sca0[np.isnan(npy_output_sca0)] = 0


        if i==0:
            npy_input_lev = npy_input_lev0
            npy_input_sca = npy_input_sca0
            npy_output_lev = npy_output_lev0
            npy_output_sca = npy_output_sca0
        else:
            npy_input_lev = np.concatenate((npy_input_lev, npy_input_lev0 ))
            npy_input_sca = np.concatenate((npy_input_sca, npy_input_sca0 ))
            npy_output_lev = np.concatenate((npy_output_lev,npy_output_lev0 ))
            npy_output_sca = np.concatenate((npy_output_sca, npy_output_sca0 ))

        del item
        gc.collect()

        #print(item[0].dtype, item[0].shape)
        #print(item[1].dtype, item[1].shape)
        #print(item[2].dtype, item[2].shape)
        #print(item[3].dtype, item[3].shape)
        #print(npy_input_lev.shape,npy_input_sca.shape, npy_output_lev0.shape, npy_output_sca0.shape  )
        i += 1 

    # if save_path not exist, create it
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    # add "/" to the end of save_path if it does not exist
    if save_path[-1] != '/':
        save_path = save_path + '/'

    if save_filename == '':
        save_filename = 'data.h5'

    if save_filename[-3:] !='.h5':
        save_filename = save_filename + '.h5'

    h5_path = save_path + data_split + "_" + save_filename
    compression_level = 8
    print("Attempting to Save daily data input/output file to {}".format(h5_path), flush=True)
    print("Min max inputs_lev ({},{})".format(npy_input_lev.min(), npy_input_lev.max()))
    print("Min max inputs_sca ({},{})".format(npy_input_sca.min(), npy_input_sca.max()))
    print("Min max output_lev ({},{})".format(npy_output_lev.min(), npy_output_lev.max()))
    print("Min max output_sca ({},{})".format(npy_output_sca.min(), npy_output_sca.max()))

    with h5py.File(h5_path, 'w') as hdf:
        hdf.create_dataset('input_lev', data=npy_input_lev, 
                        compression='gzip', compression_opts=compression_level,
                        dtype='float32')
        hdf.create_dataset('input_sca', data=npy_input_sca, 
                        compression='gzip', compression_opts=compression_level,
                        dtype='float32')            
        hdf.create_dataset('output_lev', data=npy_output_lev, 
                        compression='gzip', compression_opts=compression_level,
                        dtype='float32') 
        hdf.create_dataset('output_sca', data=npy_output_sca, 
                        compression='gzip', compression_opts=compression_level,
                        dtype='float32') 

    del npy_input_lev, npy_input_sca, npy_output_lev, npy_output_sca

### Create validation data

In [7]:
# set regular expressions for selecting validation data
data.set_regexps(data_split = 'val',
                 regexps = ['E3SM-MMF.mli.0008-0[23456789]-*-*.nc', # months 2 through 9 of year 8
                            'E3SM-MMF.mli.0008-1[012]-*-*.nc', # months 10 through 12 of year 8
                            'E3SM-MMF.mli.0009-01-*-*.nc']) # first month of year 9
# set temporal subsampling
data.set_stride_sample(data_split = 'val', stride_sample = 7)
# create list of files to extract data from
data.set_filelist(data_split = 'val')
# save numpy files of validation data
data.save_as_npy(data_split = 'val', save_path = '')

2023-06-14 05:19:01.725068: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


### Create scoring data

In [8]:
# set regular expressions for selecting scoring data (stride of 6 is needed for daily averaging)
data.set_regexps(data_split = 'scoring',
                 regexps = ['E3SM-MMF.mli.0008-0[23456789]-*-*.nc', # months 2 through 9 of year 8
                            'E3SM-MMF.mli.0008-1[012]-*-*.nc', # months 10 through 12 of year 8
                            'E3SM-MMF.mli.0009-01-*-*.nc']) # first month of year 9
# set temporal subsampling
data.set_stride_sample(data_split = 'scoring', stride_sample = 6)
# create list of files to extract data from
data.set_filelist(data_split = 'scoring')
# save numpy files of scoring data
data.save_as_npy(data_split = 'scoring', save_path = '')

2023-06-14 05:35:01.140716: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype int32
	 [[{{node Placeholder/_0}}]]


In [9]:
print('finished')

finished
