In [None]:
import os, glob
import sys
sys.path.append("../utils")
import numpy as np
import xarray as xr
import tensorflow as tf

from other_utils import to_list

# Test z-score normalization on-the-fly

In this Jupyter Notebook, the on-the-fly z-score normalization is tested based on the preprocessed ERA5 to IFS downscaling data (under `/p/scratch/deepacf/maelstrom/maelstrom_data/ap5_michael/preprocessed_era5_ifs/all_files`, cf. `data_dir` below). 

The check is performed on a smaller number of predictor/predictand variables. Also, no shuffling is performed, which eases reading the data manually from the corresponding netCDF-file. In particular, the first time step in the file `preproc_2016-01.nc` is checked.

We start with some preparations on the data files to be used.

In [None]:
# parameters
data_dir = "/p/scratch/deepacf/maelstrom/maelstrom_data/ap5_michael/preprocessed_era5_ifs/all_files"

predictors = ["2t_in", "z_in", "sshf_in"]
predictands = ["t2m_tar", "z_tar"]

# work on parameters
# Sorting makes the file order reproducible, so that nc_files[0] always denotes the same file.
nc_files_dir = glob.glob(os.path.join(data_dir, "preproc*.nc"))
nc_files = sorted(nc_files_dir)

# Fail early with a clear message instead of an obscure error further down in the notebook.
if not nc_files:
    raise FileNotFoundError(f"No netCDF-files matching 'preproc*.nc' found under '{data_dir}'.")

# All variables that have to be read from the netCDF-files (predictors first, then predictands).
all_vars = to_list(predictors) + to_list(predictands)

In [None]:
# List the netCDF-files that were found (one path per line, in the sorted order used below).
print(f"Found {len(nc_files)} netCDF-file(s) under {data_dir}:")
print("\n".join(nc_files))

Adapted function to create a `tf.data.Dataset` object from the netCDF-files.

In [None]:
def make_dataset(nc_files, predictors, predictands, norm_data):
    """
    Create a batched tf.data.Dataset from netCDF-files whose data is z-score normalized on-the-fly.

    Every time step of every file yields one sample, i.e. a pair (input, target) whose last axis
    enumerates the predictor and predictand variables, respectively. The samples are grouped into
    mini-batches of two and normalized with the statistics provided in norm_data.

    :param nc_files: list of paths to the netCDF-files to read (processed in the given order)
    :param predictors: list of predictor (input) variable names
    :param predictands: list of predictand (target) variable names
    :param norm_data: dictionary with the keys "mu_in", "std_in", "mu_tar" and "std_tar"; the values must be
                      broadcastable against the last (variables) axis of the input and target data, respectively
    :return: tuple of the tf.data.Dataset and a dictionary holding the shape of a single (unbatched)
             input and target sample under the keys "shape_in" and "shape_tar"
    :raises ValueError: if no sample can be read from nc_files (e.g. empty file list)
    """
    # Derive the variable selection from the arguments instead of relying on a notebook-level global.
    all_vars = to_list(predictors) + to_list(predictands)

    def gen(nc_files_ds):
        """Yield one (input, target) pair of numpy arrays per time step of each file."""
        for file in nc_files_ds:
            # The context manager releases the file handle, also when the iteration is stopped early.
            with xr.open_dataset(file, engine='netcdf4') as ds_file:
                ds = ds_file[all_vars]
                ntimes = len(ds["time"])
                for t in range(ntimes):
                    ds_t = ds.isel({"time": t})
                    in_data = ds_t[predictors].to_array(dim="variables").transpose(..., "variables")
                    tar_data = ds_t[predictands].to_array(dim="variables").transpose(..., "variables")
                    # .values materializes the data in memory, so that samples do not depend on the open file.
                    yield in_data.values, tar_data.values

    # Read a single sample to infer shape and dtype for the output signature.
    # The probing generator is closed explicitly to release the file it holds open.
    sample_gen = gen(nc_files)
    try:
        s0 = next(sample_gen)
    except StopIteration:
        raise ValueError("Could not read any sample from the provided netCDF-files.") from None
    finally:
        sample_gen.close()

    # from_generator expects a callable returning a *fresh* iterator on each call. Handing over one and the
    # same generator object would leave the dataset empty from the second iteration onwards.
    tfds_dat = tf.data.Dataset.from_generator(lambda: gen(nc_files),
                                              output_signature=(tf.TensorSpec(s0[0].shape, dtype=s0[0].dtype),
                                                                tf.TensorSpec(s0[1].shape, dtype=s0[1].dtype)))

    def normalize_batch(batch: tuple, norm_dict):
        """Apply the z-score normalization (x - mu)/std to the input and target data of a mini-batch."""
        # The statistics are cast to the dtype of the data to avoid dtype mismatches in the tf-operations.
        mu_in = tf.constant(norm_dict["mu_in"], dtype=batch[0].dtype)
        std_in = tf.constant(norm_dict["std_in"], dtype=batch[0].dtype)
        mu_tar = tf.constant(norm_dict["mu_tar"], dtype=batch[1].dtype)
        std_tar = tf.constant(norm_dict["std_tar"], dtype=batch[1].dtype)

        in_normed = tf.divide(tf.subtract(batch[0], mu_in), std_in)
        tar_normed = tf.divide(tf.subtract(batch[1], mu_tar), std_tar)

        return in_normed, tar_normed

    def parse_example(in_data, tar_data):
        """Adapter for Dataset.map that binds the normalization statistics."""
        return normalize_batch((in_data, tar_data), norm_data)

    tfds_dat = tfds_dat.batch(2).map(parse_example)
    tfds_dat = tfds_dat.repeat(1).prefetch(1000)

    return tfds_dat, {"shape_in": s0[0].shape, "shape_tar": s0[1].shape}

In [None]:
# Statistics used for the z-score normalization (one value per predictor/predictand variable).
norm_data = {
    "mu_in": [275., 500., 6000.],
    "mu_tar": [275, 500.],
    "std_in": [10, 300., 8000.],
    "std_tar": [10., 300.],
}

# Set up the dataset and turn it into an iterator to pull single mini-batches from it.
tfds_test, shp_dict = make_dataset(nc_files, predictors, predictands, norm_data)
tfds_test = iter(tfds_test)

batch1 = tfds_test.get_next()

# Only keep the first sample of the mini-batch (input and target data as numpy arrays).
in_data = batch1[0].numpy()[0, ...]
tar_data = batch1[1].numpy()[0, ...]

Read the reference data:

In [None]:
# First time step of the first netCDF-file serves as the reference.
ds_ref = xr.open_dataset(nc_files[0]).isel(time=0)

# Stack the variables along a new "variables"-dimension and move it to the last axis.
in_data_ref = ds_ref[predictors].to_array("variables").transpose(..., "variables")
tar_data_ref = ds_ref[predictands].to_array("variables").transpose(..., "variables")

Convert the data from the dataset iterator into DataArrays:

In [None]:
# Attach the dimensions and coordinates of the reference data to the arrays from the mini-batch.
in_data = xr.DataArray(data=in_data, dims=in_data_ref.dims, coords=in_data_ref.coords)
tar_data = xr.DataArray(data=tar_data, dims=tar_data_ref.dims, coords=tar_data_ref.coords)

Do the same with the data from the normalization dictionary to ease subsequent computation (-> z-score normalization).

In [None]:
# Coordinate arrays of the "variables"-dimension for the input and target data.
vars_in, vars_tar = in_data["variables"], tar_data["variables"]

# Means as DataArrays along the "variables"-dimension
mu_in = xr.DataArray(norm_data["mu_in"], dims=vars_in.dims, coords=vars_in.coords)
mu_tar = xr.DataArray(norm_data["mu_tar"], dims=vars_tar.dims, coords=vars_tar.coords)

# Standard deviations as DataArrays along the "variables"-dimension
std_in = xr.DataArray(norm_data["std_in"], dims=vars_in.dims, coords=vars_in.coords)
std_tar = xr.DataArray(norm_data["std_tar"], dims=vars_tar.dims, coords=vars_tar.coords)

Finally, we perform a manual normalization of the data read from the netCDF-file and check the difference w.r.t. the data from the mini-batch.

In [None]:
# Difference between the manually normalized reference data and the data from the mini-batch
diff_in = (in_data_ref - mu_in)/std_in - in_data
diff_tar = (tar_data_ref - mu_tar)/std_tar - tar_data

# The check must be based on the absolute difference. The maximum of the signed difference would
# let arbitrarily large negative deviations pass unnoticed.
assert np.amax(np.abs(diff_in)) < 1.e-06, "Normalizing input data did not work properly."
assert np.amax(np.abs(diff_tar)) < 1.e-06, "Normalizing target data did not work properly."

Since no `AssertionError` is raised, the functionality of the normalization routine when setting up the dataset is verified.