In [None]:
import os
import shutil
import glob
import time
from tqdm import tqdm
import random
import numpy as np
import xarray as xr
import pandas as pd
import tensorflow as tf

# Build up efficient TF data pipelines from netCDF-files

We test two different approaches to build up the dataset input streams. <br>
The first one is based on `open_dataset` and requires a large buffer size to enable proper sampling (buffer size $\mathcal{O}(10^4)$ so that at least 10 files are buffered). This is because only the order of the data files is randomized, not the data samples themselves as in the next approach. <br>
The second approach is based on `open_mfdataset`, which makes the data sampling much easier since it allows randomizing an index list for the time dimension to build up the iterator.

In [None]:
def load_nc_dir_with_generator(dir_, patt, shuffle=True, seed=42):
    """
    Builds a TF dataset that streams single time steps from netCDF-files which are opened one after another in a generator.
    Only the order of the files is shuffled here, not the samples inside a file. This implies a larger buffer_size
    (> 12*744 where 12 corresponds to months per year and 744 is the number of time steps in a monthly datafile)
    when shuffling the returned dataset since the buffer gets filled up with data from sequential files.
    :param dir_: The directory where the netCDF-files are located.
    :param patt: Substring-pattern to identify the desired netCDF-files ("{patt}*.nc" is applied for searching)
    :param shuffle: flag to enable shuffling of the file order
    :param seed: seed for random shuffling
    :return: tf.data.Dataset for data streaming in neural networks; each element is a dictionary holding the
             data variables of one time step plus the formatted time stamp under the key "time"
    :raises FileNotFoundError: if no file matches the search pattern
    :raises ValueError: if the matching files do not provide a single time step
    """
    # glob returns the files in arbitrary order -> sort first to make the seeded shuffling reproducible
    nc_files = sorted(glob.glob(os.path.join(dir_, f"{patt}*.nc")))

    if not nc_files:
        raise FileNotFoundError(f"Could not find any netCDF-files matching '{patt}*.nc' under '{dir_}'.")

    if shuffle:
        random.seed(seed)
        random.shuffle(nc_files)

    def gen():
        for file in nc_files:
            # the context manager releases the file handle as soon as the file has been processed
            # (or when the generator gets closed before exhaustion)
            with xr.open_dataset(file, engine='netcdf4') as ds:
                for t in range(len(ds["time"])):
                    ds_t = ds.isel({"time": t})
                    data_dict = {key: tf.convert_to_tensor(val) for key, val in ds_t.items()}
                    data_dict["time"] = np.array([pd.to_datetime(ds_t["time"].values).strftime("%Y-%m-%d %H:%M")])
                    yield data_dict

    # draw a single sample to infer the output signature; closing the generator afterwards closes the opened file
    sample_gen = gen()
    try:
        sample = next(sample_gen, None)
    finally:
        sample_gen.close()

    if sample is None:
        raise ValueError(f"The netCDF-files matching '{patt}*.nc' under '{dir_}' do not contain any time step.")

    # Pass the generator function itself (not an already created generator object), so that a fresh generator
    # is created whenever the dataset is iterated. A shared generator object would be exhausted after the
    # first pass and every further iteration (e.g. a second epoch) would be empty.
    return tf.data.Dataset.from_generator(
        gen,
        output_signature={
            key: tf.TensorSpec(val.shape, dtype=val.dtype)
            for key, val in sample.items()
        }
    )

def load_mfnc_dir_with_generator(dir_: str, patt: str, shuffle: bool = True, seed: int = 42):
    """
    Opens netCDF-files using xarray's open_mfdataset-method. Shuffling of the data is achieved by shuffling over the time step-indices.
    For efficiency, decoding of the time is disabled (implying shared time-units for all netCDF-data to avoid overwriting with open_mfdataset!!!)
    since this information is not required for data streaming.
    :param dir_: The directory where the netCDF-files are located.
    :param patt: Substring-pattern to identify the desired netCDF-files ("{patt}*.nc" is applied for searching)
    :param shuffle: flag to enable shuffling of the time step-indices
    :param seed: seed for random shuffling
    :return: tf.data.Dataset for data streaming in neural networks; each element is a dictionary holding the
             data variables of one time step
    :raises ValueError: if the opened data does not contain any time step
    """
    ds_all = xr.open_mfdataset(os.path.join(dir_, f"{patt}*.nc"), cache=False, decode_cf=False)
    ntimes = len(ds_all["time"])

    if ntimes == 0:
        raise ValueError(f"The netCDF-files matching '{patt}*.nc' under '{dir_}' do not contain any time step.")

    if shuffle:
        random.seed(seed)
        # random permutation of all time step-indices
        time_list = random.sample(range(ntimes), ntimes)
    else:
        time_list = range(ntimes)

    def gen():
        # time_list is fixed at this point, i.e. every pass over the data uses the same (shuffled) order
        for t in time_list:
            ds = ds_all.isel({"time": t})
            yield {key: tf.convert_to_tensor(val) for key, val in ds.items()}

    # draw a single sample to infer the output signature
    sample = next(gen())

    # Pass the generator function itself (not an already created generator object), so that a fresh generator
    # is created whenever the dataset is iterated. A shared generator object would be exhausted after the
    # first pass and every further iteration (e.g. a second epoch) would be empty.
    return tf.data.Dataset.from_generator(
        gen,
        output_signature={
            key: tf.TensorSpec(val.shape, dtype=val.dtype)
            for key, val in sample.items()
        }
    )


### highly inefficient in terms of memory -> not tested subsequently!!!
def load_data(dir_, patt) -> "tuple[xr.DataArray, xr.DataArray]":
    """
    Opens all netCDF-files matching "{patt}*.nc" under dir_ with xarray's open_mfdataset-method and merges
    all data variables into a single DataArray.
    :param dir_: The directory where the netCDF-files are located.
    :param patt: Substring-pattern to identify the desired netCDF-files ("{patt}*.nc" is applied for searching)
    :return: tuple of the data as xarray's DataArray, where the data variables are stacked along the
             trailing dimension "variables", and the "time"-coordinate of this DataArray
    """
    ds = xr.open_mfdataset(os.path.join(dir_, f"{patt}*.nc"), cache=False, parallel=True)

    # stack the data variables along a new dimension and move this dimension to the last axis
    da = ds.to_array(dim="variables").transpose(..., "variables")

    return da, da["time"]

After setting up the data directory, both strategies are benchmarked.

In [None]:
# Directory holding the preprocessed netCDF-files. The default is a site-specific absolute path;
# set the environment variable MAELSTROM_DATADIR to run the notebook elsewhere without editing this cell.
datadir = os.environ.get(
    "MAELSTROM_DATADIR",
    "/p/scratch/deepacf/maelstrom/maelstrom_data/ap5_michael/preprocessed_era5_ifs/all_files/",
)
# Prefix of the data files (the loader functions search for "{pattern}*.nc")
pattern = "preproc_"

We create the respective TF datasets, ...

In [None]:
# Approach 1: files are opened one by one in the generator (only the file order is shuffled by the loader).
tfds_test = load_nc_dir_with_generator(datadir, pattern)
# Approaches 2 and 3: open_mfdataset-based loader which shuffles the time step-indices.
# Two separate datasets are created so that each benchmark below runs on its own dataset object.
tfds_test_mf = load_mfnc_dir_with_generator(datadir, pattern)
tfds_test_mf2 = load_mfnc_dir_with_generator(datadir, pattern)

... configure them and... 

In [None]:
# Waiting time (in seconds) used by the benchmark of approach 2 before the first batch is requested.
sleep_sec = 0
# Approach 1: file-wise generator plus a large TF shuffle buffer to mix samples across files.
ap1 = iter(tfds_test.shuffle(buffer_size=20000).batch(32).prefetch(100).repeat(1))
# Approach 2: open_mfdataset-based generator without an additional TF shuffle buffer.
ap2 = iter(tfds_test_mf.batch(32).prefetch(100).repeat(1))
# Approach 3: open_mfdataset-based generator with an additional (small) TF shuffle buffer.
ap3 = iter(tfds_test_mf2.shuffle(buffer_size=1000).batch(32).prefetch(100).repeat(1))

then run the approaches one after another. We start with approach 1:

In [None]:
%%time

sleep_sec = 0
# number of minibatches retrieved in each benchmark (also used by the following benchmark cells)
ntakes = 1000

for i in tqdm(range(ntakes)):
    # The timer is started at the second iteration, i.e. the first batch is excluded from the measurement
    # (see the printed message: the buffer gets filled while the first batch is retrieved).
    if i == 1:
        time_s = time.time()
    
    batch = ap1.get_next()
    #print(batch["2t_in"])
    #print("***************")
   
# average over the ntakes-1 batches retrieved after the timer was started
load_time_ap1 = (time.time() - time_s)/float(ntakes-1)
print("After filling the buffer, retrieving each minibatch took: {0:5.04f}s".format(load_time_ap1))

Now, we continue with approach 2:

In [None]:
%%time

# NOTE: In contrast to the benchmarks of approach 1 and 3, the timer is started before the loop.
# Thus, the sleeping time and the retrieval of the first batch are included in the average.
time_s = time.time()

for i in tqdm(range(ntakes)):
    if i == 0:
        print(f"Sleeping for {sleep_sec}s...")
        time.sleep(sleep_sec)
    
    batch = ap2.get_next()
    #print(batch["2t_in"])
    #print("***************")
    
# average over all ntakes batches
load_time_ap2 = (time.time() - time_s)/float(ntakes)


In [None]:
%%time

sleep_sec = 0

for i in tqdm(range(ntakes)):
    # The timer is started at the second iteration, i.e. the first batch is excluded from the measurement
    # (see the printed message: the buffer gets filled while the first batch is retrieved).
    if i == 1:
        time_s = time.time()
    
    batch = ap3.get_next()
    #print(batch["2t_in"])
    #print("***************")
   
# average over the ntakes-1 batches retrieved after the timer was started
load_time_ap3 = (time.time() - time_s)/float(ntakes-1)
print("After filling the buffer, retrieving each minibatch took: {0:5.04f}s".format(load_time_ap3))

### Results

In [None]:
# Summary of the average retrieval time per minibatch. Note that the first batch is excluded
# for approach 1 and 3, but included for approach 2 (see the respective benchmark cells).
print("After filling the buffer, retrieving each minibatch with APPROACH 1 took: {0:5.04f}s".format(load_time_ap1))
print("Retrieving each minibatch with APPROACH 2 took: {0:5.04f}s".format(load_time_ap2))
print("After filling the buffer, retrieving each minibatch with APPROACH 3 took: {0:5.04f}s".format(load_time_ap3))

Thus, we see that the first approach outperforms the second approach (at least after the buffer has been filled once).

In [None]:
# Record the xarray version used to obtain the benchmark results above.
print(xr.__version__)

## New approach after discussion with Stefan Kesselheim

A new approach which also allows for parallelization is tested by creating a dataset handler with a `__getitem__`-method.

In [None]:
import sys
sys.path.append("../utils/")
from other_utils import to_list, subset_files_on_date, extract_date

class MonthlyNetCDFDataset():
    """
    Index-based access to the time steps of a set of monthly netCDF-files.
    All selected files are opened once with xarray and kept as file handles; ds[i] returns the
    i-th time step (in terms of the concatenated time axis of all files) as xarray Dataset.
    """

    def __init__(self, data_dir, month_list, predictors, predictands, fsuffix="preproc"):
        """
        Create netCDF dataset instance.
        :param data_dir: directory where monthly netCDF-files are located
        :param month_list: list of months (strings with format <YYYY>-<MM>)
        :param predictors: list of predictor variables
        :param predictands: list of predictands
        :param fsuffix: leading part of the netCDF-filenames (f"{fsuffix}*.nc" is applied for searching)
        :raises FileNotFoundError: if no netCDF-file is found at all or if no file matches the requested months
        """
        # some preparations
        nc_files_dir = glob.glob(os.path.join(data_dir, fsuffix+"*.nc"))
        if not nc_files_dir:
            raise FileNotFoundError(f"Could not find any netCDF-files under '{data_dir}' with suffix {fsuffix}")

        # keep the arguments, since they are required for error messages in get_data_info
        self.data_dir = data_dir
        self.month_list = list(month_list)
        self.fsuffix = fsuffix
        # NOTE: The variable names are only stored. No variable selection is applied to the returned samples so far.
        self.predictors, self.predictands = to_list(predictors), to_list(predictands)
        self.all_vars = self.predictors + self.predictands

        # filter netCDF-files based on months of interest
        self.files = []
        for yr_mm in self.month_list:
            self.files = self.files + subset_files_on_date(nc_files_dir, yr_mm, date_alias="%Y-%m")

        self.date_of_files, self.times, self.nsamples = self.get_data_info()
        self.file_handles = [xr.open_dataset(dfile) for dfile in self.files]

    def __len__(self):
        """
        :return: total number of time steps (samples) in all selected files
        """
        return self.nsamples

    def get_data_info(self):
        """
        Collect meta information on the selected datafiles.
        :return: tuple of the months of the files (strings with format <YYYY>-<MM>, same order as self.files),
                 the list of all time stamps as pandas datetime-objects and the total number of time steps
        :raises FileNotFoundError: if the list of selected datafiles is empty
        """
        # check if self.files is not empty
        if not self.files:
            raise FileNotFoundError("Could not find any datafiles under '{0}' containing data for the months: {1}"
                                    .format(self.data_dir, ", ".join(self.month_list)))

        # retrieve dates from filenames
        date_of_files = [extract_date(os.path.basename(dfile)).strftime("%Y-%m") for dfile in self.files]

        # open (lazily) all netCDF-files to retrieve data timestamps and close the multi-file dataset right after
        with xr.open_mfdataset(self.files) as ds_all:
            # convert timestamps to more convenient pandas datetime-objects and get number of samples
            times = list(pd.to_datetime(ds_all["time"].values))

        nsamples = len(times)

        return date_of_files, times, nsamples

    def __getitem__(self, i):
        """
        :param i: index of the sample with respect to the time axis of all selected files
        :return: xarray Dataset of the selected time step
        """
        file_index, infile_index = self.get_indices(i)
        return self.file_handles[file_index].isel({"time": infile_index})

    def get_indices(self, i):
        """
        Translate the sample index into the index of the file and the time step-index inside this file.
        The in-file index is the number of hours between the time stamp and the first day of its month at 00:00,
        i.e. it only corresponds to the position in the file if the files provide hourly data starting at this date.
        :param i: index of the sample with respect to the time axis of all selected files
        :return: tuple of the file index and the time step-index inside this file
        :raises ValueError: if no file is available for the month of the time stamp
        """
        month_now = self.times[i].strftime("%Y-%m")
        file_index = self.date_of_files.index(month_now)
        infile_index = int((self.times[i] - pd.to_datetime(month_now + "-01 00:00"))/pd.Timedelta(hours=1))

        return file_index, infile_index

    def close(self):
        """
        Close all netCDF-file handles opened by this instance.
        """
        for file_handle in self.file_handles:
            file_handle.close()
                                           
                                           

In [None]:
# all months of the year 2017 as strings with format <YYYY>-<MM>
month_list = [date.strftime("%Y-%m") for date in pd.date_range(start="2017-01", end="2017-12", freq='MS')]

# NOTE: 'ds' denotes the dataset handler from here on (not an xarray Dataset).
ds = MonthlyNetCDFDataset(datadir, month_list, ["t2m", "t_850", "slhf"], ["2t", "z"], fsuffix="preproc_")

In [None]:
# Quick check of the index-based access: retrieve the sample with index 10.
ds[10]

In [None]:
a = tf.range(ds.nsamples)

# Dataset.map traces the mapped function in graph mode, i.e. the index arrives as symbolic tensor
# which cannot be used for the Python-side indexing of the dataset handler. Furthermore, the handler returns
# an xarray Dataset instead of tensors. Therefore, the data access is wrapped with tf.py_function.
_sample0 = ds[0]
sample_vars = list(_sample0.data_vars)
sample_dtypes = [tf.as_dtype(_sample0[var].dtype) for var in sample_vars]


def _load_sample(idx):
    """
    Load the data of one sample eagerly.
    :param idx: eager tensor holding the sample index
    :return: list of numpy arrays (one per data variable, same order as sample_vars)
    """
    sample = ds[int(idx.numpy())]
    return [sample[var].values for var in sample_vars]


def _fetch_sample(idx):
    """
    Graph-compatible wrapper around _load_sample.
    :param idx: (symbolic) tensor holding the sample index
    :return: dictionary of tensors with the data variable names as keys
    """
    values = tf.py_function(_load_sample, [idx], sample_dtypes)
    # tf.py_function drops the static shape information -> restore it from the first sample
    for var, tensor in zip(sample_vars, values):
        tensor.set_shape(_sample0[var].shape)
    return dict(zip(sample_vars, values))


tfds = tf.data.Dataset.range(ds.nsamples).map(_fetch_sample)