# Preprocessing

The preprocessing step "reshapes" the raw output of Wflow_sbm into a layout that can be readily used by the LSTM model for training and inference.

$WFLOW\_SBM: (lat, lon, time, feature) \rightarrow LSTM: (gridcell, time, feature)$

Depending on the number of features, the time range, the time step and the spatial resolution, the Wflow_sbm output may well reach several tens of GB. The preprocessing step therefore also converts the data from its original NetCDF (.nc) format to Zarr (.zarr).

In [1]:
%load_ext autoreload
%autoreload 

In [2]:
import numpy as np
import xarray as xr
from pathlib import Path
from hython.preprocessor import reshape
from hython.utils import write_to_zarr, read_from_zarr, build_mask_dataarray
from numcodecs import Blosc


# Settings

In [24]:
# --- Locations ---------------------------------------------------------------
# Name of the Wflow model run; it is also the sub-directory under ../data.
dir_wflow_model = "datademo"
dir_input = Path("../data", dir_wflow_model)

# Outputs are written to the same directory the inputs are read from.
dir_output = dir_input

# --- Variable selection ------------------------------------------------------
# WFLOW forcings, input features of the surrogate
dynamic_names = [
    "precip",
    "pet",
    "temp",
]
# WFLOW static parameters, input features of the surrogate
static_names = [
    "thetaS",
    "thetaR",
    "RootingDepth",
    "Swood",
    "KsatVer",
    "Sl",
]
# WFLOW outputs, targets of the surrogate
target_names = [
    "vwc",
    "actevap",
]

mask_names = [
    "wflow_missing",
    "wflow_lake",
]

# --- Input files (relative to dir_input) -------------------------------------
file_dynamic = "forcings.nc"
file_static = "staticmaps.nc"
file_target = "output.nc"

# --- Train / test periods ----------------------------------------------------
train_start, train_end = "2016-01-01", "2018-12-31"
test_start, test_end = "2019-01-01", "2020-12-31"

# Label-based slices built from the period boundaries above.
train_range = slice(train_start, train_end)
test_range = slice(test_start, test_end)


In [25]:
# read data 
dynamics = xr.open_dataset(dir_input / file_dynamic)
statics = xr.open_dataset(dir_input / file_static)
targets = xr.open_dataset(dir_input / file_target).isel(lat=slice(None, None, -1))

In [26]:
try:
    forcings = forcings.rename({"latitude":"lat", "longitude":"lon"})
    params = params.rename({"latitude":"lat", "longitude":"lon"})
except:
    pass

In [27]:
# filter features
statics = params[static_names]
dynamics = forcings[dynamic_names]
targets = targets[target_names]

In [28]:
# reshape
Xd, Xs, Y  = reshape(
                   forcings, 
                   params, 
                   targets,
                   return_type="xarray"
                   )

dynamic:  (1452, 2192, 3)  => (GRIDCELL, TIME, FEATURE)
static:  (1452, 6)  => (GRIDCELL, FEATURE)
target:  (1452, 2191, 2)  => (GRIDCELL, TIME, TARGET)


In [18]:
# Flag every cell where the first static parameter is NaN.
reference_static = statics[static_names[0]]
mask_missing = np.isnan(reference_static).rename("mask_missing")

In [20]:
# Collect the individual masks into one labelled object.
masks = build_mask_dataarray(
    [mask_missing],
    names=["mask_missing"],
)

In [21]:
# Dimension names and shape of the mask, kept as metadata for the zarr groups.
ATTRS = dict(
    shape_label=mask_missing.dims,
    shape=mask_missing.shape,
)

In [22]:
# write to zarr

# All groups go into one store: <dir_output>/<dir_wflow_model>.zarr
outfp = str(dir_output / f"{dir_wflow_model}.zarr")

# "lz4" is one of the codec names Blosc accepts.
compressor = Blosc(cname="lz4", clevel=4, shuffle=Blosc.BITSHUFFLE)

# Dynamic inputs
write_to_zarr(
    Xd,
    url=outfp,
    group="xd",
    storage_options={"compressor": compressor},
    chunks="auto",
    append_on_time=True,
    multi_index="gridcell",
    append_attrs=ATTRS,
)

# Targets
write_to_zarr(
    Y,
    url=outfp,
    group="y",
    storage_options={"compressor": compressor},
    chunks="auto",
    append_on_time=True,
    multi_index="gridcell",
    append_attrs=ATTRS,
)

# Static inputs (no `append_on_time` is passed for this group)
write_to_zarr(
    Xs,
    url=outfp,
    group="xs",
    storage_options={"compressor": compressor},
    chunks="auto",
    multi_index="gridcell",
    append_attrs=ATTRS,
)


In [23]:
# Store the masks in their own group of the same zarr store.
write_to_zarr(
    masks,
    url=outfp,
    group="mask",
    storage_options={"compressor": compressor},
    overwrite=True,
)