# Preprocessing

The preprocessing step reshapes the raw output of Wflow_sbm into a layout that can be readily used by the LSTM model for training and inference.

$\mathrm{WFLOW\_SBM}: (lat, lon, time, feature) \rightarrow \mathrm{LSTM}: (gridcell, time, feature)$

Depending on the number of features, the time range, the time step and the spatial resolution, the Wflow_sbm output may well reach several tens of GB. For this reason, the preprocessing step converts the original data format to .zarr.

In [42]:
import numpy as np
import xarray as xr
from pathlib import Path

from hython.preprocessor import reshape
from hython.utils import write_to_zarr, build_mask_dataarray

from numcodecs import Blosc


# Settings

In [56]:
# Name of the Wflow_sbm model; used as the sub-directory of `dir_input`
# when the input files are opened.
wflow_model = "adg1km_eobs"


# masking: static-map variables converted into boolean masks, and the names
# those masks are given (the two lists are matched by position).
mask_from_static = ["wflow_lakeareas"]
rename_mask = ["mask_lake"]

# inputs
dir_input = "/mnt/CEPH_PROJECTS/InterTwin/Wflow/models"

file_dynamic = "forcings.nc"
file_static = "staticmaps.nc"
file_target = "output.nc"

# outputs
file_preprocessed_output = "adg1km_eobs"
dir_output = "/mnt/CEPH_PROJECTS/InterTwin/hydrologic_data/surrogate_training"

# temporal range as a [start, end] pair of date strings
temporal_range = ["2016-01-01", "2018-12-31"]

# select variable names
# These have to be assignments: `name: [...]` is only an annotation and binds
# no value, which leaves the names undefined for the cells that index with them.
dynamic_names = ["precip", "pet", "temp"]
static_names = ["thetaS", "thetaR", "RootingDepth", "Swood", "KsatVer", "Sl"]
target_names = ["vwc", "actevap"]

# select soil layer
soil_layers = [1]

In [57]:
# Open the forcing, static-map and model-output datasets of the selected model.
model_dir = f"{dir_input}/{wflow_model}"

dynamics = xr.open_dataset(f"{model_dir}/{file_dynamic}")
statics = xr.open_dataset(f"{model_dir}/{file_static}")
# The model output is read from the "run_default" sub-directory.
targets = xr.open_dataset(f"{model_dir}/run_default/{file_target}")

In [58]:
# Harmonise the spatial coordinate names to "lat"/"lon".
# Each dataset is handled on its own and only names it actually contains are
# renamed, so a dataset that already uses "lat"/"lon" passes through untouched
# and no exception has to be swallowed.
spatial_names = {"latitude": "lat", "longitude": "lon"}


def _rename_present(ds, mapping):
    """Rename the entries of *mapping* found in *ds*; skip the others.

    An entry is skipped when its old name is absent from *ds* or when its new
    name is already taken, which are the cases where ``rename`` would fail.
    """
    applicable = {
        old: new
        for old, new in mapping.items()
        if (old in ds.variables or old in ds.dims)
        and not (new in ds.variables or new in ds.dims)
    }
    return ds.rename(applicable) if applicable else ds


dynamics = _rename_present(dynamics, spatial_names)
statics = _rename_present(statics, spatial_names)

# Reverse the order of the target grid along "lat".
# NOTE(review): presumably this aligns the model output with the latitude
# ordering of the forcings/static maps - confirm. Re-running this cell flips
# the grid again.
targets = targets.isel(lat=slice(None, None, -1))

# Turn the [start, end] pair into a slice. The guard keeps the cell re-runnable,
# since a slice cannot be unpacked a second time.
# NOTE(review): no cell visible here applies this slice - confirm where the
# temporal selection is meant to happen.
if not isinstance(temporal_range, slice):
    temporal_range = slice(*temporal_range)

In [59]:
# MASKING & FILTERING

# Soil layers: only a single selected layer is supported; with an empty
# selection the "layer" dimension is left as it is.
if len(soil_layers) > 1:
    raise NotImplementedError("Preprocessing multiple soil layers not yet implemented")
elif len(soil_layers) == 1:
    statics = statics.sel(layer=soil_layers).squeeze("layer")
    targets = targets.sel(layer=soil_layers).squeeze("layer")

# masking, TODO: works only for lake layer. Improve the logic.
# Cells where the first selected static variable is NaN are flagged as missing.
first_static = statics[static_names[0]]
mask_missing = np.isnan(first_static).rename("mask_missing")

# One boolean mask per configured static variable (True where the value is > 0),
# renamed with the entry of `rename_mask` at the same position.
static_masks = [
    (statics[source_name] > 0).astype(np.bool_).rename(rename_mask[position])
    for position, source_name in enumerate(mask_from_static)
]

masks = build_mask_dataarray(
    [mask_missing, *static_masks],
    names=["mask_missing"] + rename_mask,
)

# Keep only the variables chosen in the settings.
statics = statics[static_names]
dynamics = dynamics[dynamic_names]
targets = targets[target_names]

In [60]:
# RESHAPING

# Reshape the dynamic, static and target datasets; the results are requested
# as xarray objects.
Xd, Xs, Y = reshape(dynamics, statics, targets, return_type="xarray")

dynamic:  (40140, 2192, 3)  => (GRIDCELL, TIME, FEATURE)
static:  (40140, 6)  => (GRIDCELL, FEATURE)
target:  (40140, 2191, 2)  => (GRIDCELL, TIME, TARGET)


In [62]:
# Attributes handed to the writer: the dimension names and the shape of the
# missing-value mask.
ATTRS = dict(
    shape_label=mask_missing.dims,
    shape=mask_missing.shape,
)

# Drop "_FillValue" from the dynamic inputs' attributes (when present), as it
# causes serialization issues.
Xd.attrs.pop("_FillValue", None)

In [63]:
# WRITE

# Blosc codec names are e.g. "lz4", "lz4hc", "zstd", "zlib", "blosclz";
# "zl4" is not one of them, so the LZ4 codec is requested by its proper name.
compressor = Blosc(cname="lz4", clevel=4, shuffle=Blosc.BITSHUFFLE)

# All groups go into one store: <dir_output>/<file_preprocessed_output>.zarr
file_output = f"{dir_output}/{file_preprocessed_output}.zarr"

# NOTE(review): only the "xd" and "mask" writes pass overwrite=True - confirm
# against write_to_zarr that this asymmetry is intended.

# Dynamic inputs -> group "xd"
write_to_zarr(
    Xd,
    url=file_output,
    group="xd",
    storage_options={"compressor": compressor},
    chunks="auto",
    append_on_time=True,
    multi_index="gridcell",
    append_attrs=ATTRS,
    overwrite=True,
)

# Targets -> group "y"
write_to_zarr(
    Y,
    url=file_output,
    group="y",
    storage_options={"compressor": compressor},
    chunks="auto",
    append_on_time=True,
    multi_index="gridcell",
    append_attrs=ATTRS,
)

# Static inputs -> group "xs" (no append_on_time argument is passed here)
write_to_zarr(
    Xs,
    url=file_output,
    group="xs",
    storage_options={"compressor": compressor},
    chunks="auto",
    multi_index="gridcell",
    append_attrs=ATTRS,
)

# Masks -> group "mask"
write_to_zarr(
    masks,
    url=file_output,
    group="mask",
    storage_options={"compressor": compressor},
    overwrite=True,
)

{}
{'standard_name': 'thickness_of_rainfall_amount', 'long_name': 'rainfall', 'units': 'mm', 'cell_methods': 'time: mean', 'unit': 'mm', 'precip_fn': 'eobs', 'shape_label': ('lat', 'lon'), 'shape': (180, 223)}
{}
{'shape_label': ('lat', 'lon'), 'shape': (180, 223)}
