# Preprocessing

The preprocessing step reshapes the raw output of Wflow_sbm into a layout that the LSTM model can use directly for training and inference.

$\text{Wflow\_sbm}: (lat, lon, time, feature) \rightarrow \text{LSTM}: (gridcell, time, feature)$

Depending on the number of features, the time range, the time step and the spatial resolution, the Wflow_sbm output can easily reach several tens of GB. For this reason, the preprocessing step also converts the data from its original format to the chunked, compressed Zarr format.

In [21]:
import numpy as np
import xarray as xr
from pathlib import Path

import dask

from hython.preprocess import reshape
from hython.utils import write_to_zarr, read_from_zarr

from numcodecs import Blosc

import matplotlib.pyplot as plt

# Settings

In [11]:
# Name of the Wflow model; it selects the data sub-directory used below.
wflow_model = "datademo"

# Input features of the surrogate: WFLOW forcings (dynamic) and
# WFLOW static parameters (static).
dynamic_names = ["precip", "pet", "temp"]
static_names = [
    "thetaS",
    "thetaR",
    "RootingDepth",
    "Swood",
    "KsatVer",
    "Sl",
    "wflow_dem",
]

# Targets of the surrogate: WFLOW outputs.
target_names = ["vwc", "actevap"]

# File names of the forcings, static maps and model output.
fn_forcings = "forcings.nc"
fn_params = "staticmaps.nc"
fn_targets = "output.nc"

# Training and test periods, as date strings and as the matching slices.
train_start, train_end = "2016-01-01", "2018-12-31"
test_start, test_end = "2019-01-01", "2020-12-31"

train_range = slice(train_start, train_end)
test_range = slice(test_start, test_end)

# Working directory; the output directory is the same location.
wd = Path("../data") / wflow_model
outd = wd

# Full paths of the three input files.
fp_dynamic_forcings = wd / fn_forcings
fp_wflow_static_params = wd / fn_params
fp_target = wd / fn_targets

In [12]:
# Open the forcings, the static parameters and the model output.
forcings = xr.open_dataset(fp_dynamic_forcings)
params = xr.open_dataset(fp_wflow_static_params)

# The targets are read with their latitude axis reversed.
# NOTE(review): presumably this aligns them with the orientation of the
# forcings and static maps -- confirm against the input files.
targets = xr.open_dataset(fp_target)
targets = targets.isel(lat=slice(None, None, -1))

In [15]:
def _to_short_coord_names(ds):
    """Return *ds* with ``latitude``/``longitude`` renamed to ``lat``/``lon``.

    Only the long names that exist in *ds* are renamed, so a dataset that
    already uses the short names is returned unchanged. Each dataset is
    handled on its own: a dataset without the long names no longer prevents
    the rename of the other one, and unrelated errors are no longer hidden
    by a bare ``except``.
    """
    aliases = {"latitude": "lat", "longitude": "lon"}
    present = {
        long_name: short_name
        for long_name, short_name in aliases.items()
        if long_name in ds.variables or long_name in ds.dims
    }
    return ds.rename(present) if present else ds


forcings = _to_short_coord_names(forcings)
params = _to_short_coord_names(params)

In [16]:
# Keep only the variables that serve as surrogate inputs or targets.
forcings = forcings[dynamic_names]  # dynamic inputs
params = params[static_names]  # static inputs
targets = targets[target_names]  # targets

In [17]:
# Reshape the dynamic inputs, static inputs and targets; the results are
# requested as xarray objects.
Xd, Xs, Y = reshape(forcings, params, targets, return_type="xarray")

dynamic:  (1452, 2192, 3)  => (GRIDCELL, TIME, FEATURE)
static:  (1452, 7)  => (GRIDCELL, FEATURE)
target:  (1452, 2191, 2)  => (GRIDCELL, TIME, TARGET)


In [18]:
# Mask of missing data: True where the first static parameter is NaN.
first_static_param = params[static_names[0]]
missing_mask = np.isnan(first_static_param).rename("mask")

In [19]:
# additional masking
#wflow_lakes = params.wflow_lakeareas
#mask_lakes = (wflow_lakes > 0).astype(np.bool_).rename("mask_lake")

In [20]:
# Write the reshaped arrays and the missing-data mask to a single Zarr
# store, each under its own group.

outfp = str(outd / f"{wflow_model}.zarr")

# Drop the attributes of the targets and of the dynamic inputs before writing.
Y.attrs.clear()
Xd.attrs.clear()

# "lz4" is the Blosc codec name. The previous value "zl4" is not a codec
# that Blosc provides, so it could not be used for compression.
compressor = Blosc(cname="lz4", clevel=4, shuffle=Blosc.BITSHUFFLE)

# Dynamic inputs, targets and static inputs are written with automatic
# chunking and the "gridcell" multi-index.
write_to_zarr(
    Xd,
    url=outfp,
    group="xd",
    storage_options={"compressor": compressor},
    chunks="auto",
    multi_index="gridcell",
)

write_to_zarr(
    Y,
    url=outfp,
    group="y",
    storage_options={"compressor": compressor},
    chunks="auto",
    multi_index="gridcell",
)

write_to_zarr(
    Xs,
    url=outfp,
    group="xs",
    storage_options={"compressor": compressor},
    chunks="auto",
    multi_index="gridcell",
)

# The mask is written without the chunking and multi-index options.
write_to_zarr(
    missing_mask,
    url=outfp,
    group="mask",
    storage_options={"compressor": compressor},
)

# write_to_zarr(mask_lakes,url= outfp, group="mask_lake", storage_options={"compressor":compressor})