# Benchmarking the application "Downscaling of 2m temperature from IFS HRES with a U-Net"


In [None]:
# Print the TensorFlow version that is importable in the current kernel.
# NOTE(review): this cell precedes the pip-install cells below, so on a fresh environment the
# reported version may differ from the one pinned there — confirm the intended cell order.
import numpy as np
import tensorflow as tf
print(tf.__version__)

In [1]:
!pip install pandas==1.1.
!pip install tensorflow==2.3.1
!pip install climetlab==0.8.14

In [3]:
!pip install climetlab-maelstrom-downscaling==0.1.0

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import os, sys
# NOTE(review): hard-coded, user-specific path appended to the module search path; presumably
# needed to pick up user-level pip installs — confirm and adapt when running under another account.
sys.path.append("/user/s6mllang/.local/bin/")
import time
import tensorflow.keras as keras
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.utils as ku
# NOTE(review): wildcard import; presumably this is where `DownscalingData` (used below) comes
# from — verify against downscaling_utils.
from downscaling_utils import *
from unet_model import build_unet, get_lr_scheduler
import xarray as xr
import datetime as dt

... we define two auxiliary functions that accomplish this job for us. Note that the function `get_ifs_data` automatically detects the running node to decide whether the data can be downloaded or whether it must already be available on scratch. If the data is unavailable in the file system, execution on the login node to acquire the data is **mandatory**.

Now, let's get the data. For this, please **adapt the `datadir`-variable** if you don't have access to `/p/project/deepacf/maelstrom/data/downscaling_unet/` (check via terminal). In this case, also run on the **login node first** to download the data. If you have access (or if you already obtained the data), the data will be loaded from the filesystem automatically (also on the computing node). <br>
For convenience, we will also take a brief look at the training data, which comprises 1464 time steps over our target domain with 128x96 grid points in zonal and meridional direction.

In [3]:
# Directory holding the downscaling data. The site-specific default below can be overridden
# without editing the notebook by setting the environment variable DOWNSCALING_DATADIR.
datadir = os.environ.get("DOWNSCALING_DATADIR",
                         "/automount/user/s6mllang/fzj_esde/maelstrom/downscaling_data")

# Create the data object for `datadir`; it is used below for preprocessing and for
# querying timing/host information.
data_obj = DownscalingData(datadir)

%set_download_flag: Datafiles are already available under '/automount/user/s6mllang/fzj_esde/maelstrom/downscaling_data'
%get_data: Start reading the data from '/automount/user/s6mllang/fzj_esde/maelstrom/downscaling_data'...
%get_data: Dataset was retrieved succesfully.


In [4]:
# set daytime for which downscaling model is trained (i.e. either 0 or 12)
hour = 12

# preprocess data for training; `hour` is used for BOTH datasets so that changing it above
# cannot lead to training and validation data from different daytimes
int_data, tart_data, opt_norm = data_obj.preprocess_data("train", daytime=hour)
# the normalization parameters returned for the training data are re-used for the validation data
inv_data, tarv_data = data_obj.preprocess_data("val", daytime=hour, opt_norm=opt_norm)

# report the recorded timings and the dataset information (memory, number of samples)
print(data_obj.timing)
print(data_obj.data_info["memory_datasets"])
print(data_obj.data_info["nsamples"])


{'loading': 4.351184751838446, 'preprocessing_train': 0.5186087638139725, 'preprocessing_val': 0.05851025879383087}
{'train': 575681728, 'val': 73141456, 'test': 70782112}
{'train': 1464, 'val': 186, 'test': 180}


In [5]:
# input shape handed to build_unet
# (presumably 96 x 128 grid points with 3 input channels — TODO confirm the channel meaning)
shape_in = (96, 128, 3)

# Build the model and save a plot of its architecture only when running on a login node.
# `ku` (tensorflow.keras.utils) is already imported in the import cell, so it is not re-imported here.
if "login" in data_obj.host:
    unet_model = build_unet(shape_in, z_branch=True)
    ku.plot_model(unet_model, to_file=os.path.join(os.getcwd(), "unet_downscaling_model.png"), show_shapes=True)

In [8]:
# define class for creating timer callback
class TimeHistory(keras.callbacks.Callback):
    """Keras callback that records how long every training epoch takes.

    After (or during) training, `epoch_times` holds one duration in seconds per
    completed epoch, in the order the epochs were run.
    """

    def on_train_begin(self, logs=None):
        """Start with an empty list of epoch durations for each training run."""
        self.epoch_times = []

    def on_epoch_begin(self, epoch, logs=None):
        """Remember the start of the epoch."""
        # perf_counter is monotonic and therefore not affected by system clock adjustments
        self.epoch_time_start = time.perf_counter()

    def on_epoch_end(self, epoch, logs=None):
        """Append the duration (in seconds) of the epoch that just finished."""
        self.epoch_times.append(time.perf_counter() - self.epoch_time_start)
        
z_branch = True                    # flag: train additionally on surface elevation (second model output "output_z")

# create callbacks: whatever get_lr_scheduler() returns plus the epoch timer defined above
callback_list = [get_lr_scheduler(), TimeHistory()]

In [None]:
# build, compile and train the model
nepochs = 2
unet_model = build_unet(shape_in, z_branch=z_branch)

# the 2m temperature (variable index 0) is always a target
temp_train = tart_data.isel(variable=0).values
temp_val = tarv_data.isel(variable=0).values

if z_branch:
    # two named outputs: temperature and surface elevation (variable index 1), equally weighted
    y_train = {"output_temp": temp_train, "output_z": tart_data.isel(variable=1).values}
    y_val = {"output_temp": temp_val, "output_z": tarv_data.isel(variable=1).values}
    compile_args = dict(loss={"output_temp": "mae", "output_z": "mae"},
                        loss_weights={"output_temp": 1.0, "output_z": 1.0})
else:
    # single output: temperature only
    y_train, y_val = temp_train, temp_val
    compile_args = dict(loss="mae")

unet_model.compile(optimizer=Adam(learning_rate=5*10**(-4)), **compile_args)

history = unet_model.fit(x=int_data.values, y=y_train,
                         batch_size=32, epochs=nepochs, callbacks=callback_list,
                         validation_data=(inv_data.values, y_val))

Epoch 1/2
Epoch 2/2
 2/23 [=>............................] - ETA: 5:31 - loss: 0.3606 - output_temp_loss: 0.2260 - output_z_loss: 0.1346

In [None]:
# preprocess the test data first, re-using the normalization parameters of the training data.
# This uses the same data_obj API as for the training and validation data, because neither
# `preprocess_data_for_unet` nor `ds_test` is defined in the preceding cells.
# NOTE(review): assumes preprocess_data accepts "test" just like "train"/"val" — confirm.
inte_data, tarte_data = data_obj.preprocess_data("test", daytime=hour, opt_norm=opt_norm)

# generate the downscaled fields
y_pred_test = unet_model.predict(inte_data.values, verbose=1)

In [None]:
comparison_type = "test"            # change here to switch between validation and testing data

# Select prediction, reference dataset and normalized reference field for the chosen dataset.
# NOTE(review): y_pred_val, ds_val and ds_test are not defined in the cells shown here —
# make sure they exist before running this cell.
if comparison_type == "validation":
    y_pred = y_pred_val
    ds_ref = ds_val.sel(time=dt.time(hour))
    var_ref = tarv_data.isel(variable=0)
elif comparison_type == "test":
    y_pred = y_pred_test
    ds_ref = ds_test.sel(time=dt.time(hour))
    var_ref = tarte_data.isel(variable=0)
else:
    # actually raise the error (it was only constructed before, so invalid choices went unnoticed)
    raise ValueError("Unknown comparison_type '{0}' chosen.".format(comparison_type))

# cropping necessary if z_branch is True (two output channels): keep the first output only
if np.ndim(y_pred) == 5:
    y_pred = y_pred[0]

In [None]:
# take coordinates and dimension names from the squeezed reference field, ...
ref_field = var_ref.squeeze()
coords, dims = ref_field.coords, ref_field.dims

# denormalize the prediction with the target's standard deviation and mean...
std_tar = opt_norm["std_tar"].squeeze().values
mu_tar = opt_norm["mu_tar"].squeeze().values
y_pred_trans = np.squeeze(y_pred) * std_tar + mu_tar
# and wrap the result into an xarray DataArray
y_pred_trans = xr.DataArray(y_pred_trans, coords=coords, dims=dims, name="t2m_downscaled")

In [None]:
# squared difference to the reference temperature, averaged over the "lat" and "lon" dimensions
squared_diff = (y_pred_trans - ds_ref["t2m_tar"])**2
mse = squared_diff.mean(dim=["lat", "lon"])

# report mean and standard deviation of the remaining MSE values
mse_avg, mse_spread = mse.mean().values, mse.std().values
print("MSE of downscaled 2m temperature: {0:.3f} K**2 (+/-{1:.3f} K**2)".format(mse_avg, mse_spread))

As we see, the model has learned to recover a lot of details resulting mainly from the topography. Especially over the Alps, but also over the German low mountain ranges, the differences have become smaller and less structured. It is also noted that the differences near the coast (e.g. at the Baltic Sea) have become smaller. <br>
However, some systematic features are still visible: the differences can still be as large as 3 K and, especially in the Alps, the differences are somewhat 'blurry'. Thus, there is still room for further improvement. 
These improvements will not only pertain to the model architecture, but will also aim to include more meteorological variables. The latter will also enable the network to generalize with respect to daytime and season. Note that this has not been done yet, since we trained the U-Net with data between April and September at 12 UTC only.
 