# Data Assimilation with Newtonian Nudging 

In [None]:
import os
from pathlib import Path
pad = Path(os.getcwd())
if pad.name == 'data_assimilation':
    pad_correct = pad.parent
    os.chdir(pad_correct)
from functions.PDM import PDM
from functions.performance_metrics import NSE, mNSE, FHV
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.colors as colors
import hvplot 
import hvplot.pandas
import itertools
import warnings
from numba import jit
from datetime import datetime

# Set to True to rerun the Newtonian Nudging parameter sweep further below;
# when False the previously pickled comparison table is loaded instead.
exec_parameter_testing = False
# Set to True to enlarge the plot font size and save the presentation figures.
presentation = False

%load_ext autoreload 
%autoreload 2 

In [None]:
%run "data_assimilation/data_prep.py"

## Necessary data load in 

In [None]:
# --- Forcings, parameters and catchment shape needed as PDM input ---
preprocess_output_folder = Path('data/Zwalm_data/preprocess_output')
p_zwalm = pd.read_pickle(preprocess_output_folder.joinpath('zwalm_p_thiessen.pkl'))
ep_zwalm = pd.read_pickle(preprocess_output_folder.joinpath('zwalm_ep_thiessen.pkl'))
param = pd.read_csv("data/Zwalm_PDM_parameters/NM_opt_param.csv")
zwalm_shape = gpd.read_file('data/Zwalm_shape/zwalm_shapefile_emma_31370.shp')
# Area of the first geometry divided by 10**6, stored in single precision
area_zwalm_new = np.single(zwalm_shape.area[0] / 1e6)
deltat = np.array(1, dtype=np.float32)  # hour
deltat_out = np.array(24, dtype=np.float32)  # daily averaging

# --- Observed C* as produced by the different observation operator models ---
ml_obs_op_pad = Path("data/ml_obs_op_data")
Cstar_obs_lin_reg = pd.read_pickle(ml_obs_op_pad / 'lin_reg' / 'full_data' / 'y_hat_retimed.pickle')
Cstar_obs_lin_reg_nt = pd.read_pickle(ml_obs_op_pad / 'lin_reg' / 'full_data_no_time' / 'y_hat_retimed.pickle')
Cstar_obs_lin_reg_nf = pd.read_pickle(ml_obs_op_pad / 'lin_reg' / 'full_data_no_forest' / 'y_hat_retimed.pickle')
# Cstar_obs_ridge_w = pd.read_pickle(ml_obs_op_pad/'ridge/window/y_hat_retimed.pickle')
Cstar_obs_lasso_w = pd.read_pickle(ml_obs_op_pad / 'lasso' / 'window' / 'y_hat_retimed.pickle')
Cstar_obs_SVR_lin = pd.read_pickle(ml_obs_op_pad / 'SVR' / 'linear' / 'y_hat_retimed.pickle')
Cstar_obs_GPR = pd.read_pickle(ml_obs_op_pad / 'GPR' / 'y_hat_retimed.pickle')

# --- Observed daily flow, indexed by time, used as reference in the comparison ---
Q_obs_daily = (
    pd.read_pickle('data/Zwalm_data/pywaterinfo_output/Q_day.pkl')
    .rename(columns={'Timestamp': 't'})
    .set_index('t')
)
Q_obs_daily.head(2)

Evaluate model performance starting on the first day of the month of first observation. Evaluate based on daily flow!

In [None]:
# Evaluation starts at midnight on the first day of the month that holds the
# first C* observation.
first_obs_date = Cstar_obs_lin_reg.index[0]
start_p1 = pd.Timestamp(year=first_obs_date.year, month=first_obs_date.month, day=1)
print(f'Start of evaluation: {start_p1}')

# The ML-training-only period starts one hour after the end of PDM calibration.
end_PDM_calibration = pd.Timestamp(year=2019, month=12, day=31, hour=23)
begin_ML_training_only = end_PDM_calibration + pd.Timedelta(hours=1)
print(f'End of PDM calibration period: {end_PDM_calibration}')

end_ML_training = pd.Timestamp(year=2020, month=12, day=31)
print(f'End of ML training period: {end_ML_training}')

# The test period runs from the day after ML training until the last observation.
begin_all_test = end_ML_training + pd.Timedelta(days=1)
end_all_test = Cstar_obs_lin_reg.index[-1]
# This timestamp closes the test period; the old message called it a training date.
print(f'End of test period: {end_all_test}')

## General function definition

Define a function to repeatedly compare different Newtonian Nudging parameters and observation operator models

In [None]:
def DA_OL_comparison(gamma:float, kappa:float, tau:int, Cstar_obs, plot_style = 'dynamic', return_figures = False, figs = None, axes = None,**kwargs):
    """
    Run PDM in open loop (OL) and with Newtonian Nudging data assimilation (DA)
    for one observation operator model, plot both runs and compare their scores.

    The forcings, PDM parameters, observed flow and period boundaries are read
    from notebook-level variables (p_zwalm, ep_zwalm, param, Q_obs_daily,
    start_p1, end_PDM_calibration, ...).

    Parameters
    -----------
    gamma: float
        the observational uncertainty, as of now fixed for all timestamps (between 0 and 1)
    kappa: float
        The Nudging factor (between 0 and 1)
    tau: int
        The number of hours before and after the time of observation for which to apply DA.
    Cstar_obs: pandas.Series or pandas.DataFrame
        Dataframe/Series with the observed C* (from observation operator model) with time as index
    plot_style: string
        'dynamic' executes hvplot plotting, 'static' executes matplotlib plotting,
        any other argument (e.g. None) disables plotting
    return_figures: bool, default = False
        If True, returns the figure and axes objects of the static plots in the order they are displayed
    figs: tuple, default = None
        figure objects; only passed through to the return value
    axes: tuple, default = None
        four axes objects to draw on, in the order: C*, C* difference, Q, Q difference.
        When both figs and axes are None, four new figures are created.
    **kwargs:
        key word arguments for PDM function with DA

    Returns
    -------
    delta_dict: dictionary
      nested dictionary with one entry per metric ('NSE', 'mNSE', 'FHV'). Each entry
      holds the difference DA - OL (for FHV: |DA| - |OL|) for 4 periods:
        - 'delta_cal': from start of observation till the end of PDM calibration
        - 'delta_Mlt': no PDM calibration, only observation operator model was trained this period
        - 'delta_test': both PDM and observation operator model untrained in this period
        - 'delta_full': from start till end of observations

    figs: tuple
        figure objects (only if return_figures = True and plot_style = 'static')
    axes: tuple
        axes objects (only if return_figures = True and plot_style = 'static')

    Raises
    ------
    ValueError
        If return_figures is True while plot_style is not 'static'. This is checked
        up front, before any model run.
    """
    # Fail fast: figures can only be returned for the static (matplotlib) plots
    if return_figures and plot_style != 'static':
        raise ValueError("Plot style must be 'static' to allow 'return figures' to be true")

    # Inputs shared by the DA and the OL run
    pdm_inputs = dict(
        P=p_zwalm['P_thiessen'].values,
        EP=ep_zwalm['EP_thiessen'].values,
        t=p_zwalm['Timestamp'].values,
        area=area_zwalm_new, deltat=deltat, deltatout=deltat_out,
        parameters=param, m=3,
    )
    #Calculate DA and non DA PDM
    pd_zwalm_out_DA = PDM(
        **pdm_inputs, DA=True,
        Cstar_obs=Cstar_obs.values.flatten(), t_obs=Cstar_obs.index.values,
        gamma=gamma, kappa=kappa, tau=np.timedelta64(tau, 'h'), **kwargs
    )
    pd_zwalm_out_DA = pd_zwalm_out_DA.set_index('Time')
    pd_zwalm_out = PDM(**pdm_inputs, DA=False)
    pd_zwalm_out = pd_zwalm_out.set_index('Time')
    Q_out_diff = pd_zwalm_out_DA['qmodm3s'] - pd_zwalm_out['qmodm3s']
    diff_Cstar = pd_zwalm_out_DA['Cstar'] - pd_zwalm_out['Cstar']

    #Plotting
    if plot_style == 'dynamic':
        display(pd_zwalm_out_DA['Cstar'][start_p1:].hvplot(ylabel='[mm]',
            label = 'C* DA')*pd_zwalm_out['Cstar'][start_p1:].hvplot(label = 'C* OL'))
        display(diff_Cstar[start_p1:].hvplot(ylabel='[mm]', label = r'$\Delta C^* $'))

        display(Q_obs_daily['Value'][start_p1:].hvplot(label = 'Observed')*pd_zwalm_out_DA['qmodm3s'][start_p1:].hvplot(ylabel='[m^3/s]',label = 'DA')*pd_zwalm_out['qmodm3s'][start_p1:].hvplot(label = 'OL',line_dash = 'dotted', frame_width = 800, frame_height = 400))

        display(Q_out_diff[start_p1:].hvplot(title = 'Q_out DA - Q_out OL', ylabel = '[m^3/s]', frame_width = 800))

    elif plot_style == 'static':
        # Identity test: '== None' would compare elementwise if axes is a numpy array
        if figs is None and axes is None:
            new_figures = [plt.subplots() for _ in range(4)]
            figs = tuple(fig_new for fig_new, _ in new_figures)
            axes = tuple(ax_new for _, ax_new in new_figures)
        # Panel 0: C* of both runs
        pd_zwalm_out_DA['Cstar'][start_p1:].plot(label = 'DA', ylabel = r'$C^*$ [mm]', ax = axes[0], c = 'tab:orange')
        pd_zwalm_out['Cstar'][start_p1:].plot(label = 'OL', ax = axes[0], c = 'tab:green')
        axes[0].legend()

        # Panel 1: C* difference
        diff_Cstar[start_p1:].plot(ylabel=r'$C^*_{\rm DA} - C^*_{\rm OL}$ [mm]', ax = axes[1])

        # Panel 2: observed and modelled flow
        Q_obs_daily['Value'][start_p1:].plot(label = 'Observed', ax = axes[2])
        pd_zwalm_out_DA['qmodm3s'][start_p1:].plot(ylabel= r'$Q$ [m$^3$/s]', label = 'DA', ax = axes[2])
        pd_zwalm_out['qmodm3s'][start_p1:].plot(label = 'OL',linestyle = 'dotted', ax = axes[2])
        axes[2].legend()

        # Panel 3: flow difference
        Q_out_diff[start_p1:].plot(ylabel = r'$Q_{\rm DA} - Q_{\rm OL}$  [m$^3$/s]', ax = axes[3])

    #Metrics
    def metric_wrapper_DA_OL_comparison(function, metric_name, p_start,p_end):
        """Score the OL and DA flow against the observed flow for one period and print the result."""
        metric_OL = function(pd_zwalm_out['qmodm3s'][p_start:p_end],Q_obs_daily['Value'][p_start:p_end])
        metric_DA = function(pd_zwalm_out_DA['qmodm3s'][p_start:p_end],Q_obs_daily['Value'][p_start:p_end])
        print(f'OL {metric_name} from {p_start} till {p_end}: {metric_OL}')
        print(f'DA {metric_name} from {p_start} till {p_end}: {metric_DA}')
        if metric_name == 'FHV':
            # FHV is compared on its absolute value
            delta_metric = np.abs(metric_DA) - np.abs(metric_OL)
        else:
            delta_metric = metric_DA - metric_OL
        print(f'Delta {metric_name}: {delta_metric}')
        return metric_OL, metric_DA, delta_metric

    metric_dict = {'NSE':NSE, 'mNSE':mNSE, 'FHV':FHV}
    # (key in the result, first timestamp, last timestamp) of every evaluation period
    periods = (
        ('delta_cal', start_p1, end_PDM_calibration),
        ('delta_Mlt', begin_ML_training_only, end_ML_training),
        ('delta_test', begin_all_test, end_all_test),
        ('delta_full', start_p1, end_all_test),
    )
    delta_dict = {}
    for metric_name, metric_function in metric_dict.items():
        print('\n ------------------')
        print(f'METRIC: {metric_name}')
        print('---------------------')
        deltas = {}
        for period_nr, (delta_key, p_start, p_end) in enumerate(periods):
            if period_nr > 0:
                print('\n')
            _, _, deltas[delta_key] = metric_wrapper_DA_OL_comparison(metric_function, metric_name, p_start, p_end)
        delta_dict[metric_name] = deltas

    if return_figures:
        # Only reachable with plot_style == 'static' (validated at the top)
        return delta_dict, figs, axes
    return delta_dict

Start with $\tau = 5$ h and $K \gamma = 0.5$.

# Linear regression: full feature set

In [None]:
# Nudging settings reused by the single-model runs below
gamma = 0.25  # 0.5
kappa = 1
tau = 2  # 5
font_size = 13
if presentation:
    plt.rcParams.update({'font.size': font_size})
# Static plots so the figure and axes objects can be reused in the next cell
delta_dict, figs, axes = DA_OL_comparison(
    gamma=gamma, kappa=kappa, tau=tau, Cstar_obs=Cstar_obs_lin_reg,
    plot_style='static', return_figures=True,
)
display(delta_dict)

In [None]:
# Save the linear regression figures for the presentation.
pad_pres = Path('Figures/presentation_12_04')
pad_pres.mkdir(parents=True, exist_ok=True)
if presentation:
    if len(figs) == 4:
        # Four separate figures in the order C*, C* difference, Q, Q difference:
        # the Q difference plot that is titled and saved here is the last one.
        fig_diff = figs[3]
        ax_diff = axes[3]
        ax_diff.set_xlabel('Tijd')
        ax_diff.set_title(r'Lineare regressie: $Q_{out}$ DA - $Q_{out}$ OL')
        fig_diff.savefig(pad_pres/'Q_diff_lin_reg.svg',format = 'svg')
        display(fig_diff)
    else:
        # Combined layout: the second figure is exported
        fig_combined = figs[1]
        fig_combined.suptitle('Linear regression')
        display(fig_combined)
        fig_combined.savefig(pad_pres/'Q_DA_vs_OL_lin_reg.svg',format = 'svg', transparent = True)


# Linear regression: no time

In [None]:
# Two stacked panels per figure: fig1 receives the C* plots, fig2 the Q plots
fig1, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(6, 7), constrained_layout=True)
fig2, (ax3, ax4) = plt.subplots(nrows=2, ncols=1, figsize=(6, 7), constrained_layout=True)
figs = fig1, fig2
axes = ax1, ax2, ax3, ax4

DA_OL_comparison(
    gamma=gamma, kappa=kappa, tau=tau, Cstar_obs=Cstar_obs_lin_reg_nt,
    plot_style='static', figs=figs, axes=axes,
)

# Remove the x label of the top panel of each figure
ax1.set_xlabel('')
ax3.set_xlabel('')

# Linear regression: no forest

In [None]:
# fig1, ((ax1_nf, ax1_gpr),(ax2_nf, ax2_gpr) ) = plt.subplots(2,2, figsize = (9,6), constrained_layout = True)#, sharey = 'row')
# fig2, ((ax3_nf, ax3_gpr),(ax4_nf, ax4_gpr) ) = plt.subplots(2,2, figsize = (9,6), constrained_layout = True)#, sharey = 'row')
# figs = (fig1, fig2)
# axes = (ax1_nf, ax2_nf, ax3_nf, ax4_nf)

# Interactive (hvplot) comparison for the linear regression without forest
delta_dict = DA_OL_comparison(
    gamma=gamma, kappa=kappa, tau=tau,
    Cstar_obs=Cstar_obs_lin_reg_nf, plot_style='dynamic',
)
display(delta_dict)
# #C* plot
# axes[0].set_xlabel('')
# axes[0].set_title('(a)')
# # Cstar_plot_nf = Cstar_obs_lin_reg_nf.reset_index()
# # Cstar_plot_nf.plot.scatter(x = 't', y = 'C*', ax = axes[0], marker = "x")
# ylim_C_diff = axes[1].get_ylim()
# #Q plot
# axes[2].set_xlabel('')
# axes[2].set_title('(a)')
# ylim_Q_diff = axes[3].get_ylim()

# Lasso window regression

In [None]:
# DA_OL_comparison(gamma, kappa, tau, Cstar_obs_ridge_w['C*'], plot_style = 'dynamic')
# Interactive (hvplot) comparison for the lasso window model
dict_out = DA_OL_comparison(
    gamma=gamma, kappa=kappa, tau=tau,
    Cstar_obs=Cstar_obs_lasso_w['C*'], plot_style='dynamic',
)
display(dict_out)

# SVR

In [None]:
# Interactive (hvplot) comparison for the linear SVR model
out_dict = DA_OL_comparison(
    gamma=gamma, kappa=kappa, tau=tau,
    Cstar_obs=Cstar_obs_SVR_lin['C*'], plot_style='dynamic',
)
display(out_dict)

# GPR

Rerun LR for visualisation

In [None]:
# 2x2 grids: the left column (suffix _nf) is filled here with the linear
# regression run, the right column (suffix _gpr) is kept for the GPR run.
fig1, ((ax1_nf, ax1_gpr), (ax2_nf, ax2_gpr)) = plt.subplots(nrows=2, ncols=2, figsize=(9, 6), constrained_layout=True)
fig2, ((ax3_nf, ax3_gpr), (ax4_nf, ax4_gpr)) = plt.subplots(nrows=2, ncols=2, figsize=(9, 6), constrained_layout=True)
figs = fig1, fig2
axes = ax1_nf, ax2_nf, ax3_nf, ax4_nf

delta_dict, figs, axes = DA_OL_comparison(
    gamma=gamma, kappa=kappa, tau=tau, Cstar_obs=Cstar_obs_lin_reg,
    plot_style='static', return_figures=True, figs=figs, axes=axes,
)
display(delta_dict)

# Top panels (C* plot and Q plot): no x label, tagged as panel (a)
for _top_ax in (axes[0], axes[2]):
    _top_ax.set_xlabel('')
    _top_ax.set_title('(a)')

# Remember the y-limits of both difference panels so that the GPR panels can reuse them
ylim_C_diff = axes[1].get_ylim()
ylim_Q_diff = axes[3].get_ylim()

GPR itself

In [None]:
# For the presentation new stand-alone figures are created; otherwise GPR is
# drawn in the right column of the two existing 2x2 figures.
figs, axes = (None, None) if presentation else ((fig1, fig2), (ax1_gpr, ax2_gpr, ax3_gpr, ax4_gpr))
delta_dict, figs, axes = DA_OL_comparison(
    gamma=gamma, kappa=kappa, tau=tau, Cstar_obs=Cstar_obs_GPR['C*'],
    plot_style='static', return_figures=True, figs=figs, axes=axes,
)
display(delta_dict)

pad_figures_text = Path('Figures/Figures_chapter_DA')
if not pad_figures_text.exists():
    os.makedirs(pad_figures_text)

if not presentation:
    # (figure index, y-limits taken from the LR difference panel, output file):
    # first the C* figure, then the Q figure
    panels = (
        (0, ylim_C_diff, 'Cstar_lr_gpr_comparison.pdf'),
        (1, ylim_Q_diff, 'Q_lr_gpr_comparison.pdf'),
    )
    for fig_nr, ylim_diff, file_name in panels:
        top_ax = axes[2*fig_nr]
        bottom_ax = axes[2*fig_nr + 1]
        top_ax.set_xlabel('')
        top_ax.set_ylabel('')
        bottom_ax.set_ylabel('')
        top_ax.set_title('(b)')
        # Same y-range as the LR difference panel next to it
        bottom_ax.set_ylim(ylim_diff)
        display(figs[fig_nr])
        figs[fig_nr].savefig(
            pad_figures_text/file_name, format = 'pdf', bbox_inches = 'tight'
        )




In [None]:
# Save the GPR figures for the presentation.
if presentation:
    if len(figs) == 4:
        # Four separate figures in the order C*, C* difference, Q, Q difference:
        # the Q difference plot that is titled and saved here is the last one.
        fig_diff = figs[3]
        ax_diff = axes[3]
        ax_diff.set_xlabel('Tijd')
        ax_diff.set_title(r'Gaussiaanse Processen: $Q_{out}$ DA - $Q_{out}$ OL')
        fig_diff.savefig(pad_pres/'Q_diff_gpr.svg',format = 'svg')
        display(fig_diff)
    else:
        # Combined layout: the second figure is exported
        fig_combined = figs[1]
        fig_combined.suptitle('Gaussian processes')
        display(fig_combined)
        fig_combined.savefig(pad_pres/'Q_DA_vs_OL_gpr.svg',format = 'svg', transparent = True)

# Comparison of different Newtonian Nudging parameters

Possible parameter combinations:
- $\tau$: 5 hours, 0.5, 1 or 1.5 days (no more: observations are at times only 3 days apart, so for more than 1.5 days the code would have to be changed to handle multiple observations)
- $\gamma K$: 0.1, 0.25, 0.5, 0.75, where a higher value means a stronger assimilation

In [None]:
# pd_Cstar = Cstar_obs_lin_reg.join(
#     [Cstar_obs_lin_reg_nt, Cstar_obs_ridge_w, Cstar_obs_SVR_lin, Cstar_obs_GPR], rsuffix = ['_lin_reg_nt','_ridge_w','_SVR_lin','_GPR']
# )

# One frame with the observed C* of every observation operator model; each
# joined column gets a model suffix and the base column is renamed last.
pd_Cstar = (
    Cstar_obs_lin_reg
    .join(Cstar_obs_lin_reg_nf['C*'], rsuffix='_lin_reg_nf')
    # .join(Cstar_obs_ridge_w['C*'], rsuffix='_ridge_w')
    .join(Cstar_obs_lasso_w['C*'], rsuffix='_lasso_w')
    .join(Cstar_obs_SVR_lin, rsuffix='_SVR_lin')
    .join(Cstar_obs_GPR, rsuffix='_GPR')
    .rename(columns={'C*': 'C*_lin_reg'})
)
display(pd_Cstar)

# Folder in which the parameter comparison table is stored
pad = Path('data/data_assimilation')
if not pad.exists():
    os.makedirs(pad)

In [None]:
# Grid of Newtonian Nudging settings: tau in hours, gamma, and the observation operator model
taus = [1,2,5,int(0.5*24), int(1*24),int(1.5*24)]
gammas = [0.1,0.25,0.5,0.75]
ml_obs_op_models = ['lin_reg','lin_reg_nf','lasso_w','SVR_lin','GPR']
combos = itertools.product(ml_obs_op_models,gammas,taus)
nr_combiations = len(taus)*len(gammas)*len(ml_obs_op_models)
kappa = 1
if exec_parameter_testing:
    # Collect one row of NSE differences per combination and build the table once
    delta_NSE_rows = []
    combo_keys = []
    for i, combo in enumerate(combos):
        model_name, gamma, tau = combo
        print(f'Combintaion {i} out of {nr_combiations}: tau = {combo[2]} hours, gamma ={combo[1]} and {combo[0]} as observation operator')
        # Column of pd_Cstar whose name ends with the model name
        Cstar_temp = pd_Cstar.iloc[:,pd_Cstar.columns.str.endswith(model_name)]
        Cstar_temp = Cstar_temp.dropna() #to deal with window methods
        # Keyword arguments: gamma and kappa used to be passed in each other's position
        delta_dict = DA_OL_comparison(gamma=float(gamma), kappa=kappa, tau=int(tau), Cstar_obs=Cstar_temp, plot_style=None)
        delta_NSE_rows.append(delta_dict['NSE'])
        combo_keys.append(combo)
    pd_comparison = pd.DataFrame(
        delta_NSE_rows,
        index = pd.MultiIndex.from_tuples(combo_keys, names = ['obs_op_model','gamma','tau'])
    )
    pd_comparison.to_pickle(pad/'pd_comparison.pkl')
else:
    # Reuse the table of an earlier sweep
    pd_comparison = pd.read_pickle(pad/'pd_comparison.pkl')

In [None]:
# Largest gain and largest loss over all combinations and periods
max_improv = pd_comparison.max().max()
max_deteriation = pd_comparison.min().min()
if max_improv < 0:
    warnings.warn('No improvement made!')
# Symmetric colour limit so that zero sits in the middle of the colour map
limit = np.maximum(max_improv, np.abs(max_deteriation))

print(max_improv)
print(max_deteriation)
pd_comparison.style.background_gradient(vmin=-limit, vmax=limit, cmap='coolwarm')  # 'RdYlGn_r'

In [None]:
# Combinations ordered by their score change in the calibration period, largest first
pd_comparison_sort_cal = pd_comparison.sort_values(by='delta_cal', ascending=False)
pd_comparison_sort_cal.style.background_gradient(vmin=-limit, vmax=limit, cmap='coolwarm')

In [None]:
# Combinations ordered by their score change in the test period, largest first
pd_comparison_sort_test = pd_comparison.sort_values(by='delta_test', ascending=False)
pd_comparison_sort_test.style.background_gradient(vmin=-limit, vmax=limit, cmap='coolwarm')

In [None]:
# Correlation between the score changes of the different periods (columns)
periods_score_correlaiton = pd_comparison.corr()
periods_score_correlaiton.style.background_gradient(vmin=-1, vmax=1, cmap='coolwarm')

In [None]:
# Combinations ordered by their score change over the full period, largest first
pd_comparison_sort_full = pd_comparison.sort_values(by='delta_full', ascending=False)
pd_comparison_sort_full.style.background_gradient(vmin=-limit, vmax=limit, cmap='coolwarm')

In [None]:
# Kernel density estimate of the score changes, one curve per column (period)
pd_comparison.plot.kde()

In [None]:
# The same density plot, drawn separately for each observation operator model
pd_comparison.groupby('obs_op_model').plot.kde(legend = True)

Make scatterplots of model performance based on $K \gamma$ and $\tau$

In [None]:
# columns_periods = pd_comparison.columns
# column_names = ['P1','P2','P3','PFull']
# model_names = ['LR full','LR full no forest',r'LaR full $\tau = 30$', r'Linear $\epsilon-$SVR full', 'GPR full']
# unique_models = pd_comparison.index.get_level_values('obs_op_model').unique()

# fig, axes = plt.subplots(len(unique_models),len(columns_periods), figsize = (12,12), constrained_layout = True)
# for i,model in enumerate(unique_models):
#     for j,period in enumerate(columns_periods):
#         print(str(period) + ', ' + model)
#         pd_temp = pd_comparison.loc[(model,), period]
#         pd_temp_unstacked = pd_temp.unstack()
#         xv, yv = np.meshgrid(pd_temp_unstacked.index.values, 
#                              pd_temp_unstacked.columns.values)
#         map = axes[i,j].scatter(xv, yv, c = pd_temp_unstacked.values.T,
#                                 norm = colors.SymLogNorm(vmin = -limit, vmax = limit, linthresh = 1e-4), cmap = 'coolwarm')#, vmin = -limit, vmax = limit, cmap = 'coolwarm'
#         if i == 0:
#             axes[i,j].set_title(column_names[j])
#             # axes[i,j].set_ylabel(r'$\tau$ [h]')

#         if i == len(unique_models) - 1: 
#             axes[i,j].set_xlabel(r'$\gamma$ [-]')
#         if j == len(columns_periods) - 1:
#             plt.colorbar(map, ax = axes[i,j])          
#     plt.setp(axes[i,0], ylabel = model_names[i])
# #plt.setp(axes[:, 0], ylabel='y axis label')
# fig.supylabel(r'$\Delta \rm NSE$', x = 1)
# # fig.savefig(pad_figures_text/'DA_parameters_test.pdf', format = 'pdf', bbox_inches = 'tight')
#fig

In [None]:
# Scatter overview of the NSE change: one row of panels per observation
# operator model, one panel per evaluation period.
columns_periods = pd_comparison.columns
print(columns_periods)
column_names = ['P1','P2','P3','PFull']
model_names = ['LR full','LR full no forest',r'LaR full $\tau = 30$', r'Linear $\epsilon-$SVR full', 'GPR full']
unique_models = pd_comparison.index.get_level_values('obs_op_model').unique()

fig = plt.figure(constrained_layout = True, figsize = (9,9))

subfigs = fig.subfigures(nrows = len(unique_models), ncols = 1)
last_row = len(unique_models) - 1
for row, subfig in enumerate(subfigs):
    subfig.suptitle(model_names[row])
    axes = subfig.subplots(nrows = 1, ncols = len(columns_periods))
    for col, ax in enumerate(axes):
        pd_temp = pd_comparison.loc[(unique_models[row],), columns_periods[col]]
        # Rows: gamma, columns: tau
        pd_temp_unstacked = pd_temp.unstack()
        xv, yv = np.meshgrid(pd_temp_unstacked.index.values,
                        pd_temp_unstacked.columns.values)
        # Named 'mappable' so the builtin map() is not shadowed in the notebook
        mappable = ax.scatter(xv, yv, c = pd_temp_unstacked.values.T,
                                norm = colors.SymLogNorm(vmin = -limit, vmax = limit, linthresh = 1e-4), cmap = 'coolwarm')
        if row == last_row:
            ax.set_xlabel(r'$\gamma$ [-]')
        if col == 0:
            ax.set_ylabel(r'$\tau$ [h]')
    # One colorbar per row, attached to the last panel of that row
    plt.colorbar(mappable, ax = axes[-1])
fig.supylabel(r'$\Delta \rm NSE$', x = 1)

Alternative visualisation below

In [None]:
# Heat map overview of the NSE change: one row of panels per observation
# operator model, one panel per evaluation period.
fig = plt.figure(constrained_layout = True, figsize = (9,9))

subfigs = fig.subfigures(nrows = len(unique_models), ncols = 1)
last_row = len(unique_models) - 1
for row, subfig in enumerate(subfigs):
    subfig.suptitle(model_names[row])
    axes = subfig.subplots(nrows = 1, ncols = len(columns_periods))
    for col, ax in enumerate(axes):
        pd_temp = pd_comparison.loc[(unique_models[row],), columns_periods[col]]
        # Rows: gamma, columns: tau; transposed below so gamma runs along x
        pd_temp_unstacked = pd_temp.unstack()
        # Named 'mappable' so the builtin map() is not shadowed in the notebook
        mappable = ax.imshow(pd_temp_unstacked.values.T,norm = colors.SymLogNorm(vmin = -limit, vmax = limit, linthresh = 1e-2, linscale = 0.3), cmap = 'coolwarm',aspect = 'auto')
        # Tick labels come from the plotted table itself, so they stay correct
        # when the table was loaded from a pickle made with other settings
        ax.set_xticks(np.arange(len(pd_temp_unstacked.index)))
        ax.set_xticklabels(pd_temp_unstacked.index.tolist())
        ax.set_yticks(np.arange(len(pd_temp_unstacked.columns)))
        ax.set_yticklabels(pd_temp_unstacked.columns.tolist())
        if row == 0:
            ax.set_title(column_names[col])
        if row == last_row:
            ax.set_xlabel(r'$\gamma$ [-]')
        if col == 0:
            ax.set_ylabel(r'$\tau_a$ [h]')
    # One colorbar per row, attached to the last panel of that row
    plt.colorbar(mappable, ax = axes[-1])
fig.supylabel(r'$\Delta \rm NSE$', x = 1)
fig.savefig(pad_figures_text/'DA_parameters_test.pdf', format = 'pdf', bbox_inches = 'tight')

In [None]:
# Symmetric colour limit for the table left over from the last plotted panel
limit_test = np.abs(pd_temp_unstacked.values).max()
pd_temp_unstacked.T.style.background_gradient(vmin=-limit_test, vmax=limit_test, cmap='coolwarm')

In [None]:
# unstacked_df = pd_temp.unstack()
# display(unstacked_df)
# xv, yv = np.meshgrid(unstacked_df.index.values, unstacked_df.columns.values)
# plt.scatter(xv, yv, c = unstacked_df.values.T, s= 100)
# plt.colorbar()
# plt.title('')

## Visualisation of time weighing function

In [None]:
def tau_weighing(delta_t_abs, tau):
    """
    Time weight as a function of the absolute offset to the observation.

    The weight is 1 while the offset is below tau/2, decreases linearly from
    1 to 0 between tau/2 and tau, and is 0 from tau onwards.

    Parameters
    ----------
    delta_t_abs: number
        absolute time difference to the observation
    tau: number
        offset from which the weight is zero (same unit as delta_t_abs)

    Returns
    -------
    W_t: number
        the weight
    """
    half_tau = tau/2
    if delta_t_abs < half_tau:
        return 1
    if delta_t_abs < tau:
        return (tau - delta_t_abs)/half_tau
    return 0
# Weights for offsets from -20 up to 19 around the observation, with tau = 12
time_offsets = np.arange(-20, 20)
weights = [tau_weighing(np.abs(offset), 12) for offset in time_offsets]

font_size = 13
if presentation:
    plt.rcParams.update({'font.size': font_size})

fig, ax = plt.subplots()
ax.plot(time_offsets, weights)
ax.set_xlabel('$t - t^*$ [u]')
ax.set_ylabel('$W_t$')

pad_pres = Path('Figures/presentation_12_04')
if not pad_pres.exists():
    os.makedirs(pad_pres)
if presentation:
    fig.savefig(pad_pres/'W_t.svg',format = 'svg')
    # Back to the default matplotlib settings after the export
    plt.rcParams.update(matplotlib.rcParamsDefault)

# Extra experiment: improvement by DA when the forcing data contain a mistake (missing rain)

In [None]:
# Rainfall indexed by time
p_zwalm_t = p_zwalm.rename(columns={'Timestamp': 't'}).set_index('t')
# Sum to daily rainfall
p_zwalm_t_daily = p_zwalm_t.resample('1D').sum()
p_zwalm_t_daily['P_thiessen'].plot()

In [None]:
day_Cstar_obs = Cstar_obs_lin_reg.index.strftime('%Y-%m-%d') #only keep day information
# Select the observation days in the test period. Dates in this format sort
# chronologically as strings; '>=' keeps an observation made on the first day
# of the test period, which a strict '>' would drop.
day_Cstar_obs_test = day_Cstar_obs[day_Cstar_obs >= begin_all_test.strftime('%Y-%m-%d')]
print(day_Cstar_obs_test)

Now select rain days with observation in test period

0: Wrong direction
1: bad performance even with good forcings

In [None]:
# Daily rain on the observation days of the test period, wettest day first
p_test_observation = p_zwalm_t_daily.loc[day_Cstar_obs_test]
p_test_observation_sorted = p_test_observation.sort_values(by='P_thiessen', ascending=False)
display(p_test_observation_sorted.head(10))
# Third entry of the ranking; the notes above this cell comment on entries 0 and 1
day_max_rain = p_test_observation_sorted.index[2]
print(day_max_rain)

In [None]:
# Calendar date of the selected rain day
day_max_rain.date()

now make an adjusted rain dataset

In [None]:
# Boolean mask of every time step on the same calendar day as the selected
# rain day, computed in one vectorised comparison instead of a Python loop
bool_dates = p_zwalm_t.index.date == day_max_rain.date()
p_zwalm_t_adapted = p_zwalm_t.copy()
# Keep only 20 % of the rain on that day
p_zwalm_t_adapted.loc[bool_dates,'P_thiessen'] = 0.2*p_zwalm_t.loc[bool_dates,'P_thiessen'].values
    

Define days to inspect

In [None]:
# Inspection window: from one day before until four days after the selected rain day
begin_inspect = day_max_rain - pd.DateOffset(days = 1)
end_inspect = day_max_rain + pd.DateOffset(days =4)

In [None]:
# pd_max_rain = p_zwalm_t[bool_dates]
# Original and adapted rainfall inside the inspection window
event_window = slice(begin_inspect, end_inspect)
p_zwalm_event_window = p_zwalm_t.loc[event_window]
p_zwalm_adapted_event_window = p_zwalm_t_adapted.loc[event_window]

fig, ax = plt.subplots()
p_zwalm_event_window['P_thiessen'].plot(label='Original', ax=ax)
p_zwalm_adapted_event_window['P_thiessen'].plot(label='Adapted', ax=ax)
ax.set_ylabel('$P$ [mm/h]')
ax.legend()

In [None]:
# Observed flow inside the inspection window
fig, ax = plt.subplots()
Q_obs_event_window = Q_obs_daily.loc[slice(begin_inspect, end_inspect)]
Q_obs_event_window['Value'].plot(ax=ax)

First check how the model performs in normal conditions: use the linear regression model

In [None]:
# Nudging settings for the rain event experiment
tau = 5
gamma = 0.5
delta_dict, figs, axes = DA_OL_comparison(
    gamma=gamma, kappa=kappa, tau=tau, Cstar_obs=Cstar_obs_lin_reg_nf,
    plot_style='static', return_figures=True,
)

In [None]:

# Zoom the flow panel (third static figure) in on the inspection window
axes[2].set_xlim(begin_inspect, end_inspect)
axes[2].set_ylim(0,8)
axes[2].set_title('pefect forcings')
figs[2]

In [None]:
# NOTE(review): this rebinds the notebook-level deltat_out (a float32 array
# in the data loading cell) to a plain int.
deltat_out = 24
# Arguments of the open loop run with the original rainfall.
# NOTE(review): unlike DA_OL_comparison no 'm' is passed here, so the default
# of PDM applies - confirm that this is intended.
arguments_PDM = {'P':p_zwalm['P_thiessen'].values,'EP':ep_zwalm['EP_thiessen'].values, 't':p_zwalm_t.index.values,'area':area_zwalm_new, 'deltat':deltat,'deltatout':deltat_out,'parameters':param}
# Open loop run with the adapted (reduced) rainfall
arguments_PDM_adapted = arguments_PDM.copy()
arguments_PDM_adapted['P'] = p_zwalm_t_adapted['P_thiessen'].values

#For DA
arguments_PDM_DA = arguments_PDM.copy()
arguments_PDM_DA['DA'] = True
# Observation times and values are taken from the same data set and the values
# are flattened, exactly as DA_OL_comparison passes them to PDM
arguments_PDM_DA['t_obs'] = Cstar_obs_lin_reg_nf.index.values
arguments_PDM_DA['Cstar_obs']  = Cstar_obs_lin_reg_nf.values.flatten()
arguments_PDM_DA['kappa'] = kappa
arguments_PDM_DA['gamma'] = gamma
arguments_PDM_DA['tau'] = np.timedelta64(tau,'h')
# DA run with the adapted (reduced) rainfall
arguments_PDM_DA_adapted = arguments_PDM_DA.copy()
arguments_PDM_DA_adapted['P'] = p_zwalm_t_adapted['P_thiessen'].values
# Sanity check: shows whether the adapted rainfall is identical to the original
all(arguments_PDM_DA_adapted['P'] == arguments_PDM_DA['P'])

In [None]:
# Run PDM four times: original rainfall without and with DA ...
pdm_out_normal = PDM(**arguments_PDM)
pdm_out_normal_DA = PDM(**arguments_PDM_DA)

# ... and adapted rainfall without and with DA.
pdm_out_adapted = PDM(**arguments_PDM_adapted)
pdm_out_adapted_DA = PDM(**arguments_PDM_DA_adapted)

In [None]:
# Index all four model outputs by their 'Time' column so they can be
# sliced and plotted against time in the cells below.
pdm_out_normal = pdm_out_normal.set_index('Time')
pdm_out_normal_DA = pdm_out_normal_DA.set_index('Time')
pdm_out_adapted_DA = pdm_out_adapted_DA.set_index('Time')
pdm_out_adapted = pdm_out_adapted.set_index('Time')

In [None]:
# Discharge comparison inside the inspection window: the Q_obs_daily series
# against the four model runs (DA/OL, original/adapted rainfall).
fig, ax = plt.subplots()
Q_obs_daily['Value'].plot(ax=ax, label='Observed')
# Keep this plotting order: it determines the colour assigned to each run.
for pdm_run, run_label in (
    (pdm_out_normal_DA, 'DA'),
    (pdm_out_normal, 'OL'),
    (pdm_out_adapted_DA, 'DA adapted'),
    (pdm_out_adapted, 'OL adapted'),
):
    pdm_run['qmodm3s'].plot(ax=ax, label=run_label)
ax.set_xlim(begin_inspect, end_inspect)
ax.set_ylim(0, 2)
ax.set_ylabel('$Q$ [m$^3$/s]')
ax.legend()

In [None]:
# C* comparison inside the inspection window: the retrieved C* values as
# scatter points against the 'Cstar' column of the four model runs.
fig, ax = plt.subplots()
Cstar_plot = Cstar_obs_lin_reg_nf.reset_index()
Cstar_plot.plot.scatter(x='t', y='C*', ax=ax, label='Retrieval')
# Explicit colours keep the runs distinguishable from the scatter points.
for pdm_run, run_label, line_colour in (
    (pdm_out_normal_DA, 'DA', 'tab:orange'),
    (pdm_out_normal, 'OL', 'tab:green'),
    (pdm_out_adapted_DA, 'DA adapted', 'tab:red'),
    (pdm_out_adapted, 'OL adapted', 'tab:purple'),
):
    pdm_run['Cstar'].plot(ax=ax, label=run_label, color=line_colour)
ax.legend()
ax.set_xlim(begin_inspect, end_inspect)
ax.set_ylim(200, 300)
fig

In [None]:
# Sanity check: True only if every modelled discharge value of the adapted
# open-loop run equals the corresponding value of the adapted DA run.
all(pdm_out_adapted['qmodm3s'] == pdm_out_adapted_DA['qmodm3s'])

# A Very simple experiment with S1

Update 31/05/2023: with the change where I update $S_1$ as a function of the a posteriori $C^*$, this idea is no longer useful.

Thesis: GPR and linear regression on full dataset to be discussed! => I can make analogous figures to the ones used previously!

In [None]:
# pdm_out_normal = PDM(**arguments_PDM)

In [None]:
# pdm_out_normal_t = pdm_out_normal.set_index('Time')
# S1 = pdm_out_normal_t['S1']

Load in extra modules

In [None]:
# from functions.ml_utils import general_sklearn_model
# from sklearn.linear_model import LinearRegression, LassoCV
# from sklearn.model_selection import GridSearchCV
# from sklearn.svm import SVR
# from sklearn.gaussian_process import GaussianProcessRegressor
# from sklearn.gaussian_process.kernels import RBF, WhiteKernel
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler

Read in extra data

In [None]:
# ML_data_pad = Path("data/Zwalm_data/ML_data")
# X_train = pd.read_pickle(ML_data_pad/"X_train.pkl")
# X_test = pd.read_pickle(ML_data_pad/"X_test.pkl")
# S1_train = S1.loc[X_train.index]
# S1_test = S1.loc[X_test.index]

## Linear regression

In [None]:
# # X_train_no_forest = X_train#.loc[:,~X_train.columns.str.endswith('Forest')]
# # X_test_no_forest = X_test#.loc[:,~X_test.columns.str.endswith('Forest')]
# linreg_drop_forest, r2_train, r2_test, fig, ax, S1_train_obs, S1_test_obs = general_sklearn_model(
#     LinearRegression(), X_train, X_test,
#     S1_train.values.reshape(-1,1), S1_test.values.reshape(-1,1),
#     X_train.index, X_test.index, S1, normalisation = True, return_predictions = True)

In [None]:
# S1_obs = pd.concat([S1_train_obs,S1_test_obs])

See S1_test.txt: gamma = 0.25 and tau = 2 work well!

In [None]:
# fig1, ((ax1_lr, ax1_gpr),(ax2_lr, ax2_gpr) ) = plt.subplots(2,2, figsize = (9,6), constrained_layout = True)#, sharey = 'row')
# fig2, ((ax3_lr, ax3_gpr),(ax4_lr, ax4_gpr) ) = plt.subplots(2,2, figsize = (9,6), constrained_layout = True)#, sharey = 'row')
# figs = (fig1, fig2)
# axes = (ax1_lr, ax2_lr, ax3_lr, ax4_lr)

In [None]:
# tau = 2
# kappa = 1
# gamma = 0.25
# delta_dict, figs, axes = DA_OL_comparison(gamma, kappa, tau, S1_obs, plot_style = 'static', DA_experiment= True, return_figures = True, figs = figs, axes = axes)

In [None]:
# #S1 plot
# axes[0].set_ylabel('$S_1$ [mm]')
# axes[0].set_xlabel('')
# axes[0].set_title('(a)')
# axes[1].set_ylabel(r'$S_{1, \rm DA} - S_{1,\rm OL}$ [mm]')
# ylim_S1_df = axes[1].get_ylim()
# display(figs[0])

# #Q plot
# axes[2].set_xlabel('')
# axes[2].set_title('(a)')
# ylim_Q_lr = axes[3].get_ylim()

# display(figs[1])

## SVR RBF 

In [None]:
# # svr_rbf_eps = SVR(kernel = 'rbf', epsilon = 0.1)
# # svr_gs_rbf_eps = GridSearchCV(svr_rbf_eps, param_grid = {
# #     'C':np.logspace(-10,3,14),
# #     'gamma':np.logspace(-5,5,50)
# # }, scoring = 'r2', cv = 5, n_jobs = -1, verbose = 3
# # )
# svr_rbf = SVR(kernel = 'rbf', C = 1, epsilon = 0.1, gamma ='auto') 
# # if exec_hyperopt_tuning: 
# SVR_out, r2_train, r2_test, fig, ax, S1_train_SVR, S1_test_SVR = general_sklearn_model(
#     svr_rbf, X_train, X_test, S1_train.values.reshape(-1,1), S1_test.values.reshape(-1,1),X_train.index, X_test.index,S1, normalisation = True, return_predictions = True
#     )
#     # svr_gs_tuple_out = general_sklearn_model(
#     #     svr_gs_rbf_rains, X_train_small.drop(['VH_past_agr'], axis = 1), X_test_small.drop(['VH_past_agr'], axis = 1), y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),X_train.index, X_test.index,Cstar,normalisation = True
#     # )
# #     joblib.dump(svr_gs_tuple_out,pad/'svr_optim_rbf_rains.joblib')
# # else:
# #     svr_gs_tuple_out = joblib.load(pad/'svr_optim_rbf_rains.joblib')
# fig

In [None]:
# S1_obs_RBF = pd.concat([S1_train_SVR, S1_test_SVR])
# DA_OL_comparison(gamma, kappa, tau, S1_obs_RBF, plot_style = 'dynamic', DA_experiment= True)

## Gaussian processes

In [None]:
# SEED = 1234
# kernel = RBF(length_scale_bounds=(1e-2,1e2)) + WhiteKernel(noise_level_bounds=(1e-1,1e3))
# gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=100, normalize_y=True, random_state = SEED)
# gpr_pipe = make_pipeline(StandardScaler(), gpr)
# gpr_pipe_out,r2_train,r2_test,fig,ax, S1_train_GPR, S1_test_GPR = general_sklearn_model(
#     gpr_pipe, X_train, X_test, S1_train.values.reshape(-1,1), S1_test.values.reshape(-1,1), X_train.index, X_test.index, S1, #normalisation = True, 
#     return_predictions = True
# )
# fig

In [None]:
# gpr_pipe_out

In [None]:
# S1_obs_GPR = pd.concat([S1_train_GPR, S1_test_GPR])
# axes_gpr = (ax1_gpr, ax2_gpr, ax3_gpr, ax4_gpr)
# delta_dict, figs, axes_gpr = DA_OL_comparison(gamma, kappa, tau, S1_obs_GPR, plot_style = 'static', DA_experiment= True, figs = figs, axes = axes_gpr, return_figures = True)
# print(delta_dict)

In [None]:
# #S1 plot
# axes_gpr[0].set_xlabel('')
# axes_gpr[0].set_title('(b)')
# axes_gpr[0].set_ylabel('')
# axes_gpr[1].set_ylabel('')

# ylim_S1_diff_gpr = axes_gpr[1].get_ylim()
# ymin = min(ylim_S1_diff_gpr[0], ylim_S1_df[0])
# ymax = max(ylim_S1_diff_gpr[1], ylim_S1_df[1])
# axes_gpr[1].set_ylim(ymin, ymax)
# axes[1].set_ylim(ymin, ymax)
# display(figs[0])
# figs[0].savefig(pad_figures_text/'S1DA_S1_comparison_LR_GPR.pdf',format = 'pdf', bbox_inches = 'tight')

# #Q plot
# axes_gpr[2].set_ylabel('')
# axes_gpr[2].set_xlabel('')
# axes_gpr[2].set_title('(b)')
# axes_gpr[3].set_ylabel('')
# ylim_Qdiff_lr = axes[3].get_ylim()
# ylim_Qdiff_gpr = axes_gpr[3].get_ylim()
# ymin_Qdiff = min(ylim_Qdiff_lr[0], ylim_Qdiff_gpr[0])
# ymax_Qdiff = max(ylim_Qdiff_lr[1], ylim_Qdiff_gpr[1])
# axes_gpr[3].set_ylim(ymin_Qdiff, ymax_Qdiff)
# axes[3].set_ylim(ymin_Qdiff, ymax_Qdiff)
# display(figs[1])
# figs[1].savefig(pad_figures_text/'S1DA_Q_comparison_LR_GPR.pdf', format= 'pdf', bbox_inches = 'tight')

# Old experiments only below

In [None]:
# arrays = [
#     ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
#     ["one", "two", "one", "two", "one", "two", "one", "two"],
# ]
# tuples = list(zip(*arrays))
# print(tuples)
# index = pd.MultiIndex.from_tuples(tuples, names = ['first','second'])
# print(index)
# s = pd.Series(np.random.randn(8),index = index)
# s

In [None]:
# np.arange(-30,30)

In [None]:
# def NewtonianNudging(Cstar_min, Cstar_obs, gamma, Kappa, delta_t, tau):
#     W_t = tau_weighing(np.abs(delta_t),tau)
#     Cstar_plus = Cstar_min + gamma*Kappa*W_t*(Cstar_obs -Cstar_min)
#     if Cstar_plus != Cstar_min:
#         import pdb; pdb.set_trace()
#     return Cstar_plus


In [None]:
# def NN_wrapper(i, t, t_obs, t_a,Cstar,C_star_obs):
#     if np.any(np.abs(t[i] - t_obs) < t_a):
#         t_assimilated=t_obs[np.abs((t[i] - t_obs)) < t_a]
#         print(
#             f'{t[i]} should be assimilated since less than {t_a/2} rmeoved from {t_assimilated}')

In [None]:
# C_star_updated = C_star_mod.copy()
# for i in range(len(C_star_mod)):
#     C_star_min = C_star_mod[i]
#     t_mod_i = t_hour[i]
#     t_assimilated_index = np.abs(t_mod_i - t_obs).argmin()#t_obs[np.abs((t_hour[i] - t_obs)) < t_a]
#     t_assimilated = t_obs[t_assimilated_index]
#     delta_t = t_mod_i - t_assimilated
#     C_star_updated[i] = NewtonianNudging(C_star_min, Cstar_obs_lin_reg.loc[t_assimilated,:].values[0],0.5,1,delta_t,np.timedelta64(24,'h'))

In [None]:
# t_assimilated 
# Cstar_obs_lin_reg.loc[t_assimilated,:].values[0]

In [None]:
# type(Cstar_obs_lin_reg.index.values)

In [None]:
# pd_zwalm_out = PDM(P=p_zwalm['P_thiessen'].values,
#                        EP=ep_zwalm['EP_thiessen'].values,
#                        t=p_zwalm['Timestamp'].values,
#                        area=area_zwalm_new, deltat=deltat, deltatout=deltat_out,
#                        parameters=param, m=3, DA = True, Cstar_obs = Cstar_obs_lin_reg.values.flatten(),t_obs = Cstar_obs_lin_reg.index.values, gamma = 0.5, kappa = 1,  tau = np.timedelta64(12,'h'),)
# pd_zwalm_out_DA = pd_zwalm_out.set_index('Time')
# pd_zwalm_out = PDM(P=p_zwalm['P_thiessen'].values,
#                        EP=ep_zwalm['EP_thiessen'].values,
#                        t=p_zwalm['Timestamp'].values,
#                        area=area_zwalm_new, deltat=deltat, deltatout=deltat_out,
#                        parameters=param, m=3, DA = False)
# pd_zwalm_out = pd_zwalm_out.set_index('Time')

# fig, ax = plt.subplots()
# pd_zwalm_out_DA['Cstar'].plot(ylabel='[mm]', ax = ax, label = 'C* DA')
# pd_zwalm_out['Cstar'].plot(ax = ax, label = 'C* OL')
# ax.legend()
# pd_zwalm_out.tail()

# hvplot.extension('bokeh')
# pd_zwalm_out_DA['Cstar'].hvplot(ylabel='[mm]',
#     label = 'C* DA')*pd_zwalm_out['Cstar'].hvplot(label = 'C* OL')

# Q_obs_daily['Value'].hvplot()

# Q_obs_daily['Value'].hvplot(label = 'Observed')*pd_zwalm_out_DA[
#     'qmodm3s'].hvplot(ylabel='[m^3/s]',label = 'DA')*pd_zwalm_out['qmodm3s'].hvplot(label = 'OL',line_dash = 'dotted', frame_width = 800, frame_height = 400)

# Q_out_diff = pd_zwalm_out_DA['qmodm3s'] - pd_zwalm_out['qmodm3s']
# Q_out_diff.hvplot()