# Simple models for the observation operator

Before applying the complex LSTM model, we'll first check the performance of much simpler ML models: linear regression and ridge regression.

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import os
import dask
import joblib
from sklearn.linear_model import (LinearRegression, Ridge, RidgeCV,
                                   LassoCV, Lasso)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
from pathlib import Path

pad = Path(os.getcwd())
if pad.name == "ml_observation_operator":
    pad_correct = pad.parent
    os.chdir(pad_correct)
from functions.PDM import PDM
from functions.pre_processing import reshape_data, reshaped_to_train_test
from functions.ml_utils import general_sklearn_model
from functions.plotting_functions import plot_Cstar_model
SEED = 1234

os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

exec_hyperopt_tuning = False #SET TO TRUE TO RUN FULL NOTEBOOK

%load_ext autoreload 
%autoreload 2 

In [None]:
# Sanity check: show the working directory that all relative paths below use.
os.getcwd()

## Load in data

In [None]:
# Execute the project's data-loading script (path relative to the working directory).
%run "ml_observation_operator/data_load_in.py"

In [None]:
# Folder holding the pickled feature and target tables.
ML_data_pad = Path("data/Zwalm_data/ML_data")

# Table with every candidate feature (used for the correlation overview).
X_full_all = pd.read_pickle(ML_data_pad / "X_full_all.pkl")

# Targets for the train period, the test period and the full period.
y_train = pd.read_pickle(ML_data_pad / "y_train.pkl")
y_test = pd.read_pickle(ML_data_pad / "y_test.pkl")
y_full = pd.read_pickle(ML_data_pad / "y_full.pkl")

# Reference C* series that is passed to every model/plot call below.
Cstar = pd.read_pickle(ML_data_pad / "Cstar.pkl")


Full set of possible features: Forest, Pasture, Agriculture and a combination of pasture and agriculture

In [None]:
# Pairwise correlation between all candidate features, rendered as a colour-coded table.
features_corr = X_full_all.corr()
features_corr.style.background_gradient(cmap = 'coolwarm')

Load in both full and the smaller dataset

In [None]:
# Full feature set: train period, test period and full period.
X_train = pd.read_pickle(ML_data_pad / "X_train.pkl")
X_test = pd.read_pickle(ML_data_pad / "X_test.pkl")
X_full = pd.read_pickle(ML_data_pad / "X_full.pkl")
display(X_full.head())
print(X_full.shape)

# Smaller feature set, same three periods.
X_train_small = pd.read_pickle(ML_data_pad / "X_train_small.pkl")
X_test_small = pd.read_pickle(ML_data_pad / "X_test_small.pkl")
X_full_small = pd.read_pickle(ML_data_pad / "X_full_small.pkl")
display(X_full_small.head())

## Experiment of 08/05: drop 'descending' (it conveys the same information as
## 'ascending'). This is now incorporated in data_load_in, so the manual drops
## below are kept for reference only.
# X_train = X_train.drop('descending',axis=1)
# X_test = X_test.drop('descending',axis=1)
# X_full = X_full.drop('descending',axis=1)

# X_train_small = X_train_small.drop('descending',axis=1)
# X_test_small = X_test_small.drop('descending',axis=1)
# X_full_small = X_full_small.drop('descending',axis=1)

24/03/2023: drop the delta_t feature as an experiment (since it will not lead to better performance when multiple timesteps are not included). From 0.7 to around 0.74 in test score. 

In [None]:
# Keep copies that still contain `delta_t`: these are meant for the
# time-window methods.
X_train_dt = X_train.copy()
X_test_dt = X_test.copy()
X_full_dt = X_full.copy()

X_train_small_dt = X_train_small.copy()
X_test_small_dt = X_test_small.copy()
# Fix: this used to copy `X_full` (the large feature set), so the "small"
# full-period table silently held the wrong features.
X_full_small_dt = X_full_small.copy()

# The single-timestep models work without `delta_t`.
# NOTE: this cell is not idempotent; re-running it without reloading the data
# raises a KeyError because `delta_t` has already been dropped.
X_train = X_train.drop('delta_t',axis = 1)
X_test = X_test.drop('delta_t',axis = 1)
X_full = X_full.drop('delta_t',axis = 1)

X_train_small = X_train_small.drop('delta_t',axis = 1)
X_test_small = X_test_small.drop('delta_t',axis = 1)
X_full_small = X_full_small.drop('delta_t',axis = 1)
display(X_train)

# Linear regression

The idea of using linear regression as observation operator was already applied in Aubert et al. (cf. [obsidian](C:\Users\olivi\Documents\ob_obsidian\DA\Aubert_SM_DA_in_conceptual_model.md) and https://www.sciencedirect.com/science/article/pii/S0022169403002294?via%3Dihub )

Include forest in the equation

https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2 

Calculated adjusted $R^2$ (=$\bar{R}^2$) from regular $R^2$ as:
$$
{\displaystyle {\bar {R}}^{2}=1-(1-R^{2}){n-1 \over n-p}}
$$

with $n$ the number of samples (observations) and $p$ the number of model parameters (including the intercept) 

### 1 input, 1 output

Very basic idea: 1 input timestep to 1 output timestep without normalisation (not covered in thesis)

In [None]:
# Baseline: ordinary least squares, one input timestep to one output
# timestep, with the helper's default settings (no `normalisation` flag).
#fig, axes = plt.subplots(2,1, constrained_layout = True)
linreg, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train, X_test,
    y_train, y_test,
    X_train.index, X_test.index,
    Cstar,
)

Experiment 08/05: drop the descending one (as it actually conveys the same information as ascending! They are 100% correlated)

In [None]:
# linreg_norm, r2_train, r2_test, fig, ax = general_sklearn_model(
#     LinearRegression(), X_train.drop(['descending'],axis=1), X_test.drop(['descending'],axis =1), y_train.values.reshape(-1,1), y_test.values.reshape(-1,1), X_train.index,X_test.index, Cstar, normalisation = True, save_predictions = True, pad = pad
# )
# fig

Add normalisation to input and output

In [None]:
# Folder handed to the helper together with `save_predictions = True`.
pad = Path('data/ml_obs_op_data/lin_reg/full_data')

# Larger font for the presentation figure; the defaults are restored at the
# end of this cell.
font_size = 13
plt.rcParams.update({'font.size': font_size})

linreg_norm, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train, X_test,
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
    save_predictions=True,
    pad=pad,
)
ax.set_title('Linear regression')
ax.set_xlabel('Time')
ax.legend(['PDM','Train','Test'], loc='upper right')
display(fig)

# Export the figure for the presentation (the folder is created when missing).
pad_pres = Path('Figures/presentation_12_04')
pad_pres.mkdir(parents=True, exist_ok=True)
fig.savefig(pad_pres / 'lin_reg.svg', format='svg', transparent=True)
plt.rcParams.update(matplotlib.rcParamsDefault)

Drop the sin and cos feature

In [None]:
# Same linear regression, but without the two seasonal features.
pad = Path('data/ml_obs_op_data/lin_reg/full_data_no_time')
linreg_drop, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train.drop(columns=['year_sin','year_cos']),
    X_test.drop(columns=['year_sin','year_cos']),
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    save_predictions=True,
    pad=pad,
)
fig

Drop the forest related features (but keep time related feature)

In [None]:
# Remove every column whose name ends in 'Forest'; the time features stay in.
X_train_no_forest = X_train.loc[:, ~X_train.columns.str.endswith('Forest')]
X_test_no_forest = X_test.loc[:, ~X_test.columns.str.endswith('Forest')]

pad = Path('data/ml_obs_op_data/lin_reg/full_data_no_forest')
linreg_drop_forest, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train_no_forest, X_test_no_forest,
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    save_predictions=True,
    pad=pad,
)
fig

Conclusion on normalisation: not really necessary in this case, basically no difference in performance! 

In [None]:
# One-row table with the fitted coefficient of each feature of the first
# (un-normalised) linear regression, in column order.
coef_dict = dict(zip(X_train.columns.to_list(), linreg.coef_))
pd_coef = pd.DataFrame(coef_dict, index=[0])
pd_coef

Clearly a lot of weight being given to the sinus feature!

Now also try with the dataset with less features: only the lumped pasture and agriculture

In [None]:
# Normalised linear regression on the smaller feature set.
linreg_small, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train_small, X_test_small,
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
)
#ax.plot(X_full.index, X_full['year_sin']*150+310, label = 'Sine wave')
ax.legend()
fig

Worse performance than on the more full set

Dropping time

In [None]:
# Smaller feature set without the seasonal features.
# NOTE: this rebinds `linreg_small`, replacing the model of the previous cell.
linreg_small, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train_small.drop(columns=['year_sin','year_cos']),
    X_test_small.drop(columns=['year_sin','year_cos']),
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
)
ax.set_title('Normalised Linear Regression: no time features')
fig

So clearly a lot of dependence on the sinus feature to get decent results!

Just an experiment on 08/05/2022: principal component regression (does not work well)

In [None]:
from sklearn.decomposition import PCA

# Principal component regression: PCA with `n_components = 0.9`, followed by
# ordinary least squares on the retained components.
pcregr_pipe = make_pipeline(PCA(n_components=0.9), LinearRegression())
pcreg_norm, r2_train, r2_test, fig, ax = general_sklearn_model(
    pcregr_pipe,
    X_train, X_test,
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
)
fig

### several input timesteps, 1 output



In [None]:
# Window model: `seq_length` is forwarded to the helper, and the feature
# tables that still contain `delta_t` are used.
seq_length = 5
linreg_window, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train_dt, X_test_dt,
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
    seq_length=seq_length,
)
ax.set_title("Window LinearRegression on max # features")
fig

So linear regression quite clearly overfits on the window data => idea of trying ridge regression

Also try on the smaller dataset

In [None]:
# Window linear regression on the smaller feature set (with `delta_t`),
# reusing `seq_length` from the previous window cell.
linreg_window_small, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train_small_dt, X_test_small_dt,
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
    seq_length=seq_length,
)
ax.set_title("Window Linear Regression on the lumped dataset")

So worse performance by including more timesteps! 

### Combine several models in one picture

In [None]:
# Four-panel overview of the linear-regression variants; each call draws into
# the axis it is given.
fig, axes = plt.subplots(4, 1, figsize=(9,9), constrained_layout=True)

# (a) all features, normalised
linreg_norm, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train, X_test,
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
    fig=fig, ax=axes[0],
)
axes[0].set_title('(a)')
axes[0].set_xlabel('')

# (b) without the seasonal features
linreg_drop, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train.drop(columns=['year_sin','year_cos']),
    X_test.drop(columns=['year_sin','year_cos']),
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    fig=fig, ax=axes[1],
)
axes[1].set_title('(b)')
axes[1].set_xlabel('')

# (c) without the forest features
linreg_drop_forest, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train_no_forest, X_test_no_forest,
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    fig=fig, ax=axes[2],
)
axes[2].set_title('(c)')
axes[2].set_xlabel('')

# (d) window model
# NOTE(review): the stand-alone window cell feeds X_train_dt / X_test_dt
# (with `delta_t`), whereas this panel uses X_train / X_test. Confirm which
# input is intended for the figure.
linreg_window, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train, X_test,
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
    seq_length=seq_length,
    fig=fig, ax=axes[3],
)
axes[3].set_title('(d)')
axes[3].set_xlabel('Time')

# Save the figure (the folder is created when missing).
figpad = Path('Figures/Figures_chapter_ML_obs_op')
figpad.mkdir(parents=True, exist_ok=True)
fig.savefig(figpad / 'Cstar_pred_LR.pdf', format='pdf', bbox_inches='tight')
fig

# Ridge and Lasso regression

L2 regularisation. Well explained in https://scikit-learn.org/stable/modules/linear_model.html#ridge-regression-and-classification 

Also, L1 regularisation can be tried with Lasso regression

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html automatically selects the best hyperparameter $\alpha$ (often called $\lambda$) for regularisation by applying cross-validation. cv = 5 is 5-fold cross-validation within the training set

https://reader.elsevier.com/reader/sd/pii/S0020025511006773?token=CDA1B56A3E63442F4A577A86F3FE9F199758615BFC990A1BA04E59DF8C8A4DDC652ED7FE300E09F449B8B984D40426C5&originRegion=eu-west-1&originCreation=20230510164928  Stresses to NOT shuffle when analysing time series!

Note: scikit-learn solves a slightly different objective function than the one used in the dissertation description. To convert to thesis notation, multiply the current alpha value of Lasso by $2n_{samples}$ (from training)

### 1 input 1 output

In [None]:
# 5-fold cross-validation on contiguous folds. Shuffling is switched off on
# purpose (changed on 10/05) because the samples form a time series.
kf = KFold(n_splits = 5, shuffle = False)
n_train = X_train.shape[0]
ridge = RidgeCV(alphas = np.logspace(-3,3,100), cv = kf)
lasso = LassoCV(alphas = np.logspace(-3,3,100), cv = kf)
names = ["ridge","lasso"]
for i, model in enumerate([ridge, lasso]):
    # Fix: the returned model used to be assigned to `ridge`, which left
    # `ridge` pointing at the Lasso fit after the loop.
    fitted_model, r2_train, r2_test, fig, ax = general_sklearn_model(
        model, X_train, X_test, y_train,
        y_test, X_train.index, X_test.index, Cstar
    )
    # `model.alpha_` is the regularisation strength selected by the CV.
    ax.set_title(names[i] + r' with $\alpha = $' +  f'{model.alpha_}')

So quite interesting: in regularisation, it excludes VHPasture and LAIAgriculture!

Worse than linear regression: both for Lasso and Ridge (but Lasso performs slightly better)

Include normalisation

In [None]:
# Fresh CV estimators, now fitted with normalisation enabled. The returned
# models are stored per name for the weight inspection in later cells.
ridge = RidgeCV(alphas = np.logspace(-3,3,100), cv = kf)
lasso = LassoCV(alphas = np.logspace(-3,3,100), cv = kf)
models_norm_dict = {}
for i, model in enumerate([ridge, lasso]):
    models_norm_dict[names[i]], r2_train, r2_test, fig, ax = general_sklearn_model(
        model, X_train.values, X_test.values, y_train.values.reshape(-1,1),
        y_test.values.reshape(-1,1), X_train.index, X_test.index, Cstar, normalisation = True
    )
    # Fix: a space was missing before "with", so the title read "ridgewith".
    ax.set_title('Normalised ' + names[i] + r' with $\alpha = $' +  f'{model.alpha_}')
    display(fig)

In [None]:
# Convert the selected scikit-learn alpha to the thesis notation (factor 2 * n_train).
alpha_lasso = models_norm_dict['lasso'].alpha_*2*n_train
print(alpha_lasso)

In [None]:
# One-row table of the normalised Lasso coefficients, labelled by feature.
lasso_weights_pd = pd.DataFrame(
    models_norm_dict['lasso'].coef_.reshape(1, -1),
    columns=X_full.columns,
)
lasso_weights_pd

Lasso regularisation sets VH pasture and LAI Agriculture to zero! 

~~Slightly better performance with normalisation. Lasso normalisation has highest test score thus far. Shuffle the data despite being normally against the rules here.~~

Drop time features

In [None]:
# Ridge and Lasso with CV-selected alpha, fitted without the seasonal features.
ridge = RidgeCV(alphas=np.logspace(-3,3,100), cv=kf)
lasso = LassoCV(alphas=np.logspace(-3,3,100), cv=kf)
models_no_DOY_dict = {}
for i, model in enumerate([ridge, lasso]):
    models_no_DOY_dict[names[i]], r2_train, r2_test, fig, ax = general_sklearn_model(
        model,
        X_train.drop(columns=['year_sin','year_cos']),
        X_test.drop(columns=['year_sin','year_cos']),
        y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
        X_train.index, X_test.index,
        Cstar,
        normalisation=True,
    )
    ax.set_title('Normalised ' + names[i] + r' no time features with $\alpha = $' +  f'{model.alpha_}')
    display(fig)

In [None]:
# One-row table of the Lasso coefficients of the model fitted without the
# seasonal features. The labels are selected by name, so they stay correct
# even when 'year_sin' / 'year_cos' are not the last two columns (the previous
# positional slice `[0:-2]` relied on that ordering).
lasso_weights_noDOY_pd = pd.DataFrame(
    models_no_DOY_dict['lasso'].coef_.reshape(1,-1),
    columns = X_full.columns.drop(['year_sin','year_cos'])
)
lasso_weights_noDOY_pd

So now VH agriculture is set to 0 for Lasso

In [None]:
# Ridge and Lasso with CV-selected alpha on the feature set without the
# forest columns.
ridge = RidgeCV(alphas = np.logspace(-3,3,100), cv = kf)
lasso = LassoCV(alphas = np.logspace(-3,3,100), cv = kf)
models_no_forest_dict = {}
for i, model in enumerate([ridge, lasso]):
    models_no_forest_dict[names[i]], r2_train, r2_test, fig, ax = general_sklearn_model(
        model, X_train_no_forest, X_test_no_forest,
        y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
        X_train.index, X_test.index, Cstar, normalisation = True #normalisation = slightly better performance
    )
    # The earlier hard-coded 'Normalised RidgeCV drop forest' title was dead
    # code (immediately overwritten, and wrong for Lasso); it has been removed.
    ax.set_title('Normalised ' + names[i] + r' drop forest with $\alpha = $' +  f'{model.alpha_}')
    display(fig)

In [None]:
# One-row table of the Lasso coefficients of the no-forest model.
lasso_weights_no_forest = pd.DataFrame(
    models_no_forest_dict['lasso'].coef_.reshape(1, -1),
    columns=X_train_no_forest.columns,
)
lasso_weights_no_forest

So now it sets LAI agriculture to 0!

### Multiple inputs, 1 output

In [None]:
# Window ridge regression on the features without `delta_t`.
# NOTE(review): `ridge` is the RidgeCV instance created in the no-forest cell.
# It is presumably re-fitted in place here; confirm this aliasing is harmless.
seq_length = 5 #vs. 30 on the slides for Niko 05/04
ridge_window, r2_train, r2_test, fig, ax = general_sklearn_model(
    ridge,
    X_train, X_test,
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
    seq_length=seq_length,
)
ax.set_title(r"Window Ridge regression: $\alpha = $" +f"{ridge_window.alpha_}")
display(fig)

In [None]:
# Feature names (after `delta_t` was dropped) used to label the weight tables.
X_full.columns

In [None]:
# Window ridge regression on the `delta_t` tables, without the seasonal features.
seq_length = 5 #Adapted to 5 on 08/04/2023 instead of 30
ridge_window, r2_train, r2_test, fig, ax = general_sklearn_model(
    ridge,
    X_train_dt.drop(columns=['year_sin','year_cos']),
    X_test_dt.drop(columns=['year_sin','year_cos']),
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
    seq_length=seq_length,
)
ax.set_title('Ridge window no time features')
display(fig)

In [None]:
# Window Lasso regression. `return_predictions = True` makes the helper
# return two extra values, kept as `lasso_tau5_train` / `lasso_tau5_test`
# for the final figure.
seq_length = 5 #vs. 30 on the slides for Niko 05/04
(lasso_window, r2_train, r2_test, fig, ax,
 lasso_tau5_train, lasso_tau5_test) = general_sklearn_model(
    lasso,
    X_train, X_test,
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
    seq_length=seq_length,
    return_predictions=True,
)
ax.set_title(r"Lasso regression: $\alpha = $" + f"{lasso_window.alpha_}")
display(fig)

In [None]:
# Regularisation strength selected by the cross-validation of the window Lasso.
lasso_window.alpha_

In [None]:
# Arrange the window-Lasso coefficients as a (seq_length x feature) table and
# count how many of them are (numerically) zero.
# NOTE(review): assumes the flattened coefficients are ordered timestep-major,
# i.e. one row per timestep in the window. Confirm against reshape_data.
lasso_weights_tau_5 = pd.DataFrame(
    lasso_window.coef_.reshape(seq_length, -1),
    columns=X_full.columns,
)
display(lasso_weights_tau_5)
print(np.sum(np.isclose(lasso_weights_tau_5.values, 0)))

So interesting: sets for sin and cos nearly all values to zero!

### Cross validation for best model structure

Idea for the full dataset:
- per model (so Ridge, Lasso and normal Linear regression) find the best set of hyperparameters:
    - sequence length of input: 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60
    - forest or no forest
    - time or no time
    - range of alpha values (include 0 = linear regression is included!)
Normalisation is applied to the inputs to avoid problems with the training of the algorithms 

In [None]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning

# Silence the convergence/user warnings emitted while sweeping the grid.
simplefilter("ignore", category=ConvergenceWarning)
simplefilter("ignore", category=UserWarning)

# ---- search space ---------------------------------------------------------
nr_folds = 4
n_train = X_train.shape[0]
# alpha = 0 is appended so that plain linear regression is part of the grid.
alpha_range = np.concatenate([np.logspace(-3,3,100),np.array([0])])
range_seq_length = np.array([1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60])#2**np.arange(1,7)#np.arange(1,100,10)
range_data_size = ['large','small']
range_forest = [True, False]
range_time_goniometr = [True, False]
model_names = ['Ridge', 'Lasso']
nr_options = (
    len(range_seq_length) * len(range_forest) * len(range_time_goniometr)
    * len(alpha_range) * len(model_names) * len(range_data_size)
)
# NOTE(review): `data_size` is looped over but not stored in the results, so
# rows from the 'large' and 'small' feature sets cannot be told apart
# afterwards. Adding a column would change the schema of the cached CSV.
col_names = ['model','seq_length','forest_bool','time_bool','alpha','r2_val_mean','r2_val_sd']
pd_hyperparam = pd.DataFrame(columns=col_names, index = range(0,nr_options))

# ---- normalisation: scalers are fitted on the training period only ---------
X_scaler = StandardScaler()
X_scaler.fit(X_train)
X_full_norm = pd.DataFrame(X_scaler.transform(X_full), columns = X_full.columns)
X_scaler_small = StandardScaler()
X_scaler_small.fit(X_train_small)
X_full_small_norm = pd.DataFrame(X_scaler_small.transform(X_full_small), columns = X_full_small.columns)
y_scaler = StandardScaler()
y_scaler.fit(y_train.values.reshape(-1,1))
y_full_norm = y_scaler.transform(y_full.values.reshape(-1,1))

if exec_hyperopt_tuning:
    # Row counter. It was previously called `iter`, which shadowed the builtin
    # of the same name for every later cell in the kernel.
    n_done = 0
    # Contiguous folds, no shuffling (time series). Hoisted out of the loops.
    kf = KFold(nr_folds, shuffle = False)
    for seq_length in range_seq_length:
        for data_size in range_data_size:
            for time_goniometr in range_time_goniometr:
                for forest in range_forest:
                    # Select and trim the (already normalised) feature table.
                    if data_size == 'large':
                        X_temp = X_full_norm.copy()
                    else:
                        X_temp = X_full_small_norm.copy()
                    if not time_goniometr:
                        X_temp = X_temp.drop(['year_sin','year_cos'],axis = 1)
                    # The forest switch only applies to the large set; for the
                    # small set both `forest` values give the same features.
                    if data_size == 'large' and not forest:
                        X_temp = X_temp.loc[:,~X_temp.columns.str.endswith('Forest')]

                    # Build the windowed samples and keep the training part.
                    X_window, y_window, t_window = reshape_data(
                        X_temp.values,y_full_norm,
                        X_full.index.values, seq_length
                    )
                    (X_window_train, X_window_test, y_window_train, y_window_test,
                    t_window_train, t_window_test) = reshaped_to_train_test(
                        X_window, y_window, t_window, seq_length, n_train, output_dim = 2
                    )
                    # The fold indices depend only on the window configuration,
                    # so compute them once instead of per (alpha, model).
                    fold_indices = list(kf.split(X_window_train))

                    for alpha in alpha_range:
                        for model_name in model_names:
                            r2_val_list = []
                            for train_index, test_index in fold_indices:
                                # A fresh estimator per fold: the fits run as
                                # independent dask tasks.
                                if model_name == 'Lasso':
                                    model = Lasso(alpha = alpha)
                                else:
                                    model = Ridge(alpha = alpha)
                                r2_val_list.append(
                                    dask.delayed(general_sklearn_model)(
                                        model, X_window_train[train_index], X_window_train[test_index],
                                        y_window_train[train_index],y_window_train[test_index],
                                        t_window_train[train_index], t_window_train[test_index], Cstar,
                                        print_output = False
                                    )
                                )
                            fold_results = dask.compute(*r2_val_list)
                            # Element 2 of the helper's return value is the
                            # score on the held-out part (here: validation fold).
                            r2_vals = [fold_result[2] for fold_result in fold_results]
                            pd_hyperparam.iloc[n_done,:] = [model_name,seq_length, forest, time_goniometr,
                                                        alpha, np.mean(r2_vals),np.std(r2_vals)]
                            n_done += 1
                            if n_done%100 == 0:
                                print(f'Iteration {n_done} out of {nr_options} completed')

In [None]:
# Cache handling for the grid search: load the stored table unless the search
# was just re-run, in which case the fresh results are written to disk.
if not exec_hyperopt_tuning:
    pd_hyperparam = pd.read_csv("data/ml_obs_op_data/lin_reg_hyperparam_cv.csv")
else:
    pd_hyperparam.to_csv("data/ml_obs_op_data/lin_reg_hyperparam_cv.csv", index=False)

In [None]:
# Number of rows in the hyperparameter table, followed by a preview.
print(len(pd_hyperparam))
pd_hyperparam.head(10)

In [None]:
# Row(s) with the highest mean validation R2 over the folds.
hyperparam_best = pd_hyperparam[
    pd_hyperparam['r2_val_mean'] == pd_hyperparam['r2_val_mean'].max()
]
hyperparam_best

Retrain a model with the above information on the entire training set!

In [None]:
# Retrain the best configuration from the CV table on the full training set.
# NOTE(review): the CV table does not record whether a row came from the
# 'large' or 'small' feature set; the retraining below always uses the large one.
model = hyperparam_best['model'].values[0]
alpha = hyperparam_best['alpha'].values[0]
seq_length = hyperparam_best['seq_length'].values[0]
time_goniometr = hyperparam_best['time_bool'].values[0]
forest = hyperparam_best['forest_bool'].values[0]

# Rebuild the feature selection that belongs to this configuration.
X_temp_train = X_train.copy()
X_temp_test = X_test.copy()
if not time_goniometr:
    X_temp_train = X_temp_train.drop(['year_sin','year_cos'],axis = 1)
    X_temp_test = X_temp_test.drop(['year_sin','year_cos'],axis = 1)
if not forest:
    X_temp_train = X_temp_train.loc[:,~X_temp_train.columns.str.endswith('Forest')]
    X_temp_test = X_temp_test.loc[:,~X_temp_test.columns.str.endswith('Forest')]

# Turn the model name into an estimator. An unknown name now fails loudly
# (as in the later no-time retraining cell) instead of passing the raw string on.
if model == 'Ridge':
    model = Ridge(alpha = alpha)
elif model == 'Lasso':
    model = Lasso(alpha = alpha)
else:
    raise ValueError('model should be Lasso or Ridge')
out_dict = general_sklearn_model(
    model, X_temp_train, X_temp_test, y_train.values.reshape(-1,1),y_test.values.reshape(-1,1), X_train.index, X_test.index, Cstar, normalisation = True, seq_length = seq_length
)
# Element 3 of the returned tuple is the figure.
display(out_dict[3])

So ridge regression, even after optimisation of hyperparameters, does not perform better than a simple linear regression on all features with a window-based approach. Note that in fact a simple linear regression without the window-based approach outperforms this approach on test data!

In [None]:
# Ten configurations with the highest mean validation R2.
pd_hyperparam.sort_values('r2_val_mean',ascending=False).head(10)

In [None]:
# Restrict the CV table to the Lasso rows and list the ten best of them.
pd_hyperparam_lasso = pd_hyperparam[pd_hyperparam['model'] == 'Lasso']
pd_hyperparam_lasso.sort_values('r2_val_mean', ascending=False).head(10)

In [None]:
# Best Lasso row of the CV table, retrained on the full training set.
hyperparam_lasso_best = pd_hyperparam_lasso[
    pd_hyperparam_lasso['r2_val_mean'] == pd_hyperparam_lasso['r2_val_mean'].max()
]
model = hyperparam_lasso_best['model'].values[0]
alpha = hyperparam_lasso_best['alpha'].values[0]
seq_length = hyperparam_lasso_best['seq_length'].values[0]
time_goniometr = hyperparam_lasso_best['time_bool'].values[0]
forest = hyperparam_lasso_best['forest_bool'].values[0]

# Feature selection that belongs to this configuration.
X_temp_train = X_train.copy()
X_temp_test = X_test.copy()
if not time_goniometr:
    X_temp_train = X_temp_train.drop(columns=['year_sin','year_cos'])
    X_temp_test = X_temp_test.drop(columns=['year_sin','year_cos'])
if not forest:
    X_temp_train = X_temp_train.loc[:, ~X_temp_train.columns.str.endswith('Forest')]
    X_temp_test = X_temp_test.loc[:, ~X_temp_test.columns.str.endswith('Forest')]

# Replace the model name by the corresponding estimator.
if model == 'Ridge':
    model = Ridge(alpha=alpha)
elif model == 'Lasso':
    model = Lasso(alpha=alpha)

out_dict = general_sklearn_model(
    model,
    X_temp_train, X_temp_test,
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
    seq_length=seq_length,
)
# Show the figure (element 3) and then the fitted model (element 0).
display(out_dict[3])
display(out_dict[0])

In [None]:
# Configurations without the seasonal features, best validation score first.
# `== False` is kept on purpose: it also works when the column has object dtype.
no_time_sorted = (
    pd_hyperparam[pd_hyperparam['time_bool'] == False]
    .sort_values('r2_val_mean', ascending=False)
)
no_time_sorted.head(5)

In [None]:
# Best no-time configuration as a one-row frame (rebinds `hyperparam_best`).
hyperparam_best = pd.DataFrame(no_time_sorted.iloc[0,:].values.reshape(1,-1), columns = no_time_sorted.columns)
hyperparam_best

In [None]:
# Retrain the best configuration without seasonal features on the full
# training set and store its predictions.
hyperparam_best = pd.DataFrame(
    no_time_sorted.iloc[0, :].values.reshape(1, -1),
    columns=no_time_sorted.columns,
)
model = hyperparam_best['model'].values[0]
alpha = hyperparam_best['alpha'].values[0]
seq_length = hyperparam_best['seq_length'].values[0]
time_goniometr = hyperparam_best['time_bool'].values[0]
forest = hyperparam_best['forest_bool'].values[0]

# Feature selection that belongs to this configuration.
X_temp_train = X_train.copy()
X_temp_test = X_test.copy()
if not time_goniometr:
    X_temp_train = X_temp_train.drop(columns=['year_sin','year_cos'])
    X_temp_test = X_temp_test.drop(columns=['year_sin','year_cos'])
if not forest:
    X_temp_train = X_temp_train.loc[:, ~X_temp_train.columns.str.endswith('Forest')]
    X_temp_test = X_temp_test.loc[:, ~X_temp_test.columns.str.endswith('Forest')]

# Replace the model name by the corresponding estimator.
if model == 'Ridge':
    model = Ridge(alpha=alpha)
elif model == 'Lasso':
    model = Lasso(alpha=alpha)
else:
    raise ValueError('model should be Lasso or Ridge')

#save output
pad = Path('data/ml_obs_op_data/ridge/window')
ridge_w_nt, r2_train, r2_test, fig, ax = general_sklearn_model(
    model,
    X_temp_train, X_temp_test,
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
    seq_length=seq_length,
    save_predictions=True,
    pad=pad,
)
ax.set_title('Ridge window regression')
display(fig)

Best test performance so far!

In [None]:
# Shape of the coefficient array of the retrained window model.
ridge_w_nt.coef_.shape

In [None]:
# Total number of fitted parameters: all coefficients plus the intercept(s).
# `np.size` counts elements regardless of whether `coef_` is 1-D or (1, n),
# and also works when `intercept_` is a scalar (where `len` would fail).
print(f'numer of parameterrs for window length of {seq_length}: {np.size(ridge_w_nt.coef_) + np.size(ridge_w_nt.intercept_)}')

Repeat the experiment of no time for Lasso!

In [None]:
# Best Lasso configuration without seasonal features: retrain it on the full
# training set, store the predictions and keep them for the final figure.
no_time_sorted_lasso = (
    pd_hyperparam_lasso[pd_hyperparam_lasso['time_bool'] == False]
    .sort_values('r2_val_mean', ascending=False)
)
display(no_time_sorted_lasso.head(5))

# The seasonal features are always dropped here; forest features are dropped
# only when the best row says so.
X_temp_train = X_train.copy().drop(columns=['year_sin','year_cos'])
X_temp_test = X_test.copy().drop(columns=['year_sin','year_cos'])
if not no_time_sorted_lasso['forest_bool'].iloc[0]:
    X_temp_train = X_temp_train.loc[:, ~X_temp_train.columns.str.endswith('Forest')]
    X_temp_test = X_temp_test.loc[:, ~X_temp_test.columns.str.endswith('Forest')]

model = Lasso(alpha=no_time_sorted_lasso['alpha'].iloc[0])
pad = Path('data/ml_obs_op_data/lasso/window')
(lasso_w_nt, r2_train, r2_test, fig, ax,
 lasso_nt_train, lasso_nt_test) = general_sklearn_model(
    model,
    X_temp_train, X_temp_test,
    y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
    seq_length=no_time_sorted_lasso['seq_length'].iloc[0],
    save_predictions=True,
    pad=pad,
    return_predictions=True,
)
display(fig)

In [None]:
# Coefficients of the retrained no-time Lasso as a (seq_length x feature) table.
# The labels are derived from the same selection rules as the training data:
# seasonal features removed by name, and forest features removed when the
# chosen configuration excluded them. The previous `X_full.columns[0:-2]`
# always kept the forest columns and relied on the seasonal features being
# the last two columns.
lasso_cv_columns = X_full.columns.drop(['year_sin','year_cos'])
if not no_time_sorted_lasso['forest_bool'].iloc[0]:
    lasso_cv_columns = lasso_cv_columns[~lasso_cv_columns.str.endswith('Forest')]
lasso_weights_cv = pd.DataFrame(
    lasso_w_nt.coef_.reshape(no_time_sorted_lasso['seq_length'].iloc[0],-1),
    columns = lasso_cv_columns
)
lasso_weights_cv.style.background_gradient(cmap = 'coolwarm')

In [None]:
# Number of coefficients that are exactly zero.
np.sum(lasso_weights_cv.values == 0)

So conclusion:
- Simple linear regression on all features, 1 timestep: good performance
- Lasso regression on all features, 1 timestep: even slightly better performance
- cross validation: does not yield a better performance
- idea: for window training, drop the time info (sin and cos) to prevent overfitting on it! This results in the best test performance thus far when using ridge with seq length of 30

Now plot the two models that will be used in the dissertation:

In [None]:
# Shape of the stored training predictions of the window Lasso.
lasso_tau5_train.values.shape

In [None]:
# Two-panel dissertation figure: (a) window Lasso with seq_length = 5,
# (b) cross-validated Lasso without seasonal features.
# Requires `figpad` from the combined linear-regression figure cell.
fig, axes = plt.subplots(2, 1, figsize=(9,5), constrained_layout=True)

plot_Cstar_model(
    lasso_tau5_train.values, lasso_tau5_test.values,
    lasso_tau5_train.index, lasso_tau5_test.index,
    Cstar, X_full.index, fig, axes[0],
)
axes[0].set_xlabel('')
axes[0].set_title('(a)')

plot_Cstar_model(
    lasso_nt_train.values, lasso_nt_test.values,
    lasso_nt_train.index, lasso_nt_test.index,
    Cstar, X_full.index, fig, axes[1],
)
axes[1].set_title('(b)')
axes[1].set_xlabel('Time')

display(fig)
fig.savefig(figpad / 'Cstar_pred_lasso.pdf', format='pdf', bbox_inches='tight')

# Support Vector regression

https://scikit-learn.org/stable/modules/svm.html#svm-regression  

## Linear Kernel

In [None]:
# Output folder for all SVR results (later cells dump/load joblib files here)
pad = Path('data/ml_obs_op_data/SVR')
# Idempotent creation: no separate existence check, also creates missing parents
pad.mkdir(parents=True, exist_ok=True)

# Full dataset
# Linear-kernel SVR with C and epsilon set explicitly to 1 and 0.1
# (the scikit-learn defaults for these two parameters; the linear kernel is a choice, not a default)
svr = SVR(kernel = 'linear', C = 1, epsilon = 0.1)
# general_sklearn_model returns (model, r2_train, r2_test, fig, ax)
svr_lin, r2_train, r2_test, fig, ax = general_sklearn_model(svr, X_train, X_test, y_train.values.reshape(-1, 1), y_test.values.reshape(-1,1), X_train.index, X_test.index, Cstar, normalisation = True)
ax.set_title('Linear SVR on full trainig set')
fig

Now exclude time features

In [None]:
# Same linear SVR, refitted after removing the two time features
# (year_sin / year_cos) from both the training and the test inputs.
svr_lin_nt, r2_train, r2_test, fig, ax = general_sklearn_model(
    svr,
    X_train.drop(columns=['year_sin','year_cos']),
    X_test.drop(columns=['year_sin','year_cos']),
    y_train.values.reshape(-1, 1),
    y_test.values.reshape(-1, 1),
    X_train.index,
    X_test.index,
    Cstar,
    normalisation=True,
)
ax.set_title('Linear SVR on full trainig set without time')
fig

Performs worse than ordinary linear regression.


$C$ and $\epsilon$ still have to be optimised, ideally through cross-validation.

In [None]:
# Linear-kernel SVR wrapped in an exhaustive grid search over C and epsilon
# (14 x 20 candidates), scored on R2.
svr = SVR(kernel='linear')
svr_gs = GridSearchCV(
    svr,
    param_grid={
        'C': np.logspace(-10, 3, 14),
        'epsilon': np.logspace(-3, 1, 20),
    },
    scoring='r2',
    cv=5,  # 5 fold CV without shuffling
    n_jobs=-1,
)

# Either reuse the cached search result or run the search and cache it
if not exec_hyperopt_tuning:
    svr_gs_tuple_out = joblib.load(pad/'svr_optim_linear.joblib')
else:
    svr_gs_tuple_out = general_sklearn_model(
        svr_gs,
        X_train,
        X_test,
        y_train.values.reshape(-1, 1),
        y_test.values.reshape(-1, 1),
        X_train.index,
        X_test.index,
        Cstar,
        normalisation=True,
    )
    joblib.dump(svr_gs_tuple_out, pad/'svr_optim_linear.joblib')

# Result layout: (fitted search, r2_train, r2_test, fig, ax)
svr_gs_out, r2_train, r2_test, fig, ax = svr_gs_tuple_out[:5]

# Put the selected hyperparameters in the figure title
c = svr_gs_out.best_estimator_.C
epsilon = svr_gs_out.best_estimator_.epsilon
ax.set_title(r'Optimised linear SVR: $\epsilon$ = '+ f'{epsilon}' + r', $C$ = ' + f'{c}')

# Scores and figure are only echoed here when the result came from the cache
if not exec_hyperopt_tuning:
    print(r2_train, r2_test, sep='\n')
    display(fig)

In [None]:
# Show the estimator selected by the linear-kernel grid search
svr_gs_out.best_estimator_

## Non-linear: RBF kernel

Really analogous to the work of: https://ieeexplore.ieee.org/document/9451176 

In [None]:
# Baseline RBF-kernel SVR with fixed C and epsilon; gamma='auto' means gamma = 1/n_features
svr_rbf = SVR(kernel = 'rbf', C = 1, epsilon = 0.1, gamma ='auto')
# general_sklearn_model returns (model, r2_train, r2_test, fig, ax)
svr_rbf_out, r2_train, r2_test, fig, ax = general_sklearn_model(svr_rbf, X_train, X_test, y_train.values.reshape(-1, 1), y_test.values.reshape(-1,1), X_train.index, X_test.index, Cstar, normalisation = True)
ax.set_title('SVR RBF full dataset')
# Report the value 'auto' resolves to: one over the number of feature columns
# of the training matrix (not one over the number of training samples).
print(f'gamma: {1/X_train.shape[1]}')
display(svr_rbf)

In [None]:
# Baseline RBF SVR again, this time without the year_sin / year_cos inputs
svr_rbf_out, r2_train, r2_test, fig, ax = general_sklearn_model(
    svr_rbf,
    X_train.drop(columns=['year_sin','year_cos']),
    X_test.drop(columns=['year_sin','year_cos']),
    y_train.values.reshape(-1, 1),
    y_test.values.reshape(-1, 1),
    X_train.index,
    X_test.index,
    Cstar,
    normalisation=True,
)
ax.set_title('SVR RBF full dataset no time features')

Again perform a grid search to determine the optimal values of $C$, $\epsilon$ and $\gamma$

In [None]:
# RBF-kernel SVR with a full grid search over C, epsilon and gamma
# (14 x 20 x 50 candidates), scored on R2.
svr_rbf = SVR(kernel='rbf')
svr_gs_rbf = GridSearchCV(
    svr_rbf,
    param_grid={
        'C': np.logspace(-10, 3, 14),
        'epsilon': np.logspace(-3, 1, 20),
        'gamma': np.logspace(-5, 5, 50),
    },
    scoring='r2',
    cv=5,  # 5 fold CV without shuffling
    n_jobs=-1,
    verbose=3,
)

# Either reuse the cached search result or run the search and cache it
if not exec_hyperopt_tuning:
    svr_gs_tuple_out = joblib.load(pad/'svr_optim_rbf.joblib')
else:
    svr_gs_tuple_out = general_sklearn_model(
        svr_gs_rbf,
        X_train,
        X_test,
        y_train.values.reshape(-1, 1),
        y_test.values.reshape(-1, 1),
        X_train.index,
        X_test.index,
        Cstar,
        normalisation=True,
    )
    joblib.dump(svr_gs_tuple_out, pad/'svr_optim_rbf.joblib')

# Result layout: (fitted search, r2_train, r2_test, fig, ax)
svr_gs_rbf_out, r2_train, r2_test, fig, ax = svr_gs_tuple_out[:5]

# Put the selected hyperparameters in the figure title
gamma = svr_gs_rbf_out.best_estimator_.gamma
c = svr_gs_rbf_out.best_estimator_.C
epsilon = svr_gs_rbf_out.best_estimator_.epsilon
ax.set_title(r'Optimised RBF SVR: $\epsilon$ = '+ f'{epsilon}' + r', $C$ = ' + f'{c}' + r', $\gamma$ = ' + f'{gamma}')

# Scores and figure are only echoed here when the result came from the cache
if not exec_hyperopt_tuning:
    print(r2_train, r2_test, sep='\n')
    display(fig)

Apply the idea of Rains et al. here: use only VV (+ orbit info) from the small dataset, combined with LAI

In [None]:
# RBF SVR with epsilon fixed at 0.1; only C and gamma are searched, on the
# small feature set with year_sin, year_cos and VH_past_agr removed.
svr_rbf_rains = SVR(kernel='rbf', epsilon=0.1)
svr_gs_rbf_rains = GridSearchCV(
    svr_rbf_rains,
    param_grid={
        'C': [0.001, 0.01, 0.1, 1, 10, 20, 30, 40, 50, 100],
        'gamma': np.logspace(-3, 1, 5),
    },
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=3,
)

# Either reuse the cached search result or run the search and cache it
if not exec_hyperopt_tuning:
    svr_gs_tuple_out = joblib.load(pad/'svr_optim_rbf_rains.joblib')
else:
    svr_gs_tuple_out = general_sklearn_model(
        svr_gs_rbf_rains,
        X_train_small.drop(columns=['year_sin','year_cos','VH_past_agr']),
        X_test_small.drop(columns=['year_sin','year_cos','VH_past_agr']),
        y_train.values.reshape(-1, 1),
        y_test.values.reshape(-1, 1),
        X_train.index,
        X_test.index,
        Cstar,
        normalisation=True,
    )
    # Disabled alternative kept for reference: identical call, but dropping only
    # ['VH_past_agr'] so that year_sin / year_cos stay in the inputs.
    joblib.dump(svr_gs_tuple_out, pad/'svr_optim_rbf_rains.joblib')

# Result layout: (fitted search, r2_train, r2_test, fig, ax)
svr_gs_rbf_out, r2_train, r2_test, fig, ax = svr_gs_tuple_out[:5]

# Put the selected hyperparameters in the figure title
gamma = svr_gs_rbf_out.best_estimator_.gamma
c = svr_gs_rbf_out.best_estimator_.C
epsilon = svr_gs_rbf_out.best_estimator_.epsilon
ax.set_title(r'Optimised RBF SVR: $\epsilon$ = '+ f'{epsilon}' + r', $C$ = ' + f'{c}' + r', $\gamma$ = ' + f'{gamma}')

# Scores are only printed when the result came from the cache; the figure is always shown
if not exec_hyperopt_tuning:
    print(r2_train, r2_test, sep='\n')
display(fig)

Switch it around to forward observation operator idea: on small data set use C* + LAI + DOY -> gamma_0_VV 

In [None]:
# NOTE: disabled experiment, kept for reference only (nothing in this cell runs).
# Sketch of a forward operator: target is the 'VV_past_agr' column of X_full_small,
# inputs are the remaining X_full_small columns merged with Cstar on 't' (minus the
# VV/VH, year_sin/year_cos and ascending columns), fitted with an RBF SVR grid search.
# target_gamma0_VV = X_full_small['VV_past_agr']
# Cstar_forward = Cstar.reset_index()
# Cstar_forward = Cstar_forward.rename(columns = {'Time':'t'})
# Cstar_forward = Cstar_forward.set_index('t')
# features_forward = pd.merge(X_full_small, Cstar_forward, how = 'left', on = 't')
# features_forward = features_forward.drop(['VV_past_agr','VH_past_agr'], axis = 1)
# features_forward_small = features_forward.drop(['year_sin','year_cos','ascending'], axis = 1)
# svr_rbf_test = SVR(kernel = 'rbf', epsilon = 0.1)
# svr_gs_rbf_test = GridSearchCV(svr_rbf_test, param_grid = {
#     'C':np.logspace(-10,3,14),
#     'gamma':np.logspace(-5,5,50)
# }, scoring = 'r2', cv = 5, n_jobs = -1, verbose = 3
# )
# svr_test_out = svr_gs_rbf_test.fit(features_forward_small, target_gamma0_VV.values.flatten())

In [None]:
# Disabled along with the forward-operator experiment: would show its best CV score
# svr_test_out.best_score_

New idea of 08/05: keep epsilon at 0.1 in hyperparameter tuning (as done by Rains!)

In [None]:
# RBF SVR with epsilon held at 0.1; the grid search covers C and gamma only
# (14 x 50 candidates), scored on R2.
svr_rbf_eps = SVR(kernel='rbf', epsilon=0.1)
svr_gs_rbf_eps = GridSearchCV(
    svr_rbf_eps,
    param_grid={
        'C': np.logspace(-10, 3, 14),
        'gamma': np.logspace(-5, 5, 50),
    },
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=3,
)

# Either reuse the cached search result or run the search and cache it
if not exec_hyperopt_tuning:
    svr_gs_tuple_out = joblib.load(pad/'svr_optim_rbf_eps.joblib')
else:
    svr_gs_tuple_out = general_sklearn_model(
        svr_gs_rbf_eps,
        X_train,
        X_test,
        y_train.values.reshape(-1, 1),
        y_test.values.reshape(-1, 1),
        X_train.index,
        X_test.index,
        Cstar,
        normalisation=True,
    )
    joblib.dump(svr_gs_tuple_out, pad/'svr_optim_rbf_eps.joblib')

# Result layout: (fitted search, r2_train, r2_test, fig, ax)
svr_gs_rbf_eps_out, r2_train, r2_test, fig, ax = svr_gs_tuple_out[:5]

# Put the selected hyperparameters in the figure title
gamma = svr_gs_rbf_eps_out.best_estimator_.gamma
c = svr_gs_rbf_eps_out.best_estimator_.C
epsilon = svr_gs_rbf_eps_out.best_estimator_.epsilon
ax.set_title(r'Optimised RBF SVR: $\epsilon$ = '+ f'{epsilon}' + r', $C$ = ' + f'{c}' + r', $\gamma$ = ' + f'{gamma}')

# Scores are only printed when the result came from the cache; the figure is always shown
if not exec_hyperopt_tuning:
    print(r2_train, r2_test, sep='\n')
display(fig)

Better performance than the previous experiment where $\epsilon$ could vary, so this configuration is the one that is used!

For comparison, this approach is now also tried once without the time features

In [None]:
# Same fixed-epsilon RBF grid search as the previous cell, but fitted without
# the year_sin / year_cos inputs.
svr_rbf_eps = SVR(kernel = 'rbf', epsilon = 0.1)
svr_gs_rbf_eps = GridSearchCV(svr_rbf_eps, param_grid = {
    'C':np.logspace(-10,3,14),
    'gamma':np.logspace(-5,5,50)
}, scoring = 'r2', cv = 5, n_jobs = -1, verbose = 3
)
no_time_cache = pad/'svr_optim_rbf_eps_no_time.joblib'
# Respect the notebook-wide exec_hyperopt_tuning switch instead of overwriting it
# in this cell: forcing it to True re-ran the 700-candidate search on every
# execution and changed the flag for any later cell that checks it.
# The search is only forced when there is no cached result to load.
run_search = exec_hyperopt_tuning or not no_time_cache.exists()
if run_search:
    svr_gs_tuple_out = general_sklearn_model(
        svr_gs_rbf_eps, X_train.drop(['year_sin','year_cos'],axis = 1), X_test.drop(['year_sin','year_cos'],axis =1), y_train.values.reshape(-1,1), y_test.values.reshape(-1,1),X_train.index, X_test.index,Cstar,normalisation = True
    )
    joblib.dump(svr_gs_tuple_out,no_time_cache)
else:
    svr_gs_tuple_out = joblib.load(no_time_cache)
# Result layout: (fitted search, r2_train, r2_test, fig, ax)
svr_gs_rbf_eps_out = svr_gs_tuple_out[0]
r2_train = svr_gs_tuple_out[1]
r2_test = svr_gs_tuple_out[2]
fig = svr_gs_tuple_out[3]
ax = svr_gs_tuple_out[4]
# Put the selected hyperparameters in the figure title
epsilon  = svr_gs_rbf_eps_out.best_estimator_.epsilon
c = svr_gs_rbf_eps_out.best_estimator_.C
gamma = svr_gs_rbf_eps_out.best_estimator_.gamma
ax.set_title(r'Optimised RBF SVR, no DOY: $\epsilon$ = '+ f'{epsilon}' + r', $C$ = ' + f'{c}' + r', $\gamma$ = ' + f'{gamma}')
# Scores are only printed when the result came from the cache; the figure is always shown
if not run_search:
    print(r2_train)
    print(r2_test)
display(fig)

# Gaussian processes

Bayesian, non-linear model, also using the RBF kernel

In [None]:
# Output folder for the GPR predictions
pad = Path('data/ml_obs_op_data/GPR')
# Make sure the folder exists before predictions are saved into it
# (idempotent; mirrors the folder creation done for the SVR outputs)
pad.mkdir(parents=True, exist_ok=True)
font_size = 13
# Toggle between the presentation layout (svg, larger font) and the report layout (pdf)
presentation_plot = False
if presentation_plot:
    fig, ax = plt.subplots()
    plt.rcParams.update({'font.size': font_size})
else:
    fig, ax = plt.subplots(figsize = (9,4))
# RBF kernel plus a white-noise term, each with explicit bounds for the hyperparameter optimiser
kernel = RBF(length_scale_bounds=(1e-2,1e2)) + WhiteKernel(noise_level_bounds=(1e-1,1e3))
gpr = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=100, normalize_y=True, random_state = SEED)
# Feature scaling happens inside the pipeline, so the fitted pipeline can be applied to raw X_full below
gpr_pipe = make_pipeline(StandardScaler(),gpr)
# general_sklearn_model returns (model, r2_train, r2_test, fig, ax) and draws on the fig/ax passed in
gpr_pipe_out,r2_train,r2_test,fig,ax = general_sklearn_model(
    gpr_pipe, X_train, X_test, y_train.values.reshape(-1,1), y_test.values.reshape(-1,1), X_train.index, X_test.index, Cstar, #normalisation = True, 
    save_predictions =True, pad = pad, fig = fig, ax = ax
)
#Also ask the standard deviations! 
y_mean, y_std = gpr_pipe_out.predict(X_full, return_std = True)
# Shade mean +/- 2 standard deviations (labelled '95% CI' in the legend)
# NOTE(review): assumes y_mean and y_std are both 1-D; if the model was fitted on a
# column-vector target, y_mean may come back as (n, 1) — confirm
ax.fill_between(X_full.index, y_mean - 2*y_std, y_mean + 2*y_std, color = 'lightgrey')
# ax.set_title('GPR with RBF kernel: full dataset')
ax.set_xlabel('Time')
# Legend labels are positional, so they rely on the order in which the artists were drawn
ax.legend(['PDM','Train','Test','95% CI'],loc = 'lower left')
if presentation_plot:
    ax.set_title('Gaussian processes')
    fig.savefig(pad_pres/'gpr.svg',format = 'svg', transparent = True)
    display(fig)
    # Restore matplotlib defaults so the larger font does not leak into later figures
    plt.rcParams.update(matplotlib.rcParamsDefault)
else:
    ax.set_title('')
    fig.savefig(figpad/'gpr_predictions.pdf', format = 'pdf', bbox_inches = 'tight')
    display(fig)

In [None]:
# Show the kernel_ attribute of the second pipeline step (the GPR) after fitting
gpr_pipe_out[1].kernel_

In [None]:
# GPR pipeline refitted without the time features (year_sin / year_cos)
gpr_pipe_out, r2_train, r2_test, fig, ax = general_sklearn_model(
    gpr_pipe,
    X_train.drop(columns=['year_sin','year_cos']),
    X_test.drop(columns=['year_sin','year_cos']),
    y_train.values.reshape(-1, 1),
    y_test.values.reshape(-1, 1),
    X_train.index,
    X_test.index,
    Cstar,
    normalisation=True,
)
ax.set_title('GPR with RBF kernel: full dataset no time features')
# Show the figure followed by the kernel_ attribute of the GPR step
display(fig, gpr_pipe_out[1].kernel_)

In [None]:
# GPR pipeline refitted on the inputs that exclude the forest features
gpr_pipe_out, r2_train, r2_test, fig, ax = general_sklearn_model(
    gpr_pipe,
    X_train_no_forest,
    X_test_no_forest,
    y_train.values.reshape(-1, 1),
    y_test.values.reshape(-1, 1),
    X_train.index,
    X_test.index,
    Cstar,
    normalisation=True,
)
ax.set_title('GPR with RBF kernel: full dataset no forest features')
# Show the figure followed by the kernel_ attribute of the GPR step
display(fig, gpr_pipe_out[1].kernel_)

In [None]:
# GPR pipeline refitted on the small feature set
gpr_pipe_out, r2_train, r2_test, fig, ax = general_sklearn_model(
    gpr_pipe,
    X_train_small,
    X_test_small,
    y_train.values.reshape(-1, 1),
    y_test.values.reshape(-1, 1),
    X_train.index,
    X_test.index,
    Cstar,
    normalisation=True,
)
ax.set_title('GPR with RBF kernel: small dataset')
# Show the figure followed by the kernel_ attribute of the GPR step
display(fig, gpr_pipe_out[1].kernel_)

In [None]:
# GPR pipeline refitted on the small feature set without year_sin / year_cos
gpr_pipe_out, r2_train, r2_test, fig, ax = general_sklearn_model(
    gpr_pipe,
    X_train_small.drop(columns=['year_sin','year_cos']),
    X_test_small.drop(columns=['year_sin','year_cos']),
    y_train.values.reshape(-1, 1),
    y_test.values.reshape(-1, 1),
    X_train.index,
    X_test.index,
    Cstar,
    normalisation=True,
)
ax.set_title('GPR with RBF kernel: small dataset no DOY')
# Show the figure followed by the kernel_ attribute of the GPR step
display(fig, gpr_pipe_out[1].kernel_)

Note that the performance on the small dataset without time features is almost as bad as that of the linear methods!

In a notebook that has since been lost, the hyperparameters were also determined based on CV; this will not be repeated here (it did not yield superior results). The CV-based method is also mentioned in the book on GPs for ML

## An experiment: observation operator: from 