# Simple models for the observation operator

Before applying the complex LSTM model, we'll first check the performance of much simpler ML models: linear regression and ridge regression.

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import os
import dask
from sklearn.linear_model import (LinearRegression, Ridge, RidgeCV,
                                   LassoCV, Lasso)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
from pathlib import Path

# When the notebook is started from inside "ml_observation_operator", move one
# level up so that the relative paths used below (e.g. "data/...") and the
# `functions` package are resolved from the parent directory.
pad = Path(os.getcwd())
if pad.name == "ml_observation_operator":
    pad_correct = pad.parent
    os.chdir(pad_correct)
# Project-local helpers; presumably importable only from the directory selected
# above -- TODO confirm how `functions` ends up on sys.path.
from functions.PDM import PDM
from functions.pre_processing import reshape_data, reshaped_to_train_test
from functions.ml_utils import general_sklearn_model
# Single seed shared by the hash seed and NumPy's global random generator.
SEED = 1234

# NOTE(review): PYTHONHASHSEED set at runtime is only seen by child processes,
# not by the already running kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)

# True: run the hyperparameter grid search below and write its results to CSV.
# False: skip the search and read the previously stored CSV instead.
exec_hyperopt_tuning = True

# Reload edited project modules automatically before each cell execution.
%load_ext autoreload 
%autoreload 2 

In [None]:
# Sanity check: show the working directory after the optional chdir in the setup cell.
os.getcwd()

## Load in data

In [None]:
# Execute the project's data loading script through IPython's %run.
# NOTE(review): presumably this is where X_train_all / X_test_all (used further
# down but never assigned in this notebook) come from -- TODO confirm.
%run "ml_observation_operator/data_load_in.py"

In [None]:
# Folder holding the pickled, ML-ready datasets.
ML_data_pad = Path("data/Zwalm_data/ML_data")

# All candidate features, the train/test/full targets and the Cstar series.
# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
X_full_all, y_train, y_test, y_full, Cstar = (
    pd.read_pickle(ML_data_pad / pkl_name)
    for pkl_name in (
        "X_full_all.pkl",
        "y_train.pkl",
        "y_test.pkl",
        "y_full.pkl",
        "Cstar.pkl",
    )
)


Full set of possible features: Forest, Pasture, Agriculture and a combination of pasture and agriculture (`past_agr`).

In [None]:
# Pairwise (Pearson, the pandas default) correlation between all candidate
# features, rendered as a colour-coded table.
features_corr = X_full_all.corr(method="pearson")
features_corr.style.background_gradient(cmap="coolwarm")

Load in both full and the smaller dataset

In [None]:
# Full feature set (train / test / complete period).
X_train, X_test, X_full = (
    pd.read_pickle(ML_data_pad / pkl_name)
    for pkl_name in ("X_train.pkl", "X_test.pkl", "X_full.pkl")
)
display(X_full.head())

# Smaller feature set (train / test / complete period).
X_train_small, X_test_small, X_full_small = (
    pd.read_pickle(ML_data_pad / pkl_name)
    for pkl_name in ("X_train_small.pkl", "X_test_small.pkl", "X_full_small.pkl")
)
display(X_full_small.head())

In [None]:
# NOTE(review): X_train_all / X_test_all are not assigned anywhere in this
# notebook; they presumably come from the %run data loading script -- confirm.

# Full set: keep every column except the lumped pasture+agriculture ones.
X_train = X_train_all.loc[:, ~X_train_all.columns.str.endswith('past_agr')]
X_test = X_test_all.loc[:, ~X_test_all.columns.str.endswith('past_agr')]
X_full = X_full_all.loc[:, ~X_full_all.columns.str.endswith('past_agr')]

# Small set: drop the columns that belong to a single land-use class.
land_use_suffixes = ('Forest', 'Pasture', 'Agriculture')
X_train_small = X_train_all.loc[:, ~X_train_all.columns.str.endswith(land_use_suffixes)]
X_test_small = X_test_all.loc[:, ~X_test_all.columns.str.endswith(land_use_suffixes)]
X_full_small = X_full_all.loc[:, ~X_full_all.columns.str.endswith(land_use_suffixes)]
display(X_full_small.head())

24/03/2023: drop the `delta_t` feature as an experiment (it cannot lead to better performance as long as multiple timesteps are not included). The test score went from 0.7 to around 0.74.

In [None]:
# Remove the `delta_t` column from every feature set.
# NOTE: this rebinds the same names, so re-running the cell raises a KeyError.
X_train = X_train.drop(columns='delta_t')
X_test = X_test.drop(columns='delta_t')
X_full = X_full.drop(columns='delta_t')

X_train_small = X_train_small.drop(columns='delta_t')
X_test_small = X_test_small.drop(columns='delta_t')
X_full_small = X_full_small.drop(columns='delta_t')

## Linear regression

The idea of using linear regression as observation operator was already applied in Aubert et al. (cf. [obsidian](C:\Users\olivi\Documents\ob_obsidian\DA\Aubert_SM_DA_in_conceptual_model.md) and https://www.sciencedirect.com/science/article/pii/S0022169403002294?via%3Dihub )

Include forest in the equation

https://en.wikipedia.org/wiki/Coefficient_of_determination#Adjusted_R2 

Calculated adjusted $R^2$ (=$\bar{R}^2$) from regular $R^2$ as:
$$
{\displaystyle {\bar {R}}^{2}=1-(1-R^{2}){n-1 \over n-p}}
$$

with $n$ the number of samples (observations) and $p$ the number of fitted parameters (including the intercept)

### 1 input, 1 output

Very basic idea: 1 input timestep to 1 output timestep without normalisation

In [None]:
# Baseline: ordinary least squares on the full feature set, no normalisation.
linreg, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train, X_test,
    y_train, y_test,
    X_train.index, X_test.index,
    Cstar,
)

Add normalisation to input and output

In [None]:
# Same model with `normalisation = True`; the targets are passed as 2-D
# column arrays.
linreg_norm, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train, X_test,
    y_train.values.reshape(-1, 1), y_test.values.reshape(-1, 1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
)

Drop the sin and cos feature

In [None]:
# Linear regression without the yearly sine/cosine features.
time_features = ['year_sin', 'year_cos']
linreg_drop, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train.drop(columns=time_features),
    X_test.drop(columns=time_features),
    y_train.values.reshape(-1, 1),
    y_test.values.reshape(-1, 1),
    X_train.index, X_test.index,
    Cstar,
)
                                                

Drop the forest related features (but keep time related feature)

In [None]:
# Inspection only: show the type of the index of the training features.
type(X_train.index)

In [None]:
# Inspection only: union of the train and test indices (the result is not stored).
X_train.index.union(X_test.index)

In [None]:
# Feature sets without the columns whose name ends in 'Forest'; the time
# features are kept.
forest_cols_train = X_train.columns.str.endswith('Forest')
forest_cols_test = X_test.columns.str.endswith('Forest')
X_train_no_forest = X_train.loc[:, ~forest_cols_train]
X_test_no_forest = X_test.loc[:, ~forest_cols_test]

linreg_drop_forest, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train_no_forest, X_test_no_forest,
    y_train.values.reshape(-1, 1), y_test.values.reshape(-1, 1),
    X_train.index, X_test.index,
    Cstar,
)

Not sure how to calculate R2 adjusted on test data

Conclusion on normalisation: not really necessary in this case, basically no difference in performance! 

In [None]:
# One-row table with the fitted coefficient of every feature of `linreg`.
# np.ravel makes this work both for a 1-D `coef_` (1-D target) and for the
# (1, n_features) `coef_` that scikit-learn stores when the target is 2-D.
coef_values = np.ravel(linreg.coef_)
feature_names = X_train.columns.to_list()
# Guard against silently pairing coefficients with the wrong feature names.
assert len(coef_values) == len(feature_names), "coef_ does not match the feature columns"
coef_dict = dict(zip(feature_names, coef_values))
pd_coef = pd.DataFrame(coef_dict, index=[0])
pd_coef

Clearly a lot of weight is being given to the sine feature!

Now also try with the dataset with less features: only the lumped pasture and agriculture

In [None]:
# Linear regression on the smaller (lumped) feature set, no normalisation.
linreg_small, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train_small, X_test_small,
    y_train, y_test,
    X_train.index, X_test.index,
    Cstar,
)
# Optional overlay of the yearly sine feature, left disabled:
# ax.plot(X_full.index, X_full['year_sin']*150+310, label = 'Sine wave')
ax.legend()

Worse performance than on the more full set

In [None]:
# Smaller feature set again, now with `normalisation = True`.
# NOTE: rebinds `linreg_small` from the previous cell.
linreg_small, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train_small, X_test_small,
    y_train.values.reshape(-1, 1), y_test.values.reshape(-1, 1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
)

Dropping time

In [None]:
# Smaller feature set without the yearly sine/cosine features, normalised.
# NOTE: rebinds `linreg_small` once more.
time_features = ['year_sin', 'year_cos']
linreg_small, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_train_small.drop(columns=time_features),
    X_test_small.drop(columns=time_features),
    y_train.values.reshape(-1, 1),
    y_test.values.reshape(-1, 1),
    X_train.index, X_test.index,
    Cstar,
    normalisation=True,
)
ax.set_title('Normalised Linear Regression: no time features')

So clearly the model depends heavily on the sine feature to get decent results!

### several input timesteps, 1 output



In [None]:
# Number of consecutive input timesteps handed to `reshape_data`.
seq_length = 5
X_window, y_window, t_window = reshape_data(
    X_full.values,
    y_full.values.reshape(-1, 1),
    X_full.index.values,
    seq_length,
)
print(X_window.shape)
print(t_window.shape)

# The number of training rows marks where the windowed data is split.
n_train = X_train.shape[0]
window_split = reshaped_to_train_test(
    X_window, y_window, t_window, seq_length, n_train, 2
)
(X_window_train, X_window_test, y_window_train, y_window_test,
 t_window_train, t_window_test) = window_split

print(X_window_train.shape)
print(y_window_train.shape)
print(X_window_test.shape)
print(y_window_test.shape)

# NOTE(review): the train time axis is `t_window_train` but the test time axis
# is `X_test.index` rather than `t_window_test` -- confirm this is intended.
linreg_window, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_window_train, X_window_test,
    y_window_train, y_window_test,
    t_window_train, X_test.index,
    Cstar,
)
ax.set_title("Window LinearRegression on max # features")

So linear regression quite clearly overfits on the window data => idea of trying ridge regression

Also try on the smaller dataset

In [None]:
# Inspection only: display the smaller feature set that is windowed in the next cell.
X_full_small

In [None]:
# Windowed data from the smaller feature set; `seq_length` is reused from the
# previous windowing cell.
X_window, y_window, t_window = reshape_data(
    X_full_small.values,
    y_full.values.reshape(-1, 1),
    X_full_small.index.values,
    seq_length,
)
n_train = X_train.shape[0]
window_split = reshaped_to_train_test(
    X_window, y_window, t_window, seq_length, n_train, 2
)
(X_window_train, X_window_test, y_window_train, y_window_test,
 t_window_train, t_window_test) = window_split

# NOTE(review): the test time axis is `X_test.index`, not `t_window_test` --
# confirm this is intended.
linreg_window_small, r2_train, r2_test, fig, ax = general_sklearn_model(
    LinearRegression(),
    X_window_train, X_window_test,
    y_window_train, y_window_test,
    t_window_train, X_test.index,
    Cstar,
)
ax.set_title("Window Linear Regression on the lumped dataset")

So worse performance by including more timesteps! 

## Ridge and Lasso regression

Ridge regression uses L2 regularisation. Well explained in https://scikit-learn.org/stable/modules/linear_model.html#ridge-regression-and-classification 

L1 regularisation can also be tried, with Lasso regression.

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html automatically selects the best hyperparameter $\alpha$ (often called $\lambda$) for regularisation by applying cross-validation. `cv = 5` is 5-fold cross-validation within the training set.

### 1 input 1 output

In [None]:
# Contiguous, unshuffled folds, as the original comment ("prevent shuffling")
# intended. The rows are consecutive timesteps, so shuffled folds would put
# neighbouring samples in both the fitting and the validation part.
kf = KFold(n_splits=5, shuffle=False)
# alpha is selected by cross-validation within the training set.
ridge = RidgeCV(alphas=np.logspace(-3, 3, 100), cv=kf)
lasso = LassoCV(alphas=np.logspace(-3, 3, 100), cv=kf)
names = ["ridge", "lasso"]
for name, model in zip(names, [ridge, lasso]):
    # Neutral result name: binding the result to `ridge` would leave `ridge`
    # pointing at the lasso result after the second iteration.
    fitted_model, r2_train, r2_test, fig, ax = general_sklearn_model(
        model, X_train, X_test, y_train,
        y_test, X_train.index, X_test.index, Cstar
    )
    ax.set_title(name + r' with $\alpha = $' + f'{model.alpha_}')

Worse than linear regression, both for Lasso and Ridge (but Lasso performs slightly better).

Include normalisation

In [None]:
# Fresh (unfitted) estimators; `kf` is the KFold splitter from the previous cell.
ridge = RidgeCV(alphas=np.logspace(-3, 3, 100), cv=kf)
lasso = LassoCV(alphas=np.logspace(-3, 3, 100), cv=kf)
models_norm = {}
for name, model in (("ridge", ridge), ("lasso", lasso)):
    models_norm[name], r2_train, r2_test, fig, ax = general_sklearn_model(
        model, X_train.values, X_test.values, y_train.values.reshape(-1, 1),
        y_test.values.reshape(-1, 1), X_train.index, X_test.index, Cstar,
        normalisation=True
    )
    # Leading space before "with" was missing, giving e.g. "ridgewith".
    ax.set_title('Normalised ' + name + r' with $\alpha = $' + f'{model.alpha_}')
# Bind each result to a name that matches its model; previously `ridge_norm`
# ended up holding the lasso result of the last iteration.
ridge_norm = models_norm["ridge"]
lasso_norm = models_norm["lasso"]

Slightly better performance with normalisation. Normalised Lasso has the highest test score thus far.

Drop time features

In [None]:
# Normalised RidgeCV / LassoCV without the yearly sine/cosine features.
ridge = RidgeCV(alphas=np.logspace(-3, 3, 100), cv=kf)
lasso = LassoCV(alphas=np.logspace(-3, 3, 100), cv=kf)
time_features = ['year_sin', 'year_cos']
models_drop = {}
for name, model in (("ridge", ridge), ("lasso", lasso)):
    models_drop[name], r2_train, r2_test, fig, ax = general_sklearn_model(
        model,
        X_train.drop(time_features, axis=1),
        X_test.drop(time_features, axis=1),
        y_train.values.reshape(-1, 1),
        y_test.values.reshape(-1, 1),
        X_train.index, X_test.index, Cstar, normalisation=True
    )
    ax.set_title('Normalised ' + name + r' no time features with $\alpha = $' + f'{model.alpha_}')
# Previously `ridge_drop` was overwritten by the lasso result in the second
# iteration; keep one correctly named result per model instead.
ridge_drop = models_drop["ridge"]
lasso_drop = models_drop["lasso"]

In [None]:
# Normalised RidgeCV / LassoCV on the feature set without the 'Forest' columns.
ridge = RidgeCV(alphas=np.logspace(-3, 3, 100), cv=kf)
lasso = LassoCV(alphas=np.logspace(-3, 3, 100), cv=kf)
models_drop_forest = {}
for name, model in (("ridge", ridge), ("lasso", lasso)):
    # Results get their own names: the original bound them to
    # `linreg_drop_forest`, clobbering the linear regression fitted earlier.
    models_drop_forest[name], r2_train, r2_test, fig, ax = general_sklearn_model(
        model, X_train_no_forest, X_test_no_forest,
        y_train.values.reshape(-1, 1), y_test.values.reshape(-1, 1),
        X_train.index, X_test.index, Cstar,
        normalisation=True  # normalisation = slightly better performance
    )
    # The earlier 'Normalised RidgeCV drop forest' title was overwritten
    # immediately by this one, so it has been removed.
    ax.set_title('Normalised ' + name + r' drop forest with $\alpha = $' + f'{model.alpha_}')
ridge_drop_forest = models_drop_forest["ridge"]
lasso_drop_forest = models_drop_forest["lasso"]

### Multiple inputs, 1 output

In [None]:
# Fit the current `ridge` estimator on whatever windowed train/test arrays are
# in scope (X_window_train etc. come from the most recent windowing cell).
# NOTE(review): the test time axis is `X_test.index`, not `t_window_test`.
ridge_window, r2_train, r2_test, fig, ax = general_sklearn_model(
    ridge,
    X_window_train, X_window_test,
    y_window_train, y_window_test,
    t_window_train, X_test.index,
    Cstar,
)
ax.set_title("Window Ridge regression")

In [None]:
# Inspection only: list the feature names of the full feature set.
X_full.columns

In [None]:
# Longer window, built from the full feature set without the yearly
# sine/cosine features.
seq_length = 30
time_features = ['year_sin', 'year_cos']
X_window, y_window, t_window = reshape_data(
    X_full.drop(columns=time_features).values,
    y_full.values.reshape(-1, 1),
    X_full.index.values,
    seq_length,
)
print(X_window.shape)
print(t_window.shape)

n_train = X_train.shape[0]
window_split = reshaped_to_train_test(
    X_window, y_window, t_window, seq_length, n_train, 2
)
(X_window_train, X_window_test, y_window_train, y_window_test,
 t_window_train, t_window_test) = window_split

In [None]:
# Inspection only: show the estimator object that the next cell passes to the helper.
ridge

In [None]:
# Ridge on the windowed data without time features built two cells above.
# NOTE(review): the test time axis is `X_test.index`, not `t_window_test`.
ridge_window_drop, r2_train, r2_test, fig, ax = general_sklearn_model(
    ridge,
    X_window_train, X_window_test,
    y_window_train, y_window_test,
    t_window_train, X_test.index,
    Cstar,
)
ax.set_title('Ridge window no time features')

In [None]:
# Inspection only: shape of the coefficient array of the windowed ridge model.
ridge_window_drop.coef_.shape

### Cross validation for best model structure

Idea for the full dataset:
- per model (so Ridge, Lasso and normal Linear regression) find the best set of hyperparameters:
    - sequence length of input: 1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60
    - forest or no forest
    - time or no time
    - range of alpha values (0 is included, which corresponds to plain linear regression)

Normalisation is applied to the inputs to avoid problems with the training of the algorithms.

In [None]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
# Silence the convergence and user warnings raised while sweeping the grid.
simplefilter("ignore", category=ConvergenceWarning)
simplefilter("ignore", category=UserWarning)

# --- Search grid -----------------------------------------------------------
nr_folds = 4
n_train = X_train.shape[0]
# alpha = 0 is appended so that the unregularised fit is part of the grid.
alpha_range = np.concatenate([np.logspace(-3, 3, 100), np.array([0])])
range_seq_length = np.array([1, 2, 3, 4, 5, 10, 20, 30, 40, 50, 60])
range_forest = [True, False]
range_time_goniometr = [True, False]
model_names = ['Ridge', 'Lasso']
nr_options = (len(range_seq_length) * len(range_forest) * len(range_time_goniometr)
              * len(alpha_range) * len(model_names))
col_names = ['model', 'seq_length', 'forest_bool', 'time_bool', 'alpha', 'r2_val_mean', 'r2_val_sd']
# Placeholder; replaced by the collected results below (or by the stored CSV
# in the next cell when the search is skipped).
pd_hyperparam = pd.DataFrame(columns=col_names)
# NOTE: `iter` shadows the Python builtin; the name is kept because a later
# cell displays it.
iter = 0

# --- Normalisation (statistics fitted on the training period only) ----------
X_scaler = StandardScaler()
X_scaler.fit(X_train)
# No index is passed, so X_full_norm gets a positional 0..N-1 index.
X_full_norm = pd.DataFrame(X_scaler.transform(X_full), columns=X_full.columns)
y_scaler = StandardScaler()
y_scaler.fit(y_train.values.reshape(-1, 1))
y_full_norm = y_scaler.transform(y_full.values.reshape(-1, 1))

if exec_hyperopt_tuning:
    # Unshuffled folds; the splitter has no state, so one instance is reused.
    kf = KFold(nr_folds, shuffle=False)
    # One record per grid point; the DataFrame is built once at the end
    # instead of assigning row by row into a pre-allocated object frame.
    hyperparam_records = []
    for seq_length in range_seq_length:
        for forest in range_forest:
            for time_goniometr in range_time_goniometr:
                X_temp = X_full_norm.copy()
                if not time_goniometr:
                    X_temp = X_temp.drop(['year_sin', 'year_cos'], axis=1)
                if not forest:
                    X_temp = X_temp.loc[:, ~X_temp.columns.str.endswith('Forest')]
                X_window, y_window, t_window = reshape_data(
                    X_temp.values, y_full_norm,
                    X_full.index.values, seq_length
                )
                (X_window_train, X_window_test, y_window_train, y_window_test,
                 t_window_train, t_window_test) = reshaped_to_train_test(
                    X_window, y_window, t_window, seq_length, n_train, output_dim=2
                )
                for alpha in alpha_range:
                    for model_name in model_names:
                        # One lazy fit per fold, each with a fresh estimator;
                        # the folds of one grid point are computed together.
                        delayed_folds = [
                            dask.delayed(general_sklearn_model)(
                                Lasso(alpha=alpha) if model_name == 'Lasso' else Ridge(alpha=alpha),
                                X_window_train[train_index], X_window_train[test_index],
                                y_window_train[train_index], y_window_train[test_index],
                                t_window_train[train_index], t_window_train[test_index], Cstar,
                                print_output=False
                            )
                            for train_index, test_index in kf.split(X_window_train)
                        ]
                        fold_results = dask.compute(*delayed_folds)
                        # Element 2 of the helper's return tuple is the R2 on
                        # the second (here: validation) data set.
                        r2_vals = [fold_result[2] for fold_result in fold_results]
                        hyperparam_records.append(
                            [model_name, seq_length, forest, time_goniometr,
                             alpha, np.mean(r2_vals), np.std(r2_vals)]
                        )
                        iter = iter + 1
                        if iter % 100 == 0:
                            print(f'Iteration {iter} out of {nr_options} completed')
    pd_hyperparam = pd.DataFrame(hyperparam_records, columns=col_names)

In [None]:
# Single path for both writing and reading the grid-search results. The read
# branch previously pointed at "lin_rge_hyperparam_cv.csv" (typo), so the
# stored results could never be loaded.
hyperparam_csv = Path("data/ml_obs_op_data/lin_reg_hyperparam_cv.csv")
if exec_hyperopt_tuning:
    pd_hyperparam.to_csv(hyperparam_csv, index=False)
else:
    pd_hyperparam = pd.read_csv(hyperparam_csv)

In [None]:
# Inspection only: first ten rows of the grid-search results.
pd_hyperparam.head(10)

In [None]:
# Inspection only: counter set in the grid-search cell (stays 0 when the search
# was skipped). NOTE: the name shadows the builtin `iter`.
iter 

In [None]:
# Row(s) with the highest mean validation R2 (all tied rows are kept).
best_r2_val_mean = pd_hyperparam['r2_val_mean'].max()
is_best_row = pd_hyperparam['r2_val_mean'] == best_r2_val_mean
hyperparam_best = pd_hyperparam.loc[is_best_row]
hyperparam_best

Retrain a model with the above information on the entire training set!

In [None]:
# Settings of the best cross-validated configuration (first row on ties).
model_name = hyperparam_best['model'].values[0]
alpha = hyperparam_best['alpha'].values[0]
seq_length = hyperparam_best['seq_length'].values[0]
time_goniometr = hyperparam_best['time_bool'].values[0]
forest = hyperparam_best['forest_bool'].values[0]

# Rebuild the normalised feature set that belongs to this configuration.
X_temp = X_full_norm.copy()
if not time_goniometr:
    X_temp = X_temp.drop(['year_sin', 'year_cos'], axis=1)
if not forest:
    X_temp = X_temp.loc[:, ~X_temp.columns.str.endswith('Forest')]
# X_temp.index.values is handed over as the "time" axis; since X_full_norm was
# built from X_full, these values are used below as row positions in X_full.
X_window, y_window, t_window = reshape_data(
    X_temp.values, y_full_norm,
    X_temp.index.values, seq_length
)
(X_window_train, X_window_test, y_window_train, y_window_test,
 t_window_train, t_window_test) = reshaped_to_train_test(
    X_window, y_window, t_window, seq_length, n_train, output_dim=2
)

# Refit on the complete training set with the model type that actually won the
# grid search (the original always used Ridge, even for a Lasso row). A single
# fit with fixed alpha: no cross-validation happens here.
best_model = Lasso(alpha=alpha) if model_name == 'Lasso' else Ridge(alpha=alpha)
best_model.fit(X_window_train, y_window_train)
# reshape(-1, 1): Lasso returns 1-D predictions, Ridge 2-D for a 2-D target.
y_train_hat_norm = best_model.predict(X_window_train).reshape(-1, 1)
y_test_hat_norm = best_model.predict(X_window_test).reshape(-1, 1)
# Back to the original units for plotting.
y_train_hat = y_scaler.inverse_transform(y_train_hat_norm)
y_test_hat = y_scaler.inverse_transform(y_test_hat_norm)

fig, ax = plt.subplots()
Cstar.plot(ax=ax)
# Both periods are looked up in the index of X_full, the frame the positions
# refer to (previously X_train_all.index for train, X_full_all.index for test).
ax.plot(X_full.index[t_window_train], y_train_hat, label='Train')
ax.plot(X_full.index[t_window_test], y_test_hat, label='Test')
ax.legend()
ax.set_ylabel('C* [mm]')
# R2 in normalised space, scored on the predictions directly instead of after
# an inverse_transform / transform round trip.
r2_train = r2_score(y_window_train, y_train_hat_norm)
r2_test = r2_score(y_window_test, y_test_hat_norm)
print(f'trainig R2: {r2_train}')
print(f'test R2: {r2_test}')

So ridge regression, even after optimisation of the hyperparameters, does not perform better than a simple linear regression on all features with a window-based approach. Note that a simple linear regression without the window-based approach in fact outperforms this approach on test data!

In [None]:
# Ten grid points with the highest mean validation R2.
hyperparam_ranked = pd_hyperparam.sort_values(by='r2_val_mean', ascending=False)
hyperparam_ranked.head(10)

In [None]:
# Restrict the results to the Lasso rows and show their ten best grid points.
is_lasso_row = pd_hyperparam['model'] == 'Lasso'
pd_hyperparam_lasso = pd_hyperparam.loc[is_lasso_row]
pd_hyperparam_lasso.sort_values(by='r2_val_mean', ascending=False).head(10)

In [None]:
# Best Lasso configuration (all tied rows are kept; the first one is used).
hyperparam_lasso_best = pd_hyperparam_lasso[
    pd_hyperparam_lasso['r2_val_mean'].max() == pd_hyperparam_lasso['r2_val_mean']
]
alpha = hyperparam_lasso_best['alpha'].values[0]
seq_length = hyperparam_lasso_best['seq_length'].values[0]
time_goniometr = hyperparam_lasso_best['time_bool'].values[0]
forest = hyperparam_lasso_best['forest_bool'].values[0]

# Rebuild the normalised feature set that belongs to this configuration.
X_temp = X_full_norm.copy()
if not time_goniometr:
    X_temp = X_temp.drop(['year_sin', 'year_cos'], axis=1)
if not forest:
    X_temp = X_temp.loc[:, ~X_temp.columns.str.endswith('Forest')]
# X_temp.index.values is handed over as the "time" axis; since X_full_norm was
# built from X_full, these values are used below as row positions in X_full.
X_window, y_window, t_window = reshape_data(
    X_temp.values, y_full_norm,
    X_temp.index.values, seq_length
)
(X_window_train, X_window_test, y_window_train, y_window_test,
 t_window_train, t_window_test) = reshaped_to_train_test(
    X_window, y_window, t_window, seq_length, n_train, output_dim=2
)

# Single refit on the complete training set with fixed alpha (no
# cross-validation happens here).
lasso = Lasso(alpha=alpha)
lasso.fit(X_window_train, y_window_train)
# Lasso predictions are reshaped to a column for the scaler.
y_train_hat_norm = lasso.predict(X_window_train).reshape(-1, 1)
y_test_hat_norm = lasso.predict(X_window_test).reshape(-1, 1)
# Back to the original units for plotting.
y_train_hat = y_scaler.inverse_transform(y_train_hat_norm)
y_test_hat = y_scaler.inverse_transform(y_test_hat_norm)

fig, ax = plt.subplots()
Cstar.plot(ax=ax)
# Both periods are looked up in the index of X_full, the frame the positions
# refer to (previously X_train_all.index for train, X_full_all.index for test).
ax.plot(X_full.index[t_window_train], y_train_hat, label='Train')
ax.plot(X_full.index[t_window_test], y_test_hat, label='Test')
ax.legend()
ax.set_ylabel('C* [mm]')
# R2 in normalised space, scored on the predictions directly instead of after
# an inverse_transform / transform round trip.
r2_train = r2_score(y_window_train, y_train_hat_norm)
r2_test = r2_score(y_window_test, y_test_hat_norm)
print(f'trainig R2: {r2_train}')
print(f'test R2: {r2_test}')

In [None]:
# Grid points that did not use the time features, best validation R2 first.
# `== False` is kept on purpose: it also works on an object-dtype column.
is_no_time_row = pd_hyperparam['time_bool'] == False
no_time_sorted = pd_hyperparam[is_no_time_row].sort_values(by='r2_val_mean', ascending=False)
no_time_sorted.head(10)

In [None]:
# One-row frame holding the best configuration without time features.
best_no_time_values = no_time_sorted.iloc[0, :].values.reshape(1, -1)
hyperparam_best = pd.DataFrame(best_no_time_values, columns=no_time_sorted.columns)
hyperparam_best

In [None]:
# Best configuration among the grid points without time features.
hyperparam_best = pd.DataFrame(no_time_sorted.iloc[0, :].values.reshape(1, -1), columns=no_time_sorted.columns)
model_name = hyperparam_best['model'].values[0]
alpha = hyperparam_best['alpha'].values[0]
seq_length = hyperparam_best['seq_length'].values[0]
time_goniometr = hyperparam_best['time_bool'].values[0]
forest = hyperparam_best['forest_bool'].values[0]

# Rebuild the normalised feature set that belongs to this configuration.
X_temp = X_full_norm.copy()
if not time_goniometr:
    X_temp = X_temp.drop(['year_sin', 'year_cos'], axis=1)
if not forest:
    X_temp = X_temp.loc[:, ~X_temp.columns.str.endswith('Forest')]
# X_temp.index.values is handed over as the "time" axis; since X_full_norm was
# built from X_full, these values are used below as row positions in X_full.
X_window, y_window, t_window = reshape_data(
    X_temp.values, y_full_norm,
    X_temp.index.values, seq_length
)
(X_window_train, X_window_test, y_window_train, y_window_test,
 t_window_train, t_window_test) = reshaped_to_train_test(
    X_window, y_window, t_window, seq_length, n_train, output_dim=2
)

# Refit on the complete training set with the model type of the selected row
# (the original always used Ridge, even if the best row was a Lasso row).
# A single fit with fixed alpha: no cross-validation happens here.
best_model = Lasso(alpha=alpha) if model_name == 'Lasso' else Ridge(alpha=alpha)
best_model.fit(X_window_train, y_window_train)
# `ridge` stays bound because the following cells read ridge.coef_ and
# ridge.intercept_.
ridge = best_model
# reshape(-1, 1): Lasso returns 1-D predictions, Ridge 2-D for a 2-D target.
y_train_hat_norm = best_model.predict(X_window_train).reshape(-1, 1)
y_test_hat_norm = best_model.predict(X_window_test).reshape(-1, 1)
# Back to the original units for plotting.
y_train_hat = y_scaler.inverse_transform(y_train_hat_norm)
y_test_hat = y_scaler.inverse_transform(y_test_hat_norm)

fig, ax = plt.subplots()
Cstar.plot(ax=ax)
# Both periods are looked up in the index of X_full, the frame the positions
# refer to (previously X_train_all.index for train, X_full_all.index for test).
ax.plot(X_full.index[t_window_train], y_train_hat, label='Train')
ax.plot(X_full.index[t_window_test], y_test_hat, label='Test')
ax.legend()
ax.set_ylabel('C* [mm]')
# R2 in normalised space, scored on the predictions directly instead of after
# an inverse_transform / transform round trip.
r2_train = r2_score(y_window_train, y_train_hat_norm)
r2_test = r2_score(y_window_test, y_test_hat_norm)
print(f'trainig R2: {r2_train}')
print(f'test R2: {r2_test}')

Best test performance so far!

In [None]:
# Inspection only: shape of the coefficient array of the model refitted above.
ridge.coef_.shape

In [None]:
# Parameter count = largest dimension of the coefficient array plus the
# number of intercept terms.
n_coefficients = max(ridge.coef_.shape)
n_intercepts = len(ridge.intercept_)
print(f'numer of parameterrs for window length of {seq_length}: {n_coefficients + n_intercepts}')

So conclusion:
- Simple linear regression on all features, 1 timestep: good performance
- Lasso regression on all features, 1 timestep: even slightly better performance
- cross validation: does not yield a better performance
- idea: for window training, drop the time info (sin and cos) to prevent overfitting on it! This results in the best test performance thus far when using ridge with a sequence length of 30

## Support Vector regression

Skip as of now!

https://scikit-learn.org/stable/modules/svm.html#svm-regression


C and epsilon to be optimised => cross validation ideally

In [None]:
# Support vector regression on standardised inputs with fixed (untuned)
# C and epsilon.
# NOTE: despite the name `svr_nonlin`, the kernel used here is linear.
svr_nonlin = make_pipeline(StandardScaler(), SVR(kernel = 'linear', C = 100, epsilon = 0.2))
svr_nonlin.fit(X_train, y_train)
y_train_SVR = svr_nonlin.predict(X_train)
r2_SVR = r2_score(y_train, y_train_SVR)
print('trainig score SVR: ' + str(r2_SVR))

y_test_SVR = svr_nonlin.predict(X_test)
r2_SVR_test = r2_score(y_test, y_test_SVR)
# Label fixed: this value is the test score, but it was printed as a second
# training score.
print('test score SVR: ' + str(r2_SVR_test))

In [None]:
# Reference series together with the SVR predictions on the train and test
# periods, all drawn on the same axes through the explicit Axes interface.
fig, ax = plt.subplots()
y_full.plot(ax=ax, ylabel='C* [mm]', label='PDM')
ax.plot(X_train.index, y_train_SVR, label='Train', alpha=0.7)
ax.plot(X_test.index, y_test_SVR, label='Test', alpha=0.7)
ax.legend()
ax.set_title('SVR')