# Manoeuvring model bias-variance tradeoff


In [None]:
# %load imports.py

import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2
%reload_kedro
%config Completer.use_jedi = False  ## (To fix autocomplete)
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from src.models.vmm import ModelSimulator
import matplotlib.pyplot as plt
from src.visualization.plot import track_plots, plot, captive_plot
import kedro
import numpy as np
import os.path
import anyconfig

import matplotlib
matplotlib.rcParams["figure.figsize"] = (15,4)
from src.symbols import *

# Read configs:
# All configuration files live in the same folder, relative to this notebook.
conf_path = "../conf/base/"
runs_globals_path = os.path.join(conf_path, "runs_globals.yml")
join_globals_path = os.path.join(conf_path, "join_globals.yml")
globals_path = os.path.join(conf_path, "globals.yml")

# Run related settings:
runs_globals = anyconfig.load(runs_globals_path)
model_test_ids = runs_globals["model_test_ids"]
joins = runs_globals["joins"]

# Join related settings:
join_runs_dict = anyconfig.load(join_globals_path)

# Global settings:
global_variables = anyconfig.load(globals_path)
vmm_names = global_variables["vmms"]
# (regress/predict with only models from joined runs)
only_joined = global_variables["only_joined"]

In [None]:
from wPCC_pipeline.pipelines.motion_regression.nodes import fit_motions
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import seaborn as sns
from src.bias_variance_tradeoff import (train_test_split, 
                                        train_test_split_run, 
                                        train_test_split_exteme, 
                                        train_predict, 
                                        pivot_mean, variances)

In [None]:
# Name of the join whose smoothed data is analysed in this notebook.
join = "joined"

# Inputs for the motion regression, all read through the `catalog` object
# (presumably injected by `%reload_kedro` — confirm).
data = catalog.load(f"{ join }.data_ek_smooth")
added_masses = catalog.load("added_masses")
ship_data = catalog.load("ship_data")
exclude_parameters = catalog.load("params:motion_regression.exclude_parameters")

# Meta data of the runs, ordered by their description.
runs_meta_data = catalog.load("runs_meta_data")
runs_meta_data.sort_values(by='description', inplace=True)

# One catalog entry per configured model name.
vmms = {name: catalog.load(f"{name}") for name in vmm_names}


In [None]:
# Preview the first rows of the loaded data.
data.head()

In [None]:
# Preview the first rows of the run meta data (sorted by description above).
runs_meta_data.head()

In [None]:
# Plot yaw rate against drift angle, one line per selected run.
fig, ax = plt.subplots()
fig.set_size_inches(10, 5)

# Work on a copy so the derived plotting columns are not added to `data`.
data_ = data.copy()
# Both velocity components are read from the same frame (the original mixed
# `data_` and `data`, which only worked because `data_` is an exact copy).
data_['beta'] = -np.arctan2(data_['v'], data_['u'])
data_['beta_deg'] = np.rad2deg(data_['beta'])
data_['r_deg'] = np.rad2deg(data_['r'])
data_['delta_deg'] = np.rad2deg(data_['delta'])

# Keep only the runs that occur in the data, and for runs sharing the same
# description keep the last one.
mask = runs_meta_data.index.isin(data_['id'].unique())
runs_meta_data_selected = runs_meta_data.loc[mask].copy()
mask = runs_meta_data_selected.duplicated(subset='description', keep='last')
runs_meta_data_selected = runs_meta_data_selected.loc[~mask]
data_groups = data_.groupby(by='id')

# `run_id` instead of `id`, to avoid shadowing the builtin at notebook scope.
for run_id, meta_data in runs_meta_data_selected.iterrows():

    if run_id not in data_groups.groups:
        continue

    df_ = data_groups.get_group(run_id)
    # The row is already at hand; no second lookup in runs_meta_data needed.
    description = meta_data['description']
    df_.plot(x='beta_deg', y='r_deg', ax=ax, label=description)

ax.set_ylabel(r'$r$ (yaw rate) $[deg/s]$')
ax.set_xlabel(r'$\beta$ (drift angle) $[deg]$');


In [None]:
# Plot longitudinal speed against rudder angle, one line per selected run.
# Uses `runs_meta_data_selected` and `data_groups` from the previous cell.
fig, ax = plt.subplots()
fig.set_size_inches(10, 5)

# `run_id` instead of `id`, to avoid shadowing the builtin at notebook scope.
for run_id, meta_data in runs_meta_data_selected.iterrows():

    if run_id not in data_groups.groups:
        continue

    df_ = data_groups.get_group(run_id)
    # The row is already at hand; no second lookup in runs_meta_data needed.
    description = meta_data['description']
    df_.plot(x='delta_deg', y='u', ax=ax, label=description)

ax.set_ylabel(r'$u$ (longitudinal speed) $[m/s]$')
ax.set_xlabel(r'$\delta$ (rudder angle) $[deg]$');

In [None]:
# One figure per signal in the list, with one line per run.
for key in ['r1d']:

    fig, ax = plt.subplots()
    fig.set_size_inches(10, 5)

    # `run_id` instead of `id`, to avoid shadowing the builtin at notebook scope.
    for run_id, df_ in data.groupby(by='id'):
        df_.plot(y=key, ax=ax, label=run_id)

    ax.set_title(key)
    

In [None]:
# Fix the NumPy random state before the models are fitted.
np.random.seed(42)

# Inputs that are identical for every model.
common_inputs = dict(
    data=data,
    added_masses=added_masses,
    ship_data=ship_data,
    exclude_parameters=exclude_parameters,
)

# Fit each model with fit_motions and keep the regression object per model name.
regressions = {}
for vmm_name, vmm in vmms.items():
    regression, parameters = fit_motions(vmm=vmm, **common_inputs)
    regressions[vmm_name] = regression

In [None]:
# Pick the regression of one model for the single-model study below.
regression = regressions['vmm_martins_simple']

In [None]:
# Split the N-regression data with train_test_split_run, passing the run id
# of every sample.
X_train, y_train, X_test, y_test, train_data = train_test_split_run(
    X=regression.X_N,
    y=regression.y_N,
    id=data['id'],
)

# Show the train and test labels in one figure.
fig, ax = plt.subplots()
for series, label in ((y_train, 'train'), (y_test, 'test')):
    series.plot(ax=ax, label=label)
ax.legend()

Train many models on samples from the training set, then calculate bias and variance on the test set.

In [None]:
# Run train_predict on the training data with train_ratio=0.3 and keep the
# returned predictions for the test set.
df_sample_predictions = train_predict(X_test=X_test, y_test=y_test, train_data=train_data, train_ratio=0.3)

In [None]:
# Preview the first rows of the sample predictions.
df_sample_predictions.head()

In [None]:
# Residual between the prediction `y_hat` and the column `z`, and its square.
df_sample_predictions['residual'] = df_sample_predictions['y_hat'].sub(df_sample_predictions['z'])
df_sample_predictions['residual^2'] = df_sample_predictions['residual'].pow(2)

In [None]:
# Histogram of the residuals, using 100 bins.
df_sample_predictions['residual'].hist(bins=100)

In [None]:
# Apply pivot_mean to the `y_hat` predictions, transpose the result and preview it.
f_hats = pivot_mean(df_sample_predictions, key='y_hat').T
f_hats.head()

In [None]:
# Overlay a handful of individual predictions with f_hats and the test labels.
fig, ax = plt.subplots()

# Only plot the groups whose `i` is in 0..9 (presumably the training index —
# confirm against train_predict), to keep the figure readable.
mask = df_sample_predictions['i'].isin(np.arange(10))
for i, df_ in df_sample_predictions.loc[mask].groupby(by='i'):
    df_.plot(x='x', y='y_hat', alpha=0.7, ax=ax)

# Raw string: '\h' is an invalid escape sequence in a normal string literal
# (a warning today, an error in future Python versions).
f_hats.plot(ax=ax, label=r'$\hat{f}(x)$')
y_test.plot(ax=ax, label='y')
ax.legend()

In [None]:
# Repeat the train/predict study for every model, testing on the given runs.
ids = [22771, 22772, 22773]

# Collect the per-model frames in a list and concatenate once:
# DataFrame.append was removed in pandas 2.0.
sample_frames = []
for vmm_name, regression in regressions.items():

    # NOTE(review): the original passed `regression.X_X` together with
    # `regression.y_N`; every other cell pairs X_N with y_N, so the features
    # are taken from the N regression here as well — confirm.
    X_train, y_train, X_test, y_test, train_data = train_test_split_run(
        X=regression.X_N,
        y=regression.y_N,
        id=data['id'],
        ids=ids,
    )

    df_ = train_predict(train_data, X_test=X_test, y_test=y_test, train_ratio=0.2, N_trainings=10,)
    df_['vmm'] = vmm_name
    sample_frames.append(df_)

# The original index of every frame is kept, as with the previous append calls.
df_sample_predictions = pd.concat(sample_frames)
df_sample_predictions.sort_values(by=['parameters', 'x'], inplace=True)

# Residual between the prediction `y_hat` and the column `z`, and its square.
df_sample_predictions['residual'] = df_sample_predictions['y_hat'] - df_sample_predictions['z']
df_sample_predictions['residual^2'] = df_sample_predictions['residual']**2

In [None]:
# Group the predictions per model (keeping the order of appearance), apply
# pivot_mean to `y_hat` within each group and preview the transposed result.
vmm_groups = df_sample_predictions.groupby('vmm', sort=False)
f_hats = vmm_groups.apply(pivot_mean, key='y_hat').T
f_hats.head()

In [None]:
# Compare f_hats of every model with the test labels.
fig, ax = plt.subplots()
f_hats.plot(ax=ax)
# Thick line drawn on top (zorder=10) so the labels stay visible.
y_test.plot(ax=ax, label='y', lw=3, zorder=10)
# Raw string: '\h' is an invalid escape sequence in a normal string literal.
ax.set_title(r'$\hat{f}(x)$')
ax.legend()

In [None]:
# Apply pivot_mean to the squared residuals within each model group and
# preview the transposed result.
MSEs = vmm_groups.apply(pivot_mean, key='residual^2').T
MSEs.head()


In [None]:
# Plot the MSEs frame; the title is set through the pandas plot call.
fig, ax = plt.subplots()
MSEs.plot(ax=ax, title='MSE')
ax.legend()

In [None]:
# Bias: f_hats minus the y_test values, subtracted along the rows (axis=0).
bias = f_hats.sub(y_test.values, axis=0)

In [None]:
# Plot the bias frame; the title is set through the pandas plot call.
fig, ax = plt.subplots()
bias.plot(ax=ax, title='bias')
ax.legend();

In [None]:
# Apply `variances` to each model group and transpose the result.
df_variances = vmm_groups.apply(variances).transpose()

In [None]:
# Plot the variance frame; the title is set through the pandas plot call.
fig, ax = plt.subplots()
df_variances.plot(ax=ax, title='variance')
ax.legend();

In [None]:
# Summarise the column means of MSE, squared bias and variance in one table.
# The columns are added one by one, in this order, to an empty frame.
df_errors = pd.DataFrame().assign(**{
    'MSE': MSEs.mean(),
    'bias^2': (bias**2).mean(),
    'variance': df_variances.mean(),
})

In [None]:
# Display the error summary table.
df_errors

In [None]:
# Stack squared bias and variance in one bar per row of the error table.
fig, ax = plt.subplots()
df_errors.plot.bar(y=['bias^2', 'variance'], stacked=True, ax=ax, title='MSE');

## Extreme test

In [None]:
# Settings passed on to train_test_split_exteme.
extreme_settings = dict(
    min_ratio=0.05,
    max_ratio=0.05,
    min_keys=['u'],
    max_keys=['v', 'r'],
)
# NOTE: `regression` is whatever the most recent cell left in that variable.
X_train, y_train, X_test, y_test, train_data = train_test_split_exteme(
    X=regression.X_N,
    y=regression.y_N,
    data=data,
    **extreme_settings,
)

# Show the train and test labels as dots in one figure.
fig, ax = plt.subplots()
for series, label in ((y_train, 'train'), (y_test, 'test')):
    series.plot(ax=ax, label=label, style='.')
ax.legend();


In [None]:
%%time
# Repeat the train/predict study for every model on the extreme split.
# Collect the per-model frames in a list and concatenate once:
# DataFrame.append was removed in pandas 2.0.
sample_frames = []

for vmm_name, regression in regressions.items():

    X_train, y_train, X_test, y_test, train_data = train_test_split_exteme(
        X=regression.X_N,
        y=regression.y_N,
        data=data,
        min_ratio=0.05,
        max_ratio=0.05,
        min_keys=['u'],
        max_keys=['v', 'r'],
    )

    df_ = train_predict(train_data, X_test=X_test, y_test=y_test, train_ratio=0.01, N_trainings=10)
    df_['vmm'] = vmm_name
    sample_frames.append(df_)

# The original index of every frame is kept, as with the previous append calls.
df_sample_predictions = pd.concat(sample_frames)
df_sample_predictions.sort_values(by=['parameters', 'x'], inplace=True)

# Residual between the prediction `y_hat` and the column `z`, and its square.
df_sample_predictions['residual'] = df_sample_predictions['y_hat'] - df_sample_predictions['z']
df_sample_predictions['residual^2'] = df_sample_predictions['residual']**2

In [None]:
# Per-model statistics (groups keep their order of appearance).
vmm_groups = df_sample_predictions.groupby('vmm', sort=False)
f_hats = vmm_groups.apply(pivot_mean, key='y_hat').T
MSEs = vmm_groups.apply(pivot_mean, key='residual^2').T
df_variances = vmm_groups.apply(variances).T
# Bias: f_hats minus the y_test values, subtracted along the rows.
bias = f_hats.sub(y_test.values, axis=0)

# Summarise the column means in one table; columns are added in this order
# to an empty frame.
df_errors = pd.DataFrame().assign(**{
    'MSE': MSEs.mean(),
    'bias^2': (bias**2).mean(),
    'variance': df_variances.mean(),
})
df_errors

In [None]:
# Stack squared bias and variance in one bar per row of the error table.
fig, ax = plt.subplots()
df_errors.plot.bar(y=['bias^2', 'variance'], stacked=True, ax=ax, title='MSE');

## Random test

In [None]:
# Split with train_test_split, using test_ratio=0.01.
# NOTE: `regression` is whatever the most recent cell left in that variable.
X_train, y_train, X_test, y_test, train_data = train_test_split(
    X=regression.X_N,
    y=regression.y_N,
    test_ratio=0.01,
)

# Show the train and test labels as dots in one figure.
fig, ax = plt.subplots()
for series, label in ((y_train, 'train'), (y_test, 'test')):
    series.plot(ax=ax, label=label, style='.')
ax.legend();

In [None]:
# Repeat the train/predict study for every model on the train_test_split split.
# Collect the per-model frames in a list and concatenate once:
# DataFrame.append was removed in pandas 2.0.
sample_frames = []

for vmm_name, regression in regressions.items():

    # NOTE(review): X_test/y_test of the last iteration are reused by the
    # following cells for all models; if train_test_split draws a new split
    # on every call, the models are evaluated on different rows — confirm.
    X_train, y_train, X_test, y_test, train_data = train_test_split(
        X=regression.X_N,
        y=regression.y_N,
        test_ratio=0.01,
    )

    df_ = train_predict(train_data, X_test=X_test, y_test=y_test, train_ratio=0.005, N_trainings=10)
    df_['vmm'] = vmm_name
    sample_frames.append(df_)

# A fresh 0..n-1 index, as with the previous append(..., ignore_index=True).
df_sample_predictions = pd.concat(sample_frames, ignore_index=True)
df_sample_predictions.sort_values(by=['parameters', 'x'], inplace=True)

# Residual between the prediction `y_hat` and the column `z`, and its square.
df_sample_predictions['residual'] = df_sample_predictions['y_hat'] - df_sample_predictions['z']
df_sample_predictions['residual^2'] = df_sample_predictions['residual']**2

In [None]:
# Per-model statistics (groups keep their order of appearance).
vmm_groups = df_sample_predictions.groupby('vmm', sort=False)
f_hats = vmm_groups.apply(pivot_mean, key='y_hat').T
MSEs = vmm_groups.apply(pivot_mean, key='residual^2').T
df_variances = vmm_groups.apply(variances).T
# Bias: f_hats minus the y_test values, subtracted along the rows.
bias = f_hats.sub(y_test.values, axis=0)

# Summarise the column means in one table; columns are added in this order
# to an empty frame.
df_errors = pd.DataFrame().assign(**{
    'MSE': MSEs.mean(),
    'bias^2': (bias**2).mean(),
    'variance': df_variances.mean(),
})
df_errors

In [None]:
# Stack squared bias and variance in one bar per row of the error table.
fig, ax = plt.subplots()
df_errors.plot.bar(y=['bias^2', 'variance'], stacked=True, ax=ax, title='MSE');

In [None]:
# Repeat the train/predict study for every model and a range of train ratios.
# Train ratios 0.5**3 ... 0.5**8, i.e. halved at every step.
train_ratios = 0.5**(np.arange(3, 9))

# Collect the frames in a list and concatenate once:
# DataFrame.append was removed in pandas 2.0.
sample_frames = []

for vmm_name, regression in regressions.items():

    # One split per model, shared by all train ratios of that model.
    X_train, y_train, X_test, y_test, train_data = train_test_split(
        X=regression.X_N,
        y=regression.y_N,
        test_ratio=0.01,
    )

    for train_ratio in train_ratios:
        df_ = train_predict(train_data, X_test=X_test, y_test=y_test, train_ratio=train_ratio, N_trainings=10)
        df_['vmm'] = vmm_name
        df_['train_ratio'] = train_ratio
        sample_frames.append(df_)

# A fresh 0..n-1 index, as with the previous append(..., ignore_index=True).
df_sample_predictions = pd.concat(sample_frames, ignore_index=True)
df_sample_predictions.sort_values(by=['parameters', 'x'], inplace=True)

# Residual between the prediction `y_hat` and the column `z`, and its square.
df_sample_predictions['residual'] = df_sample_predictions['y_hat'] - df_sample_predictions['z']
df_sample_predictions['residual^2'] = df_sample_predictions['residual']**2

In [None]:
# Statistics per (train_ratio, vmm) combination, in order of appearance.
vmm_groups = df_sample_predictions.groupby(['train_ratio', 'vmm'], sort=False)
f_hats = vmm_groups.apply(pivot_mean, key='y_hat').T
MSEs = vmm_groups.apply(pivot_mean, key='residual^2').T
df_variances = vmm_groups.apply(variances).T
# Bias: f_hats minus the y_test values, subtracted along the rows.
bias = f_hats.sub(y_test.values, axis=0)

# Summarise the column means in one table; columns are added in this order
# to an empty frame.
df_errors = pd.DataFrame().assign(**{
    'MSE': MSEs.mean(),
    'bias^2': (bias**2).mean(),
    'variance': df_variances.mean(),
})
df_errors

In [None]:
# MSE as a function of the train ratio, one line per model.
# `train_ratio` and `vmm` are index levels of df_errors (from the groupby);
# reset_index() turns them into ordinary columns so seaborn can look them up
# by name regardless of its version.
sns.relplot(x='train_ratio', y='MSE', hue='vmm', kind='line', data=df_errors.reset_index())

In [None]:
# Variance as a function of the train ratio, one line per model.
# `train_ratio` and `vmm` are index levels of df_errors (from the groupby);
# reset_index() turns them into ordinary columns so seaborn can look them up
# by name regardless of its version.
sns.relplot(x='train_ratio', y='variance', hue='vmm', kind='line', data=df_errors.reset_index())