# Central Limit Theorem: Correct model

In [None]:
# %load imports.py
%load_ext autoreload
%autoreload 2
%reload_kedro
%config Completer.use_jedi = False  ## (To fix autocomplete)
import pandas as pd
from vessel_manoeuvring_models.models.vmm import ModelSimulator
import matplotlib.pyplot as plt
from vessel_manoeuvring_models.visualization.plot import track_plots, plot, captive_plot
import kedro
import numpy as np
import os.path
import anyconfig

import matplotlib
# Default (width, height) used for every figure created in this notebook.
matplotlib.rcParams["figure.figsize"] = (15,4)
# NOTE(review): the cells below use `sp` without a visible import; presumably
# this star import supplies it (SymPy) -- confirm against the symbols module.
from vessel_manoeuvring_models.symbols import *

# Read configs:
# Every configuration file is looked up relative to the base conf directory.
conf_path = os.path.join("../conf/base/")
runs_globals_path = os.path.join(conf_path, "runs_globals.yml")
join_globals_path = os.path.join(conf_path, "join_globals.yml")
globals_path = os.path.join(conf_path, "globals.yml")

# Run-level settings.
runs_globals = anyconfig.load(runs_globals_path)
model_test_ids = runs_globals["model_test_ids"]
joins = runs_globals["joins"]

# Settings loaded from the join configuration file.
join_runs_dict = anyconfig.load(join_globals_path)

# Project-wide settings.
global_variables = anyconfig.load(globals_path)
vmms = global_variables["vmms"]
# (regress/predict with only models from joined runs)
only_joined = global_variables["only_joined"]

In [None]:
from scipy.stats import norm
from sklearn.model_selection import RepeatedKFold
import statsmodels.api as sm
import seaborn as sns

In [None]:
# Symbols used by the linear model equation defined in the next cell.
C_1,C_2, x, y = sp.symbols("C_1 C_2 x y")

In [None]:
# Noise-free linear model: y = C_1 + C_2*x.
eq_linear = sp.Eq(y, C_1 + C_2*x)
# Bare expression so the notebook displays the equation.
eq_linear

In [None]:
# Symbols used by the measurement equation: error term epsilon and measured value z.
epsilon, z = sp.symbols("epsilon z")


In [None]:
# Measurement model: the measured value z is the model output y plus epsilon.
eq_linear_measure = sp.Eq(z, y + epsilon)
# Bare expression so the notebook displays the equation.
eq_linear_measure

In [None]:
# Solve the linear model for y and turn the expression into a numeric function.
solution = sp.solve(eq_linear, y, dict=True)[0][y]
# `free_symbols` is a set, so its iteration order is arbitrary; sorting by name
# gives the generated function a reproducible signature. Callers that pass the
# arguments by keyword are unaffected.
lambda_y = sp.lambdify(
    sorted(solution.free_symbols, key=lambda symbol: symbol.name), solution
)
lambda_y

In [None]:
# Solve the measurement and model equations together and keep the expression
# for z, then turn it into a numeric function.
solution = sp.solve((eq_linear_measure, eq_linear), z, y)[z]
# `free_symbols` is a set, so its iteration order is arbitrary; sorting by name
# gives the generated function a reproducible signature. Callers that pass the
# arguments by keyword are unaffected.
lambda_z = sp.lambdify(
    sorted(solution.free_symbols, key=lambda symbol: symbol.name), solution
)
lambda_z

In [None]:
# Coefficients of the true model and the spread of the measurement error.
C_1_ = 1
C_2_ = 1
scale = 2
epsilon_ = norm(loc=0, scale=scale)

# N evenly spaced inputs on [0, 5].
N = 200
data = pd.DataFrame({"x": np.linspace(0, 5, N)})

# Seed the global NumPy generator so the drawn errors are reproducible.
np.random.seed(42)
data["epsilon"] = epsilon_.rvs(size=N)

# y: noise-free model output, z: model output including the drawn error.
data["y"] = lambda_y(C_1=C_1_, C_2=C_2_, x=data["x"])
data["z"] = lambda_z(
    C_1=C_1_,
    C_2=C_2_,
    epsilon=data["epsilon"],
    x=data["x"],
)


In [None]:
# Noise-free model output (dashed line) together with the noisy samples (dots).
fig, ax = plt.subplots()
data.plot(x="x", y="y", style="--", ax=ax);
data.plot(x="x", y="z", style=".", ax=ax);

In [None]:
# Design matrix: a column of ones multiplying C_1 and the input x multiplying C_2.
X = pd.DataFrame({"C_1": 1, "C_2": data["x"]}, index=data.index)

# The constant column is already part of X, hence hasconst=True.
model = sm.OLS(data["z"], X, hasconst=True)
result = model.fit()

result.summary()

In [None]:
# Noise-free model output (dashed), noisy samples (dots) and the OLS
# prediction (dotted) in one figure.
fig, ax = plt.subplots()
data.plot(x="x", y="y", style="--", ax=ax);
data.plot(x="x", y="z", style=".", ax=ax);

ax.plot(data["x"], result.predict(X), ":", label="prediction")
ax.legend()

In [None]:
# Residuals of the full-data fit: measured z minus the OLS prediction.
errors = data["z"].sub(result.predict(X))
errors.hist(bins=30)

In [None]:
# Refit the model on many random halves of the data (2 folds, 100 repeats)
# and collect the regressed parameters of every fit.
random_state = 42
rkf = RepeatedKFold(n_splits=2, n_repeats=100, random_state=random_state)

# Rows are gathered in a plain list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
parameter_rows = []

for train, test in rkf.split(X):  # only the training indices are used

    X_train = X.iloc[train]
    y_train = data['z'].iloc[train]

    model_ = sm.OLS(y_train, X_train, hasconst=True)
    result_ = model_.fit()
    parameter_rows.append(result_.params)

# One row per fit, one column per parameter, with a fresh 0..n-1 index
# (same layout that append(..., ignore_index=True) produced).
df_parameters = pd.DataFrame(parameter_rows).reset_index(drop=True)

In [None]:
# Mean of each regressed parameter over all cross-validation fits.
df_parameters.mean()

In [None]:
# Standard deviation of each regressed parameter over all cross-validation fits.
df_parameters.std()

In [None]:
# For every parameter, compare the KDE of the cross-validated estimates with
# two normal densities:
#   'std' -- mean and standard deviation of the cross-validated estimates
#   'bse' -- estimate and standard error reported by the single full-data fit
keys = ['C_1', 'C_2']
for key in keys:
    grid = sns.displot(df_parameters, x=key, kind="kde")
    ax = grid.ax

    cv_mean = df_parameters[key].mean()
    cv_std = df_parameters[key].std()
    ols_mean = result.params[key]
    ols_std = result.bse[key]

    rv = norm(loc=cv_mean, scale=cv_std)
    rv2 = norm(loc=ols_mean, scale=ols_std)

    # Evaluate both densities on the parameter axis (four standard deviations
    # around the two means) instead of on the unrelated range of data['x'].
    # The grid gets its own name so the SymPy symbol `x` is not overwritten.
    half_width = 4 * max(cv_std, ols_std)
    parameter_values = np.linspace(
        min(cv_mean, ols_mean) - half_width,
        max(cv_mean, ols_mean) + half_width,
        500,
    )

    # Different colours/line styles so the two curves can be told apart.
    ax.plot(parameter_values, rv.pdf(parameter_values), 'r-', label='std')
    ax.plot(parameter_values, rv2.pdf(parameter_values), 'b--', label='bse')
    ax.legend()