# Train test split KVLCC2 HSVA feature importance

In [None]:
# %load imports.py
%load_ext autoreload
%autoreload 2
%reload_kedro
%config Completer.use_jedi = False  ## (To fix autocomplete)
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from src.models.vmm import ModelSimulator
import matplotlib.pyplot as plt
from src.visualization.plot import track_plots, plot, captive_plot
import kedro
import numpy as np
import os.path
import anyconfig

import matplotlib
matplotlib.rcParams["figure.figsize"] = (10,7)
from src.symbols import *

# Read configs:
# All three YAML files are read from the base configuration folder, addressed
# relative to the notebook's working directory.
conf_path = os.path.join("../conf/base/")
runs_globals_path = os.path.join(
    conf_path,
    "runs_globals.yml",
)

# Run-level settings: the "model_test_ids" and "joins" entries are used below.
runs_globals = anyconfig.load(runs_globals_path)
model_test_ids = runs_globals["model_test_ids"]

join_globals_path = os.path.join(
    conf_path,
    "join_globals.yml",
)

# NOTE: "joins" comes from runs_globals.yml, whereas join_runs_dict is the
# complete content of join_globals.yml.
joins = runs_globals["joins"]
join_runs_dict = anyconfig.load(join_globals_path)

globals_path = os.path.join(
    conf_path,
    "globals.yml",
)
global_variables = anyconfig.load(globals_path)



# Entry "vmms" of globals.yml.
vmm_names = global_variables["vmms"]

from wPCC_pipeline.pipelines.motion_regression.nodes import predict_force, fit_motions, create_model_from_motion_regression
from wPCC_pipeline.pipelines.prediction.nodes import simulate_euler
from src.prime_system import PrimeSystem

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline
from src.parameters import df_parameters
p = df_parameters['symbol']

In [None]:
# Load everything needed for the regression from the data catalog.
# NOTE(review): `catalog` is not defined in this notebook; presumably it is
# injected by the `%reload_kedro` magic — confirm.
ship="kvlcc2_hsva"
#vmm_name = "vmm_martins_simple"
vmm_name = "vmm_abkowitz"
vmm = catalog.load(vmm_name)

ship_data = catalog.load(f"{ship}.ship_data")

regression = catalog.load(f"{ship}.updated.{vmm_name}.joined.regression")
# Remove 'Ydelta' and 'Ythrustdelta' from the excluded parameters of the Y equation.
# NOTE(review): `pop` is called without a default, so this presumably fails
# if a key is already gone — confirm the cell is safe to re-run.
regression.diff_eq_Y.exclude_parameters.pop('Ydelta')
regression.diff_eq_Y.exclude_parameters.pop('Ythrustdelta')


data = catalog.load(f"{ship}.updated.joined.data_ek_smooth")

added_masses = catalog.load(f"{ship}.added_masses")
# NOTE: this catalog copy of the excluded parameters is loaded but not used
# in the remaining cells shown here.
exclude_parameters = catalog.load(f"params:{ship}.motion_regression.exclude_parameters")
# Result of predict_force; later cells read its 'id' and 'U' columns.
data_with_force = predict_force(data=data, added_masses=added_masses, ship_parameters=ship_data, vmm=vmm)

In [None]:
from src.bias_variance_tradeoff import train_test_split_exteme, train_test_split_run
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# Hold one run out as the final test case and keep all other ids for training.
ids = list(data_with_force['id'].unique())
ids_train = ids.copy()
id_test = 'HSVA_CPMC_KVLCC2_Z_35_05'
# `list.remove` raises ValueError if id_test is not among the ids.
ids_train.remove(id_test)
mask = data_with_force['id'].isin(ids_train)
# Copy so that later changes to data_train do not touch data_with_force.
data_train = data_with_force.loc[mask].copy()

In [None]:
# Convert the training rows with the PrimeSystem built from the ship data.
ps = PrimeSystem(**ship_data)
#data['U'] = np.sqrt(data['u']**2 + data['v']**2)
# Take U from the training rows themselves: data_train is a row subset of
# data_with_force, so passing the full 'U' column would hand `prime` a speed
# series whose index also contains the held-out run.
data_prime = ps.prime(data_train, U=data_train['U'])

In [None]:
# Build the feature matrix and label of every degree of freedom from the
# primed training data. Insertion order (X, Y, N) is kept because later cells
# iterate over these dicts.
Xs, ys = {}, {}

equations_and_labels = {
    'X': (regression.diff_eq_X, 'fx'),
    'Y': (regression.diff_eq_Y, 'fy'),
    'N': (regression.diff_eq_N, 'mz'),
}
for key, (diff_eq, label) in equations_and_labels.items():
    Xs[key], ys[key] = diff_eq.calculate_features_and_label(data=data_prime, y=data_prime[label])

dofs = list(Xs.keys())

In [None]:
def feature_imporance(X,y):
    """Rank the columns of ``X`` by the relative size of their regression coefficient.

    The features are rescaled with a ``MinMaxScaler``, an intercept-free
    ``LinearRegression`` is fitted against ``y`` and the absolute
    coefficients are divided by their sum.

    Args:
        X: feature table; its ``columns`` and ``index`` are reused.
        y: regression target handed to ``LinearRegression.fit``.

    Returns:
        pd.Series indexed by feature name, sorted in descending order, holding
        each feature's share of the total absolute coefficient.
    """
    #scaler = StandardScaler()
    scaled_values = MinMaxScaler().fit(X).transform(X)
    X_scaled = pd.DataFrame(scaled_values, columns=X.columns, index=X.index)

    model = LinearRegression(fit_intercept=False)
    model.fit(X=X_scaled, y=y)

    magnitudes = pd.Series(model.coef_, index=X_scaled.columns).abs()
    ranked = magnitudes.sort_values(ascending=False)
    return ranked/ranked.sum()

# Feature importance per degree of freedom.
importances = {}

# NOTE: written as a plain loop on purpose — it leaves the globals `X` and `y`
# bound to the last entry of Xs/ys, and later cells read those leftovers.
for dof,X in Xs.items():
    y = ys[dof]
    importances[dof] = feature_imporance(X,y)

In [None]:
# One bar chart per degree of freedom, titled with the dof name.
for dof in dofs:
    figure, axis = plt.subplots()
    figure.suptitle(dof)
    importances[dof].plot.bar(ax=axis)

In [None]:
# Collect, over all degrees of freedom, the features whose relative
# importance is below the threshold.
min_importance = 0.01
removes = []
for dof in dofs:
    importance = importances[dof]
    mask = importance < min_importance
    removes.extend(importance.loc[mask].index)

In [None]:
# Substitute zero for every removed parameter's symbol (looked up in `p`)
# in the three equations and show the results.
subs = [(p[remove],0) for remove in removes]
X_eq_simplified = regression.X_eq.subs(subs)
Y_eq_simplified = regression.Y_eq.subs(subs)
N_eq_simplified = regression.N_eq.subs(subs)

display(X_eq_simplified)
display(Y_eq_simplified)
display(N_eq_simplified)

In [None]:
# Preview only: the result is displayed, not assigned. `X` is the leftover
# from the last loop over Xs (the 'N' features when cells are run in order).
X.drop(columns=list(set(removes) & set(X.columns)))

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class BestFeatures(BaseEstimator, TransformerMixin):
    """Transformer that keeps the ``k`` columns ranked highest by ``feature_imporance``."""

    def __init__(self,k):
        super().__init__()
        self.k=k

    def fit(self, X, y):
        """Rank the columns of ``X`` against ``y`` and remember the best ones.

        At most ``k`` names are stored in ``self.features``; if ``k`` exceeds
        the number of ranked columns, all of them are kept.
        """
        ranking = feature_imporance(X,y)
        n_keep = min(self.k, len(ranking))
        self.features = list(ranking.index[:n_keep])
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` restricted to the columns chosen in ``fit``."""
        selected = X[self.features]
        return selected.copy()

In [None]:
# Quick check of the transformer on the leftover globals `X` and `y`
# (the last dof of the earlier importance loop when cells are run in order).
best_features = BestFeatures(k=3)
best_features.fit(X=X, y=y)
best_features.transform(X)

In [None]:
# Inspect the excluded parameters of the Y equation (two entries were popped after loading).
regression.diff_eq_Y.exclude_parameters

In [None]:
def create_model(k):
    """Build a two-step pipeline: pick the ``k`` best features, then regress.

    Args:
        k: number of features the 'select' step (``BestFeatures``) keeps.

    Returns:
        sklearn ``Pipeline`` with the steps 'select' and 'regression'; the
        regression is a ``LinearRegression`` without intercept.
    """
    return Pipeline([
        ('select', BestFeatures(k=k)),
        ('regression', LinearRegression(fit_intercept=False)),
    ])

def predict(model, X_test, dof, data):
    """Predict with ``model`` and return the result indexed like ``X_test``.

    Args:
        model: object exposing ``predict(X)``.
        X_test: features to predict on; its ``index`` labels the output.
        dof: not used by this function.
        data: not used by this function.

    Returns:
        pd.Series of the predictions.
    """
    return pd.Series(model.predict(X_test), index=X_test.index)
        
def vary_k(X_train, y_train, X_test, y_test, dof, data):
    """Score the ``create_model`` pipeline for an increasing number of features.

    For every ``k`` from 1 up to (but not including) the number of columns in
    ``X_train``, a pipeline is fitted on the training data and its R2 score
    on the test data is recorded.

    Args:
        X_train: training features.
        y_train: training label.
        X_test: test features.
        y_test: test label.
        dof: forwarded to ``predict``.
        data: forwarded to ``predict``.

    Returns:
        pd.Series of R2 scores indexed by ``k``.
    """
    scores = {}
    # Size the sweep from the features that were passed in, not from whatever
    # the notebook-global `X` happens to be bound to.
    for k in range(1, X_train.shape[1]):
        model = create_model(k=k)
        model.fit(X=X_train, y=y_train)

        y_pred = predict(model=model, X_test=X_test, dof=dof, data=data)
        scores[k] = r2_score(y_true=y_test, y_pred=y_pred)

    # Convert once, after the loop; dtype is given so an empty sweep is still float.
    return pd.Series(scores, dtype=float)

In [None]:
# Repeated random hold-out: for each seed, 30 % of the runs (rounded up) are
# used as test runs and the R2 score is recorded for every number of features.
#
# The candidate ids are drawn from data_train because Xs/ys were built from
# it; sampling from `data` could pick the held-out `id_test`, which has no
# rows in the features.
ids = list(data_train['id'].unique())

scores_all = {dof: pd.DataFrame() for dof in Xs.keys()}

for seed in range(1,10):

    # Seeding makes the draw of test runs reproducible per seed.
    np.random.seed(seed)
    ids_test = np.random.choice(ids, size=int(np.ceil(len(ids)*0.3)), replace=False)

    for dof, X in Xs.items():

        y = ys[dof]

        X_train, y_train, X_test, y_test, train_data = train_test_split_run(X=X, y=y, id=data_train.id, ids=ids_test)

        scores = vary_k(X_train, y_train, X_test, y_test, dof, data_prime)

        # One column per seed.
        scores_all[dof][seed] = scores
        

In [None]:
# R2 score versus number of features: one subplot per degree of freedom,
# dotted lines per seed and a thick black line for the mean over seeds.
fig,axes=plt.subplots(nrows=3)
fig.set_size_inches(15,15)

for ax,dof in zip(axes,Xs.keys()):

    dof_scores = scores_all[dof]
    dof_scores.plot(ax=ax, style=':')
    dof_scores.mean(axis=1).plot(ax=ax, style='k.-', lw=2, label='mean')

    ax.grid(True)
    # Ticks come from the scores of the dof being plotted (not from the
    # leftover `scores` of the last loop iteration) and include the last k.
    ax.set_xticks(np.arange(dof_scores.index[0], dof_scores.index[-1] + 1, 1))
    ax.set_title(dof)
    ax.set_ylim(0.2,1)
    

In [None]:
# Number of features to keep for each degree of freedom.
ks = {
'X':5,
'Y':10,
'N':10,
}


# Collects the fitted coefficient of every selected feature, over all dofs.
parameters = {}

for dof, X in Xs.items():
    
    k = ks[dof]
    model = create_model(k=k)
    
    #X_train, y_train, X_test, y_test, train_data = train_test_split_run(X=X, y=y, id=data_train.id, ids=ids)
    # Fit on all rows of Xs/ys (no further split in this cell).
    X_train = X
    y_train = y = ys[dof]
    model.fit(X=X_train, y=y_train)
    
    # NOTE: despite its name, `select_k_best` is the BestFeatures step of the pipeline.
    select_k_best = model['select']
    parameter_names = select_k_best.features
    linear_regression = model['regression']
    parameter_values = linear_regression.coef_
    parameters.update({name:value for name,value in zip(parameter_names, parameter_values)})
    
    # Prediction on the same rows the model was fitted on.
    y_pred = predict(model=model, X_test=X_train, dof=dof, data=data_prime)
    
    fig,ax=plt.subplots()
    y_train.plot(style='-', ax=ax, label='train')
    y_pred.plot(style='-', ax=ax, alpha=0.5, label='pred')
    ax.set_title(dof)
    ax.legend()

In [None]:
# Compare the parameters of the catalog model with the reduced parameter set.
ship_model = catalog.load(f"{ship}.updated.{vmm_name}.joined.model")
# NOTE: from here on `df_parameters` is this local table and no longer the
# object imported from src.parameters (its 'symbol' column was saved in `p`).
df_parameters = pd.DataFrame()
df_parameters['original'] = ship_model.parameters

# Build the selected values in a standalone float Series and assign it once.
# Calling `.update()` on `df_parameters['selected']` mutates a column
# selection in place, which is not guaranteed to write back to the frame
# (pandas Copy-on-Write), and an integer column of zeros cannot hold the
# float coefficients without a dtype change.
selected = pd.Series(0.0, index=df_parameters.index)
# Later updates take precedence: added masses, then the regressed
# coefficients, then the regression's excluded parameters.
selected.update(added_masses)
selected.update(parameters)
selected.update(regression.exclude_parameters)
df_parameters['selected'] = selected
df_parameters['symbol'] = p


# Second model: a copy of the catalog model carrying the selected parameters.
ship_model2 = ship_model.copy()
ship_model2.parameters = df_parameters['selected']

df_parameters['selected'].plot.bar()

In [None]:
# Substitute zero for the symbol of every parameter whose selected value is
# exactly 0 and show the resulting equations.
mask = df_parameters['selected']==0
subs = [(symbol,0) for symbol in df_parameters.loc[mask,'symbol']]
X_eq_simplified = regression.X_eq.subs(subs)
Y_eq_simplified = regression.Y_eq.subs(subs)
N_eq_simplified = regression.N_eq.subs(subs)

display(X_eq_simplified)
display(Y_eq_simplified)
display(N_eq_simplified)


In [None]:
# Load the `data_ek_smooth` dataset of the held-out run `id_test` and the
# `ek` entry of the selected VMM from the catalog.
df_test = catalog.load(f'{ship}.updated.{id_test}.data_ek_smooth')
ek = catalog.load(f"{ship}.{vmm_name}.ek")

In [None]:
# Run simulate_euler on the held-out data with the model loaded from the catalog.
df_predict = simulate_euler(data=df_test, model=ship_model, ek=ek)

In [None]:
# Same run with the model that carries the selected parameters.
df_predict2 = simulate_euler(data=df_test, model=ship_model2, ek=ek)

In [None]:
# Data sets to compare and their line styles; both dicts use the same keys.
dataframes = {
    'Experiment' : df_test,
    'Prediction' : df_predict,
    'Prediction2' : df_predict2,
    
}

styles = {
    'Experiment' : {'style':'r--'},
    'Prediction' : {'style':'g-'},
    'Prediction2' : {'style':'b-'},
}

In [None]:
# Track plot of all data sets; length and beam are taken from ship_data['L'] and ship_data['B'].
track_plots(dataframes, lpp=ship_data['L'], beam=ship_data['B'],  styles=styles, N=7);

In [None]:
# Plot the listed keys for all data sets in a single column.
plot(dataframes=dataframes, keys=['delta','psi','y0','u','v','r'], ncols=1, styles=styles);

In [None]:
# Show the regressed coefficients collected in the per-dof fitting loop.
parameters

In [None]:
import statsmodels.api as sm
import seaborn as sns

# Synthetic example: x2 is x1 plus small noise, and y is their sum plus noise.
# NOTE: this cell rebinds the globals `X`, `N`, `y` and `data`, so earlier
# cells that rely on them must not be re-run afterwards without reloading.
# NOTE: the random generator is not seeded here, so the numbers vary between runs.
X = pd.DataFrame()
N = 1000
x1 = X['x1'] = np.linspace(0,1,N)
x2 = X['x2'] = X['x1'] + np.random.normal(scale=0.05, size=N)
y = x1 + x2 + np.random.normal(scale=0.1, size=N)

data = X.copy()
data['y'] = y

sns.pairplot(data=data)

In [None]:
# Correlation matrix of x1, x2 and y.
data.corr()

In [None]:
# OLS of y on both features x1 and x2 (X has no constant column).
model1 = sm.OLS(y,X)
result = model1.fit()
result.summary()

In [None]:
# Same regression on first differences; the first row is dropped because
# `diff` leaves it as NaN.
y2 = y.diff()[1:].copy()
X2 = X.diff().iloc[1:].copy()
model2 = sm.OLS(y2,X2)
result2 = model2.fit()
result2.summary()

In [None]:
# OLS of y on x1 alone.
X3 = X[['x1']]
model3 = sm.OLS(y,X3)
result3 = model3.fit()
result3.summary()

In [None]:
# Correlation between the two original features.
X.corr()

In [None]:
# Correlation between the two differenced features.
X2.corr()

In [None]:
# Condition number reported for the fit on the original features.
result.condition_number

In [None]:
# Condition number reported for the fit on the differenced features.
result2.condition_number