# Roll damping data exploration

In [None]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Load the autoreload extension; mode 2 reloads all imported modules before
# each cell is executed, so edits to the local packages are picked up.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.set_option("display.max_columns", None)
import numpy as np
import os
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 15, 5

import data
import copy
from mdldb.mdl_db import MDLDataBase
from mdldb.tables import Base, Model, LoadingCondition, Run, RolldecayLinear, RolldecayDirect, RolldecayNorwegian
from mdldb.tables import Min, Mean, Max, Std, Ship
from mdldb import mdl_to_evaluation
from evaluation.run_dynamic import RunDynamic
from evaluation.run_manoeuvring import RunZigZag
from rolldecay.bis_system import BisSystem

from rolldecayestimators.direct_estimator import DirectEstimator
from rolldecayestimators.direct_linear_estimator import DirectLinearEstimator
from rolldecayestimators.norwegian_estimator import NorwegianEstimator
from rolldecayestimators.transformers import CutTransformer, LowpassFilterDerivatorTransformer, ScaleFactorTransformer, OffsetTransformer
#from rolldecay.equations_lambdify import calculate_acceleration, calculate_velocity
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

import signal_lab
from sqlalchemy.inspection import inspect
import seaborn as sns
import docs

In [None]:
from sqlalchemy import create_engine

# SQLite database whose path is provided by the local `data` module.
db_url = 'sqlite:///' + data.mdl_db_path
engine = create_engine(db_url)
db = MDLDataBase(engine=engine)

In [None]:
# Join every improved direct roll-decay result with its run, loading condition,
# model and ship, so that each row carries all of those columns.
sql = """
SELECT * from
rolldecay_direct_improved
INNER JOIN run
ON rolldecay_direct_improved.run_id == run.id
    INNER JOIN loading_conditions
    ON (run.loading_condition_id == loading_conditions.id)
        INNER JOIN models
        ON run.model_number == models.model_number
            INNER JOIN ships
            ON models.ship_name == ships.name

"""
df_rolldecay = pd.read_sql(sql=sql, con=engine, index_col='run_id')

# `SELECT *` over several joined tables yields repeated column names;
# keep only the first occurrence of each.
duplicated_columns = df_rolldecay.columns.duplicated()
df_rolldecay = df_rolldecay.loc[:, ~duplicated_columns]
#df_rolldecay = remove_outliers(df_rolldecay)
df_rolldecay.describe()

In [None]:
# List the columns available after the join.
df_rolldecay.columns

In [None]:
# Read the `description` table, indexed by its `id` column; its `unit`
# column is looked up per data column further down.
description = pd.read_sql_table('description', con=db.engine, index_col='id')

In [None]:
# Discard the rows that have no omega0 value.
df = df_rolldecay.dropna(subset=['omega0'])

In [None]:
# Number of rows in the current selection.
len(df)

In [None]:
# Keep only the rows whose `score` exceeds 0.95.
mask = df['score'].gt(0.95)
df = df[mask]

In [None]:
# Sort by speed so that each model's curve is drawn from low to high speed.
# A sorted copy is rebound to `df` instead of sorting in place: `df` is the
# result of a boolean-mask selection, and mutating such a selection in place
# triggers pandas' SettingWithCopyWarning.
df = df.sort_values(by='ship_speed')
for model_number, model_group in df.groupby(by='model_number'):

    # One small figure per model: `mean_damping` plotted against `ship_speed`.
    fig, ax = plt.subplots(figsize=(10, 5), dpi=50)
    model_group.plot(x='ship_speed', y='mean_damping', style='o-', ax=ax)
    ax.set_title(model_number)
    

In [None]:
# NOTE(review): rebinding `data` here shadows the `data` module imported at the
# top of the notebook (it supplies `data.mdl_db_path` for the database cell), so
# the database cell cannot be re-run after this one without restarting the kernel.
data = df.copy()

# Unit string for each column, looked up in the `description` table.
units = description.loc[data.columns]['unit']

# 1.852/3.6 is the knots -> m/s factor (1 knot = 1.852 km/h).
# Presumably `ship_speed` is stored in knots - verify against the database.
data['ship_speed'] = data['ship_speed'] * (1.852/3.6)
units['ship_speed'] = 'm/s'

# Physical constants added as columns, with their units registered alongside.
constants = {
    'g': (9.81, 'm/s**2'),
    'rho': (1000, 'kg/m**3'),
}
for name, (value, unit) in constants.items():
    data[name] = value
    units[name] = unit

# omega0 multiplied by sqrt(beam/(2*g)); registered with unit '-'.
beam_time_scale = np.sqrt(data['beam']/(2*data['g']))
data['omega0_hat'] = data['omega0']*beam_time_scale
units['omega0_hat'] = '-'

# Convert the frame with the project's BisSystem, built from lpp and Volume.
bis_system = BisSystem(lpp=data['lpp'], volume=data['Volume'], units=units)
data = bis_system.df_to_bis(data)


In [None]:
# Histogram of the `score` column (before the BIS conversion), 50 bins.
df.hist(column='score', bins=50)

In [None]:
# Histogram of the `omega0` column, 50 bins.
data.hist(column='omega0', bins=50)

In [None]:
# Histogram of the `omega0_hat` column, 50 bins.
data.hist(column='omega0_hat', bins=50)

In [None]:
# Histogram of the `zeta` column, 50 bins.
data.hist(column='zeta', bins=50)

In [None]:
# Histogram of the `d` column, 50 bins.
data.hist(column='d', bins=50)

In [None]:
# Histogram of the `mean_damping` column, 50 bins.
data.hist(column='mean_damping', bins=50)

In [None]:
# Feature candidates: everything except the roll-decay result columns below.
dropped_columns = ['zeta', 'd', 'omega0', 'mean_damping']
X = data.drop(columns=dropped_columns)

In [None]:
# Remove the columns whose variance does not exceed 0.001.
variance_treshold = VarianceThreshold(threshold=0.001)
X_ = variance_treshold.fit_transform(X)

In [None]:
# Shape of the feature frame before the variance filter.
X.shape

In [None]:
# Shape of the array returned by the variance filter.
X_.shape

In [None]:
# Re-run the variance filter on X and show the resulting shape
# (repeats the fit already done when `variance_treshold` was created).
X_ = variance_treshold.fit_transform(X)
X_.shape

In [None]:
# Names of the columns that the variance filter kept.
X.columns[variance_treshold.get_support()]

In [None]:
# NOTE(review): `important` is not defined in any cell shown before this one, so
# this cell raises NameError on a fresh kernel - confirm where the feature list
# is supposed to come from.
g = data['g']
rho = data['rho']
GM = data['gm']
rxx = data['KXX']
omega0 = data['omega0']

kxx = rxx/data['beam']

# Mass from density times Volume; Ixx = m*rxx**2
# (presumably KXX is the roll radius of gyration - TODO confirm).
m = rho*data['Volume']
Ixx = m*rxx**2
data['Ixx'] = Ixx
if 'Ixx' not in important:
    important.append('Ixx')

# Inertia implied by the measured omega0: omega0**2 = GM*g*m/Ixx_tot.
Ixx_tot = GM*g*m/(omega0**2)
data['Ixx_tot'] = Ixx_tot

# What remains after subtracting Ixx from the total.
Ixx_added = Ixx_tot - Ixx
data['Ixx_added'] = Ixx_added

In [None]:
# Keep only the rows with a strictly positive `Ixx_added`.
mask = data['Ixx_added'].gt(0)
data = data[mask].copy()

## Omega0 regression

In [None]:
# Regression target for this section.
y_key = 'omega0_hat'

# Distribution of the target, 50 bins, drawn on an explicit axis.
fig, ax = plt.subplots()
target = data[y_key]
target.hist(bins=50, ax=ax)
title = 'Historgram: %s' % y_key
ax.set_title(title)

In [None]:
# Grid search over the number of selected features (k) and the polynomial
# degree. Every combination is scored with 5-fold cross validation.
ks = np.arange(1,10)
degrees = np.arange(1,3)

# Shared (unfitted) steps; cross_val_score clones the whole pipeline for each
# fold, so reusing these instances across combinations is safe.
# `variance_treshold` is also picked up by the final-pipeline cell further down.
variance_treshold = VarianceThreshold(0.0001)
standard_scaler = StandardScaler()

y = data[y_key]
X = data[important]

# Collect one record per combination and build the frame once afterwards.
# (DataFrame.append was removed in pandas 2.0 and growing a frame row by row
# copies it on every iteration.)
records = []
for k in ks:
    for degree in degrees:
        select_k_best = SelectKBest(k=k, score_func=f_regression)
        polynomial_features = PolynomialFeatures(degree=degree)
        linear_regression = LinearRegression()

        # Steps run in list order: expand features, scale, drop (near-)constant
        # columns, keep the k best according to f_regression, then fit.
        steps=[
            ('polynomial_feature', polynomial_features),
            ('standard_scaler', standard_scaler),
            ('variance_treshold',variance_treshold),
            ('select_k_best',select_k_best),
            ('linear_regression', linear_regression)
        ]

        model = Pipeline(steps=steps)
        # Mean of the per-fold scores (the pipeline's default scorer).
        score = cross_val_score(estimator=model,X=X,y=y,cv=5).mean()
        records.append({'k': k, 'degree': degree, 'score': score})

results = pd.DataFrame(records, columns=['k', 'degree', 'score'])
        
        

In [None]:
# Best combination first.
results = results.sort_values(by='score', ascending=False)

In [None]:
# First rows of the grid-search results.
results.head()

In [None]:
# First row of `results`; cast to int because the row values may be stored as floats.
best = results.iloc[0]
k = int(best['k'])
degree = int(best['degree'])

In [None]:
# Polynomial degree taken from the first row of `results`.
degree

In [None]:
# Build the pipeline again with the chosen k and degree.
# `variance_treshold` is not created here: the instance defined in the
# grid-search cell is reused.
polynomial_features = PolynomialFeatures(degree=degree)
standard_scaler = StandardScaler()
select_k_best = SelectKBest(score_func=f_regression, k=k)
linear_regression = LinearRegression()

# Same step names and order as in the grid search.
steps = [
    ('polynomial_feature', polynomial_features),
    ('standard_scaler', standard_scaler),
    ('variance_treshold', variance_treshold),
    ('select_k_best', select_k_best),
    ('linear_regression', linear_regression),
]

model = Pipeline(steps=steps)

In [None]:
# Hold out a third of the samples for testing. The fixed random_state makes the
# split, and therefore the reported score, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
model.fit(X=X_train, y=y_train)
# Score of the fitted pipeline on the held-out samples.
score = model.score(X=X_test, y=y_test)
score

In [None]:
# Predicted versus held-out values of the target.
y_pred = model.predict(X_test)

fig, ax = plt.subplots()
ax.plot(y_test, y_pred, '.', alpha=0.5)
ax.set_xlabel('test: %s' % y_key)
ax.set_title('Prediction of %s' % y_key)
ax.set_ylabel('predicted: %s' % y_key)


## Damping regression

In [None]:
# Restrict the damping regression to the rows with zero ship speed.
mask = data['ship_speed'].eq(0)
data_0 = data[mask].copy()

In [None]:
# Regression target for this section.
y_key = 'mean_damping'

# Distribution of the target among the zero-speed rows, 50 bins.
fig, ax = plt.subplots()
target = data_0[y_key]
target.hist(bins=50, ax=ax)
title = 'Historgram: %s' % y_key
ax.set_title(title)

In [None]:
# Grid search over the number of selected features (k) and the polynomial
# degree for the zero-speed damping. Every combination is scored with 3-fold
# cross validation.
ks = np.arange(1,10)
degrees = np.arange(1,3)

# Shared (unfitted) steps; cross_val_score clones the whole pipeline for each
# fold, so reusing these instances across combinations is safe.
# `variance_treshold` is also picked up by the final-pipeline cell further down.
variance_treshold = VarianceThreshold(0.0001)
standard_scaler = StandardScaler()

y = data_0[y_key]
X2 = data_0[important]

# Collect one record per combination and build the frame once afterwards.
# (DataFrame.append was removed in pandas 2.0 and growing a frame row by row
# copies it on every iteration.)
records = []
for k in ks:
    for degree in degrees:
        select_k_best = SelectKBest(k=k, score_func=f_regression)
        polynomial_features = PolynomialFeatures(degree=degree)
        linear_regression = LinearRegression()

        # Steps run in list order: expand features, scale, drop (near-)constant
        # columns, keep the k best according to f_regression, then fit.
        steps=[
            ('polynomial_feature', polynomial_features),
            ('standard_scaler', standard_scaler),
            ('variance_treshold',variance_treshold),
            ('select_k_best',select_k_best),
            ('linear_regression', linear_regression)
        ]

        model = Pipeline(steps=steps)
        # Mean of the per-fold scores (the pipeline's default scorer).
        score = cross_val_score(estimator=model,X=X2,y=y,cv=3).mean()
        records.append({'k': k, 'degree': degree, 'score': score})

results = pd.DataFrame(records, columns=['k', 'degree', 'score'])
        
        

In [None]:
# Best combination first.
results = results.sort_values(by='score', ascending=False)

In [None]:
# First rows of the grid-search results.
results.head()

In [None]:
# First row of `results`; cast to int because the row values may be stored as floats.
best = results.iloc[0]
k = int(best['k'])
degree = int(best['degree'])

In [None]:
# Polynomial degree taken from the first row of `results`.
degree

In [None]:
# Build the pipeline again with the chosen k and degree.
# `variance_treshold` is not created here: the instance defined in the
# grid-search cell is reused.
polynomial_features = PolynomialFeatures(degree=degree)
standard_scaler = StandardScaler()
select_k_best = SelectKBest(score_func=f_regression, k=k)
linear_regression = LinearRegression()

# Same step names and order as in the grid search.
steps = [
    ('polynomial_feature', polynomial_features),
    ('standard_scaler', standard_scaler),
    ('variance_treshold', variance_treshold),
    ('select_k_best', select_k_best),
    ('linear_regression', linear_regression),
]

model = Pipeline(steps=steps)

In [None]:
# Hold out a third of the zero-speed samples for testing. The fixed random_state
# makes the split, and therefore the reported score, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.33, random_state=42)
model.fit(X=X_train, y=y_train)
# Score of the fitted pipeline on the held-out samples.
score = model.score(X=X_test, y=y_test)
score

In [None]:
# Predicted versus held-out values of the target.
y_pred = model.predict(X_test)

fig, ax = plt.subplots()
ax.plot(y_test, y_pred, '.', alpha=0.5)
ax.set_xlabel('test: %s' % y_key)
ax.set_title('Prediction of %s' % y_key)
ax.set_ylabel('predicted: %s' % y_key)
