# Data preparation for regression

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
from jupyterthemes import jtplot
jtplot.style(theme='onedork', context='notebook', ticks=True, grid=False)

In [None]:
import pandas as pd
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.set_option("display.max_columns", None)
import numpy as np
import os
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 15, 5

import data
import copy
from rolldecay.bis_system import BisSystem
from rolldecay import database
from rolldecayestimators import equations, symbols

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

import signal_lab
from sqlalchemy.inspection import inspect
import seaborn as sns
import docs
from sympy.physics.vector.printing import vpprint, vlatex
from IPython.display import display, Math, Latex
import sympy as sp
from rolldecayestimators.substitute_dynamic_symbols import lambdify,run

In [None]:
# Load the roll decay results of the 'rolldecay_quadratic_b' table through the
# project's database helper, passing a score limit of 0.90 and the name of the
# table that lists excluded runs.
df_rolldecay = database.load(
    rolldecay_table_name='rolldecay_quadratic_b',
    limit_score=0.90,
    exclude_table_name='rolldecay_exclude',
)

# Database handle; its engine is used below to read the 'description' table.
db = database.get_db()

In [None]:
df_rolldecay.columns

In [None]:
df_rolldecay.describe()

In [None]:
df_rolldecay.isnull().sum().sort_values(ascending=False)

In [None]:
description = pd.read_sql_table('description', con=db.engine, index_col='id')

In [None]:
description

In [None]:
# Candidate feature columns; they are looked up in the description table and
# checked for missing values in the cells below.
interesting = [
    'A0', 'AR', 'beam', 'BKB', 'BKL', 'BKX', 'BR', 'BRA', 'BTT1',
    'CP', 'CW', 'Volume', 'gm', 'HSKEG', 'ASKEG', 'IRUD', 'kg', 'KXX',
    'lcg', 'lpp', 'LSKEG', 'RH', 'RHBL', 'RTYPE', 'TA', 'TF', 'TWIN',
    'ship_type_id', 'ship_speed',
]

In [None]:
description.loc[interesting]

In [None]:
# Register a description and a unit for three additional quantities in the
# 'description' metadata table (rows are labelled by quantity name).
description.loc['phi_start'] = {'description':'roll at start of test','unit':'rad'}
description.loc['phi_stop'] = {'description':'roll at end of test','unit':'rad'}
# Spelling of the stored text corrected ("discplacement" -> "displacement").
description.loc['Disp'] = {'description':'Ship displacement','unit':'m3'}


# Columns that are passed to froude_scale as `skip`, i.e. copied to the scaled
# frame unchanged instead of being divided by a unit-dependent denominator.
# (The original list contained 'B_1A' twice; the duplicate is removed.)
skip = [
    'model_number',
    'loading_condition_id',
    'B_1A',
    'B_2A',
    'B_3A',
    'C_1A',
    'C_3A',
    'C_5A',
    'B_1',
    'B_2',
    'B_3',
    'C_1',
    'C_3',
    'C_5',
    'A_44',
    'omega0_fft',
    'omega0',
    'score',
    'id',
    'project_number',
    'series_number',
    'run_number',
    'test_number',
    'scale_factor',
    'g',
    'rho',
    'B_1_hat',
    'B_2_hat',
]

In [None]:
def froude_scale(data, description, skip=None):
    """Divide every numeric column of ``data`` by a unit-dependent power of the scale factor.

    The denominator for a column is chosen from the unit that ``description``
    lists for that column; columns with unit '-' are copied unchanged.
    (Presumably this converts full-scale values to model scale -- confirm
    against the content of the database.)

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to scale. Must contain a 'scale_factor' column, which is used
        row by row to build the denominators.
    description : pandas.DataFrame
        Metadata indexed by column name, with a 'unit' column.
    skip : container of str, optional
        Names of columns that are copied unchanged. Defaults to no columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``data``.

    Raises
    ------
    KeyError
        If ``data`` has no 'scale_factor' column, if a column to be scaled
        has no entry in ``description``, or if its unit is not supported.
    """
    # None instead of a mutable [] default; any container supporting `in` works.
    if skip is None:
        skip = ()

    scale_factor = data['scale_factor']
    sqrt_scale_factor = np.sqrt(scale_factor)

    # Denominator per unit; None means "leave the value as it is".
    denominators = {
        '-': None,
        'm': scale_factor,
        'm2': scale_factor**2,
        'm3': scale_factor**3,
        'knots': sqrt_scale_factor,
        'knot': sqrt_scale_factor,
        'degrees': 1,
        'rad': 1,
        'rad/s': 1/sqrt_scale_factor,
        'degrees/s': 1/sqrt_scale_factor,
        # NOTE(review): this is scale_factor**1.5, whereas Froude scaling of
        # power is normally scale_factor**3.5 -- confirm the intended exponent.
        'kW': scale_factor**2/sqrt_scale_factor,
        'rpm': 1/sqrt_scale_factor,
    }

    # Collect the columns first and build the frame once; inserting columns
    # one at a time into an existing frame fragments it when there are many.
    scaled = {}
    for key, value in data.items():
        # Non-numeric columns (object, string, datetime, ...) cannot be scaled.
        if key in skip or not pd.api.types.is_numeric_dtype(value.dtype):
            scaled[key] = value
            continue

        if key not in description.index:
            raise KeyError(
                "No description (unit) found for column '{}'".format(key)
            )

        unit = description.loc[key, 'unit']
        if unit not in denominators:
            raise KeyError(
                "Unsupported unit '{}' for column '{}'".format(unit, key)
            )

        denominator = denominators[unit]
        scaled[key] = value if denominator is None else value/denominator

    return pd.DataFrame(scaled, index=data.index)
    
    
    

In [None]:
df_model = froude_scale(data=df_rolldecay, description=description, skip=skip)

In [None]:
mask = df_model[interesting].notnull().all(axis=1)
mask.sum()

In [None]:
df_model[interesting].isnull().sum()

In [None]:
# Columns taken out of the candidate list (presumably because of the missing
# values counted in the previous cell -- confirm), and columns that must be
# part of the selection.
remove = ['BTT1','BR','BRA','HSKEG','ASKEG','LSKEG','RHBL','RTYPE','lcg','BKX',]
add = ['ship_speed',]

# Keep the order of `interesting` and list every name once. The previous
# set difference gave an order that could change between kernel sessions and
# listed 'ship_speed' twice, since it is already part of `interesting`.
kept = [column for column in interesting if column not in remove]
important = list(dict.fromkeys(kept + add))


In [None]:
mask = df_model[important].notnull().all(axis=1)
mask.sum()

In [None]:
df_model[important].isnull().sum()

In [None]:
df = df_model.loc[mask].copy()

In [None]:
df = df.dropna(subset=['omega0'])

In [None]:
len(df)

In [None]:
# Keep only the runs with a score above 0.95.
mask = df['score'] > 0.95
# Explicit copy: new columns are added to `df` further down, which should not
# be done on a frame that pandas still tracks as a selection of another frame
# (SettingWithCopyWarning).
df = df.loc[mask].copy()

In [None]:
important

## Bis system

In [None]:
#data = df[list(set(important) | set(['omega0','B_1','B_2',]))].copy()
#units = description.loc[data.columns]['unit']
#data['ship_speed']*=1.852/3.6
#units['ship_speed']=r'm/s'
#
#data['g']=9.81
#data['rho']=1000
#units['g']=r'm/s**2'
#units['rho']=r'kg/m**3'
#
#data['omega0_hat'] = data['omega0']*np.sqrt(data['beam']/(2*data['g']))
#units['omega0_hat'] = '-'
#
#bis_system = BisSystem(lpp=data['lpp'], volume=data['Volume'], units=units)
#data = bis_system.df_to_bis(data)
#

In [None]:
# Select every important column once, in first-seen order. list(set(...))
# gave a column order that could differ from one kernel session to the next.
# NOTE(review): `data` rebinds the name of the module imported with `import data`.
data = df[list(dict.fromkeys(important))].copy()

In [None]:
df.hist('score', bins = 50)

In [None]:
df.hist('omega0', bins = 50)

In [None]:
Math(vlatex(equations.B44_hat_equation))

In [None]:
B44_hat_equation_quadratic = equations.B44_hat_equation.subs(symbols.B_44,sp.solve(equations.b44_quadratic_equation,symbols.B_44)[0])
Math(vlatex(B44_hat_equation_quadratic))

In [None]:
B44_lambda = lambdify(sp.solve(B44_hat_equation_quadratic,symbols.B_44_hat)[0])

In [None]:
B44_lambda

In [None]:
equations.B_1_hat_equation

In [None]:
equations.B_2_hat_equation

In [None]:
# Solve the two equations for B_1_hat and B_2_hat and turn the solutions into
# callables.
B_1_hat_expression = sp.solve(equations.B_1_hat_equation, symbols.B_1_hat)[0]
B_2_hat_expression = sp.solve(equations.B_2_hat_equation, symbols.B_2_hat)[0]
B_1_hat_lambda = lambdify(B_1_hat_expression)
B_2_hat_lambda = lambdify(B_2_hat_expression)

# Extra columns set before evaluating the callables on `df`
# (presumably gravitational acceleration and water density -- confirm units).
df['g'] = 9.81
df['rho'] = 1000
df['Disp'] = df['Volume']

# Evaluate both callables with the columns of `df` as inputs.
for _column, _function in (('B_1_hat', B_1_hat_lambda), ('B_2_hat', B_2_hat_lambda)):
    df[_column] = run(function=_function, inputs=df)

In [None]:
omega_hat_equation = equations.omega_hat_equation.subs(symbols.omega,symbols.omega0)

In [None]:
omega_hat_equation

In [None]:
omega0_lambda = lambdify(sp.solve(omega_hat_equation,symbols.omega_hat)[0])

In [None]:
df['omega0_hat']=run(function=omega0_lambda,inputs=df)

In [None]:
df.hist('omega0_hat', bins = 50)

In [None]:
df.hist('B_1_hat', bins = 50)

In [None]:
df.hist('B_2_hat', bins = 50)

In [None]:
X=data

In [None]:
variance_treshold = VarianceThreshold(0.001)
X_ = variance_treshold.fit_transform(X)

In [None]:
X.shape

In [None]:
X_.shape

In [None]:
X_ = variance_treshold.fit_transform(X)
X_.shape

In [None]:
X.columns[variance_treshold.get_support()]

In [None]:
#GM = data['gm']
#rxx = data['KXX']
#kxx = rxx/data['beam']
#rho = data['rho']
#m = rho*data['Volume']
#Ixx = m*rxx**2
#data['Ixx']=Ixx
#if not 'Ixx' in important:
#    important.append('Ixx')
#g = data['g']
#omega0 = data['omega0']
#data['Ixx_tot'] = Ixx_tot = GM*g*m/(omega0**2)
#data['Ixx_added'] = Ixx_added = Ixx_tot - Ixx

In [None]:
#mask = data['Ixx_added']>0
#data = data.loc[mask].copy()

In [None]:
# Only unique values, in first-seen order (list(set(...)) gave an order that
# could change between kernel sessions and therefore between saved files).
important = list(dict.fromkeys(important))
# Independent copy: X is renamed and extended in the following cells, which
# should not be done on a selection of `data`.
X = data[important].copy()

In [None]:
# Mapping from database column name to the name used in the exported features.
# Names that are not columns of X are ignored by rename (its default).
renamers = {
    'CP' : 'C_p', 
    'IRUD' : 'I_RUD', 
    'BKL' : 'BK_L', 
    'gm' : 'GM', 
    'A0' : 'A_0', 
    'ship_type_id' : 'ship_type_id', 
    'Volume' : 'Disp', 
    'Ixx' : 'I_xx', 
    'BKB' : 'BK_B',
    'KXX' : 'K_xx', 
    'RH' : 'R_h', 
    'AR' : 'A_R', 
    'TWIN' : 'TWIN', 
    'kg': 'kg', 
    'CW' : 'C_W', 
    'beam' : 'beam', 
    'TF' : 'T_F', 
    'ship_speed' : 'V', 
    'TA' : 'T_A',
    'lpp' : 'L_pp',
}
# Rebind X to the renamed frame instead of renaming in place: the result is a
# new, independent frame, so later column assignments do not act on a
# selection of `data`.
X = X.rename(columns=renamers)

In [None]:
# Replace the columns T_A and T_F by their mean, stored as T (appended last).
# Built as a new frame rather than assigning into / dropping from X in place,
# which avoids chained-assignment warnings when X is a selection of `data`.
X = (
    X
    .assign(T=(X['T_A'] + X['T_F'])/2)
    .drop(columns=['T_A','T_F'])
)

In [None]:
X.head()

In [None]:
# Remove ship_type_id from the features; rebinding X instead of dropping in
# place keeps the operation free of in-place mutation of a derived frame.
X = X.drop(columns='ship_type_id')

In [None]:
X.to_pickle('data.sav')

In [None]:
# Target columns, copied out of `df` and written to 'y.sav' as a pickle.
y_s = ['omega0_hat', 'B_1_hat', 'B_2_hat']
y = df.loc[:, y_s].copy()
y.to_pickle('y.sav')