# Data preparation for regression

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
from jupyterthemes import jtplot
jtplot.style(theme='onedork', context='notebook', ticks=True, grid=False)

In [None]:
import pandas as pd
# Show up to 999 rows when a DataFrame is displayed.
pd.options.display.max_rows = 999
# None removes the column limit altogether. The former assignment of 999 to
# display.max_columns was overridden by this call right away, so only the
# effective setting is kept.
pd.set_option("display.max_columns", None)
import numpy as np
import os
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 15, 5

import data
import copy
from rolldecay.bis_system import BisSystem
from rolldecay import database

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

import signal_lab
from sqlalchemy.inspection import inspect
import seaborn as sns
import docs

In [None]:
# Load the roll decay results from the 'rolldecay_direct' table.
# NOTE(review): limit_score / exclude_table_name presumably filter out
# low-score and explicitly excluded tests — confirm in rolldecay.database.
df_rolldecay = database.load(
    rolldecay_table_name='rolldecay_direct',
    limit_score=0.90,
    exclude_table_name='rolldecay_exclude',
)

# Database handle; its engine is used below to read the 'description' table.
db = database.get_db()

In [None]:
df_rolldecay.columns

In [None]:
df_rolldecay.describe()

In [None]:
# Number of missing values per column, most incomplete columns first.
df_rolldecay.isna().sum().sort_values(ascending=False)

In [None]:
# Read the 'description' table, indexed by its 'id' column. Later cells look
# rows up by column name (description.loc[interesting]) and read the 'unit'
# column from it.
description = pd.read_sql_table('description', con=db.engine, index_col='id')

In [None]:
description

In [None]:
# Candidate columns of the roll decay table considered for the regression.
# The last four entries (zeta, d, omega0, mean_damping) are removed from the
# feature list further down; omega0, zeta and d feed the target frame.
interesting = [
    'A0', 'AR', 'beam', 'BKB', 'BKL', 'BKX', 'BR', 'BRA', 'BTT1',
    'CP', 'CW', 'Volume', 'gm', 'HSKEG', 'ASKEG', 'IRUD', 'kg', 'KXX',
    'lcg', 'lpp', 'LSKEG', 'RH', 'RHBL', 'RTYPE', 'TA', 'TF', 'TWIN',
    'ship_type_id', 'ship_speed',
    'zeta', 'd', 'omega0', 'mean_damping',
]

In [None]:
description.loc[interesting]

In [None]:
# True for rows where every column in `interesting` has a value; the sum is
# the number of such complete rows.
mask = df_rolldecay[interesting].notna().all(axis='columns')
mask.sum()

In [None]:
df_rolldecay[interesting].isnull().sum()

In [None]:
# Columns taken out of the candidate list before the complete-row mask is
# rebuilt: the last four are the damping/frequency results rather than
# features, and the others are dropped so that more rows survive the
# "all values present" filter (see the null counts shown above).
remove = ['BTT1', 'BR', 'BRA', 'HSKEG', 'ASKEG', 'LSKEG', 'RHBL', 'RTYPE',
          'lcg', 'BKX', 'zeta', 'd', 'omega0', 'mean_damping']
add = ['ship_speed']

# Keep the order of `interesting` and append `add` without duplicates.
# 'ship_speed' is not in `remove`, so plain concatenation listed it twice and
# df_rolldecay[important] then selected the same column twice. dict.fromkeys
# de-duplicates while preserving first-seen order, unlike a set.
important = list(dict.fromkeys(
    [name for name in interesting if name not in remove] + add
))


In [None]:
# True for rows where every column in `important` has a value; the sum is
# the number of rows that remain usable with the reduced column list.
mask = df_rolldecay[important].notna().all(axis='columns')
mask.sum()

In [None]:
df_rolldecay[important].isnull().sum()

In [None]:
# Keep only the complete rows selected by `mask`, as an independent copy.
df = df_rolldecay[mask].copy()

In [None]:
# omega0 is needed further down, so rows without it are discarded.
df = df.loc[df['omega0'].notna()]

In [None]:
len(df)

In [None]:
# Keep only rows whose 'score' is strictly above 0.95.
mask = df['score'].gt(0.95)
df = df[mask]

In [None]:
important

In [None]:
# Feature columns plus the four damping/frequency columns, de-duplicated in a
# stable order (a set gives a different column order from session to session).
# NOTE: this rebinds `data`, shadowing the `data` module imported earlier.
data = df[list(dict.fromkeys(important + ['zeta', 'd', 'omega0', 'mean_damping']))].copy()

# Take the 'unit' column in a single .loc call and copy it, so the
# assignments below modify an independent Series rather than the result of
# chained indexing on `description`.
units = description.loc[data.columns, 'unit'].copy()

# 1.852/3.6 is the knots -> m/s factor, so ship_speed is presumably stored in
# knots — confirm against the database.
data['ship_speed'] *= 1.852 / 3.6
units['ship_speed'] = r'm/s'

# Constant gravity and density columns, with their units, so they go through
# the same conversion as the measured columns.
data['g'] = 9.81
data['rho'] = 1000
units['g'] = r'm/s**2'
units['rho'] = r'kg/m**3'

# Natural frequency scaled with sqrt(beam / (2 g)); registered as unitless.
data['omega0_hat'] = data['omega0'] * np.sqrt(data['beam'] / (2 * data['g']))
units['omega0_hat'] = '-'

# NOTE(review): df_to_bis presumably converts each column according to its
# entry in `units`, using lpp and Volume as scales — confirm in
# rolldecay.bis_system.
bis_system = BisSystem(lpp=data['lpp'], volume=data['Volume'], units=units)
data = bis_system.df_to_bis(data)


In [None]:
df.hist('score', bins = 50)

In [None]:
data.hist('omega0', bins = 50)

In [None]:
data.hist('omega0_hat', bins = 50)

In [None]:
data.hist('zeta', bins = 50)

In [None]:
data.hist('d', bins = 50)

In [None]:
data.hist('mean_damping', bins = 50)

In [None]:
# Everything except the four damping/frequency result columns is treated as a
# candidate feature for the variance check below.
X = data.drop(['zeta', 'd', 'omega0', 'mean_damping'], axis='columns')

In [None]:
# Drop features whose variance does not exceed 0.001.
# (The misspelled name is kept because later cells refer to it.)
variance_treshold = VarianceThreshold(threshold=0.001)
X_ = variance_treshold.fit(X).transform(X)

In [None]:
X.shape

In [None]:
X_.shape

In [None]:
# The selector was already fitted on X when it was created, so apply the
# learned column mask with transform() instead of fitting a second time.
X_ = variance_treshold.transform(X)
X_.shape

In [None]:
X.columns[variance_treshold.get_support()]

In [None]:
# Inputs read from the prepared frame.
GM = data['gm']
rxx = data['KXX']
kxx = rxx / data['beam']
rho = data['rho']
g = data['g']
omega0 = data['omega0']

# Mass from density and displaced volume; Ixx from mass and KXX.
m = rho * data['Volume']
Ixx = m * rxx**2
data['Ixx'] = Ixx
if 'Ixx' not in important:
    important.append('Ixx')

# Total inertia from omega0**2 = GM*g*m / Ixx_tot; the part beyond Ixx is
# stored as the added inertia.
Ixx_tot = GM * g * m / omega0**2
data['Ixx_tot'] = Ixx_tot
Ixx_added = Ixx_tot - Ixx
data['Ixx_added'] = Ixx_added

In [None]:
# Keep only rows with a strictly positive added inertia.
mask = data['Ixx_added'].gt(0)
data = data[mask].copy()

In [None]:
# Only unique values, keeping first-seen order: list(set(...)) would give the
# saved feature matrix a different column order from session to session.
important = list(dict.fromkeys(important))

# Copy so that X is an independent frame; it is renamed and edited column-wise
# afterwards, which on a plain selection of `data` raises
# SettingWithCopyWarning.
X = data[important].copy()

In [None]:
# Column name -> symbol used in the saved feature matrix. Identity entries are
# kept so the mapping lists every feature column explicitly.
renamers = {
    # main dimensions, draughts and speed
    'lpp': 'L_pp',
    'beam': 'beam',
    'TA': 'T_A',
    'TF': 'T_F',
    'Volume': 'Disp',
    'ship_speed': 'V',
    # coefficients and areas
    'CP': 'C_p',
    'CW': 'C_W',
    'A0': 'A_0',
    'AR': 'A_R',
    # stability and inertia
    'gm': 'GM',
    'kg': 'kg',
    'KXX': 'K_xx',
    'Ixx': 'I_xx',
    # remaining columns
    'BKL': 'BK_L',
    'BKB': 'BK_B',
    'RH': 'R_h',
    'IRUD': 'I_RUD',
    'TWIN': 'TWIN',
    'ship_type_id': 'ship_type_id',
}
# Rebind to the renamed frame instead of renaming in place: X was selected
# from `data`, and rebinding gives a fresh frame, so later column assignments
# on X do not trigger SettingWithCopyWarning.
X = X.rename(columns=renamers)

In [None]:
# Replace T_A and T_F (presumably aft and forward draught — confirm) by their
# mean 'T'. assign()/drop() return a new frame, so nothing is written into a
# frame that may still be a selection of `data`. 'T' ends up as the last
# column, as before.
X = X.assign(T=(X['T_A'] + X['T_F']) / 2).drop(columns=['T_A', 'T_F'])

In [None]:
X.head()

In [None]:
# Save the feature matrix in pickle format to 'data.sav' in the working directory.
X.to_pickle('data.sav')

In [None]:
# Target columns, taken from the same filtered `data` as X so that the rows
# of both saved frames line up.
y_s = ['omega0_hat', 'zeta', 'd']
y = data.loc[:, y_s].copy()
y.to_pickle('y.sav')