# $\hat{B_1}$ damping regression with a little help from Ikeda

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
from jupyterthemes import jtplot
jtplot.style(theme='onedork', context='notebook', ticks=True, grid=False)

In [None]:
import pandas as pd
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
pd.set_option("display.max_columns", None)
import numpy as np
import os
import matplotlib.pyplot as plt
from pylab import rcParams
rcParams['figure.figsize'] = 15, 5

#import data
import copy
from rolldecay.bis_system import BisSystem
from rolldecay import database

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

import signal_lab
from sqlalchemy.inspection import inspect
import seaborn as sns
import docs
import pickle
from rolldecayestimators.polynom_estimator import Polynom
from rolldecayestimators import symbols
from rolldecay import database
import rolldecayestimators.simplified_ikeda as simplified_ikeda
from rolldecayestimators.ikeda_estimator import IkedaQuadraticEstimator
import rolldecayestimators.lambdas as lambdas
from rolldecayestimators.substitute_dynamic_symbols import run, lambdify
import rolldecayestimators.equations as equations
import rolldecayestimators.symbols as symbols
import sympy as sp
from rolldecay.paper_writing import save_fig
from rolldecay.froude_scaling import froude_scale
from sklearn.metrics import r2_score

In [None]:
# Load two pickled objects from the working directory.
# NOTE(review): `data` is rebound to `df_compare.copy()` in a later cell, so the
# later cells that pair `data` with `y_s` see the merged frame rather than this
# pickle when the notebook is run top to bottom - confirm that is intended.
data = pd.read_pickle('data.sav')
y_s = pd.read_pickle('y.sav')

In [None]:
db = database.get_db()

In [None]:
# Load the simplified-Ikeda table and the roll-decay ('rolldecay_cubic_b') table,
# each with its own score limit and the same exclusion table.
df_ikeda = database.load(rolldecay_table_name='rolldecay_simplified_ikeda', limit_score=0.5, 
                             exclude_table_name='rolldecay_exclude')

df_rolldecay = database.load(rolldecay_table_name='rolldecay_cubic_b', limit_score=0.9, 
                             exclude_table_name='rolldecay_exclude')

# Convert ship_speed by 1.852/3.6 (presumably knots -> m/s; the unit label is set to 'm/s' below).
df_rolldecay['ship_speed']*=1.852/3.6
df_ikeda['ship_speed']*=1.852/3.6

# Column descriptions/units, patched to match the conversions done in this cell.
description = pd.read_sql_table('description', con=db.engine, index_col='id')
description.loc['ship_speed','unit']='m/s'
description.loc['VDES','unit']='m/s'
# NOTE(review): 'discplacement' is misspelled in the description text.
description.loc['Disp'] = {'description':'Ship discplacement','unit':'m3'}

# Trim angle from the aft/fore draught difference over lpp; keep only rows with
# |trim| < 0.3 degrees. The filter is applied to df_rolldecay only.
T_f=df_rolldecay['TF']
T_a=df_rolldecay['TA']
L_pp=df_rolldecay['lpp']
df_rolldecay['trim']=np.arctan((T_a-T_f)/L_pp)
mask = df_rolldecay['trim'].abs() < np.deg2rad(0.3)
df_rolldecay=df_rolldecay.loc[mask].copy()

# 'Disp' is a copy of 'Volume' in both frames.
df_ikeda['Disp']=df_ikeda['Volume']
df_rolldecay['Disp']=df_rolldecay['Volume']

# First scaling pass.
# NOTE(review): froude_scale is called again at the end of this cell after
# 'scale_factor' has been overwritten with lpp - presumably the first pass uses
# the scale factor stored in the tables; confirm.
skip=[]
df_ikeda = froude_scale(data=df_ikeda, description=description, skip=skip)
df_rolldecay = froude_scale(data=df_rolldecay, description=description, skip=skip)

# Constants passed to the lambdas below: roll amplitude of 3 degrees, g and rho.
phi_a = np.deg2rad(3)
g = 9.81
rho=1000
# B_e and B_e_hat are evaluated for both frames with the same constants.
df_ikeda['B_e'] = run(function=lambdas.B_e_lambda, inputs=df_ikeda, phi_a=phi_a)
df_ikeda['B_e_hat'] = run(function=lambdas.B_e_hat_lambda, inputs=df_ikeda, g=g, rho=rho)
df_rolldecay['B_e'] = run(function=lambdas.B_e_lambda, inputs=df_rolldecay, phi_a=phi_a)
df_rolldecay['B_e_hat'] = run(function=lambdas.B_e_hat_lambda, inputs=df_rolldecay, g=g, rho=rho)

df_rolldecay['omega0_hat'] = run(function=lambdas.omega0_lambda, inputs=df_rolldecay, g=g)
df_ikeda['omega0_hat'] = run(function=lambdas.omega0_lambda, inputs=df_ikeda, g=g)

# Second scaling pass with scale_factor set to lpp
# (presumably to express quantities relative to ship length - TODO confirm).
df_ikeda['scale_factor']=df_ikeda['lpp']
df_ikeda = froude_scale(data=df_ikeda, description=description, skip=skip)
df_rolldecay['scale_factor']=df_rolldecay['lpp']
df_rolldecay = froude_scale(data=df_rolldecay, description=description, skip=skip)

In [None]:
df_ikeda.head()

In [None]:
df_rolldecay.head()

In [None]:
# Pair each roll-decay row with the simplified-Ikeda row sharing its index;
# overlapping Ikeda column names receive the '_ikeda' suffix.
df_compare = df_rolldecay.merge(
    df_ikeda,
    how='inner',
    left_index=True,
    right_index=True,
    suffixes=('', '_ikeda'),
)

In [None]:
df_compare.plot(x='omega0_hat', y='omega0_hat_ikeda', style='o', alpha=0.5)

In [None]:
# B_e_hat from the roll-decay table against its simplified-Ikeda counterpart,
# on square axes starting at the origin with a red 1:1 line.
fig, ax = plt.subplots()
df_compare.plot(x='B_e_hat', y='B_e_hat_ikeda', style='o', alpha=0.5, ax=ax)

# Both axes share the larger of the two auto-scaled upper limits.
lim = max(ax.get_xlim()[1], ax.get_ylim()[1])
ax.set(xlim=(0, lim), ylim=(0, lim))
ax.plot([0, lim], [0, lim], 'r-')
ax.set_aspect('equal', adjustable='box')

In [None]:
df_compare['error'] = (df_compare['B_e_hat']-df_compare['B_e_hat_ikeda']).abs()

In [None]:
fig,ax=plt.subplots()
df_compare.plot(x='TA', y='error', style='o', alpha=0.5,ax=ax)
df_compare.plot(x='omega0_hat', y='error', style='o', alpha=0.5)

In [None]:
# Subset with TA > 0.035 and omega0_hat < 0.63, plotted against Ikeda
# on square axes with a red 1:1 line.
mask = (df_compare['TA'] > 0.035) & (df_compare['omega0_hat'] < 0.63)
df_compare_good = df_compare.loc[mask].copy()

fig, ax = plt.subplots()
df_compare_good.plot(x='B_e_hat', y='B_e_hat_ikeda', style='o', alpha=0.5, ax=ax)

# Both axes share the larger of the two auto-scaled upper limits.
lim = max(ax.get_xlim()[1], ax.get_ylim()[1])
ax.set(xlim=(0, lim), ylim=(0, lim))
ax.plot([0, lim], [0, lim], 'r-')
ax.set_aspect('equal', adjustable='box')
              



In [None]:
# R2 of the Ikeda prediction on the already-filtered subset. The frame is used
# directly: re-applying the notebook-global `mask` is redundant and breaks as
# soon as another cell has rebound `mask`.
r2_score(y_true=df_compare_good['B_e_hat'], y_pred=df_compare_good['B_e_hat_ikeda'])

In [None]:
df_ikeda.head()

In [None]:
# Build the regression frame from the merged data, add derived hull quantities
# and switch to symbol-style column names.
data = df_compare.copy()
data['T'] = (data['TA']+data['TF'] )/2  # mean of aft and fore draught
data['CB'] = data['Disp']/(data['lpp']*data['T']*data['beam'])  # block coefficient
data['OG'] = (-data.kg + data['T'])  # OG = T - kg
# 'Volume' is deliberately not renamed to 'Disp': a 'Disp' column already
# exists, and a second one would make data['Disp'] return a two-column frame.
renamers = {
    'CP' : 'C_p',
    'CB' : 'C_b',
    'IRUD' : 'I_RUD', 
    'BKL' : 'BK_L', 
    'gm' : 'GM', 
    'A0' : 'A_0', 
    'ship_type_id' : 'ship_type_id', 
    'Ixx' : 'I_xx', 
    'BKB' : 'BK_B',
    'KXX' : 'K_xx', 
    'RH' : 'R_h', 
    'AR' : 'A_R', 
    'TWIN' : 'TWIN', 
    'kg': 'kg', 
    'CW' : 'C_W', 
    'beam' : 'beam', 
    'TF' : 'T_F', 
    'ship_speed' : 'V', 
    'TA' : 'T_A',
    'lpp' : 'L_pp',
}
data.rename(columns=renamers, inplace=True)

In [None]:
ikeda_parameters = [
        'beam',
        'T',
        'BK_L',
        'BK_B',
        'OG',
        'omega0_hat',        
        'C_b',
        'A_0',
        'V']

In [None]:
data[ikeda_parameters].head()

## Pure polynomial regression on the Ikeda parameters

In [None]:
# Sweep the number of features kept by SelectKBest and record the 5-fold
# cross-validation score (mean and standard deviation) of a degree-2
# polynomial regression of B_e_hat on the Ikeda parameters.
variance_treshold = VarianceThreshold(0.000)
#standard_scaler = StandardScaler()


y = data['B_e_hat']
X = data[ikeda_parameters]

polynomial_features = PolynomialFeatures(degree=2)
linear_regression = LinearRegression()

ks = np.arange(1,11,1)
cv=5
scores = []
stds = []
for k in ks:
    select_k_best = SelectKBest(k=k, score_func=f_regression)
    steps=[
            ('polynomial_feature', polynomial_features),
            #('standard_scaler', standard_scaler),
            ('variance_treshold',variance_treshold),
            ('select_k_best',select_k_best),
            ('linear_regression', linear_regression)
    ]

    model = Pipeline(steps=steps)
    model.fit(X=X, y=y)
    # Cross-validate once and take both statistics from the same fold scores.
    cv_scores = cross_val_score(estimator=model,X=X,y=y,cv=cv)
    score = cv_scores.mean()
    std = cv_scores.std()

    scores.append(score)
    stds.append(std)

scores = np.array(scores)
stds = np.array(stds)

In [None]:
fig,ax = plt.subplots()
ax.plot(ks,scores-stds,'.-')
ax.plot(ks,scores,'.-')
ax.plot(ks,scores+stds,'.-')


In [None]:
# Final pipeline with k=10 selected features, fitted on all rows and
# scored with 5-fold cross-validation.
select_k_best = SelectKBest(k=10, score_func=f_regression)
steps=[
        ('polynomial_feature', polynomial_features),
        #('standard_scaler', standard_scaler),
        ('variance_treshold',variance_treshold),
        ('select_k_best',select_k_best),
        ('linear_regression', linear_regression)
]

model = Pipeline(steps=steps)
model.fit(X=X, y=y)
cv=5
# Cross-validate once and take both statistics from the same fold scores.
cv_scores = cross_val_score(estimator=model,X=X,y=y,cv=cv)
score = cv_scores.mean()
std = cv_scores.std()

In [None]:
score

In [None]:
std

In [None]:
polynom = Polynom(model=model, columns=X.columns, y_symbol=symbols.B_e_hat)
polynom.fit(X=X, y=y)
polynom.equation

In [None]:
data['B_e_regression_polynom'] = model.predict(X=X)


In [None]:
# Ikeda and polynomial-regression predictions against B_e_hat, on square axes
# starting at the origin with a red 1:1 line.
fig, ax = plt.subplots()
data.plot(x='B_e_hat', y=['B_e_hat_ikeda','B_e_regression_polynom'], style='o', alpha=0.4, ax=ax)

# Both axes share the larger of the two auto-scaled upper limits.
lim = max(ax.get_xlim()[1], ax.get_ylim()[1])
ax.set(xlim=(0, lim), ylim=(0, lim))
ax.plot([0, lim], [0, lim], 'r-')
ax.set_aspect('equal', adjustable='box')

In [None]:
# Isolate the rows whose polynomial prediction lies in the narrow band
# (0.0026, 0.0029) and plot them against B_e_hat.
fig,ax=plt.subplots()
mask = ((data['B_e_regression_polynom'] < 0.0029) & 
        (data['B_e_regression_polynom'] > 0.0026))
df_strange=data.loc[mask].copy()
# df_strange is already filtered; re-indexing it with the mask built on
# `data` is redundant and relies on index alignment.
df_strange.plot(x='B_e_hat', y=['B_e_regression_polynom'], ax=ax, style='o', alpha=0.4)

xlim = ax.get_xlim()
ylim = ax.get_ylim()
lim = np.max([xlim[1],ylim[1]])
ax.set_xlim(0,lim)
ax.set_ylim(0,lim)
ax.plot([0,lim],[0,lim],'r-')  # 1:1 reference line
ax.set_aspect('equal', 'box')
ax.grid()

In [None]:
df_strange.head()

In [None]:
df_strange['T'].max()

In [None]:
df_strange['omega0_hat'].min()

In [None]:
df_strange['V'].hist()

In [None]:
mask = ((data['V'].round(decimals=2)==0) )
data_zero = data.loc[mask].copy()
data_zero['error'] = data_zero['B_e_hat']-data_zero['B_e_hat_ikeda']

In [None]:


# Ikeda prediction against B_e_hat for the zero-speed rows, on square axes
# starting at the origin with a red 1:1 line and a grid.
fig, ax = plt.subplots()
data_zero.plot(x='B_e_hat', y=['B_e_hat_ikeda'], style='o', alpha=0.4, ax=ax)

# Both axes share the larger of the two auto-scaled upper limits.
lim = max(ax.get_xlim()[1], ax.get_ylim()[1])
ax.set(xlim=(0, lim), ylim=(0, lim))
ax.plot([0, lim], [0, lim], 'r-')
ax.set_aspect('equal', adjustable='box')
ax.grid()

In [None]:
data_zero.head()

In [None]:
data_zero.plot(x='T', y='error', style='o', alpha=0.4)
data_zero.plot(x='error', y=['B_W_HAT','B_E_HAT','B_F_HAT','B_BK_HAT'], style='o', alpha=0.4)

In [None]:
r2_score(y_true=data_zero['B_e_hat'], y_pred=data_zero['B_e_hat_ikeda'])

In [None]:
data_zero['B_e_hat_ikeda2'] = data_zero['B_e_hat_ikeda']-data_zero['B_W_HAT']

In [None]:
r2_score(y_true=data_zero['B_e_hat'], y_pred=data_zero['B_e_hat_ikeda2'])

In [None]:
# B_e_hat_ikeda2 (Ikeda total minus B_W_HAT) against B_e_hat for the zero-speed
# rows, on square axes with a red 1:1 line and a grid.
fig, ax = plt.subplots()
data_zero.plot(x='B_e_hat', y=['B_e_hat_ikeda2'], style='o', alpha=0.4, ax=ax)

# Both axes share the larger of the two auto-scaled upper limits.
lim = max(ax.get_xlim()[1], ax.get_ylim()[1])
ax.set(xlim=(0, lim), ylim=(0, lim))
ax.plot([0, lim], [0, lim], 'r-')
ax.set_aspect('equal', adjustable='box')
ax.grid()

In [None]:
# Extra candidate features on top of the Ikeda parameters.
additional_parameters = [
'C_p',
'C_b',
'I_RUD', 
'BK_L', 
'GM', 
'A_0',  
'K_xx', 
'A_R', 
'TWIN', 
'kg', 
'C_W', 
'beam', 
]

# Order-preserving de-duplicated union. A set() union would yield a column
# order that changes between interpreter sessions (string hash randomisation),
# making the selected features and the printed equation non-reproducible.
parameters = list(dict.fromkeys(ikeda_parameters + additional_parameters))

In [None]:
# Zero-speed data: sweep the number of features kept by SelectKBest and record
# the 5-fold cross-validation score (mean and standard deviation) of a
# degree-1 regression of B_e_hat on `parameters`. Rows with NaN are dropped.
variance_treshold = VarianceThreshold(0.000)

y_key='B_e_hat'
data_=data_zero[parameters+[y_key]].copy()
data_.dropna(inplace=True)
y = data_[y_key]
X = data_[parameters].copy()

polynomial_features = PolynomialFeatures(degree=1)
linear_regression = LinearRegression()

ks = np.arange(1,17,1)
cv=5
scores = []
stds = []
for k in ks:
    select_k_best = SelectKBest(k=k, score_func=f_regression)
    steps=[
            ('polynomial_feature', polynomial_features),
            ('variance_treshold',variance_treshold),
            ('select_k_best',select_k_best),
            ('linear_regression', linear_regression)
    ]

    model = Pipeline(steps=steps)
    model.fit(X=X, y=y)
    # Cross-validate once and take both statistics from the same fold scores.
    cv_scores = cross_val_score(estimator=model,X=X,y=y,cv=cv)
    score = cv_scores.mean()
    std = cv_scores.std()

    scores.append(score)
    stds.append(std)

scores = np.array(scores)
stds = np.array(stds)

In [None]:
np.max(scores)

In [None]:
fig,ax = plt.subplots()
ax.plot(ks,scores-stds,'.-')
ax.plot(ks,scores,'.-')
ax.plot(ks,scores+stds,'.-')
ax.grid()

In [None]:
index = np.argmax(scores)
k=ks[index]
k

In [None]:
# Zero-speed pipeline with k=5 selected features, fitted on all rows and
# scored with 5-fold cross-validation.
select_k_best = SelectKBest(k=5, score_func=f_regression)
steps=[
        ('polynomial_feature', polynomial_features),
        #('standard_scaler', standard_scaler),
        ('variance_treshold',variance_treshold),
        ('select_k_best',select_k_best),
        ('linear_regression', linear_regression)
]

model_zero = Pipeline(steps=steps)
model_zero.fit(X=X, y=y)
cv=5
# Cross-validate once and take both statistics from the same fold scores.
cv_scores = cross_val_score(estimator=model_zero,X=X,y=y,cv=cv)
score = cv_scores.mean()
std = cv_scores.std()

In [None]:
score

In [None]:
polynom_zero = Polynom(model=model_zero, columns=X.columns, y_symbol=symbols.B_e_hat)
polynom_zero.fit(X=X, y=y)
polynom_zero.equation

In [None]:
polynom_zero.score(X=X, y=y)

In [None]:
data_['B_e_hat_regression'] = polynom_zero.predict(data_[parameters])

In [None]:
# Zero-speed regression prediction against B_e_hat, on square axes starting at
# the origin with a red 1:1 line and a grid.
fig, ax = plt.subplots()
data_.plot(x='B_e_hat', y=['B_e_hat_regression'], style='o', alpha=0.4, ax=ax)

# Both axes share the larger of the two auto-scaled upper limits.
lim = max(ax.get_xlim()[1], ax.get_ylim()[1])
ax.set(xlim=(0, lim), ylim=(0, lim))
ax.plot([0, lim], [0, lim], 'r-')
ax.set_aspect('equal', adjustable='box')
ax.grid()

In [None]:
data['B_e_hat0']=polynom_zero.predict(data)

In [None]:
mask=data['V'].round(decimals=2)>0
data_speed=data.loc[mask].copy()

In [None]:
data_speed['B_e_hat0'].hist(bins=30)

In [None]:
#data_speed['speed_factor']=data_speed['B_e_hat']/(data_speed['B_e_hat0']*data_speed['V'])
data_speed['speed_factor']=data_speed['B_e_hat']/data_speed['B_e_hat0']

In [None]:
data_speed['speed_factor'].hist(bins=30)

In [None]:
mask = data_speed['speed_factor'] < data_speed['speed_factor'].quantile(0.96)
data_speed=data_speed.loc[mask].copy()

In [None]:
data_speed['speed_factor'].hist(bins=30)

In [None]:
# Order-preserving de-duplicated union of the candidate features plus
# 'B_L_HAT'. A set() union would give a column order that changes between
# interpreter sessions.
parameters = list(dict.fromkeys(ikeda_parameters + additional_parameters + ['B_L_HAT']))

In [None]:
# Speed data: sweep the number of features kept by SelectKBest and record the
# 5-fold cross-validation score (mean and standard deviation) of a degree-1
# regression of speed_factor on `parameters`. Rows with NaN are dropped.
variance_treshold = VarianceThreshold(0.000)

y_key='speed_factor'
data_=data_speed[parameters+[y_key]].copy()
data_.dropna(inplace=True)
y = data_[y_key]
X = data_[parameters].copy()

polynomial_features = PolynomialFeatures(degree=1)
linear_regression = LinearRegression()

ks = np.arange(1,10,1)
cv=5
scores = []
stds = []
for k in ks:
    select_k_best = SelectKBest(k=k, score_func=f_regression)
    steps=[
            ('polynomial_feature', polynomial_features),
            ('variance_treshold',variance_treshold),
            ('select_k_best',select_k_best),
            ('linear_regression', linear_regression)
    ]

    model = Pipeline(steps=steps)
    model.fit(X=X, y=y)
    # Cross-validate once and take both statistics from the same fold scores.
    cv_scores = cross_val_score(estimator=model,X=X,y=y,cv=cv)
    score = cv_scores.mean()
    std = cv_scores.std()

    scores.append(score)
    stds.append(std)

scores = np.array(scores)
stds = np.array(stds)

In [None]:
np.max(scores)

In [None]:
fig,ax = plt.subplots()
ax.plot(ks,scores-stds,'.-')
ax.plot(ks,scores,'.-')
ax.plot(ks,scores+stds,'.-')
ax.grid()

In [None]:
# Speed-factor pipeline with k=3 selected features, fitted on all rows and
# scored with 5-fold cross-validation.
select_k_best = SelectKBest(k=3, score_func=f_regression)
steps=[
        ('polynomial_feature', polynomial_features),
        #('standard_scaler', standard_scaler),
        ('variance_treshold',variance_treshold),
        ('select_k_best',select_k_best),
        ('linear_regression', linear_regression)
]

model_speed = Pipeline(steps=steps)
model_speed.fit(X=X, y=y)
cv=5
# Cross-validate once and take both statistics from the same fold scores.
cv_scores = cross_val_score(estimator=model_speed,X=X,y=y,cv=cv)
score = cv_scores.mean()
std = cv_scores.std()

In [None]:
score

In [None]:
# Wrap the fitted speed pipeline in a Polynom and show its equation.
# NOTE(review): the target fitted here is 'speed_factor', yet y_symbol is
# symbols.B_e_hat - presumably the equation's left-hand side is mislabelled; confirm.
polynom_speed = Polynom(model=model_speed, columns=X.columns, y_symbol=symbols.B_e_hat)
polynom_speed.fit(X=X, y=y)
polynom_speed.equation

In [None]:
data_speed['speed_factor_regression'] = polynom_speed.predict(data_speed)

In [None]:
# Regressed speed factor against the observed speed factor, on square axes
# starting at the origin with a red 1:1 line and a grid.
fig, ax = plt.subplots()
data_speed.plot(x='speed_factor', y=['speed_factor_regression'], style='o', alpha=0.4, ax=ax)

# Both axes share the larger of the two auto-scaled upper limits.
lim = max(ax.get_xlim()[1], ax.get_ylim()[1])
ax.set(xlim=(0, lim), ylim=(0, lim))
ax.plot([0, lim], [0, lim], 'r-')
ax.set_aspect('equal', adjustable='box')
ax.grid()

In [None]:
data['speed_factor_regression'] = polynom_speed.predict(data)

In [None]:
data['B_e_hat_speed_regression']=data['B_e_hat0']*data['speed_factor_regression'] 

In [None]:
data[['B_e_hat_speed_regression','B_e_hat0','speed_factor_regression','V','B_L_HAT']]

In [None]:
# Combined prediction (zero-speed estimate times speed factor) against B_e_hat,
# on square axes starting at the origin with a red 1:1 line and a grid.
fig, ax = plt.subplots()
data.plot(x='B_e_hat', y=['B_e_hat_speed_regression'], style='o', alpha=0.4, ax=ax)

# Both axes share the larger of the two auto-scaled upper limits.
lim = max(ax.get_xlim()[1], ax.get_ylim()[1])
ax.set(xlim=(0, lim), ylim=(0, lim))
ax.plot([0, lim], [0, lim], 'r-')
ax.set_aspect('equal', adjustable='box')
ax.grid()

In [None]:
mask = data['B_e_hat_speed_regression'].notnull()
data_ = data.loc[mask].copy()

r2_score(y_true=data_['B_e_hat'], y_pred=data_['B_e_hat_speed_regression'])

In [None]:
# Residual of the speed regression. `data_` was rebuilt from `data` just above
# and has no 'B_e_hat_regression' column; the prediction column on this frame
# is 'B_e_hat_speed_regression'.
data_['error'] = data_['B_e_hat'] - data_['B_e_hat_speed_regression'] 

In [None]:
# Row with the largest speed-regression prediction. 'B_e_hat_regression' does
# not exist on this frame; its prediction column is 'B_e_hat_speed_regression'.
data_.sort_values(by='B_e_hat_speed_regression', ascending=False).iloc[0]

## Combine methods

In [None]:
bad_mask=((data['T']<0.035) | (data['omega0_hat']>0.63))
data_bad = data.loc[bad_mask].copy()

In [None]:
r2_score(y_true=data_bad['B_e_hat'], y_pred=data_bad['B_e_hat_ikeda'])

In [None]:
# Plain copy of the Ikeda parameter list. Going through set() would give a
# column order that changes between interpreter sessions.
parameters = list(dict.fromkeys(ikeda_parameters))

In [None]:
# Rows selected by bad_mask: sweep the number of features kept by SelectKBest
# and record the 5-fold cross-validation score (mean and standard deviation)
# of a degree-2 polynomial regression of B_e_hat. Rows with NaN are dropped.
variance_treshold = VarianceThreshold(0.00)
#standard_scaler = StandardScaler()

y_key = 'B_e_hat'
data_=data_bad[parameters+[y_key]].copy()
data_.dropna(inplace=True)
y = data_['B_e_hat']
X = data_[parameters]
#X = data_[ikeda_parameters]

polynomial_features = PolynomialFeatures(degree=2)
linear_regression = LinearRegression()

ks = np.arange(1,13,1)
cv=5
scores = []
stds = []
for k in ks:
    select_k_best = SelectKBest(k=k, score_func=f_regression)
    steps=[
            ('polynomial_feature', polynomial_features),
            #('standard_scaler', standard_scaler),
            ('variance_treshold',variance_treshold),
            ('select_k_best',select_k_best),
            ('linear_regression', linear_regression)
    ]

    model = Pipeline(steps=steps)
    model.fit(X=X, y=y)
    # Cross-validate once and take both statistics from the same fold scores.
    cv_scores = cross_val_score(estimator=model,X=X,y=y,cv=cv)
    score = cv_scores.mean()
    std = cv_scores.std()

    scores.append(score)
    stds.append(std)

scores = np.array(scores)
stds = np.array(stds)

In [None]:
fig,ax = plt.subplots()
ax.plot(ks,scores-stds,'.-')
ax.plot(ks,scores,'.-')
ax.plot(ks,scores+stds,'.-')

In [None]:
# Pipeline with k=7 selected features, fitted on all rows of the current
# X, y and scored with 5-fold cross-validation.
select_k_best = SelectKBest(k=7, score_func=f_regression)
steps=[
        ('polynomial_feature', polynomial_features),
        #('standard_scaler', standard_scaler),
        ('variance_treshold',variance_treshold),
        ('select_k_best',select_k_best),
        ('linear_regression', linear_regression)
]

model = Pipeline(steps=steps)
model.fit(X=X, y=y)
cv=5
# Cross-validate once and take both statistics from the same fold scores.
cv_scores = cross_val_score(estimator=model,X=X,y=y,cv=cv)
score = cv_scores.mean()
std = cv_scores.std()

In [None]:
score

In [None]:
polynom = Polynom(model=model, columns=X.columns, y_symbol=symbols.B_e_hat)
polynom.fit(X=X, y=y)
polynom.equation

In [None]:
data['B_e_hat_combine'] = model.predict(data[parameters])
data.loc[~bad_mask,'B_e_hat_combine'] = data.loc[~bad_mask,'B_e_hat_ikeda']


In [None]:
r2_score(y_true=data['B_e_hat'], y_pred=data['B_e_hat_combine'])

In [None]:
# Ikeda and combined predictions against B_e_hat, on square axes starting at
# the origin with a red 1:1 line.
fig, ax = plt.subplots()
data.plot(x='B_e_hat', y=['B_e_hat_ikeda','B_e_hat_combine'], style='o', alpha=0.4, ax=ax)

# Both axes share the larger of the two auto-scaled upper limits.
lim = max(ax.get_xlim()[1], ax.get_ylim()[1])
ax.set(xlim=(0, lim), ylim=(0, lim))
ax.plot([0, lim], [0, lim], 'r-')
ax.set_aspect('equal', adjustable='box')

## Pure polynomial regression with additional parameters

In [None]:
additional_parameters = [
 'C_W',
 'K_xx',
 'I_RUD',
 'C_p',
 'A_0',
 'TWIN',
 'A_R',
]

In [None]:
# Order-preserving de-duplicated union of the Ikeda and additional parameters.
# A set() union would give a column order that changes between interpreter
# sessions.
parameters = list(dict.fromkeys(ikeda_parameters + additional_parameters))

In [None]:
# All rows: sweep the number of features kept by SelectKBest and record the
# 5-fold cross-validation score (mean and standard deviation) of a degree-2
# polynomial regression of B_e_hat on `parameters`. Rows with NaN are dropped.
variance_treshold = VarianceThreshold(0.00)
#standard_scaler = StandardScaler()

y_key = 'B_e_hat'
data_=data[parameters+[y_key]].copy()
data_.dropna(inplace=True)
y = data_['B_e_hat']
X = data_[parameters]
#X = data_[ikeda_parameters]

polynomial_features = PolynomialFeatures(degree=2)
linear_regression = LinearRegression()

ks = np.arange(1,11,1)
cv=5
scores = []
stds = []
for k in ks:
    select_k_best = SelectKBest(k=k, score_func=f_regression)
    steps=[
            ('polynomial_feature', polynomial_features),
            #('standard_scaler', standard_scaler),
            ('variance_treshold',variance_treshold),
            ('select_k_best',select_k_best),
            ('linear_regression', linear_regression)
    ]

    model = Pipeline(steps=steps)
    model.fit(X=X, y=y)
    # Cross-validate once and take both statistics from the same fold scores.
    cv_scores = cross_val_score(estimator=model,X=X,y=y,cv=cv)
    score = cv_scores.mean()
    std = cv_scores.std()

    scores.append(score)
    stds.append(std)

scores = np.array(scores)
stds = np.array(stds)

In [None]:
fig,ax = plt.subplots()
ax.plot(ks,scores-stds,'.-')
ax.plot(ks,scores,'.-')
ax.plot(ks,scores+stds,'.-')

In [None]:
# Pipeline with k=9 selected features, fitted on all rows of the current
# X, y and scored with 5-fold cross-validation.
select_k_best = SelectKBest(k=9, score_func=f_regression)
steps=[
        ('polynomial_feature', polynomial_features),
        #('standard_scaler', standard_scaler),
        ('variance_treshold',variance_treshold),
        ('select_k_best',select_k_best),
        ('linear_regression', linear_regression)
]

model = Pipeline(steps=steps)
model.fit(X=X, y=y)
cv=5
# Cross-validate once and take both statistics from the same fold scores.
cv_scores = cross_val_score(estimator=model,X=X,y=y,cv=cv)
score = cv_scores.mean()
std = cv_scores.std()

In [None]:
score

In [None]:
polynom = Polynom(model=model, columns=X.columns, y_symbol=symbols.B_e_hat)
polynom.fit(X=X, y=y)
polynom.equation

## Include Ikeda results

In [None]:
# Ikeda parameters plus the four Ikeda damping component columns, as an
# order-preserving de-duplicated list. A set() union would give a column order
# that changes between interpreter sessions.
parameters = list(dict.fromkeys(
    ikeda_parameters + ['B_BK_HAT','B_W_HAT','B_E_HAT','B_L_HAT']))

In [None]:
# Ikeda parameters plus Ikeda component columns: sweep the number of features
# kept by SelectKBest and record the 5-fold cross-validation score (mean and
# standard deviation) of a degree-2 polynomial regression of B_e_hat.
# Rows with NaN are dropped.
variance_treshold = VarianceThreshold(0.00)
#standard_scaler = StandardScaler()

y_key = 'B_e_hat'
data_=data[parameters+[y_key]].copy()
data_.dropna(inplace=True)
y = data_['B_e_hat']
X = data_[parameters]

polynomial_features = PolynomialFeatures(degree=2)
linear_regression = LinearRegression()

ks = np.arange(1,16,1)
cv=5
scores = []
stds = []
for k in ks:
    select_k_best = SelectKBest(k=k, score_func=f_regression)
    steps=[
            ('polynomial_feature', polynomial_features),
            #('standard_scaler', standard_scaler),
            ('variance_treshold',variance_treshold),
            ('select_k_best',select_k_best),
            ('linear_regression', linear_regression)
    ]

    model = Pipeline(steps=steps)
    model.fit(X=X, y=y)
    # Cross-validate once and take both statistics from the same fold scores.
    cv_scores = cross_val_score(estimator=model,X=X,y=y,cv=cv)
    score = cv_scores.mean()
    std = cv_scores.std()

    scores.append(score)
    stds.append(std)

scores = np.array(scores)
stds = np.array(stds)

In [None]:
fig,ax = plt.subplots()
ax.plot(ks,scores-stds,'.-')
ax.plot(ks,scores,'.-')
ax.plot(ks,scores+stds,'.-')

In [None]:
# Pipeline with k=14 selected features, fitted on all rows of the current
# X, y and scored with 5-fold cross-validation.
select_k_best = SelectKBest(k=14, score_func=f_regression)
steps=[
        ('polynomial_feature', polynomial_features),
        ('variance_treshold',variance_treshold),
        ('select_k_best',select_k_best),
        ('linear_regression', linear_regression)
]

model = Pipeline(steps=steps)
model.fit(X=X, y=y)
cv=5
# Cross-validate once and take both statistics from the same fold scores.
cv_scores = cross_val_score(estimator=model,X=X,y=y,cv=cv)
score = cv_scores.mean()
std = cv_scores.std()

In [None]:
score

In [None]:
polynom = Polynom(model=model, columns=X.columns, y_symbol=symbols.B_e_hat)
polynom.fit(X=X, y=y)
polynom.equation

In [None]:
# Sweep k = 1..29 for a degree-2 pipeline and collect the mean 5-fold CV score.
variance_treshold = VarianceThreshold(0.001)
standard_scaler = StandardScaler()


# NOTE(review): y now comes from the pickled `y_s`, while X is not assigned in
# this cell (both assignments below are commented out) and is therefore whatever
# the previous section left behind - presumably the rows of X and y no longer
# correspond; confirm before trusting these scores.
y = y_s['B_e_hat']
#X = data[important]
#X=data[ikeda_parameters]
       
polynomial_features = PolynomialFeatures(degree=2)
linear_regression = LinearRegression()

ks = np.arange(1,30)
scores = []
for k in ks:
    select_k_best = SelectKBest(k=k, score_func=f_regression)

    steps=[
        ('polynomial_feature', polynomial_features),
        #('standard_scaler', standard_scaler),
        ('variance_treshold',variance_treshold),
        ('select_k_best',select_k_best),
        ('linear_regression', linear_regression)
    ]
    
    model = Pipeline(steps=steps)
    model.fit(X=X, y=y)
    scores.append(cross_val_score(estimator=model,X=X,y=y,cv=5).mean())

In [None]:
fig,ax=plt.subplots()
ax.plot(ks,scores)

In [None]:
polynomial_features = PolynomialFeatures(degree=2)
linear_regression = LinearRegression()


scores = []
select_k_best = SelectKBest(k=10, score_func=f_regression)

steps=[
    ('polynomial_feature', polynomial_features),
    #('standard_scaler', standard_scaler),
    ('variance_treshold',variance_treshold),
    ('select_k_best',select_k_best),
    ('linear_regression', linear_regression)
]

model = Pipeline(steps=steps)
model.fit(X=X, y=y)
cross_val_score(estimator=model,X=X,y=y,cv=5).mean()

In [None]:
df_ikeda.head()

In [None]:
X2 = X.copy()
X2['B_1_hat'] = df_ikeda['B_1_hat']
X2['B_2_hat'] = df_ikeda['B_2_hat']

polynomial_features = PolynomialFeatures(degree=2)
linear_regression = LinearRegression()


scores = []
select_k_best = SelectKBest(k=10, score_func=f_regression)

steps=[
    ('polynomial_feature', polynomial_features),
    #('standard_scaler', standard_scaler),
    ('variance_treshold',variance_treshold),
    ('select_k_best',select_k_best),
    ('linear_regression', linear_regression)
]

model = Pipeline(steps=steps)
model.fit(X=X2, y=y)
cross_val_score(estimator=model,X=X2,y=y,cv=5).mean()

In [None]:
r2_score(y_true=df_compare['B_e_hat'], y_pred=df_compare['B_e_hat_ikeda'])

In [None]:
# Distribution of the selected target column of y_s.
y_key = 'B_1_hat'

fig,ax=plt.subplots()
y_s[y_key].hist(bins=50, ax=ax)
ax.set_title('Histogram: %s' % y_key)  # fixed misspelled figure title

In [None]:
df_ikeda = database.load(rolldecay_table_name='rolldecay_simplified_ikeda', limit_score=0.90, 
                             exclude_table_name='rolldecay_exclude')

In [None]:
data.head()

In [None]:
# Sweep k = 1..29 for a scaled degree-2 pipeline predicting y_s[y_key].
variance_treshold = VarianceThreshold(0.000)
standard_scaler = StandardScaler()


y = y_s[y_key]
#X = data[important]
# X is every column of `data` except the two dropped here.
X=data.drop(columns=['I_RUD','TWIN']).copy()  # Handle categorical data later
       
polynomial_features = PolynomialFeatures(degree=2)
linear_regression = LinearRegression()

ks = np.arange(1,30)
scores = []
for k in ks:
    select_k_best = SelectKBest(k=k, score_func=f_regression)

    steps=[
        ('polynomial_feature', polynomial_features),
        ('standard_scaler', standard_scaler),
        ('variance_treshold',variance_treshold),
        ('select_k_best',select_k_best),
        ('linear_regression', linear_regression)
    ]
    
    model = Pipeline(steps=steps)
    model.fit(X=X, y=y)
    # NOTE(review): this is the score on the same rows the model was fitted on
    # (the cross-validated variant is commented out below), so it cannot
    # reveal over-fitting as k grows.
    scores.append(model.score(X=X, y=y))
    #scores.append(cross_val_score(estimator=model,X=X,y=y,cv=4).mean())
    
    

In [None]:
fig,ax=plt.subplots()
ax.plot(ks,scores)

In [None]:
variance_treshold = VarianceThreshold(0.000)
standard_scaler = StandardScaler()
select_k_best = SelectKBest(k=20, score_func=f_regression)

y = y_s[y_key]
#X = data[important]
X=data.drop(columns=['I_RUD','TWIN']).copy()  # Handle categorical data later
       
polynomial_features = PolynomialFeatures(degree=2)
linear_regression = LinearRegression()

steps=[
    ('polynomial_feature', polynomial_features),
    ('standard_scaler', standard_scaler),
    ('variance_treshold',variance_treshold),
    ('select_k_best',select_k_best),
    ('linear_regression', linear_regression)
]

model = Pipeline(steps=steps)
#cross_val_score(estimator=model,X=X,y=y,cv=5).mean()
model.fit(X=X, y=y)
model.score(X=X, y=y)

In [None]:
fig,ax = plt.subplots()
ax.plot(y,model.predict(X),'o', alpha=0.5)
ax.set_title('Prediction of %s' % y_key)
ax.set_xlabel('test: %s' % y_key)
ax.set_ylabel('predicted: %s' % y_key)

In [None]:
polynom = Polynom(model=model, columns=X.columns, y_symbol=symbols.B_1_hat)
polynom.fit(X=X, y=y)
polynom.equation

In [None]:
# Grid search over the number of selected features (k) and the polynomial
# degree; `results` holds one row per (k, degree) with the mean 5-fold CV score.
ks = np.arange(1,10)
degrees = np.arange(1,3)

variance_treshold = VarianceThreshold(0.0001)
standard_scaler = StandardScaler()

y = y_s[y_key]
#X = data[important]
X=data.drop(columns=['I_RUD','TWIN']).copy()  # Handle categorical data later

# Rows are collected in a list and turned into a frame once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
rows = []
for k in ks:
    for degree in degrees:
        select_k_best = SelectKBest(k=k, score_func=f_regression)

        polynomial_features = PolynomialFeatures(degree=degree)
        linear_regression = LinearRegression()

        steps=[
            ('polynomial_feature', polynomial_features),
            ('standard_scaler', standard_scaler),
            ('variance_treshold',variance_treshold),
            ('select_k_best',select_k_best),
            ('linear_regression', linear_regression)
        ]

        model = Pipeline(steps=steps)
        score = cross_val_score(estimator=model,X=X,y=y,cv=5).mean()
        rows.append({'k': k, 'degree': degree, 'score': score})

results = pd.DataFrame(rows, columns=['k', 'degree', 'score'])
        
        

In [None]:
results.sort_values(by='score', ascending=False, inplace=True)

In [None]:
results.head()

In [None]:
k = int(results.iloc[0]['k'])
degree = int(results.iloc[0]['degree'])

In [None]:
select_k_best = SelectKBest(k=k, score_func=f_regression)

standard_scaler = StandardScaler()
polynomial_features = PolynomialFeatures(degree=degree)
linear_regression = LinearRegression()

steps=[
    ('polynomial_feature', polynomial_features),
    ('standard_scaler', standard_scaler),
    ('variance_treshold',variance_treshold),
    ('select_k_best',select_k_best),
    ('linear_regression', linear_regression)
]

model = Pipeline(steps=steps)

In [None]:
# Hold-out check: fit on 80 % of the rows and score on the remaining 20 %.
# The split is seeded so the reported score is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
model.fit(X=X_train, y=y_train)
score = model.score(X=X_test, y=y_test)
score

In [None]:
fig,ax = plt.subplots()
ax.plot(y_test,model.predict(X_test),'.', alpha=0.5)
ax.set_title('Prediction of %s' % y_key)
ax.set_xlabel('test: %s' % y_key)
ax.set_ylabel('predicted: %s' % y_key)


In [None]:
fig,ax = plt.subplots()
ax.plot(y,model.predict(X),'o', alpha=0.6)
ax.set_title('Prediction of %s' % y_key)
ax.set_xlabel('test: %s' % y_key)
ax.set_ylabel('predicted: %s' % y_key)

In [None]:
# Refit on all rows, remember the feature column names on the model, and
# pickle it to disk.
import pickle
filename = 'B_1_hat_model.sav'
model.fit(X=X, y=y)
model.keys=list(X.columns)
# The context manager closes (and flushes) the file even if pickling fails.
with open(filename, 'wb') as file:
    pickle.dump(model, file)

In [None]:
model.score(X=X,y=y)

In [None]:
polynom = Polynom(model=model, columns=X.columns, y_symbol=symbols.B_1_hat)
polynom.fit(X=X, y=y)

score = polynom.score(X=X, y=y)
score

In [None]:
polynom.equation

In [None]:
polynom.save('B_1_hat_polynom.sym')

In [None]:
X.describe()

In [None]:
select_k_best = SelectKBest(k=9, score_func=f_regression)
polynomial_features = PolynomialFeatures(degree=2)
linear_regression = LinearRegression()
variance_treshold = VarianceThreshold(0.000001)

steps=[
    ('polynomial_feature', polynomial_features),
    ('standard_scaler', standard_scaler),
    ('variance_treshold',variance_treshold),
    ('select_k_best',select_k_best),
    ('linear_regression', linear_regression)
]

model = Pipeline(steps=steps)
score = cross_val_score(estimator=model,X=X,y=y,cv=5).mean()
score

In [None]:
model.fit(X=X,y=y)

In [None]:
model.score(X=X,y=y)

In [None]:
polynom = Polynom(model=model, columns=X.columns, y_symbol=symbols.B_1_hat)
polynom.fit(X=X, y=y)

In [None]:
polynom.equation

In [None]:
X.head()

In [None]:
# Zero-speed subset. Speed is rounded to two decimals before the comparison,
# the same criterion the earlier zero-speed / speed split in this notebook
# uses, instead of an exact floating-point equality.
mask = X['V'].round(decimals=2)==0
X_0=X.loc[mask]
y_0=y.loc[mask]

In [None]:
y_0.hist()

In [None]:
model.fit(X=X_0,y=y_0)

In [None]:
polynom = Polynom(model=model, columns=X.columns, y_symbol=symbols.B_1_hat)
polynom.fit(X=X_0, y=y_0)

In [None]:
polynom.equation

In [None]:
fig,ax = plt.subplots()
ax.plot(y_0,model.predict(X_0),'o', alpha=0.6)
ax.set_title('Prediction of %s' % y_key)
ax.set_xlabel('test: %s' % y_key)
ax.set_ylabel('predicted: %s' % y_key)

In [None]:
X['B_1_hat0']=model.predict(X=X)

In [None]:
select_k_best = SelectKBest(k=9, score_func=f_regression)
polynomial_features = PolynomialFeatures(degree=2)
linear_regression = LinearRegression()

steps=[
    ('polynomial_feature', polynomial_features),
    ('standard_scaler', standard_scaler),
    ('variance_treshold',variance_treshold),
    ('select_k_best',select_k_best),
    ('linear_regression', linear_regression)
]

model = Pipeline(steps=steps)
score = cross_val_score(estimator=model,X=X,y=y,cv=5).mean()
score

In [None]:
model.fit(X=X,y=y)

In [None]:
polynom = Polynom(model=model, columns=X.columns, y_symbol=symbols.B_1_hat)
polynom.fit(X=X, y=y)

In [None]:
polynom.equation

In [None]:
model.score(X=X,y=y)