# Regression on yacht hydrodynamics data set

![](https://www.blur.se/images/df4.jpg)
I have previously fitted a polynomial to this dataset, which I wrote about on [my blog](https://martinlarsalbert.github.io/blog/machine%20learning/regression/2020/08/17/regression_yacht_hydrodynamics_data_set.html).





In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import altair as alt
from io import StringIO
import re
import urllib

## Load dataset
[Dataset](https://archive.ics.uci.edu/ml/datasets/Yacht+Hydrodynamics) from the Technical University of Delft.

In [None]:
#collapse
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so import it explicitly before using `urllib.request.urlopen`.
import urllib.request

# Column names, assigned to the raw file's columns in this order.
# The last one ('r') is the regression target, the others are the features.
columns = [
    'lcg',
    'cp',
    'volume',
    'b/d',
    'l/b',
    'fn',
    'r',
]

data_url = r'http://archive.ics.uci.edu/ml/machine-learning-databases/00243/yacht_hydrodynamics.data'
with urllib.request.urlopen(data_url) as file:
    s_raw = file.read().decode("utf-8")

# remove some dirt:
# 1) drop the space that sits directly in front of a line break ...
s1 = re.sub(r' \n', '\n', s_raw)
# 2) ... and collapse runs of spaces so that one space is the only separator.
s2 = re.sub(r' +', ' ', s1)
s = s2

data = pd.read_csv(StringIO(s), sep=' ', encoding='utf-8', names=columns)

label = 'r'
# Keep the features in file order. Building this list from a set difference
# made the column order vary from one interpreter run to the next.
features = [column for column in columns if column != label]

In [None]:
# Display summary statistics for the columns of the loaded data set.
data.describe()

In [None]:
# Display the dtype pandas inferred for each column.
data.dtypes

In [None]:
# Per column: True if at least one value is missing.
data.isnull().any()

In [None]:
# Plot pairwise relationships between all columns (features and target).
sns.pairplot(data=data)

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature matrix and target vector, copied out of `data`.
X = data.loc[:, features].copy()
y = data.loc[:, label].copy()

# Hold out 20 % of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling from the training rows only, then apply the same
# scaling to both the training and the test rows.
standard_scaler = StandardScaler().fit(X_train)
X_train_scaled = standard_scaler.transform(X_train)
X_test_scaled = standard_scaler.transform(X_test)

In [None]:
# The target is a continuous quantity, so a regressor is required here;
# a classifier would treat each distinct target value as a class (or
# reject the labels outright).
from xgboost import XGBRegressor

model = XGBRegressor()
model.fit(X_train_scaled, y_train)

# One row per feature, most important feature first.
importances = pd.DataFrame(data={
    'Attribute': X_train.columns,
    'Importance': model.feature_importances_
})
importances = importances.sort_values(by='Importance', ascending=False)

## Obtain importances from a tree-based model

In [None]:
# Bar chart of the feature importances, one bar per feature.
plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#087E8B')
# The importances come from the fitted tree-based model, not from
# regression coefficients, so the title says so.
plt.title('Feature importances obtained from a tree-based model', size=20)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class FeatureSelector(TransformerMixin, BaseEstimator):
    """Custom transformer that extracts the columns passed to its constructor.

    Parameters
    ----------
    feature_names : column name or (possibly nested) list of column names
        Columns to keep. A single name, a flat list such as ``['fn']`` and
        a nested list such as ``[['fn']]`` are all accepted.
    """

    def __init__(self, feature_names):
        # Stored unmodified under the constructor argument's own name:
        # BaseEstimator.get_params() (and therefore clone(), which
        # GridSearchCV relies on) looks the parameter up by that name.
        self.feature_names = feature_names

    @staticmethod
    def _flatten(names):
        """Return ``names`` as a flat list, unwrapping nested lists/tuples."""
        if not isinstance(names, (list, tuple)):
            return [names]
        return [
            leaf
            for name in names
            for leaf in FeatureSelector._flatten(name)
        ]

    def fit(self, X, y=None):
        """Nothing to learn; return self so the transformer can be chained."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to the configured columns.

        ``X`` must support selection with a list of column names
        (``X[[...]]``); the result keeps that two-dimensional form.
        """
        return X[self._flatten(self.feature_names)]

In [None]:
# Selector for the Froude number column only. The column names are given
# as a flat list (the extra level of nesting was accidental).
fn_selector = FeatureSelector(feature_names=['fn'])
fn_selector.fit(X_train)
# Quick visual check of what the selector hands to the next pipeline step.
fn_selector.transform(X_train)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# Benchmark model: select columns -> polynomial expansion -> least squares.
# The degree given here is only a starting value; the grid search below
# overrides it.
polynomial_features = PolynomialFeatures(degree=2)
linear_regression = LinearRegression()

steps = [
    ('fn_selector', fn_selector),
    ('polynomial_features', polynomial_features),
    ('linear_regression', linear_regression),
]
model_benchmark = Pipeline(steps=steps)

# Candidate polynomial degrees: 1 to 5.
grid = {'polynomial_features__degree': list(range(1, 6))}

# 5-fold cross validation, repeated twice, scored with R^2.
cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=1)
search = GridSearchCV(
    estimator=model_benchmark,
    param_grid=grid,
    scoring='r2',
    n_jobs=-1,
    cv=cv,
)

# Run the search and keep the best pipeline it found.
search_result = search.fit(X_train, y_train)
model_benchmark = search_result.best_estimator_

In [None]:
# Display the pipeline that achieved the best cross-validated score.
search_result.best_estimator_

In [None]:
# Same model built by hand: a degree-4 polynomial followed by ordinary
# least squares, trained on the 'fn' column alone.
model_benchmark_ = Pipeline([
    ('polynomial_features', PolynomialFeatures(degree=4)),
    ('linear_regression', LinearRegression()),
])
model_benchmark_.fit(X=X_train.loc[:, ['fn']], y=y_train)

In [None]:
# Mean cross-validated R^2 (the search uses scoring='r2') of the best degree.
search_result.best_score_

In [None]:
# Score of the tuned pipeline on the held-out test set.
model_benchmark.score(X_test, y_test)

In [None]:
# Score of the hand-built degree-4 pipeline on the held-out test set,
# for comparison with the tuned pipeline.
model_benchmark_.score(X_test[['fn']], y_test)

In [None]:
# Compare the tuned pipeline's predictions with the true test values,
# both plotted against 'fn'.
y_pred = model_benchmark.predict(X_test)
fig, ax = plt.subplots()
x = X_test['fn']
ax.plot(x, y_test, '+', label='test')
ax.plot(x, y_pred, '.', label='prediction')
ax.set_xlabel('fn')
ax.set_ylabel('r')
# Without this call the `label=` arguments above are never shown.
ax.legend()


In [None]:
# Pull the fitted linear-regression step out of the hand-built pipeline
# and display the coefficients it learned for the polynomial terms.
l = model_benchmark_['linear_regression']
l.coef_

So a 4th-degree polynomial in the Froude number **fn** alone can give a very good prediction.