## Analyzing pilot boat

In [None]:
import src.data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Display settings: show up to 999 rows and never truncate columns.
# (max_columns=None means "no limit"; the earlier 999 setting was overridden anyway.)
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_columns", None)

In [None]:
# Load only a small part of the data: open a chunked reader and pull just its first
# 100000-row chunk, indexed by the timestamp column.
# NOTE(review): `reader` is left open after this cell; close it if no further chunks are needed.
reader = pd.read_csv(src.data.path_lots, iterator=True, chunksize=100000)
df_raw = next(reader).set_index('Timestamp [UTC]')

In [None]:
# Summary statistics of every column in the loaded chunk.
df_raw.describe()

In [None]:
# Keep only the columns that actually vary within the chunk (standard deviation > 0).
# numeric_only=True makes this robust to text/object columns: without it pandas >= 2.0
# raises, and older versions silently drop them so the mask no longer lines up with
# df_raw.columns. Selecting through the std index avoids that mismatch entirely.
column_std = df_raw.std(numeric_only=True)
interesting = column_std.index[column_std > 0]
df = df_raw[interesting].copy()

# Keep samples with speed over ground above 5 kts, then cap at the first 10000 rows.
mask = df['Speed over ground (kts)'] > 5
df = df.loc[mask].iloc[0:10000]


In [None]:
# Summary statistics after dropping constant columns and filtering on speed.
df.describe()

In [None]:
# Columns that survived the filtering; the signals used below are picked from these.
df.columns

In [None]:
# Signals used for the pair plot and the regressions further down.
interesting = [
    'Consumption ME all (L/h)',
    'Economy (L/nm)',
    'Speed over ground (kts)',
    'Engine speed ME1 (rpm)',
    'Engine speed ME2 (rpm)',
]

# Keep only rows where every selected signal is present.
data = df[interesting].dropna(how='any')

In [None]:
# Pairwise overview of the selected signals: scatter plots in both triangles of the
# grid and a kernel density estimate of each signal on the diagonal.
g = sns.PairGrid(data)
g.map_upper(sns.scatterplot)
g.map_lower(sns.scatterplot)
g.map_diag(sns.kdeplot, lw=3, legend=False);

In [None]:
#g = sns.PairGrid(data)
#g.map_upper(sns.kdeplot)
#g.map_lower(sns.kdeplot)
#g.map_diag(sns.kdeplot, lw=3, legend=False);

## Regression

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression

# Regression pipeline, in execution order:
#   1. standardise the inputs,
#   2. expand them into polynomial terms (degree 2 by default),
#   3. keep the k terms with the best univariate F-score against y (k=4 by default),
#   4. fit an ordinary least-squares model on the kept terms.
# Degree and k are overridden later by the grid searches.
standard_scaler = StandardScaler()
polynomial_features = PolynomialFeatures(degree=2)
select_k_best = SelectKBest(score_func=f_regression, k=4)
linear_regression = LinearRegression()

steps = list(zip(
    ('scaler', 'polynomial_features', 'select_k_best', 'linear_regression'),
    (standard_scaler, polynomial_features, select_k_best, linear_regression),
))

pipeline_polynomial_scaled_selection = Pipeline(steps=steps)

In [None]:
# Illustration only: list the polynomial terms generated for n input columns up to
# the given degree.
n = 3
degree = 3
# This rebinds the name `polynomial_features` to a new object; the pipeline built
# earlier keeps its own PolynomialFeatures instance.
polynomial_features = PolynomialFeatures(degree=degree)
# Fit on a single dummy row with n columns.
polynomial_features.fit(np.array([np.arange(n)]))
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2 in favour of get_feature_names_out() — confirm the version this notebook targets.
polynomial_features.get_feature_names()

In [None]:
#X = data[['Speed over ground (kts)']].copy()
#X['Engine speed'] = (data['Engine speed ME1 (rpm)'] + data['Engine speed ME2 (rpm)'])/2
#y = data['Consumption ME all (L/h)']                        

# Inputs: fuel consumption plus the mean of the two main-engine speeds.
# Target: speed over ground.
engine_speed_mean = (data['Engine speed ME1 (rpm)'] + data['Engine speed ME2 (rpm)']) / 2
X = data[['Consumption ME all (L/h)']].assign(**{'Engine speed': engine_speed_mean})
y = data['Speed over ground (kts)']

In [None]:
from sklearn.model_selection import GridSearchCV

def _n_polynomial_terms(n_inputs, degree):
    """Number of columns PolynomialFeatures(degree) produces for n_inputs input columns."""
    # Only the column count of the dummy row matters for the fit.
    return PolynomialFeatures(degree=degree).fit(np.zeros((1, n_inputs))).n_output_features_


# Size of the largest expansion considered (names kept for later cells).
max_degree = 5
no_features = len(X.columns)
polynomial_features = PolynomialFeatures(degree=max_degree)
polynomial_features.fit(np.zeros((1, no_features)))
# n_output_features_ gives the term count directly, without the version-dependent
# get_feature_names() call.
max_k = polynomial_features.n_output_features_

# define the grid
# One sub-grid per degree so that k never exceeds the number of terms that degree
# produces. A plain Cartesian product would pair e.g. degree=1 with k=max_k, which
# (depending on the scikit-learn version) either fails and is scored NaN or silently
# degenerates to "select everything" — wasted fits in both cases.
grid = [
    {
        'polynomial_features__degree': [d],
        'select_k_best__k': list(range(1, _n_polynomial_terms(no_features, d) + 1)),
    }
    for d in range(1, max_degree + 1)
]

from sklearn.model_selection import RepeatedKFold
# Fixed random_state keeps the 5x10 repeated splits reproducible.
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)

# define the grid search (scored on negative mean absolute error; switch scoring to
# 'r2' to rank by explained variance instead)
search = GridSearchCV(
    estimator=pipeline_polynomial_scaled_selection,
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
)
# perform the search
search_result = search.fit(X, y)

In [None]:
# Pipeline with the best parameter combination found by the grid search.
model = search_result.best_estimator_

In [None]:
# Mean cross-validated score per parameter combination (negative MAE, so closer to 0 is better).
search_result.cv_results_['mean_test_score']

In [None]:
# Spread of the score across the cross-validation splits, per parameter combination.
search_result.cv_results_['std_test_score']

In [None]:
# Parameter combinations, in the same order as the score arrays above.
search_result.cv_results_['params']

In [None]:
# Collect the grid-search outcome into one row per parameter combination.
# Built in a single step from the list of parameter dicts: DataFrame.append in a loop
# is quadratic, was removed in pandas 2.0, and turned the integer parameters into floats.
cv_results_ = search_result.cv_results_
df_search = pd.DataFrame(list(cv_results_['params'])).assign(
    mean_test_score=cv_results_['mean_test_score'],
    std_test_score=cv_results_['std_test_score'],
)


In [None]:


# One figure per number of selected terms k: cross-validated score (mean with the
# standard deviation as error bar) against polynomial degree.
for k, group in df_search.groupby('select_k_best__k'):
    fig, ax = plt.subplots()
    ax.errorbar(
        group['polynomial_features__degree'],
        group['mean_test_score'],
        yerr=group['std_test_score'],
        linestyle='None',
        marker='.',
    )
    ax.set_title('k:%i' % k)
    # Axis labels so each figure can be read on its own.
    ax.set_xlabel('polynomial degree')
    ax.set_ylabel('mean test score')
    ax.grid()
    

In [None]:
import sklearn

def find_polynomial_feature(model):
    """Return the first ``PolynomialFeatures`` step found while iterating over ``model``.

    Parameters
    ----------
    model : iterable of pipeline steps (e.g. a ``Pipeline``)

    Returns
    -------
    The first element that is an instance of ``PolynomialFeatures``.

    Raises
    ------
    ValueError
        If no element of ``model`` is a ``PolynomialFeatures`` instance.
    """
    for step in model:
        if isinstance(step, PolynomialFeatures):
            return step
    raise ValueError('model pipeline must contain an instance of PolynomialFeatures')

def find_select_k_best(model):
    """Return the first ``SelectKBest`` step found while iterating over ``model``.

    Parameters
    ----------
    model : iterable of pipeline steps (e.g. a ``Pipeline``)

    Returns
    -------
    The first element that is an instance of ``SelectKBest``.

    Raises
    ------
    ValueError
        If no element of ``model`` is a ``SelectKBest`` instance.
    """
    for step in model:
        if isinstance(step, SelectKBest):
            return step
    raise ValueError('model pipeline must contain an instance of SelectKBest')

def model_to_string(model: "Pipeline", feature_names: list, divide=' '):
    """Render a fitted polynomial-regression pipeline as a readable expression string.

    The result has the form ``<intercept> + <coef>*<term> - <coef>*<term> ...`` where
    each term is a product/power of the given feature names. Terms whose coefficient
    is exactly zero are left out.

    Parameters
    ----------
    model : Pipeline
        Fitted pipeline containing a ``PolynomialFeatures`` step and a ``SelectKBest``
        step; its last step is assumed to be the predictor exposing ``coef_`` and
        ``intercept_``.
    feature_names : list
        Names substituted for the generic ``x0, x1, ...`` placeholders, in input
        column order. Placeholders without a matching name are left untouched.
    divide : str, default ' '
        Separator placed on both sides of each ``+`` / ``-`` sign.

    Returns
    -------
    str
        The expression.

    Raises
    ------
    ValueError
        If the pipeline lacks a ``PolynomialFeatures`` or ``SelectKBest`` step.

    Notes
    -----
    The coefficients are those of the final estimator. If the pipeline scales its
    inputs before the polynomial expansion, the expression is therefore in terms of
    the scaled inputs, not the raw columns.
    """
    import re  # local import keeps this definition self-contained

    polynomial_features = find_polynomial_feature(model=model)
    select_k_best = find_select_k_best(model=model)

    # get_feature_names() was removed in scikit-learn 1.2; use the replacement when
    # it exists and fall back to the old name otherwise.
    get_names = getattr(polynomial_features, 'get_feature_names_out', None)
    if get_names is None:
        get_names = polynomial_features.get_feature_names
    polynomial_feature_names = np.asarray(get_names())
    best_polynomial_feature_names = polynomial_feature_names[select_k_best.get_support()]

    predictor = model[-1]  # Last item in the pipeline is assumed to be the predictor
    coefficients = predictor.coef_
    interception = predictor.intercept_

    def _name_for(match):
        """Map an ``x<i>`` placeholder to the i-th feature name, if one was given."""
        index = int(match.group(1))
        if index < len(feature_names):
            return str(feature_names[index])
        return match.group(0)

    expression = '%f' % interception
    for part, coefficient in zip(best_polynomial_feature_names, coefficients):
        if coefficient == 0:
            continue

        # Substitute all placeholders in one pass. Sequential str.replace would also
        # hit 'x1' inside 'x10' and inside feature names that were already inserted.
        term = re.sub(r'\bx(\d+)\b', _name_for, part.replace(' ', '*'))

        # Emit the sign explicitly and the magnitude separately so negative terms are
        # formatted like positive ones (' - 0.5*a' rather than '  -0.5*a').
        sign = '-' if coefficient < 0 else '+'
        expression += '%s%s%s%f*%s' % (divide, sign, divide, abs(coefficient), term)

    return expression

# Take the names from the columns the model was fitted on; the previously used
# `features` variable is not defined anywhere in this notebook.
print(model_to_string(model=model, feature_names=list(X.columns)))

In [None]:
# Distribution of the R² score of the selected pipeline over the repeated K-fold splits.
# NOTE(review): these are the same splits the grid search used to pick the model, so
# the scores are likely optimistic — confirm whether a held-out set is wanted.
scores = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=cv, n_jobs=-1)

fig, ax = plt.subplots()
ax.hist(scores)
ax.set_xlabel('score')
ax.set_ylabel('occurrences')
ax.set_title('Histogram over cross validations');

In [None]:
# Compare measured and predicted target values against each input feature.
result = X.copy()
result['y_true'] = y
result['y_pred'] = model.predict(X=X)

# squeeze=False always returns a 2-D array of axes, so the loop also works when X
# has a single column (plt.subplots would otherwise return a bare Axes object).
fig, axes = plt.subplots(ncols=len(X.columns), squeeze=False)
for x, ax in zip(X.columns, axes.ravel()):
    result.plot(x=x, y='y_true', ax=ax, style='.')
    result.plot(x=x, y='y_pred', ax=ax, style='.', alpha=0.2)
    

## Speed model

In [None]:
# Speed model: predict fuel consumption from speed over ground alone.
# Note that this rebinds X and y used by the previous section.
X = data[['Speed over ground (kts)']].copy()
y = data['Consumption ME all (L/h)']

In [None]:
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import RepeatedKFold

# Search space: polynomial degree 1..9 and number of kept terms k.
# NOTE(review): k ranges only up to the number of raw input columns (X.shape[1]), not
# the number of polynomial terms, so with a single-column X only one term is ever
# kept — confirm this is intended.
grid = {
    'select_k_best__k': list(range(1, X.shape[1] + 1)),
    'polynomial_features__degree': list(range(1, 10)),
}

# Repeated 5-fold cross-validation (10 repeats) with a fixed seed.
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)

# Exhaustive search scored on negative mean absolute error
# (scoring='r2' is the alternative that was tried).
search = GridSearchCV(
    estimator=pipeline_polynomial_scaled_selection,
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
)
search_result = search.fit(X, y)