# Importing packages

In [1]:
from utils.model_summary_functions import feature_importance, metrics, summary_plot,multivariate_importance

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pickle

from sklearn.metrics import mean_squared_error, f1_score, accuracy_score, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split, KFold, StratifiedShuffleSplit, TimeSeriesSplit, RepeatedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, RobustScaler
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor,GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.feature_selection import SelectKBest, f_regression, chi2
from sklearn.linear_model import LassoLarsCV,LassoCV, Lasso
from sklearn.decomposition import PCA

from yellowbrick.model_selection import FeatureImportances
from yellowbrick.regressor import residuals_plot
from yellowbrick.regressor import prediction_error
from yellowbrick.model_selection import learning_curve
from yellowbrick.model_selection import feature_importances

import tensorflow as tf

In [3]:
# Ask TensorFlow to grow GPU memory on demand (see the TensorFlow
# `set_memory_growth` documentation).
# The loop makes this a no-op on CPU-only machines, where the device list is
# empty and indexing `[0]` would raise IndexError, and applies the same setting
# to every visible GPU instead of only the first one.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [4]:
import warnings  # Ignore alert messages
warnings.filterwarnings('ignore')

In [5]:
# Reset to Matplotlib's default style, then layer the notebook-wide settings on
# top, grouped by the part of the figure they affect.
plt.style.use('default')

_font_style = {
    'font.family': 'Arial',
    'font.size': 16,
}
_axes_style = {
    'axes.linewidth': 2,
    'axes.titlesize': 20,
    'axes.edgecolor': 'black',
    'axes.labelsize': 20,
    'axes.grid': True,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
}
_line_and_figure_style = {
    'lines.linewidth': 1.5,
    'lines.markersize': 6,
    'figure.figsize': (15, 6),
}
_legend_style = {
    'legend.fontsize': 13,
    'legend.framealpha': 1,
    'legend.edgecolor': 'black',
    'legend.shadow': False,
    'legend.fancybox': True,
    'legend.frameon': True,
}

for _style_group in (_font_style, _axes_style, _line_and_figure_style, _legend_style):
    plt.rcParams.update(_style_group)

# Importing dataset after EDA

In [6]:
# Load the cleaned, feature-engineered dataset (zip-compressed CSV).
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which looks
# like a row index that was written to the CSV. It is kept here and therefore
# ends up in the feature matrix built later — confirm this is intended.
DATASET_PATH = "Processed_data/FRA3-FRA6_cleaned_feature_engineered.csv.zip"
df = pd.read_csv(DATASET_PATH)
df

Unnamed: 0,Trail_right_z_sec_susp,Lead_left_z_sec_susp,Trail_left_z_sec_susp,Lead_right_z_sec_susp,Triaxial_x,Triaxial_z,Lead_z_accel_right_side,Susp_travel_left,Susp_travel_right,Pitch_accel,...,Triaxial_x_RMA_55 samples,Triaxial_x_MA_55 samples,Triaxial_x_RMS_55 samples,Triaxial_z_mean_55 samples,Susp_travel_mean_55 samples,Susp_travel_skew_55 samples,Pitch_left_susp_mean_55 samples,Bounce_susp_mean_55 samples,Front_Roll_susp_skew_55 samples,Vert_irreg_right_rail
0,-0.034100,0.030900,0.021800,-0.0218,-0.000116,-0.000749,-0.001817,0.030900,-0.034100,-0.000459,...,0.008054,0.000065,0.000457,-0.000293,0.004538,-1.143003,0.000911,0.001606,0.941525,-0.0712
1,-0.035600,0.031600,0.022800,-0.0228,-0.000139,-0.000749,-0.001812,0.031600,-0.035600,-0.000542,...,0.008238,0.000068,0.000457,-0.000307,0.003883,-1.156428,0.000941,0.001485,0.893110,-0.0753
2,-0.036700,0.032000,0.023700,-0.0239,-0.000162,-0.000727,-0.001605,0.032000,-0.036700,-0.000613,...,0.008436,0.000071,0.000458,-0.000320,0.003208,-1.139831,0.000970,0.001362,0.843661,-0.0791
3,-0.037000,0.031400,0.024400,-0.0248,-0.000182,-0.000650,-0.000912,0.031400,-0.037000,-0.000654,...,0.008647,0.000075,0.000459,-0.000332,0.002527,-1.100159,0.000993,0.001242,0.787577,-0.0827
4,-0.037300,0.030900,0.025100,-0.0257,-0.000202,-0.000574,-0.000219,0.030900,-0.037300,-0.000695,...,0.008870,0.000079,0.000459,-0.000342,0.001840,-1.048016,0.001013,0.001125,0.728316,-0.0858
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68096,-0.111903,-0.231038,0.133286,0.0503,0.001834,0.000079,0.004336,-0.231038,-0.111903,-0.003638,...,0.053665,0.002880,0.004202,-0.009225,-0.722654,-0.835219,-0.063069,-0.147773,-0.773285,0.0486
68097,-0.113222,-0.225703,0.140285,0.0422,0.001846,-0.000082,0.003567,-0.225703,-0.113222,-0.003363,...,0.053726,0.002887,0.004204,-0.008988,-0.693827,-0.847119,-0.063281,-0.143451,-0.788629,0.0511
68098,-0.114092,-0.220937,0.148758,0.0347,0.001823,-0.000321,0.002644,-0.220937,-0.114092,-0.003317,...,0.053553,0.002868,0.004196,-0.008838,-0.666117,-0.856692,-0.063529,-0.139384,-0.803025,0.0535
68099,-0.114961,-0.216170,0.157232,0.0272,0.001800,-0.000559,0.001721,-0.216170,-0.114961,-0.003272,...,0.053190,0.002829,0.004170,-0.008807,-0.639543,-0.864403,-0.063821,-0.135586,-0.816822,0.0558


In [7]:
# Work on an independent (deep) copy so the loaded frame `df` is left untouched.
df_aux = df.copy(deep=True)

In [8]:
def create_scaled_dataset(X, y, shuffle=False, test_size=0.3, random_state=0):
    """Split ``X``/``y`` into train and test parts and standardize the features.

    The ``StandardScaler`` is fitted on the training features only and then
    applied to the test features, so no test-set statistics reach the scaler.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Must be a DataFrame: its column names and index are
        reused for the scaled outputs.
    y : array-like
        Target values aligned with ``X``.
    shuffle : bool, default False
        Passed to ``train_test_split``. The default keeps the original row
        order, which allows correct plotting of the PSD.
    test_size : float, default 0.3
        Fraction of the rows assigned to the test split.
    random_state : int, default 0
        Seed passed to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_scaled_train, X_scaled_test, y_train, y_test)``; the two feature
        frames keep the row index of their respective split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=shuffle
    )

    std_scaler = StandardScaler()
    X_scaled_train = pd.DataFrame(
        std_scaler.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index,
    )
    # Transform only: the test split reuses the mean/std learned from training.
    X_scaled_test = pd.DataFrame(
        std_scaler.transform(X_test),
        columns=X_test.columns,
        index=X_test.index,
    )
    return X_scaled_train, X_scaled_test, y_train, y_test

In [9]:
# Separate the regression target from the predictors, then build the
# chronological (unshuffled) scaled train/test split.
TARGET_COLUMN = 'Vert_irreg_right_rail'

X = df_aux.drop(columns=[TARGET_COLUMN])
y = df_aux[TARGET_COLUMN]

(X_scaled_train2,
 X_scaled_test2,
 y_train2,
 y_test2) = create_scaled_dataset(X, y)

# Loading models

In [10]:
def load_model(name, path):
    """Load a pickled model stored at ``path/name``.

    Parameters
    ----------
    name : str
        File name of the pickled model (for example ``'KNN_Regressor.sav'``).
    path : str
        Directory that contains the file.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while unpickling; only use this
    on model files from a trusted source.
    """
    filename = path + '/' + name
    # The context manager closes the handle even if unpickling fails; the
    # previous bare `open()` call left the file open.
    with open(filename, 'rb') as model_file:
        return pickle.load(model_file)

In [12]:
# NOTE: despite its name, `knn` holds the Keras network saved under
# models/NN/Vanilla_NN. The pickled KNN regressor alternative is kept below,
# disabled, for reference:
#   path = 'models/ml_models'
#   knn = load_model('KNN_Regressor.sav', path=path)
NN_MODEL_DIR = "models/NN/Vanilla_NN"

knn = tf.keras.models.load_model(NN_MODEL_DIR)

In [15]:
# Predict with the loaded network on the scaled test split of the unshuffled
# (shuffle=False default) train/test partition.
yhat = knn.predict(X_scaled_test2)

In [16]:
# Project helper from utils.model_summary_functions; per the output below it
# reports MSE, MAE and R2 for the test-split predictions.
metrics(y_test2,yhat)

MSE:0.40499918028143816 
MAE:0.4556203196011942 
R2:0.9865518661168124


# Bootstrap confidence intervals

Bootstrapping is used to infer population results from averaged statistical measures calculated on multiple bags of random samples with replacement. It can be used to infer population results of machine learning models trained on random samples with replacement. 

When a machine learning model is built using bootstrapped data, it is trained on the bootstrap sample and can then be tested on the out-of-bag (OOB) data. For a given bootstrap sample, the OOB data are the rows of the original data set that were not drawn into that sample. Because the model has not seen these rows before, testing on them gives an honest assessment of the model's quality: if the model performs well on the OOB data, it should perform similarly well on new data that it is later exposed to. Note that the implementation below does not use the per-iteration OOB rows; each bootstrap model is instead evaluated on a fixed hold-out test set that is never used for fitting.

For this section, we will bootstrap 500 times (the `n_iterations` setting in the loop below) in order to obtain a high level of reliability in our statistics.

This section was created based on the references:
- https://machinelearningmastery.com/a-gentle-introduction-to-the-bootstrap-method/
- https://machinelearningmastery.com/calculate-bootstrap-confidence-intervals-machine-learning-results-python/

In [None]:
from sklearn.utils import resample
from tqdm import tqdm

from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Shuffled split for the bootstrap experiment. Features and target are glued
# back together (target as the last column) so each row can be resampled as a unit.
X_train, X_test, y_train, y_test = create_scaled_dataset(X, y, shuffle=True)

df_train = pd.concat((X_train, y_train), axis=1)
df_test = pd.concat((X_test, y_test), axis=1)

In [None]:
# Score the loaded network (`knn`) on the shuffled hold-out split created above.
yhat = knn.predict(X_test)
metrics(y_test, yhat)

In [None]:
# Bootstrap parameters
n_iterations = 500

# The hold-out split is the same for every iteration, so slice it once
# instead of re-slicing inside the loop. The target is the last column.
X_holdout = df_test.iloc[:, :-1]
y_holdout = df_test.iloc[:, -1]

rmse_scores = []

for i in tqdm(range(n_iterations)):
    # Draw a bootstrap sample of the training set (with replacement, same size).
    # Seeding with the iteration index makes the whole experiment reproducible
    # across "Restart & Run All" while still giving a different sample each time.
    train = resample(df_train, replace=True, n_samples=len(df_train), random_state=i)

    # Fit the model on the bootstrap sample.
    # NOTE(review): this refits a 3-nearest-neighbours regressor, not the Keras
    # network loaded as `knn` earlier — confirm this is the intended model.
    model = KNeighborsRegressor(n_neighbors=3)
    model.fit(train.iloc[:, :-1], train.iloc[:, -1])

    # Evaluate on the fixed hold-out test set (not on out-of-bag rows).
    yhat = model.predict(X_holdout)
    score = np.sqrt(mean_squared_error(y_holdout, yhat))
    rmse_scores.append(score)

# One RMSE per bootstrap iteration, stored in column 0.
bootstrap_metrics = pd.DataFrame(rmse_scores)
bootstrap_metrics

In [None]:
# Percentile bootstrap confidence interval for the RMSE.
# `alpha` is the confidence level. It was previously never defined, so the
# print below raised NameError; the quantile bounds are now derived from it
# so the reported percentage and the interval cannot drift apart.
alpha = 0.95
tail_probability = (1 - alpha) / 2

lower_bound = bootstrap_metrics[0].quantile(tail_probability)
upper_bound = bootstrap_metrics[0].quantile(1 - tail_probability)
mean = bootstrap_metrics[0].mean()

print('Model RMSE will be in the confidence interval of {1} and {2} with {0}% confidence'.format(alpha*100, np.round(lower_bound, 3), np.round(upper_bound,3)))
print('The mean RMSE was {} mm'.format(np.round(mean,3)))

In [None]:
# Distribution of the bootstrap RMSE values with the interval bounds and mean.
fig, ax = plt.subplots()
ax.hist(bootstrap_metrics[0], bins=50, density=True)

ax.axvline(x=lower_bound, color='m', linestyle='--', label='2.5% confidence level')
ax.axvline(x=mean, color='r', linestyle='--', label='mean value')
ax.axvline(x=upper_bound, color='m', linestyle='--', label='97.5% confidence level')

# The labels passed to axvline are only rendered once a legend is created;
# axis labels and a title let the figure stand on its own.
ax.set_xlabel('RMSE (mm)')
ax.set_ylabel('Density')
ax.set_title('Bootstrap distribution of the RMSE')
ax.legend()

plt.show()