In [28]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on given sensor data (time series data). It is a forcasting problem, where the goal is to predict the number of cycles an engine will last before it fails.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
1. Perform a deep **exploratory data analysis (EDA)** on the given data.
2. Implement a more efficient **sliding window method** for time series data analysis. -> 🎯 **Focus on this task**
3. Apply **traditional machine learning methods** (SOTA) to predict the remaining useful life. Includes data preparation, feature extraction, feature selection, model selection, and model parameter optimization.
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.


# Imports + Settings

In [30]:
#imports aller Classifier

# third-party libraries
import pandas as pd
import numpy as np
import os

import time
from tqdm.notebook import tqdm

#sklearn models
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ridge_regression, LogisticRegression, Lasso, LinearRegression
# sklearn tools
from sklearn.metrics import mean_squared_error, root_mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
#tsfresh
from tsfresh.feature_extraction import feature_calculators, MinimalFCParameters, EfficientFCParameters
#xgboost
from xgboost import XGBRegressor
# Bayesion Optimizer
from bayes_opt import BayesianOptimization

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [31]:
# source code
from src.data_loading import load_data, load_config
from src.data_splitting import train_val_split_by_group
from src.data_cleaning import clean_data, format_dtype
from src.rolling_window_creator import calculate_RUL, RollingWindowDatasetCreator



In [32]:
# settings
sns.set_style("whitegrid")
sns.set_palette("Set2")
sns.set(rc={"figure.dpi":100, 'savefig.dpi':200})
sns.set_context('notebook')

In [33]:
np.random.seed(42)

# Paths

In [34]:
# Make sure to execute this cell only once for one kernel session, before running any other cell below.
os.chdir("/Users/niklasquendt/Documents/Uni/PSDA/Übung 2/damage-propagation-modeling") # set working directory to root of project
os.getcwd() # check current working directory

'/Users/niklasquendt/Documents/Uni/PSDA/Übung 2/damage-propagation-modeling'

In [35]:
PATH_TO_CONFIG = "configs/config.yaml"

# Load Config + Data

In [36]:
config = load_config(PATH_TO_CONFIG) # config is dict

In [37]:
#Load Dataset 1
train_data_1, test_data_1, test_RUL_data_1 = load_data(config_path=PATH_TO_CONFIG, dataset_num=1)

2024-06-01 18:53:26 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 1...[0m
2024-06-01 18:53:26 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 1.[0m
2024-06-01 18:53:26 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (20631, 26)[0m
2024-06-01 18:53:26 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (13096, 26)[0m
2024-06-01 18:53:26 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


In [38]:
#Load Dataset 2
train_data_2, test_data_2, test_RUL_data_2 = load_data(config_path=PATH_TO_CONFIG, dataset_num=2)

2024-06-01 18:53:31 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 2...[0m
2024-06-01 18:53:31 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 2.[0m
2024-06-01 18:53:31 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (53759, 26)[0m
2024-06-01 18:53:31 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (33991, 26)[0m
2024-06-01 18:53:31 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (259, 1)[0m


In [39]:
#Load Dataset 3
train_data_3, test_data_3, test_RUL_data_3 = load_data(config_path=PATH_TO_CONFIG, dataset_num=3)

2024-06-01 18:53:32 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 3...[0m
2024-06-01 18:53:32 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 3.[0m
2024-06-01 18:53:32 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (24720, 26)[0m
2024-06-01 18:53:32 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (16596, 26)[0m
2024-06-01 18:53:32 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


In [40]:
#Load Dataset 4
train_data_4, test_data_4, test_RUL_data_4 = load_data(config_path=PATH_TO_CONFIG, dataset_num=4)

2024-06-01 18:53:34 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 4...[0m
2024-06-01 18:53:34 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 4.[0m
2024-06-01 18:53:34 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (61249, 26)[0m
2024-06-01 18:53:34 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (41214, 26)[0m
2024-06-01 18:53:34 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (248, 1)[0m


# 📍 << Subtask 3: Traditional ML >>

# Best Results

In [None]:
# Dataset 1
# On Dataset 1 our best Training model was the RandomForest Regressor
# the parameter are the following:
# n_estimators=296
# max_features=4
# random_state = 17



# please find the complete pipeline and how we achieved this score below
# RMSE : 18.1911

In [None]:
# Dataset 2
# On Dataset 2 our best Training model was the XGBoost Regressor
# the parameter are the following:
# eta=0.02803
# gamma=0.8998
# max_depth=6


# please find the complete pipeline and how we achieved this score below
# RMSE on the Train-data: 34.11

In [None]:
# Dataset 3
# On the dataset 3 was XGBoostRegressor our best performing model.
# eta= 0.2079
# gamma= 1.342
# max_depth= 2
# It achieved an RMSE of 21.48 in this configuration

# The complete pipeline is listed down below

In [None]:
# Dataset 4
# For dataset 4 the RandomForestRegressor performed the best.
# n_estimators = 264
# max_features = 1
# random_state = 17
# In this case it achieved a RMSE of 39.53. When comparing the 4 datasets is by far the worst.

# Again the whole pipeline and process is detailed below.

# Procedure


# Data Cleaning

In [41]:
# Default Data cleaning
cleaned_train_1, cleaned_test_1 = clean_data(train_data_1, test_data_1, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.3)

2024-06-01 18:54:07 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-06-01 18:54:07 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-06-01 18:54:07 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 18:54:07 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 18:54:07 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m
2024-06-01 18:54:07 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m
2024-06-01 18:54:07 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m
2024-06-01 18:54:07 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m
2024-06-01 18:54:07 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: N

In [42]:
cleaned_train_2, cleaned_test_2 = clean_data(train_data_2, test_data_2, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.3)

2024-06-01 18:54:09 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-06-01 18:54:09 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-06-01 18:54:09 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 18:54:09 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 18:54:09 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m
2024-06-01 18:54:09 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m
2024-06-01 18:54:09 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m
2024-06-01 18:54:09 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m
2024-06-01 18:54:09 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: N

In [43]:
cleaned_train_3, cleaned_test_3 = clean_data(train_data_3, test_data_3, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.3)


2024-06-01 18:54:11 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-06-01 18:54:11 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-06-01 18:54:11 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 18:54:11 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 18:54:11 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m
2024-06-01 18:54:11 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m
2024-06-01 18:54:11 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m
2024-06-01 18:54:11 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m
2024-06-01 18:54:11 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: N

In [44]:
cleaned_train_4, cleaned_test_4 = clean_data(train_data_4, test_data_4, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.3)


2024-06-01 18:54:14 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-06-01 18:54:14 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-06-01 18:54:14 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 18:54:14 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 18:54:14 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m
2024-06-01 18:54:14 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m
2024-06-01 18:54:14 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m
2024-06-01 18:54:14 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m
2024-06-01 18:54:14 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: N

Reasons for default data cleaning

In [45]:
# Variations C
cleaned_train_1_varC, cleaned_test_1_varC = clean_data(train_data_1, test_data_1, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.5)

cleaned_train_3_varC, cleaned_test_1_varC = clean_data(train_data_3, test_data_3, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.5)

2024-06-01 18:54:17 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-06-01 18:54:17 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-06-01 18:54:17 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 18:54:17 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 18:54:17 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m
2024-06-01 18:54:17 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m
2024-06-01 18:54:17 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m
2024-06-01 18:54:17 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m
2024-06-01 18:54:17 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: N

Notes: Data cleaning Variation C for dataset 1 is used instead of the default configuration to reduce clutter by not helpful sensor data. This also improves computation time in all following steps which is very important.

# Feature Engineering

In [51]:
# Feature Selection -- 1 -> 4
feature_list_ds_1 = ["c3", "quantile", "mean", "root_mean_square", "median", "time_reversal_asymmetry_statistic", "absolute_maximum", "maximum", "minimum", "agg_autocorrelation", "autocorrelation" ]
feature_list_ds_2 = ["c3", "quantile", "mean", "median", "root_mean_square", "variance", "mean_abs_change", "standard_deviation", "skewness", "variation_coefficient", "last_location_of_maximum", "first_location_of_maximum"]
feature_list_ds_3 = ["c3", "quantile", "mean", "root_mean_square", "median", "time_reversal_asymmetry_statistic", "absolute_maximum", "maximum", "minimum", "agg_autocorrelation", "autocorrelation" ]
feature_list_ds_4 = ["c3", "quantile", "mean", "median", "root_mean_square", "variance", "mean_abs_change", "standard_deviation", "skewness", "variation_coefficient", "last_location_of_maximum", "first_location_of_maximum"]


The feature list we generated by evaluing most of the tsfresh FCparameters by themselves and choosing the top performing ones.
Since the datasets 1 & 3 and 2 & 4 are similar they share the same feature list. Further explainations for this are below.

In [None]:
#min_timeshift, max_timeshift = 17,18
#for ds_train, ds_test, ds_rul in [cleaned_train_1,cleaned_train_2,cleaned_train_3,cleaned_train_4],[cleaned_test_1,cleaned_test_2,cleaned_test_3,cleaned_test_4],[test_RUL_data_1,test_RUL_data_2,test_RUL_data_3,test_RUL_data_4]:
#  for feat in EfficientFCParameters():
#    # RollingWindow
#    rwCreator = RollingWindowDatasetCreator(max_timeshift=max_timeshift,min_timeshift=min_timeshift,feature_extraction_mode= 'custom',feature_list=[feat])
#    X_train, y_train, X_test, y_test = rwCreator.create_rolling_windows_datasets(train_data=ds_train, test_data=ds_test,test_RUL_data=ds_rul,)
#    # KNeighborsRegressor
#    knr = KNeighborsRegressor(3)
#    knr.fit(X_train, y_train.values.ravel())
#    rgr1 = np.sqrt(mean_squared_error(y_test, knr.predict(X_test)))
#    # RandomForestRegressor
#    rfr  = RandomForestRegressor(max_depth=5, n_estimators=10, max_features=1, random_state=42)
#    rfr.fit(X_train, y_train.values.ravel())
#    rgr2 = np.sqrt(mean_squared_error(y_test, rfr.predict(X_test)))
#    # Lasso
#    lr = Lasso()
#    lr.fit(X_train, y_train.values.ravel())
#    rgr3 = np.sqrt(mean_squared_error(y_test, lr.predict(X_test)))
#    # XGBRegressor
#    xgbr = XGBRegressor(n_estimators=3, max_depth=1, learning_rate=0.211) # objective='binary:logistic'
#    xgbr.fit(X_train, y_train.values.ravel())
#    rgr4 = np.sqrt(mean_squared_error(y_test, xgbr.predict(X_test)))
#    # Prev Data import
#    df_in = pd.read_pickle("drive/MyDrive/PSDA_cml/data/processed/ds4_tsf-feat_eff_results.pkl")
#    df = pd.DataFrame(data={'Feature': feat, 'Regressor Results': [f"KNR: {rgr1}",f"RFR: {rgr2}", f"Lasso: {rgr3}", f"XBGr: {rgr4}"]})
#    df_out = pd.concat([df_in, df])
#    df_out.to_pickle(f"drive/MyDrive/PSDA_cml/data/processed/ds_tsf-feat_eff_results.pkl")
#    print({'Feature': feat, 'Regressor Results': [f"KNR: {rgr1}",f"RFR: {rgr2}", f"Lasso: {rgr3}", f"XBGr: {rgr4}"]})
#  df = pd.read_pickle("drive/MyDrive/PSDA_cml/data/processed/ds1_tsf-feat_eff_results.pkl")
#  dict_ds = dict()
#  for i in range(0,df.shape[0],4):
#    mean = (float(df.values[i, 1].partition(":")[2]) + float(df.values[i+1, 1].partition(":")[2]) + float(df.values[i+2, 1].partition(":")[2]) + float(df.values[i+3, 1].partition(":")[2])) /4
#    dict_ds[df.values[i,0]] = mean
#  df_ds = pd.DataFrame.from_dict(data=dict_ds,orient='index',columns=['mean'])
#  print(f"Dataset {ds_train}: ")
#  print(df_ds.sort_values(by='mean'))

This code loops over all datasets and all features of tsfreshs EfficientFCParameters. \\
This took multiple hours and never fully finished. The longest run computed 56 features while the other computed far less. Thus we based the feature_list on the longest succesfully running tries for dataset 1 & 3 and the dataset 2 & 4.
The two datasets were combined due to their similarity. This approach can't be advised to repeat since it each try, even unsuccessfull took ones multiple hours. The shortest was somewhat over 2 hours before the runtime reached its limit. Sadly because tsfresh was uncompatible with jupyterhub there was no real alternative. [Computation for this was done in Colab: https://colab.research.google.com/drive/1F_hpmXcxYoJT3LsvXjF65c3_lZ7ltEr6?usp=sharing ]

# Windowing

In [52]:
# Var
min_ts = 5
max_ts = 15

In [48]:
# Var
min_ts_ds1_varc = 29
max_ts_ds1_varc = 30

min_ts_ds2_varc = 17
max_ts_ds2_varc = 18

min_ts_ds3_varc = 29
max_ts_ds3_varc = 30

min_ts_ds4_varc = 17
max_ts_ds4_varc = 18

The best results were achieved with a window size of around 30. Because of that variation C uses a max timeshift of 30. Similar to some papers.
Sadly datasets 2 & 4 have entries which have fewer than 20 steps. Thus for them the timeshift is reduced.

In [None]:
#Erstellen der Datensätze mittels unserer Rolling Window Methode

In [53]:
rwCreator_varc = RollingWindowDatasetCreator(max_timeshift=max_ts_ds1_varc,min_timeshift=min_ts_ds1_varc,feature_extraction_mode= 'custom',feature_list=feature_list_ds_1)
rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts,min_timeshift=min_ts,feature_extraction_mode= 'minimal')


RollingWindowCreator Variation C with the custom extraction mode is helpful for datasets 1 & 3. The results for the custom feature list for 2 & 4 were heavily dependent on the used regressor model. For them minimal seemed usually sufficient.

In [54]:
X_train_1_varc, y_train_1_varc, X_test_1_varc, y_test_1_varc = rwCreator_varc.create_rolling_windows_datasets(cleaned_train_1_varC,cleaned_test_1_varC,test_RUL_data_1)



2024-06-01 19:02:11 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


Rolling: 100%|██████████| 20/20 [00:03<00:00,  5.78it/s]


2024-06-01 19:02:15 [[34msrc.rolling_window_creator:140[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


Feature Extraction: 100%|██████████| 20/20 [01:02<00:00,  3.12s/it]


2024-06-01 19:03:26 [[34msrc.rolling_window_creator:148[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m
2024-06-01 19:03:26 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


Rolling: 100%|██████████| 20/20 [00:03<00:00,  6.31it/s]


2024-06-01 19:03:29 [[34msrc.rolling_window_creator:140[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 11.31it/s]


2024-06-01 19:03:31 [[34msrc.rolling_window_creator:176[0m] [[32mINFO[0m] >>>> Datasets created successfully.[0m
2024-06-01 19:03:31 [[34msrc.rolling_window_creator:177[0m] [[32mINFO[0m] >>>> Shape of X_train: (17731, 396)[0m
2024-06-01 19:03:31 [[34msrc.rolling_window_creator:178[0m] [[32mINFO[0m] >>>> Shape of y_train: (17731, 1)[0m
2024-06-01 19:03:31 [[34msrc.rolling_window_creator:179[0m] [[32mINFO[0m] >>>> Shape of X_test: (100, 264)[0m
2024-06-01 19:03:31 [[34msrc.rolling_window_creator:180[0m] [[32mINFO[0m] >>>> Shape of y_test: (100, 1)[0m


# Data Preprocessing

In [56]:
y_train_1_varc = y_train_1_varc.clip(upper=125)

In [57]:
#Scalieren der Datensätze
scaler_std = StandardScaler()





In [58]:
X_train_1_varc[2:] = scaler_std.fit_transform(X_train_1_varc[2:])
X_test_1_varc[2:] = scaler_std.fit_transform(X_test_1_varc[2:])

# Traditional ML Models

The models we tested are the following:

KNeighborsRegressor  \\
SupportVectorMachineRegressor \\
RandomForestRegressor  \\
MultiLayerPerceptronRegressor \\
AdaBoostRegressor \\
GaussianNaiveBayes \\
KernelRidge \\
Lasso  \\
LinearRegressor \\
LogisiticRegressor \\
GradBoostRegressor \\
XGBoostRegressor \\
ExtraTrees


In [None]:
#for reasons of clarity we only show the models on the first Dataset

In [None]:
# KNeighorsRegressor
rgr  = KNeighborsRegressor(3)
rgr.fit(X_train_1_varc, y_train_1_varc.values.ravel())
print(np.sqrt(mean_squared_error(y_test_1_varc, rgr.predict(X_test_1_varc))))

NameError: name 'KNeighborsRegressor' is not defined

In [None]:
# SupportVectorMachineRegressor
rgr = SVR(kernel="linear", C=0.025)
rgr.fit(X_train_1_varc, y_train_1_varc.values.ravel())
print(np.sqrt(mean_squared_error(y_test_1_varc, rgr.predict(X_test_1_varc))))

NameError: name 'SVR' is not defined

In [None]:
# RandomForestRegressor
rgr  = RandomForestRegressor(max_depth=5, n_estimators=10, max_features=1, random_state=42)
rgr.fit(X_train_1_varc, y_train_1_varc.values.ravel())
print(np.sqrt(mean_squared_error(y_test_1_varc, rgr.predict(X_test_1_varc))))

In [None]:
# MultiLayerPerceptronRegressor
rgr  = MLPRegressor(alpha=1, max_iter=1000, random_state=42)
rgr.fit(X_train_1_varc, y_train_1_varc.values.ravel())
print(np.sqrt(mean_squared_error(y_test_1_varc, rgr.predict(X_test_1_varc))))

In [None]:
# AdaBoostRegressor
rgr  = AdaBoostRegressor(random_state=42)
rgr.fit(X_train_1_varc, y_train_1_varc.values.ravel())
print(np.sqrt(mean_squared_error(y_test_1_varc, rgr.predict(X_test_1_varc))))

In [None]:
# GaussianNaiveBayes
rgr  = GaussianNB()
rgr.fit(X_train_1_varc, y_train_1_varc.values.ravel())
print(np.sqrt(mean_squared_error(y_test_1_varc, rgr.predict(X_test_1_varc))))

In [None]:
# KernelRidgeRegressor
rgr  = KernelRidge()
rgr.fit(X_train_1_varc, y_train_1_varc.values.ravel())
print(np.sqrt(mean_squared_error(y_test_1_varc, rgr.predict(X_test_1_varc))))

In [None]:
# Lasso
rgr  = Lasso()
rgr.fit(X_train_1_varc, y_train_1_varc.values.ravel())
print(np.sqrt(mean_squared_error(y_test_1_varc, rgr.predict(X_test_1_varc))))

In [None]:
# LinearRegressor
rgr  = LinearRegression()
rgr.fit(X_train_1_varc, y_train_1_varc.values.ravel())
print(np.sqrt(mean_squared_error(y_test_1_varc, rgr.predict(X_test_1_varc))))

In [None]:
# LogisticRegressor
rgr  = LogisticRegression()
rgr.fit(X_train_1_varc, y_train_1_varc.values.ravel())
print(np.sqrt(mean_squared_error(y_test_1_varc, rgr.predict(X_test_1_varc))))

In [None]:
# GradientBoostRegressor
rgr  = GradientBoostingRegressor()
rgr.fit(X_train_1_varc, y_train_1_varc.values.ravel())
print(np.sqrt(mean_squared_error(y_test_1_varc, rgr.predict(X_test_1_varc))))

In [None]:
# XGBoostRegressor
rgr  = XGBRegressor()
rgr.fit(X_train_1_varc, y_train_1_varc.values.ravel())
print(np.sqrt(mean_squared_error(y_test_1_varc, rgr.predict(X_test_1_varc))))

In [None]:
#ExtraTreesRegressor
#separate try for the ExtraTreesRegressor as it was the suggestion of TPOT
# the pipeline for the ExtraTreesRegressor is different from the ones implemented above thats why we create the dataset with the rolling window again

# Erstellen die Rollfenster-Datensätze
X_train_1, y_train_1, X_test_1, y_test_1 = rwCreator.create_rolling_windows_datasets(
    train_data=cleaned_train_1,
    test_data=cleaned_test_1,
    test_RUL_data=test_RUL_data_1,
)

# split data into training and validation
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(
    X_train_1,
    y_train_1,
    test_size=0.2,  # 20% der Daten werden für die Validierung verwendet
    random_state=42
)
# Data-Scaling
scaler_1 = StandardScaler()
X_train_scaled_1 = scaler_1.fit_transform(X_train_1)
X_val_scaled_1 = scaler_1.transform(X_val_1)
X_test_scaled_1 = scaler_1.transform(X_test_1)

#ExtraTreeRegressor
extra_trees = ExtraTreesRegressor(n_estimators=140, max_depth= 15, min_samples_leaf=5, random_state=42)

# Training
extra_trees.fit(X_train_scaled_1, y_train_1)

# Vorhersagen und Bewerten des ExtraTreesRegressor
et_predictions_1 = extra_trees.predict(X_val_scaled_1)
print(sklearn.metrics.root_mean_squared_error(y_val_1, et_predictions_1))

# Findings: Models

Most notable of the first spectated models are KNeighbor, RandomForest, Lasso and GradientBoost. Except for GradientBoost all other regressors will be optimized for. Instead of GradientBoost we will instead optimize for XGBoost. \\
The reasons for this decisions are that the computation time for XGB is far shorter than GradBoost and also according to some papers XGBoost can perform very well on this dataset if optimized correctly.

### Optimization


In [None]:
# for the Optimization we also used TPOT to find the best model
# the result of the TPOT was the ExtraTreeRegressor
# due to very long runtime, the TPOT is in comments

In [None]:
#tpot
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

# TPOT
#tpot = TPOTRegressor(generations=3, population_size=20, cv=3, verbosity=2, random_state=42)

#tpot.fit(X_train_scaled_1, y_train_1)

#print(tpot.score(X_test_scaled_1, y_test_1))

#tpot.export('best_model_pipeline.py')

#print(tpot.fitted_pipeline_)


# Dataset 1

In [59]:
## Load first Dataset
train_data_1_opt, test_data_1_opt,test_rul_data_1_opt = load_data(config_path=PATH_TO_CONFIG, dataset_num=1)
cleaned_train_1_opt, cleaned_test_1_opt = clean_data(train_data_1_opt, test_data_1_opt, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.5)

# Train Val Split
cl_train_1_opt, cl_val_1_opt = train_val_split_by_group(df = cleaned_train_1_opt,group = "UnitNumber",test_size = 0.18,n_splits = 2,random_state = 7)

# RollingWindowParameter
min_ts_1_opt = 29
max_ts_1_opt = 30
feature_list_ds_1 = ["c3", "quantile", "mean", "root_mean_square", "median", "time_reversal_asymmetry_statistic", "absolute_maximum", "maximum", "minimum", "agg_autocorrelation", "autocorrelation" ]
# Create RollingWindows
rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_1_opt,min_timeshift=min_ts_1_opt,feature_extraction_mode= 'custom',feature_list=feature_list_ds_1)
X_train_1_opt, y_train_1_opt = rwCreator._process_data(cl_train_1_opt, 'train')
X_val_1_opt, y_val_1_opt = rwCreator._process_data(cl_val_1_opt, 'train')
X_test_1_opt, y_test_1_opt = rwCreator._process_data(cleaned_test_1_opt, 'test', test_rul_data_1_opt)

#Data Preprocessing
y_train_1_opt = y_train_1_opt.clip(upper=125)
scaler = StandardScaler()
X_train_1_opt[2:] = scaler.fit_transform(X_train_1_opt[2:])
X_val_1_opt[2:] = scaler.fit_transform(X_val_1_opt[2:])
X_test_1_opt[2:] = scaler.fit_transform(X_test_1_opt[2:])

2024-06-01 19:08:34 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 1...[0m
2024-06-01 19:08:34 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 1.[0m
2024-06-01 19:08:34 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (20631, 26)[0m
2024-06-01 19:08:34 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (13096, 26)[0m
2024-06-01 19:08:34 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m
2024-06-01 19:08:34 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-06-01 19:08:34 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-06-01 19:08:34 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 19:08:34 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 19:08:34 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>

Rolling: 100%|██████████| 20/20 [00:03<00:00,  6.50it/s]


2024-06-01 19:08:37 [[34msrc.rolling_window_creator:140[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


Feature Extraction: 100%|██████████| 20/20 [00:55<00:00,  2.79s/it]


2024-06-01 19:09:40 [[34msrc.rolling_window_creator:148[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m
2024-06-01 19:09:40 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


Rolling: 100%|██████████| 19/19 [00:01<00:00, 10.39it/s]

2024-06-01 19:09:42 [[34msrc.rolling_window_creator:140[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m



Feature Extraction: 100%|██████████| 20/20 [00:14<00:00,  1.41it/s]


2024-06-01 19:09:57 [[34msrc.rolling_window_creator:148[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m
2024-06-01 19:09:57 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


Rolling: 100%|██████████| 19/19 [00:02<00:00,  8.57it/s]


2024-06-01 19:09:59 [[34msrc.rolling_window_creator:140[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 12.74it/s]


In [None]:
#knn optimization
def hyperparameter_function_knn(neighbours):
      """ Function for hyperparameter optimization
      """
      neighbours = neighbours.round().astype(int)
      knn_regressor = KNeighborsRegressor(n_neighbors=neighbours)
      metric = cross_val_score(knn_regressor, X=X_train_1_opt, y=y_train_1_opt, cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'neighbours': (1, 750)}

optimizer = BayesianOptimization(
  f=hyperparameter_function_knn,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
knn_regressor = KNeighborsRegressor(n_neighbors=383)
knn_regressor.fit(X_train_1_opt,y_train_1_opt )
y_pred_1_opt = knn_regressor.predict(X_test_1_opt)
print(np.sqrt(mean_squared_error(y_test_1_opt, y_pred_1_opt)))

NameError: name 'KNeighborsRegressor' is not defined

Results: 23.36

Notes: KNR works very well and achieved similar values in both validation and testing.

In [None]:
#Random Forest Optimization
def hyperparameter_function_rf(n_estimators, max_features, ):
      """ Function for hyperparameter optimization
      """
      n_estimators = n_estimators.round().astype(int)
      max_features = max_features.round().astype(int)

      rf_regressor = RandomForestRegressor(n_estimators=n_estimators,max_features=max_features,random_state=17,n_jobs=-1)
      metric = cross_val_score(rf_regressor, X=X_train_1_opt, y=y_train_1_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'n_estimators': (20, 500),'max_features': (1,1) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_rf,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
# n_estimators=296 , max_features=4 -> 22.93

#Random Forest Regressor Testing
rf_regressor = RandomForestRegressor(n_estimators=296,max_features=4,random_state=17)
rf_regressor.fit(X_train_1_opt,y_train_1_opt.values.ravel())
y_pred_1_opt = rf_regressor.predict(X_test_1_opt)
print(np.sqrt(mean_squared_error(y_test_1_opt, y_pred_1_opt)))

NameError: name 'RandomForestRegressor' is not defined

Results: 18.47

Notes: RFR was the best performer for the custom feature set on tsfresh
with reaching a peak in optimization of 18.56. This is far above expectation since similar papers stopped at the low twenties.

In [None]:
#Lasso Optimization
def hyperparameter_function_lasso(alpha, max_iter, ):
      """ Function for hyperparameter optimization
      """
      max_iter = max_iter.round().astype(int)

      lasso_regressor = Lasso(alpha=alpha,max_iter=max_iter,random_state=17)
      metric = cross_val_score(lasso_regressor, X=X_train_1_opt, y=y_train_1_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'alpha': (0.001, 1),'max_iter': (100,10000) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_lasso,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
lasso_regressor = Lasso(alpha=0.0319,max_iter=656,random_state=17)
lasso_regressor.fit(X_train_1_opt,y_train_1_opt )
y_pred_1_opt = lasso_regressor.predict(X_test_1_opt)
print(np.sqrt(mean_squared_error(y_test_1_opt, y_pred_1_opt)))

Results: 1620.91

Notes: Despite the first test Lasso seems to be underperforming. Which might be because of the hyperparameter optimization or other factors such as the random_stare

In [None]:
#XGBoost Optimization
def hyperparameter_function_xgboost(eta, gamma ,max_depth,reg_lambda,reg_alpha ):
      """ Function for hyperparameter optimization
      """
      max_depth = max_depth.round().astype(int)

      xgb_regressor = XGBRegressor(eta=eta,gamma=gamma,max_depth=max_depth,reg_lambda=reg_lambda, reg_alpha=reg_alpha)
      metric = cross_val_score(xgb_regressor, X=X_train_1_opt, y=y_train_1_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'eta': (0, 1),'gamma': (0,2),'max_depth':(1,10),'reg_lambda':(1,1),'reg_alpha':(0,0) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_xgboost,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
xgb_regressor = XGBRegressor(eta=0.09569,gamma=0.05334,max_depth=4,reg_lambda=1, reg_alpha=0)
xgb_regressor.fit(X_train_1_opt,y_train_1_opt )
y_pred_1_opt = xgb_regressor.predict(X_test_1_opt)

print(np.sqrt(mean_squared_error(y_test_1_opt, y_pred_1_opt)))

Results: 19.02

Notes: The XGBoost results is also very solid and achieved comparable results to the paper which featured it.

In [None]:
# Define the function to optimize
def evaluate_model(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    # Make sure parameters are integer
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    min_samples_split = int(min_samples_split)
    min_samples_leaf = int(min_samples_leaf)

    # Define the model with the parameters
    model = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=max_depth,
                                min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                random_state=42, n_jobs=-1)

    # Fit and predict
    model.fit(X_train_1, y_train_1)
    pred = model.predict(X_val_1)

    # Calculate RMSE
    rmse = mean_squared_error(y_val_1, pred, squared=False)

    # We want to minimize RMSE, so we return the negative value
    return -rmse


# Define the bounds of the parameters
param_bounds = {
    'n_estimators': (100, 200),
    'max_depth': (10, 30),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4)
}
# Create the BayesianOptimization object
optimizer = BayesianOptimization(
    f=evaluate_model,
    pbounds=param_bounds,
    random_state=42,
)

# Perform the optimization
optimizer.maximize(init_points=5, n_iter=12)

# Print the best parameters
print("Beste Hyperparameter-Kombination:", optimizer.max['params'])
#Beste Hyperparameter-Kombination: {'max_depth': 26.648852816008436, 'min_samples_leaf': 1.6370173320348285, 'min_samples_split': 3.454599737656805, 'n_estimators': 118.34045098534338}


In [None]:
#Prediction on testdata
extra_trees = ExtraTreesRegressor(n_estimators=118, max_depth=27, min_samples_split=3, min_samples_leaf=2, random_state=42)
# Training
extra_trees.fit(X_train_scaled_1, y_train_1)
test_predictions = extra_trees.predict(X_test_scaled_1)
test_rmse = sklearn.metrics.root_mean_squared_error(y_test_1, test_predictions)
print(f"Test RMSE: {test_rmse:.4f}")

Result:
Test RMSE: 62.9231


The optimization for the first dataset went well. We reach comparable results we found in papers for this dataset with traditional ML approaches

# Dataset 2

In [None]:
## Load second dataset
train_data_2_opt, test_data_2_opt,test_rul_data_2_opt = load_data(config_path=PATH_TO_CONFIG, dataset_num=2)
cleaned_train_2_opt, cleaned_test_2_opt = clean_data(train_data_2_opt, test_data_2_opt, method=None, ignore_columns=['UnitNumber', 'Cycle','Operation Setting 2','Operation Setting 3','Sensor Measure 2','Sensor Measure 3','Sensor Measure 4','Sensor Measure 8','Sensor Measure 9','Sensor Measure 11','Sensor Measure 15','Sensor Measure 17'], threshold_missing=0.1, threshold_corr=0.5)

# Train Val Split
cl_train_2_opt, cl_val_2_opt = train_val_split_by_group(df = cleaned_train_2_opt,group = "UnitNumber",test_size = 0.18,n_splits = 2,random_state = 7)

## RollingWindowParameter
min_ts_2_opt = 17
max_ts_2_opt = 18
feature_list_ds_2 = ["c3", "quantile", "mean", "median", "root_mean_square", "variance", "mean_abs_change", "standard_deviation", "skewness", "variation_coefficient", "last_location_of_maximum", "first_location_of_maximum"]

# Create RollingWindows
#rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_2_opt,min_timeshift=min_ts_2_opt,feature_extraction_mode= 'custom',feature_list=feature_list_ds_3)
rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_2_opt,min_timeshift=min_ts_2_opt,feature_extraction_mode= 'minimal',feature_list=["median"])
X_train_2_opt, y_train_2_opt = rwCreator._process_data(cl_train_2_opt, 'train')
X_val_2_opt, y_val_2_opt = rwCreator._process_data(cl_val_2_opt, 'train')
X_test_2_opt, y_test_2_opt = rwCreator._process_data(cleaned_test_2_opt, 'test', test_rul_data_2_opt)

# Data Preprocessing
y_train_2_opt = y_train_2_opt.clip(upper=125)
scaler = StandardScaler()
X_train_2_opt[2:] = scaler.fit_transform(X_train_2_opt[2:])
X_val_2_opt[2:] = scaler.fit_transform(X_val_2_opt[2:])
X_test_2_opt[2:] = scaler.fit_transform(X_test_2_opt[2:])

In [None]:
def hyperparameter_function_knn(neighbours):
      """ Function for hyperparameter optimization
      """
      neighbours = neighbours.round().astype(int)
      knn_regressor = KNeighborsRegressor(n_neighbors=neighbours)
      metric = cross_val_score(knn_regressor, X=X_train_2_opt, y=y_train_2_opt, cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'neighbours': (1, 750)}

optimizer = BayesianOptimization(
  f=hyperparameter_function_knn,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
knn_regressor = KNeighborsRegressor(n_neighbors=20)
knn_regressor.fit(X_train_2_opt,y_train_2_opt )
y_pred_2_opt = knn_regressor.predict(X_test_2_opt)
print(np.sqrt(mean_squared_error(y_test_2_opt, y_pred_2_opt)))

Results: 38.46

Notes: Still KNeighbors performs as a one of the worst (when Lasso is out of competition)

In [None]:
def hyperparameter_function_rf(n_estimators, max_features, ):
      """ Function for hyperparameter optimization
      """
      n_estimators = n_estimators.round().astype(int)
      max_features = max_features.round().astype(int)

      rf_regressor = RandomForestRegressor(n_estimators=n_estimators,max_features=max_features,random_state=17,n_jobs=-1)
      metric = cross_val_score(rf_regressor, X=X_train_2_opt, y=y_train_2_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'n_estimators': (20, 500),'max_features': (1,1) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_rf,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
rf_regressor = RandomForestRegressor(n_estimators=296,max_features=4)
rf_regressor.fit(X_train_2_opt,y_train_2_opt.values.ravel())
y_pred_2_opt = rf_regressor.predict(X_test_2_opt)
print(np.sqrt(mean_squared_error(y_test_2_opt, y_pred_2_opt)))

Results: 34.75

Notes: This might result in weaker results.
RandomForest still performs rather well and is at the same level as XGBoost

In [None]:
def hyperparameter_function_lasso(alpha, max_iter, ):
      """ Function for hyperparameter optimization
      """
      max_iter = max_iter.round().astype(int)

      lasso_regressor = Lasso(alpha=alpha,max_iter=max_iter,random_state=17)
      metric = cross_val_score(lasso_regressor, X=X_train_1_opt, y=y_train_1_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'alpha': (0.001, 1),'max_iter': (100,10000) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_lasso,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
# alpha= , max_iter= ->
lasso_regressor = Lasso(alpha=0.9457,max_iter=694)
lasso_regressor.fit(X_train_2_opt,y_train_2_opt )
y_pred_2_opt = lasso_regressor.predict(X_test_2_opt)
print(np.sqrt(mean_squared_error(y_test_2_opt, y_pred_2_opt)))

Results: 86.99

Notes: In this dataset Lasso doesnt perform good, but it seems to be far more stable then in the other cases with RMSE of over 1000.

In [None]:
def hyperparameter_function_xgboost(eta, gamma ,max_depth,reg_lambda,reg_alpha ):
      """ Function for hyperparameter optimization
      """
      max_depth = max_depth.round().astype(int)

      xgb_regressor = XGBRegressor(eta=eta,gamma=gamma,max_depth=max_depth,reg_lambda=reg_lambda, reg_alpha=reg_alpha)
      metric = cross_val_score(xgb_regressor, X=X_train_1_opt, y=y_train_1_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'eta': (0, 1),'gamma': (0,2),'max_depth':(1,10),'reg_lambda':(1,1),'reg_alpha':(0,0) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_xgboost,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
xgb_regressor = XGBRegressor(eta=0.02803,gamma=0.8998,max_depth=6,reg_lambda=1, reg_alpha=0)
xgb_regressor.fit(X_train_2_opt,y_train_2_opt )
y_pred_2_opt = xgb_regressor.predict(X_test_2_opt)
print(np.sqrt(mean_squared_error(y_test_2_opt, y_pred_2_opt)))

Results: 34.11

Notes: Is performing really well. If the was a need to further improve the result XGBoostRegressor would still possess options to do so.

In [None]:
#Due to a different pipeline, the ExtraTreesRegressor is not fully implemented in this notebook
# for further refrence, please have a look in the notebook Niklas_Model_approaches

#Bayesian Optimzier für Dataset 2
# Define the function to optimize
def evaluate_model(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    # Make sure parameters are integer
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    min_samples_split = int(min_samples_split)
    min_samples_leaf = int(min_samples_leaf)

    # Define the model with the parameters
    model = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=max_depth,
                                min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                random_state=42, n_jobs=-1)

    # Fit and predict
    model.fit(X_train_2, y_train_2)
    pred = model.predict(X_val_2)

    # Calculate RMSE
    rmse = mean_squared_error(y_val_2, pred, squared=False)

    # We want to minimize RMSE, so we return the negative value
    return -rmse


# Define the bounds of the parameters
param_bounds = {
    'n_estimators': (100, 200),
    'max_depth': (10, 30),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4)
}
# Create the BayesianOptimization object
optimizer = BayesianOptimization(
    f=evaluate_model,
    pbounds=param_bounds,
    random_state=42,
)
# Perform the optimization
optimizer.maximize(init_points=5, n_iter=12)

# Print the best parameters
print("Beste Hyperparameter-Kombination:", optimizer.max['params'])
#Best hyperparameter-combination: {'max_depth': 30.0, 'min_samples_leaf': 1.0, 'min_samples_split': 2.0, 'n_estimators': 151.63447537285498}



In [None]:
#prediction on the testdata
extra_trees = ExtraTreesRegressor(n_estimators=140, max_depth= 15, min_samples_leaf=5, random_state=42)

# Training
extra_trees.fit(X_train_scaled_2, y_train_2)
test_predictions = extra_trees.predict(X_test_scaled_2)
test_rmse = sklearn.metrics.root_mean_squared_error(y_test_2, test_predictions)
print(f"Test RMSE: {test_rmse:.4f}")

Result: Test RMSE: 79.9300

# Dataset 3

In [None]:
## Load third dataset
train_data_3_opt, test_data_3_opt,test_rul_data_3_opt = load_data(config_path=PATH_TO_CONFIG, dataset_num=3)
cleaned_train_3_opt, cleaned_test_3_opt = clean_data(train_data_3_opt, test_data_3_opt, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.5)

# Train Val Split
cl_train_3_opt, cl_val_3_opt = train_val_split_by_group(df = cleaned_train_3_opt,group = "UnitNumber",test_size = 0.18,n_splits = 2,random_state = 7)

## RollingWindowParameter
min_ts_3_opt = 29
max_ts_3_opt = 30
feature_list_ds_3 = ["c3", "quantile", "mean", "root_mean_square", "median", "time_reversal_asymmetry_statistic", "absolute_maximum", "maximum", "minimum", "agg_autocorrelation", "autocorrelation" ]
# Create RollingWindows
rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_3_opt,min_timeshift=min_ts_3_opt,feature_extraction_mode= 'custom',feature_list=feature_list_ds_3)
X_train_3_opt, y_train_3_opt = rwCreator._process_data(cl_train_3_opt, 'train')
X_val_3_opt, y_val_3_opt = rwCreator._process_data(cl_val_3_opt, 'train')
X_test_3_opt, y_test_3_opt = rwCreator._process_data(cleaned_test_3_opt, 'test', test_rul_data_3_opt)

# Data Preprocessing
y_train_3_opt = y_train_3_opt.clip(upper=125)
scaler = StandardScaler()
X_train_3_opt[2:] = scaler.fit_transform(X_train_3_opt[2:])
X_val_3_opt[2:] = scaler.fit_transform(X_val_3_opt[2:])
X_test_3_opt[2:] = scaler.fit_transform(X_test_3_opt[2:])

In [None]:
def hyperparameter_function_knn(neighbours):
      """ Function for hyperparameter optimization
      """
      neighbours = neighbours.round().astype(int)
      knn_regressor = KNeighborsRegressor(n_neighbors=neighbours)
      metric = cross_val_score(knn_regressor, X=X_train_3_opt, y=y_train_3_opt, cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'neighbours': (1, 750)}

optimizer = BayesianOptimization(
  f=hyperparameter_function_knn,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
# Neigbors = 300
knn_regressor = KNeighborsRegressor(n_neighbors=300)
knn_regressor.fit(X_train_3_opt,y_train_3_opt )
y_pred_3_opt = knn_regressor.predict(X_test_3_opt)
print(np.sqrt(mean_squared_error(y_test_3_opt, y_pred_3_opt)))

Result: 24.76

Notes: Worse performance compare to dataset1 but still a solid result

In [None]:
def hyperparameter_function_rf(n_estimators, max_features, ):
      """ Function for hyperparameter optimization
      """
      n_estimators = n_estimators.round().astype(int)
      max_features = max_features.round().astype(int)

      rf_regressor = RandomForestRegressor(n_estimators=n_estimators,max_features=max_features,random_state=17,n_jobs=-1)
      metric = cross_val_score(rf_regressor, X=X_train_3_opt, y=y_train_3_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'n_estimators': (20, 500),'max_features': (1,1) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_rf,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
rf_regressor = RandomForestRegressor(n_estimators=333,max_features=1,random_state=17)
rf_regressor.fit(X_train_3_opt,y_train_3_opt.values.ravel())
y_pred_3_opt = rf_regressor.predict(X_test_3_opt)
print(np.sqrt(mean_squared_error(y_test_3_opt, y_pred_3_opt)))

Result: 23.36

Notes: Differing from the from first the max_features parameter will no longer be optimized for.
Strong performance. The optimization improved the results by quite a lot.

In [None]:
def hyperparameter_function_lasso(alpha, max_iter, ):
      """ Function for hyperparameter optimization
      """
      max_iter = max_iter.round().astype(int)

      lasso_regressor = Lasso(alpha=alpha,max_iter=max_iter,random_state=17)
      metric = cross_val_score(lasso_regressor, X=X_train_3_opt, y=y_train_3_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'alpha': (0.001, 1),'max_iter': (100,10000) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_lasso,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
lasso_regressor = Lasso(alpha=0.1923,max_iter=772,random_state=17)
lasso_regressor.fit(X_train_3_opt,y_train_3_opt )
y_pred_3_opt = lasso_regressor.predict(X_test_3_opt)
print(np.sqrt(mean_squared_error(y_test_3_opt, y_pred_3_opt)))

Result: 8080.99

Notes: Similar to the previous optimizations. Lasso disappoints again despite almost an hour in optimization time in colab.

In [None]:
def hyperparameter_function_xgboost(eta, gamma ,max_depth,reg_lambda,reg_alpha ):
      """ Function for hyperparameter optimization
      """
      max_depth = max_depth.round().astype(int)

      xgb_regressor = XGBRegressor(eta=eta,gamma=gamma,max_depth=max_depth,reg_lambda=reg_lambda, reg_alpha=reg_alpha)
      metric = cross_val_score(xgb_regressor, X=X_train_3_opt, y=y_train_3_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'eta': (0, 1),'gamma': (0,2),'max_depth':(1,10),'reg_lambda':(1,1),'reg_alpha':(0,0) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_xgboost,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
xgb_regressor = XGBRegressor(eta=0.2079,gamma=1.342,max_depth=2,reg_lambda=1, reg_alpha=0)
xgb_regressor.fit(X_train_3_opt,y_train_3_opt )
y_pred_3_opt = xgb_regressor.predict(X_test_3_opt)

print(np.sqrt(mean_squared_error(y_test_3_opt, y_pred_3_opt)))

In [None]:
#Due to a different pipeline, the ExtraTreesRegressor is not fully implemented in this notebook
# for further refrence, please have a look in the notebook Niklas_Model_approaches

#Bayesian Optimzier für Dataset 3
# Define the function to optimize
def evaluate_model(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    # Make sure parameters are integer
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    min_samples_split = int(min_samples_split)
    min_samples_leaf = int(min_samples_leaf)

    # Define the model with the parameters
    model = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=max_depth,
                                min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                random_state=42, n_jobs=-1)

    # Fit and predict
    model.fit(X_train_3, y_train_3)
    pred = model.predict(X_val_3)

    # Calculate RMSE
    rmse = mean_squared_error(y_val_3, pred, squared=False)

    # We want to minimize RMSE, so we return the negative value
    return -rmse


# Define the bounds of the parameters
param_bounds = {
    'n_estimators': (100, 200),
    'max_depth': (10, 30),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4)
}
# Create the BayesianOptimization object
optimizer = BayesianOptimization(
    f=evaluate_model,
    pbounds=param_bounds,
    random_state=42,
)

# Perform the optimization
optimizer.maximize(init_points=5, n_iter=12)

# Print the best parameters
print("Beste Hyperparameter-Kombination:", optimizer.max['params'])
#Best hyperparameter-combination: {'max_depth': 30.0, 'min_samples_leaf': 1.0, 'min_samples_split': 2.0, 'n_estimators': 167.97080791958393}


In [None]:
#prediction on the testdata auf den Testdaten
extra_trees = ExtraTreesRegressor(n_estimators=168, max_depth=30, min_samples_split=2, min_samples_leaf=1, random_state=42)
# Training
extra_trees.fit(X_train_scaled_3, y_train_3)
test_predictions = extra_trees.predict(X_test_scaled_3)
test_rmse = sklearn.metrics.root_mean_squared_error(y_test_3, test_predictions)
print(f"Test RMSE: {test_rmse:.4f}")


Result: Test RMSE: 77.1180





Notes: This time the XGBoostRegressors performs the best over the custom feature_list. Which is inline with the promised results. (The custom feature_list is only used by KNeighbors, RandomForest, Lasso and XGBoost)

# Dataset 4

In [60]:
## Load fourth dataset
train_data_4_opt, test_data_4_opt,test_rul_data_4_opt = load_data(config_path=PATH_TO_CONFIG, dataset_num=4)
cleaned_train_4_opt, cleaned_test_4_opt = clean_data(train_data_4_opt, test_data_4_opt, method=None, ignore_columns=['UnitNumber', 'Cycle','Operation Setting 2','Operation Setting 3','Sensor Measure 2','Sensor Measure 3','Sensor Measure 4','Sensor Measure 8','Sensor Measure 9','Sensor Measure 11','Sensor Measure 15','Sensor Measure 17'], threshold_missing=0.1, threshold_corr=0.5)

# Train Val Split
cl_train_4_opt, cl_val_4_opt = train_val_split_by_group(df = cleaned_train_4_opt,group = "UnitNumber",test_size = 0.18,n_splits = 2,random_state = 7)

## RollingWindowParameter
min_ts_4_opt = 17
max_ts_4_opt = 18
feature_list_ds_4 = ["c3", "quantile", "mean", "median", "root_mean_square", "variance", "mean_abs_change", "standard_deviation", "skewness", "variation_coefficient", "last_location_of_maximum", "first_location_of_maximum"]

#rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_4_opt,min_timeshift=min_ts_4_opt,feature_extraction_mode= 'custom',feature_list=feature_list_ds_4)
rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_4_opt,min_timeshift=min_ts_4_opt,feature_extraction_mode= 'minimal',feature_list=["median"])


X_train_4_opt, y_train_4_opt = rwCreator._process_data(cl_train_4_opt, 'train')
X_val_4_opt, y_val_4_opt = rwCreator._process_data(cl_val_4_opt, 'train')
X_test_4_opt, y_test_4_opt = rwCreator._process_data(cleaned_test_4_opt, 'test', test_rul_data_4_opt)

y_train_4_opt = y_train_4_opt.clip(upper=125)

scaler = StandardScaler()
X_train_4_opt[2:] = scaler.fit_transform(X_train_4_opt[2:])
X_val_4_opt[2:] = scaler.fit_transform(X_val_4_opt[2:])
X_test_4_opt[2:] = scaler.fit_transform(X_test_4_opt[2:])

2024-06-01 19:18:49 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 4...[0m
2024-06-01 19:18:49 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 4.[0m
2024-06-01 19:18:49 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (61249, 26)[0m
2024-06-01 19:18:49 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (41214, 26)[0m
2024-06-01 19:18:49 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (248, 1)[0m
2024-06-01 19:18:49 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-06-01 19:18:49 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-06-01 19:18:49 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 19:18:49 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 19:18:49 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>

Rolling: 100%|██████████| 20/20 [00:06<00:00,  3.10it/s]


2024-06-01 19:18:56 [[34msrc.rolling_window_creator:140[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


Feature Extraction: 100%|██████████| 20/20 [00:48<00:00,  2.43s/it]


2024-06-01 19:19:52 [[34msrc.rolling_window_creator:148[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m
2024-06-01 19:19:52 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


Rolling: 100%|██████████| 20/20 [00:01<00:00, 10.04it/s]

2024-06-01 19:19:54 [[34msrc.rolling_window_creator:140[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m



Feature Extraction: 100%|██████████| 20/20 [00:08<00:00,  2.26it/s]


2024-06-01 19:20:03 [[34msrc.rolling_window_creator:148[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m
2024-06-01 19:20:03 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


Rolling: 100%|██████████| 20/20 [00:04<00:00,  4.54it/s]


2024-06-01 19:20:08 [[34msrc.rolling_window_creator:140[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 13.14it/s]


Notes: This is by far the worst performing dataset in regards to the achieved RMSE by our models.

In [None]:
def hyperparameter_function_knn(neighbours):
      """ Function for hyperparameter optimization
      """
      neighbours = neighbours.round().astype(int)
      knn_regressor = KNeighborsRegressor(n_neighbors=neighbours)
      metric = cross_val_score(knn_regressor, X=X_train_4_opt, y=y_train_4_opt, cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'neighbours': (1, 750)}

optimizer = BayesianOptimization(
  f=hyperparameter_function_knn,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
# Neigbors =
knn_regressor = KNeighborsRegressor(n_neighbors=15)
knn_regressor.fit(X_train_4_opt,y_train_4_opt )
y_pred_4_opt = knn_regressor.predict(X_test_4_opt)
print(np.sqrt(mean_squared_error(y_test_4_opt, y_pred_4_opt)))

Result for knn: 42.10

Notes: For this dataset KNeighbors is squarely in the middlefield.

In [None]:
def hyperparameter_function_rf(n_estimators, max_features, ):
      """ Function for hyperparameter optimization
      """
      n_estimators = n_estimators.round().astype(int)
      max_features = max_features.round().astype(int)

      rf_regressor = RandomForestRegressor(n_estimators=n_estimators,max_features=max_features,random_state=17,n_jobs=-1)
      metric = cross_val_score(rf_regressor, X=X_train_4_opt, y=y_train_4_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'n_estimators': (20, 500),'max_features': (1,1) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_rf,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
rf_regressor = RandomForestRegressor(n_estimators=264,max_features=1)
rf_regressor.fit(X_train_4_opt,y_train_4_opt.values.ravel())
y_pred_4_opt = rf_regressor.predict(X_test_4_opt)
print(np.sqrt(mean_squared_error(y_test_4_opt, y_pred_4_opt)))

Result for RandomForestRegressor: 39.53

Notes: The best model for dataset 4 and the only one to breach below 40. Despite extensive optimization it wasn't possible to further reduce the RMSE with the RandomForest.

In [None]:
def hyperparameter_function_lasso(alpha, max_iter, ):
      """ Function for hyperparameter optimization
      """
      max_iter = max_iter.round().astype(int)

      lasso_regressor = Lasso(alpha=alpha,max_iter=max_iter,random_state=17)
      metric = cross_val_score(lasso_regressor, X=X_train_4_opt, y=y_train_4_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'alpha': (0.001, 1),'max_iter': (100,10000) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_lasso,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
lasso_regressor = Lasso(alpha=0.9457,max_iter=694)
lasso_regressor.fit(X_train_4_opt,y_train_4_opt )
y_pred_4_opt = lasso_regressor.predict(X_test_4_opt)
print(np.sqrt(mean_squared_error(y_test_4_opt, y_pred_4_opt)))

NameError: name 'Lasso' is not defined

Result for Lasso: 50.39

Notes: The best overall result for the Lasso Regressor despite the most difficult dataset.

In [None]:
def hyperparameter_function_xgboost(eta, gamma ,max_depth,reg_lambda,reg_alpha ):
      """ Function for hyperparameter optimization
      """
      max_depth = max_depth.round().astype(int)

      xgb_regressor = XGBRegressor(eta=eta,gamma=gamma,max_depth=max_depth,reg_lambda=reg_lambda, reg_alpha=reg_alpha)
      metric = cross_val_score(xgb_regressor, X=X_train_4_opt, y=y_train_4_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'eta': (0, 1),'gamma': (0,2),'max_depth':(1,10),'reg_lambda':(1,1),'reg_alpha':(0,0) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_xgboost,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

In [None]:
# eta= , gamma= , max_depth= ,lambda=1 , alpha=0 ->
xgb_regressor = XGBRegressor(eta=0.1149,gamma=0.4352,max_depth=4,reg_lambda=1, reg_alpha=0)
xgb_regressor.fit(X_train_4_opt,y_train_4_opt )
y_pred_4_opt = xgb_regressor.predict(X_test_4_opt)
print(np.sqrt(mean_squared_error(y_test_4_opt, y_pred_4_opt)))

Result for XGBRegressor: 43.26

Notes: Solid result but really didn't outperform any other regressor.

# Findings

Certain regressors are heavily hyperparameter dependent, especially lasso.
The feature selection was a mixed bag. While some regressors seemed to profit others less so. A more extensive evaluation could be performed in the future.
The datasets 2 & 4 were as promised by the EDA much more dificult to get a lower rmse. While datasets 1 & 3 we were able to reduce the rmse to the low twenties. This was not directly the case for 2 and 4.

# Results

RandomForest and XGBoostRegressor were overall our best performers. 



Final Notes: The original notebooks are in the referenced Repository in case any of our merged results do not reflect the communicated results, they should be used to compare.

Part of this was done in colab. The part can be found under the following link:
https://colab.research.google.com/drive/1F_hpmXcxYoJT3LsvXjF65c3_lZ7ltEr6?usp=sharing