In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on given sensor data (time series data). It is a forcasting problem, where the goal is to predict the number of cycles an engine will last before it fails.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
1. Perform a deep **exploratory data analysis (EDA)** on the given data.
2. Implement a more efficient **sliding window method** for time series data analysis. -> 🎯 **Focus on this task**
3. Apply **traditional machine learning methods** (SOTA) to predict the remaining useful life. Includes data preparation, feature extraction, feature selection, model selection, and model parameter optimization.
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.


# Imports + Settings

In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
!pip install -q tsfresh
!pip install -q colorlog

In [5]:
!pip install bayesian-optimization -q

In [45]:
# third-party libraries
import pandas as pd
import numpy as np
import os

import time
from tqdm.notebook import tqdm


# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html; Examples on how to approach classic classifiers
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ridge_regression, LogisticRegression, Lasso, LinearRegression

from tsfresh.feature_extraction import feature_calculators, MinimalFCParameters, EfficientFCParameters

# previous
from sklearn.metrics import mean_squared_error
from bayes_opt import BayesianOptimization
import sklearn.model_selection
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
!pip install -q tpot

In [8]:
from tpot import TPOTRegressor

In [9]:
import sys
sys.path.insert(0, '/content/drive/MyDrive/PSDA_cml')

In [10]:
# source code
from src.utils import load_data, load_config, train_val_split_by_group
from src.data_cleaning import clean_data, format_dtype
from src.rolling_window_creator import calculate_RUL, RollingWindowDatasetCreator


In [11]:
# settings
sns.set_style("whitegrid")
sns.set_palette("Set2")
sns.set(rc={"figure.dpi":100, 'savefig.dpi':200})
sns.set_context('notebook')

In [12]:
np.random.seed(42)

# Paths

In [17]:
PATH_TO_CONFIG = "drive/MyDrive/PSDA_cml/configs/config.yaml"

# Load Config + Data

In [None]:
#Debug

for i in MinimalFCParameters():
  print(i)



sum_values
median
mean
length
standard_deviation
variance
root_mean_square
maximum
absolute_maximum
minimum


In [15]:
config = load_config(PATH_TO_CONFIG) # config is dict

In [18]:
%%time

train_data, test_data,test_rul_data = load_data(config_path=PATH_TO_CONFIG, dataset_num=1)
train_data_2, test_data_2,test_rul_data_2 = load_data(config_path=PATH_TO_CONFIG, dataset_num=2)
train_data_3, test_data_3,test_rul_data_3 = load_data(config_path=PATH_TO_CONFIG, dataset_num=3)
train_data_4, test_data_4,test_rul_data_4 = load_data(config_path=PATH_TO_CONFIG, dataset_num=4)

2024-06-01 10:02:19 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 1...[0m


INFO:src.utils:Loading data set 1...


2024-06-01 10:02:20 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 1.[0m


INFO:src.utils:Loaded raw data for dataset 1.


2024-06-01 10:02:20 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (20631, 26)[0m


INFO:src.utils:Train Data: (20631, 26)


2024-06-01 10:02:20 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (13096, 26)[0m


INFO:src.utils:Test Data: (13096, 26)


2024-06-01 10:02:20 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


INFO:src.utils:Test RUL Data: (100, 1)


2024-06-01 10:02:20 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 2...[0m


INFO:src.utils:Loading data set 2...


2024-06-01 10:02:21 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 2.[0m


INFO:src.utils:Loaded raw data for dataset 2.


2024-06-01 10:02:21 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (53759, 26)[0m


INFO:src.utils:Train Data: (53759, 26)


2024-06-01 10:02:21 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (33991, 26)[0m


INFO:src.utils:Test Data: (33991, 26)


2024-06-01 10:02:21 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (259, 1)[0m


INFO:src.utils:Test RUL Data: (259, 1)


2024-06-01 10:02:21 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 3...[0m


INFO:src.utils:Loading data set 3...


2024-06-01 10:02:22 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 3.[0m


INFO:src.utils:Loaded raw data for dataset 3.


2024-06-01 10:02:22 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (24720, 26)[0m


INFO:src.utils:Train Data: (24720, 26)


2024-06-01 10:02:22 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (16596, 26)[0m


INFO:src.utils:Test Data: (16596, 26)


2024-06-01 10:02:22 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


INFO:src.utils:Test RUL Data: (100, 1)


2024-06-01 10:02:22 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 4...[0m


INFO:src.utils:Loading data set 4...


2024-06-01 10:02:24 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 4.[0m


INFO:src.utils:Loaded raw data for dataset 4.


2024-06-01 10:02:24 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (61249, 26)[0m


INFO:src.utils:Train Data: (61249, 26)


2024-06-01 10:02:24 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (41214, 26)[0m


INFO:src.utils:Test Data: (41214, 26)


2024-06-01 10:02:24 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (248, 1)[0m


INFO:src.utils:Test RUL Data: (248, 1)


CPU times: user 2.56 s, sys: 191 ms, total: 2.75 s
Wall time: 4.72 s


In [None]:
print("DS 1:" ,train_data.shape, "DS 2:" ,train_data_2.shape, "DS 3:" ,train_data_3.shape, "DS 4:" ,train_data_4.shape)

In [None]:
print("DS 1:" ,train_data.columns, "DS 2:" ,train_data_2.columns, "DS 3:" ,train_data_3.columns, "DS 4:" ,train_data_4.columns)

# 📍 << Task 2: Classic Machine learning >>

[TEMPLATE]

Findings:
* Interpretation of plots
* or other key take aways from previous code

In [None]:
# [TEMPLATE] - save processed data (as pickle)
df = pd.DataFrame()
timestamp = time.strftime("%Y%m%d-%H%M%S")
df.to_pickle(f"{config['paths']['processed_data_dir']}ex2_topic_{timestamp}.pkl")

In [None]:
# [TEMPLATE] - save data predictions (as csv)
df = pd.DataFrame()
timestamp = time.strftime("%Y%m%d-%H%M%S")
df.to_csv(f"{config['paths']['prediction_dir']}ex2_topic_{timestamp}.csv", sep=',', decimal='.')

In [None]:
# [TEMPLATE] - save plot results (as png)
fig = plt.figure(figsize=(9, 6))
timestamp = time.strftime("%Y%m%d-%H%M%S")
fig.savefig(f"{config['paths']['plot_dir']}ex2_topic_{timestamp}.png")

## Data Cleaning

Use the created in advanced functions to remove non-helpful data as determined by the EDA
(Currently without outlier removal)

In [None]:
cleaned_train, cleaned_test = clean_data(train_data, test_data, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.5)

2024-05-31 19:33:26 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m


INFO:src.data_cleaning:Cleaning train and test data...


2024-05-31 19:33:27 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m


INFO:src.data_cleaning:Formatting column types...


2024-05-31 19:33:27 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


DEBUG:src.data_cleaning:Found 0 categorical columns: []


2024-05-31 19:33:27 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


DEBUG:src.data_cleaning:Found 0 categorical columns: []


2024-05-31 19:33:27 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m


INFO:src.data_cleaning:Handling duplicates...


2024-05-31 19:33:27 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m


INFO:src.data_cleaning:Removing outliers...


2024-05-31 19:33:27 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m


DEBUG:src.outlier_detection:Removing outliers using method: None ...


2024-05-31 19:33:27 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m


INFO:src.outlier_detection:No outlier detection method specified. Skipping outlier detection.


2024-05-31 19:33:27 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m


DEBUG:src.outlier_detection:Removing outliers using method: None ...


2024-05-31 19:33:27 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m


INFO:src.outlier_detection:No outlier detection method specified. Skipping outlier detection.


2024-05-31 19:33:27 [[34msrc.data_cleaning:150[0m] [[32mINFO[0m] >>>> Filter features based train data...[0m


INFO:src.data_cleaning:Filter features based train data...


2024-05-31 19:33:27 [[34msrc.data_cleaning:26[0m] [DEBUG[0m] >>>> Found 0 features with missing values above the threshold of 0.1.[0m


DEBUG:src.data_cleaning:Found 0 features with missing values above the threshold of 0.1.


2024-05-31 19:33:27 [[34msrc.data_cleaning:46[0m] [DEBUG[0m] >>>> Found 1 features with only a single unique value: ['Operation Setting 3'][0m


DEBUG:src.data_cleaning:Found 1 features with only a single unique value: ['Operation Setting 3']


2024-05-31 19:33:27 [[34msrc.data_cleaning:103[0m] [DEBUG[0m] >>>> Found 3 uncorrelated features with a correlation threshold of 0.5: ['UnitNumber', 'Operation Setting 1', 'Operation Setting 2'][0m


DEBUG:src.data_cleaning:Found 3 uncorrelated features with a correlation threshold of 0.5: ['UnitNumber', 'Operation Setting 1', 'Operation Setting 2']


2024-05-31 19:33:27 [[34msrc.data_cleaning:162[0m] [[32mINFO[0m] >>>> Dropping features based on missing values, single unique values, and no target correlation...[0m


INFO:src.data_cleaning:Dropping features based on missing values, single unique values, and no target correlation...


2024-05-31 19:33:27 [[34msrc.data_cleaning:172[0m] [[32mINFO[0m] >>>> Data cleaning completed.[0m


INFO:src.data_cleaning:Data cleaning completed.


2024-05-31 19:33:27 [[34msrc.data_cleaning:173[0m] [[32mINFO[0m] >>>> Original train DataFrame shape: (20631, 14), Resulting train DataFrame shape: (20631, 14)[0m


INFO:src.data_cleaning:Original train DataFrame shape: (20631, 14), Resulting train DataFrame shape: (20631, 14)


2024-05-31 19:33:27 [[34msrc.data_cleaning:174[0m] [[32mINFO[0m] >>>> Original test DataFrame shape: (13096, 14), Resulting test DataFrame shape: (13096, 14)[0m


INFO:src.data_cleaning:Original test DataFrame shape: (13096, 14), Resulting test DataFrame shape: (13096, 14)


In [None]:
train_data = format_dtype(train_data)

2024-05-31 19:33:31 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


DEBUG:src.data_cleaning:Found 0 categorical columns: []


In [None]:
cleaned_train, cleaned_test = clean_data(train_data, test_data, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.5)
cleaned_train_2, cleaned_test_2 = clean_data(train_data_2, test_data_2, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.01)
cleaned_train_3, cleaned_test_3 = clean_data(train_data_3, test_data_3, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.3)
cleaned_train_4, cleaned_test_4 = clean_data(train_data_4, test_data_4, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.01)

In [None]:
print("DS 1:" ,train_data.shape, "DS 2:" ,train_data_2.shape, "DS 3:" ,train_data_3.shape, "DS 4:" ,train_data_4.shape)
print( )
print("DS 1:" ,train_data.columns,"\n", "DS 2:" ,train_data_2.columns,"\n", "DS 3:" ,train_data_3.columns,"\n", "DS 4:" ,train_data_4.columns)

## Feature Engineering

Choose a set of feature options for tsfresh windowing


In [20]:
# Currently using minimal to ease optimization so feature_list is not necessary
feature_list_ds_1 = ["c3", "quantile", "mean", "root_mean_square", "median", "time_reversal_asymmetry_statistic", "absolute_maximum", "maximum", "minimum", "agg_autocorrelation", "autocorrelation" ]
# feature_list for dataset 1




#currentpath = os.getcwd()
#ft_list = pd.read_pickle(currentpath+ "/data/processed/dataset1_remaining_features_0521.pkl")

In [19]:
min_ts = 29
max_ts = 30

## Windowing

via tsfresh

In [21]:
rwCreator_std = RollingWindowDatasetCreator(max_timeshift=max_ts,min_timeshift=min_ts,feature_extraction_mode= 'minimal',feature_list=["median"])
rwCreator_fte = RollingWindowDatasetCreator(max_timeshift=max_ts,min_timeshift=min_ts,feature_extraction_mode= 'custom',feature_list=feature_list)



NameError: name 'feature_list' is not defined

In [None]:
X_train_std, y_train_std, X_test_std, y_test_std = rwCreator_std.create_rolling_windows_datasets(train_data=cleaned_train, test_data=cleaned_test,test_RUL_data=test_rul_data,)
X_train_fte, y_train_fte, X_test_fte, y_test_fte = rwCreator_fte.create_rolling_windows_datasets(train_data=cleaned_train, test_data=cleaned_test,test_RUL_data=test_rul_data,)

2024-05-31 19:37:09 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for train data...
Rolling: 100%|██████████| 362/362 [00:27<00:00, 13.27it/s] 


2024-05-31 19:37:37 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


INFO:src.rolling_window_creator:Extracting features for train data...
Feature Extraction: 100%|██████████| 212772/212772 [02:10<00:00, 1627.23it/s]


2024-05-31 19:39:56 [[34msrc.rolling_window_creator:142[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m


INFO:src.rolling_window_creator:Calculating target for train data...


2024-05-31 19:39:56 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for test data...
Rolling: 100%|██████████| 303/303 [00:16<00:00, 18.47it/s] 


2024-05-31 19:40:13 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


INFO:src.rolling_window_creator:Extracting features for test data...
Feature Extraction: 100%|██████████| 1200/1200 [00:00<00:00, 2540.97it/s]


2024-05-31 19:40:14 [[34msrc.rolling_window_creator:170[0m] [[32mINFO[0m] >>>> Datasets created successfully.[0m


INFO:src.rolling_window_creator:Datasets created successfully.


2024-05-31 19:40:14 [[34msrc.rolling_window_creator:171[0m] [[32mINFO[0m] >>>> Shape of X_train: (17731, 120)[0m


INFO:src.rolling_window_creator:Shape of X_train: (17731, 120)


2024-05-31 19:40:14 [[34msrc.rolling_window_creator:172[0m] [[32mINFO[0m] >>>> Shape of y_train: (17731, 1)[0m


INFO:src.rolling_window_creator:Shape of y_train: (17731, 1)


2024-05-31 19:40:14 [[34msrc.rolling_window_creator:173[0m] [[32mINFO[0m] >>>> Shape of X_test: (100, 120)[0m


INFO:src.rolling_window_creator:Shape of X_test: (100, 120)


2024-05-31 19:40:14 [[34msrc.rolling_window_creator:174[0m] [[32mINFO[0m] >>>> Shape of y_test: (100, 1)[0m


INFO:src.rolling_window_creator:Shape of y_test: (100, 1)


2024-05-31 19:40:14 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for train data...
Rolling: 100%|██████████| 362/362 [00:30<00:00, 11.99it/s]


2024-05-31 19:40:45 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


INFO:src.rolling_window_creator:Extracting features for train data...
Feature Extraction: 100%|██████████| 212772/212772 [13:00<00:00, 272.48it/s]


2024-05-31 19:54:21 [[34msrc.rolling_window_creator:142[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m


INFO:src.rolling_window_creator:Calculating target for train data...


2024-05-31 19:54:22 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for test data...
Rolling: 100%|██████████| 303/303 [00:17<00:00, 17.37it/s] 


2024-05-31 19:54:40 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


INFO:src.rolling_window_creator:Extracting features for test data...
Feature Extraction: 100%|██████████| 1200/1200 [00:04<00:00, 281.20it/s]


2024-05-31 19:54:44 [[34msrc.rolling_window_creator:170[0m] [[32mINFO[0m] >>>> Datasets created successfully.[0m


INFO:src.rolling_window_creator:Datasets created successfully.


2024-05-31 19:54:44 [[34msrc.rolling_window_creator:171[0m] [[32mINFO[0m] >>>> Shape of X_train: (17731, 396)[0m


INFO:src.rolling_window_creator:Shape of X_train: (17731, 396)


2024-05-31 19:54:44 [[34msrc.rolling_window_creator:172[0m] [[32mINFO[0m] >>>> Shape of y_train: (17731, 1)[0m


INFO:src.rolling_window_creator:Shape of y_train: (17731, 1)


2024-05-31 19:54:44 [[34msrc.rolling_window_creator:173[0m] [[32mINFO[0m] >>>> Shape of X_test: (100, 396)[0m


INFO:src.rolling_window_creator:Shape of X_test: (100, 396)


2024-05-31 19:54:44 [[34msrc.rolling_window_creator:174[0m] [[32mINFO[0m] >>>> Shape of y_test: (100, 1)[0m


INFO:src.rolling_window_creator:Shape of y_test: (100, 1)


In [None]:
X_train_2, y_train_2, X_test_2, y_test_2 = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train_2, test_data=cleaned_test_2,test_RUL_data=test_rul_data_2,)
X_train_3, y_train_3, X_test_3, y_test_3 = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train_3, test_data=cleaned_test_3,test_RUL_data=test_rul_data_3,)
X_train_4, y_train_4, X_test_4, y_test_4 = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train_4, test_data=cleaned_test_4,test_RUL_data=test_rul_data_4,)

## Scaling

- StandardScaler

In [None]:
scaler_1 = StandardScaler()
scaler_2 = StandardScaler()

In [None]:
X_train_std = scaler_1.fit_transform(X_train_std)
X_test_std = scaler_1.fit_transform(X_test_std)

X_train_fte = scaler_2.fit_transform(X_train_fte)
X_test_fte = scaler_2.fit_transform(X_test_fte)

In [None]:
# Nachdem Hinweis am Montag in der Besprechung hinzugefügt. Verbesserung von ~10 pts. im RSME gesehen.
y_train_std = y_train_std.clip(upper=125)
y_train_fte = y_train_fte.clip(upper=125)

## Feature Selection

- RFE
- SimpleCorrApproach

## Models
- RandomForestClassifier - niklas
- SVC
- MLP - niklas
- GPC
- KNeighboursClassifier - niklas
- Gaussian Naive Bayes
- AdaBoostClassifier - niklas
- QuadraticDiscrimantAnalysis


In [None]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=42
    ),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    AdaBoostClassifier(algorithm="SAMME", random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),

]

In [None]:
regressors = [
    KNeighborsRegressor(3),
    SVR(kernel="linear", C=0.025),
    SVR(gamma=2, C=1),
    GaussianProcessRegressor(1.0 * RBF(1.0), random_state=42),
    #RandomForestRegressor(),#max_depth=5, n_estimators=10, max_features=5, random_state=42
    MLPRegressor(alpha=1, max_iter=1000, random_state=42),
    AdaBoostRegressor(random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

## Training & Evaluation

In [None]:
type(y_train.values.ravel())

In [None]:
#for clf in regressors:
#
#    clf.fit(X_train, y_train.values.ravel())
#    y_pred = clf.predict(X_test)
#
#    print(np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred)))

    #rmse = sklearn.model_selection.cross_val_score(clf, X=X_train, y=y_train, cv=5, scoring='root_mean_squared_error')
    #print(rmse)


In [None]:
%%time
# TEST between minimal VS Feat Eng

rgr_std  = KNeighborsRegressor(3)
rgr_std.fit(X_train_std, y_train_std.values.ravel())
y_pred_std = rgr_std.predict(X_test_std)
print(np.sqrt(sklearn.metrics.mean_squared_error(y_test_std, y_pred_std)))


rgr_fte  = KNeighborsRegressor(3)
rgr_fte.fit(X_train_fte, y_train_fte.values.ravel())
y_pred_fte = rgr_fte.predict(X_test_fte)
print(np.sqrt(sklearn.metrics.mean_squared_error(y_test_fte, y_pred_fte)))


25.071608555406964
22.376475742767596
CPU times: user 143 ms, sys: 20 ms, total: 163 ms
Wall time: 202 ms


In [None]:
%%time
rgr  = SVR(kernel="linear", C=0.025)

rgr.fit(X_train, y_train.values.ravel())
y_pred = rgr.predict(X_test)

print(np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred)))

24.416745560533734
CPU times: user 59.3 s, sys: 314 ms, total: 59.6 s
Wall time: 1min 6s


In [None]:
%%time
rgr  = SVR(gamma=2, C=1)

rgr.fit(X_train, y_train.values.ravel())
y_pred = rgr.predict(X_test)

print(np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred)))

48.81927793822187
CPU times: user 1min 11s, sys: 245 ms, total: 1min 11s
Wall time: 1min 12s


In [None]:
#rgr  = GaussianProcessRegressor(1.0 * RBF(1.0), random_state=42)

rgr.fit(X_train, y_train.values.ravel())
y_pred = rgr.predict(X_test)

print(np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred)))

In [None]:
%%time
# TEST between minimal VS Feat Eng

rgr_std  = RandomForestRegressor(max_depth=5, n_estimators=10, max_features=1, random_state=42)
rgr_std.fit(X_train_std, y_train_std.values.ravel())
y_pred_std = rgr_std.predict(X_test_std)
print(np.sqrt(sklearn.metrics.mean_squared_error(y_test_std, y_pred_std)))

rgr_fte  = RandomForestRegressor(max_depth=5, n_estimators=10, max_features=1, random_state=42)
rgr_fte.fit(X_train_fte, y_train_fte.values.ravel())
y_pred_fte = rgr_fte.predict(X_test_fte)
print(np.sqrt(sklearn.metrics.mean_squared_error(y_test_fte, y_pred_fte)))

24.180103379729868
23.656090593432143
CPU times: user 197 ms, sys: 15 ms, total: 212 ms
Wall time: 229 ms


In [None]:
%%time
rgr  = MLPRegressor(alpha=1, max_iter=1000, random_state=42)
rgr.fit(X_train, y_train.values.ravel())
y_pred = rgr.predict(X_test)

print(np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred)))

30.842518615046394
CPU times: user 3min 48s, sys: 2min 22s, total: 6min 11s
Wall time: 4min 4s


In [None]:
%%time
rgr  = AdaBoostRegressor(random_state=42)
rgr.fit(X_train, y_train.values.ravel())
y_pred = rgr.predict(X_test)

print(np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred)))

25.612671397542982
CPU times: user 29.1 s, sys: 41.3 ms, total: 29.1 s
Wall time: 30.6 s


In [None]:
%%time
rgr  = GaussianNB()

rgr.fit(X_train, y_train.values.ravel())
y_pred = rgr.predict(X_test)

print(np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred)))

60.878814048895535
CPU times: user 89.9 ms, sys: 45.9 ms, total: 136 ms
Wall time: 85.8 ms


In [None]:
rgr  = QuadraticDiscriminantAnalysis()

rgr.fit(X_train, y_train)
y_pred = rgr.predict(X_test)

print(np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred)))

In [None]:
rgr  = KernelRidge()

rgr.fit(X_train, y_train.values.ravel())
y_pred = rgr.predict(X_test)

print(np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred)))

In [None]:
%%time
# TEST between minimal VS Feat Eng

rgr_std  = Lasso()
rgr_std.fit(X_train_std, y_train_std.values.ravel())
y_pred_std = rgr_std.predict(X_test_std)
print(np.sqrt(sklearn.metrics.mean_squared_error(y_test_std, y_pred_std)))

rgr_fte  = Lasso()
rgr_fte.fit(X_train_fte, y_train_fte.values.ravel())
y_pred_fte = rgr_fte.predict(X_test_fte)
print(np.sqrt(sklearn.metrics.mean_squared_error(y_test_fte, y_pred_fte)))


22.454939897580797
19.546604449298922
CPU times: user 6.4 s, sys: 1.47 s, total: 7.87 s
Wall time: 4.11 s


In [None]:
%%time
rgr  = LinearRegression()

rgr.fit(X_train, y_train.values.ravel())
y_pred = rgr.predict(X_test)

print(np.sqrt(mean_squared_error(y_test, y_pred)))

223930.8766704401
CPU times: user 346 ms, sys: 48.8 ms, total: 394 ms
Wall time: 251 ms


In [None]:
%%time
rgr  = LogisticRegression()

rgr.fit(X_train, y_train.values.ravel())
y_pred = rgr.predict(X_test)

print(np.sqrt(mean_squared_error(y_test, y_pred)))

37.078834933152905
CPU times: user 20.4 s, sys: 13.1 s, total: 33.5 s
Wall time: 20.2 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
%%time
# TEST between minimal VS Feat Eng

rgr_std  = GradientBoostingRegressor()
rgr_std.fit(X_train_std, y_train_std.values.ravel())
y_pred_std = rgr_std.predict(X_test_std)
print(np.sqrt(sklearn.metrics.mean_squared_error(y_test_std, y_pred_std)))

rgr_fte  = GradientBoostingRegressor()
rgr_fte.fit(X_train_fte, y_train_fte.values.ravel())
y_pred_fte = rgr_fte.predict(X_test_fte)
print(np.sqrt(sklearn.metrics.mean_squared_error(y_test_fte, y_pred_fte)))



22.121312383899582
17.64114406542858
CPU times: user 5min 40s, sys: 346 ms, total: 5min 40s
Wall time: 5min 43s


In [None]:
!pip install xgboost



In [None]:
from xgboost import XGBRegressor

In [None]:
%%time
# TEST between minimal VS Feat Eng

rgr_std = XGBRegressor(n_estimators=3, max_depth=1, learning_rate=0.211)
rgr_std.fit(X_train_std, y_train_std.values.ravel())
y_pred_std = rgr_std.predict(X_test_std)
print(np.sqrt(mean_squared_error(y_test_std, y_pred_std)))


rgr_fte = XGBRegressor(n_estimators=3, max_depth=1, learning_rate=0.211)
rgr_fte.fit(X_train_fte, y_train_fte.values.ravel())
y_pred_fte = rgr_fte.predict(X_test_fte)
print(np.sqrt(mean_squared_error(y_test_fte, y_pred_fte)))

31.723973334531024
31.826748538652865
CPU times: user 2.42 s, sys: 55.7 ms, total: 2.47 s
Wall time: 2.52 s


## First Observations

Size is too big -> notebook outsourced to colab


## Optimizations

-- TODO --

In [None]:
def hyperparameter_function(clf):
    ''' Hyperparameter Opti
    '''
    classifier = clf
    rmse = sklearn.model_selection.cross_val_score(classifier, X=X_train, y=y_train, cv=5, scoring='root_mean_squared_error')
    return rmse



# Bounded region of parameter space
    pbounds = {'neighbours': (3, 7)}

    optimizer = BayesianOptimization(
        f=hyperparameter_function_knn,
        pbounds=pbounds,
        random_state=17,
        allow_duplicate_points= True
    )

    optimizer.maximize(
        init_points=50,
        n_iter=100,
    )

## Testing

- Operation Setting Dependency in Datasets 2 & 4
- Basic Loop with 3 Regressors

In [None]:
train_data_2_wRUL = calculate_RUL(train_data_2, time_column='Cycle', group_column='UnitNumber')
train_data_4_wRUL = calculate_RUL(train_data_4, time_column='Cycle', group_column='UnitNumber')

In [None]:
print(train_data_2.shape)
print(train_data_2_wRUL.shape)
print(train_data_2_wRUL.columns)

(53759, 26)
(53759, 27)
Index(['UnitNumber', 'Cycle', 'Operation Setting 1', 'Operation Setting 2',
       'Operation Setting 3', 'Sensor Measure 1', 'Sensor Measure 2',
       'Sensor Measure 3', 'Sensor Measure 4', 'Sensor Measure 5',
       'Sensor Measure 6', 'Sensor Measure 7', 'Sensor Measure 8',
       'Sensor Measure 9', 'Sensor Measure 10', 'Sensor Measure 11',
       'Sensor Measure 12', 'Sensor Measure 13', 'Sensor Measure 14',
       'Sensor Measure 15', 'Sensor Measure 16', 'Sensor Measure 17',
       'Sensor Measure 18', 'Sensor Measure 19', 'Sensor Measure 20',
       'Sensor Measure 21', 'RUL'],
      dtype='object')


In [None]:
#print("Dataset1:", train_data['Operation Setting 1'].value_counts(), train_data['Operation Setting 2'].value_counts(), train_data['Operation Setting 3'].value_counts())
print("Dataset2:", train_data_2_wRUL['Operation Setting 1'].value_counts(), train_data_2_wRUL['Operation Setting 2'].value_counts(), train_data_2_wRUL['Operation Setting 3'].value_counts())
print("Dataset4:", train_data_4_wRUL['Operation Setting 1'].value_counts(), train_data_4_wRUL['Operation Setting 2'].value_counts(), train_data_4_wRUL['Operation Setting 3'].value_counts())

Dataset2: Operation Setting 1
0.0020     310
0.0004     308
0.0026     293
0.0029     285
0.0022     281
          ... 
24.9980     43
25.0080     41
10.0080     41
34.9980     39
19.9980     32
Name: count, Length: 536, dtype: int64 Operation Setting 2
0.8400    11106
0.0000     4187
0.2500     4138
0.6200     4128
0.7000     4097
          ...  
0.7005      168
0.7020      124
0.6220       98
0.0020       93
0.2520       93
Name: count, Length: 105, dtype: int64 Operation Setting 3
100.0    45757
60.0      8002
Name: count, dtype: int64
Dataset4: Operation Setting 1
0.0022     339
0.0015     333
0.0023     324
0.0019     323
0.0006     323
          ... 
35.0080     48
34.9980     40
24.9980     37
10.0080     36
25.0080     36
Name: count, Length: 536, dtype: int64 Operation Setting 2
0.8400    12511
0.0000     4776
0.2500     4686
0.6200     4683
0.7000     4634
          ...  
0.7006      186
0.7020      124
0.0020      115
0.6220      115
0.2520      102
Name: count, Length: 105,

In [None]:
tr_df_2_os2_84 = train_data_2_wRUL[train_data_2_wRUL['Operation Setting 2'] == 0.8400]
tr_df_2_os3_60 = train_data_2_wRUL[train_data_2_wRUL['Operation Setting 3'] == 60.0]


tr_df_4_os2_84 = train_data_4_wRUL[train_data_4_wRUL['Operation Setting 2'] == 0.8400]
tr_df_4_os3_60 = train_data_4_wRUL[train_data_4_wRUL['Operation Setting 3'] == 60.0]

In [None]:
print("Dataset 2")
print("Operation Setting 2")
for col in tr_df_2_os2_84.columns:
  if abs(tr_df_2_os2_84['RUL'].corr(tr_df_2_os2_84[col])) >= 0.5:
    print(col)

print("Operation Setting 3")
for col in tr_df_2_os3_60.columns:
  if abs(tr_df_2_os3_60['RUL'].corr(tr_df_2_os3_60[col])) >= 0.5:
    print(col)

print("Dataset 3")
print("Operation Setting 2")
for col in tr_df_4_os2_84.columns:
  if abs(tr_df_4_os2_84['RUL'].corr(tr_df_4_os2_84[col])) >= 0.5:
    print(col)

print("Operation Setting 3")
for col in tr_df_4_os3_60.columns:
  if abs(tr_df_4_os3_60['RUL'].corr(tr_df_4_os3_60[col])) >= 0.5:
    print(col)


Dataset 2
Operation Setting 2
Cycle
Sensor Measure 4
Sensor Measure 11
Sensor Measure 15
RUL
Operation Setting 3
Cycle
Sensor Measure 3
Sensor Measure 4
Sensor Measure 11
Sensor Measure 15
Sensor Measure 17
RUL
Dataset 3
Operation Setting 2
Cycle
Sensor Measure 4
Sensor Measure 11
RUL
Operation Setting 3
Cycle
Sensor Measure 3
Sensor Measure 4
Sensor Measure 9
Sensor Measure 11
Sensor Measure 17
RUL


# Findings:

This eyperiment shows that certain sensors measures are still useful in Dataset 2 & 4

Thus they should get a manual droplist instead of automatic datacleaning

DS2: 3,4,11,15,17
DS4: 3,4,9,11,17


Basic pipeline with 3 regressors to evaluate tsfresh features

In [None]:
#test
feat = "test"
rgr1 = 24.1
rgr2 = 24.2
rgr3 = 24.3
rgr4 = 24.4

d = {'Feature': feat, 'Regressor Results': [f"KNR: {rgr1}",f"RFR: {rgr2}", f"Lasso: {rgr3}", f"XBGr: {rgr4}"]}
df = pd.DataFrame(data=d)
df.to_pickle(f"drive/MyDrive/PSDA_cml/data/processed/ds4_tsf-feat_eff_results.pkl")

In [None]:
df = pd.read_pickle("drive/MyDrive/PSDA_cml/data/processed/ds4_tsf-feat_eff_results.pkl")
df.head()

In [None]:
#train_data
#clean data
#choose 1 ft from tsfresh.feature_extraction importfeature_calculators, MinimalFCParameters, EfficientFCParameters
#scale
#fit 1-3
#eval
#export findings

In [22]:
from tsfresh.feature_extraction import feature_calculators, MinimalFCParameters, EfficientFCParameters

3,4,11,15,17 DS4: 3,4,9,11,17

In [None]:
cleaned_train_1, cleaned_test_1 = clean_data(train_data_4, test_data_4, method=None, ignore_columns=['UnitNumber', 'Cycle','Sensor Measure 3','Sensor Measure 4','Sensor Measure 9','Sensor Measure 11','Sensor Measure 15','Sensor Measure 17',], threshold_missing=0.1, threshold_corr=0.1)

In [None]:
min_timeshift = 17
max_timeshift = 18

In [None]:
print(train_data.columns)
cleaned_train_1.columns

In [None]:
rwCreator = RollingWindowDatasetCreator(max_timeshift=max_timeshift,min_timeshift=min_timeshift,feature_extraction_mode= 'custom',feature_list=["feat"])
X_train_1, y_train_1, X_test_1, y_test_1 = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train_1, test_data=cleaned_test_1,test_RUL_data=test_rul_data,)

In [None]:
i= 0
for feat in EfficientFCParameters():
  #Counter
  print(i)
  i = i +1
  # RollingWindow
  rwCreator = RollingWindowDatasetCreator(max_timeshift=max_timeshift,min_timeshift=min_timeshift,feature_extraction_mode= 'custom',feature_list=[feat])
  X_train_1, y_train_1, X_test_1, y_test_1 = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train_1, test_data=cleaned_test_1,test_RUL_data=test_rul_data_4,)
  # KNeighborsRegressor
  knr = KNeighborsRegressor(3)
  knr.fit(X_train_1, y_train_1.values.ravel())
  rgr1 = np.sqrt(sklearn.metrics.mean_squared_error(y_test_1, knr.predict(X_test_1)))
  # RandomForestRegressor
  rfr  = RandomForestRegressor(max_depth=5, n_estimators=10, max_features=1, random_state=42)
  rfr.fit(X_train_1, y_train_1.values.ravel())
  rgr2 = np.sqrt(sklearn.metrics.mean_squared_error(y_test_1, rfr.predict(X_test_1)))
  # Lasso
  lr = Lasso()
  lr.fit(X_train_1, y_train_1.values.ravel())
  rgr3 = np.sqrt(sklearn.metrics.mean_squared_error(y_test_1, lr.predict(X_test_1)))
  # XGBRegressor
  xgbr = XGBRegressor(n_estimators=3, max_depth=1, learning_rate=0.211) # objective='binary:logistic'
  xgbr.fit(X_train_1, y_train_1.values.ravel())
  rgr4 = np.sqrt(mean_squared_error(y_test_1, xgbr.predict(X_test_1)))
  # Prev Data import
  df_in = pd.read_pickle("drive/MyDrive/PSDA_cml/data/processed/ds4_tsf-feat_eff_results.pkl")
  df = pd.DataFrame(data={'Feature': feat, 'Regressor Results': [f"KNR: {rgr1}",f"RFR: {rgr2}", f"Lasso: {rgr3}", f"XBGr: {rgr4}"]})
  df_out = pd.concat([df_in, df])
  df_out.to_pickle(f"drive/MyDrive/PSDA_cml/data/processed/ds4_tsf-feat_eff_results.pkl")
  print({'Feature': feat, 'Regressor Results': [f"KNR: {rgr1}",f"RFR: {rgr2}", f"Lasso: {rgr3}", f"XBGr: {rgr4}"]})

In [None]:
df = pd.read_pickle("drive/MyDrive/PSDA_cml/data/processed/ds1_tsf-feat_eff_results.pkl")

In [None]:
df

In [None]:
# After compiling the best features for all 4 datasets

# Bayesian Optimization over the 4 chosen ones
# KNR RFR Lasso XGBR

In [None]:
# Dataset 1 TPOT Hyperparameter Optimization

In [None]:
min_ts_tpot = 29
max_ts_tpot = 30
cleaned_train_tpot, cleaned_test_tpot = clean_data(train_data, test_data, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.5)
rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_tpot,min_timeshift=min_ts_tpot,feature_extraction_mode= 'minimal',feature_list=["median"])
X_train_tpot, y_train_tpot, X_test_tpot, y_test_tpot = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train_tpot, test_data=cleaned_test_tpot,test_RUL_data=test_rul_data,)
y_train_tpot = y_train_tpot.clip(upper=125)

2024-05-31 11:58:23 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m


INFO:src.data_cleaning:Cleaning train and test data...


2024-05-31 11:58:23 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m


INFO:src.data_cleaning:Formatting column types...


2024-05-31 11:58:23 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


DEBUG:src.data_cleaning:Found 0 categorical columns: []


2024-05-31 11:58:24 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


DEBUG:src.data_cleaning:Found 0 categorical columns: []


2024-05-31 11:58:24 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m


INFO:src.data_cleaning:Handling duplicates...


2024-05-31 11:58:24 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m


INFO:src.data_cleaning:Removing outliers...


2024-05-31 11:58:24 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m


DEBUG:src.outlier_detection:Removing outliers using method: None ...


2024-05-31 11:58:24 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m


INFO:src.outlier_detection:No outlier detection method specified. Skipping outlier detection.


2024-05-31 11:58:24 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m


DEBUG:src.outlier_detection:Removing outliers using method: None ...


2024-05-31 11:58:24 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m


INFO:src.outlier_detection:No outlier detection method specified. Skipping outlier detection.


2024-05-31 11:58:24 [[34msrc.data_cleaning:150[0m] [[32mINFO[0m] >>>> Filter features based train data...[0m


INFO:src.data_cleaning:Filter features based train data...


2024-05-31 11:58:24 [[34msrc.data_cleaning:26[0m] [DEBUG[0m] >>>> Found 0 features with missing values above the threshold of 0.1.[0m


DEBUG:src.data_cleaning:Found 0 features with missing values above the threshold of 0.1.


2024-05-31 11:58:24 [[34msrc.data_cleaning:46[0m] [DEBUG[0m] >>>> Found 0 features with only a single unique value: [][0m


DEBUG:src.data_cleaning:Found 0 features with only a single unique value: []


2024-05-31 11:58:24 [[34msrc.data_cleaning:103[0m] [DEBUG[0m] >>>> Found 1 uncorrelated features with a correlation threshold of 0.5: ['UnitNumber'][0m


DEBUG:src.data_cleaning:Found 1 uncorrelated features with a correlation threshold of 0.5: ['UnitNumber']


2024-05-31 11:58:24 [[34msrc.data_cleaning:162[0m] [[32mINFO[0m] >>>> Dropping features based on missing values, single unique values, and no target correlation...[0m


INFO:src.data_cleaning:Dropping features based on missing values, single unique values, and no target correlation...


2024-05-31 11:58:24 [[34msrc.data_cleaning:172[0m] [[32mINFO[0m] >>>> Data cleaning completed.[0m


INFO:src.data_cleaning:Data cleaning completed.


2024-05-31 11:58:24 [[34msrc.data_cleaning:173[0m] [[32mINFO[0m] >>>> Original train DataFrame shape: (20631, 14), Resulting train DataFrame shape: (20631, 14)[0m


INFO:src.data_cleaning:Original train DataFrame shape: (20631, 14), Resulting train DataFrame shape: (20631, 14)


2024-05-31 11:58:24 [[34msrc.data_cleaning:174[0m] [[32mINFO[0m] >>>> Original test DataFrame shape: (13096, 14), Resulting test DataFrame shape: (13096, 14)[0m


INFO:src.data_cleaning:Original test DataFrame shape: (13096, 14), Resulting test DataFrame shape: (13096, 14)


2024-05-31 11:58:24 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for train data...
Rolling: 100%|██████████| 362/362 [00:27<00:00, 13.02it/s] 


2024-05-31 11:58:52 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


INFO:src.rolling_window_creator:Extracting features for train data...
Feature Extraction: 100%|██████████| 212772/212772 [01:32<00:00, 2293.32it/s]


2024-05-31 12:00:33 [[34msrc.rolling_window_creator:142[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m


INFO:src.rolling_window_creator:Calculating target for train data...


2024-05-31 12:00:33 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for test data...
Rolling: 100%|██████████| 303/303 [00:15<00:00, 19.94it/s] 


2024-05-31 12:00:48 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


INFO:src.rolling_window_creator:Extracting features for test data...
Feature Extraction: 100%|██████████| 1200/1200 [00:00<00:00, 2612.74it/s]


2024-05-31 12:00:49 [[34msrc.rolling_window_creator:170[0m] [[32mINFO[0m] >>>> Datasets created successfully.[0m


INFO:src.rolling_window_creator:Datasets created successfully.


2024-05-31 12:00:49 [[34msrc.rolling_window_creator:171[0m] [[32mINFO[0m] >>>> Shape of X_train: (17731, 120)[0m


INFO:src.rolling_window_creator:Shape of X_train: (17731, 120)


2024-05-31 12:00:49 [[34msrc.rolling_window_creator:172[0m] [[32mINFO[0m] >>>> Shape of y_train: (17731, 1)[0m


INFO:src.rolling_window_creator:Shape of y_train: (17731, 1)


2024-05-31 12:00:49 [[34msrc.rolling_window_creator:173[0m] [[32mINFO[0m] >>>> Shape of X_test: (100, 120)[0m


INFO:src.rolling_window_creator:Shape of X_test: (100, 120)


2024-05-31 12:00:49 [[34msrc.rolling_window_creator:174[0m] [[32mINFO[0m] >>>> Shape of y_test: (100, 1)[0m


INFO:src.rolling_window_creator:Shape of y_test: (100, 1)


In [None]:
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_squared_error#, neg_mean_squared_error

In [None]:
tpot = TPOTRegressor(generations=5, population_size=4, verbosity=2,
                      scoring='neg_mean_squared_error')
tpot.fit(X_train_tpot, y_train_tpot.values.ravel())

print(tpot.score(X_test_tpot, y_test_tpot))

Optimization Progress:   0%|          | 0/24 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -1647.5982802897245

Generation 2 - Current best internal CV score: -1647.5982802897245

Generation 3 - Current best internal CV score: -1644.2913001924958

Generation 4 - Current best internal CV score: -1644.2913001924958

Generation 5 - Current best internal CV score: -1644.2913001924958

Best pipeline: RandomForestRegressor(input_matrix, bootstrap=True, max_features=0.2, min_samples_leaf=14, min_samples_split=16, n_estimators=100)
-1616.0362900072512


  y = column_or_1d(y, warn=True)


# Findings
- TPOT mit default Parametern dauert aufwärts von 4 Stunden
- Mit kleineren Werten(Generations = 2) keine sinnvollen Ergebnisse mse von ~1790
- Letzter Test von TPOT (Generations = 5, Pop_size = 4):

## Bayesian Optimization

Datensatz 1

In [59]:
## Laden des ersten Datensatzes
train_data_1_opt, test_data_1_opt,test_rul_data_1_opt = load_data(config_path=PATH_TO_CONFIG, dataset_num=1)
cleaned_train_1_opt, cleaned_test_1_opt = clean_data(train_data_1_opt, test_data_1_opt, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.5)

# Train Val Split
cl_train_1_opt, cl_val_1_opt = train_val_split_by_group(df = cleaned_train_1_opt,group = "UnitNumber",test_size = 0.18,n_splits = 2,random_state = 7)

## RollingWindowParameter
min_ts_1_opt = 29
max_ts_1_opt = 30
feature_list_ds_1 = ["c3", "quantile", "mean", "root_mean_square", "median", "time_reversal_asymmetry_statistic", "absolute_maximum", "maximum", "minimum", "agg_autocorrelation", "autocorrelation" ]

rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_1_opt,min_timeshift=min_ts_1_opt,feature_extraction_mode= 'custom',feature_list=feature_list_ds_1)
#rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_1_opt,min_timeshift=min_ts_1_opt,feature_extraction_mode= 'minimal',feature_list=["median"])


X_train_1_opt, y_train_1_opt = rwCreator._process_data(cl_train_1_opt, 'train')
X_val_1_opt, y_val_1_opt = rwCreator._process_data(cl_val_1_opt, 'train')
X_test_1_opt, y_test_1_opt = rwCreator._process_data(cleaned_test_1_opt, 'test', test_rul_data_1_opt)

y_train_1_opt = y_train_1_opt.clip(upper=125)



scaler = StandardScaler()
X_train_1_opt[2:] = scaler.fit_transform(X_train_1_opt[2:])
X_val_1_opt[2:] = scaler.fit_transform(X_val_1_opt[2:])
X_test_1_opt[2:] = scaler.fit_transform(X_test_1_opt[2:])

2024-06-01 13:15:34 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 1...[0m


INFO:src.utils:Loading data set 1...


2024-06-01 13:15:34 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 1.[0m


INFO:src.utils:Loaded raw data for dataset 1.


2024-06-01 13:15:34 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (20631, 26)[0m


INFO:src.utils:Train Data: (20631, 26)


2024-06-01 13:15:34 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (13096, 26)[0m


INFO:src.utils:Test Data: (13096, 26)


2024-06-01 13:15:34 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


INFO:src.utils:Test RUL Data: (100, 1)


2024-06-01 13:15:34 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m


INFO:src.data_cleaning:Cleaning train and test data...


2024-06-01 13:15:34 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m


INFO:src.data_cleaning:Formatting column types...


2024-06-01 13:15:34 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


DEBUG:src.data_cleaning:Found 0 categorical columns: []


2024-06-01 13:15:34 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


DEBUG:src.data_cleaning:Found 0 categorical columns: []


2024-06-01 13:15:34 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m


INFO:src.data_cleaning:Handling duplicates...


2024-06-01 13:15:34 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m


INFO:src.data_cleaning:Removing outliers...


2024-06-01 13:15:34 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m


DEBUG:src.outlier_detection:Removing outliers using method: None ...


2024-06-01 13:15:34 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m


INFO:src.outlier_detection:No outlier detection method specified. Skipping outlier detection.


2024-06-01 13:15:34 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m


DEBUG:src.outlier_detection:Removing outliers using method: None ...


2024-06-01 13:15:34 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m


INFO:src.outlier_detection:No outlier detection method specified. Skipping outlier detection.


2024-06-01 13:15:34 [[34msrc.data_cleaning:150[0m] [[32mINFO[0m] >>>> Filter features based train data...[0m


INFO:src.data_cleaning:Filter features based train data...


2024-06-01 13:15:34 [[34msrc.data_cleaning:26[0m] [DEBUG[0m] >>>> Found 0 features with missing values above the threshold of 0.1.[0m


DEBUG:src.data_cleaning:Found 0 features with missing values above the threshold of 0.1.


2024-06-01 13:15:34 [[34msrc.data_cleaning:46[0m] [DEBUG[0m] >>>> Found 7 features with only a single unique value: ['Operation Setting 3', 'Sensor Measure 1', 'Sensor Measure 5', 'Sensor Measure 10', 'Sensor Measure 16', 'Sensor Measure 18', 'Sensor Measure 19'][0m


DEBUG:src.data_cleaning:Found 7 features with only a single unique value: ['Operation Setting 3', 'Sensor Measure 1', 'Sensor Measure 5', 'Sensor Measure 10', 'Sensor Measure 16', 'Sensor Measure 18', 'Sensor Measure 19']


2024-06-01 13:15:34 [[34msrc.data_cleaning:103[0m] [DEBUG[0m] >>>> Found 6 uncorrelated features with a correlation threshold of 0.5: ['UnitNumber', 'Operation Setting 1', 'Operation Setting 2', 'Sensor Measure 6', 'Sensor Measure 9', 'Sensor Measure 14'][0m


DEBUG:src.data_cleaning:Found 6 uncorrelated features with a correlation threshold of 0.5: ['UnitNumber', 'Operation Setting 1', 'Operation Setting 2', 'Sensor Measure 6', 'Sensor Measure 9', 'Sensor Measure 14']


2024-06-01 13:15:34 [[34msrc.data_cleaning:162[0m] [[32mINFO[0m] >>>> Dropping features based on missing values, single unique values, and no target correlation...[0m


INFO:src.data_cleaning:Dropping features based on missing values, single unique values, and no target correlation...


2024-06-01 13:15:34 [[34msrc.data_cleaning:172[0m] [[32mINFO[0m] >>>> Data cleaning completed.[0m


INFO:src.data_cleaning:Data cleaning completed.


2024-06-01 13:15:34 [[34msrc.data_cleaning:173[0m] [[32mINFO[0m] >>>> Original train DataFrame shape: (20631, 14), Resulting train DataFrame shape: (20631, 14)[0m


INFO:src.data_cleaning:Original train DataFrame shape: (20631, 14), Resulting train DataFrame shape: (20631, 14)


2024-06-01 13:15:34 [[34msrc.data_cleaning:174[0m] [[32mINFO[0m] >>>> Original test DataFrame shape: (13096, 14), Resulting test DataFrame shape: (13096, 14)[0m


INFO:src.data_cleaning:Original test DataFrame shape: (13096, 14), Resulting test DataFrame shape: (13096, 14)


2024-06-01 13:15:34 [[34msrc.utils:131[0m] [[32mINFO[0m] >>>> Train set contains 82 different engines --> in total 16807[0m


INFO:src.utils:Train set contains 82 different engines --> in total 16807


2024-06-01 13:15:34 [[34msrc.utils:132[0m] [[32mINFO[0m] >>>>  Test set contains 18 different engines --> in total 3824[0m


INFO:src.utils: Test set contains 18 different engines --> in total 3824


2024-06-01 13:15:35 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for train data...
Rolling: 100%|██████████| 362/362 [00:26<00:00, 13.50it/s] 


2024-06-01 13:16:02 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


INFO:src.rolling_window_creator:Extracting features for train data...
Feature Extraction: 100%|██████████| 173148/173148 [10:22<00:00, 278.28it/s]


2024-06-01 13:26:46 [[34msrc.rolling_window_creator:142[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m


INFO:src.rolling_window_creator:Calculating target for train data...


2024-06-01 13:26:47 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for train data...
Rolling: 100%|██████████| 341/341 [00:04<00:00, 71.63it/s] 


2024-06-01 13:26:51 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


INFO:src.rolling_window_creator:Extracting features for train data...
Feature Extraction: 100%|██████████| 39624/39624 [02:19<00:00, 283.75it/s]


2024-06-01 13:29:15 [[34msrc.rolling_window_creator:142[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m


INFO:src.rolling_window_creator:Calculating target for train data...


2024-06-01 13:29:15 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for test data...
Rolling: 100%|██████████| 303/303 [00:16<00:00, 18.63it/s] 


2024-06-01 13:29:33 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


INFO:src.rolling_window_creator:Extracting features for test data...
Feature Extraction: 100%|██████████| 1200/1200 [00:03<00:00, 332.81it/s]


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import root_mean_squared_error

In [None]:
def hyperparameter_function_knn(neighbours):
      """ Function for hyperparameter optimization
      """
      neighbours = neighbours.round().astype(int)
      knn_regressor = KNeighborsRegressor(n_neighbors=neighbours)
      metric = cross_val_score(knn_regressor, X=X_train_1_opt, y=y_train_1_opt, cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'neighbours': (1, 750)}

optimizer = BayesianOptimization(
  f=hyperparameter_function_knn,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   | neighb... |
-------------------------------------
| [0m1        [0m | [0m-22.76   [0m | [0m221.7    [0m |
| [95m2        [0m | [95m-22.71   [0m | [95m398.4    [0m |
| [0m3        [0m | [0m-22.92   [0m | [0m144.4    [0m |
| [0m4        [0m | [0m-23.71   [0m | [0m51.86    [0m |
| [0m5        [0m | [0m-22.72   [0m | [0m590.5    [0m |
| [0m6        [0m | [0m-22.71   [0m | [0m492.6    [0m |
| [0m7        [0m | [0m-22.71   [0m | [0m478.5    [0m |
| [95m8        [0m | [95m-22.71   [0m | [95m432.1    [0m |
| [0m9        [0m | [0m-24.22   [0m | [0m30.26    [0m |
| [0m10       [0m | [0m-22.72   [0m | [0m269.0    [0m |
| [95m11       [0m | [95m-22.71   [0m | [95m435.5    [0m |
| [0m12       [0m | [0m-22.77   [0m | [0m744.9    [0m |
| [0m13       [0m | [0m-22.74   [0m | [0m668.2    [0m |
| [0m14       [0m | [0m-22.71   [0m | [0m331.4    [0m |
| [0m15       [0m | [0m-22.72   [0m | [0m5

KeyboardInterrupt: 

In [60]:
# Neigbors = 383
knn_regressor = KNeighborsRegressor(n_neighbors=383)
knn_regressor.fit(X_train_1_opt,y_train_1_opt )
y_pred_1_opt = knn_regressor.predict(X_test_1_opt)
print(np.sqrt(mean_squared_error(y_test_1_opt, y_pred_1_opt)))

23.361902580572174


In [None]:
def hyperparameter_function_rf(n_estimators, max_features, ):
      """ Function for hyperparameter optimization
      """
      n_estimators = n_estimators.round().astype(int)
      max_features = max_features.round().astype(int)

      rf_regressor = RandomForestRegressor(n_estimators=n_estimators,max_features=max_features,random_state=17,n_jobs=-1)
      metric = cross_val_score(rf_regressor, X=X_train_1_opt, y=y_train_1_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'n_estimators': (20, 500),'max_features': (1,1) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_rf,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   | max_fe... | n_esti... |
-------------------------------------------------
| [0m1        [0m | [0m-24.24   [0m | [0m1.0      [0m | [0m274.7    [0m |
| [0m2        [0m | [0m-24.66   [0m | [0m1.0      [0m | [0m52.59    [0m |
| [95m3        [0m | [95m-24.22   [0m | [95m1.0      [0m | [95m335.0    [0m |


KeyboardInterrupt: 

In [80]:
# n_estimators=296 , max_features=4 -> 22.93
rf_regressor = RandomForestRegressor(n_estimators=296,max_features=4,random_state=17)
rf_regressor.fit(X_train_1_opt,y_train_1_opt.values.ravel())
y_pred_1_opt = rf_regressor.predict(X_test_1_opt)
print(np.sqrt(mean_squared_error(y_test_1_opt, y_pred_1_opt)))

18.476619772652022


In [70]:
def hyperparameter_function_lasso(alpha, max_iter, ):
      """ Function for hyperparameter optimization
      """
      max_iter = max_iter.round().astype(int)

      lasso_regressor = Lasso(alpha=alpha,max_iter=max_iter,random_state=17)
      metric = cross_val_score(lasso_regressor, X=X_train_1_opt, y=y_train_1_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'alpha': (0.001, 1),'max_iter': (100,10000) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_lasso,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   |   alpha   | max_iter  |
-------------------------------------------------


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m1        [0m | [0m-2.277e+0[0m | [0m0.2954   [0m | [0m5.353e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [95m2        [0m | [95m-2.26e+05[0m | [95m0.1923   [0m | [95m772.2    [0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [95m3        [0m | [95m-2.19e+05[0m | [95m0.7872   [0m | [95m6.598e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m4        [0m | [0m-2.196e+0[0m | [0m0.6379   [0m | [0m5.798e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m5        [0m | [0m-2.592e+0[0m | [0m0.04002  [0m | [0m3.642e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m6        [0m | [0m-2.246e+0[0m | [0m0.9457   [0m | [0m694.4    [0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m7        [0m | [0m-2.211e+0[0m | [0m0.8642   [0m | [0m8.785e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


KeyboardInterrupt: 

In [76]:
# alpha=0.05119 , max_iter=6559 -> 22.34
lasso_regressor = Lasso(alpha=0.0319,max_iter=656,random_state=17)
lasso_regressor.fit(X_train_1_opt,y_train_1_opt )
y_pred_1_opt = lasso_regressor.predict(X_test_1_opt)
print(np.sqrt(mean_squared_error(y_test_1_opt, y_pred_1_opt)))

1620.9163222136515


  model = cd_fast.enet_coordinate_descent(


In [None]:
def hyperparameter_function_xgboost(eta, gamma ,max_depth,reg_lambda,reg_alpha ):
      """ Function for hyperparameter optimization
      """
      max_depth = max_depth.round().astype(int)

      xgb_regressor = XGBRegressor(eta=eta,gamma=gamma,max_depth=max_depth,reg_lambda=reg_lambda, reg_alpha=reg_alpha)
      metric = cross_val_score(xgb_regressor, X=X_train_1_opt, y=y_train_1_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'eta': (0, 1),'gamma': (0,2),'max_depth':(1,10),'reg_lambda':(1,1),'reg_alpha':(0,0) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_xgboost,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   |    eta    |   gamma   | max_depth | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m-23.24   [0m | [0m0.2947   [0m | [0m1.061    [0m | [0m2.724    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m2        [0m | [0m-27.72   [0m | [0m0.6563   [0m | [0m1.275    [0m | [0m6.18     [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m3        [0m | [0m-30.91   [0m | [0m0.9457   [0m | [0m0.1201   [0m | [0m8.776    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m4        [0m | [0m-27.17   [0m | [0m0.6524   [0m | [0m1.104    [0m | [0m6.378    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [95m5        [0m | [95m-22.62   [0m | [95m0.2977   [0m | [95m1.123    [0m | [95m4.564    [0m | [95m0.0      [0m | [95m1.0      [0m |
| [0m6        [0m | [0m-23.22   [0m | [0m0.1439   [0m | [0m0.3018   [0m | [0m1.497    [0m | [0m0.0  

In [77]:
# eta=0.09569 , gamma=0.05334 , max_depth=3.567 ,lambda=1 , alpha=0 -> 22.62
xgb_regressor = XGBRegressor(eta=0.09569,gamma=0.05334,max_depth=4,reg_lambda=1, reg_alpha=0)#eta=0.3,gamma=0,max_depth=6,reg_lambda=1, reg_alpha=0
xgb_regressor.fit(X_train_1_opt,y_train_1_opt )
y_pred_1_opt = xgb_regressor.predict(X_test_1_opt)

print(np.sqrt(mean_squared_error(y_test_1_opt, y_pred_1_opt)))

19.02862176067683


In [64]:
## Laden des dritten Datensatzes
train_data_3_opt, test_data_3_opt,test_rul_data_3_opt = load_data(config_path=PATH_TO_CONFIG, dataset_num=3)
cleaned_train_3_opt, cleaned_test_3_opt = clean_data(train_data_3_opt, test_data_3_opt, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.5)

# Train Val Split
cl_train_3_opt, cl_val_3_opt = train_val_split_by_group(df = cleaned_train_3_opt,group = "UnitNumber",test_size = 0.18,n_splits = 2,random_state = 7)

## RollingWindowParameter
min_ts_3_opt = 29
max_ts_3_opt = 30
feature_list_ds_3 = ["c3", "quantile", "mean", "root_mean_square", "median", "time_reversal_asymmetry_statistic", "absolute_maximum", "maximum", "minimum", "agg_autocorrelation", "autocorrelation" ]

rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_3_opt,min_timeshift=min_ts_3_opt,feature_extraction_mode= 'custom',feature_list=feature_list_ds_3)
#rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_3_opt,min_timeshift=min_ts_3_opt,feature_extraction_mode= 'minimal',feature_list=["median"])


X_train_3_opt, y_train_3_opt = rwCreator._process_data(cl_train_3_opt, 'train')
X_val_3_opt, y_val_3_opt = rwCreator._process_data(cl_val_3_opt, 'train')
X_test_3_opt, y_test_3_opt = rwCreator._process_data(cleaned_test_3_opt, 'test', test_rul_data_3_opt)

y_train_3_opt = y_train_3_opt.clip(upper=125)

scaler = StandardScaler()
X_train_3_opt[2:] = scaler.fit_transform(X_train_3_opt[2:])
X_val_3_opt[2:] = scaler.fit_transform(X_val_3_opt[2:])
X_test_3_opt[2:] = scaler.fit_transform(X_test_3_opt[2:])

2024-06-01 13:31:40 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 3...[0m


INFO:src.utils:Loading data set 3...


2024-06-01 13:31:40 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 3.[0m


INFO:src.utils:Loaded raw data for dataset 3.


2024-06-01 13:31:40 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (24720, 26)[0m


INFO:src.utils:Train Data: (24720, 26)


2024-06-01 13:31:40 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (16596, 26)[0m


INFO:src.utils:Test Data: (16596, 26)


2024-06-01 13:31:40 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


INFO:src.utils:Test RUL Data: (100, 1)


2024-06-01 13:31:40 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m


INFO:src.data_cleaning:Cleaning train and test data...


2024-06-01 13:31:40 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m


INFO:src.data_cleaning:Formatting column types...


2024-06-01 13:31:40 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


DEBUG:src.data_cleaning:Found 0 categorical columns: []


2024-06-01 13:31:40 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


DEBUG:src.data_cleaning:Found 0 categorical columns: []


2024-06-01 13:31:40 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m


INFO:src.data_cleaning:Handling duplicates...


2024-06-01 13:31:40 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m


INFO:src.data_cleaning:Removing outliers...


2024-06-01 13:31:40 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m


DEBUG:src.outlier_detection:Removing outliers using method: None ...


2024-06-01 13:31:40 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m


INFO:src.outlier_detection:No outlier detection method specified. Skipping outlier detection.


2024-06-01 13:31:40 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m


DEBUG:src.outlier_detection:Removing outliers using method: None ...


2024-06-01 13:31:40 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m


INFO:src.outlier_detection:No outlier detection method specified. Skipping outlier detection.


2024-06-01 13:31:40 [[34msrc.data_cleaning:150[0m] [[32mINFO[0m] >>>> Filter features based train data...[0m


INFO:src.data_cleaning:Filter features based train data...


2024-06-01 13:31:40 [[34msrc.data_cleaning:26[0m] [DEBUG[0m] >>>> Found 0 features with missing values above the threshold of 0.1.[0m


DEBUG:src.data_cleaning:Found 0 features with missing values above the threshold of 0.1.


2024-06-01 13:31:40 [[34msrc.data_cleaning:46[0m] [DEBUG[0m] >>>> Found 6 features with only a single unique value: ['Operation Setting 3', 'Sensor Measure 1', 'Sensor Measure 5', 'Sensor Measure 16', 'Sensor Measure 18', 'Sensor Measure 19'][0m


DEBUG:src.data_cleaning:Found 6 features with only a single unique value: ['Operation Setting 3', 'Sensor Measure 1', 'Sensor Measure 5', 'Sensor Measure 16', 'Sensor Measure 18', 'Sensor Measure 19']


2024-06-01 13:31:40 [[34msrc.data_cleaning:103[0m] [DEBUG[0m] >>>> Found 12 uncorrelated features with a correlation threshold of 0.5: ['UnitNumber', 'Cycle', 'Operation Setting 1', 'Operation Setting 2', 'Sensor Measure 6', 'Sensor Measure 7', 'Sensor Measure 10', 'Sensor Measure 12', 'Sensor Measure 14', 'Sensor Measure 15', 'Sensor Measure 20', 'Sensor Measure 21'][0m


DEBUG:src.data_cleaning:Found 12 uncorrelated features with a correlation threshold of 0.5: ['UnitNumber', 'Cycle', 'Operation Setting 1', 'Operation Setting 2', 'Sensor Measure 6', 'Sensor Measure 7', 'Sensor Measure 10', 'Sensor Measure 12', 'Sensor Measure 14', 'Sensor Measure 15', 'Sensor Measure 20', 'Sensor Measure 21']


2024-06-01 13:31:40 [[34msrc.data_cleaning:162[0m] [[32mINFO[0m] >>>> Dropping features based on missing values, single unique values, and no target correlation...[0m


INFO:src.data_cleaning:Dropping features based on missing values, single unique values, and no target correlation...


2024-06-01 13:31:40 [[34msrc.data_cleaning:172[0m] [[32mINFO[0m] >>>> Data cleaning completed.[0m


INFO:src.data_cleaning:Data cleaning completed.


2024-06-01 13:31:40 [[34msrc.data_cleaning:173[0m] [[32mINFO[0m] >>>> Original train DataFrame shape: (24720, 10), Resulting train DataFrame shape: (24720, 10)[0m


INFO:src.data_cleaning:Original train DataFrame shape: (24720, 10), Resulting train DataFrame shape: (24720, 10)


2024-06-01 13:31:40 [[34msrc.data_cleaning:174[0m] [[32mINFO[0m] >>>> Original test DataFrame shape: (16596, 10), Resulting test DataFrame shape: (16596, 10)[0m


INFO:src.data_cleaning:Original test DataFrame shape: (16596, 10), Resulting test DataFrame shape: (16596, 10)


2024-06-01 13:31:40 [[34msrc.utils:131[0m] [[32mINFO[0m] >>>> Train set contains 82 different engines --> in total 20683[0m


INFO:src.utils:Train set contains 82 different engines --> in total 20683


2024-06-01 13:31:40 [[34msrc.utils:132[0m] [[32mINFO[0m] >>>>  Test set contains 18 different engines --> in total 4037[0m


INFO:src.utils: Test set contains 18 different engines --> in total 4037


2024-06-01 13:31:40 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for train data...
Rolling: 100%|██████████| 525/525 [00:26<00:00, 19.88it/s]


2024-06-01 13:32:07 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


INFO:src.rolling_window_creator:Extracting features for train data...
Feature Extraction: 100%|██████████| 146440/146440 [08:40<00:00, 281.54it/s]


2024-06-01 13:41:08 [[34msrc.rolling_window_creator:142[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m


INFO:src.rolling_window_creator:Calculating target for train data...


2024-06-01 13:41:08 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for train data...
Rolling: 100%|██████████| 392/392 [00:05<00:00, 67.73it/s] 


2024-06-01 13:41:14 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


INFO:src.rolling_window_creator:Extracting features for train data...
Feature Extraction: 100%|██████████| 28120/28120 [01:37<00:00, 287.78it/s]


2024-06-01 13:42:56 [[34msrc.rolling_window_creator:142[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m


INFO:src.rolling_window_creator:Calculating target for train data...


2024-06-01 13:42:56 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for test data...
Rolling: 100%|██████████| 475/475 [00:20<00:00, 22.83it/s] 


2024-06-01 13:43:17 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


INFO:src.rolling_window_creator:Extracting features for test data...
Feature Extraction: 100%|██████████| 800/800 [00:02<00:00, 325.21it/s]


In [None]:
def hyperparameter_function_knn(neighbours):
      """ Function for hyperparameter optimization
      """
      neighbours = neighbours.round().astype(int)
      knn_regressor = KNeighborsRegressor(n_neighbors=neighbours)
      metric = cross_val_score(knn_regressor, X=X_train_3_opt, y=y_train_3_opt, cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'neighbours': (1, 750)}

optimizer = BayesianOptimization(
  f=hyperparameter_function_knn,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   | neighb... |
-------------------------------------


KeyboardInterrupt: 

In [65]:
# Neigbors = 300
knn_regressor = KNeighborsRegressor(n_neighbors=300)
knn_regressor.fit(X_train_3_opt,y_train_3_opt )
y_pred_3_opt = knn_regressor.predict(X_test_3_opt)
print(np.sqrt(mean_squared_error(y_test_3_opt, y_pred_3_opt)))

24.766279241025376


In [None]:
def hyperparameter_function_rf(n_estimators, max_features, ):
      """ Function for hyperparameter optimization
      """
      n_estimators = n_estimators.round().astype(int)
      max_features = max_features.round().astype(int)

      rf_regressor = RandomForestRegressor(n_estimators=n_estimators,max_features=max_features,random_state=17,n_jobs=-1)
      metric = cross_val_score(rf_regressor, X=X_train_3_opt, y=y_train_3_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'n_estimators': (20, 500),'max_features': (1,1) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_rf,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   | max_fe... | n_esti... |
-------------------------------------------------
| [0m1        [0m | [0m-21.99   [0m | [0m1.0      [0m | [0m274.7    [0m |
| [0m2        [0m | [0m-22.29   [0m | [0m1.0      [0m | [0m52.59    [0m |
| [95m3        [0m | [95m-21.98   [0m | [95m1.0      [0m | [95m335.0    [0m |
| [0m4        [0m | [0m-22.0    [0m | [0m1.0      [0m | [0m296.3    [0m |
| [0m5        [0m | [0m-21.99   [0m | [0m1.0      [0m | [0m191.8    [0m |
| [0m6        [0m | [0m-22.23   [0m | [0m1.0      [0m | [0m48.82    [0m |
| [0m7        [0m | [0m-22.01   [0m | [0m1.0      [0m | [0m441.1    [0m |
| [95m8        [0m | [95m-21.97   [0m | [95m1.0      [0m | [95m333.2    [0m |
| [0m9        [0m | [0m-21.99   [0m | [0m1.0      [0m | [0m306.8    [0m |
| [0m10       [0m | [0m-22.06   [0m | [0m1.0      [0m | [0m155.8    [0m |
| [0m11       [0m | [0m-22.01   [0m | [0m1.0      [0m | [0m223.5 

KeyboardInterrupt: 

In [69]:
# n_estimators= 333 , max_features= 1->
rf_regressor = RandomForestRegressor(n_estimators=333,max_features=1,random_state=17)#296,4
rf_regressor.fit(X_train_3_opt,y_train_3_opt.values.ravel())
y_pred_3_opt = rf_regressor.predict(X_test_3_opt)
print(np.sqrt(mean_squared_error(y_test_3_opt, y_pred_3_opt)))

23.361107460740012


In [None]:
def hyperparameter_function_lasso(alpha, max_iter, ):
      """ Function for hyperparameter optimization
      """
      max_iter = max_iter.round().astype(int)

      lasso_regressor = Lasso(alpha=alpha,max_iter=max_iter,random_state=17)
      metric = cross_val_score(lasso_regressor, X=X_train_3_opt, y=y_train_3_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'alpha': (0.001, 1),'max_iter': (100,10000) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_lasso,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   |   alpha   | max_iter  |
-------------------------------------------------


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m1        [0m | [0m-4.436e+0[0m | [0m0.2954   [0m | [0m5.353e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [95m2        [0m | [95m-4.393e+0[0m | [95m0.1923   [0m | [95m772.2    [0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m3        [0m | [0m-4.938e+0[0m | [0m0.7872   [0m | [0m6.598e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m4        [0m | [0m-4.824e+0[0m | [0m0.6379   [0m | [0m5.798e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m5        [0m | [0m-6.543e+0[0m | [0m0.04002  [0m | [0m3.642e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m6        [0m | [0m-1.423e+0[0m | [0m0.9457   [0m | [0m694.4    [0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m7        [0m | [0m-4.982e+0[0m | [0m0.8642   [0m | [0m8.785e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m8        [0m | [0m-1.708e+0[0m | [0m0.05214  [0m | [0m6.559e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m9        [0m | [0m-4.708e+0[0m | [0m0.5522   [0m | [0m6.015e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m10       [0m | [0m-4.59e+04[0m | [0m0.484    [0m | [0m2.902e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m11       [0m | [0m-4.976e+0[0m | [0m0.9033   [0m | [0m5.355e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m12       [0m | [0m-1.619e+1[0m | [0m0.001    [0m | [0m837.5    [0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m13       [0m | [0m-4.765e+0[0m | [0m0.5926   [0m | [0m5.937e+03[0m |


  model = cd_fast.enet_coordinate_descent(


KeyboardInterrupt: 

In [78]:
# alpha= 0.1923 , max_iter= 772 ->
lasso_regressor = Lasso(alpha=0.1923,max_iter=772,random_state=17)
lasso_regressor.fit(X_train_3_opt,y_train_3_opt )
y_pred_3_opt = lasso_regressor.predict(X_test_3_opt)
print(np.sqrt(mean_squared_error(y_test_3_opt, y_pred_3_opt)))

8080.994375402995


  model = cd_fast.enet_coordinate_descent(


In [None]:
def hyperparameter_function_xgboost(eta, gamma ,max_depth,reg_lambda,reg_alpha ):
      """ Function for hyperparameter optimization
      """
      max_depth = max_depth.round().astype(int)

      xgb_regressor = XGBRegressor(eta=eta,gamma=gamma,max_depth=max_depth,reg_lambda=reg_lambda, reg_alpha=reg_alpha)
      metric = cross_val_score(xgb_regressor, X=X_train_3_opt, y=y_train_3_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'eta': (0, 1),'gamma': (0,2),'max_depth':(1,10),'reg_lambda':(1,1),'reg_alpha':(0,0) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_xgboost,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   |    eta    |   gamma   | max_depth | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m-18.06   [0m | [0m0.2947   [0m | [0m1.061    [0m | [0m2.724    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m2        [0m | [0m-20.73   [0m | [0m0.6563   [0m | [0m1.275    [0m | [0m6.18     [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m3        [0m | [0m-23.58   [0m | [0m0.9457   [0m | [0m0.1201   [0m | [0m8.776    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m4        [0m | [0m-20.65   [0m | [0m0.6524   [0m | [0m1.104    [0m | [0m6.378    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m5        [0m | [0m-19.85   [0m | [0m0.2977   [0m | [0m1.123    [0m | [0m4.564    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [95m6        [0m | [95m-17.62   [0m | [95m0.1439   [0m | [95m0.3018   [0m | [95m1.497    [0m | [95m0.0   

KeyboardInterrupt: 

In [68]:
# eta= 0.2079 , gamma= 1.342, max_depth= 2 ,lambda=1 , alpha=0 ->
xgb_regressor = XGBRegressor(eta=0.2079,gamma=1.342,max_depth=2,reg_lambda=1, reg_alpha=0)
xgb_regressor.fit(X_train_3_opt,y_train_3_opt )
y_pred_3_opt = xgb_regressor.predict(X_test_3_opt)

print(np.sqrt(mean_squared_error(y_test_3_opt, y_pred_3_opt)))

21.480055691867083


In [23]:
## Laden des zweiten Datensatzes
train_data_2_opt, test_data_2_opt,test_rul_data_2_opt = load_data(config_path=PATH_TO_CONFIG, dataset_num=2)
cleaned_train_2_opt, cleaned_test_2_opt = clean_data(train_data_2_opt, test_data_2_opt, method=None, ignore_columns=['UnitNumber', 'Cycle','Operation Setting 2','Operation Setting 3','Sensor Measure 2','Sensor Measure 3','Sensor Measure 4','Sensor Measure 8','Sensor Measure 9','Sensor Measure 11','Sensor Measure 15','Sensor Measure 17'], threshold_missing=0.1, threshold_corr=0.5)

# Train Val Split
cl_train_2_opt, cl_val_2_opt = train_val_split_by_group(df = cleaned_train_2_opt,group = "UnitNumber",test_size = 0.18,n_splits = 2,random_state = 7)

## RollingWindowParameter
min_ts_2_opt = 17
max_ts_2_opt = 18
feature_list_ds_2 = ["c3", "quantile", "mean", "median", "root_mean_square", "variance", "mean_abs_change", "standard_deviation", "skewness", "variation_coefficient", "last_location_of_maximum", "first_location_of_maximum"]

#rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_2_opt,min_timeshift=min_ts_2_opt,feature_extraction_mode= 'custom',feature_list=feature_list_ds_3)
rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_2_opt,min_timeshift=min_ts_2_opt,feature_extraction_mode= 'minimal',feature_list=["median"])


X_train_2_opt, y_train_2_opt = rwCreator._process_data(cl_train_2_opt, 'train')
X_val_2_opt, y_val_2_opt = rwCreator._process_data(cl_val_2_opt, 'train')
X_test_2_opt, y_test_2_opt = rwCreator._process_data(cleaned_test_2_opt, 'test', test_rul_data_2_opt)

y_train_2_opt = y_train_2_opt.clip(upper=125)

scaler = StandardScaler()
X_train_2_opt[2:] = scaler.fit_transform(X_train_2_opt[2:])
X_val_2_opt[2:] = scaler.fit_transform(X_val_2_opt[2:])
X_test_2_opt[2:] = scaler.fit_transform(X_test_2_opt[2:])

2024-06-01 10:03:10 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 2...[0m


INFO:src.utils:Loading data set 2...


2024-06-01 10:03:10 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 2.[0m


INFO:src.utils:Loaded raw data for dataset 2.


2024-06-01 10:03:10 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (53759, 26)[0m


INFO:src.utils:Train Data: (53759, 26)


2024-06-01 10:03:10 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (33991, 26)[0m


INFO:src.utils:Test Data: (33991, 26)


2024-06-01 10:03:10 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (259, 1)[0m


INFO:src.utils:Test RUL Data: (259, 1)


2024-06-01 10:03:10 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m


INFO:src.data_cleaning:Cleaning train and test data...


2024-06-01 10:03:10 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m


INFO:src.data_cleaning:Formatting column types...


2024-06-01 10:03:10 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


DEBUG:src.data_cleaning:Found 0 categorical columns: []


2024-06-01 10:03:10 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


DEBUG:src.data_cleaning:Found 0 categorical columns: []


2024-06-01 10:03:10 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m


INFO:src.data_cleaning:Handling duplicates...


2024-06-01 10:03:11 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m


INFO:src.data_cleaning:Removing outliers...


2024-06-01 10:03:11 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m


DEBUG:src.outlier_detection:Removing outliers using method: None ...


2024-06-01 10:03:11 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m


INFO:src.outlier_detection:No outlier detection method specified. Skipping outlier detection.


2024-06-01 10:03:11 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m


DEBUG:src.outlier_detection:Removing outliers using method: None ...


2024-06-01 10:03:11 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m


INFO:src.outlier_detection:No outlier detection method specified. Skipping outlier detection.


2024-06-01 10:03:11 [[34msrc.data_cleaning:150[0m] [[32mINFO[0m] >>>> Filter features based train data...[0m


INFO:src.data_cleaning:Filter features based train data...


2024-06-01 10:03:11 [[34msrc.data_cleaning:26[0m] [DEBUG[0m] >>>> Found 0 features with missing values above the threshold of 0.1.[0m


DEBUG:src.data_cleaning:Found 0 features with missing values above the threshold of 0.1.


2024-06-01 10:03:11 [[34msrc.data_cleaning:46[0m] [DEBUG[0m] >>>> Found 0 features with only a single unique value: [][0m


DEBUG:src.data_cleaning:Found 0 features with only a single unique value: []


2024-06-01 10:03:11 [[34msrc.data_cleaning:103[0m] [DEBUG[0m] >>>> Found 25 uncorrelated features with a correlation threshold of 0.5: ['UnitNumber', 'Operation Setting 1', 'Operation Setting 2', 'Operation Setting 3', 'Sensor Measure 1', 'Sensor Measure 2', 'Sensor Measure 3', 'Sensor Measure 4', 'Sensor Measure 5', 'Sensor Measure 6', 'Sensor Measure 7', 'Sensor Measure 8', 'Sensor Measure 9', 'Sensor Measure 10', 'Sensor Measure 11', 'Sensor Measure 12', 'Sensor Measure 13', 'Sensor Measure 14', 'Sensor Measure 15', 'Sensor Measure 16', 'Sensor Measure 17', 'Sensor Measure 18', 'Sensor Measure 19', 'Sensor Measure 20', 'Sensor Measure 21'][0m


DEBUG:src.data_cleaning:Found 25 uncorrelated features with a correlation threshold of 0.5: ['UnitNumber', 'Operation Setting 1', 'Operation Setting 2', 'Operation Setting 3', 'Sensor Measure 1', 'Sensor Measure 2', 'Sensor Measure 3', 'Sensor Measure 4', 'Sensor Measure 5', 'Sensor Measure 6', 'Sensor Measure 7', 'Sensor Measure 8', 'Sensor Measure 9', 'Sensor Measure 10', 'Sensor Measure 11', 'Sensor Measure 12', 'Sensor Measure 13', 'Sensor Measure 14', 'Sensor Measure 15', 'Sensor Measure 16', 'Sensor Measure 17', 'Sensor Measure 18', 'Sensor Measure 19', 'Sensor Measure 20', 'Sensor Measure 21']


2024-06-01 10:03:11 [[34msrc.data_cleaning:162[0m] [[32mINFO[0m] >>>> Dropping features based on missing values, single unique values, and no target correlation...[0m


INFO:src.data_cleaning:Dropping features based on missing values, single unique values, and no target correlation...


2024-06-01 10:03:11 [[34msrc.data_cleaning:172[0m] [[32mINFO[0m] >>>> Data cleaning completed.[0m


INFO:src.data_cleaning:Data cleaning completed.


2024-06-01 10:03:11 [[34msrc.data_cleaning:173[0m] [[32mINFO[0m] >>>> Original train DataFrame shape: (53759, 12), Resulting train DataFrame shape: (53759, 12)[0m


INFO:src.data_cleaning:Original train DataFrame shape: (53759, 12), Resulting train DataFrame shape: (53759, 12)


2024-06-01 10:03:11 [[34msrc.data_cleaning:174[0m] [[32mINFO[0m] >>>> Original test DataFrame shape: (33991, 12), Resulting test DataFrame shape: (33991, 12)[0m


INFO:src.data_cleaning:Original test DataFrame shape: (33991, 12), Resulting test DataFrame shape: (33991, 12)


2024-06-01 10:03:11 [[34msrc.utils:131[0m] [[32mINFO[0m] >>>> Train set contains 213 different engines --> in total 43193[0m


INFO:src.utils:Train set contains 213 different engines --> in total 43193


2024-06-01 10:03:11 [[34msrc.utils:132[0m] [[32mINFO[0m] >>>>  Test set contains 47 different engines --> in total 10566[0m


INFO:src.utils: Test set contains 47 different engines --> in total 10566


2024-06-01 10:03:11 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for train data...
Rolling: 100%|██████████| 365/365 [01:02<00:00,  5.81it/s]


2024-06-01 10:04:16 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


INFO:src.rolling_window_creator:Extracting features for train data...
Feature Extraction: 100%|██████████| 395720/395720 [03:35<00:00, 1836.72it/s]


2024-06-01 10:08:07 [[34msrc.rolling_window_creator:142[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m


INFO:src.rolling_window_creator:Calculating target for train data...


2024-06-01 10:08:08 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for train data...
Rolling: 100%|██████████| 378/378 [00:13<00:00, 28.13it/s] 

2024-06-01 10:08:21 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m



INFO:src.rolling_window_creator:Extracting features for train data...
Feature Extraction: 100%|██████████| 97670/97670 [00:43<00:00, 2256.37it/s]


2024-06-01 10:09:07 [[34msrc.rolling_window_creator:142[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m


INFO:src.rolling_window_creator:Calculating target for train data...


2024-06-01 10:09:07 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for test data...
Rolling: 100%|██████████| 367/367 [00:43<00:00,  8.35it/s]


2024-06-01 10:09:52 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


INFO:src.rolling_window_creator:Extracting features for test data...
Feature Extraction: 100%|██████████| 2590/2590 [00:00<00:00, 2690.56it/s]


In [None]:
def hyperparameter_function_knn(neighbours):
      """ Function for hyperparameter optimization
      """
      neighbours = neighbours.round().astype(int)
      knn_regressor = KNeighborsRegressor(n_neighbors=neighbours)
      metric = cross_val_score(knn_regressor, X=X_train_2_opt, y=y_train_2_opt, cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'neighbours': (1, 750)}

optimizer = BayesianOptimization(
  f=hyperparameter_function_knn,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   | neighb... |
-------------------------------------
| [0m1        [0m | [0m-29.66   [0m | [0m221.7    [0m |
| [0m2        [0m | [0m-31.11   [0m | [0m398.4    [0m |
| [95m3        [0m | [95m-28.77   [0m | [95m144.4    [0m |
| [95m4        [0m | [95m-27.48   [0m | [95m51.86    [0m |
| [0m5        [0m | [0m-32.11   [0m | [0m590.5    [0m |
| [0m6        [0m | [0m-31.68   [0m | [0m492.6    [0m |
| [0m7        [0m | [0m-31.61   [0m | [0m478.5    [0m |
| [0m8        [0m | [0m-31.33   [0m | [0m432.1    [0m |
| [95m9        [0m | [95m-27.16   [0m | [95m30.26    [0m |
| [0m10       [0m | [0m-30.12   [0m | [0m269.0    [0m |
| [0m11       [0m | [0m-34.51   [0m | [0m1.0      [0m |
| [0m12       [0m | [0m-27.18   [0m | [0m31.27    [0m |
| [0m13       [0m | [0m-27.76   [0m | [0m71.06    [0m |
| [0m14       [0m | [0m-28.13   [0m | [0m97.85    [0m |
| [0m15       [0m | [0m-28.43   [0m | [0m1

KeyboardInterrupt: 

In [29]:
# Neigbors = 20
knn_regressor = KNeighborsRegressor(n_neighbors=20)
knn_regressor.fit(X_train_2_opt,y_train_2_opt )
y_pred_2_opt = knn_regressor.predict(X_test_2_opt)
print(np.sqrt(mean_squared_error(y_test_2_opt, y_pred_2_opt)))
# default 54.41
# minimal 40.59 / 38
# custom+ 48.67
# custom  59.075

38.46334852215391


In [None]:
def hyperparameter_function_rf(n_estimators, max_features, ):
      """ Function for hyperparameter optimization
      """
      n_estimators = n_estimators.round().astype(int)
      max_features = max_features.round().astype(int)

      rf_regressor = RandomForestRegressor(n_estimators=n_estimators,max_features=max_features,random_state=17,n_jobs=-1)
      metric = cross_val_score(rf_regressor, X=X_train_2_opt, y=y_train_2_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'n_estimators': (20, 500),'max_features': (1,1) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_rf,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   | max_fe... | n_esti... |
-------------------------------------------------
| [0m1        [0m | [0m-23.74   [0m | [0m1.0      [0m | [0m274.7    [0m |
| [0m2        [0m | [0m-24.05   [0m | [0m1.0      [0m | [0m52.59    [0m |
| [0m3        [0m | [0m-23.74   [0m | [0m1.0      [0m | [0m335.0    [0m |
| [95m4        [0m | [95m-23.73   [0m | [95m1.0      [0m | [95m296.3    [0m |
| [0m5        [0m | [0m-23.75   [0m | [0m1.0      [0m | [0m191.8    [0m |
| [0m6        [0m | [0m-24.11   [0m | [0m1.0      [0m | [0m48.82    [0m |


KeyboardInterrupt: 

In [30]:
# n_estimators= , max_features= ->
rf_regressor = RandomForestRegressor(n_estimators=296,max_features=4)
rf_regressor.fit(X_train_2_opt,y_train_2_opt.values.ravel())
y_pred_2_opt = rf_regressor.predict(X_test_2_opt)
print(np.sqrt(mean_squared_error(y_test_2_opt, y_pred_2_opt)))
#custom 36.47
# 34.925

34.75418809127862


In [None]:
def hyperparameter_function_lasso(alpha, max_iter, ):
      """ Function for hyperparameter optimization
      """
      max_iter = max_iter.round().astype(int)

      lasso_regressor = Lasso(alpha=alpha,max_iter=max_iter,random_state=17)
      metric = cross_val_score(lasso_regressor, X=X_train_2_opt, y=y_train_2_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'alpha': (0.001, 1),'max_iter': (100,10000) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_lasso,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   |   alpha   | max_iter  |
-------------------------------------------------


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m1        [0m | [0m-3.602e+0[0m | [0m0.2954   [0m | [0m5.353e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [95m2        [0m | [95m-2.863e+0[0m | [95m0.1923   [0m | [95m772.2    [0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [95m3        [0m | [95m-1.321e+0[0m | [95m0.7872   [0m | [95m6.598e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m4        [0m | [0m-2.049e+0[0m | [0m0.6379   [0m | [0m5.798e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m5        [0m | [0m-1.996e+0[0m | [0m0.04002  [0m | [0m3.642e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [95m6        [0m | [95m-858.4   [0m | [95m0.9457   [0m | [95m694.4    [0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m7        [0m | [0m-1.045e+0[0m | [0m0.8642   [0m | [0m8.785e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m8        [0m | [0m-1.701e+0[0m | [0m0.05214  [0m | [0m6.559e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m9        [0m | [0m-2.449e+0[0m | [0m0.5522   [0m | [0m6.015e+03[0m |


KeyboardInterrupt: 

In [31]:
# alpha= , max_iter= ->
lasso_regressor = Lasso(alpha=0.9457,max_iter=694)
lasso_regressor.fit(X_train_2_opt,y_train_2_opt )
y_pred_2_opt = lasso_regressor.predict(X_test_2_opt)
print(np.sqrt(mean_squared_error(y_test_2_opt, y_pred_2_opt)))
#custom 1714
# 2762/714/86

86.99903836960912


  model = cd_fast.enet_coordinate_descent(


In [None]:
def hyperparameter_function_xgboost(eta, gamma ,max_depth,reg_lambda,reg_alpha ):
      """ Function for hyperparameter optimization
      """
      max_depth = max_depth.round().astype(int)

      xgb_regressor = XGBRegressor(eta=eta,gamma=gamma,max_depth=max_depth,reg_lambda=reg_lambda, reg_alpha=reg_alpha)
      metric = cross_val_score(xgb_regressor, X=X_train_2_opt, y=y_train_2_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'eta': (0, 1),'gamma': (0,2),'max_depth':(1,10),'reg_lambda':(1,1),'reg_alpha':(0,0) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_xgboost,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   |    eta    |   gamma   | max_depth | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m-23.25   [0m | [0m0.2947   [0m | [0m1.061    [0m | [0m2.724    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m2        [0m | [0m-26.7    [0m | [0m0.6563   [0m | [0m1.275    [0m | [0m6.18     [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m3        [0m | [0m-30.14   [0m | [0m0.9457   [0m | [0m0.1201   [0m | [0m8.776    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m4        [0m | [0m-26.5    [0m | [0m0.6524   [0m | [0m1.104    [0m | [0m6.378    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m5        [0m | [0m-24.26   [0m | [0m0.2977   [0m | [0m1.123    [0m | [0m4.564    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m6        [0m | [0m-24.4    [0m | [0m0.1439   [0m | [0m0.3018   [0m | [0m1.497    [0m | [0m0.0      [0

In [32]:
# eta=0.02803, gamma=0.8998, max_depth=6,lambda=1 , alpha=0 -> 22.92
xgb_regressor = XGBRegressor(eta=0.02803,gamma=0.8998,max_depth=6,reg_lambda=1, reg_alpha=0)
xgb_regressor.fit(X_train_2_opt,y_train_2_opt )
y_pred_2_opt = xgb_regressor.predict(X_test_2_opt)
print(np.sqrt(mean_squared_error(y_test_2_opt, y_pred_2_opt)))

#custom 35/37 - 34.11

34.113645722674306


In [37]:
## Laden des vierten Datensatzes
train_data_4_opt, test_data_4_opt,test_rul_data_4_opt = load_data(config_path=PATH_TO_CONFIG, dataset_num=4)
cleaned_train_4_opt, cleaned_test_4_opt = clean_data(train_data_4_opt, test_data_4_opt, method=None, ignore_columns=['UnitNumber', 'Cycle','Operation Setting 2','Operation Setting 3','Sensor Measure 2','Sensor Measure 3','Sensor Measure 4','Sensor Measure 8','Sensor Measure 9','Sensor Measure 11','Sensor Measure 15','Sensor Measure 17'], threshold_missing=0.1, threshold_corr=0.5)

# Train Val Split
cl_train_4_opt, cl_val_4_opt = train_val_split_by_group(df = cleaned_train_4_opt,group = "UnitNumber",test_size = 0.18,n_splits = 2,random_state = 7)

## RollingWindowParameter
min_ts_4_opt = 17
max_ts_4_opt = 18
feature_list_ds_4 = ["c3", "quantile", "mean", "root_mean_square", "median", "time_reversal_asymmetry_statistic", "absolute_maximum", "maximum", "minimum", "agg_autocorrelation", "autocorrelation" ]

#rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_4_opt,min_timeshift=min_ts_4_opt,feature_extraction_mode= 'custom',feature_list=feature_list_ds_4)
rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts_4_opt,min_timeshift=min_ts_4_opt,feature_extraction_mode= 'minimal',feature_list=["median"])


X_train_4_opt, y_train_4_opt = rwCreator._process_data(cl_train_4_opt,'train')
X_val_4_opt, y_val_4_opt = rwCreator._process_data(cl_val_4_opt, 'train')
X_test_4_opt, y_test_4_opt = rwCreator._process_data(cleaned_test_4_opt, 'test', test_rul_data_4_opt)

y_train_4_opt = y_train_4_opt.clip(upper=125)

scaler = StandardScaler()
X_train_4_opt[2:] = scaler.fit_transform(X_train_4_opt[2:])
X_val_4_opt[2:] = scaler.fit_transform(X_val_4_opt[2:])
X_test_4_opt[2:] = scaler.fit_transform(X_test_4_opt[2:])

2024-06-01 10:27:02 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 4...[0m


INFO:src.utils:Loading data set 4...


2024-06-01 10:27:03 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 4.[0m


INFO:src.utils:Loaded raw data for dataset 4.


2024-06-01 10:27:03 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (61249, 26)[0m


INFO:src.utils:Train Data: (61249, 26)


2024-06-01 10:27:03 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (41214, 26)[0m


INFO:src.utils:Test Data: (41214, 26)


2024-06-01 10:27:03 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (248, 1)[0m


INFO:src.utils:Test RUL Data: (248, 1)


2024-06-01 10:27:03 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m


INFO:src.data_cleaning:Cleaning train and test data...


2024-06-01 10:27:03 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m


INFO:src.data_cleaning:Formatting column types...


2024-06-01 10:27:03 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


DEBUG:src.data_cleaning:Found 0 categorical columns: []


2024-06-01 10:27:03 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


DEBUG:src.data_cleaning:Found 0 categorical columns: []


2024-06-01 10:27:03 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m


INFO:src.data_cleaning:Handling duplicates...


2024-06-01 10:27:03 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m


INFO:src.data_cleaning:Removing outliers...


2024-06-01 10:27:03 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m


DEBUG:src.outlier_detection:Removing outliers using method: None ...


2024-06-01 10:27:03 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m


INFO:src.outlier_detection:No outlier detection method specified. Skipping outlier detection.


2024-06-01 10:27:03 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m


DEBUG:src.outlier_detection:Removing outliers using method: None ...


2024-06-01 10:27:03 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m


INFO:src.outlier_detection:No outlier detection method specified. Skipping outlier detection.


2024-06-01 10:27:03 [[34msrc.data_cleaning:150[0m] [[32mINFO[0m] >>>> Filter features based train data...[0m


INFO:src.data_cleaning:Filter features based train data...


2024-06-01 10:27:03 [[34msrc.data_cleaning:26[0m] [DEBUG[0m] >>>> Found 0 features with missing values above the threshold of 0.1.[0m


DEBUG:src.data_cleaning:Found 0 features with missing values above the threshold of 0.1.


2024-06-01 10:27:03 [[34msrc.data_cleaning:46[0m] [DEBUG[0m] >>>> Found 0 features with only a single unique value: [][0m


DEBUG:src.data_cleaning:Found 0 features with only a single unique value: []


2024-06-01 10:27:03 [[34msrc.data_cleaning:103[0m] [DEBUG[0m] >>>> Found 25 uncorrelated features with a correlation threshold of 0.5: ['UnitNumber', 'Operation Setting 1', 'Operation Setting 2', 'Operation Setting 3', 'Sensor Measure 1', 'Sensor Measure 2', 'Sensor Measure 3', 'Sensor Measure 4', 'Sensor Measure 5', 'Sensor Measure 6', 'Sensor Measure 7', 'Sensor Measure 8', 'Sensor Measure 9', 'Sensor Measure 10', 'Sensor Measure 11', 'Sensor Measure 12', 'Sensor Measure 13', 'Sensor Measure 14', 'Sensor Measure 15', 'Sensor Measure 16', 'Sensor Measure 17', 'Sensor Measure 18', 'Sensor Measure 19', 'Sensor Measure 20', 'Sensor Measure 21'][0m


DEBUG:src.data_cleaning:Found 25 uncorrelated features with a correlation threshold of 0.5: ['UnitNumber', 'Operation Setting 1', 'Operation Setting 2', 'Operation Setting 3', 'Sensor Measure 1', 'Sensor Measure 2', 'Sensor Measure 3', 'Sensor Measure 4', 'Sensor Measure 5', 'Sensor Measure 6', 'Sensor Measure 7', 'Sensor Measure 8', 'Sensor Measure 9', 'Sensor Measure 10', 'Sensor Measure 11', 'Sensor Measure 12', 'Sensor Measure 13', 'Sensor Measure 14', 'Sensor Measure 15', 'Sensor Measure 16', 'Sensor Measure 17', 'Sensor Measure 18', 'Sensor Measure 19', 'Sensor Measure 20', 'Sensor Measure 21']


2024-06-01 10:27:03 [[34msrc.data_cleaning:162[0m] [[32mINFO[0m] >>>> Dropping features based on missing values, single unique values, and no target correlation...[0m


INFO:src.data_cleaning:Dropping features based on missing values, single unique values, and no target correlation...


2024-06-01 10:27:03 [[34msrc.data_cleaning:172[0m] [[32mINFO[0m] >>>> Data cleaning completed.[0m


INFO:src.data_cleaning:Data cleaning completed.


2024-06-01 10:27:03 [[34msrc.data_cleaning:173[0m] [[32mINFO[0m] >>>> Original train DataFrame shape: (61249, 12), Resulting train DataFrame shape: (61249, 12)[0m


INFO:src.data_cleaning:Original train DataFrame shape: (61249, 12), Resulting train DataFrame shape: (61249, 12)


2024-06-01 10:27:03 [[34msrc.data_cleaning:174[0m] [[32mINFO[0m] >>>> Original test DataFrame shape: (41214, 12), Resulting test DataFrame shape: (41214, 12)[0m


INFO:src.data_cleaning:Original test DataFrame shape: (41214, 12), Resulting test DataFrame shape: (41214, 12)


2024-06-01 10:27:04 [[34msrc.utils:131[0m] [[32mINFO[0m] >>>> Train set contains 204 different engines --> in total 50356[0m


INFO:src.utils:Train set contains 204 different engines --> in total 50356


2024-06-01 10:27:04 [[34msrc.utils:132[0m] [[32mINFO[0m] >>>>  Test set contains 45 different engines --> in total 10893[0m


INFO:src.utils: Test set contains 45 different engines --> in total 10893


2024-06-01 10:27:04 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for train data...
Rolling: 100%|██████████| 543/543 [01:17<00:00,  7.04it/s]


2024-06-01 10:28:22 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


INFO:src.rolling_window_creator:Extracting features for train data...
Feature Extraction: 100%|██████████| 468880/468880 [03:24<00:00, 2291.12it/s]


2024-06-01 10:32:05 [[34msrc.rolling_window_creator:142[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m


INFO:src.rolling_window_creator:Calculating target for train data...


2024-06-01 10:32:06 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for train data...
Rolling: 100%|██████████| 489/489 [00:14<00:00, 34.77it/s] 


2024-06-01 10:32:20 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


INFO:src.rolling_window_creator:Extracting features for train data...
Feature Extraction: 100%|██████████| 101280/101280 [00:43<00:00, 2316.37it/s]


2024-06-01 10:33:07 [[34msrc.rolling_window_creator:142[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m


INFO:src.rolling_window_creator:Calculating target for train data...


2024-06-01 10:33:07 [[34msrc.rolling_window_creator:128[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


INFO:src.rolling_window_creator:Creating rolling windows for test data...
Rolling: 100%|██████████| 486/486 [00:56<00:00,  8.53it/s]


2024-06-01 10:34:05 [[34msrc.rolling_window_creator:134[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


INFO:src.rolling_window_creator:Extracting features for test data...
Feature Extraction: 100%|██████████| 2480/2480 [00:00<00:00, 2517.20it/s]


In [46]:
def hyperparameter_function_knn(neighbours):
      """ Function for hyperparameter optimization
      """
      neighbours = neighbours.round().astype(int)
      knn_regressor = KNeighborsRegressor(n_neighbors=neighbours)
      metric = cross_val_score(knn_regressor, X=X_train_4_opt, y=y_train_4_opt, cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'neighbours': (1, 750)}

optimizer = BayesianOptimization(
  f=hyperparameter_function_knn,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   | neighb... |
-------------------------------------
| [0m1        [0m | [0m-29.2    [0m | [0m221.7    [0m |
| [0m2        [0m | [0m-30.7    [0m | [0m398.4    [0m |
| [95m3        [0m | [95m-28.2    [0m | [95m144.4    [0m |
| [95m4        [0m | [95m-26.42   [0m | [95m51.86    [0m |
| [0m5        [0m | [0m-31.78   [0m | [0m590.5    [0m |
| [0m6        [0m | [0m-31.27   [0m | [0m492.6    [0m |
| [0m7        [0m | [0m-31.2    [0m | [0m478.5    [0m |
| [0m8        [0m | [0m-30.92   [0m | [0m432.1    [0m |
| [95m9        [0m | [95m-25.86   [0m | [95m30.26    [0m |
| [0m10       [0m | [0m-29.68   [0m | [0m269.0    [0m |
| [0m11       [0m | [0m-25.88   [0m | [0m32.08    [0m |
| [0m12       [0m | [0m-32.41   [0m | [0m750.0    [0m |
| [0m13       [0m | [0m-31.2    [0m | [0m1.0      [0m |
| [0m14       [0m | [0m-27.17   [0m | [0m84.2     [0m |
| [0m15       [0m | [0m-27.7    [0m | [0m1

In [49]:
# Neigbors = 15
knn_regressor = KNeighborsRegressor(n_neighbors=15)
knn_regressor.fit(X_train_4_opt,y_train_4_opt )
y_pred_4_opt = knn_regressor.predict(X_test_4_opt)
print(np.sqrt(mean_squared_error(y_test_4_opt, y_pred_4_opt)))

42.10044525939633


In [50]:
def hyperparameter_function_rf(n_estimators, max_features, ):
      """ Function for hyperparameter optimization
      """
      n_estimators = n_estimators.round().astype(int)
      max_features = max_features.round().astype(int)

      rf_regressor = RandomForestRegressor(n_estimators=n_estimators,max_features=max_features,random_state=17,n_jobs=-1)
      metric = cross_val_score(rf_regressor, X=X_train_4_opt, y=y_train_4_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'n_estimators': (20, 500),'max_features': (1,1) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_rf,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   | max_fe... | n_esti... |
-------------------------------------------------
| [0m1        [0m | [0m-22.6    [0m | [0m1.0      [0m | [0m274.7    [0m |
| [0m2        [0m | [0m-22.88   [0m | [0m1.0      [0m | [0m52.59    [0m |
| [0m3        [0m | [0m-22.6    [0m | [0m1.0      [0m | [0m335.0    [0m |
| [0m4        [0m | [0m-22.6    [0m | [0m1.0      [0m | [0m296.3    [0m |
| [0m5        [0m | [0m-22.6    [0m | [0m1.0      [0m | [0m191.8    [0m |
| [0m6        [0m | [0m-22.83   [0m | [0m1.0      [0m | [0m48.82    [0m |
| [0m7        [0m | [0m-22.62   [0m | [0m1.0      [0m | [0m441.1    [0m |
| [0m8        [0m | [0m-22.6    [0m | [0m1.0      [0m | [0m333.2    [0m |
| [0m9        [0m | [0m-22.61   [0m | [0m1.0      [0m | [0m306.8    [0m |
| [95m10       [0m | [95m-22.59   [0m | [95m1.0      [0m | [95m155.8    [0m |
| [0m11       [0m | [0m-22.6    [0m | [0m1.0      [0m | [0m232.5    

KeyboardInterrupt: 

In [75]:
# n_estimators= 264, max_features= ->
rf_regressor = RandomForestRegressor(n_estimators=264,max_features=1,random_state=17)
rf_regressor.fit(X_train_4_opt,y_train_4_opt.values.ravel())
y_pred_4_opt = rf_regressor.predict(X_test_4_opt)
print(np.sqrt(mean_squared_error(y_test_4_opt, y_pred_4_opt)))

39.53115656390862


In [52]:
def hyperparameter_function_lasso(alpha, max_iter, ):
      """ Function for hyperparameter optimization
      """
      max_iter = max_iter.round().astype(int)

      lasso_regressor = Lasso(alpha=alpha,max_iter=max_iter,random_state=17)
      metric = cross_val_score(lasso_regressor, X=X_train_4_opt, y=y_train_4_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'alpha': (0.001, 1),'max_iter': (100,10000) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_lasso,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   |   alpha   | max_iter  |
-------------------------------------------------


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m1        [0m | [0m-1.191e+0[0m | [0m0.2954   [0m | [0m5.353e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m2        [0m | [0m-9.29e+03[0m | [0m0.1923   [0m | [0m772.2    [0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [95m3        [0m | [95m-481.0   [0m | [95m0.7872   [0m | [95m6.598e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m4        [0m | [0m-884.7   [0m | [0m0.6379   [0m | [0m5.798e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m5        [0m | [0m-751.9   [0m | [0m0.04002  [0m | [0m3.642e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [95m6        [0m | [95m-52.03   [0m | [95m0.9457   [0m | [95m694.4    [0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m7        [0m | [0m-270.5   [0m | [0m0.8642   [0m | [0m8.785e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m8        [0m | [0m-4.198e+0[0m | [0m0.05214  [0m | [0m6.559e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m9        [0m | [0m-1.074e+0[0m | [0m0.5522   [0m | [0m6.015e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m10       [0m | [0m-1.105e+0[0m | [0m0.484    [0m | [0m2.902e+03[0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m11       [0m | [0m-1.095e+0[0m | [0m0.363    [0m | [0m695.8    [0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m12       [0m | [0m-856.6   [0m | [0m0.6492   [0m | [0m688.0    [0m |


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


| [0m13       [0m | [0m-611.8   [0m | [0m0.7394   [0m | [0m8.793e+03[0m |


  model = cd_fast.enet_coordinate_descent(


KeyboardInterrupt: 

In [53]:
# alpha= 0.9457, max_iter= 694 -> 52.03
lasso_regressor = Lasso(alpha=0.9457,max_iter=694)
lasso_regressor.fit(X_train_4_opt,y_train_4_opt )
y_pred_4_opt = lasso_regressor.predict(X_test_4_opt)
print(np.sqrt(mean_squared_error(y_test_4_opt, y_pred_4_opt)))

50.39355716657052


  model = cd_fast.enet_coordinate_descent(


In [54]:
def hyperparameter_function_xgboost(eta, gamma ,max_depth,reg_lambda,reg_alpha ):
      """ Function for hyperparameter optimization
      """
      max_depth = max_depth.round().astype(int)

      xgb_regressor = XGBRegressor(eta=eta,gamma=gamma,max_depth=max_depth,reg_lambda=reg_lambda, reg_alpha=reg_alpha)
      metric = cross_val_score(xgb_regressor, X=X_train_4_opt, y=y_train_4_opt.values.ravel(), cv=5, scoring='neg_root_mean_squared_error')
      return metric.min()

# Bounded region of parameter space
pbounds = {'eta': (0, 1),'gamma': (0,2),'max_depth':(1,10),'reg_lambda':(1,1),'reg_alpha':(0,0) }

optimizer = BayesianOptimization(
  f=hyperparameter_function_xgboost,
  pbounds=pbounds,
  random_state=17,
  allow_duplicate_points= True
)

optimizer.maximize(
  init_points=10,
  n_iter=50,
)

|   iter    |  target   |    eta    |   gamma   | max_depth | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------
| [0m1        [0m | [0m-23.01   [0m | [0m0.2947   [0m | [0m1.061    [0m | [0m2.724    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m2        [0m | [0m-25.45   [0m | [0m0.6563   [0m | [0m1.275    [0m | [0m6.18     [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m3        [0m | [0m-27.83   [0m | [0m0.9457   [0m | [0m0.1201   [0m | [0m8.776    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m4        [0m | [0m-25.7    [0m | [0m0.6524   [0m | [0m1.104    [0m | [0m6.378    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m5        [0m | [0m-23.11   [0m | [0m0.2977   [0m | [0m1.123    [0m | [0m4.564    [0m | [0m0.0      [0m | [0m1.0      [0m |
| [0m6        [0m | [0m-23.56   [0m | [0m0.1439   [0m | [0m0.3018   [0m | [0m1.497    [0m | [0m0.0      [0

In [57]:
# eta= 0.1149 , gamma= 0.4352, max_depth=4 ,lambda=1 , alpha=0 ->
xgb_regressor = XGBRegressor(eta=0.1149,gamma=0.4352,max_depth=4,reg_lambda=1, reg_alpha=0)
xgb_regressor.fit(X_train_4_opt,y_train_4_opt )
y_pred_4_opt = xgb_regressor.predict(X_test_4_opt)
print(np.sqrt(mean_squared_error(y_test_4_opt, y_pred_4_opt)))

43.26899689673861
