In [5]:
%load_ext autoreload
%autoreload 2

In [6]:

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on given sensor data (time series data). It is a forcasting problem, where the goal is to predict the number of cycles an engine will last before it fails.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
1. Perform a deep **exploratory data analysis (EDA)** on the given data.
2. Implement a more efficient **sliding window method** for time series data analysis. -> 🎯 **Focus on this task**
3. Apply **traditional machine learning methods** (SOTA) to predict the remaining useful life. Includes data preparation, feature extraction, feature selection, model selection, and model parameter optimization.
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.


# Imports + Settings

In [7]:
# third-party libraries
import pandas as pd
import numpy as np
import os

import time
from tqdm.notebook import tqdm

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# previous
from sklearn.metrics import accuracy_score,f1_score
from sklearn.svm import SVC
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE


In [8]:
import os

# Zeigt das aktuelle Arbeitsverzeichnis an
print(os.getcwd())
# Setzt das Arbeitsverzeichnis auf das Projektverzeichnis
os.chdir('/Users/niklasquendt/Documents/Uni/PSDA/Uebung2/damage-propagation-modeling')
from src.utils import flatten

from src.data_loading import load_data, load_config
from src.data_cleaning import clean_data, format_dtype
from src.rolling_window_creator import calculate_RUL, RollingWindowDatasetCreator

/Users/niklasquendt/Documents/Uni/PSDA/Uebung2/damage-propagation-modeling


ImportError: cannot import name 'winsorize' from 'scipy.stats' (/Users/niklasquendt/Documents/Uni/PSDA/Uebung2/damage-propagation-modeling/.venv/lib/python3.12/site-packages/scipy/stats/__init__.py)

In [5]:
# settings
sns.set_style("whitegrid")
sns.set_palette("Set2")
sns.set(rc={"figure.dpi":100, 'savefig.dpi':200})
sns.set_context('notebook')

In [6]:
np.random.seed(42)

# Paths

In [5]:
# Make sure to execute this cell only once for one kernel session, before running any other cell below.
os.chdir("/Users/niklasquendt/Documents/Uni/PSDA/Uebung2/damage-propagation-modeling") # set working directory to root of project
os.getcwd() # check current working directory

'/Users/niklasquendt/Documents/Uni/PSDA/Uebung2/damage-propagation-modeling'

In [6]:
PATH_TO_CONFIG = "configs/config.yaml"

# Load Config + Data

In [7]:
config = load_config(PATH_TO_CONFIG) # config is dict

NameError: name 'load_config' is not defined

In [46]:
train_data_1, test_data_1, test_RUL_data_1 = load_data(config_path=PATH_TO_CONFIG, dataset_num=1)

NameError: name 'load_data' is not defined

In [62]:
train_data_2, test_data_2, test_RUL_data_2 = load_data(config_path=PATH_TO_CONFIG, dataset_num=2)

2024-05-25 10:02:58 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 2...[0m
2024-05-25 10:02:58 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 2.[0m
2024-05-25 10:02:58 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (53759, 26)[0m
2024-05-25 10:02:58 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (33991, 26)[0m
2024-05-25 10:02:58 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (259, 1)[0m


In [63]:
train_data_3, test_data_3, test_RUL_data_3 = load_data(config_path=PATH_TO_CONFIG, dataset_num=3)

2024-05-25 10:03:11 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 3...[0m
2024-05-25 10:03:11 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 3.[0m
2024-05-25 10:03:11 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (24720, 26)[0m
2024-05-25 10:03:11 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (16596, 26)[0m
2024-05-25 10:03:11 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


In [64]:
train_data_4, test_data_4, test_RUL_data_4 = load_data(config_path=PATH_TO_CONFIG, dataset_num=4)

2024-05-25 10:03:21 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 4...[0m
2024-05-25 10:03:21 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 4.[0m
2024-05-25 10:03:21 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (61249, 26)[0m
2024-05-25 10:03:21 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (41214, 26)[0m
2024-05-25 10:03:21 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (248, 1)[0m


In [65]:
train_data = [train_data_1, train_data_2, train_data_3, train_data_4]
test_data = [test_data_1, test_data_2, test_data_3, test_data_4]

# 📍 << Models >>

[TEMPLATE]

Findings:
* Interpretation of plots
* or other key take aways from previous code

In [68]:
# [TEMPLATE] - save processed data (as pickle)
df = pd.DataFrame()
timestamp = time.strftime("%Y%m%d-%H%M%S")
df.to_pickle(f"{config['paths']['processed_data_dir']}ex2_topic_{timestamp}.pkl")

In [69]:
# [TEMPLATE] - save data predictions (as csv)
df = pd.DataFrame()
timestamp = time.strftime("%Y%m%d-%H%M%S")
df.to_csv(f"{config['paths']['prediction_dir']}ex2_topic_{timestamp}.csv", sep=',', decimal='.')

In [70]:
# [TEMPLATE] - save plot results (as png)
fig = plt.figure(figsize=(9, 6))
timestamp = time.strftime("%Y%m%d-%H%M%S")
fig.savefig(f"{config['paths']['plot_dir']}ex2_topic_{timestamp}.png")

<Figure size 900x600 with 0 Axes>

In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import sklearn
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.metrics import accuracy_score,f1_score
from sklearn.svm import SVC
#from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score



klassische Ansätze

In [85]:
# Trainingsdaten in Trainings- und Validierungsdaten aufteilen
X_train, X_val, y_train, y_val = train_test_split(train_data_1[["Sensor Measure 2", "Sensor Measure 3", "Sensor Measure 4", "Sensor Measure 7", "Sensor Measure 8", "Sensor Measure 9", "Sensor Measure 11", "Sensor Measure 12", "Sensor Measure 13", "Sensor Measure 14", "Sensor Measure 15", "Sensor Measure 17", "Sensor Measure 20", "Sensor Measure 21"]],
train_data_1["Cycle"],test_size=0.2, random_state=42)

X_test = test_data_1[["Sensor Measure 2", "Sensor Measure 3", "Sensor Measure 4", "Sensor Measure 7", "Sensor Measure 8", "Sensor Measure 9", "Sensor Measure 11", "Sensor Measure 12", "Sensor Measure 13", "Sensor Measure 14", "Sensor Measure 15", "Sensor Measure 17", "Sensor Measure 20", "Sensor Measure 21"]]
y_test = test_data_1 ["Cycle"]
train_df = pd.DataFrame(X_train)
train_df['cycle'] = y_train

val_df = pd.DataFrame(X_val)
val_df['cycle'] = y_val

test_df = pd.DataFrame(X_test)
test_df['cycle'] = y_test

# Features und Zielvariable trennen
X_train = train_df.drop('cycle', axis=1)
y_train = train_df['cycle']
X_val = val_df.drop('cycle', axis=1)
y_val = val_df['cycle']

X_test = test_df.drop('cycle', axis=1)
y_test = test_df['cycle']

# Standardisieren der Daten
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

SVM

In [87]:
# Support Vector Machine implementieren
svm = SVC(kernel='linear', random_state=42)

# Cross-Validation
svm_cv_scores = cross_val_score(svm, X_train_scaled, y_train, cv=5)
print(f"SVC Cross-Validation Accuracy: {svm_cv_scores.mean():.4f} ± {svm_cv_scores.std():.4f}")

# Training
svm.fit(X_train_scaled, y_train)

# Vorhersagen und Bewerten des SVM Classifiers auf den Validierungsdaten
svm_predictions = svm.predict(X_val_scaled)
print("SVC Training Accuracy:", accuracy_score(y_val, svm_predictions))
print("SVC Training Classification Report:\n", classification_report(y_val, svm_predictions))
print("="*60)



SVC Cross-Validation Accuracy: 0.0095 ± 0.0015
SVC Training Accuracy: 0.008238429852192876
SVC Training Classification Report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00        25
           2       0.00      0.00      0.00        18
           3       0.01      0.05      0.02        19
           4       0.05      0.05      0.05        22
           5       0.00      0.00      0.00        22
           6       0.02      0.05      0.02        22
           7       0.00      0.00      0.00        22
           8       0.00      0.00      0.00        24
           9       0.00      0.00      0.00        23
          10       0.00      0.00      0.00        24
          11       0.00      0.00      0.00        16
          12       0.00      0.00      0.00        18
          13       0.00      0.00      0.00        25
          14       0.00      0.00      0.00        22
          15       0.00      0.00      0.00        16
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [88]:
prediction = svm.predict(X_test_scaled)
plt.figure(figsize=(12,5))
plt.plot((prediction[:2000]), label="Prediction")
plt.plot((test_RUL_data_1[:2000]), label="Reale RUL")
plt.legend()
plt.show()

In [81]:
from sklearn.metrics import mean_squared_error
from math import sqrt
rms = sqrt(mean_squared_error(x.values.reshape(-1), prediction))
print("The RMSE auf Trainingsdaten ist :", rms)


ValueError: Found input variables with inconsistent numbers of samples: [100, 20631]

Gaussian Process Regressor

In [4]:
from sklearn.gaussian_process import GaussianProcessRegressor

gaussian_process = GaussianProcessRegressor(random_state=6)

#Cross Validation
gp_cv_scores = cross_val_score(gaussian_process, X_train_scaled, y_train, cv=5)
print(f"Gaussian Process Regressor Cross-Validation Accuracy: {gp_cv_scores.mean():.4f} ± {gp_cv_scores.std():.4f}")

# Gaussian Process Regressor trainieren
gaussian_process.fit(X_train_scaled, y_train)

# Vorhersagen und Bewerten des Gaussian Process Regressors
gp_predictions = gaussian_process.predict(X_val_scaled)
gp_predictions_rounded = np.round(gp_predictions)
gp_predictions_rounded = np.clip(gp_predictions_rounded, 0, None)
print("Gaussian Process Regressor Accuracy:", accuracy_score(y_val, gp_predictions_rounded))
print("Gaussian Process Regressor Classification Report:\n", classification_report(y_val, gp_predictions_rounded))


NameError: name 'X_train_scaled' is not defined

MLP

In [84]:
from sklearn.model_selection import cross_val_score

# MLPClassifier implementieren
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)

# Cross-Validation
mlp_cv_scores = cross_val_score(mlp, X_train_scaled, y_train, cv=5)
print(f"MLPClassifier Cross-Validation Accuracy: {mlp_cv_scores.mean():.4f} ± {mlp_cv_scores.std():.4f}")

mlp.fit(X_train_scaled, y_train)

# Vorhersagen und Bewerten des MLPClassifiers
mlp_predictions = mlp.predict(X_val_scaled)
print("MLP Classifier Accuracy:", accuracy_score(y_val, mlp_predictions))
print("MLP Classifier Classification Report:\n", classification_report(y_val, mlp_predictions))



MLPClassifier Cross-Validation Accuracy: 0.0069 ± 0.0019
MLP Classifier Accuracy: 0.052445349231738644
MLP Classifier Classification Report:
               precision    recall  f1-score   support

           1       0.07      0.04      0.05       100
           2       0.03      0.06      0.04       100
           3       0.04      0.11      0.06       100
           4       0.04      0.05      0.04       100
           5       0.05      0.02      0.03       100
           6       0.01      0.01      0.01       100
           7       0.07      0.06      0.07       100
           8       0.12      0.07      0.09       100
           9       0.04      0.04      0.04       100
          10       0.16      0.05      0.08       100
          11       0.03      0.03      0.03       100
          12       0.09      0.03      0.04       100
          13       0.05      0.01      0.02       100
          14       0.04      0.02      0.03       100
          15       0.05      0.15      0.07    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest implementieren
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross-Validation
rf_cv_scores = cross_val_score(rf, X_train_scaled, y_train, cv=5)
print(f"RandomForest Cross-Validation Accuracy: {rf_cv_scores.mean():.4f} ± {rf_cv_scores.std():.4f}")

# Random Forest Classifier trainieren
rf.fit(X_train_scaled, y_train)

# Vorhersagen und Bewerten des Random Forest Classifiers
rf_predictions = rf.predict(X_val_scaled)
print("Random Forest Classifier Accuracy:", accuracy_score(y_val, rf_predictions))
print("Random Forest Classifier Classification Report:\n", classification_report(y_val, rf_predictions))

AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost implementieren
ada = AdaBoostClassifier(n_estimators=100, random_state=42)

# Cross-Validation
ada_cv_scores = cross_val_score(ada, X_train_scaled, y_train, cv=5)
print(f"AdaBoost Cross-Validation Accuracy: {ada_cv_scores.mean():.4f} ± {ada_cv_scores.std():.4f}")

# AdaBoost Classifier trainieren
ada.fit(X_train_scaled, y_train)

# Vorhersagen und Bewerten des AdaBoost Classifiers
ada_predictions = ada.predict(X_val_scaled)
print("AdaBoost Classifier Accuracy:", accuracy_score(y_val, ada_predictions))
print("AdaBoost Classifier Classification Report:\n", classification_report(y_val, ada_predictions))

knn

In [None]:
# k-Nearest Neighbors implementieren
knn = KNeighborsClassifier(n_neighbors=5)


Optimization

In [None]:
#hyperparameter tuning

