In [2]:
%load_ext autoreload
%autoreload 2

In [3]:

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on given sensor data (time series data). It is a forcasting problem, where the goal is to predict the number of cycles an engine will last before it fails.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
1. Perform a deep **exploratory data analysis (EDA)** on the given data.
2. Implement a more efficient **sliding window method** for time series data analysis. -> 🎯 **Focus on this task**
3. Apply **traditional machine learning methods** (SOTA) to predict the remaining useful life. Includes data preparation, feature extraction, feature selection, model selection, and model parameter optimization.
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.


# Imports + Settings

In [7]:
# third-party libraries
import pandas as pd
import numpy as np
import os

import time
from tqdm.notebook import tqdm

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# previous
from sklearn.metrics import accuracy_score,f1_score
from sklearn.svm import SVC
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE


In [8]:
import os
os.chdir('/Users/niklasquendt/Documents/Uni/PSDA/Übung 2/damage-propagation-modeling')
# Zeigt das aktuelle Arbeitsverzeichnis an
print(os.getcwd())
# Setzt das Arbeitsverzeichnis auf das Projektverzeichnis

#from src.utils import flatten

from src.data_loading import load_data, load_config
from src.data_cleaning import clean_data, format_dtype
from src.rolling_window_creator import calculate_RUL, RollingWindowDatasetCreator

/Users/niklasquendt/Documents/Uni/PSDA/Übung 2/damage-propagation-modeling


In [9]:
# settings
sns.set_style("whitegrid")
sns.set_palette("Set2")
sns.set(rc={"figure.dpi":100, 'savefig.dpi':200})
sns.set_context('notebook')

In [10]:
np.random.seed(42)

# Paths

In [11]:
# Make sure to execute this cell only once for one kernel session, before running any other cell below.
os.chdir("/Users/niklasquendt/Documents/Uni/PSDA/Übung 2/damage-propagation-modeling") # set working directory to root of project
os.getcwd() # check current working directory

'/Users/niklasquendt/Documents/Uni/PSDA/Übung 2/damage-propagation-modeling'

In [12]:
PATH_TO_CONFIG = "configs/config.yaml"

# Load Config + Data

In [13]:
config = load_config(PATH_TO_CONFIG) # config is dict

In [14]:
train_data_1, test_data_1, test_RUL_data_1 = load_data(config_path=PATH_TO_CONFIG, dataset_num=1)

2024-05-26 20:15:33 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 1...[0m
2024-05-26 20:15:33 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 1.[0m
2024-05-26 20:15:33 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (20631, 26)[0m
2024-05-26 20:15:33 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (13096, 26)[0m
2024-05-26 20:15:33 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


In [15]:
train_data_2, test_data_2, test_RUL_data_2 = load_data(config_path=PATH_TO_CONFIG, dataset_num=2)

2024-05-26 20:15:36 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 2...[0m
2024-05-26 20:15:36 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 2.[0m
2024-05-26 20:15:36 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (53759, 26)[0m
2024-05-26 20:15:36 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (33991, 26)[0m
2024-05-26 20:15:36 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (259, 1)[0m


In [16]:
train_data_3, test_data_3, test_RUL_data_3 = load_data(config_path=PATH_TO_CONFIG, dataset_num=3)

2024-05-26 20:15:38 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 3...[0m
2024-05-26 20:15:38 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 3.[0m
2024-05-26 20:15:38 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (24720, 26)[0m
2024-05-26 20:15:38 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (16596, 26)[0m
2024-05-26 20:15:38 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


In [17]:
train_data_4, test_data_4, test_RUL_data_4 = load_data(config_path=PATH_TO_CONFIG, dataset_num=4)

2024-05-26 20:15:39 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 4...[0m
2024-05-26 20:15:39 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 4.[0m
2024-05-26 20:15:39 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (61249, 26)[0m
2024-05-26 20:15:39 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (41214, 26)[0m
2024-05-26 20:15:39 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (248, 1)[0m


In [18]:
train_data = [train_data_1, train_data_2, train_data_3, train_data_4]
test_data = [test_data_1, test_data_2, test_data_3, test_data_4]

# 📍 << Models >>

[TEMPLATE]

Findings:
* Interpretation of plots
* or other key take aways from previous code

In [22]:
# [TEMPLATE] - save processed data (as pickle)
df = pd.DataFrame()
timestamp = time.strftime("%Y%m%d-%H%M%S")
df.to_pickle(f"{config['paths']['processed_data_dir']}ex2_topic_{timestamp}.pkl")

In [23]:
# [TEMPLATE] - save data predictions (as csv)
df = pd.DataFrame()
timestamp = time.strftime("%Y%m%d-%H%M%S")
df.to_csv(f"{config['paths']['prediction_dir']}ex2_topic_{timestamp}.csv", sep=',', decimal='.')

In [24]:
# [TEMPLATE] - save plot results (as png)
fig = plt.figure(figsize=(9, 6))
timestamp = time.strftime("%Y%m%d-%H%M%S")
fig.savefig(f"{config['paths']['plot_dir']}ex2_topic_{timestamp}.png")

<Figure size 900x600 with 0 Axes>

Data Cleaning

In [19]:
train_data_1 = format_dtype(train_data_1)
train_data_2 = format_dtype(train_data_2)
train_data_3 = format_dtype(train_data_3)
train_data_4 = format_dtype(train_data_4)
train_data = [train_data_1, train_data_2, train_data_3, train_data_4]

2024-05-26 20:15:47 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-26 20:15:47 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-26 20:15:47 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-26 20:15:47 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


In [63]:
cleaned_train_1, cleaned_test_1 = clean_data(train_data_1, test_data_1, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.3)
cleaned_train_2, cleaned_test_2 = clean_data(train_data_2, test_data_2, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.3)
cleaned_train_3, cleaned_test_3 = clean_data(train_data_3, test_data_3, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.3)
cleaned_train_4, cleaned_test_4 = clean_data(train_data_4, test_data_4, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.3)

cleaned_train = [cleaned_train_1, cleaned_train_2, cleaned_train_3, cleaned_train_4]
cleaned_test = [cleaned_test_1, cleaned_test_2, cleaned_test_3, cleaned_test_4]



2024-05-26 22:27:24 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-05-26 22:27:24 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-05-26 22:27:24 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-26 22:27:24 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-26 22:27:24 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m
2024-05-26 22:27:24 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m
2024-05-26 22:27:24 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m
2024-05-26 22:27:24 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m
2024-05-26 22:27:24 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: N

In [65]:
print(cleaned_train_1.shape)
print(cleaned_test_1.columns)
print(cleaned_train_2.shape)
print(cleaned_test_2.columns)
print(cleaned_train_3.shape)
print(cleaned_test_3.columns)
print(cleaned_train_4.shape)
print(cleaned_test_4.columns)


(20631, 16)
Index(['UnitNumber', 'Cycle', 'Sensor Measure 2', 'Sensor Measure 3',
       'Sensor Measure 4', 'Sensor Measure 7', 'Sensor Measure 8',
       'Sensor Measure 9', 'Sensor Measure 11', 'Sensor Measure 12',
       'Sensor Measure 13', 'Sensor Measure 14', 'Sensor Measure 15',
       'Sensor Measure 17', 'Sensor Measure 20', 'Sensor Measure 21'],
      dtype='object')
(53759, 2)
Index(['UnitNumber', 'Cycle'], dtype='object')
(24720, 14)
Index(['UnitNumber', 'Cycle', 'Sensor Measure 2', 'Sensor Measure 3',
       'Sensor Measure 4', 'Sensor Measure 7', 'Sensor Measure 8',
       'Sensor Measure 9', 'Sensor Measure 10', 'Sensor Measure 11',
       'Sensor Measure 12', 'Sensor Measure 13', 'Sensor Measure 14',
       'Sensor Measure 17'],
      dtype='object')
(61249, 2)
Index(['UnitNumber', 'Cycle'], dtype='object')


Feature Engineering

In [36]:
# Currently using minimal to ease optimization so feature_list is not necessary
feature_list = []

# feature_list for dataset 1
# TODO: create function to make variable for each dataset to ease optimization
currentpath = os.getcwd()
ft_list = pd.read_pickle(currentpath+ "/data/processed/feature_list.pkl")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/niklasquendt/Documents/Uni/PSDA/Übung 2/damage-propagation-modeling/data/processed/feature_list.pkl'

In [66]:
min_ts = 5
max_ts = 15

Windowing

In [67]:
rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts,min_timeshift=min_ts,feature_extraction_mode= 'minimal')


In [68]:
X_train_1, y_train_1, X_test_1, y_test_1 = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train_1, test_data=cleaned_test_1,test_RUL_data=test_RUL_data_1,)


2024-05-26 22:28:31 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


Rolling: 100%|██████████| 20/20 [00:03<00:00,  6.35it/s]


2024-05-26 22:28:34 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


Feature Extraction: 100%|██████████| 20/20 [00:19<00:00,  1.03it/s]


2024-05-26 22:28:56 [[34msrc.rolling_window_creator:131[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m
2024-05-26 22:28:56 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


Rolling: 100%|██████████| 19/19 [00:01<00:00, 11.47it/s]

2024-05-26 22:28:58 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m



Feature Extraction: 100%|██████████| 20/20 [00:00<00:00, 21.01it/s]

2024-05-26 22:28:59 [[34msrc.rolling_window_creator:159[0m] [[32mINFO[0m] >>>> Datasets created successfully.[0m
2024-05-26 22:28:59 [[34msrc.rolling_window_creator:160[0m] [[32mINFO[0m] >>>> Shape of X_train: (20131, 140)[0m
2024-05-26 22:28:59 [[34msrc.rolling_window_creator:161[0m] [[32mINFO[0m] >>>> Shape of y_train: (20131, 1)[0m
2024-05-26 22:28:59 [[34msrc.rolling_window_creator:162[0m] [[32mINFO[0m] >>>> Shape of X_test: (100, 140)[0m
2024-05-26 22:28:59 [[34msrc.rolling_window_creator:163[0m] [[32mINFO[0m] >>>> Shape of y_test: (100, 1)[0m





In [69]:
X_train_2, y_train_2, X_test_2, y_test_2 = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train_2, test_data=cleaned_test_2,test_RUL_data=test_RUL_data_2)


2024-05-26 22:29:04 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


Rolling: 100%|██████████| 20/20 [00:03<00:00,  6.35it/s]


2024-05-26 22:29:08 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


ValueError: Could not guess the value column! Please hand it to the function as an argument.

In [39]:
X_train_3, y_train_3, X_test_3, y_test_3 = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train_3, test_data=cleaned_train_3,test_RUL_data=test_RUL_data_3,)


2024-05-26 21:16:58 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


Rolling: 100%|██████████| 20/20 [00:03<00:00,  5.75it/s]


2024-05-26 21:17:01 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


Feature Extraction: 100%|██████████| 20/20 [00:21<00:00,  1.06s/it]


2024-05-26 21:17:25 [[34msrc.rolling_window_creator:131[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m
2024-05-26 21:17:25 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


Rolling: 100%|██████████| 20/20 [00:02<00:00,  7.31it/s]


2024-05-26 21:17:28 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


Feature Extraction: 100%|██████████| 20/20 [00:00<00:00, 20.83it/s]

2024-05-26 21:17:29 [[34msrc.rolling_window_creator:159[0m] [[32mINFO[0m] >>>> Datasets created successfully.[0m
2024-05-26 21:17:29 [[34msrc.rolling_window_creator:160[0m] [[32mINFO[0m] >>>> Shape of X_train: (24220, 120)[0m
2024-05-26 21:17:29 [[34msrc.rolling_window_creator:161[0m] [[32mINFO[0m] >>>> Shape of y_train: (24220, 1)[0m
2024-05-26 21:17:29 [[34msrc.rolling_window_creator:162[0m] [[32mINFO[0m] >>>> Shape of X_test: (100, 120)[0m
2024-05-26 21:17:29 [[34msrc.rolling_window_creator:163[0m] [[32mINFO[0m] >>>> Shape of y_test: (100, 1)[0m





In [62]:
X_train_4, y_train_4, X_test_4, y_test_4 = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train_4, test_data=cleaned_train_4,test_RUL_data=test_RUL_data_4,)


2024-05-26 22:15:39 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


Rolling: 100%|██████████| 20/20 [00:04<00:00,  4.36it/s]


2024-05-26 22:15:44 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


ValueError: Could not guess the value column! Please hand it to the function as an argument.

In [40]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import sklearn
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.metrics import accuracy_score,f1_score
from sklearn.svm import SVC
#from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,f1_score, root_mean_squared_error


klassische Ansätze

In [45]:
# Standardisieren der Daten
scaler_1 = StandardScaler()
X_train_scaled_1 = scaler_1.fit_transform(X_train_1)
#X_val_scaled = scaler.transform(X_val)
X_test_scaled_1 = scaler_1.transform(X_test_1)

scaler_3 = StandardScaler()
X_train_scaled_3 = scaler_3.fit_transform(X_train_3)
#X_val_scaled = scaler.transform(X_val)
X_test_scaled_3 = scaler_3.transform(X_test_3)

Classifier

SVM

In [70]:
# Support Vector Machine implementieren
svm_1 = SVC(kernel='linear', random_state=42)
svm_3 = SVC(kernel='linear', random_state=42)

# Cross-Validation
#svm_cv_scores = cross_val_score(svm, X_train_scaled, y_train, cv=5)
#print(f"SVC Cross-Validation Accuracy: {svm_cv_scores.mean():.4f} ± {svm_cv_scores.std():.4f}")

# Training
svm_1.fit(X_train_scaled_1, y_train_1)
svm_3.fit(X_train_scaled_3, y_train_3)

# Vorhersagen und Bewerten des SVM Classifiers auf den Validierungsdaten
svm_predictions_1 = svm.predict(X_test_scaled_1)
print(sklearn.metrics.root_mean_squared_error(y_test_1, svm_predictions_1))

svm_predictions_3 = svm.predict(X_test_scaled_3)
print(sklearn.metrics.root_mean_squared_error(y_test_3, svm_predictions_3))

#print("SVC Training Accuracy:", accuracy_score(y_test, svm_predictions))
#print("SVC Training Classification Report:\n", classification_report(y_test, svm_predictions))
#print("="*60)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


85.06544539353214


ValueError: X has 120 features, but SVC is expecting 140 features as input.

Gaussian Process Regressor

In [50]:
from sklearn.gaussian_process import GaussianProcessRegressor

gaussian_process_1 = GaussianProcessRegressor(random_state=6)
gaussian_process_3 = GaussianProcessRegressor(random_state=6)


#Cross Validation
#gp_cv_scores = cross_val_score(gaussian_process, X_train_scaled, y_train, cv=5)
#print(f"Gaussian Process Regressor Cross-Validation Accuracy: {gp_cv_scores.mean():.4f} ± {gp_cv_scores.std():.4f}")

# Gaussian Process Regressor trainieren
gaussian_process_1.fit(X_train_scaled_1, y_train_1)
gaussian_process_3.fit(X_train_scaled_3, y_train_3)

# Vorhersagen und Bewerten des Gaussian Process Regressors
gp_predictions_1 = gaussian_process_1.predict(X_test_scaled_1)
print(sklearn.metrics.root_mean_squared_error(y_test_1, gp_predictions_1))

gp_predictions_3 = gaussian_process_3.predict(X_test_scaled_3)
print(sklearn.metrics.root_mean_squared_error(y_test_3, gp_predictions_3))

"""
gp_predictions_rounded = np.round(gp_predictions)
gp_predictions_rounded = np.clip(gp_predictions_rounded, 0, None)
print("Gaussian Process Regressor Accuracy:", accuracy_score(y_val, gp_predictions_rounded))
print("Gaussian Process Regressor Classification Report:\n", classification_report(y_val, gp_predictions_rounded))"""


86.17455298332204
85.90521822355312


'\ngp_predictions_rounded = np.round(gp_predictions)\ngp_predictions_rounded = np.clip(gp_predictions_rounded, 0, None)\nprint("Gaussian Process Regressor Accuracy:", accuracy_score(y_val, gp_predictions_rounded))\nprint("Gaussian Process Regressor Classification Report:\n", classification_report(y_val, gp_predictions_rounded))'

MLP

In [49]:
from sklearn.model_selection import cross_val_score

# MLPClassifier implementieren
mlp_1 = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
mlp_3 = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)

# Cross-Validation
#mlp_cv_scores = cross_val_score(mlp, X_train_scaled, y_train, cv=5)
#print(f"MLPClassifier Cross-Validation Accuracy: {mlp_cv_scores.mean():.4f} ± {mlp_cv_scores.std():.4f}")

mlp_1.fit(X_train_scaled_1, y_train_1)
mlp_3.fit(X_train_scaled_3, y_train_3)

# Vorhersagen und Bewerten des MLPClassifiers
mlp_predictions_1 = mlp_1.predict(X_test_scaled_1)
print(sklearn.metrics.root_mean_squared_error(y_test_1, mlp_predictions_1))

mlp_predictions_3 = mlp_3.predict(X_test_scaled_3)
print(sklearn.metrics.root_mean_squared_error(y_test_3, mlp_predictions_3))

#print("MLP Classifier Accuracy:", accuracy_score(y_val, mlp_predictions))
#print("MLP Classifier Classification Report:\n", classification_report(y_val, mlp_predictions))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


85.32350203783247
85.07091159732568




Random Forests

In [61]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest implementieren
rf_1 = RandomForestClassifier(n_estimators=100, random_state=42)
rf_3 = RandomForestClassifier(n_estimators=100, random_state=42)

# Cross-Validation
#rf_cv_scores = cross_val_score(rf, X_train_scaled, y_train, cv=5)
#print(f"RandomForest Cross-Validation Accuracy: {rf_cv_scores.mean():.4f} ± {rf_cv_scores.std():.4f}")

# Random Forest Classifier trainieren
rf_1.fit(X_train_scaled_1, y_train_1)
rf_3.fit(X_train_scaled_3, y_train_3)

# Vorhersagen und Bewerten des Random Forest Classifiers
rf_predictions_1 = rf_1.predict(X_test_scaled_1)
print(sklearn.metrics.root_mean_squared_error(y_test_1, rf_predictions_1))
print("hello")
rf_predictions_3 = rf_3.predict(X_test_scaled_3)
print(sklearn.metrics.root_mean_squared_error(y_test_3, rf_predictions_3))
#print("Random Forest Classifier Accuracy:", accuracy_score(y_val, rf_predictions))
#print("Random Forest Classifier Classification Report:\n", classification_report(y_val, rf_predictions))

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


85.18227515158303
hello
85.0174099817208


AdaBoost

In [47]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost implementieren
ada_1 = AdaBoostClassifier(n_estimators=100, random_state=42)
ada_3 = AdaBoostClassifier(n_estimators=100, random_state=42)

# Cross-Validation
#ada_cv_scores = cross_val_score(ada, X_train_scaled, y_train, cv=5)
#print(f"AdaBoost Cross-Validation Accuracy: {ada_cv_scores.mean():.4f} ± {ada_cv_scores.std():.4f}")

# AdaBoost Classifier trainieren
ada_1.fit(X_train_scaled_1, y_train_1)
ada_3.fit(X_train_scaled_3, y_train_3)

# Vorhersagen und Bewerten des AdaBoost Classifiers
ada_predictions_1 = ada_1.predict(X_test_scaled_1)
print(sklearn.metrics.root_mean_squared_error(y_test_1, ada_predictions_1))

ada_predictions_3 = ada_3.predict(X_test_scaled_3)
print(sklearn.metrics.root_mean_squared_error(y_test_3, ada_predictions_3))

#print("AdaBoost Classifier Accuracy:", accuracy_score(y_val, ada_predictions))
#print("AdaBoost Classifier Classification Report:\n", classification_report(y_val, ada_predictions))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


82.71626200451759
82.46296138267168


knn

In [48]:
# k-Nearest Neighbors implementieren
knn_1 = KNeighborsClassifier(n_neighbors=5)
knn_3 = KNeighborsClassifier(n_neighbors=5)
#trainieren des k-Nearest Neighbors
knn_1.fit(X_train_scaled_1, y_train_1)
knn_3.fit(X_train_scaled_3, y_train_3)

#vohersagen und Bewerten des k-Nearest Neighbors
knn_predictions_1 = knn_1.predict(X_test_scaled_1)
print(sklearn.metrics.root_mean_squared_error(y_test_1, knn_predictions_1))

knn_predictions_3 = knn_3.predict(X_test_scaled_3)
print(sklearn.metrics.root_mean_squared_error(y_test_3, knn_predictions_3))




  return self._fit(X, y)
  return self._fit(X, y)


85.04463533933225
84.84244220907364


Optimization

In [None]:
#hyperparameter tuning

