In [1]:
%load_ext autoreload
%autoreload 2

In [2]:

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on given sensor data (time series data). It is a forcasting problem, where the goal is to predict the number of cycles an engine will last before it fails.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
1. Perform a deep **exploratory data analysis (EDA)** on the given data.
2. Implement a more efficient **sliding window method** for time series data analysis. -> 🎯 **Focus on this task**
3. Apply **traditional machine learning methods** (SOTA) to predict the remaining useful life. Includes data preparation, feature extraction, feature selection, model selection, and model parameter optimization.
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.


# Imports + Settings

In [3]:
# third-party libraries
import pandas as pd
import numpy as np
import os

import time
from tqdm.notebook import tqdm

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# previous
from sklearn.metrics import accuracy_score,f1_score
from sklearn.svm import SVC
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE


In [4]:
import os
os.chdir('/Users/niklasquendt/Documents/Uni/PSDA/Übung 2/damage-propagation-modeling')
# Zeigt das aktuelle Arbeitsverzeichnis an
print(os.getcwd())
# Setzt das Arbeitsverzeichnis auf das Projektverzeichnis

#from src.utils import flatten

from src.data_loading import load_data, load_config
from src.data_cleaning import clean_data, format_dtype
from src.rolling_window_creator import calculate_RUL, RollingWindowDatasetCreator

/Users/niklasquendt/Documents/Uni/PSDA/Übung 2/damage-propagation-modeling


In [6]:
# settings
sns.set_style("whitegrid")
sns.set_palette("Set2")
sns.set(rc={"figure.dpi":100, 'savefig.dpi':200})
sns.set_context('notebook')

In [7]:
np.random.seed(42)

# Paths

In [8]:
# Make sure to execute this cell only once for one kernel session, before running any other cell below.
os.chdir("/Users/niklasquendt/Documents/Uni/PSDA/Übung 2/damage-propagation-modeling") # set working directory to root of project
os.getcwd() # check current working directory

'/Users/niklasquendt/Documents/Uni/PSDA/Übung 2/damage-propagation-modeling'

In [9]:
PATH_TO_CONFIG = "configs/config.yaml"

# Load Config + Data

In [10]:
config = load_config(PATH_TO_CONFIG) # config is dict

anwenden der Load_data Funktionen auf die Datensätze 

In [11]:
train_data_1, test_data_1, test_RUL_data_1 = load_data(config_path=PATH_TO_CONFIG, dataset_num=1)

2024-06-01 16:36:07 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 1...[0m
2024-06-01 16:36:07 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 1.[0m
2024-06-01 16:36:07 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (20631, 26)[0m
2024-06-01 16:36:07 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (13096, 26)[0m
2024-06-01 16:36:07 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


In [12]:
train_data_2, test_data_2, test_RUL_data_2 = load_data(config_path=PATH_TO_CONFIG, dataset_num=2)

2024-06-01 16:36:09 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 2...[0m
2024-06-01 16:36:09 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 2.[0m
2024-06-01 16:36:09 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (53759, 26)[0m
2024-06-01 16:36:09 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (33991, 26)[0m
2024-06-01 16:36:09 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (259, 1)[0m


In [13]:
train_data_3, test_data_3, test_RUL_data_3 = load_data(config_path=PATH_TO_CONFIG, dataset_num=3)

2024-06-01 16:36:10 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 3...[0m
2024-06-01 16:36:10 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 3.[0m
2024-06-01 16:36:10 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (24720, 26)[0m
2024-06-01 16:36:10 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (16596, 26)[0m
2024-06-01 16:36:10 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


In [14]:
train_data_4, test_data_4, test_RUL_data_4 = load_data(config_path=PATH_TO_CONFIG, dataset_num=4)

2024-06-01 16:36:11 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 4...[0m
2024-06-01 16:36:12 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 4.[0m
2024-06-01 16:36:12 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (61249, 26)[0m
2024-06-01 16:36:12 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (41214, 26)[0m
2024-06-01 16:36:12 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (248, 1)[0m


In [15]:
train_data = [train_data_1, train_data_2, train_data_3, train_data_4]
test_data = [test_data_1, test_data_2, test_data_3, test_data_4]

# 📍 << Models >>

Data Cleaning

In [16]:
# anwenden der Funktionen auf die Daten
train_data_1 = format_dtype(train_data_1)
train_data_2 = format_dtype(train_data_2)
train_data_3 = format_dtype(train_data_3)
train_data_4 = format_dtype(train_data_4)
train_data = [train_data_1, train_data_2, train_data_3, train_data_4]

2024-06-01 16:36:14 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 16:36:14 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 16:36:14 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 16:36:14 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


In [17]:
#anwenden der clean_data Funktion auf alle Datensätze
cleaned_train_1, cleaned_test_1 = clean_data(train_data_1, test_data_1, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.3)
cleaned_train_2, cleaned_test_2 = clean_data(train_data_2, test_data_2, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.0, threshold_corr=0.0)
cleaned_train_3, cleaned_test_3 = clean_data(train_data_3, test_data_3, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.3)
cleaned_train_4, cleaned_test_4 = clean_data(train_data_4, test_data_4, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.0, threshold_corr=0.0)

cleaned_train = [cleaned_train_1, cleaned_train_2, cleaned_train_3, cleaned_train_4]
cleaned_test = [cleaned_test_1, cleaned_test_2, cleaned_test_3, cleaned_test_4]



2024-06-01 16:36:15 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-06-01 16:36:15 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-06-01 16:36:15 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 16:36:15 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 16:36:15 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m
2024-06-01 16:36:15 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m
2024-06-01 16:36:15 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m
2024-06-01 16:36:15 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m
2024-06-01 16:36:15 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: N

In [18]:
#ausgeben der gecleanten Daten
print(cleaned_train_1.shape)
print(cleaned_test_1.columns)
print(cleaned_train_2.shape)
print(cleaned_test_2.columns)
print(cleaned_train_3.shape)
print(cleaned_test_3.columns)
print(cleaned_train_4.shape)
print(cleaned_test_4.columns)


(20631, 16)
Index(['UnitNumber', 'Cycle', 'Sensor Measure 2', 'Sensor Measure 3',
       'Sensor Measure 4', 'Sensor Measure 7', 'Sensor Measure 8',
       'Sensor Measure 9', 'Sensor Measure 11', 'Sensor Measure 12',
       'Sensor Measure 13', 'Sensor Measure 14', 'Sensor Measure 15',
       'Sensor Measure 17', 'Sensor Measure 20', 'Sensor Measure 21'],
      dtype='object')
(53759, 26)
Index(['UnitNumber', 'Cycle', 'Operation Setting 1', 'Operation Setting 2',
       'Operation Setting 3', 'Sensor Measure 1', 'Sensor Measure 2',
       'Sensor Measure 3', 'Sensor Measure 4', 'Sensor Measure 5',
       'Sensor Measure 6', 'Sensor Measure 7', 'Sensor Measure 8',
       'Sensor Measure 9', 'Sensor Measure 10', 'Sensor Measure 11',
       'Sensor Measure 12', 'Sensor Measure 13', 'Sensor Measure 14',
       'Sensor Measure 15', 'Sensor Measure 16', 'Sensor Measure 17',
       'Sensor Measure 18', 'Sensor Measure 19', 'Sensor Measure 20',
       'Sensor Measure 21'],
      dtype='object

Feature Engineering

In [19]:
# Currently using minimal to ease optimization so feature_list is not necessary
feature_list = []

# feature_list for dataset 1
# TODO: create function to make variable for each dataset to ease optimization
currentpath = os.getcwd()
ft_list = pd.read_pickle(currentpath+ "/data/processed/feature_list.pkl")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/niklasquendt/Documents/Uni/PSDA/Übung 2/damage-propagation-modeling/data/processed/feature_list.pkl'

In [19]:
min_ts = 5
max_ts = 15

Windowing

In [20]:
rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts,min_timeshift=min_ts,feature_extraction_mode= 'minimal')


In [21]:
#X_train_1, y_train_1, X_test_1, y_test_1 = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train_1, test_data=cleaned_test_1,test_RUL_data=test_RUL_data_1,)

from sklearn.model_selection import train_test_split

# Erstellen  die Rollfenster-Datensätze
X_train_1, y_train_1, X_test_1, y_test_1 = rwCreator.create_rolling_windows_datasets(
    train_data=cleaned_train_1, 
    test_data=cleaned_test_1,
    test_RUL_data=test_RUL_data_1,
)

# Teilt  die Trainingsdaten in Trainings- und Validierungsdaten auf
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(
    X_train_1, 
    y_train_1, 
    test_size=0.2,  # 20% der Daten werden für die Validierung verwendet
    random_state=42  
)



2024-06-01 16:36:27 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


Rolling: 100%|██████████| 20/20 [00:02<00:00,  8.20it/s]


2024-06-01 16:36:30 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


Feature Extraction: 100%|██████████| 20/20 [00:22<00:00,  1.10s/it]


2024-06-01 16:36:55 [[34msrc.rolling_window_creator:131[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m
2024-06-01 16:36:55 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


Rolling: 100%|██████████| 19/19 [00:02<00:00,  8.73it/s]

2024-06-01 16:36:57 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m



Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 15.31it/s]

2024-06-01 16:36:58 [[34msrc.rolling_window_creator:159[0m] [[32mINFO[0m] >>>> Datasets created successfully.[0m
2024-06-01 16:36:58 [[34msrc.rolling_window_creator:160[0m] [[32mINFO[0m] >>>> Shape of X_train: (20131, 140)[0m
2024-06-01 16:36:58 [[34msrc.rolling_window_creator:161[0m] [[32mINFO[0m] >>>> Shape of y_train: (20131, 1)[0m
2024-06-01 16:36:58 [[34msrc.rolling_window_creator:162[0m] [[32mINFO[0m] >>>> Shape of X_test: (100, 140)[0m
2024-06-01 16:36:58 [[34msrc.rolling_window_creator:163[0m] [[32mINFO[0m] >>>> Shape of y_test: (100, 1)[0m





In [22]:
X_train_2, y_train_2, X_test_2, y_test_2 = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train_2, test_data=cleaned_test_2,test_RUL_data=test_RUL_data_2)



2024-06-01 16:36:58 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


Rolling: 100%|██████████| 20/20 [00:06<00:00,  3.19it/s]


2024-06-01 16:37:05 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


Feature Extraction: 100%|██████████| 20/20 [02:40<00:00,  8.01s/it]


2024-06-01 16:40:04 [[34msrc.rolling_window_creator:131[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m
2024-06-01 16:40:05 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


Rolling: 100%|██████████| 20/20 [00:06<00:00,  3.27it/s]


2024-06-01 16:40:11 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 11.57it/s]


2024-06-01 16:40:13 [[34msrc.rolling_window_creator:159[0m] [[32mINFO[0m] >>>> Datasets created successfully.[0m
2024-06-01 16:40:13 [[34msrc.rolling_window_creator:160[0m] [[32mINFO[0m] >>>> Shape of X_train: (52459, 240)[0m
2024-06-01 16:40:13 [[34msrc.rolling_window_creator:161[0m] [[32mINFO[0m] >>>> Shape of y_train: (52459, 1)[0m
2024-06-01 16:40:13 [[34msrc.rolling_window_creator:162[0m] [[32mINFO[0m] >>>> Shape of X_test: (259, 240)[0m
2024-06-01 16:40:13 [[34msrc.rolling_window_creator:163[0m] [[32mINFO[0m] >>>> Shape of y_test: (259, 1)[0m


In [23]:
X_train_2, X_val_2, y_train_2, y_val_2 = train_test_split(
    X_train_2, 
    y_train_2, 
    test_size=0.2,  # 20% der Daten werden für die Validierung verwendet
    random_state=42  )

In [24]:
X_train_3, y_train_3, X_test_3, y_test_3 = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train_3, test_data=cleaned_train_3,test_RUL_data=test_RUL_data_3,)

# Teile die Trainingsdaten in Trainings- und Validierungsdaten auf
X_train_3, X_val_3, y_train_3, y_val_3 = train_test_split(
    X_train_3, 
    y_train_3, 
    test_size=0.2,  # 20% der Daten werden für Validierung verwendet
    random_state=42  
)

2024-06-01 16:40:13 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


Rolling: 100%|██████████| 20/20 [00:04<00:00,  4.61it/s]


2024-06-01 16:40:18 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


Feature Extraction: 100%|██████████| 20/20 [00:28<00:00,  1.42s/it]


2024-06-01 16:40:50 [[34msrc.rolling_window_creator:131[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m
2024-06-01 16:40:50 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


Rolling: 100%|██████████| 20/20 [00:04<00:00,  4.94it/s]


2024-06-01 16:40:55 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 11.53it/s]

2024-06-01 16:40:56 [[34msrc.rolling_window_creator:159[0m] [[32mINFO[0m] >>>> Datasets created successfully.[0m
2024-06-01 16:40:56 [[34msrc.rolling_window_creator:160[0m] [[32mINFO[0m] >>>> Shape of X_train: (24220, 120)[0m
2024-06-01 16:40:56 [[34msrc.rolling_window_creator:161[0m] [[32mINFO[0m] >>>> Shape of y_train: (24220, 1)[0m
2024-06-01 16:40:56 [[34msrc.rolling_window_creator:162[0m] [[32mINFO[0m] >>>> Shape of X_test: (100, 120)[0m
2024-06-01 16:40:56 [[34msrc.rolling_window_creator:163[0m] [[32mINFO[0m] >>>> Shape of y_test: (100, 1)[0m





In [56]:
X_train_4, y_train_4, X_test_4, y_test_4 = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train_4, test_data=cleaned_train_4,test_RUL_data=test_RUL_data_4,)

# Teile die Trainingsdaten in Trainings- und Validierungsdaten auf
X_train_4, X_val_4, y_train_4, y_val_4 = train_test_split(
    X_train_4, 
    y_train_4, 
    test_size=0.2,  # 20% der Daten werden für Validierung verwendet
    random_state=42  
)

2024-06-01 10:08:37 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


Rolling: 100%|██████████| 20/20 [00:05<00:00,  3.58it/s]


2024-06-01 10:08:43 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


Feature Extraction: 100%|██████████| 20/20 [02:09<00:00,  6.48s/it]


2024-06-01 10:11:12 [[34msrc.rolling_window_creator:131[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m
2024-06-01 10:11:12 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


Rolling: 100%|██████████| 20/20 [00:05<00:00,  3.43it/s]


2024-06-01 10:11:18 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


Feature Extraction: 100%|██████████| 20/20 [00:01<00:00, 15.31it/s]


ValueError: Length mismatch: Expected axis has 248 elements, new values have 249 elements

Import of used classifiers

In [25]:
import sklearn
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import accuracy_score,f1_score
from sklearn.svm import SVR
#from bayes_opt import BayesianOptimization
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVR
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,f1_score, root_mean_squared_error


klassische Ansätze

Data Scaling

In [26]:
# Standardisieren der Daten
scaler_1 = StandardScaler()
X_train_scaled_1 = scaler_1.fit_transform(X_train_1)
X_val_scaled_1 = scaler_1.transform(X_val_1)
X_test_scaled_1 = scaler_1.transform(X_test_1)

scaler_2 = StandardScaler()
X_train_scaled_2 = scaler_2.fit_transform(X_train_2)
X_val_scaled_2 = scaler_2.transform(X_val_2)
X_test_scaled_2 = scaler_2.transform(X_test_2)

scaler_3 = StandardScaler()
X_train_scaled_3 = scaler_3.fit_transform(X_train_3)
X_val_scaled_3 = scaler_3.transform(X_val_3)
X_test_scaled_3 = scaler_3.transform(X_test_3)





Classifier

AutoML with TPOT


In [93]:
#tpot
import pandas as pd
from sklearn.model_selection import train_test_split
from tpot import TPOTRegressor

# TPOT
tpot = TPOTRegressor(generations=3, population_size=20, cv=3, verbosity=2, random_state=42)

tpot.fit(X_train_scaled_1, y_train_1)

print(tpot.score(X_test_scaled_1, y_test_1))

tpot.export('best_model_pipeline.py')

  y = column_or_1d(y, warn=True)


                                                                              
                                                                            
TPOT closed during evaluation in one generation.
                                                                            
                                                                            
TPOT closed prematurely. Will use the current best pipeline.
                                                                            

In [28]:
print(tpot.fitted_pipeline_)


NameError: name 'tpot' is not defined

Classifiers

SVM

In [50]:
# Support Vector Machine implementieren
svm_1 = SVR(kernel='linear', random_state=42)
svm_3 = SVR(kernel='linear', random_state=42)

# Cross-Validation
#svm_cv_scores = cross_val_score(svm, X_train_scaled, y_train, cv=5)
#print(f"SVC Cross-Validation Accuracy: {svm_cv_scores.mean():.4f} ± {svm_cv_scores.std():.4f}")

# Training
svm_1.fit(X_train_scaled_1, y_train_1)
svm_3.fit(X_train_scaled_3, y_train_3)

# Vorhersagen und Bewerten des SVM Classifiers auf den Validierungsdaten
svm_predictions_1 = svm_1.predict(X_val_scaled_1)
print(sklearn.metrics.root_mean_squared_error(y_val_1, svm_predictions_1))

svm_predictions_3 = svm_1.predict(X_val_scaled_3)
print(sklearn.metrics.root_mean_squared_error(y_val_3, svm_predictions_3))

#print("SVC Training Accuracy:", accuracy_score(y_test, svm_predictions))
#print("SVC Training Classification Report:\n", classification_report(y_test, svm_predictions))
#print("="*60)

TypeError: SVR.__init__() got an unexpected keyword argument 'random_state'

Extra Trees


In [27]:
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier

# ExtraTreesRegressor implementieren
extra_trees = ExtraTreesRegressor(n_estimators=140, max_depth= 15, min_samples_leaf=5, random_state=42)

# Training
extra_trees.fit(X_train_scaled_1, y_train_1)

# Vorhersagen und Bewerten des ExtraTreesRegressor
et_predictions_1 = extra_trees.predict(X_val_scaled_1)
print(sklearn.metrics.root_mean_squared_error(y_val_1, et_predictions_1))

# Training des ExtraTreesRegressor auf den Trainingsdaten
extra_trees.fit(X_train_scaled_3, y_train_3)

# Vorhersagen und Bewerten des ExtraTreeRegressor auf den Validierungsdaten
et_predictions_3 = extra_trees.predict(X_val_scaled_3)
print(sklearn.metrics.root_mean_squared_error(y_val_3, et_predictions_3))

# Vorhersage auf den Testdaten
#et_test_predictions_1 = extra_trees.predict(X_test_scaled_1)
#print(sklearn.metrics.root_mean_squared_error(y_test_1, et_test_predictions_1))

#et_test_predictions_3 = extra_trees.predict(X_test_scaled_3)
#print(sklearn.metrics.root_mean_squared_error(y_test_3, et_test_predictions_3))


  return fit_method(estimator, *args, **kwargs)


15.19112355795682


  return fit_method(estimator, *args, **kwargs)


20.945102403706947


In [30]:
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
#ExtraTreesRegressor implementieren
extra_trees = ExtraTreesRegressor(n_estimators=140, max_depth= 15, min_samples_leaf=5, random_state=42)
#extra_trees = ExtraTreesRegressor(n_estimators=151, max_depth= 30, min_samples_leaf=1, random_state=42)

#Beste Hyperparameter-Kombination: {'max_depth': 30.0, 'min_samples_leaf': 1.0, 'min_samples_split': 2.0, 'n_estimators': 151.63447537285498}

# Training
extra_trees.fit(X_train_scaled_2
                , y_train_2)

# Vorhersagen und Bewerten des ExtraTreesRegressor
et_predictions_2 = extra_trees.predict(X_val_scaled_2)
print(sklearn.metrics.root_mean_squared_error(y_val_2, et_predictions_2))



[autoreload of scipy._lib._array_api failed: Traceback (most recent call last):
  File "/Users/niklasquendt/Library/Python/3.12/lib/python/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/Users/niklasquendt/Library/Python/3.12/lib/python/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
             ^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/importlib/__init__.py", line 131, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 860, in _exec
  File "<frozen importlib._bootstrap_external>", line 994, in exec_module
  File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/scipy/_lib/_array_api.py", line 17, in <module>
    from scipy._lib.array_api_compat import (
ImportError:

23.26411858995878


In [27]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split

# Define the function to optimize
def evaluate_model(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    # Make sure parameters are integer
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    min_samples_split = int(min_samples_split)
    min_samples_leaf = int(min_samples_leaf)
    
    # Define the model with the parameters
    model = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=max_depth, 
                                min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, 
                                random_state=42, n_jobs=-1)
    
    # Fit and predict
    model.fit(X_train_1, y_train_1)
    pred = model.predict(X_val_1)
    
    # Calculate RMSE
    rmse = mean_squared_error(y_val_1, pred, squared=False)
    
    # We want to minimize RMSE, so we return the negative value
    return -rmse


# Define the bounds of the parameters
param_bounds = {
    'n_estimators': (100, 200),
    'max_depth': (10, 30),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4)
}
# Create the BayesianOptimization object
optimizer = BayesianOptimization(
    f=evaluate_model,
    pbounds=param_bounds,
    random_state=42,
)

# Perform the optimization
optimizer.maximize(init_points=5, n_iter=12)

# Print the best parameters
print("Beste Hyperparameter-Kombination:", optimizer.max['params'])



|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


| [0m1        [0m | [0m-11.46   [0m | [0m17.49    [0m | [0m3.852    [0m | [0m7.856    [0m | [0m159.9    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m2        [0m | [0m-18.3    [0m | [0m13.12    [0m | [0m1.468    [0m | [0m2.465    [0m | [0m186.6    [0m |


  return fit_method(estimator, *args, **kwargs)


| [95m3        [0m | [95m-9.02    [0m | [95m22.02    [0m | [95m3.124    [0m | [95m2.165    [0m | [95m197.0    [0m |


  return fit_method(estimator, *args, **kwargs)


| [95m4        [0m | [95m-7.457   [0m | [95m26.65    [0m | [95m1.637    [0m | [95m3.455    [0m | [95m118.3    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m5        [0m | [0m-12.43   [0m | [0m16.08    [0m | [0m2.574    [0m | [0m5.456    [0m | [0m129.1    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m6        [0m | [0m-8.913   [0m | [0m24.46    [0m | [0m3.004    [0m | [0m3.783    [0m | [0m106.4    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m7        [0m | [0m-8.192   [0m | [0m26.89    [0m | [0m2.389    [0m | [0m2.85     [0m | [0m117.6    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m8        [0m | [0m-8.556   [0m | [0m24.96    [0m | [0m1.0      [0m | [0m10.0     [0m | [0m116.2    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m9        [0m | [0m-8.06    [0m | [0m30.0     [0m | [0m1.0      [0m | [0m7.529    [0m | [0m124.3    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m10       [0m | [0m-9.713   [0m | [0m30.0     [0m | [0m4.0      [0m | [0m9.384    [0m | [0m200.0    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m11       [0m | [0m-15.8    [0m | [0m14.63    [0m | [0m1.0      [0m | [0m2.0      [0m | [0m112.0    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m12       [0m | [0m-8.433   [0m | [0m30.0     [0m | [0m1.0      [0m | [0m10.0     [0m | [0m106.6    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m13       [0m | [0m-10.03   [0m | [0m24.4     [0m | [0m4.0      [0m | [0m10.0     [0m | [0m100.0    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m14       [0m | [0m-7.517   [0m | [0m29.97    [0m | [0m1.544    [0m | [0m2.512    [0m | [0m136.2    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m15       [0m | [0m-8.31    [0m | [0m30.0     [0m | [0m1.0      [0m | [0m10.0     [0m | [0m141.8    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m16       [0m | [0m-9.406   [0m | [0m30.0     [0m | [0m4.0      [0m | [0m2.0      [0m | [0m146.0    [0m |
| [0m17       [0m | [0m-8.738   [0m | [0m28.57    [0m | [0m3.978    [0m | [0m2.053    [0m | [0m126.7    [0m |
Beste Hyperparameter-Kombination: {'max_depth': 26.648852816008436, 'min_samples_leaf': 1.6370173320348285, 'min_samples_split': 3.454599737656805, 'n_estimators': 118.34045098534338}


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


KeyboardInterrupt: 

In [29]:
# Vorhersagen auf den Testdaten
#Beste Hyperparameter-Kombination: {'max_depth': 26.648852816008436, 'min_samples_leaf': 1.6370173320348285, 'min_samples_split': 3.454599737656805, 'n_estimators': 118.34045098534338}
extra_trees = ExtraTreesRegressor(n_estimators=118, max_depth=27, min_samples_split=3, min_samples_leaf=2, random_state=42)
# Training
extra_trees.fit(X_train_scaled_1, y_train_1)
test_predictions = extra_trees.predict(X_test_scaled_1)
test_rmse = np.sqrt(sklearn.metrics.root_mean_squared_error(y_test_1, test_predictions))
print(f"Test RMSE: {test_rmse:.4f}")

  return fit_method(estimator, *args, **kwargs)


Test RMSE: 7.9324


In [31]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
#Bayesian Optimzier für Dataset 2
# Define the function to optimize
def evaluate_model(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    # Make sure parameters are integer
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    min_samples_split = int(min_samples_split)
    min_samples_leaf = int(min_samples_leaf)
    
    # Define the model with the parameters
    model = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=max_depth, 
                                min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, 
                                random_state=42, n_jobs=-1)
    
    # Fit and predict
    model.fit(X_train_2, y_train_2)
    pred = model.predict(X_val_2)
    
    # Calculate RMSE
    rmse = mean_squared_error(y_val_2, pred, squared=False)
    
    # We want to minimize RMSE, so we return the negative value
    return -rmse


# Define the bounds of the parameters
param_bounds = {
    'n_estimators': (100, 200),
    'max_depth': (10, 30),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4)
}
# Create the BayesianOptimization object
optimizer = BayesianOptimization(
    f=evaluate_model,
    pbounds=param_bounds,
    random_state=42,
)

# Perform the optimization
optimizer.maximize(init_points=5, n_iter=12)

# Print the best parameters
print("Beste Hyperparameter-Kombination:", optimizer.max['params'])



|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


| [0m1        [0m | [0m-18.42   [0m | [0m17.49    [0m | [0m3.852    [0m | [0m7.856    [0m | [0m159.9    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m2        [0m | [0m-27.89   [0m | [0m13.12    [0m | [0m1.468    [0m | [0m2.465    [0m | [0m186.6    [0m |


  return fit_method(estimator, *args, **kwargs)


| [95m3        [0m | [95m-12.65   [0m | [95m22.02    [0m | [95m3.124    [0m | [95m2.165    [0m | [95m197.0    [0m |


  return fit_method(estimator, *args, **kwargs)


| [95m4        [0m | [95m-10.22   [0m | [95m26.65    [0m | [95m1.637    [0m | [95m3.455    [0m | [95m118.3    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m5        [0m | [0m-20.09   [0m | [0m16.08    [0m | [0m2.574    [0m | [0m5.456    [0m | [0m129.1    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m6        [0m | [0m-12.07   [0m | [0m24.46    [0m | [0m3.004    [0m | [0m3.783    [0m | [0m106.4    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m7        [0m | [0m-10.29   [0m | [0m26.4     [0m | [0m1.0      [0m | [0m4.09     [0m | [0m119.2    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m8        [0m | [0m-11.47   [0m | [0m29.38    [0m | [0m2.609    [0m | [0m9.273    [0m | [0m112.8    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m9        [0m | [0m-12.73   [0m | [0m30.0     [0m | [0m4.0      [0m | [0m10.0     [0m | [0m200.0    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m10       [0m | [0m-35.6    [0m | [0m10.0     [0m | [0m1.0      [0m | [0m10.0     [0m | [0m100.0    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m11       [0m | [0m-11.0    [0m | [0m30.0     [0m | [0m1.0      [0m | [0m10.0     [0m | [0m141.5    [0m |




| [0m12       [0m | [0m-12.37   [0m | [0m30.0     [0m | [0m4.0      [0m | [0m2.0      [0m | [0m131.5    [0m |


  return fit_method(estimator, *args, **kwargs)


| [95m13       [0m | [95m-9.968   [0m | [95m30.0     [0m | [95m1.0      [0m | [95m2.0      [0m | [95m151.6    [0m |


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


| [0m14       [0m | [0m-10.09   [0m | [0m30.0     [0m | [0m1.0      [0m | [0m2.0      [0m | [0m100.0    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m15       [0m | [0m-10.98   [0m | [0m30.0     [0m | [0m1.0      [0m | [0m10.0     [0m | [0m168.7    [0m |




| [0m16       [0m | [0m-12.31   [0m | [0m30.0     [0m | [0m4.0      [0m | [0m2.0      [0m | [0m183.6    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m17       [0m | [0m-12.75   [0m | [0m30.0     [0m | [0m4.0      [0m | [0m10.0     [0m | [0m155.4    [0m |
Beste Hyperparameter-Kombination: {'max_depth': 30.0, 'min_samples_leaf': 1.0, 'min_samples_split': 2.0, 'n_estimators': 151.63447537285498}




In [65]:
#vorhersagen auf den Testdaten
#Beste Hyperparameter-Kombination: {'max_depth': 30.0, 'min_samples_leaf': 1.0, 'min_samples_split': 2.0, 'n_estimators': 151.63447537285498}
extra_trees = ExtraTreesRegressor(n_estimators=140, max_depth= 15, min_samples_leaf=5, random_state=42)

#extra_trees = ExtraTreesRegressor(n_estimators=112, max_depth=15, min_samples_split=4, min_samples_leaf=2, random_state=42)
# Training
extra_trees.fit(X_train_scaled_2, y_train_2)
test_predictions = extra_trees.predict(X_test_scaled_2)
test_rmse = np.sqrt(sklearn.metrics.root_mean_squared_error(y_test_2, test_predictions))
print(f"Test RMSE: {test_rmse:.4f}")



  return fit_method(estimator, *args, **kwargs)


Test RMSE: 8.7472


In [38]:
#Bayesian Optimzier für Dataset 3
# Define the function to optimize
def evaluate_model(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    # Make sure parameters are integer
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    min_samples_split = int(min_samples_split)
    min_samples_leaf = int(min_samples_leaf)
    
    # Define the model with the parameters
    model = ExtraTreesRegressor(n_estimators=n_estimators, max_depth=max_depth, 
                                min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, 
                                random_state=42, n_jobs=-1)
    
    # Fit and predict
    model.fit(X_train_3, y_train_3)
    pred = model.predict(X_val_3)
    
    # Calculate RMSE
    rmse = mean_squared_error(y_val_3, pred, squared=False)
    
    # We want to minimize RMSE, so we return the negative value
    return -rmse


# Define the bounds of the parameters
param_bounds = {
    'n_estimators': (100, 200),
    'max_depth': (10, 30),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4)
}
# Create the BayesianOptimization object
optimizer = BayesianOptimization(
    f=evaluate_model,
    pbounds=param_bounds,
    random_state=42,
)

# Perform the optimization
optimizer.maximize(init_points=5, n_iter=12)

# Print the best parameters
print("Beste Hyperparameter-Kombination:", optimizer.max['params'])



|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


| [0m1        [0m | [0m-16.14   [0m | [0m17.49    [0m | [0m3.852    [0m | [0m7.856    [0m | [0m159.9    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m2        [0m | [0m-24.69   [0m | [0m13.12    [0m | [0m1.468    [0m | [0m2.465    [0m | [0m186.6    [0m |


  return fit_method(estimator, *args, **kwargs)


| [95m3        [0m | [95m-12.57   [0m | [95m22.02    [0m | [95m3.124    [0m | [95m2.165    [0m | [95m197.0    [0m |


  return fit_method(estimator, *args, **kwargs)


| [95m4        [0m | [95m-10.34   [0m | [95m26.65    [0m | [95m1.637    [0m | [95m3.455    [0m | [95m118.3    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m5        [0m | [0m-16.99   [0m | [0m16.08    [0m | [0m2.574    [0m | [0m5.456    [0m | [0m129.1    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m6        [0m | [0m-12.47   [0m | [0m24.46    [0m | [0m3.004    [0m | [0m3.783    [0m | [0m106.4    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m7        [0m | [0m-11.23   [0m | [0m26.88    [0m | [0m2.375    [0m | [0m2.861    [0m | [0m117.6    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m8        [0m | [0m-11.82   [0m | [0m24.76    [0m | [0m1.0      [0m | [0m10.0     [0m | [0m115.9    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m9        [0m | [0m-11.06   [0m | [0m30.0     [0m | [0m1.0      [0m | [0m8.069    [0m | [0m124.4    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m10       [0m | [0m-13.79   [0m | [0m30.0     [0m | [0m4.0      [0m | [0m10.0     [0m | [0m200.0    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m11       [0m | [0m-32.55   [0m | [0m11.61    [0m | [0m1.0      [0m | [0m3.345    [0m | [0m107.5    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m12       [0m | [0m-13.88   [0m | [0m30.0     [0m | [0m4.0      [0m | [0m10.0     [0m | [0m100.0    [0m |


  return fit_method(estimator, *args, **kwargs)


| [95m13       [0m | [95m-10.29   [0m | [95m30.0     [0m | [95m1.0      [0m | [95m2.0      [0m | [95m136.5    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m14       [0m | [0m-11.62   [0m | [0m30.0     [0m | [0m1.0      [0m | [0m10.0     [0m | [0m144.4    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m15       [0m | [0m-12.65   [0m | [0m22.45    [0m | [0m3.181    [0m | [0m2.31     [0m | [0m145.8    [0m |


  return fit_method(estimator, *args, **kwargs)


| [95m16       [0m | [95m-10.26   [0m | [95m30.0     [0m | [95m1.0      [0m | [95m2.0      [0m | [95m155.4    [0m |
| [95m17       [0m | [95m-10.19   [0m | [95m30.0     [0m | [95m1.0      [0m | [95m2.0      [0m | [95m168.0    [0m |
Beste Hyperparameter-Kombination: {'max_depth': 30.0, 'min_samples_leaf': 1.0, 'min_samples_split': 2.0, 'n_estimators': 167.97080791958393}




In [40]:
#vorhersagen auf den Testdaten
#Beste Hyperparameter-Kombination: {'max_depth': 30.0, 'min_samples_leaf': 1.0, 'min_samples_split': 2.0, 'n_estimators': 167.97080791958393}
extra_trees = ExtraTreesRegressor(n_estimators=168, max_depth=30, min_samples_split=2, min_samples_leaf=1, random_state=42)
#X_test_scaled_3 = X_test_scaled_3[:, :120] #Problem war, dass die Anzahl der Features in den Testdaten nicht mit denen in den Trainingsdaten übereinstimmte
# Training
extra_trees.fit(X_train_scaled_3, y_train_3)
test_predictions = extra_trees.predict(X_test_scaled_3)
test_rmse = np.sqrt(sklearn.metrics.root_mean_squared_error(y_test_3, test_predictions))
print(f"Test RMSE: {test_rmse:.4f}")



  return fit_method(estimator, *args, **kwargs)


Test RMSE: 8.7817


MLP

In [30]:
from sklearn.model_selection import cross_val_score
# MLPClassifier implementieren
mlp_1 = MLPRegressor(hidden_layer_sizes=(100,50,25), alpha=0.001,activation='relu', early_stopping=True, random_state=42)
#mlp_3 = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)

# Cross-Validation
#mlp_cv_scores = cross_val_score(mlp, X_train_scaled, y_train, cv=5)
#print(f"MLPClassifier Cross-Validation Accuracy: {mlp_cv_scores.mean():.4f} ± {mlp_cv_scores.std():.4f}")

mlp_1.fit(X_train_scaled_1, y_train_1)
#mlp_3.fit(X_train_scaled_3, y_train_3)

# Vorhersagen und Bewerten des MLPClassifiers
mlp_predictions_1 = mlp_1.predict(X_test_scaled_1)
print(sklearn.metrics.root_mean_squared_error(y_test_1, mlp_predictions_1))

#mlp_predictions_3 = mlp_3.predict(X_test_scaled_3)
#print(sklearn.metrics.root_mean_squared_error(y_test_3, mlp_predictions_3))


  y = column_or_1d(y, warn=True)


79.66455642537167


In [31]:
#Bayes Optimizer für MLP
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization

# Define the function to optimize
def evaluate_model(learning_rate_init, alpha, hidden_layer_sizes):
    # Make sure parameters are integer
    hidden_layer_sizes = int(hidden_layer_sizes)
    alpha = int(alpha)
    
    # Define the model with the parameters
    model = MLPRegressor(hidden_layer_sizes=(100,50,25), alpha=0.001,activation='relu', early_stopping=True, random_state=42)
    
    # Fit and predict
    model.fit(X_train_1, y_train_1)
    pred = model.predict(X_val_1)
    
    # Calculate RMSE
    rmse = mean_squared_error(y_val_1, pred, squared=False)
    
    # We want to minimize RMSE, so we return the negative value
    return -rmse


# Define the bounds of the parameters
param_bounds = {
    'learning_rate_init': (0.001, 0.1),
    'alpha': (0.0001, 0.1),
    'hidden_layer_sizes': (1, 100)
}
# Create the BayesianOptimization object
optimizer = BayesianOptimization(
    f=evaluate_model,
    pbounds=param_bounds,
    random_state=42,
)

# Perform the optimization
optimizer.maximize(init_points=5, n_iter=12)

# Print the best parameters
print("Beste Hyperparameter-Kombination:", optimizer.max['params'])

# Vorhersagen auf den Testdaten
#Beste Hyperparameter-Kombination: {'alpha': 0.1, 'hidden_layer_sizes': 100.0, 'learning_rate_init': 0.1}
mlp = MLPRegressor(hidden_layer_sizes=(100,50,25), alpha=0.001,activation='relu', early_stopping=True, random_state=42)
# Training
mlp.fit(X_train_1, y_train_1)
test_predictions = mlp.predict(X_test_1)
test_rmse = np.sqrt(mean_squared_error(y_test_1, test_predictions))
print(f"Test RMSE: {test_rmse:.4f}")


|   iter    |  target   |   alpha   | hidden... | learni... |
-------------------------------------------------------------


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


| [0m1        [0m | [0m-61.17   [0m | [0m0.03752  [0m | [0m95.12    [0m | [0m0.07347  [0m |


  y = column_or_1d(y, warn=True)


| [0m2        [0m | [0m-61.17   [0m | [0m0.05991  [0m | [0m16.45    [0m | [0m0.01644  [0m |


  y = column_or_1d(y, warn=True)


| [0m3        [0m | [0m-61.17   [0m | [0m0.005903 [0m | [0m86.75    [0m | [0m0.06051  [0m |


  y = column_or_1d(y, warn=True)


| [0m4        [0m | [0m-61.17   [0m | [0m0.07084  [0m | [0m3.038    [0m | [0m0.09702  [0m |


  y = column_or_1d(y, warn=True)


| [0m5        [0m | [0m-61.17   [0m | [0m0.08326  [0m | [0m22.02    [0m | [0m0.019    [0m |


  y = column_or_1d(y, warn=True)


| [0m6        [0m | [0m-61.17   [0m | [0m0.07029  [0m | [0m99.99    [0m | [0m0.04409  [0m |


  y = column_or_1d(y, warn=True)


| [0m7        [0m | [0m-61.17   [0m | [0m0.04876  [0m | [0m1.007    [0m | [0m0.08898  [0m |


  y = column_or_1d(y, warn=True)


| [0m8        [0m | [0m-61.17   [0m | [0m0.03259  [0m | [0m48.43    [0m | [0m0.01514  [0m |


  y = column_or_1d(y, warn=True)


| [0m9        [0m | [0m-61.17   [0m | [0m0.09958  [0m | [0m99.99    [0m | [0m0.02966  [0m |


  y = column_or_1d(y, warn=True)


| [0m10       [0m | [0m-61.17   [0m | [0m0.08942  [0m | [0m86.29    [0m | [0m0.04191  [0m |


  y = column_or_1d(y, warn=True)


| [0m11       [0m | [0m-61.17   [0m | [0m0.02947  [0m | [0m1.011    [0m | [0m0.02486  [0m |


  y = column_or_1d(y, warn=True)


| [0m12       [0m | [0m-61.17   [0m | [0m0.01632  [0m | [0m1.005    [0m | [0m0.01883  [0m |


  y = column_or_1d(y, warn=True)


| [0m13       [0m | [0m-61.17   [0m | [0m0.02236  [0m | [0m99.99    [0m | [0m0.09131  [0m |


  y = column_or_1d(y, warn=True)


| [0m14       [0m | [0m-61.17   [0m | [0m0.06107  [0m | [0m22.92    [0m | [0m0.09764  [0m |


  y = column_or_1d(y, warn=True)


| [0m15       [0m | [0m-61.17   [0m | [0m0.004182 [0m | [0m49.4     [0m | [0m0.05957  [0m |


  y = column_or_1d(y, warn=True)


| [0m16       [0m | [0m-61.17   [0m | [0m0.0309   [0m | [0m86.25    [0m | [0m0.01029  [0m |


  y = column_or_1d(y, warn=True)


| [0m17       [0m | [0m-61.17   [0m | [0m0.09067  [0m | [0m1.018    [0m | [0m0.0994   [0m |
Beste Hyperparameter-Kombination: {'alpha': 0.03751655787285152, 'hidden_layer_sizes': 95.1207163345817, 'learning_rate_init': 0.07346740023932911}
Test RMSE: 64.9884


In [32]:
#Vorhersage auf den Testdaten
#Beste Hyperparameter-Kombination: {'alpha': 0.03751655787285152, 'hidden_layer_sizes': 95.1207163345817, 'learning_rate_init': 0.07346740023932911}
mlp = MLPRegressor(hidden_layer_sizes=(95,50,25), alpha=0.0375,activation='relu', learning_rate_init=0.07346, early_stopping=True, random_state=42)
# Training
mlp.fit(X_train_1, y_train_1)
test_predictions = mlp.predict(X_test_1)
test_rmse = np.sqrt(mean_squared_error(y_test_1, test_predictions))
print(f"Test RMSE: {test_rmse:.4f}")


  y = column_or_1d(y, warn=True)


Test RMSE: 47.7990


Random Forests

In [37]:
#RandomForestREgreossor implementieren

# Random Forest Regressor implementieren
from sklearn.ensemble import RandomForestRegressor

rf_1 = RandomForestRegressor(n_estimators=100, random_state=42)
rf_3 = RandomForestRegressor(n_estimators=100, random_state=42)

# Cross-Validation
#rf_cv_scores = cross_val_score(rf, X_train_scaled, y_train, cv=5)
#print(f"RandomForest Cross-Validation Accuracy: {rf_cv_scores.mean():.4f} ± {rf_cv_scores.std():.4f}")

# Random Forest Classifier trainieren
rf_1.fit(X_train_scaled_1, y_train_1)
rf_3.fit(X_train_scaled_3, y_train_3)

# Vorhersagen und Bewerten des Random Forest Classifiers
rf_predictions_1 = rf_1.predict(X_val_scaled_1)
print(sklearn.metrics.root_mean_squared_error(y_val_1, rf_predictions_1))


rf_predictions_3 = rf_3.predict(X_val_scaled_3)
print(sklearn.metrics.root_mean_squared_error(y_val_3, rf_predictions_3))


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


10.623605477657845
14.525024512799973


In [39]:
#Bayesian Optimizer für Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from bayes_opt import BayesianOptimization

# Define the function to optimize
def evaluate_model(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    # Make sure parameters are integer
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    min_samples_split = int(min_samples_split)
    min_samples_leaf = int(min_samples_leaf)
    
    # Define the model with the parameters
    model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, 
                                min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, 
                                random_state=42, n_jobs=-1)
    
    # Fit and predict
    model.fit(X_train_1, y_train_1)
    pred = model.predict(X_val_1)
    
    # Calculate RMSE
    rmse = mean_squared_error(y_val_1, pred, squared=False)
    
    # We want to minimize RMSE, so we return the negative value
    return -rmse


# Define the bounds of the parameters
param_bounds = {
    'n_estimators': (100, 200),
    'max_depth': (10, 30),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4)
}
# Create the BayesianOptimization object
optimizer = BayesianOptimization(
    f=evaluate_model,
    pbounds=param_bounds,
    random_state=42,
)

# Perform the optimization
optimizer.maximize(init_points=5, n_iter=12)

# Print the best parameters
print("Beste Hyperparameter-Kombination:", optimizer.max['params'])



|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


| [0m1        [0m | [0m-12.66   [0m | [0m17.49    [0m | [0m3.852    [0m | [0m7.856    [0m | [0m159.9    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m2        [0m | [0m-16.7    [0m | [0m13.12    [0m | [0m1.468    [0m | [0m2.465    [0m | [0m186.6    [0m |


  return fit_method(estimator, *args, **kwargs)


| [95m3        [0m | [95m-11.71   [0m | [95m22.02    [0m | [95m3.124    [0m | [95m2.165    [0m | [95m197.0    [0m |


  return fit_method(estimator, *args, **kwargs)


| [95m4        [0m | [95m-10.59   [0m | [95m26.65    [0m | [95m1.637    [0m | [95m3.455    [0m | [95m118.3    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m5        [0m | [0m-12.85   [0m | [0m16.08    [0m | [0m2.574    [0m | [0m5.456    [0m | [0m129.1    [0m |




| [0m6        [0m | [0m-13.75   [0m | [0m15.24    [0m | [0m2.473    [0m | [0m5.354    [0m | [0m129.1    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m7        [0m | [0m-10.61   [0m | [0m25.12    [0m | [0m1.816    [0m | [0m3.811    [0m | [0m120.1    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m8        [0m | [0m-11.22   [0m | [0m28.64    [0m | [0m2.518    [0m | [0m5.096    [0m | [0m121.2    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m9        [0m | [0m-11.91   [0m | [0m25.11    [0m | [0m3.339    [0m | [0m7.236    [0m | [0m117.7    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m10       [0m | [0m-10.71   [0m | [0m22.59    [0m | [0m1.0      [0m | [0m2.0      [0m | [0m116.9    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m11       [0m | [0m-11.26   [0m | [0m22.32    [0m | [0m2.724    [0m | [0m5.144    [0m | [0m127.1    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m12       [0m | [0m-11.04   [0m | [0m19.93    [0m | [0m1.0      [0m | [0m2.0      [0m | [0m122.0    [0m |


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


| [0m13       [0m | [0m-12.5    [0m | [0m21.58    [0m | [0m4.0      [0m | [0m8.096    [0m | [0m133.0    [0m |




| [0m14       [0m | [0m-10.63   [0m | [0m26.87    [0m | [0m1.0      [0m | [0m2.0      [0m | [0m112.4    [0m |


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


| [0m15       [0m | [0m-10.77   [0m | [0m21.43    [0m | [0m1.0      [0m | [0m2.0      [0m | [0m110.1    [0m |




| [0m16       [0m | [0m-12.42   [0m | [0m26.1     [0m | [0m4.0      [0m | [0m2.0      [0m | [0m106.8    [0m |


  return fit_method(estimator, *args, **kwargs)


| [0m17       [0m | [0m-12.46   [0m | [0m16.84    [0m | [0m1.0      [0m | [0m2.0      [0m | [0m113.9    [0m |
Beste Hyperparameter-Kombination: {'max_depth': 26.648852816008436, 'min_samples_leaf': 1.6370173320348285, 'min_samples_split': 3.454599737656805, 'n_estimators': 118.34045098534338}




In [41]:
# Vorhersagen auf den Testdaten
#Beste Hyperparameter-Kombination: {'max_depth': 26.648852816008436, 'min_samples_leaf': 1.6370173320348285, 'min_samples_split': 3.454599737656805, 'n_estimators': 118.34045098534338}
rf = RandomForestRegressor(n_estimators=118, max_depth=27, min_samples_split=3, min_samples_leaf=2, random_state=42)
#X_test_scaled_1 = X_test_scaled_1[:, :120] #Problem war, dass die Anzahl der Features in den Testdaten nicht mit denen in den Trainingsdaten übereinstimmte
# Training
rf.fit(X_train_scaled_1, y_train_1)
test_predictions = rf.predict(X_test_scaled_1)
test_rmse = np.sqrt(mean_squared_error(y_test_1, test_predictions))
print(f"Test RMSE: {test_rmse:.4f}")


  return fit_method(estimator, *args, **kwargs)


Test RMSE: 121.4358


AdaBoost

In [54]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost implementieren
ada_1 = AdaBoostRegressor(n_estimators=100, random_state=42)
ada_3 = AdaBoostRegressor(n_estimators=100, random_state=42)

# Cross-Validation
#ada_cv_scores = cross_val_score(ada, X_train_scaled, y_train, cv=5)
#print(f"AdaBoost Cross-Validation Accuracy: {ada_cv_scores.mean():.4f} ± {ada_cv_scores.std():.4f}")

# AdaBoost Classifier trainieren
ada_1.fit(X_train_scaled_1, y_train_1)
ada_3.fit(X_train_scaled_3, y_train_3)

# Vorhersagen und Bewerten des AdaBoost Classifiers
ada_predictions_1 = ada_1.predict(X_test_scaled_1)
print(sklearn.metrics.root_mean_squared_error(y_test_1, ada_predictions_1))

ada_predictions_3 = ada_3.predict(X_test_scaled_3)
print(sklearn.metrics.root_mean_squared_error(y_test_3, ada_predictions_3))

#print("AdaBoost Classifier Accuracy:", accuracy_score(y_val, ada_predictions))
#print("AdaBoost Classifier Classification Report:\n", classification_report(y_val, ada_predictions))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


101.88238580002302
119.52938215403276


knn

In [55]:
# k-Nearest Neighbors implementieren
knn_1 = KNeighborsRegressor(n_neighbors=5)
knn_3 = KNeighborsRegressor(n_neighbors=5)
#trainieren des k-Nearest Neighbors
knn_1.fit(X_train_scaled_1, y_train_1)
knn_3.fit(X_train_scaled_3, y_train_3)

#vohersagen und Bewerten des k-Nearest Neighbors
knn_predictions_1 = knn_1.predict(X_test_scaled_1)
print(sklearn.metrics.root_mean_squared_error(y_test_1, knn_predictions_1))

knn_predictions_3 = knn_3.predict(X_test_scaled_3)
print(sklearn.metrics.root_mean_squared_error(y_test_3, knn_predictions_3))




52.90510750390741
83.33116103835347


Optimization

In [None]:
#hyperparameter tuning

