In [1]:
from sklearn.preprocessing import StandardScaler
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on given sensor data (time series data). It is a forcasting problem, where the goal is to predict the number of cycles an engine will last before it fails.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
1. Perform a deep **exploratory data analysis (EDA)** on the given data.
2. Implement a more efficient **sliding window method** for time series data analysis. -> 🎯 **Focus on this task**
3. Apply **traditional machine learning methods** (SOTA) to predict the remaining useful life. Includes data preparation, feature extraction, feature selection, model selection, and model parameter optimization.
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.


# Imports + Settings

In [14]:
# third-party libraries
import pandas as pd
import numpy as np
import os

import time
from tqdm.notebook import tqdm


# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html; Examples on how to approach classic classifiers
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# previous
from sklearn.metrics import accuracy_score,f1_score, root_mean_squared_error
from sklearn.svm import SVC
from bayes_opt import BayesianOptimization
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE


import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# source code
from src.utils import load_data, load_config
from src.data_cleaning import clean_data, format_dtype
from src.rolling_window_creator import calculate_RUL, RollingWindowDatasetCreator


In [16]:
# settings
sns.set_style("whitegrid")
sns.set_palette("Set2")
sns.set(rc={"figure.dpi":100, 'savefig.dpi':200})
sns.set_context('notebook')

In [17]:
np.random.seed(42)

# Paths

In [18]:
# Make sure to execute this cell only once for one kernel session, before running any other cell below.
os.chdir("../") # set working directory to root of project
os.getcwd() # check current working directory

'C:\\Users\\Christoph\\PycharmProjects'

In [19]:
os.chdir("C:\\Users\\Christoph\\PycharmProjects\\damage-propagation-modeling_ml_classic")

In [20]:
PATH_TO_CONFIG = "configs/config.yaml"

# Load Config + Data

In [21]:
config = load_config(PATH_TO_CONFIG) # config is dict

In [22]:
%%time

train_data, test_data,test_rul_data = load_data(config_path=PATH_TO_CONFIG, dataset_num=1)
train_data_2, test_data_2,test_rul_data_2 = load_data(config_path=PATH_TO_CONFIG, dataset_num=2)
train_data_3, test_data_3,test_rul_data_3 = load_data(config_path=PATH_TO_CONFIG, dataset_num=3)
train_data_4, test_data_4,test_rul_data_4 = load_data(config_path=PATH_TO_CONFIG, dataset_num=4)

2024-05-26 17:14:19 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 1...[0m
2024-05-26 17:14:19 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 1.[0m
2024-05-26 17:14:19 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (20631, 26)[0m
2024-05-26 17:14:19 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (13096, 26)[0m
2024-05-26 17:14:19 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m
2024-05-26 17:14:19 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 2...[0m
2024-05-26 17:14:21 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 2.[0m
2024-05-26 17:14:21 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (53759, 26)[0m
2024-05-26 17:14:21 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (33991, 26)[0m
2024-05-26 17:14:21 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (259, 1)[0m
2024-05-26 17:14:21 [[34msrc.utils:60[0m] [[32mINFO[0m] >>

# 📍 << Task 2: Classic Machine learning >>

[TEMPLATE]

Findings:
* Interpretation of plots
* or other key take aways from previous code

In [19]:
# [TEMPLATE] - save processed data (as pickle)
df = pd.DataFrame()
timestamp = time.strftime("%Y%m%d-%H%M%S")
df.to_pickle(f"{config['paths']['processed_data_dir']}ex2_topic_{timestamp}.pkl")

In [12]:
# [TEMPLATE] - save data predictions (as csv)
df = pd.DataFrame()
timestamp = time.strftime("%Y%m%d-%H%M%S")
df.to_csv(f"{config['paths']['prediction_dir']}ex2_topic_{timestamp}.csv", sep=',', decimal='.')

In [13]:
# [TEMPLATE] - save plot results (as png)
fig = plt.figure(figsize=(9, 6))
timestamp = time.strftime("%Y%m%d-%H%M%S")
fig.savefig(f"{config['paths']['plot_dir']}ex2_topic_{timestamp}.png")

<Figure size 900x600 with 0 Axes>

## Data Cleaning

Use the created in advanced functions to remove non-helpful data as determined by the EDA 
(Currently without outlier removal)

In [35]:
train_data = format_dtype(train_data)

2024-05-25 09:31:10 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m


In [23]:
cleaned_train, cleaned_test = clean_data(train_data, test_data, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.3)

2024-05-26 17:14:31 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-05-26 17:14:31 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-05-26 17:14:31 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-26 17:14:31 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-26 17:14:31 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m
2024-05-26 17:14:31 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m
2024-05-26 17:14:31 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m
2024-05-26 17:14:31 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m
2024-05-26 17:14:31 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: N

In [34]:
print(cleaned_train.shape)
print(cleaned_test.columns)

(20631, 16)
Index(['UnitNumber', 'Cycle', 'Sensor Measure 2', 'Sensor Measure 3',
       'Sensor Measure 4', 'Sensor Measure 7', 'Sensor Measure 8',
       'Sensor Measure 9', 'Sensor Measure 11', 'Sensor Measure 12',
       'Sensor Measure 13', 'Sensor Measure 14', 'Sensor Measure 15',
       'Sensor Measure 17', 'Sensor Measure 20', 'Sensor Measure 21'],
      dtype='object')


## Feature Engineering

Choose a set of feature options for tsfresh windowing


In [50]:
# Currently using minimal to ease optimization so feature_list is not necessary
feature_list = []

# feature_list for dataset 1
# TODO: create function to make variable for each dataset to ease optimization
currentpath = os.getcwd()
ft_list = pd.read_pickle(currentpath+ "/data/processed/dataset1_remaining_features_0521.pkl")

In [41]:
min_ts = 5
max_ts = 20

## Windowing

via tsfresh

In [42]:
rwCreator = RollingWindowDatasetCreator(max_timeshift=max_ts,min_timeshift=min_ts,feature_extraction_mode= 'minimal')
# feature_list=feature_list

In [43]:
X_train, y_train, X_test, y_test = rwCreator.create_rolling_windows_datasets(train_data=cleaned_train, test_data=cleaned_test,test_RUL_data=test_rul_data,)

2024-05-25 10:14:00 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


Rolling: 100%|██████████| 20/20 [00:12<00:00,  1.54it/s]


2024-05-25 10:14:14 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


Feature Extraction: 100%|██████████| 20/20 [01:13<00:00,  3.65s/it]


2024-05-25 10:15:35 [[34msrc.rolling_window_creator:131[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m
2024-05-25 10:15:35 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


Rolling: 100%|██████████| 19/19 [00:08<00:00,  2.15it/s]


2024-05-25 10:15:44 [[34msrc.rolling_window_creator:123[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


Feature Extraction: 100%|██████████| 20/20 [00:04<00:00,  4.20it/s]


2024-05-25 10:15:49 [[34msrc.rolling_window_creator:159[0m] [[32mINFO[0m] >>>> Datasets created successfully.[0m
2024-05-25 10:15:49 [[34msrc.rolling_window_creator:160[0m] [[32mINFO[0m] >>>> Shape of X_train: (20131, 140)[0m
2024-05-25 10:15:49 [[34msrc.rolling_window_creator:161[0m] [[32mINFO[0m] >>>> Shape of y_train: (20131, 1)[0m
2024-05-25 10:15:49 [[34msrc.rolling_window_creator:162[0m] [[32mINFO[0m] >>>> Shape of X_test: (100, 140)[0m
2024-05-25 10:15:49 [[34msrc.rolling_window_creator:163[0m] [[32mINFO[0m] >>>> Shape of y_test: (100, 1)[0m


## Scaling

- StandardScaler

In [59]:
scaler = StandardScaler()

In [55]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

## Feature Selection

- RFE
- SimpleCorrApproach

## Models
- RandomForestClassifier - niklas
- SVC 
- MLP - niklas
- GPC
- KNeighboursClassifier - niklas
- Gaussian Naive Bayes 
- AdaBoostClassifier - niklas
- QuadraticDiscrimantAnalysis


In [60]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    SVC(gamma=2, C=1, random_state=42),
    GaussianProcessClassifier(1.0 * RBF(1.0), random_state=42),
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=42
    ),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    AdaBoostClassifier(algorithm="SAMME", random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    
]

## Training & Evaluation

In [86]:
type(y_train.values.ravel())

numpy.ndarray

In [96]:
for clf in classifiers:
    
    clf.fit(X_train, y_train.values.ravel())
    y_pred = clf.predict(X_test)
    
    print(sklearn.metrics.root_mean_squared_error(y_test, y_pred))
    
    #rmse = sklearn.model_selection.cross_val_score(clf, X=X_train, y=y_train, cv=5, scoring='root_mean_squared_error')
    #print(rmse)
    

43.13710699618137
34.496086734584836
75.13128509482584


MemoryError: Unable to allocate 3.02 GiB for an array with shape (20131, 20131) and data type float64

## First Observations

Size is too big -> notebook outsourced to colab 


## Optimizations

-- TODO -- 

In [None]:
def hyperparameter_function(clf):
    ''' Hyperparameter Opti
    '''
    classifier = clf
    rmse = sklearn.model_selection.cross_val_score(classifier, X=X_train, y=y_train, cv=5, scoring='root_mean_squared_error')
    return rmse



# Bounded region of parameter space
    pbounds = {'neighbours': (3, 7)}

    optimizer = BayesianOptimization(
        f=hyperparameter_function_knn,
        pbounds=pbounds,
        random_state=17,
        allow_duplicate_points= True
    )   

    optimizer.maximize(
        init_points=50,
        n_iter=100,
    )