In [45]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use the 'retina' figure format for inline figures (higher-resolution rendering).
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on given sensor data (time series data). It is a regression problem.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
1. Perform a deep **exploratory data analysis (EDA)** on the given data.
2. Implement a more efficient **sliding window method** for time series data analysis.
3. Apply **traditional machine learning methods** (SOTA) to predict the remaining useful life. This includes data preparation, feature extraction, feature selection, model selection, and model parameter optimization. -> 🎯 **Focus on this task**, in particular on feature selection, model selection, and model parameter optimization.
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.

# Imports + Settings

In [4]:
# standard-library (os, typing) and third-party libraries
import pandas as pd
import numpy as np
import os
from typing import List
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [5]:
# source code
from src.data_loading import load_data, load_config
from src.data_cleaning import clean_data
from src.rolling_window_creator import RollingWindowDatasetCreator
from src.data_splitting import train_val_split_by_group

In [6]:
# settings
# Configure seaborn in ONE call. `sns.set(...)` is an alias of
# `sns.set_theme(...)`, whose defaults (style="darkgrid", palette="deep")
# silently override an earlier `set_style` / `set_palette`; passing everything
# together makes the whitegrid style and Set2 palette actually take effect.
sns.set_theme(
    context="notebook",
    style="whitegrid",
    palette="Set2",
    rc={"figure.dpi": 100, "savefig.dpi": 200},
)

# seed NumPy's global random number generator for reproducibility
np.random.seed(42)

# Paths

In [7]:
# Set the working directory to the root of the project (one level up).
# The move is guarded so the cell is idempotent: a plain `os.chdir("../")`
# climbs one directory further on every re-run. We only move up while the
# project config file is not yet reachable from the current directory.
if not os.path.exists("configs/config.yaml"):
    os.chdir("../")
os.getcwd()  # check current working directory

'C:\\Users\\merti\\PycharmProjects\\damage-propagation-modeling'

In [59]:
# Relative path to the YAML config; resolved against the current working
# directory, which an earlier cell moves one level up from the notebook's folder.
PATH_TO_CONFIG = "configs/config.yaml"

# Load config

In [61]:
# Load the project configuration; `config` is a dict and is indexed by
# section/key further below (e.g. config["preprocessing"][...]).
config = load_config(PATH_TO_CONFIG)

# Dataset 1

## Load data

In [10]:
# load train data, test data and the test RUL data for dataset 1
train_data, test_data, test_RUL_data = load_data(
    config_path=PATH_TO_CONFIG,
    dataset_num=1,
)

2024-05-30 14:16:02 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 1...[0m
2024-05-30 14:16:03 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 1.[0m
2024-05-30 14:16:03 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (20631, 26)[0m
2024-05-30 14:16:03 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (13096, 26)[0m
2024-05-30 14:16:03 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


## Preprocess data

In [53]:
# clean - see data_preprocessing.ipynb for details
# (the id/ordering columns UnitNumber and Cycle are passed as ignore_columns)
cleaned_train, cleaned_test = clean_data(
    train_data,
    test_data,
    method="winsorize",
    ignore_columns=["UnitNumber", "Cycle"],
    threshold_missing=0.5,
    threshold_corr=0.1,
    contamination=0.05,
)

2024-05-30 14:52:54 [[34msrc.data_cleaning:142[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-05-30 14:52:54 [[34msrc.data_cleaning:145[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-05-30 14:52:54 [[34msrc.data_cleaning:77[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-30 14:52:54 [[34msrc.data_cleaning:77[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-30 14:52:54 [[34msrc.data_cleaning:150[0m] [[32mINFO[0m] >>>> Filter and drop features based on missing values, single unique values, and target correlation...[0m
2024-05-30 14:52:54 [[34msrc.data_cleaning:30[0m] [DEBUG[0m] >>>> Found 0 features with missing values above the threshold of 0.5.[0m
2024-05-30 14:52:54 [[34msrc.data_cleaning:54[0m] [DEBUG[0m] >>>> Found 7 features with only a single unique value: ['Operation Setting 3', 'Sensor Measure 1', 'Sensor Measure 5', 'Sensor Measure 10', 'Sensor Measure 16', 'Sensor Measure 18', 'Sensor Measure

In [62]:
%%time
# create rolling windows - see sliding_window.ipynb for details
creator = RollingWindowDatasetCreator(column_id="UnitNumber",
                                      column_sort="Cycle",
                                      max_window_size=config["preprocessing"]["max_window_size"],
                                      min_window_size=config["preprocessing"]["min_window_size"],
                                      feature_extraction_mode=config["preprocessing"]["feature_extraction_mode"])
X_train, y_train, X_test, y_test = creator.create_rolling_windows_datasets(cleaned_train, cleaned_test, test_RUL_data)

2024-05-30 15:05:55 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for train data...[0m


Rolling: 100%|██████████| 37/37 [00:05<00:00,  7.34it/s]


2024-05-30 15:06:01 [[34msrc.rolling_window_creator:125[0m] [[32mINFO[0m] >>>> Extracting features for train data...[0m


Feature Extraction: 100%|██████████| 40/40 [03:01<00:00,  4.54s/it]


2024-05-30 15:09:56 [[34msrc.rolling_window_creator:133[0m] [[32mINFO[0m] >>>> Calculating target for train data...[0m
2024-05-30 15:09:56 [[34msrc.rolling_window_creator:117[0m] [[32mINFO[0m] >>>> Creating rolling windows for test data...[0m


Rolling: 100%|██████████| 38/38 [00:03<00:00, 10.01it/s]


2024-05-30 15:10:01 [[34msrc.rolling_window_creator:125[0m] [[32mINFO[0m] >>>> Extracting features for test data...[0m


Feature Extraction: 100%|██████████| 40/40 [00:03<00:00, 13.26it/s]


2024-05-30 15:10:04 [[34msrc.rolling_window_creator:161[0m] [[32mINFO[0m] >>>> Datasets created successfully.[0m
2024-05-30 15:10:04 [[34msrc.rolling_window_creator:162[0m] [[32mINFO[0m] >>>> Shape of X_train: (20131, 720)[0m
2024-05-30 15:10:04 [[34msrc.rolling_window_creator:163[0m] [[32mINFO[0m] >>>> Shape of y_train: (20131, 1)[0m
2024-05-30 15:10:04 [[34msrc.rolling_window_creator:164[0m] [[32mINFO[0m] >>>> Shape of X_test: (100, 720)[0m
2024-05-30 15:10:04 [[34msrc.rolling_window_creator:165[0m] [[32mINFO[0m] >>>> Shape of y_test: (100, 1)[0m
CPU times: total: 1min 27s
Wall time: 4min 9s


In [82]:
# split
# A GroupShuffleSplit is used so that the same unit never ends up in both the
# train and the validation set; this avoids data leakage and makes the model
# more robust.
# NOTE: X_train / y_train are rebound to the training part here, so re-running
# this cell without first re-creating the rolling windows would split the
# already reduced training set a second time.
X_train, X_val, y_train, y_val = train_val_split_by_group(
    X=X_train,
    y=y_train,
    group="UnitNumber",
    test_size=0.2,
)

2024-05-30 15:28:17 [[34msrc.data_splitting:65[0m] [[32mINFO[0m] >>>> Split data successfully.[0m
2024-05-30 15:28:17 [[34msrc.data_splitting:66[0m] [[32mINFO[0m] >>>> Train set contains 80 different engines --> in total 16054[0m
2024-05-30 15:28:17 [[34msrc.data_splitting:67[0m] [[32mINFO[0m] >>>> Validation set contains 20 different engines --> in total 4077[0m


In [83]:
# scale
# learn the scaling statistics from the training part only, then apply the
# fitted scaler to the train, validation and test features alike
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_val, X_test)
)

In [86]:
# check that all splits have the same number of feature columns; fail loudly
# instead of relying on a visual comparison of the displayed shapes
assert (
    X_train_scaled.shape[1] == X_val_scaled.shape[1] == X_test_scaled.shape[1]
), "feature dimensions differ between train, validation and test"
X_train_scaled.shape, X_val_scaled.shape, X_test_scaled.shape

((16054, 720), (4077, 720), (100, 720))

In [54]:
# feature selection


Unnamed: 0,UnitNumber,Cycle,Sensor Measure 2,Sensor Measure 3,Sensor Measure 4,Sensor Measure 6,Sensor Measure 7,Sensor Measure 8,Sensor Measure 9,Sensor Measure 11,Sensor Measure 12,Sensor Measure 13,Sensor Measure 14,Sensor Measure 15,Sensor Measure 17,Sensor Measure 20,Sensor Measure 21
0,1,1,641.92,1589.70,1400.60,21.61,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.4190
1,1,2,642.15,1591.82,1403.14,21.61,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.00,23.4236
2,1,3,642.35,1587.99,1404.20,21.61,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,391,38.95,23.3442
3,1,4,642.35,1582.79,1401.87,21.61,554.45,2388.11,9049.48,47.15,522.50,2388.08,8133.83,8.3859,392,38.88,23.3739
4,1,5,642.37,1582.85,1406.22,21.61,554.00,2388.06,9055.15,47.28,522.19,2388.04,8133.80,8.4294,393,38.90,23.4044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,643.49,1597.98,1425.67,21.61,551.74,2388.19,9065.52,48.05,520.04,2388.23,8137.60,8.4956,396,38.49,23.0934
20627,100,197,643.54,1601.47,1425.67,21.61,551.74,2388.22,9065.11,48.04,520.04,2388.22,8136.50,8.5110,395,38.49,23.1594
20628,100,198,643.42,1601.47,1425.67,21.61,551.74,2388.22,9065.90,48.05,520.04,2388.23,8141.05,8.5110,396,38.49,23.0934
20629,100,199,643.23,1601.47,1425.67,21.61,551.74,2388.22,9073.72,48.05,520.04,2388.23,8139.29,8.5110,395,38.49,23.0934
