In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on given sensor data (time series data). It is a forcasting problem, where the goal is to predict the number of cycles an engine will last before it fails.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
1. Perform a deep **exploratory data analysis (EDA)** on the given data.
2. Implement a more efficient **sliding window method** for time series data analysis. -> 🎯 **Focus on this task**
3. Apply **traditional machine learning methods** (SOTA) to predict the remaining useful life. Includes data preparation, feature extraction, feature selection, model selection, and model parameter optimization.
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.


# Imports + Settings

In [4]:
# third-party libraries
import pandas as pd
import numpy as np
import os
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

import time

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# source code
from src.data_loading import load_data, load_config
from src.data_splitting import train_val_split_by_group
from src.nn_utils import create_sliding_window, create_sliding_window_test
from src.rolling_window_creator import calculate_RUL
from src.data_processing import apply_padding_on_train_data_and_test_data, drop_samples_with_clipped_values, extract_peaks_from_sensor_signal
from src.nn_util.nn_models.ligthning.cnnModel2 import CNNModel2 as CNNModel
from src.nn_util.datamodule.lightning.turbofanDatamodule import TurbofanDatamodule
from src.data_cleaning import clean_data

In [6]:
# settings
sns.set_style("whitegrid")
sns.set_palette("Set2")
sns.set(rc={"figure.dpi":100, 'savefig.dpi':200})
sns.set_context('notebook')

In [7]:
np.random.seed(42)

# Paths

In [8]:
# Make sure to execute this cell only once for one kernel session, before running any other cell below.
os.chdir("../") # set working directory to root of project
os.getcwd() # check current working directory

'C:\\Users\\Johannes\\PycharmProjects\\damage-propagation-modeling'

In [9]:
PATH_TO_CONFIG = "configs/config.yaml"

# Load Config + Data

In [10]:
config = load_config(PATH_TO_CONFIG) # config is dict

In [11]:
train_data, test_data, test_RUL_data = load_data(config_path=PATH_TO_CONFIG, dataset_num=2)

2024-05-31 13:40:59 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 2...[0m
2024-05-31 13:41:00 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 2.[0m
2024-05-31 13:41:00 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (53759, 26)[0m
2024-05-31 13:41:00 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (33991, 26)[0m
2024-05-31 13:41:00 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (259, 1)[0m


# 📍 << Subtask X: TOPIC >>

[TEMPLATE]

Findings:
* Interpretation of plots
* or other key take aways from previous code

TODO: Write down explanation of pipeline!

In [12]:
# some hyperparameters
time_column = 'Cycle'
group_column = 'UnitNumber'

window_size = 30
clip_value = 125
apply_data_cleaning = True
# If activated, adds for every sensor a new column with the commutative sum of the peaks
apply_peaks_generation = True

# Apply scaler. The order in the list represents the order in which they are applied
std_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()
robust_scaler = RobustScaler()
scaler = [std_scaler, minmax_scaler, robust_scaler]

# The model_type can be 'CNN_2D' or 'CNN_1D' and defines what type of convolutional layers has been used.
model_type = 'CNN_1D' 

# Setting the seed
pl.seed_everything(21)

Seed set to 21


21

In [14]:
if apply_peaks_generation:
    train_data = extract_peaks_from_sensor_signal(train_data)
    test_data = extract_peaks_from_sensor_signal(test_data)
    
if apply_data_cleaning:
    train_data, test_data = clean_data(train_data, test_data, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.0, contamination=0.05)
    
# Add column RUL to train_data
train_data = calculate_RUL(train_data, time_column, group_column, clip_value)

train_data, test_data = apply_padding_on_train_data_and_test_data(train_data=train_data, test_data=test_data, window_size=window_size)

X_test, _ = create_sliding_window_test(test_data, column_RUL=False, drop_columns=['UnitNumber'])
y_test = test_RUL_data.values

train, val = train_val_split_by_group(train_data, test_size=0.2, random_state=12)

X_train, y_train = create_sliding_window(train, window_size=window_size)  #, drop_columns=['UnitNumber', 'Cycle, 'RUL'])
X_val, y_val = create_sliding_window(val, window_size=window_size)  #, drop_columns=['UnitNumber', 'Cycle', 'RUL'])

X_train_temp, y_train_temp = drop_samples_with_clipped_values(X_train, y_train, clip_value)
X_val_temp, y_val_temp = drop_samples_with_clipped_values(X_val, y_val, clip_value)

2024-05-31 13:21:46 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-05-31 13:21:46 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-05-31 13:21:46 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-31 13:21:46 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-31 13:21:46 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m
2024-05-31 13:21:46 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m
2024-05-31 13:21:46 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m
2024-05-31 13:21:46 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m
2024-05-31 13:21:46 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: N

Scale the data

In [15]:
X_train_shape = X_train.shape
X_val_shape = X_val.shape
X_test_shape = X_test.shape

# Note: Do not normalize the cycle value! That is why we start with one
for single_scaler in scaler:
    for i in range(1, X_train.shape[-1]):
        X_train[:, :, i] = single_scaler.fit_transform(X_train[:, :, i])
        X_val[:, :, i] = single_scaler.transform(X_val[:, :, i])
        X_test[:, :, i] = single_scaler.transform(X_test[:, :, i])

Change data types of arrays to float32 and swap axes if necessary:

In [16]:
model_type = 'CNN_1D'

print(X_train.shape)
if model_type == 'CNN_1D':
    X_train = np.swapaxes(X_train, 1, 2)
elif model_type == 'CNN_2D':
    X_train = np.swapaxes(X_train, 1, 2)
    X_train = X_train[:, np.newaxis, :, :]
X_train = np.array(X_train, dtype=np.float32)
y_train = np.array(y_train, dtype=np.float32)
print(X_train.shape)

print(X_val.shape)
if model_type == 'CNN_1D':
    X_val = np.swapaxes(X_val, 1, 2)
elif model_type == 'CNN_2D':
    X_val = np.swapaxes(X_val, 1, 2)
    X_val = X_val[:, np.newaxis, :, :]
X_val = np.array(X_val, dtype=np.float32)
y_val = np.array(y_val, dtype=np.float32)
print(X_val.shape)

print(X_test.shape)
if model_type == 'CNN_1D':
    X_test = np.swapaxes(X_test, 1, 2)
elif model_type == 'CNN_2D':
    X_test = np.swapaxes(X_test, 1, 2)
    X_test = X_test[:, np.newaxis, :, :]
X_test = np.array(X_test, dtype=np.float32)
y_test = np.array(y_test, dtype=np.float32)
print(X_test.shape)

(38747, 30, 46)
(38747, 46, 30)
(9812, 30, 46)
(9812, 46, 30)
(259, 30, 46)
(259, 46, 30)


## Only CNN

Inspiration: Paper Dynamic predictive maintenance for multiple components using data-driven
probabilistic RUL prognostics: The case of turbofan engines

In [17]:
# Select hyperparameters of trainer!
checkpoint_callback = ModelCheckpoint(monitor="val_loss")
trainer = Trainer(min_epochs=1, max_epochs=150, callbacks=[checkpoint_callback], deterministic=True)
datamodule = TurbofanDatamodule(batch_size=128)
datamodule.set_train_dataset(X_train, y_train)
datamodule.set_val_dataset(X_val, y_val)
datamodule.set_predict_dataset(X_test)
datamodule.set_test_dataset(X_test, y_test[:, 0])
model = CNNModel(lr=0.001, window_size=window_size, features=46, dropout_rate=0.2)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [18]:
%%capture
# For visualization write 'tensorboard --logdir=lightning_logs/' in console

trainer.fit(model, datamodule=datamodule)


  | Name        | Type    | Params
----------------------------------------
0 | loss        | MSELoss | 0     
1 | dropout     | Dropout | 0     
2 | layer1_conv | Conv1d  | 9.2 K 
3 | layer2_conv | Conv1d  | 8.0 K 
4 | layer3_conv | Conv1d  | 8.0 K 
5 | layer4_conv | Conv1d  | 8.0 K 
6 | fc1         | Linear  | 153 K 
7 | fc2         | Linear  | 8.3 K 
8 | fc3         | Linear  | 65    
----------------------------------------
195 K     Trainable params
0         Non-trainable params
195 K     Total params
0.782     Total estimated model params size (MB)


In [19]:
trainer.validate(model, datamodule=datamodule, ckpt_path='best')

Restoring states from the checkpoint path at C:\Users\Johannes\PycharmProjects\damage-propagation-modeling\lightning_logs\version_0\checkpoints\epoch=40-step=12423.ckpt
Loaded model weights from the checkpoint at C:\Users\Johannes\PycharmProjects\damage-propagation-modeling\lightning_logs\version_0\checkpoints\epoch=40-step=12423.ckpt
C:\Users\Johannes\anaconda3\envs\PSDA\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Validation: |          | 0/? [00:00<?, ?it/s]

[{'val_loss': 330.914306640625}]

In [20]:
pred = trainer.test(model, datamodule=datamodule, ckpt_path='best')

Restoring states from the checkpoint path at C:\Users\Johannes\PycharmProjects\damage-propagation-modeling\lightning_logs\version_0\checkpoints\epoch=40-step=12423.ckpt
Loaded model weights from the checkpoint at C:\Users\Johannes\PycharmProjects\damage-propagation-modeling\lightning_logs\version_0\checkpoints\epoch=40-step=12423.ckpt
C:\Users\Johannes\anaconda3\envs\PSDA\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

In [22]:
from sklearn.metrics import root_mean_squared_error
from lightning import LightningModule
import torch 
model_from_checkpoint = LightningModule.load_from_checkpoint("C:/Users/Johannes/PycharmProjects/damage-propagation-modeling/lightning_logs/version_134/checkpoints/epoch=114-step=26105.ckpt")
# disable randomness, dropout, etc...
model_from_checkpoint.eval()
# predict with the model
pred = model_from_checkpoint(torch.tensor(X_test)).detach().numpy()
root_mean_squared_error(pred, torch.tensor(y_test))

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Johannes/PycharmProjects/damage-propagation-modeling/lightning_logs/version_134/checkpoints/epoch=114-step=26105.ckpt'

In [23]:
pred = model(torch.tensor(X_test)).detach().numpy()
model.eval()
root_mean_squared_error(pred, torch.tensor(y_test))

29.64586

In [33]:
pred

array([ 51.862972,  67.89349 ,  26.498564,  95.374695,  79.14195 ,
        99.26876 ,  70.10647 ,  60.206066,  78.893234,  75.16445 ,
        68.97512 ,  75.68833 , 106.93179 ,  31.458738, 111.88981 ,
        70.50948 , 121.09294 , 133.38309 ,  88.324196,  93.30068 ,
        20.652811, 116.759964,  92.665634,  16.95975 , 100.89441 ,
        16.584661, 142.51463 , 109.31307 , 110.87555 ,  78.90963 ,
        85.24842 ,  45.71385 ,  86.61725 ,  63.96631 , 103.03752 ,
       113.956795, 127.93344 ,  37.64642 ,   9.226481,  43.823063,
       112.50242 ,  74.93342 , 115.161705,  78.06849 , 146.9397  ,
         9.05889 , 125.63947 ,  41.578754, 106.79327 ,  13.959208,
        63.320995, 141.03683 ,  81.06794 , 112.00667 ,  65.58863 ,
        62.390923,  57.31184 ,  38.89475 ,  36.391994,  92.64628 ,
       134.31812 ,  64.032   ,  80.11501 ,  34.75124 ,  94.39489 ,
        83.86167 ,  88.42881 , 127.99012 , 112.436226, 118.27959 ,
        51.29933 ,  80.9912  , 141.13812 , 146.47522 ,  49.284

In [34]:
y_test[:, 0]

array([ 44.,  51.,  27., 120., 101.,  99.,  71.,  55.,  55.,  66.,  77.,
       115., 115.,  31., 108.,  56., 136., 132.,  85.,  56.,  18., 119.,
        78.,   9.,  58.,  11.,  88., 144., 124.,  89.,  79.,  55.,  71.,
        65.,  87., 137., 145.,  22.,   8.,  41., 131., 115., 128.,  69.,
       111.,   7., 137.,  55., 135.,  11.,  78., 120.,  87.,  87.,  55.,
        93.,  88.,  40.,  49., 128., 129.,  58., 117.,  28., 115.,  87.,
        92., 103., 100.,  63.,  35.,  45.,  99., 117.,  45.,  27.,  86.,
        20.,  18., 133.,  15.,   6., 145., 104.,  56.,  25.,  68., 144.,
        41.,  51.,  81.,  14.,  67.,  10., 127., 113., 123.,  17.,   8.,
        28.], dtype=float32)