In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on given sensor data (time series data). It is a forecasting problem, where the goal is to predict the number of cycles an engine will last before it fails.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
1. Perform a deep **exploratory data analysis (EDA)** on the given data.
2. Implement a more efficient **sliding window method** for time series data analysis. -> 🎯 **Focus on this task**
3. Apply **traditional machine learning methods** (SOTA) to predict the remaining useful life. Includes data preparation, feature extraction, feature selection, model selection, and model parameter optimization.
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.


# Imports + Settings

In [3]:
# third-party libraries
import pandas as pd
import numpy as np
import os
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.metrics import root_mean_squared_error
import torch 

import time

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [50]:
# source code
from src.data_loading import load_data, load_config
from src.data_splitting import train_val_split_by_group
from src.nn_utils import create_sliding_window, create_sliding_window_test
from src.rolling_window_creator import calculate_RUL
from src.data_processing import apply_padding_on_train_data_and_test_data, drop_samples_with_clipped_values, extract_peaks_from_sensor_signal
from src.nn_util.nn_models.ligthning.cnnModel1 import CNNModel1 as CNNModel
from src.nn_util.datamodule.lightning.turbofanDatamodule import TurbofanDatamodule
from src.data_cleaning import clean_data

In [5]:
# settings
# Configure seaborn in ONE call. `sns.set(...)` is an alias of `sns.set_theme(...)`, which
# re-applies its own default style ("darkgrid") and palette ("deep"). Calling it after
# `sns.set_style` / `sns.set_palette` therefore silently discarded the "whitegrid" and
# "Set2" choices. Passing everything to a single call keeps all settings in effect.
sns.set_theme(
    context="notebook",
    style="whitegrid",
    palette="Set2",
    rc={"figure.dpi": 100, "savefig.dpi": 200},
)

In [6]:
np.random.seed(42)

# Paths

In [7]:
# Switch from the notebook folder to the project root so that relative paths such as
# "configs/config.yaml" resolve. The guard makes the cell safe to re-run: once the current
# directory already contains the "configs" folder, no further directory change happens
# (an unconditional chdir would climb one level higher on every execution).
if not os.path.isdir("configs"):
    os.chdir("../")  # set working directory to root of project
os.getcwd()  # check current working directory

'C:\\Users\\Johannes\\PycharmProjects\\damage-propagation-modeling'

In [8]:
PATH_TO_CONFIG = "configs/config.yaml"

# Load Config + Data

In [9]:
config = load_config(PATH_TO_CONFIG) # config is dict

In [61]:
dataset_num = 2
train_data, test_data, test_RUL_data = load_data(config_path=PATH_TO_CONFIG, dataset_num=dataset_num)

2024-06-01 12:41:48 [[34msrc.data_loading:43[0m] [[32mINFO[0m] >>>> Loading data set 2...[0m
2024-06-01 12:41:49 [[34msrc.data_loading:72[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 2.[0m
2024-06-01 12:41:49 [[34msrc.data_loading:73[0m] [[32mINFO[0m] >>>> Train Data: (53759, 26)[0m
2024-06-01 12:41:49 [[34msrc.data_loading:74[0m] [[32mINFO[0m] >>>> Test Data: (33991, 26)[0m
2024-06-01 12:41:49 [[34msrc.data_loading:75[0m] [[32mINFO[0m] >>>> Test RUL Data: (259, 1)[0m


# Create Neural Regression Models

Pipeline:
1.	Data Cleaning
2.	Optional: Padding
3.	Create sliding windows
4.	Split train data in validation and train data
5.	Drop some samples with the clipped value
6.	Scale the Data
7.	Find the best hyperparameters
8.	Create Model with found hyperparameters

Explanation of selected hyperparameters:
*	Window size: We selected a window size of 30 due to some experiments with other window sizes. Furthermore, the window size is also used in the paper from Mitici [1] which shows good results with a CNN architecture.
*	Clipping value: The clipping value of 125 has been selected because it has proven useful and is used in paper [1] 

References:
1.	Mihaela Mitici, Ingeborg de Pater, Anne Barros, Zhiguo Zeng, “Dynamic predictive maintenance for multiple components using data-driven probabilistic RUL prognostics: The case of turbofan engines”, Reliability Engineering & System Safety, Volume 234, 2023, https://doi.org/10.1016/j.ress.2023.109199.


In [62]:
# some hyperparameters
# Column names used to order the samples in time and to group them per engine.
time_column = 'Cycle'
group_column = 'UnitNumber'

# Number of consecutive cycles per sliding window.
window_size = 30
# Upper bound passed to calculate_RUL for the RUL target.
clip_value = 125
# Share passed to train_val_split_by_group for the validation split.
test_size = 0.1
apply_data_cleaning = True
# If activated, adds for every sensor a new column with the cumulative sum of the peaks
apply_peaks_generation = False

# Apply scaler. The order in the list represents the order in which they are applied
std_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()
robust_scaler = RobustScaler()
scaler = [std_scaler, minmax_scaler, robust_scaler]

Explanation of each step:
- Optional:  Extraction of peaks
    - Adds for each sensor measure a column with the sum over all the peaks from the first cycle till the current cycle
    - This additional feature made no significant difference, so it is deactivated 
- Data Cleaning
    - The outlier detection and replacement method has been deactivated. 
    - The removal of columns based on the correlation of a single value has been deactivated because the neural model makes the feature selection.
    - Features with a unique single value will be removed
- Padding:
    -	Only applied for the datasets with a sample in test or train data smaller than the window size.
    -	The padding length is exactly the difference between the window size and the timesteps of the sample with the fewest timesteps
    -	The padding is applied on all the time series
- Create sliding window
    -	The sliding window approach for the NN techniques differs from the previous approach.
        Now, we do not have any aggregation but we keep the data as it is in windows so that the NN model can extract its own features
- Split train data in validation and train data
    -	Splitting training and validation sets based on the UnitNumber
- Drop some samples with the clipped value:
    -	To make the data more evenly distributed, in this step some of the samples with the clipping value as RUL are removed 
    -	Therefore, the median of the frequency of other RUL values is computed and the number of samples with the clipping value is a multiple of the median. 
    -	We selected two to not drop too many samples


In [63]:
# Optional feature engineering: run the peak extraction on both data sets (switched off by default).
if apply_peaks_generation:
    train_data = extract_peaks_from_sensor_signal(train_data)
    test_data = extract_peaks_from_sensor_signal(test_data)


# Cleaning is called with method=None, i.e. no outlier detection method is selected.
if apply_data_cleaning:
    train_data, test_data = clean_data(train_data, test_data, method=None, ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.0, contamination=0.05)

# Add column RUL to train_data
train_data = calculate_RUL(train_data, time_column, group_column, clip_value)

# Padding is applied to train and test data with respect to the chosen window size.
train_data, test_data = apply_padding_on_train_data_and_test_data(train_data=train_data, test_data=test_data, window_size=window_size)

# Split by group so that the validation set is separated from the training set before windowing.
train, val = train_val_split_by_group(train_data, test_size=test_size, random_state=12)

# Build the model inputs as raw windows (no aggregation).
X_train, y_train = create_sliding_window(train, window_size=window_size)  #, drop_columns=['UnitNumber', 'Cycle', 'RUL'])
X_val, y_val = create_sliding_window(val, window_size=window_size)  #, drop_columns=['UnitNumber', 'Cycle', 'RUL'])
# NOTE(review): window_size is not passed here, so the helper's default is used — confirm it
# matches `window_size` whenever that hyperparameter is changed.
X_test, _ = create_sliding_window_test(test_data, column_RUL=False, drop_columns=['UnitNumber'])
# Test targets come from the separately loaded RUL file.
y_test = test_RUL_data.values

# Thin out samples whose target equals the clip value (train and validation only).
X_train, y_train = drop_samples_with_clipped_values(X_train, y_train, clip_value)
X_val, y_val = drop_samples_with_clipped_values(X_val, y_val, clip_value)

2024-06-01 12:41:54 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-06-01 12:41:54 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-06-01 12:41:54 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 12:41:54 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-06-01 12:41:54 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m
2024-06-01 12:41:54 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m
2024-06-01 12:41:54 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: None ...[0m
2024-06-01 12:41:54 [[34msrc.outlier_detection:162[0m] [[32mINFO[0m] >>>> No outlier detection method specified. Skipping outlier detection.[0m
2024-06-01 12:41:54 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: N

Scale the data
*	The applied scalers are the StandardScaler, the MinMaxScaler, and the RobustScaler 
*	These three scalers have been selected because the training has been most robust with them


In [64]:
# Feature index 0 holds the cycle value, which must stay unscaled — hence the range starts at 1.
# The scalers are applied one after another (outer loop). For every feature the current scaler
# is fitted on the training windows and then re-used, without refitting, for validation and test.
feature_count = X_train.shape[-1]
for current_scaler in scaler:
    for feature_idx in range(1, feature_count):
        X_train[..., feature_idx] = current_scaler.fit_transform(X_train[..., feature_idx])
        X_val[..., feature_idx] = current_scaler.transform(X_val[..., feature_idx])
        X_test[..., feature_idx] = current_scaler.transform(X_test[..., feature_idx])

Change data types of arrays to float32 and swap axes if necessary:

In [65]:
def _swap_axes_and_cast(X, y):
    """Swap axes 1 and 2 of ``X`` and cast ``X`` and ``y`` to float32.

    Prints the shape of ``X`` before and after the swap so the change is visible in the
    cell output.

    Args:
        X: Array with at least three axes.
        y: Target array.

    Returns:
        Tuple ``(X_swapped, y_cast)``; both are new float32 arrays.
    """
    print(X.shape)
    X_swapped = np.array(np.swapaxes(X, 1, 2), dtype=np.float32)
    y_cast = np.array(y, dtype=np.float32)
    print(X_swapped.shape)
    return X_swapped, y_cast


X_train, y_train = _swap_axes_and_cast(X_train, y_train)

X_val, y_val = _swap_axes_and_cast(X_val, y_val)

X_test, y_test = _swap_axes_and_cast(X_test, y_test)

(29420, 30, 46)
(29420, 46, 30)
(3276, 30, 46)
(3276, 46, 30)
(259, 30, 46)
(259, 46, 30)


Save processed test data

In [66]:
# Optionally persist the preprocessed test arrays (timestamped file names) so that they can be
# re-used for evaluation without repeating the preprocessing. Switched off by default.
save_test_data = False
if save_test_data:
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    processed_dir = config['paths']['processed_data_dir']
    # os.path.join only inserts a separator when one is missing, so the target path is correct
    # whether or not the configured directory ends with a slash.
    for array_name, array in (("X", X_test), ("y", y_test)):
        file_name = f"ex2_preprocessed_{array_name}_test_from_dataset_{dataset_num}_for_CNNModel1_{timestamp}.npy"
        np.save(os.path.join(processed_dir, file_name), array)

## CNN

Architecture
*	The architecture of the first CNN model (“ExampleCNNModel”) is a minimalistic approach with only two convolutional layers and some fully connected layers 
*	The second CNN model uses more convolutional layers and one more fully connected layer
*	More convolutional layers are used to be more like the architecture from the paper from Mitici [1]
*	Both architectures use only 1D convolutional layers as is done in the paper [1]
*	Both use dropout to enable generalization and prevent overfitting
*	Adam is used as an optimizer and the mean squared error as a loss function
*	Because the possible targets are greater than or equal to one, the second CNN applies a maximum with one to its output.


Hyperparameter search
*	The best hyperparameters are found with Bayesian Optimization
*	For each dataset a new set of hyperparameters has been searched
*	The search has been done on the SCC JupyterHub and to parallelize the computation for each data set a separate Notebook has been created
*	The notebooks are stored in the “notebooks/cnn_hyperparameter_search” folder


Note: By writing 'tensorboard --logdir=lightning_logs/' in the console the runs get visualized.

In [67]:
# One hyperparameter set per dataset; entry k belongs to dataset k+1 (looked up below with
# `hyper_params[dataset_num-1]`). `batch_size` is stored as a float and converted with int()
# when the datamodule is created.
hyper_params = [{'batch_size': 114.84809532072403, 'beta_1': 0.9586517323123119, 'beta_2': 0.9558431375026947, 'dropout': 0.021025382021542985, 'learning_rate_init': 0.01}, 
                {'batch_size': 127.48322018921996, 'beta_1': 0.8003393739374182, 'beta_2': 0.9058535052032789, 'dropout': 0.2584373840086995, 'learning_rate_init': 0.0015861602059778223},
                {'batch_size': 92.4798215637139, 'beta_1': 0.9635139876762263, 'beta_2': 0.9432583039935667, 'dropout': 0.2119494320551308, 'learning_rate_init': 0.0004461791916105841}, 
                {'batch_size': 153.588222351065, 'beta_1': 0.9644278054982097, 'beta_2': 0.926610728635691, 'dropout': 0.02279168671841337, 'learning_rate_init': 0.007943006245227067},
                ]

# Random seed per dataset, indexed the same way as `hyper_params`.
seeds = [21, 21, 21, 21]

In [68]:
# Seed first, so that everything created afterwards (including the model) is reproducible.
pl.seed_everything(seeds[dataset_num-1])

# Hyperparameter set belonging to the currently selected dataset.
selected_params = hyper_params[dataset_num-1]

# Select hyperparameters of trainer!
# The checkpoint callback monitors "val_loss".
checkpoint_callback = ModelCheckpoint(monitor="val_loss")
trainer = Trainer(
    min_epochs=1,
    max_epochs=150,
    callbacks=[checkpoint_callback],
    deterministic=True,
)

# Register the train / validation / predict / test arrays with the datamodule.
datamodule = TurbofanDatamodule(batch_size=int(selected_params['batch_size']))
datamodule.set_train_dataset(X_train, y_train)
datamodule.set_val_dataset(X_val, y_val)
datamodule.set_predict_dataset(X_test)
datamodule.set_test_dataset(X_test, y_test[:, 0])

model = CNNModel(
    lr=selected_params['learning_rate_init'],
    beta_1=selected_params['beta_1'],
    beta_2=selected_params['beta_2'],
    window_size=window_size,
    features=X_train.shape[1],
    dropout_rate=selected_params['dropout'],
)

Seed set to 21
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [69]:
%%capture
# For visualization write 'tensorboard --logdir=lightning_logs/' in console

trainer.fit(model, datamodule=datamodule)


  | Name        | Type    | Params
----------------------------------------
0 | loss        | MSELoss | 0     
1 | dropout     | Dropout | 0     
2 | layer1_conv | Conv1d  | 9.2 K 
3 | layer2_conv | Conv1d  | 8.0 K 
4 | layer3_conv | Conv1d  | 8.0 K 
5 | layer4_conv | Conv1d  | 8.0 K 
6 | fc1         | Linear  | 153 K 
7 | fc2         | Linear  | 8.3 K 
8 | fc3         | Linear  | 65    
----------------------------------------
195 K     Trainable params
0         Non-trainable params
195 K     Total params
0.782     Total estimated model params size (MB)


In [70]:
%%capture
pred = trainer.test(model, datamodule=datamodule, ckpt_path="best")

Restoring states from the checkpoint path at C:\Users\Johannes\PycharmProjects\damage-propagation-modeling\lightning_logs\version_3\checkpoints\epoch=22-step=5336.ckpt
Loaded model weights from the checkpoint at C:\Users\Johannes\PycharmProjects\damage-propagation-modeling\lightning_logs\version_3\checkpoints\epoch=22-step=5336.ckpt


In [None]:
pred

## Scores on all testsets 

In [21]:
# (X, y) file pairs of the stored, preprocessed test sets — one pair per dataset, in dataset order.
paths = [
    ('data/processed/ex2_preprocessed_X_test_from_dataset_1_for_CNNModel1_20240531-232248.npy', 'data/processed/ex2_preprocessed_y_test_from_dataset_1_for_CNNModel1_20240531-232248.npy'),
    ('data/processed/ex2_preprocessed_X_test_from_dataset_2_for_CNNModel1_20240531-233230.npy', 'data/processed/ex2_preprocessed_y_test_from_dataset_2_for_CNNModel1_20240531-233230.npy'),
    ('data/processed/ex2_preprocessed_X_test_from_dataset_3_for_CNNModel1_20240531-232732.npy', 'data/processed/ex2_preprocessed_y_test_from_dataset_3_for_CNNModel1_20240531-232732.npy'),
    ('data/processed/ex2_preprocessed_X_test_from_dataset_4_for_CNNModel1_20240531-234033.npy', 'data/processed/ex2_preprocessed_y_test_from_dataset_4_for_CNNModel1_20240531-234033.npy'),
]
# Load every pair into memory; all_test_data[k] is the (X, y) tuple of dataset k+1.
all_test_data = [(np.load(x_path), np.load(y_path)) for x_path, y_path in paths]

In [112]:
# Evaluate the stored CNN checkpoint of dataset 1 on its preprocessed test set.
dataset_num_temp = 1
params_eval = hyper_params[dataset_num_temp-1]
X_eval, y_eval = all_test_data[dataset_num_temp-1]

# Rebuild the architecture with the same hyperparameters before loading the weights.
model = CNNModel(lr=params_eval['learning_rate_init'], beta_1=params_eval['beta_1'], beta_2=params_eval['beta_2'], window_size=window_size, features=X_eval.shape[1], dropout_rate=params_eval['dropout'])
# map_location="cpu" lets the checkpoint load on CPU-only machines even if it was saved from a GPU run.
checkpoint = torch.load(f"models/cnn_dataset_{dataset_num_temp}.ckpt", map_location="cpu")
model.load_state_dict(checkpoint['state_dict'])

model.eval()

# Inference only: no_grad avoids building the autograd graph.
with torch.no_grad():
    pred = model(torch.tensor(X_eval)).numpy()
# sklearn's signature is (y_true, y_pred); pass plain NumPy arrays in that order.
rmse_cnn_1 = root_mean_squared_error(y_eval, pred)
print(f'The RMSE score on dataset FD00{dataset_num_temp} is {rmse_cnn_1}.')

The RMSE score on dataset FD001 is 16.927452087402344.


In [22]:
# Evaluate the stored CNN checkpoint of dataset 2 on its preprocessed test set.
dataset_num_temp = 2
params_eval = hyper_params[dataset_num_temp-1]
X_eval, y_eval = all_test_data[dataset_num_temp-1]

# Rebuild the architecture with the same hyperparameters before loading the weights.
model = CNNModel(lr=params_eval['learning_rate_init'], beta_1=params_eval['beta_1'], beta_2=params_eval['beta_2'], window_size=window_size, features=X_eval.shape[1], dropout_rate=params_eval['dropout'])
# map_location="cpu" lets the checkpoint load on CPU-only machines even if it was saved from a GPU run.
checkpoint = torch.load(f"models/cnn_dataset_{dataset_num_temp}.ckpt", map_location="cpu")
model.load_state_dict(checkpoint['state_dict'])

model.eval()

# Inference only: no_grad avoids building the autograd graph.
with torch.no_grad():
    pred = model(torch.tensor(X_eval)).numpy()
# sklearn's signature is (y_true, y_pred); pass plain NumPy arrays in that order.
rmse_cnn_2 = root_mean_squared_error(y_eval, pred)
print(f'The RMSE score on dataset FD00{dataset_num_temp} is {rmse_cnn_2}.')

The RMSE score on dataset FD002 is 34.540584564208984.


In [113]:
# Evaluate the stored CNN checkpoint of dataset 3 on its preprocessed test set.
dataset_num_temp = 3
params_eval = hyper_params[dataset_num_temp-1]
X_eval, y_eval = all_test_data[dataset_num_temp-1]

# Rebuild the architecture with the same hyperparameters before loading the weights.
model = CNNModel(lr=params_eval['learning_rate_init'], beta_1=params_eval['beta_1'], beta_2=params_eval['beta_2'], window_size=window_size, features=X_eval.shape[1], dropout_rate=params_eval['dropout'])
# map_location="cpu" lets the checkpoint load on CPU-only machines even if it was saved from a GPU run.
checkpoint = torch.load(f"models/cnn_dataset_{dataset_num_temp}.ckpt", map_location="cpu")
model.load_state_dict(checkpoint['state_dict'])

model.eval()

# Inference only: no_grad avoids building the autograd graph.
with torch.no_grad():
    pred = model(torch.tensor(X_eval)).numpy()
# sklearn's signature is (y_true, y_pred); pass plain NumPy arrays in that order.
rmse_cnn_3 = root_mean_squared_error(y_eval, pred)
print(f'The RMSE score on dataset FD00{dataset_num_temp} is {rmse_cnn_3}.')

The RMSE score on dataset FD003 is 19.158700942993164.


In [34]:
# Evaluate the stored CNN checkpoint of dataset 4 on its preprocessed test set.
dataset_num_temp = 4
params_eval = hyper_params[dataset_num_temp-1]
X_eval, y_eval = all_test_data[dataset_num_temp-1]

# Rebuild the architecture with the same hyperparameters before loading the weights.
model = CNNModel(lr=params_eval['learning_rate_init'], beta_1=params_eval['beta_1'], beta_2=params_eval['beta_2'], window_size=window_size, features=X_eval.shape[1], dropout_rate=params_eval['dropout'])
# map_location="cpu" lets the checkpoint load on CPU-only machines even if it was saved from a GPU run.
checkpoint = torch.load(f"models/cnn_dataset_{dataset_num_temp}.ckpt", map_location="cpu")
model.load_state_dict(checkpoint['state_dict'])

model.eval()

# Inference only: no_grad avoids building the autograd graph.
with torch.no_grad():
    pred = model(torch.tensor(X_eval)).numpy()
# sklearn's signature is (y_true, y_pred); pass plain NumPy arrays in that order.
rmse_cnn_4 = root_mean_squared_error(y_eval, pred)
print(f'The RMSE score on dataset FD00{dataset_num_temp} is {rmse_cnn_4}.')

The RMSE score on dataset FD004 is 39.244022369384766.


# !!!!! Transformer Approach down below !!!!!!
We decided to add the Transformer approach as well because we spent a lot of time exploring different techniques.
The transformer approach has a similar performance as the CNN approach for dataset 1 and 3. Nevertheless, it is not our best attempt but worth mentioning.

The complete notebook with the cell outputs is "Transformer_pipeline.ipynb". Due to time and computing power limitations, we ran the experiments only on the cluster and copied the code into this file.

!!! Go to "Transformer_pipeline.ipynb" for more details!!!

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction with Transformer
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on given sensor data (time series data). It is a regression problem.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.


## Transformer Description
Next to the previous CNN Architecture, we also had a look at Transformer Models.
The Transformer Model consists of a projection layer, a Transformer encoder block and an output layer at the end. <br>
The Transformer Models are in general bigger than the CNN Models, with 1 to 10 million parameters depending on the projection size and window length, because at the end the (projection size x window_size) output is flattened and fed into the output layer.

### Results
The Transformer Model showed impressive results on the first and third dataset with an RMSE of 18.6 and 21.8, respectively. The models are stored in the models folder in pt format. However, our Transformer Model was not able to find a fit for datasets 2 and 4. Bearing in mind the good results of the CNN approach, we did not investigate the Transformer Model further.



# Imports + Settings

In [None]:
## download important libraries
!pip install colorlog
!pip install ray
!pip install ax-platform botorch
# third-party libraries
import pandas as pd
import numpy as np
import os
from typing import List, Union
import time
from tqdm.notebook import tqdm
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from scipy import stats
from scipy.stats import multivariate_normal, zscore
from scipy.stats._mstats_basic import winsorize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torch.optim.lr_scheduler import StepLR

In [None]:
# source code
# Switch to the project root only when we are not already there. An unconditional chdir would
# leave the project folder if the working directory was changed earlier in the same kernel
# session (or if this cell is re-run), and the relative paths below would stop resolving.
if not os.path.isdir("configs"):
    os.chdir("../") # set working directory to root of project

# import own libraries
from src.utils import load_data, load_config, train_val_split_by_group
from src.rolling_window_creator import RollingWindowDatasetCreator, calculate_RUL
from src.data_cleaning import identify_missing_values, identify_single_unique_features, format_dtype, clean_data
import src.nn_utils as nu
import src.transformer_fred as tff
np.random.seed(42)

# Load config + Data

In [None]:
PATH_TO_CONFIG = "configs/config.yaml"
config = load_config(PATH_TO_CONFIG) # config is dict

In [None]:
%%time
## define data set you want to use in dataset_num
dataset_num = 1
train_data, test_data, test_RUL_data = load_data(config_path=PATH_TO_CONFIG, dataset_num=dataset_num)

In [None]:
## overview of the test dataset
# number of distinct engines (unit numbers) in the test set
n_test_units = test_data['UnitNumber'].nunique()
print(f"Number of unique unit numbers in test set: {n_test_units}")

# shortest recorded series among all units — this bounds the usable window size
min_cycles_per_unit = test_data.groupby("UnitNumber")["Cycle"].count().min()
print("Min number of cycles in test set for a unit number: ", min_cycles_per_unit)

---
Test Data Cleaning Functionality and its impact on Rolling Window Creation


In [None]:
############################
## Finding: The Transformer Model works better if it is given all variables,
##          even those that have no clear correlation to RUL.
##          The Transformer Model can still use the contextual information given in complex time series

# clean data (with outlier removal, where no samples are dropped but the outliers are replaced, method='winsorize')
cleaned_train, cleaned_test = clean_data(train_data, test_data, method='winsorize', ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.0, contamination=0.05)

# calculate RUL for the train dataset (no clip value is passed here) and for the test dataset
# (the test variant additionally receives the separately loaded test_RUL_data)
cleaned_train_data = calculate_RUL(cleaned_train, time_column= "Cycle", group_column= "UnitNumber")
cleaned_test_data = nu.calculate_RUL_test(cleaned_test, test_RUL_data)


In [None]:
############################
## Finding: The minimum series lengths in the test datasets are
##          significantly smaller than in the train datasets
##          --> one explanation for the sometimes poorer performance on the test dataset
# number of rows (cycles) recorded per unit in the test set
group_sizes = test_data.groupby('UnitNumber').size()

# summary statistics of the per-unit sizes
min_size, max_size = group_sizes.min(), group_sizes.max()
mean_size, sd_size = group_sizes.mean(), group_sizes.std()

print(
    f"Min group size: {min_size}",
    f"Max group size: {max_size}",
    f"Mean group size: {mean_size}",
    f"Sd group size: {sd_size}",
    sep="\n",
)

# Hyper parameter search with Ray Tune
The hyperparameter search was done with Ray Tune on the cluster.
However, we were not fully satisfied with the library, so many tests and much of the fine-tuning were also
done manually.

In [None]:
############################
## Finding: The window length is an important hyperparameter.
##          The Transformer Model needs bigger window size than the CNN models.
##          A good window size is around 80 instead of 30.
##          We have the feeling that an even bigger window size is better (140+) but it is limited 
##          due to the test dataset and its limited length.
##          Another aspect is the model size: only one layer is suitable due to size issues and also the 
##          combination of window size * project dim should not be too big

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
from ray.tune import CLIReporter
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler

# Import your necessary functions and modules (assuming nu module and TransformerModel are defined)
# from your_module import TransformerModel, TurbofanDataset, scale_data, create_sliding_window, train_val_split_by_group, count_parameters

# Define training function
def train_model(config, checkpoint_dir=None):
    """Ray Tune trainable: train one Transformer configuration and report its losses.

    Args:
        config: Trial configuration. Reads the keys "window_size", "project_dim",
            "num_heads", "num_layers", "batch_size", "dropout" and "num_epochs".
            An optional "lr" key overrides the default Adam learning rate of 1e-4.
        checkpoint_dir: Not used by this function; kept so the signature stays unchanged.

    Reports:
        After every epoch, ``train_loss`` and ``val_loss`` (sample-weighted mean of the
        MSE over the respective loader) through ``train.report``.

    NOTE(review): the "validation" windows are built from ``cleaned_test_data``, so the
    search is tuned against the test set. Consider a grouped split of the training data
    instead if an unbiased test score is required.
    """
    window_size = config["window_size"]
    project_dim = config["project_dim"]
    num_heads = config["num_heads"]
    learning_rate = config.get("lr", 0.0001)
    # Single RUL cap used for both clipping and thinning. The thinning threshold used to be
    # 140, i.e. above the clip of 130; the final-training cell clips and cuts at the same value.
    max_rul = 130

    # Data preparation (scaling, windowing, target clipping and thinning of high-RUL samples).
    train_data, val_data = nu.scale_data(cleaned_train_data, cleaned_test_data)
    X_train, y_train = nu.create_sliding_window(train_data, window_size=window_size)
    X_val, y_val = nu.create_sliding_window(val_data, window_size=window_size, typ = "test")
    y_train = np.clip(y_train, a_min=None, a_max=max_rul)
    y_train, X_train = nu.cut_high_RUL(y_train, X_train, max_rul, delete = 0.3)

    # Create datasets and dataloaders
    train_dataset = tff.TurbofanDataset(X_train, y_train)
    val_dataset = tff.TurbofanDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = tff.TransformerModel(feature_size=X_train.shape[2], num_heads=num_heads, num_layers=config["num_layers"], project_dim=project_dim, window_size=window_size, dropout = config["dropout"]).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=30, gamma=0.5)

    for epoch in range(config["num_epochs"]):
        # ---- training pass ----
        model.train()
        running_loss = 0.0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            targets = targets.view(-1, 1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # weight the batch mean by the batch size so the epoch value is a per-sample mean
            running_loss += loss.item() * inputs.size(0)
        train_loss = running_loss / len(train_loader.dataset)

        # ---- validation pass ----
        model.eval()
        running_loss = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                targets = targets.view(-1, 1)
                outputs = model(inputs)

                # Post-process predictions: round to whole cycles and enforce a minimum of 1.
                # torch.clamp keeps dtype and device of `outputs`; the previous torch.where
                # call mixed in an int64 CPU scalar tensor.
                min_value = 1
                outputs = torch.clamp(torch.round(outputs), min=min_value)

                loss = criterion(outputs, targets)
                running_loss += loss.item() * inputs.size(0)
        val_loss = running_loss / len(val_loader.dataset)

        train.report({"val_loss":val_loss, "train_loss":train_loss})
        scheduler.step()

# Define search space and Ray Tune configuration
# Fixed values ("num_layers", "batch_size") are passed through unchanged to every trial.
search_space = {
    "window_size": tune.choice([50, 60, 70, 80, 90, 100]),
    "project_dim": tune.choice([16*3*2, 16*3*3]),
    "num_heads": tune.choice([8, 12]),
    "num_layers": 1,
    "batch_size": 128,
    "dropout": tune.choice([0.12, 0.18]),
    "num_epochs": tune.choice([5, 8, 11, 13, 17, 19, 22])  # Reduced for quicker tuning
}

# Use ASHAScheduler for efficient hyperparameter search
scheduler = ASHAScheduler(
    metric="val_loss",
    mode="min",
    max_t=25,
    grace_period=5,
    reduction_factor=2
)

# Configure the reporter
reporter = CLIReporter(
    metric_columns=["val_loss", "train_loss", "training_iteration"]
)

# Run hyperparameter search
# Request a GPU only when one is available: train_model itself falls back to the CPU, so an
# unconditional GPU request would be inconsistent with it on machines without a GPU.
result = tune.run(
    train_model,
    resources_per_trial={"cpu": 1, "gpu": 1 if torch.cuda.is_available() else 0},
    config=search_space,
    num_samples=50,
    scheduler=scheduler,
    progress_reporter=reporter
)

# Get the best trial (lowest validation loss in the last reported result)
best_trial = result.get_best_trial("val_loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(best_trial.last_result["val_loss"]))

## Train the best model again from scratch and save the best result

In [None]:
##################################
## Data specifics
window_size = best_trial.config["window_size"]
train_data, test_data = nu.scale_data(cleaned_train_data, cleaned_test_data)

## the validation split is done in a way that one UnitNumber is either in train or val but not in both
#train, val = train_val_split_by_group(train_data)

X_train, y_train = nu.create_sliding_window(train_data, window_size = window_size)

# Set values to maximum of the 130 according to the papers because in the early stage, everything looks the same
y_train = np.clip(y_train, a_min=None, a_max=130)
# now we have an overrepresentation of high RUL values in the train data --> randomly delete some of them
y_train, X_train = nu.cut_high_RUL(y_train, X_train, 130, delete = 0.3)

#X_val, y_val = nu.create_sliding_window(val, window_size = window_size)

#test_data = nu.scale_data(cleaned_test_data)
X_test, y_test = nu.create_sliding_window(test_data, typ = "test", window_size = window_size)

##################################
## Model specifics
seq_len, batch_size, feature_size = X_train.shape[1], best_trial.config["batch_size"], X_train.shape[2]
num_heads, num_layers, project_dim  = best_trial.config["num_heads"], best_trial.config["num_layers"], best_trial.config["project_dim"]
num_epochs = 40
learning_rate = 0.0001

print(seq_len)
# Create dataset and dataloaders
train_dataset = tff.TurbofanDataset(X_train, y_train)
test_dataset = tff.TurbofanDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize model, criterion, optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = tff.TransformerModel(feature_size, num_heads, num_layers, project_dim = project_dim, window_size = seq_len, dropout = best_trial.config["dropout"]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

## the scheduler no longer plays a role in the later experiments
scheduler = StepLR(optimizer, step_size=30, gamma=0.5)

print(f"The model has in total {tff.count_parameters(model)} parameters!!")

# Best test loss seen so far. Starts at 1200, so only epochs below that value count as "best".
# NOTE(review): the best epoch is selected on the test loader, so the reported best score is
# optimistically biased; a held-out validation split would avoid this.
prev_acc = 1200
# Training loop: track the best model on the fly
for epoch in range(num_epochs):
    train_loss = tff.train_model(model, train_loader, criterion, optimizer, device)
    #val_loss = evaluate_model(model, val_loader, criterion, device)
    test_loss = tff.evaluate_model(model, test_loader, criterion, device)
    scheduler.step()

    if prev_acc > test_loss:
        # the file name carries the integer part of the test loss
        save_path = f"tranM_df_{dataset_num}_{int(test_loss)}.pth"
        # test_loss is treated as an MSE below (its square root is reported as RMSE), so
        # label it as MSE here and print the RMSE alongside instead of calling the raw loss "RMSE".
        print(f"new best MSE: {test_loss:.2f} (RMSE: {np.sqrt(test_loss):.2f})")
        prev_acc = test_loss
        #torch.save(model, save_path)


    print(f"Epoch {epoch+1}/{num_epochs}, Train_L: {train_loss:.2f}, Test_L: {test_loss:.2f}, Test_RMSE: {np.sqrt(test_loss):.2f} ")
    

In [None]:
##############################
## Finding: good config for df1 and df3:
# NOTE(review): `window_size` is only assigned here; `seq_len` and `feature_size` are still read
# from the existing `X_train`, so the windows must be rebuilt for the new window size to take effect.
window_size = 80
seq_len, batch_size, feature_size = X_train.shape[1], 128, X_train.shape[2]
num_heads, num_layers, project_dim  = 8, 1, 120
num_epochs = 100
learning_rate = 0.0001
drop_out = 0.18


## Load best Transformer Models for DF1 and DF3

In [None]:
## load best models for dataset 1 and 3
PATH_DF1 = "models/tranM_df_1_351.pth"
PATH_DF3= "models/tranM_df_3_476.pth"
# Without a GPU, remap all stored tensors to the CPU; otherwise keep the stored device
# placement (map_location=None is torch.load's default behaviour).
# SECURITY: torch.load unpickles the file content — only load model files from a trusted source.
load_location = None if torch.cuda.is_available() else "cpu"
loaded_model_1 = torch.load(PATH_DF1, map_location=load_location)
loaded_model_3 = torch.load(PATH_DF3, map_location=load_location)