In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction with Transformer
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on given sensor data (time series data). It is a regression problem.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.



## Transformer Description
In addition to the previous CNN architecture, we also looked at Transformer models.
The Transformer model consists of a projection layer, a Transformer encoder block, and an output layer at the end. <br>
The Transformer models are generally bigger than the CNN models, with 1 to 10 million parameters depending on the projection size and window length, because at the end the (projection size x window size) representation is flattened and fed into the output layer.

### Results
The Transformer model showed impressive results on the first and third dataset, with an RMSE of 18.6 and 21.8, respectively. The models are stored in the `models` folder in PyTorch (`.pth`) format. However, our Transformer model was not able to find a fit for datasets 2 and 4. Given the good results of the CNN approach, we did not investigate the Transformer model further.


# Imports + Settings

In [3]:
## download important libraries
!pip install colorlog
!pip install ray
!pip install ax-platform botorch

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [4]:
# third-party libraries
import pandas as pd
import numpy as np
import os
from typing import List, Union
import time
from tqdm.notebook import tqdm
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from scipy import stats
from scipy.stats import multivariate_normal, zscore
from scipy.stats._mstats_basic import winsorize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torch.optim.lr_scheduler import StepLR

In [5]:
# source code
# Set the working directory to the root of the project.
# The guard keeps this cell idempotent: without it, every re-run of the cell would
# climb one directory higher and break the `src` imports and the config path.
if not os.path.isdir("src"):
    os.chdir("../")

# import own libraries
from src.utils import load_data, load_config, train_val_split_by_group
from src.rolling_window_creator import RollingWindowDatasetCreator, calculate_RUL
from src.data_cleaning import identify_missing_values, identify_single_unique_features, format_dtype, clean_data
import src.nn_utils as nu
import src.transformer_fred as tff
# Seed every random number generator used in this notebook so runs are reproducible:
# numpy as before, and torch (model initialisation, DataLoader shuffling, dropout).
np.random.seed(42)
torch.manual_seed(42)

# Load config + Data

In [6]:
# Location of the project configuration (relative to the working directory).
PATH_TO_CONFIG = "configs/config.yaml"
# Parsed configuration; `load_config` returns it as a dict.
config = load_config(PATH_TO_CONFIG)

In [7]:
%%time
## choose the data set to work on via dataset_num
dataset_num = 1
# load train data, test data and the test RUL data of that data set
train_data, test_data, test_RUL_data = load_data(
    dataset_num=dataset_num,
    config_path=PATH_TO_CONFIG,
)

2024-05-31 15:24:56 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 1...[0m
2024-05-31 15:24:56 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 1.[0m
2024-05-31 15:24:56 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (20631, 26)[0m
2024-05-31 15:24:56 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (13096, 26)[0m
2024-05-31 15:24:56 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m
CPU times: user 57.7 ms, sys: 18.8 ms, total: 76.4 ms
Wall time: 76.5 ms


In [8]:
## quick look at the test dataset
# number of distinct unit numbers in the test set
n_test_units = test_data['UnitNumber'].nunique()
print(f"Number of unique unit numbers in test set: {n_test_units}")

# Number of recorded cycles per unit; the smallest of these matters when choosing the
# window size, because a unit with fewer cycles than the window cannot fill a window.
cycles_per_unit = test_data.groupby("UnitNumber")["Cycle"].count()
print("Min number of cycles in test set for a unit number: ", cycles_per_unit.min())

Number of unique unit numbers in test set: 100
Min number of cycles in test set for a unit number:  31


---
Test Data Cleaning Functionality and its impact on Rolling Window Creation

In [9]:
############################
## Finding: The Transformer model works better when it is given all variables,
##          even those that have no clear correlation to RUL.
##          It can still use the contextual information contained in complex time series.

# Clean train and test data. Outliers are handled with method='winsorize':
# no samples are dropped, the outlying values are replaced instead.
cleaned_train, cleaned_test = clean_data(
    train_data,
    test_data,
    method='winsorize',
    ignore_columns=['UnitNumber', 'Cycle'],
    threshold_missing=0.1,
    threshold_corr=0.0,
    contamination=0.05,
)

# calculate the RUL for the train data ...
cleaned_train_data = calculate_RUL(cleaned_train, group_column="UnitNumber", time_column="Cycle")
# ... and for the test data (using the separately provided test RUL data)
cleaned_test_data = nu.calculate_RUL_test(cleaned_test, test_RUL_data)

2024-05-31 15:24:56 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-05-31 15:24:56 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-05-31 15:24:56 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-31 15:24:56 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-31 15:24:56 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m
2024-05-31 15:24:56 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m
2024-05-31 15:24:56 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: winsorize ...[0m
2024-05-31 15:24:56 [[34msrc.outlier_detection:98[0m] [DEBUG[0m] >>>> Found 1031 outliers to be replaced (winsorized).[0m
2024-05-31 15:24:56 [[34msrc.outlier_detection:100[0m] [DEBUG[0m] >>>> Original DataFrame shape: (20631, 26), Resulting Da

In [10]:
############################
## Finding: The minimum sequence length per unit in the test datasets is
##          significantly smaller than in the train datasets
##          --> one explanation for the sometimes poorer performance on the test dataset
group_sizes = test_data.groupby('UnitNumber').size()

# min, max, mean and standard deviation of the number of rows per unit
min_size, max_size = group_sizes.min(), group_sizes.max()
mean_size, sd_size = group_sizes.mean(), group_sizes.std()

for label, value in (("Min", min_size), ("Max", max_size), ("Mean", mean_size), ("Sd", sd_size)):
    print(f"{label} group size: {value}")

Min group size: 31
Max group size: 303
Mean group size: 130.96
Sd group size: 53.593479175185195


# Hyper parameter search with Ray Tune

The hyperparameter search was done with Ray Tune on the cluster.
However, we were not fully satisfied with the library, so many tests and much of the fine-tuning were also
done manually.

In [None]:
############################
## Finding: The window length is an important hyperparameter.
##          The Transformer model needs a bigger window size than the CNN models.
##          A good window size is around 80 instead of 30.
##          We have the feeling that an even bigger window size would be better (140+), but it is limited
##          by the test dataset and its limited length.
##          Another aspect is the model size: only one layer is suitable due to size issues, and the
##          product window size * projection dim should not be too big.

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from ray import tune, train
from ray.tune.schedulers import ASHAScheduler
from ray.tune import CLIReporter
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler

# Import your necessary functions and modules (assuming nu module and TransformerModel are defined)
# from your_module import TransformerModel, TurbofanDataset, scale_data, create_sliding_window, train_val_split_by_group, count_parameters

# Define training function
def train_model(config, checkpoint_dir=None):
    """Ray Tune trainable: fit one Transformer model for a sampled hyperparameter set.

    Builds sliding windows from the notebook-level ``cleaned_train_data`` and
    ``cleaned_test_data``, trains with Adam and an MSE loss, and reports ``train_loss``
    and ``val_loss`` (sample-weighted mean of the batch MSE) to Ray after every epoch.

    Args:
        config: dict providing "window_size", "project_dim", "num_heads", "num_layers",
            "batch_size", "dropout" and "num_epochs".
        checkpoint_dir: not used inside this function.

    NOTE(review): the "validation" windows are built from the test data, so the search
    selects hyperparameters on the test set. Consider validating on held-out train
    units instead (e.g. via ``train_val_split_by_group``).
    """
    window_size = config["window_size"]
    project_dim = config["project_dim"]
    num_heads = config["num_heads"]

    # Cap for the RUL targets. The same value is used as threshold when thinning out
    # high-RUL samples, consistent with the final training cell of this notebook.
    # (The previous threshold of 140 lies above the cap; presumably no sample could
    # match it after clipping - verify against nu.cut_high_RUL.)
    rul_cap = 130
    # Predictions are rounded and floored at this value before the validation loss.
    min_rul = 1

    # Data preparation: scaling, sliding windows, target clipping and re-balancing
    train_data, val_data = nu.scale_data(cleaned_train_data, cleaned_test_data)
    X_train, y_train = nu.create_sliding_window(train_data, window_size=window_size)
    X_val, y_val = nu.create_sliding_window(val_data, window_size=window_size, typ = "test")
    y_train = np.clip(y_train, a_min=None, a_max=rul_cap)
    y_train, X_train = nu.cut_high_RUL(y_train, X_train, rul_cap, delete = 0.3)

    # Create datasets and dataloaders
    train_dataset = tff.TurbofanDataset(X_train, y_train)
    val_dataset = tff.TurbofanDataset(X_val, y_val)

    train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = tff.TransformerModel(feature_size=X_train.shape[2], num_heads=num_heads, num_layers=config["num_layers"], project_dim=project_dim, window_size=window_size, dropout = config["dropout"]).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    scheduler = StepLR(optimizer, step_size=30, gamma=0.5)

    for epoch in range(config["num_epochs"]):
        # ---- training pass ----
        model.train()
        running_loss = 0.0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            targets = targets.view(-1, 1)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            # weight the batch loss by the batch size so the epoch loss is a per-sample mean
            running_loss += loss.item() * inputs.size(0)
        train_loss = running_loss / len(train_loader.dataset)

        # ---- validation pass ----
        model.eval()
        running_loss = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                targets = targets.view(-1, 1)
                outputs = model(inputs)

                # Post-process predictions: round to whole cycles and floor at min_rul.
                # torch.clamp avoids mixing a CPU int64 scalar tensor with the
                # (possibly CUDA) float predictions, which some torch versions reject.
                outputs = torch.clamp(torch.round(outputs), min=min_rul)

                loss = criterion(outputs, targets)
                running_loss += loss.item() * inputs.size(0)
        val_loss = running_loss / len(val_loader.dataset)

        train.report({"val_loss":val_loss, "train_loss":train_loss})
        scheduler.step()

# `ray` itself is needed to inspect the cluster resources before starting the search
import ray

# Define search space and Ray Tune configuration
search_space = {
    "window_size": tune.choice([50, 60, 70, 80, 90, 100]),
    "project_dim": tune.choice([16*3*2, 16*3*3]),
    "num_heads": tune.choice([8, 12]),
    "num_layers": 1,
    "batch_size": 128,
    "dropout": tune.choice([0.12, 0.18]),
    "num_epochs": tune.choice([5, 8, 11, 13, 17, 19, 22])  # Reduced for quicker tuning
}

# Use ASHAScheduler for efficient hyperparameter search
scheduler = ASHAScheduler(
    metric="val_loss",
    mode="min",
    max_t=25,
    grace_period=5,
    reduction_factor=2
)

# Configure the reporter
reporter = CLIReporter(
    metric_columns=["val_loss", "train_loss", "training_iteration"]
)

# Request a GPU per trial only if the Ray cluster actually offers one. `train_model`
# falls back to the CPU on its own, but a trial that requests a GPU can never be
# scheduled on a cluster without GPUs.
ray.init(ignore_reinit_error=True)
gpus_per_trial = 1 if ray.cluster_resources().get("GPU", 0) >= 1 else 0

# Run hyperparameter search
result = tune.run(
    train_model,
    resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
    config=search_space,
    num_samples=50,
    scheduler=scheduler,
    progress_reporter=reporter
)

# Get the best trial (judged by the validation loss of its last reported epoch)
best_trial = result.get_best_trial("val_loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(best_trial.last_result["val_loss"]))

## Train the best model again from scratch and save the best result

In [15]:
##################################
## Hyperparameters
# Use the best Ray Tune trial if the search above was run in this kernel session.
# Otherwise fall back to the manually found configuration that worked well for
# datasets 1 and 3, so this cell does not depend on re-running the expensive search.
try:
    best_config = best_trial.config
except NameError:
    best_config = {
        "window_size": 80,
        "batch_size": 128,
        "num_heads": 8,
        "num_layers": 1,
        "project_dim": 120,
        "dropout": 0.18,
    }

##################################
## Data specifics
window_size = best_config["window_size"]
train_data, test_data = nu.scale_data(cleaned_train_data, cleaned_test_data)

# NOTE(review): no validation split is used here; the model is trained on all train
# units and the "best" epoch below is selected on the test set, so the reported test
# error is optimistic. A split by UnitNumber (train_val_split_by_group) would avoid that.
X_train, y_train = nu.create_sliding_window(train_data, window_size = window_size)

# Cap the RUL at 130 (as done in the papers), because in the early stage everything looks the same
y_train = np.clip(y_train, a_min=None, a_max=130)
# the cap causes an overrepresentation of high RUL values in the train data --> randomly delete some of them
y_train, X_train = nu.cut_high_RUL(y_train, X_train, 130, delete = 0.3)

X_test, y_test = nu.create_sliding_window(test_data, typ = "test", window_size = window_size)

##################################
## Model specifics
seq_len, batch_size, feature_size = X_train.shape[1], best_config["batch_size"], X_train.shape[2]
num_heads, num_layers, project_dim  = best_config["num_heads"], best_config["num_layers"], best_config["project_dim"]
num_epochs = 40
learning_rate = 0.0001
# set to True to write every new best model to disk (off by default, as before)
save_best_model = False

print(seq_len)
# Create dataset and dataloaders
train_dataset = tff.TurbofanDataset(X_train, y_train)
test_dataset = tff.TurbofanDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize model, criterion, optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = tff.TransformerModel(feature_size, num_heads, num_layers, project_dim = project_dim, window_size = seq_len, dropout = best_config["dropout"]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

## the scheduler no longer plays a role in the later experiments
scheduler = StepLR(optimizer, step_size=30, gamma=0.5)

print(f"The model has in total {tff.count_parameters(model)} parameters!!")

# Lowest test loss (MSE) seen so far. Starting at infinity guarantees that the first
# epoch always registers as the current best, whatever the scale of the loss is.
prev_acc = float("inf")
# Training loop; keeps track of the best model on the fly
for epoch in range(num_epochs):
    train_loss = tff.train_model(model, train_loader, criterion, optimizer, device)
    test_loss = tff.evaluate_model(model, test_loader, criterion, device)
    scheduler.step()

    if prev_acc > test_loss:
        # the file name carries the (truncated) test MSE of the saved model
        save_path = f"tranM_df_{dataset_num}_{int(test_loss)}.pth"
        # test_loss is an MSE; report the RMSE explicitly instead of mislabelling the MSE
        print(f"new best test MSE: {test_loss:.2f} (RMSE: {np.sqrt(test_loss):.2f})")
        prev_acc = test_loss
        if save_best_model:
            torch.save(model, save_path)

    print(f"Epoch {epoch+1}/{num_epochs}, Train_L: {train_loss:.2f}, Test_L: {test_loss:.2f}, Test_RMSE: {np.sqrt(test_loss):.2f} ")

80
The model has in total 2850721 parameters!!
new best RMSE: 929.6300048828125
Epoch 1/100, Train_L: 2281.26, Test_L: 929.63, Test_RMSE: 30.49 
Epoch 2/100, Train_L: 496.54, Test_L: 1497.66, Test_RMSE: 38.70 
Epoch 3/100, Train_L: 222.20, Test_L: 1635.33, Test_RMSE: 40.44 
Epoch 4/100, Train_L: 167.65, Test_L: 1492.91, Test_RMSE: 38.64 
Epoch 5/100, Train_L: 137.30, Test_L: 1508.10, Test_RMSE: 38.83 
Epoch 6/100, Train_L: 120.33, Test_L: 1075.27, Test_RMSE: 32.79 
new best RMSE: 819.8800048828125
Epoch 7/100, Train_L: 110.36, Test_L: 819.88, Test_RMSE: 28.63 
Epoch 8/100, Train_L: 99.14, Test_L: 1149.84, Test_RMSE: 33.91 
new best RMSE: 696.4199829101562
Epoch 9/100, Train_L: 89.77, Test_L: 696.42, Test_RMSE: 26.39 
Epoch 10/100, Train_L: 82.87, Test_L: 774.73, Test_RMSE: 27.83 
Epoch 11/100, Train_L: 81.13, Test_L: 714.82, Test_RMSE: 26.74 
new best RMSE: 488.6600036621094
Epoch 12/100, Train_L: 76.54, Test_L: 488.66, Test_RMSE: 22.11 
Epoch 13/100, Train_L: 76.19, Test_L: 555.40, Te

KeyboardInterrupt: 

In [20]:
##############################
## Finding: a configuration that works well for df1 and df3
window_size = 80
num_heads, num_layers, project_dim = 8, 1, 120
batch_size, num_epochs = 128, 100
learning_rate, drop_out = 0.0001, 0.18
# Sequence length and number of features are read from the windows built earlier.
# NOTE(review): X_train must have been created with the window size set above,
# otherwise seq_len and window_size disagree.
seq_len, feature_size = X_train.shape[1], X_train.shape[2]

## Load best Transformer Models for DF1 and DF3

In [19]:
## load best models for dataset 1 and 3
PATH_DF1 = "models/tranM_df_1_351.pth"
PATH_DF3= "models/tranM_df_3_476.pth"

# Map the stored tensors onto whatever device is available here, so checkpoints
# written on a GPU machine can also be loaded on a CPU-only machine.
load_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The files presumably contain whole pickled model objects (torch.save(model, path)),
# which recent PyTorch versions refuse to load unless weights_only=False is passed.
# SECURITY: this unpickles arbitrary Python objects - only load files you trust.
loaded_model_1 = torch.load(PATH_DF1, map_location=load_device, weights_only=False)
loaded_model_3 = torch.load(PATH_DF3, map_location=load_device, weights_only=False)