In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on given sensor data (time series data). It is a regression problem.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
1. Perform a deep **exploratory data analysis (EDA)** on the given data.
2. Implement a more efficient **sliding window method** for time series data analysis.
3. Apply **traditional machine learning methods** (SOTA) to predict the remaining useful life. Includes data preparation, feature extraction, feature selection, model selection, and model parameter optimization. -> 🎯 **Focus on this task** data preparation and feature selection (feature extraction part of sliding window method).
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.


# Imports + Settings

In [3]:
# third-party libraries
import pandas as pd
import numpy as np
import os
from typing import List, Union
import time
from tqdm.notebook import tqdm
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from scipy import stats
from scipy.stats import multivariate_normal, zscore
from scipy.stats._mstats_basic import winsorize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from torch.optim.lr_scheduler import StepLR

In [4]:
# source code
os.chdir("../") # set working directory to root of project
#os.getcwd() # check current working directory

from src.utils import load_data, load_config, train_val_split_by_group
from src.rolling_window_creator import RollingWindowDatasetCreator, calculate_RUL
from src.data_cleaning import identify_missing_values, identify_single_unique_features, format_dtype, clean_data
import src.nn_utils as nu

In [5]:
# settings
sns.set_style("whitegrid")
sns.set_palette("Set2")
sns.set(rc={"figure.dpi":100, 'savefig.dpi':200})
sns.set_context('notebook')

In [6]:
np.random.seed(42)

# Paths

In [7]:
PATH_TO_CONFIG = "configs/config.yaml"

# Load config + Data

In [8]:
config = load_config(PATH_TO_CONFIG) # config is dict

In [9]:
%%time
train_data, test_data, test_RUL_data = load_data(config_path=PATH_TO_CONFIG, dataset_num=1)

2024-05-24 14:30:51 [[34msrc.utils:60[0m] [[32mINFO[0m] >>>> Loading data set 1...[0m
2024-05-24 14:30:51 [[34msrc.utils:89[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 1.[0m
2024-05-24 14:30:51 [[34msrc.utils:90[0m] [[32mINFO[0m] >>>> Train Data: (20631, 26)[0m
2024-05-24 14:30:51 [[34msrc.utils:91[0m] [[32mINFO[0m] >>>> Test Data: (13096, 26)[0m
2024-05-24 14:30:51 [[34msrc.utils:92[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m
CPU times: user 62.6 ms, sys: 12.8 ms, total: 75.4 ms
Wall time: 75.5 ms


In [10]:
# count unit numbers in test set
print(f"Number of unique unit numbers in test set: {test_data['UnitNumber'].nunique()}")
# count min number of cycles in test set for each unit number --> window size must be in the range of these values, for example a window size of 10 would be too large if there is a unit number with only 10 cycles
print("Min number of cycles in test set for a unit number: ", test_data.groupby("UnitNumber")["Cycle"].count().min())

Number of unique unit numbers in test set: 100
Min number of cycles in test set for a unit number:  31


---
Test Data Cleaning Functionality and its impact on Rolling Window Creation

In [11]:
def calculate_RUL_test(test_data, RUL_data):
    RUL = []
    for i in RUL_data.iterrows():
        unit_num = i[0]
        val = i[1]["RUL"]
        tmp = test_data[test_data["UnitNumber"] == unit_num + 1]
        li = list(range(val + len(tmp) - 1, val - 1, -1))
        for j in li:
            RUL.append(j)
        assert RUL[-1] == val
    assert len(RUL) == len(test_data)
    test_data["RUL"] = RUL
    return test_data

In [12]:
# clean data (with outlier removal, where no samples are dropped but the outliers are replaced, method='winsorize')
# TODO: outsource settings to config file
cleaned_train, cleaned_test = clean_data(train_data, test_data, method='winsorize', ignore_columns=['UnitNumber', 'Cycle'], threshold_missing=0.1, threshold_corr=0.3, contamination=0.05)

2024-05-24 14:30:52 [[34msrc.data_cleaning:134[0m] [[32mINFO[0m] >>>> Cleaning train and test data...[0m
2024-05-24 14:30:52 [[34msrc.data_cleaning:136[0m] [[32mINFO[0m] >>>> Formatting column types...[0m
2024-05-24 14:30:52 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-24 14:30:52 [[34msrc.data_cleaning:69[0m] [DEBUG[0m] >>>> Found 0 categorical columns: [][0m
2024-05-24 14:30:52 [[34msrc.data_cleaning:141[0m] [[32mINFO[0m] >>>> Handling duplicates...[0m
2024-05-24 14:30:52 [[34msrc.data_cleaning:146[0m] [[32mINFO[0m] >>>> Removing outliers...[0m
2024-05-24 14:30:52 [[34msrc.outlier_detection:150[0m] [DEBUG[0m] >>>> Removing outliers using method: winsorize ...[0m
2024-05-24 14:30:52 [[34msrc.outlier_detection:98[0m] [DEBUG[0m] >>>> Found 1031 outliers to be replaced (winsorized).[0m
2024-05-24 14:30:52 [[34msrc.outlier_detection:100[0m] [DEBUG[0m] >>>> Original DataFrame shape: (20631, 26), Resulting Da

In [13]:
cleaned_train_data = calculate_RUL(cleaned_train, time_column= "Cycle", group_column= "UnitNumber")
cleaned_test_data = calculate_RUL_test(cleaned_test, test_RUL_data)

In [14]:
cleaned_train_data

Unnamed: 0,UnitNumber,Cycle,Sensor Measure 2,Sensor Measure 3,Sensor Measure 4,Sensor Measure 7,Sensor Measure 8,Sensor Measure 9,Sensor Measure 11,Sensor Measure 12,Sensor Measure 13,Sensor Measure 14,Sensor Measure 15,Sensor Measure 17,Sensor Measure 20,Sensor Measure 21,RUL
0,1,1,641.92,1589.70,1400.60,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.4190,192
1,1,2,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.00,23.4236,191
2,1,3,642.35,1587.99,1404.20,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,391,38.95,23.3442,190
3,1,4,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.15,522.50,2388.08,8133.83,8.3859,392,38.88,23.3739,189
4,1,5,642.37,1582.85,1406.22,554.00,2388.06,9055.15,47.28,522.19,2388.04,8133.80,8.4294,393,38.90,23.4044,188
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,643.49,1597.98,1425.67,551.74,2388.19,9065.52,48.05,520.04,2388.23,8137.60,8.4956,396,38.49,23.0934,5
20627,100,197,643.54,1601.47,1425.67,551.74,2388.22,9065.11,48.04,520.04,2388.22,8136.50,8.5110,395,38.49,23.1594,4
20628,100,198,643.42,1601.47,1425.67,551.74,2388.22,9065.90,48.05,520.04,2388.23,8141.05,8.5110,396,38.49,23.0934,3
20629,100,199,643.23,1601.47,1425.67,551.74,2388.22,9073.72,48.05,520.04,2388.23,8139.29,8.5110,395,38.49,23.0934,2


In [15]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (seq_len, batch_size, feature_size)
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, feature_size: int, num_heads: int, num_layers: int, project_dim : int, window_size: int = 30, dropout: float = 0.05):
        super(TransformerModel, self).__init__()
        self.feature_size = feature_size
        self.project_dim = project_dim
        #self.intermediate_dim = intermediate_dim
        # pseudo emb
        self.project_emb = nn.Linear(feature_size, project_dim)
        
        # Positional Encoding
        self.positional_encoding = PositionalEncoding(project_dim, dropout)
        
        # Transformer Encoder
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=project_dim, nhead=num_heads, dropout=dropout, batch_first = True)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        
        # Fully Connected layers to output
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(window_size * project_dim, 512)  # First fully connected layer
        self.fc2 = nn.Linear(512, 64)       # Second fully connected layer 
        self.fc3 = nn.Linear(64, 1)  

        self.act = nn.ReLU()
        #self.intermediate = nn.Linear(project_dim, intermediate)
        # Output layer
        #self.fc_out = nn.Linear(intermediate_dim, 1) 
        
    def forward(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        """
        Args:
            x: Tensor of shape (seq_len, batch_size, feature_size)
            mask: Optional mask of shape (seq_len, seq_len)
            
        Returns:
            out: Tensor of shape (batch_size, 1) for regression
        """
        #print(f"Input dim: {x.shape}")
        # Pseudo projection
        x = self.project_emb(x)
        #print(f"Projection dim: {x.shape}")
        
        # Add positional encoding
        x = self.positional_encoding(x)# (seq_len, batch_size, feature_size)
        x = x.to(torch.float32)
        #print(f"Positional dim: {x.shape}")
        
        # Pass through transformer encoder
        x = self.transformer_encoder(x, mask)  # (seq_len, batch_size, feature_size)
        #print(x.var())
        #print(f"Transformer dim: {x.shape}")

        x = self.flatten(x)
        #print(f"Flatten: {x.shape}")
        x = self.fc1(x)
        x = self.act(x)
        #print(f"FC 1: {x.shape}")
        x = self.fc2(x)
        x = self.act(x)
        #print(f"FC2: {x.shape}")
        out = self.fc3(x)
        
        # Take the mean across the sequence length dimension
        #x = torch.mean(x, dim=1)# (batch_size, feature_size)
        #print(x.var())
        #print(f"Mean dim: {x.shape}")
        #print(f"After Median: {x.shape}")
        # Output layer
        #out = self.fc_out(x)  # (batch_size, 1)
        #print(f"Out dim: {out.shape}")
        #print(out.var())

        return out
    
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split

# Example dataset class
class TurbofanDataset(Dataset):
    def __init__(self, data, targets):
        self.data = torch.from_numpy(data).to(torch.float32)
        self.targets = torch.from_numpy(targets).to(torch.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.targets[idx]

# Training function
def train_model(model, dataloader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    count = 0
    for inputs, targets in dataloader:
        inputs, targets = inputs.to(device), targets.to(device)
        
        #if count % 44 == 0:
        #    print(f"--> {count}/{len(dataloader)}")
        count += 1
        optimizer.zero_grad()
        
        outputs = model(inputs)
        targets = targets.view(-1, 1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * inputs.size(0)
    
    epoch_loss = running_loss / len(dataloader.dataset)
    return epoch_loss

# Evaluation function
def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    running_loss = 0.0

    count = 0
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            targets = targets.view(-1, 1)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            running_loss += loss.item() * inputs.size(0)
            #if count == 1 or count == 10:
                #print(count)
                #print(outputs[:10], targets[:10])
            count += 1
    
    epoch_loss = running_loss / len(dataloader.dataset)
    return epoch_loss

In [18]:
## create dataset
window_size = 160
train_data = nu.scale_data(cleaned_train_data)
#train, val = train_val_split_by_group(train_data)

X_train, y_train = nu.create_sliding_window(train_data, window_size = window_size)
#X_val, y_val = nu.create_sliding_window(val, window_size = window_size)

test_data = nu.scale_data(cleaned_test_data)
X_test, y_test = nu.create_sliding_window(test_data, typ = "test", window_size = window_size)

2024-05-24 15:02:05 [[34msrc.utils:131[0m] [[32mINFO[0m] >>>> Train set contains 82 different engines --> in total 16807[0m
2024-05-24 15:02:05 [[34msrc.utils:132[0m] [[32mINFO[0m] >>>>  Test set contains 18 different engines --> in total 3824[0m


In [None]:
'window_size': 160, 'project_dim': 192, 'num_heads': 8, 'num_layers': 1, 'batch_size': 64, 'num_epochs': 175

In [None]:
# Example data (replace with actual data loading)
seq_len, batch_size, feature_size = X_train.shape[1], 64, X_train.shape[2]
num_heads, num_layers, project_dim  = 8, 1, 192 #12 * 4 * 2
num_epochs = 450
learning_rate = 0.0001

print(seq_len)
# Create dataset and dataloaders
train_dataset = nu.TurbofanDataset(X_train, y_train)
val_dataset = nu.TurbofanDataset(X_val, y_val)
test_dataset = nu.TurbofanDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
# Initialize model, criterion, optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerModel(feature_size, num_heads, num_layers, project_dim = project_dim, window_size = seq_len).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = StepLR(optimizer, step_size=20, gamma=0.5)


print(f"The model has in total {count_parameters(model)} parameters!!")
    
# Training loop
for epoch in range(num_epochs):
    train_loss = train_model(model, train_loader, criterion, optimizer, device)
    val_loss = evaluate_model(model, val_loader, criterion, device)
    test_loss = evaluate_model(model, test_loader, criterion, device)
    scheduler.step()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Train_L: {train_loss:.2f}, Val_L: {val_loss:.2f}, VAl_RMSE: {np.sqrt(val_loss):.2f}, Test_L: {test_loss:.2f}, Test_RMSE: {np.sqrt(test_loss):.2f} ")

130
The model has in total 7290593 parameters!!
Epoch 1/450, Train_L: 1959.89, Val_L: 2190.69, VAl_RMSE: 46.80, Test_L: 2872.58, Test_RMSE: 53.60 
Epoch 2/450, Train_L: 1706.62, Val_L: 2217.32, VAl_RMSE: 47.09, Test_L: 3102.86, Test_RMSE: 55.70 
Epoch 3/450, Train_L: 1710.26, Val_L: 2203.09, VAl_RMSE: 46.94, Test_L: 3008.56, Test_RMSE: 54.85 
Epoch 4/450, Train_L: 1708.03, Val_L: 2190.14, VAl_RMSE: 46.80, Test_L: 2864.58, Test_RMSE: 53.52 
Epoch 5/450, Train_L: 1710.28, Val_L: 2192.77, VAl_RMSE: 46.83, Test_L: 2908.32, Test_RMSE: 53.93 
Epoch 6/450, Train_L: 1708.43, Val_L: 2197.46, VAl_RMSE: 46.88, Test_L: 2960.86, Test_RMSE: 54.41 
Epoch 7/450, Train_L: 1708.94, Val_L: 2196.04, VAl_RMSE: 46.86, Test_L: 2946.72, Test_RMSE: 54.28 
Epoch 8/450, Train_L: 1708.87, Val_L: 2188.49, VAl_RMSE: 46.78, Test_L: 2819.87, Test_RMSE: 53.10 
Epoch 9/450, Train_L: 1717.11, Val_L: 2189.35, VAl_RMSE: 46.79, Test_L: 2848.03, Test_RMSE: 53.37 
Epoch 10/450, Train_L: 1704.89, Val_L: 2189.84, VAl_RMSE: 46.

In [None]:
# Evaluation function
def get_predictions(model, dataloader, criterion, device):
    pred = []
    tar  = []
    model.eval()
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            targets = targets.view(-1, 1).cpu()
            outputs = model(inputs).cpu()
            for i in outputs:
                pred.append(i.detach())
            for j in targets:
                tar.append(j.detach())
    return pred, tar

In [None]:
#pred, targets = get_predictions(model, test_loader, criterion, device)
pred, targets = get_predictions(model, val_loader, criterion, device)

In [None]:
plt.figure(figsize=(12,5))
plt.plot((pred[:3000]), label="Prediction")
plt.plot((targets[:3000]), label="Reale RUL")
plt.legend()
plt.show()

In [None]:
test_data = calculate_RUL_test(test_data, test_RUL_data)

In [None]:
test_data[test_data["UnitNumber"] == 2]

In [None]:
# --> 431: epoch 398, ohne scheduler
seq_len, batch_size, feature_size = X_train.shape[1], 32, X_train.shape[2]
num_heads, num_layers, project_dim  = 12, 2, 12 * 4 * 2
num_epochs = 350
learning_rate = 0.0001

In [None]:
# --> 193: 817
seq_len, batch_size, feature_size = X_train.shape[1], 32, X_train.shape[2]
num_heads, num_layers, project_dim  = 12, 2, 12 * 4 * 2
num_epochs = 450
learning_rate = 0.0005
scheduler = StepLR(optimizer, step_size=30, gamma=0.75)

In [None]:
# Example data (replace with actual data loading)
seq_len, batch_size, feature_size = X_train.shape[1], 64, X_train.shape[2]
num_heads, num_layers, project_dim  = 16, 1, 12 * 4 * 2
num_epochs = 450
learning_rate = 0.0001