In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on the given sensor data (time series data). It is a forecasting problem, where the goal is to predict the number of cycles an engine will last before it fails.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
1. Perform a deep **exploratory data analysis (EDA)** on the given data.
2. Implement a more efficient **sliding window method** for time series data analysis. -> 🎯 **Focus on this task**
3. Apply **traditional machine learning methods** (SOTA) to predict the remaining useful life. Includes data preparation, feature extraction, feature selection, model selection, and model parameter optimization.
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.


# Imports + Settings

In [3]:
# third-party libraries
#os.getcwd() # check current working directory
import pandas as pd
import numpy as np
import os

import time
from tqdm.notebook import tqdm

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# settings
# Everything is configured in one call: a bare `sns.set(rc=...)` issued after
# `sns.set_style` / `sns.set_palette` re-applies seaborn's default style and
# palette, which silently discarded the "whitegrid" / "Set2" choices.
sns.set(
    context="notebook",
    style="whitegrid",
    palette="Set2",
    rc={"figure.dpi": 100, "savefig.dpi": 200},
)

In [5]:
np.random.seed(42)

# Paths

In [6]:
# Set the working directory to the project root (the parent of the folder the
# notebook is started from). The guard makes the cell idempotent: without it,
# every re-run climbs one directory higher. `src/` is used as the marker of
# the project root because the project code is imported from that package.
if not os.path.isdir("src"):
    os.chdir("../")
#os.getcwd() # check current working directory

In [8]:
# source code
from src.utils import load_data, load_config, train_val_split_by_group
import src.nn_utils as nu
from src.utils import load_data, load_config
from src.rolling_window_creator import RollingWindowDatasetCreator, calculate_RUL
from src.data_cleaning import identify_missing_values, identify_single_unique_features, format_dtype, clean_data
import src.nn_utils as nu
PATH_TO_CONFIG = "configs/config.yaml"

# Load Config + Data

In [9]:
config = load_config(PATH_TO_CONFIG) # config is dict

In [10]:
# Load dataset number 1; the helper returns three objects, unpacked here as the
# training data, the test data and the RUL values belonging to the test data.
train_data, test_data, test_data_RUL = load_data(config_path=PATH_TO_CONFIG, dataset_num=1)
# Compute the RUL for the training data per "UnitNumber" along "Cycle".
# NOTE(review): presumably this adds a "RUL" column that reaches 1 on an engine's
# last cycle (create_sliding_window asserts exactly that) - confirm in calculate_RUL.
train_data = calculate_RUL(train_data, time_column= "Cycle", group_column= "UnitNumber")

# Disabled: split by group into train / validation data.
#train_data, val_data = train_val_split_by_group(train_data)

# 📍 << Subtask NN: Try out NN Architecture >>

[TEMPLATE]

Findings:
* Interpretation of plots
* or other key take aways from previous code

In [11]:
train_data.head()

In [12]:
## Scale Data Min Max
from sklearn.preprocessing import MinMaxScaler
def scale_data(df):
    """
    Min-max scale all float columns of ``df`` (MinMaxScaler defaults, i.e. [0, 1]).

    The scaler is fitted on ``df`` itself and the scaled values are written
    back into ``df`` (the input is modified in place). Columns of any other
    dtype are left untouched.

    Args:
        df (pandas.DataFrame): Frame whose float columns are rescaled.

    Returns:
        pandas.DataFrame: The same ``df`` object, with float columns scaled.
    """
    float_columns = df.select_dtypes(include=float).columns.tolist()

    ## MinMaxScaler cannot be fitted on zero columns or zero rows, so a frame
    ## with nothing to scale is handed back unchanged instead of raising.
    if not float_columns or len(df) == 0:
        return df

    ## NOTE(review): a float-typed target column (e.g. "RUL") would be scaled
    ## as well - confirm the dtypes of the incoming frame.
    df[float_columns] = MinMaxScaler().fit_transform(df[float_columns])

    return df

## create sliding window
def create_sliding_window(df, window_size = 30, drop_columns = ["UnitNumber", "Cycle", "RUL"]):
    """
    Cut the time series of every engine into overlapping windows.

    For each distinct "UnitNumber", a window of ``window_size`` consecutive
    rows is moved over that engine's rows one row at a time. The target of a
    window is the "RUL" value of the window's last row. Engines with fewer
    than ``window_size`` rows contribute no windows.

    Args:
        df (pandas.DataFrame): Data with the columns "UnitNumber" and "RUL"
            plus every column named in ``drop_columns``.
        window_size (int): Number of consecutive rows per window.
        drop_columns (list): Columns removed from the window features. The
            default list is only read, never modified.

    Returns:
        tuple: ``(X, y)`` where ``X`` stacks the windows
        (n_windows, window_size, n_features) and ``y`` holds one target per window.

    Raises:
        AssertionError: If the last window of an engine does not end on RUL 1.
    """
    number_engines = df["UnitNumber"].unique()
    X, y = [], []

    for engine in number_engines:

        ## all rows that belong to the current engine
        temp = df[df["UnitNumber"] == engine]
        assert temp["UnitNumber"].unique() == engine

        ## slide the window over this engine's rows
        for i in range(len(temp) - window_size + 1):

            X_temp = temp.iloc[i : (i + window_size)].drop(columns = drop_columns)
            Y_temp = temp.iloc[(i + window_size - 1)]["RUL"]
            ## compare against window_size (this used to be a hard-coded 30,
            ## which made every other window size fail)
            assert len(X_temp) == window_size
            X.append(X_temp.to_numpy())
            y.append(Y_temp)
            ## the last window ends on the engine's final row, whose RUL must be 1
            if i == (len(temp) - window_size):
                assert Y_temp == 1
    X = np.array(X)
    y = np.array(y)

    return X, y

In [13]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def scale_data(df):
    """
    Scales the float columns in the DataFrame using MinMaxScaler.

    The scaler is fitted on ``df`` itself with its default settings (range
    [0, 1]) and the result is written back into ``df``, so the input frame is
    modified in place. Non-float columns are not touched.

    Args:
        df (pandas.DataFrame): Input DataFrame.

    Returns:
        pandas.DataFrame: The same DataFrame object with scaled float columns.
    """
    # Select float columns
    float_columns = df.select_dtypes(include=float).columns.tolist()

    # Nothing to scale: MinMaxScaler raises when given zero columns or zero
    # rows, so such a frame is returned as it is.
    if not float_columns or len(df) == 0:
        return df

    # NOTE(review): a float-typed target column (e.g. "RUL") would be scaled
    # too - confirm the dtypes of the incoming frame.
    scaler = MinMaxScaler()

    # Scale the data and update the DataFrame with the scaled values
    df[float_columns] = scaler.fit_transform(df[float_columns])

    return df

def create_sliding_window(df, window_size=30, drop_columns=["UnitNumber", "Cycle", "RUL"]):
    """
    Builds overlapping fixed-length windows per engine for time series prediction.

    Every engine ("UnitNumber") is processed on its own: a window of
    ``window_size`` consecutive rows is shifted by one row at a time, and the
    "RUL" of the window's last row becomes the target of that window.

    Args:
        df (pandas.DataFrame): Input DataFrame containing time series data.
        window_size (int): Number of rows per window.
        drop_columns (list): Columns excluded from the window features.

    Returns:
        tuple: ``(X, y)`` - the stacked windows and the matching targets.
    """
    windows, targets = [], []

    for unit in df["UnitNumber"].unique():
        # Restrict the frame to the rows of this engine
        unit_rows = df[df["UnitNumber"] == unit]
        assert unit_rows["UnitNumber"].unique() == unit

        n_windows = len(unit_rows) - window_size + 1
        for start in range(n_windows):
            stop = start + window_size

            # Features of the window and the RUL found on its last row
            features = unit_rows.iloc[start:stop].drop(columns=drop_columns)
            target = unit_rows.iloc[stop - 1]["RUL"]
            assert len(features) == window_size

            windows.append(features.to_numpy())
            targets.append(target)

            # The final window ends on the engine's last row, where RUL must be 1
            if start == n_windows - 1:
                assert target == 1

    return np.array(windows), np.array(targets)


In [14]:
# Min-max scale the float columns (scale_data writes into train_data in place and
# returns it), then cut every engine's series into windows with the helper's
# defaults (window_size=30). X_train stacks the windows as
# (n_windows, window_size, n_features); y_train holds one RUL target per window.
train_data = scale_data(train_data)
X_train, y_train = create_sliding_window(train_data)

In [72]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from tqdm import tqdm

class PositionalEncoding(nn.Module):
    """
    Adds fixed sinusoidal position information to a sequence, then applies dropout.

    The encoding table is computed once in the constructor and stored as the
    non-trainable buffer ``pe`` of shape (max_len, 1, d_model): even feature
    indices hold sines, odd feature indices hold cosines.

    Args:
        d_model (int): Size of the feature dimension (even or odd).
        dropout (float): Dropout probability applied after adding the encoding.
        max_len (int): Number of positions for which encodings are precomputed.
    """
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one odd-indexed column less than there are
        # frequencies; trimming avoids a shape mismatch in that case.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(1)  # (max_len, 1, d_model), broadcasts over the batch
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (seq_len, batch_size, feature_size)

        Returns:
            Tensor of the same shape as ``x``.
        """
        # Only the first seq_len positions of the table are needed.
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """
    Transformer-encoder regressor producing one value per sequence.

    The input is enriched with positional encodings, passed through a stack of
    encoder layers, averaged over the sequence dimension and mapped to a
    single output by a linear layer. The encoder layers are built without
    ``batch_first``, so the input layout is (seq_len, batch_size, feature_size).

    Args:
        feature_size: Size of the feature dimension; also used as the model
            width of the encoder layers.
        num_heads: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dropout: Dropout probability for the positional encoding and the
            encoder layers.
    """
    def __init__(self, feature_size: int, num_heads: int, num_layers: int, dropout: float = 0.1):
        super().__init__()
        self.feature_size = feature_size

        # Position information is added before the encoder stack
        self.positional_encoding = PositionalEncoding(feature_size, dropout)

        # Encoder stack operating directly on the raw feature dimension
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=feature_size,
            nhead=num_heads,
            dropout=dropout,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            self.encoder_layer,
            num_layers=num_layers,
        )

        # Regression head
        self.fc_out = nn.Linear(feature_size, 1)

    def forward(self, x: torch.Tensor, mask: torch.Tensor = None) -> torch.Tensor:
        """
        Args:
            x: Tensor of shape (seq_len, batch_size, feature_size)
            mask: Optional mask of shape (seq_len, seq_len)

        Returns:
            Tensor of shape (batch_size, 1) for regression
        """
        # Positional encoding first, then make sure the encoder sees float32
        encoded = self.positional_encoding(x).to(torch.float32)

        # (seq_len, batch_size, feature_size) in, same shape out
        encoded = self.transformer_encoder(encoded, mask)

        # Average over the sequence dimension -> (batch_size, feature_size)
        pooled = encoded.mean(dim=0)

        # Linear head -> (batch_size, 1)
        return self.fc_out(pooled)

In [77]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split

# Example dataset class
class TurbofanDataset(Dataset):
    """
    Map-style dataset over two NumPy arrays, both stored as float32 tensors.

    Args:
        data: NumPy array of samples, indexed along its first axis.
        targets: NumPy array with one target per sample.
    """
    def __init__(self, data, targets):
        # `.float()` converts to torch.float32
        self.data = torch.from_numpy(data).float()
        self.targets = torch.from_numpy(targets).float()

    def __len__(self):
        """Number of samples (length of the first axis of ``data``)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(sample, target)`` pair stored at position ``idx``."""
        sample = self.data[idx]
        target = self.targets[idx]
        return sample, target

# Training function
def train_model(model, dataloader, criterion, optimizer, device, seq_first_model=True, log_every=44):
    """
    Train ``model`` for one epoch and return the mean loss per sample.

    Args:
        model: Network to train.
        dataloader: Yields ``(inputs, targets)`` batches with the batch on the
            first axis, i.e. inputs of shape (batch, seq_len, features).
        criterion: Loss function called as ``criterion(outputs, targets)``.
        optimizer: Optimizer holding the parameters of ``model``.
        device: Device the batches are moved to.
        seq_first_model (bool): If True (default), inputs are transposed to
            (seq_len, batch, features) before the forward pass, which is the
            layout a sequence-first transformer expects. Pass False for a
            model that consumes batch-first input.
        log_every (int): Print the progress every ``log_every`` batches;
            0 or None disables the printout.

    Returns:
        float: Sum of the batch losses weighted by batch size, divided by the
        number of samples in ``dataloader.dataset``.
    """
    model.train()
    running_loss = 0.0
    num_batches = len(dataloader)

    for count, (inputs, targets) in enumerate(dataloader):
        inputs, targets = inputs.to(device), targets.to(device)

        if log_every and count % log_every == 0:
            print(f"--> {count}/{num_batches}")

        # The loader stacks samples along axis 0. Feeding that to a
        # sequence-first model would make it treat the batch as the sequence
        # (and pool over the batch), so the first two axes are swapped.
        if seq_first_model:
            inputs = inputs.transpose(0, 1).contiguous()

        optimizer.zero_grad()

        outputs = model(inputs)
        # Give the outputs the exact shape of the targets: comparing (batch, 1)
        # with (batch,) would silently broadcast to (batch, batch) in the loss.
        loss = criterion(outputs.reshape(targets.shape), targets)
        loss.backward()
        optimizer.step()

        # Weight by the number of samples in this batch
        running_loss += loss.item() * targets.size(0)

    epoch_loss = running_loss / len(dataloader.dataset)
    return epoch_loss

# Evaluation function
def evaluate_model(model, dataloader, criterion, device, seq_first_model=True):
    """
    Evaluate ``model`` on ``dataloader`` and return the mean loss per sample.

    The model is switched to eval mode and no gradients are recorded.

    Args:
        model: Network to evaluate.
        dataloader: Yields ``(inputs, targets)`` batches with the batch on the
            first axis, i.e. inputs of shape (batch, seq_len, features).
        criterion: Loss function called as ``criterion(outputs, targets)``.
        device: Device the batches are moved to.
        seq_first_model (bool): If True (default), inputs are transposed to
            (seq_len, batch, features) before the forward pass, which is the
            layout a sequence-first transformer expects. Pass False for a
            model that consumes batch-first input.

    Returns:
        float: Sum of the batch losses weighted by batch size, divided by the
        number of samples in ``dataloader.dataset``.
    """
    model.eval()
    running_loss = 0.0

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)

            # The loader stacks samples along axis 0; a sequence-first model
            # needs the sequence axis in front instead.
            if seq_first_model:
                inputs = inputs.transpose(0, 1).contiguous()

            outputs = model(inputs)
            # Match the target shape so the loss cannot broadcast
            # (batch, 1) against (batch,) into a (batch, batch) comparison.
            loss = criterion(outputs.reshape(targets.shape), targets)

            # Weight by the number of samples in this batch
            running_loss += loss.item() * targets.size(0)

    epoch_loss = running_loss / len(dataloader.dataset)
    return epoch_loss

# Main function to execute training
if __name__ == "__main__":
    # Seed torch so that the random split, the weight initialisation, the
    # batch shuffling and dropout are repeatable when this cell is re-run.
    torch.manual_seed(42)

    # Shapes are taken from the windowed training array
    # (n_windows, seq_len, feature_size); the remaining values are hyper-parameters.
    seq_len, batch_size, feature_size = X_train.shape[1], 32, X_train.shape[2]
    num_heads, num_layers = 4, 2
    num_epochs = 20
    learning_rate = 0.001

    # Create dataset and dataloaders
    dataset = TurbofanDataset(X_train, y_train)
    # NOTE(review): only 10% of the windows are used for training and 90% for
    # validation - confirm this is intended (e.g. for quick experiments).
    # NOTE(review): the split is made over individual windows, so overlapping
    # windows of one engine can land on both sides; a split by engine would
    # avoid that.
    train_size = int(0.1 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model, criterion, optimizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TransformerModel(feature_size, num_heads, num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop: one pass over the training data, then one over the
    # validation data, per epoch
    for epoch in range(num_epochs):
        train_loss = train_model(model, train_loader, criterion, optimizer, device)
        val_loss = evaluate_model(model, val_loader, criterion, device)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")


In [71]:
len(train_loader)

In [52]:
# Smoke test: run the stock nn.Transformer once on random tensors.
# With the default layout, src is (source_len, batch, d_model) and
# tgt is (target_len, batch, d_model).
transformer_model = nn.Transformer(d_model=128, nhead=2, num_encoder_layers=2)
src = torch.rand(10, 32, 128)
tgt = torch.rand(20, 32, 128)
print(src.dtype, tgt.dtype, sep="\n")
out = transformer_model(src, tgt)

In [57]:
# Smoke test: run a bare encoder stack once on a random
# (seq_len, batch, feature_size) tensor.
# The dropout value is written out because no notebook-level `dropout` variable
# exists (0.1 is the default used by TransformerModel), and the random input uses
# feature_size so that its last axis always matches d_model.
encoder_layer = nn.TransformerEncoderLayer(d_model=feature_size, nhead=num_heads, dropout=0.1)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
src = torch.rand((30, 32, feature_size))
print(src.dtype)
# Show only the output shape instead of dumping the full tensor
transformer_encoder(src).shape

In [11]:
# [TEMPLATE] - save processed data (as pickle)
# The file name carries a timestamp, so every run writes a new file.
timestamp = time.strftime("%Y%m%d-%H%M%S")
pickle_path = f"{config['paths']['processed_data_dir']}ex2_topic_{timestamp}.pkl"
df = pd.DataFrame()  # placeholder frame - replace with the data to save
df.to_pickle(pickle_path)

In [12]:
# [TEMPLATE] - save data predictions (as csv)
# The file name carries a timestamp, so every run writes a new file.
timestamp = time.strftime("%Y%m%d-%H%M%S")
csv_path = f"{config['paths']['prediction_dir']}ex2_topic_{timestamp}.csv"
df = pd.DataFrame()  # placeholder frame - replace with the predictions to save
df.to_csv(csv_path, sep=',', decimal='.')

In [13]:
# [TEMPLATE] - save plot results (as png)
# The file name carries a timestamp, so every run writes a new file.
fig = plt.figure(figsize=(9, 6))  # placeholder figure - replace with the plot to save
timestamp = time.strftime("%Y%m%d-%H%M%S")
png_path = f"{config['paths']['plot_dir']}ex2_topic_{timestamp}.png"
fig.savefig(png_path)