In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Topic: EX2 - Turbofan RUL Prediction
**Task**: Predict the remaining useful life (RUL) of turbofan engines based on given sensor data (time series data). It is a forcasting problem, where the goal is to predict the number of cycles an engine will last before it fails.
**Data**: Turbofan engine degradation simulation data (NASA) - [Link](https://data.nasa.gov/dataset/Turbofan-Engine-Degradation-Simulation-Data-Set/vrks-gjie). See also in the topic [introduction notebook](https://github.com/nina-prog/damage-propagation-modeling/blob/2fb8c1a1102a48d7abbf04e4031807790a913a99/notebooks/Turbofan%20remaining%20useful%20life%20Prediction.ipynb).

**Subtasks**:
1. Perform a deep **exploratory data analysis (EDA)** on the given data.
2. Implement a more efficient **sliding window method** for time series data analysis. -> 🎯 **Focus on this task**
3. Apply **traditional machine learning methods** (SOTA) to predict the remaining useful life. Includes data preparation, feature extraction, feature selection, model selection, and model parameter optimization.
4. Create **neural network models** to predict the remaining useful life. Includes different architectures like Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), or Attention Models. Note: You can search for SOTA research papers and reproduce current state-of-the-art models.


# Imports + Settings

In [3]:
# third-party libraries
import pandas as pd
import numpy as np
import os

import time
from tqdm.notebook import tqdm

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns


In [15]:
# source code
from src.utils import load_data, load_config

In [18]:
# settings
sns.set_style("whitegrid")
sns.set_palette("Set2")
sns.set(rc={"figure.dpi":100, 'savefig.dpi':200})
sns.set_context('notebook')

In [19]:
np.random.seed(42)

# Paths

In [6]:
# Make sure to execute this cell only once for one kernel session, before running any other cell below.
os.chdir("../") # set working directory to root of project
os.getcwd() # check current working directory

'/Users/frbroy/Library/Mobile Documents/com~apple~CloudDocs/KIT/SoSe2024/PSDA/damage-propagation-modeling'

In [20]:
PATH_TO_CONFIG = "configs/config.yaml"

# Load Config + Data

In [21]:
config = load_config(PATH_TO_CONFIG) # config is dict

In [26]:
train_data, test_data, test_rul = load_data(config_path=PATH_TO_CONFIG, dataset_num=1) #, raw=False)

2024-05-16 11:17:24 [[34msrc.utils:56[0m] [[32mINFO[0m] >>>> Loading data set 1...[0m
2024-05-16 11:17:24 [[34msrc.utils:85[0m] [[32mINFO[0m] >>>> Loaded raw data for dataset 1.[0m
2024-05-16 11:17:24 [[34msrc.utils:86[0m] [[32mINFO[0m] >>>> Train Data: (20631, 26)[0m
2024-05-16 11:17:24 [[34msrc.utils:87[0m] [[32mINFO[0m] >>>> Test Data: (13096, 26)[0m
2024-05-16 11:17:24 [[34msrc.utils:88[0m] [[32mINFO[0m] >>>> Test RUL Data: (100, 1)[0m


# 📍 << Subtask Train split: Procesing >>

In [27]:
train_data.head()

Unnamed: 0,UnitNumber,Cycle,Operation Setting 1,Operation Setting 2,Operation Setting 3,Sensor Measure 1,Sensor Measure 2,Sensor Measure 3,Sensor Measure 4,Sensor Measure 5,...,Sensor Measure 12,Sensor Measure 13,Sensor Measure 14,Sensor Measure 15,Sensor Measure 16,Sensor Measure 17,Sensor Measure 18,Sensor Measure 19,Sensor Measure 20,Sensor Measure 21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [28]:
from sklearn.model_selection import GroupShuffleSplit 

splitter = GroupShuffleSplit(test_size=.18, n_splits=2, random_state = 7)
split = splitter.split(train_data, groups=train_data['UnitNumber'])
train_inds, test_inds = next(split)

train = train_data.iloc[train_inds]
test =  train_data.iloc[test_inds]

In [53]:
from sklearn.model_selection import GroupShuffleSplit 
from src.logger import setup_logger

logger = setup_logger(__name__, level='INFO')

def train_val_split_by_group(df, group = "UnitNumber", test_size = .18, n_splits = 2, random_state = 7):

    splitter = GroupShuffleSplit(test_size=test_size, n_splits=n_splits, random_state = random_state)
    split = splitter.split(df, groups=df[group])
    train_inds, test_inds = next(split)

    train = df.iloc[train_inds]
    test = df.iloc[test_inds]
    
    logger.info(f"Train set contains {train[group].nunique()} different engines --> in total {len(train)}")
    logger.info(f" Test set contains {test[group].nunique()} different engines --> in total {len(test)}")
    return train, test

In [68]:
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score

from typing import Any, Dict, Generator, Tuple
from src.logger import setup_logger

# Setup logger
logger = setup_logger(__name__, level='INFO')

def train_val_split_by_group(
    df: pd.DataFrame,
    group: str = "UnitNumber",
    test_size: float = 0.18,
    n_splits: int = 2,
    random_state: int = 7
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Splits the DataFrame into training and validation sets based on a group identifier.

    Parameters:
    df (pd.DataFrame): The DataFrame to split.
    group (str): The column name to group by for splitting. Defaults to "UnitNumber".
    test_size (float): The proportion of the dataset to include in the test split. Defaults to 0.18.
    n_splits (int): Number of re-shuffling & splitting iterations. Defaults to 2.
    random_state (int): Random state for reproducibility. Defaults to 7.

    Returns:
    Tuple[pd.DataFrame, pd.DataFrame]: The training and test DataFrames.
    """
    # Initialize GroupShuffleSplit
    splitter = GroupShuffleSplit(test_size=test_size, n_splits=n_splits, random_state=random_state)
    
    # Perform the split
    split = splitter.split(df, groups=df[group])
    
    # Get the indices for the train and test sets
    train_inds, test_inds = next(split)
    
    # Create the train and test DataFrames using the indices
    train = df.iloc[train_inds]
    test = df.iloc[test_inds]
    
    # Log the number of unique groups and total rows in the train and test sets
    logger.info(f"Train set contains {train[group].nunique()} different engines --> in total {len(train)}")
    logger.info(f" Test set contains {test[group].nunique()} different engines --> in total {len(test)}")
    
    return train, test



# Setup logger
logger = setup_logger(__name__, level='INFO')

def k_fold_group_cross_validation(
    df: pd.DataFrame,
    group: str = "UnitNumber",
    n_splits: int = 5
) -> Generator[Tuple[pd.DataFrame, pd.DataFrame], None, None]:
    """
    Performs K-fold group cross-validation.

    Parameters:
    df (pd.DataFrame): The DataFrame to split.
    group (str): The column name to group by for splitting. Defaults to "UnitNumber".
    n_splits (int): Number of folds. Defaults to 5.
    random_state (int): Random state for reproducibility. Defaults to None.

    Yields:
    Generator[Tuple[pd.DataFrame, pd.DataFrame], None, None]: 
        A generator yielding tuples of (train DataFrame, validation DataFrame) for each fold.
    """
    # Initialize GroupKFold
    group_kfold = GroupKFold(n_splits=n_splits)
    
    # Iterate over each fold
    for fold, (train_inds, val_inds) in enumerate(group_kfold.split(df, groups=df[group])):
        # Create the train and validation DataFrames using the indices
        train = df.iloc[train_inds]
        val = df.iloc[val_inds]
        
        # Log the number of unique groups and total rows in the train and validation sets
        logger.info(f"Fold {fold + 1}:")
        logger.info(f"Train set contains {train[group].nunique()} different engines --> in total {len(train)}")
        logger.info(f"Validation set contains {val[group].nunique()} different engines --> in total {len(val)}")
        
        yield train, val

# Example usage:
# for train_df, val_df in k_fold_group_cross_validation(df):
#     # train your model on train_df
#     # validate your model on val_df


In [70]:


# Setup logger
logger = setup_logger(__name__, level='INFO')



def train_and_evaluate_model(
    model: Any,
    X: pd.DataFrame,
    y: pd.Series,
    groups: pd.Series,
    n_splits: int = 5,
) -> Dict[str, list]:
    """
    Train and evaluate a model using the specified cross-validation strategy.

    Parameters:
    model (Any): The model to be trained and evaluated.
    X (pd.DataFrame): The feature matrix.
    y (pd.Series): The target variable.
    groups (pd.Series): The group labels for cross-validation.
    cv (Generator): Cross-validation strategy.
    scoring (Dict[str, make_scorer]): The scoring metrics.

    Returns:
    Dict[str, list]: Cross-validation scores for each defined metric.
    """
    cv = GroupKFold(n_splits=n_splits)
    # Define the scoring metrics for regression
    scoring: Dict[str, make_scorer] = {
        'mae': make_scorer(mean_absolute_error),
        'mse': make_scorer(mean_squared_error),
        'r2': make_scorer(r2_score)
    }

    # Perform cross-validation
    scores = cross_validate(model, X, y, cv=cv, groups=groups, scoring=scoring, return_train_score=False)
    
    # Log the results
    for metric in scoring.keys():
        logger.info(f"{metric.upper()} Scores: {scores['test_' + metric]}")
        logger.info(f"Average {metric.upper()}: {scores['test_' + metric].mean():.4f}")
    
    return scores

In [58]:
train, val  = train_val_split_by_group(train_data)

2024-05-16 14:09:04 [[34m__main__:42[0m] [[32mINFO[0m] >>>> Train set contains 82 different engines --> in total 16807[0m
2024-05-16 14:09:04 [[34m__main__:43[0m] [[32mINFO[0m] >>>>  Test set contains 18 different engines --> in total 3824[0m


In [62]:
gen = k_fold_group_cross_validation(train_data)

In [71]:
from sklearn.neural_network import MLPRegressor
model = MLPRegressor(random_state=42, max_iter=1000, early_stopping=True, alpha=0.05)
train_and_evaluate_model(model, train_data)

NameError: name 'MLPRegressor' is not defined

In [63]:
gen.

<generator object k_fold_group_cross_validation at 0x1594afb50>

In [37]:
test["UnitNumber"].nunique()

18

[TEMPLATE]

Findings:
* Interpretation of plots
* or other key take aways from previous code

In [16]:
# [TEMPLATE] - save processed data (as pickle)
df = pd.DataFrame()
timestamp = time.strftime("%Y%m%d-%H%M%S")
df.to_pickle(f"{config['paths']['processed_data_dir']}ex2_topic_{timestamp}.pkl")

NameError: name 'config' is not defined

In [12]:
# [TEMPLATE] - save data predictions (as csv)
df = pd.DataFrame()
timestamp = time.strftime("%Y%m%d-%H%M%S")
df.to_csv(f"{config['paths']['prediction_dir']}ex2_topic_{timestamp}.csv", sep=',', decimal='.')

In [13]:
# [TEMPLATE] - save plot results (as png)
fig = plt.figure(figsize=(9, 6))
timestamp = time.strftime("%Y%m%d-%H%M%S")
fig.savefig(f"{config['paths']['plot_dir']}ex2_topic_{timestamp}.png")

<Figure size 900x600 with 0 Axes>