In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
    
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from pandas import DataFrame
from fastapi import HTTPException
from typing import List, Any, Dict

### Load data
def df_load_csv(path: str, separator: str, column_names: List[str], **kwargs) -> DataFrame:
    """
    Read a delimited text file into a DataFrame.

    Args:
        path: Location of the file to read.
        separator: Delimiter, handed to ``pd.read_csv`` as ``sep``.
        column_names: Column labels, handed to ``pd.read_csv`` as ``names``.
        **kwargs: Any further keyword arguments, forwarded to ``pd.read_csv`` untouched.

    Returns:
        The parsed DataFrame.
    """
    return pd.read_csv(path, sep=separator, names=column_names, **kwargs)

### Data Manipulation
def df_drop_columns(df: DataFrame, columns: List[str]) -> DataFrame:
    """
    Return a new DataFrame without the given columns.

    The frame passed in is left as it was; only the returned frame lacks
    the columns.
    """
    return df.drop(labels=columns, axis="columns")

def column_unique_values(df: DataFrame, column: str) -> List[Any]:
    """Collect the distinct values found in ``df[column]`` into a plain list."""
    return [value for value in df[column].unique()]

def df_filter_rows(df: DataFrame, column: str, value: Any) -> DataFrame:
    """
    Keep only the rows whose entry in ``column`` equals ``value``.

    The surviving rows keep their original index labels.
    """
    row_matches = df[column] == value
    return df.loc[row_matches]

### Plots
def plot_df(df: DataFrame, columns: List[str], xlabel: str, ylabel: str):
    """
    Draw the selected columns as line plots on a single 10x6 figure and show it.

    Args:
        df: Frame holding the data; each series is plotted against the frame's index.
        columns: Names of the columns to draw; also used as the legend entries.
        xlabel: Text for the x axis.
        ylabel: Text for the y axis.
    """
    plt.figure(figsize=(10, 6))

    # Axis captions do not depend on the plotted lines, so set them up first.
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    for name in columns:
        series = df[name]
        plt.plot(series, label=name)

    plt.legend(columns)
    plt.show()

def plot_scatter(df: DataFrame, x: str, y: str, xlabel: str, ylabel: str):
    """
    Show a scatter plot of column ``y`` against column ``x`` on a 10x6 figure.

    Args:
        df: Frame holding both columns.
        x: Column used for the horizontal coordinates.
        y: Column used for the vertical coordinates.
        xlabel: Text for the x axis.
        ylabel: Text for the y axis.
    """
    horizontal, vertical = df[x], df[y]

    plt.figure(figsize=(10, 6))
    plt.scatter(horizontal, vertical)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()


def plot_correlation_matrix(df: DataFrame):
    """
    Show an annotated heatmap of absolute pairwise correlations.

    Only numeric columns take part. The sign of each correlation is
    discarded, so the heatmap shows strength of association only.
    """
    numeric_part = df.select_dtypes(include=[np.number])
    abs_corr = numeric_part.corr().abs()

    plt.figure(figsize=(10, 6))
    sns.heatmap(abs_corr, annot=True, cmap='coolwarm', fmt=".2f")
    plt.show()

### Feature Engineering
def df_rolling_mean(df: pd.DataFrame, columns: List[str], window: int) -> pd.DataFrame:
    """
    Append a ``<column>_rolling_mean`` column for every requested column.

    Args:
        df: Source frame; it is copied, not modified.
        columns: Columns to compute the rolling mean for.
        window: Number of consecutive rows per rolling window.

    Returns:
        A copy of ``df`` with the new columns appended. Rows that do not yet
        have a full window hold NaN in the new columns.
    """
    result = df.copy()
    for name in columns:
        windowed = result[name].rolling(window=window)
        result[f'{name}_rolling_mean'] = windowed.mean()
    return result

def df_rolling_std(df: pd.DataFrame, columns: List[str], window: int) -> pd.DataFrame:
    """
    Append a ``<column>_rolling_std`` column for every requested column.

    Args:
        df: Source frame; it is copied, not modified.
        columns: Columns to compute the rolling standard deviation for.
        window: Number of consecutive rows per rolling window.

    Returns:
        A copy of ``df`` with the new columns appended. Rows that do not yet
        have a full window hold NaN in the new columns.
    """
    result = df.copy()
    for name in columns:
        windowed = result[name].rolling(window=window)
        result[f'{name}_rolling_std'] = windowed.std()
    return result

def df_rolling_skewness(df: pd.DataFrame, columns: List[str], window: int) -> pd.DataFrame:
    """
    Append a ``<column>_rolling_skewness`` column for every requested column.

    Args:
        df: Source frame; it is copied, not modified.
        columns: Columns to compute the rolling skewness for.
        window: Number of consecutive rows per rolling window.

    Returns:
        A copy of ``df`` with the new columns appended. Rows that do not yet
        have a full window hold NaN in the new columns.
    """
    result = df.copy()
    for name in columns:
        windowed = result[name].rolling(window=window)
        result[f'{name}_rolling_skewness'] = windowed.skew()
    return result

def df_remaining_useful_life(df: DataFrame, column: str) -> DataFrame:
    """
    Add a 'RUL' column holding the distance of each row from the largest value of ``column``.

    For every row, ``RUL = df[column].max() - df[column]``, so the row with the
    largest value of ``column`` gets 0. The maximum is taken over the whole
    frame that is passed in, so callers that want a per-group value must pass
    one group at a time.

    Args:
        df: Source frame; it is copied, not modified.
        column: Name of the time-like column the remaining life is measured on.

    Returns:
        A copy of ``df`` with the 'RUL' column added (or overwritten if present).
    """
    # Work on a copy, like the rolling helpers do: writing into the argument
    # would silently alter the caller's frame (and warn on filtered slices).
    result = df.copy()
    result['RUL'] = result[column].max() - result[column]
    return result
    
def df_classification_threshold(df: DataFrame, column: str, thresholds: List[int], categories: Dict[str, int]) -> DataFrame:
    """
    Bucket a numeric column into labelled classes and store them in ``<column>_class``.

    The thresholds are the inner bin edges; open-ended bins are added below the
    first and above the last threshold. Bins are the ``pd.cut`` default, i.e.
    closed on the right: a value equal to a threshold falls into the lower class.

    Args:
        df: Source frame; it is copied, not modified.
        column: Name of the numeric column to classify.
        thresholds: Inner bin edges in increasing order.
        categories: Mapping whose keys are the class labels, listed from the
            lowest bin to the highest. Only the keys (in insertion order) are
            used; the integer values are not read by this function.

    Returns:
        A copy of ``df`` with a categorical ``<column>_class`` column added.

    Raises:
        ValueError: If the number of labels is not ``len(thresholds) + 1``.
    """
    # Unpacking accepts any sequence of thresholds, not only a list.
    bins = [-float('inf'), *thresholds, float('inf')]
    labels = list(categories.keys())

    if len(labels) != len(bins) - 1:
        raise ValueError(
            f"Expected {len(bins) - 1} categories for {len(bins) - 2} thresholds, got {len(labels)}"
        )

    # Work on a copy so the caller's frame is not modified as a side effect.
    result = df.copy()
    result[f"{column}_class"] = pd.cut(result[column], bins=bins, labels=labels)

    return result


In [26]:
# Column layout assigned to the raw file: two index columns, three operating
# settings and sensors numbered 1 through 21.
index_names = ['Unit', 'Cycle']
setting_names = ['op_setting_1', 'op_setting_2', 'op_setting_3']
sensor_names = ['sensor_' + str(i) for i in range(1, 22)]

column_names = index_names + setting_names + sensor_names
# Raw string: '\s' is not a valid Python escape sequence, so a plain literal
# triggers a SyntaxWarning (DeprecationWarning on older interpreters).
separator = r'\s+'
kwargs = {
    "header": None,      # read the first line as data; labels come from column_names
    "index_col": False,  # do not use the first column as the index
}

path = '../data/raw/train_FD001.txt'

# Load df
df = df_load_csv(path, separator, column_names, **kwargs)


In [27]:

# Drop constant columns
constant_columns = [
    'op_setting_1', 'op_setting_2', 'op_setting_3',
    'sensor_1', 'sensor_5', 'sensor_6', 'sensor_10',
    'sensor_16', 'sensor_18', 'sensor_19',
]
df = df_drop_columns(df, constant_columns)

# Apply transformations one unit at a time, so rolling windows and the RUL
# maximum are computed within a single unit only.
units = column_unique_values(df, 'Unit')

max_columns = df.shape[1]

# Every column after 'Unit' and 'Cycle'. The rolling helpers append their
# outputs to the right, so this selection is the same at every step.
feature_columns = df.columns[2:max_columns]

rolling_steps = (df_rolling_mean, df_rolling_skewness, df_rolling_std)
rul_thresholds = [50, 125, 200]
rul_categories = {'urgent': 0, 'short': 1, 'medium': 2, 'long': 3 }

subsets = []

for unit in units:
    subset = df_filter_rows(df, 'Unit', unit)
    for add_rolling_feature in rolling_steps:
        subset = add_rolling_feature(subset, feature_columns, 10)
    subset = df_remaining_useful_life(subset, 'Cycle')
    subset = df_classification_threshold(subset, 'RUL', rul_thresholds, rul_categories)

    subsets.append(subset)

# create new dataframe: stitch the per-unit frames together, discard rows with
# missing values (such as the incomplete leading rolling windows) and renumber.
df = pd.concat(subsets).dropna().reset_index(drop=True)

df.head(30)

Unnamed: 0,Unit,Cycle,sensor_2,sensor_3,sensor_4,sensor_7,sensor_8,sensor_9,sensor_11,sensor_12,...,sensor_11_rolling_std,sensor_12_rolling_std,sensor_13_rolling_std,sensor_14_rolling_std,sensor_15_rolling_std,sensor_17_rolling_std,sensor_20_rolling_std,sensor_21_rolling_std,RUL,RUL_class
0,1,10,641.71,1591.24,1400.46,553.59,2388.05,9051.7,47.03,521.79,...,0.144207,0.400949,0.020111,3.333946,0.022654,0.918937,0.070111,0.044857,182,medium
1,1,11,642.28,1581.75,1400.64,554.54,2388.05,9049.61,47.15,521.4,...,0.130213,0.442267,0.021628,3.779342,0.023876,0.918937,0.065794,0.052522,181,medium
2,1,12,642.06,1583.41,1400.15,554.52,2388.09,9049.37,47.18,521.8,...,0.096661,0.448969,0.02044,3.804252,0.023,0.948683,0.071149,0.052264,180,medium
3,1,13,643.07,1582.19,1400.83,553.44,2388.12,9046.82,47.38,521.85,...,0.109747,0.435871,0.02406,4.061997,0.022867,0.816497,0.072296,0.062411,179,medium
4,1,14,642.35,1592.95,1399.16,554.48,2388.09,9047.37,47.44,521.67,...,0.124316,0.328843,0.023688,4.093274,0.019155,0.875595,0.088468,0.062331,178,medium
5,1,15,642.43,1583.82,1402.13,553.64,2388.11,9052.22,47.3,522.5,...,0.124993,0.371485,0.027669,4.3006,0.018221,0.875595,0.080726,0.062709,177,medium
6,1,16,642.13,1587.98,1404.5,553.94,2388.05,9049.34,47.24,521.49,...,0.120872,0.389923,0.029078,4.521815,0.018697,0.816497,0.081268,0.06696,176,medium
7,1,17,642.58,1584.96,1399.95,553.8,2388.06,9054.92,47.12,521.89,...,0.122841,0.362162,0.028752,4.827542,0.02376,0.816497,0.097325,0.069208,175,medium
8,1,18,642.62,1591.04,1396.12,554.2,2388.05,9049.55,47.21,521.76,...,0.123126,0.293455,0.029364,4.803526,0.023898,0.737865,0.101877,0.064462,174,medium
9,1,19,641.79,1587.56,1400.35,554.18,2388.04,9053.99,47.4,521.89,...,0.133187,0.295003,0.029889,4.355143,0.020436,0.816497,0.112131,0.06569,173,medium


In [28]:
# Save data: write the processed frame as CSV without the index column.
# Note that the target file name carries no extension.
processed_path = '../data/processed/FD001'
df.to_csv(processed_path, index=False)