# CMAPSS Dataset Exploration

## CMAPSS Dataset Variants 


In [4]:
import pandas as pd
# Load the FD001 training file as a space-separated table. header=None makes
# pandas label the columns with integers (0, 1, 2, ...) instead of reading
# names from the first line.
# NOTE(review): the path is absolute and machine-specific; it will not resolve
# on another machine.
# NOTE(review): the preview shows columns 26 and 27 as empty (NaN); presumably
# caused by trailing separators at the end of each line -- confirm.
train_FD001 = pd.read_csv("/home/meghagkrishnan/code/meghagkrishnan/jet_engine/raw_data/train_FD001.txt",sep = ' ', header=None)
# Preview the first five rows.
train_FD001.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,,
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,,
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,,
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,,
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,,


In [5]:
def clean_train_data(df: pd.DataFrame) -> pd.DataFrame:

    """
    Name the raw columns, add a Remaining Useful Life (RUL) column and drop
    the columns that are not kept for modelling.

    The input DataFrame is not modified; all work is done on a copy.

    Parameters:
    df (pd.DataFrame): Raw DataFrame with exactly 28 columns, ordered as
        engine id, cycle, three operational settings, 21 sensor readings and
        two trailing columns (named 'sm22' and 'sm23' here).

    Returns:
    pd.DataFrame: New DataFrame with named columns, the columns listed in
        ``columns_to_drop`` removed, a trailing 'RUL' column, and a fresh
        0..n-1 index.

    Raises:
    ValueError: Raised by pandas when ``df`` does not have exactly 28 columns.
    """
    # Names for the 28 raw columns, in file order
    columns = ['id',
            'cycle',
            'setting1',
            'setting2',
            'setting3',
            'T2_Total_temperature_at_fan_inlet',
            'T24_Total_temperature_at_LPC_outlet',
            'T30_Total_temperature_at_HPC_outlet',
            'T50_Total_temperature_at_LPT_outlet',
            'P2_Pressure_at_fan_inlet',
            'P15_Total_pressure_in_bypass_duct',
            'P30_Total_pressure_at_HPC_outlet',
            'Nf_Physical_fan_speed',
            'Nc_Physical_core_speed',
            'epr_Engine_pressure_ratio',
            'Ps30_Static_pressure_at_HPC_outlet',
            'phi_Ratio_of_fuel_flow_to_Ps30',
            'NRf_Corrected_fan_speed',
            'NRc_Corrected_core_speed',
            'BPR_Bypass_Ratio',
            'farB_Burner_fuel_air_ratio',
            'htBleed_Bleed_Enthalpy',
            'Nf_dmd_Demanded_fan_speed',
            'PCNfR_dmd_Demanded_corrected_fan_speed',
            'W31_HPT_coolant_bleed',
            'W32_LPT_coolant_bleed',
            'sm22',
            'sm23']

    # Columns removed from the result.
    # NOTE(review): presumably these are empty or constant in FD001 and carry
    # no signal -- confirm before reusing on other dataset variants.
    columns_to_drop = ['sm22', 'sm23', 'setting3', 'T2_Total_temperature_at_fan_inlet', 'P2_Pressure_at_fan_inlet', "P15_Total_pressure_in_bypass_duct",
            'epr_Engine_pressure_ratio', 'farB_Burner_fuel_air_ratio', 'Nf_dmd_Demanded_fan_speed',
            'PCNfR_dmd_Demanded_corrected_fan_speed']

    # Work on a copy so the caller's DataFrame keeps its original column labels.
    df = df.copy()
    df.columns = columns

    # RUL = last observed cycle of the engine minus the current cycle, so the
    # final row of every engine gets RUL 0. transform('max') broadcasts the
    # per-engine maximum back onto each row, avoiding a merge round trip.
    df['RUL'] = df.groupby('id')['cycle'].transform('max') - df['cycle']

    # Reset the index so the result always carries a plain 0..n-1 index,
    # whatever index the input had.
    cleaned_train_df = df.drop(columns=columns_to_drop).reset_index(drop=True)

    return cleaned_train_df

In [6]:
# Clean the raw FD001 frame: named columns, RUL added, unused columns dropped.
data = clean_train_data(train_FD001)
# Show the first two cleaned rows.
data.head(2)

Unnamed: 0,id,cycle,setting1,setting2,T24_Total_temperature_at_LPC_outlet,T30_Total_temperature_at_HPC_outlet,T50_Total_temperature_at_LPT_outlet,P30_Total_pressure_at_HPC_outlet,Nf_Physical_fan_speed,Nc_Physical_core_speed,Ps30_Static_pressure_at_HPC_outlet,phi_Ratio_of_fuel_flow_to_Ps30,NRf_Corrected_fan_speed,NRc_Corrected_core_speed,BPR_Bypass_Ratio,htBleed_Bleed_Enthalpy,W31_HPT_coolant_bleed,W32_LPT_coolant_bleed,RUL
0,1,1,-0.0007,-0.0004,641.82,1589.7,1400.6,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.419,191
1,1,2,0.0019,-0.0003,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.0,23.4236,190


In [7]:
# (rows, columns) of the cleaned training frame.
data.shape

(20631, 19)

In [12]:
import numpy as np

In [13]:
def data_preperation_RNN(data, seq_length=50):
    """
    Build sliding-window sequences in (n_seq, n_obs, n_features) format for
    training deep-learning models.

    For every engine ('id'), rows are ordered by 'cycle' and every run of
    ``seq_length`` consecutive rows becomes one sequence. The label of a
    sequence is the 'RUL' value of its last row. Engines with fewer than
    ``seq_length`` rows contribute no sequences.

    Parameters:
    data (pd.DataFrame): Frame containing 'id', 'cycle' and 'RUL' columns;
        every other column is used as a feature.
    seq_length (int): Number of consecutive cycles per sequence (>= 1).

    Returns:
    tuple[np.ndarray, np.ndarray]: Sequences of shape
        (n_seq, seq_length, n_features) and labels of shape (n_seq,).
        When no engine is long enough, the sequence array is empty but still
        3-D: (0, seq_length, n_features).

    Raises:
    ValueError: If ``seq_length`` is smaller than 1.
    KeyError: If 'id', 'cycle' or 'RUL' is missing from ``data``.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer")

    non_feature_columns = ['id', 'cycle', 'RUL']
    sequences = []
    labels = []

    # sort=False keeps engines in order of first appearance, matching
    # iteration over data['id'].unique().
    for _, unit_data in data.groupby('id', sort=False):
        unit_data = unit_data.sort_values(by='cycle')
        num_sequences = len(unit_data) - seq_length + 1
        if num_sequences <= 0:
            # Engine history is shorter than one window.
            continue

        # Convert to arrays once per engine; each window is then a cheap
        # array slice instead of a DataFrame slice plus column drop.
        features = unit_data.drop(columns=non_feature_columns).values
        rul = unit_data['RUL'].values

        for start in range(num_sequences):
            end = start + seq_length
            sequences.append(features[start:end])
            labels.append(rul[end - 1])

    if not sequences:
        # Keep the documented 3-D layout even when there is nothing to return.
        n_features = data.drop(columns=non_feature_columns).shape[1]
        return np.empty((0, seq_length, n_features)), np.array(labels)

    return np.array(sequences), np.array(labels)

In [14]:
# Create sequences
# Window length in cycles; overrides the function's default of 50. Engines
# with fewer than seq_length rows produce no sequences.
seq_length = 80
X, y = data_preperation_RNN(data, seq_length=seq_length)

# Save the prepared sequences (optional)
#np.save('X_train_FD001.npy', X)
#np.save('y_train_FD001.npy', y)
# Display the shapes: X is (n_seq, seq_length, n_features), y is (n_seq,).
X.shape, y.shape

((12731, 80, 16), (12731,))

In [15]:
# Add a trailing axis so the labels go from shape (n_seq,) to (n_seq, 1).
y = np.expand_dims(y, axis=1)
y.shape

(12731, 1)