# Data preparation

In [2]:
# imports
import os
import h5py
import time
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame
import matplotlib.pyplot as plt
from matplotlib import gridspec
from pandas_profiling import ProfileReport
%matplotlib inline

In [3]:
# Relative path of the N-CMAPSS DS02 HDF5 file that the loading cell opens with h5py
filename = '../data/N-CMAPSS_DS02.h5'

# Relative folder intended as the save location for generated graphs
graph_folder = '../graphs/'

In [4]:
# Load every DS02 array from the HDF5 file, stack the development and test
# splits, and report the CPU time spent together with the stacked shapes.
t = time.process_time()  # CPU time of this process (not wall-clock time)


def _read_var_names(hdf, key):
    """Return the variable-name dataset stored under `key` as a list of str (dtype U20)."""
    return list(np.array(np.array(hdf[key]), dtype='U20'))


# Load data
# Datasets are indexed with hdf[key] rather than hdf.get(key): a missing dataset
# then raises KeyError right here, whereas hdf.get() returns None and
# np.array(None) would silently yield a 0-d object array that only fails later.
with h5py.File(filename, 'r') as hdf:
    # Development set
    W_dev = np.array(hdf['W_dev'])             # W
    X_s_dev = np.array(hdf['X_s_dev'])         # X_s
    X_v_dev = np.array(hdf['X_v_dev'])         # X_v
    T_dev = np.array(hdf['T_dev'])             # T
    Y_dev = np.array(hdf['Y_dev'])             # RUL
    A_dev = np.array(hdf['A_dev'])             # Auxiliary

    # Test set
    W_test = np.array(hdf['W_test'])           # W
    X_s_test = np.array(hdf['X_s_test'])       # X_s
    X_v_test = np.array(hdf['X_v_test'])       # X_v
    T_test = np.array(hdf['T_test'])           # T
    Y_test = np.array(hdf['Y_test'])           # RUL
    A_test = np.array(hdf['A_test'])           # Auxiliary

    # Variable names, converted from np.array to list of str (dtype U20)
    W_var = _read_var_names(hdf, 'W_var')
    X_s_var = _read_var_names(hdf, 'X_s_var')
    X_v_var = _read_var_names(hdf, 'X_v_var')
    T_var = _read_var_names(hdf, 'T_var')
    A_var = _read_var_names(hdf, 'A_var')

# Stack development rows on top of test rows for each array group
W = np.concatenate((W_dev, W_test), axis=0)
X_s = np.concatenate((X_s_dev, X_s_test), axis=0)
X_v = np.concatenate((X_v_dev, X_v_test), axis=0)
T = np.concatenate((T_dev, T_test), axis=0)
Y = np.concatenate((Y_dev, Y_test), axis=0)
A = np.concatenate((A_dev, A_test), axis=0)

# The reported time covers both the file read and the concatenation above
print('')
print("Operation time (min): " , (time.process_time()-t)/60)
print('')
print("W shape: " + str(W.shape))
print("X_s shape: " + str(X_s.shape))
print("X_v shape: " + str(X_v.shape))
print("T shape: " + str(T.shape))
print("A shape: " + str(A.shape))


Operation time (min):  0.047039399999999995

W shape: (6517190, 4)
X_s shape: (6517190, 14)
X_v shape: (6517190, 14)
T shape: (6517190, 10)
A shape: (6517190, 4)


In [5]:
# Give every array group longer, more meaningful column names and wrap the
# development/test arrays in DataFrames.


def _frame_pair(dev_array, test_array, column_names):
    """Wrap a development/test array pair in two DataFrames sharing `column_names`."""
    dev_frame = pd.DataFrame(dev_array, columns=column_names)
    test_frame = pd.DataFrame(test_array, columns=column_names)
    return dev_frame, test_frame


# Auxiliary data
auxiliary_cols = ['unit', 'cycle', 'flight_class', 'health_state']
df_A_train, df_A_test = _frame_pair(A_dev, A_test, auxiliary_cols)

# Scenario descriptors
scenario_cols = ['altitude', 'mach_number', 'throttle_resolver_angle', 'fan_inlet_temp']
df_W_train, df_W_test = _frame_pair(W_dev, W_test, scenario_cols)

# Measurements
physical_sensor_cols = [
    'fuel_flow', 'physical_fan_speed', 'physical_core_speed',
    'lpc_temp', 'hpc_temp', 'hpt_temp', 'lpt_temp', 'bypass_duct_temp',
    'fan_in_pressure', 'fan_out_pressure', 'lpc_pressure',
    'hpc_static_pressure', 'burner_out_pressure', 'lpt_pressure',
]
df_X_s_train, df_X_s_test = _frame_pair(X_s_dev, X_s_test, physical_sensor_cols)

# Virtual sensors
virtual_sensor_cols = [
    'total_temp_burner_outlet', 'total_pressure_hpc_outlet', 'total_pressure_hpt_outlet',
    'fan_flow', 'lpc_outflow', 'hpc_inflow',
    'hpt_coolant_bleed', 'lpt_coolant_bleed', 'hpt_outflow', 'lpt_outflow',
    'fan_stall_margin', 'lpc_stall_margin', 'hpc_stall_margin',
    'ratio_fuel_flow_to_static_hpc_pressure',
]
df_X_v_train, df_X_v_test = _frame_pair(X_v_dev, X_v_test, virtual_sensor_cols)

# Health parameters
health_param_cols = [
    'fan_efficiency_mod', 'fan_flow_mod',
    'lpc_efficiency_mod', 'lpc_flow_mod',
    'hpc_efficiency_mod', 'hpc_flow_mod',
    'hpt_efficiency_mod', 'hpt_flow_mod',
    'lpt_efficiency_mod', 'lpt_flow_mod',
]
df_T_dev, df_T_test = _frame_pair(T_dev, T_test, health_param_cols)

# RUL
df_Y_train, df_Y_test = _frame_pair(Y_dev, Y_test, ['RUL'])

# Join the groups column-wise; the health parameters (df_T_*) are left out, and
# health_state / flight_class are dropped from the joined frame.
df_train = pd.concat(
    [df_A_train, df_W_train, df_X_s_train, df_X_v_train, df_Y_train],
    axis=1,
).drop(columns=['health_state', 'flight_class'])
df_test = pd.concat(
    [df_A_test, df_W_test, df_X_s_test, df_X_v_test, df_Y_test],
    axis=1,
).drop(columns=['health_state', 'flight_class'])


In [6]:
# Training frame: print its (rows, columns) shape, then display the first five rows
print(df_train.shape)
df_train.head(n=5)

(5263447, 35)


Unnamed: 0,unit,cycle,altitude,mach_number,throttle_resolver_angle,fan_inlet_temp,fuel_flow,physical_fan_speed,physical_core_speed,lpc_temp,...,hpc_inflow,hpt_coolant_bleed,lpt_coolant_bleed,hpt_outflow,lpt_outflow,fan_stall_margin,lpc_stall_margin,hpc_stall_margin,ratio_fuel_flow_to_static_hpc_pressure,RUL
0,2.0,1.0,10005.0,0.448497,76.903748,502.420918,600.148034,1438.498187,1818.027714,1228.129848,...,228.487065,26.498785,15.899271,215.844851,228.411666,16.648833,9.89813,25.376144,41.89399,74
1,2.0,1.0,10013.0,0.447741,76.903748,502.326114,600.055894,1438.350208,1817.682618,1227.879113,...,228.383505,26.486552,15.891931,215.745634,228.307014,16.639222,9.904927,25.380549,41.884434,74
2,2.0,1.0,10017.0,0.448938,77.079529,502.416067,600.210756,1439.109101,1820.020627,1229.422522,...,228.661083,26.51934,15.911604,216.019054,228.592279,16.649823,9.923503,25.318848,41.953848,74
3,2.0,1.0,10024.0,0.449883,77.079529,502.469893,600.369717,1439.24023,1819.188327,1228.538726,...,228.768625,26.532044,15.919226,216.121238,228.702994,16.653812,9.905518,25.361981,41.914342,74
4,2.0,1.0,10031.0,0.449379,77.079529,502.401271,600.298227,1439.064004,1818.96354,1228.389046,...,228.653631,26.51846,15.911076,216.008509,228.584788,16.649031,9.897465,25.363994,41.911503,74


In [7]:
# Test frame: print its (rows, columns) shape, then display the first five rows
print(df_test.shape)
df_test.head(n=5)

(1253743, 35)


Unnamed: 0,unit,cycle,altitude,mach_number,throttle_resolver_angle,fan_inlet_temp,fuel_flow,physical_fan_speed,physical_core_speed,lpc_temp,...,hpc_inflow,hpt_coolant_bleed,lpt_coolant_bleed,hpt_outflow,lpt_outflow,fan_stall_margin,lpc_stall_margin,hpc_stall_margin,ratio_fuel_flow_to_static_hpc_pressure,RUL
0,11.0,1.0,10014.0,0.457506,77.25531,503.176696,601.369822,1441.086963,1822.407728,1230.069061,...,229.763207,26.649529,15.989717,217.085529,229.722454,16.74551,9.812495,25.345244,41.971419,58
1,11.0,1.0,10020.0,0.457947,77.25531,503.192949,601.381211,1441.055436,1822.376094,1230.025551,...,229.736365,26.646358,15.987815,217.05872,229.694212,16.751997,9.806257,25.346932,41.97147,58
2,11.0,1.0,10029.0,0.458451,77.25531,503.203187,601.392126,1441.063188,1822.350721,1229.965758,...,229.719527,26.644369,15.986621,217.04319,229.67765,16.758975,9.804009,25.348326,41.96994,58
3,11.0,1.0,10034.0,0.458136,77.25531,503.15858,601.348485,1440.964145,1822.1418,1229.809741,...,229.655911,26.636854,15.982113,216.981145,229.612403,16.755378,9.803649,25.35208,41.964794,58
4,11.0,1.0,10045.0,0.45801,77.25531,503.105629,601.285695,1440.85251,1822.01976,1229.73263,...,229.561414,26.625692,15.975415,216.890123,229.516137,16.753262,9.806697,25.351024,41.96354,58


Check which engine units appear in the training and test sets, and how many distinct cycles are recorded for each unit.

In [8]:
def check_units(df):
    """Print the number of distinct cycles recorded for each engine unit.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that has a 'unit' column and a 'cycle' column.

    Returns
    -------
    None
        Nothing is returned; one line per unit is printed in ascending unit
        order, formatted as ``Unit: <unit>:  <count>``.

    Notes
    -----
    The counts come from a single groupby pass rather than re-filtering the
    whole frame once per unit. Rows whose unit is NaN form no group and are
    therefore not reported.
    """
    # dropna=False counts a NaN cycle as one distinct value instead of ignoring it
    cycles_per_unit = df.groupby('unit')['cycle'].nunique(dropna=False)
    # Iterate over the raw numpy values so str(unit) renders with the column's own dtype
    for unit, n_cycles in zip(cycles_per_unit.index.to_numpy(), cycles_per_unit.to_numpy()):
        print('Unit: ' + str(unit) + ': ', int(n_cycles))

# Report the per-unit cycle counts for the training frame first, then the test frame
for heading, frame in (('Train set:', df_train), ('Test set:', df_test)):
    print(heading)
    check_units(frame)

Train set:
Unit: 2.0:  75
Unit: 5.0:  89
Unit: 10.0:  82
Unit: 16.0:  63
Unit: 18.0:  71
Unit: 20.0:  66
Test set:
Unit: 11.0:  59
Unit: 14.0:  76
Unit: 15.0:  67


In [9]:
# Write both frames to CSV (index column omitted) so they can be reused later
for frame, csv_path in ((df_train, '../data/df_train.csv'), (df_test, '../data/df_test.csv')):
    frame.to_csv(csv_path, index=False)