# Notebook 01 - Data Processing

In [92]:
import pandas as pd

PATH = '../data/raw/FD001.txt'

# Load the whitespace-delimited raw file; it has no header row, so pandas
# assigns integer column labels.
# The separator is written as a raw string so that `\s` reaches the regex
# engine verbatim instead of being parsed as an (invalid) string escape.
data = pd.read_csv(PATH, sep=r'\s+', header=None)


In [93]:
# Preview the first five rows of the raw frame (columns are still the
# integer labels assigned at load time)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [94]:
# Preview the last five rows of the raw frame
data.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,519.49,2388.26,8137.6,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.5,1433.58,14.62,...,519.68,2388.22,8136.5,8.5139,0.03,395,2388,100.0,38.3,23.1594
20628,100,198,0.0004,0.0,100.0,518.67,643.42,1602.46,1428.18,14.62,...,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.064
20630,100,200,-0.0032,-0.0005,100.0,518.67,643.85,1600.38,1432.14,14.62,...,519.3,2388.26,8137.33,8.5036,0.03,396,2388,100.0,38.37,23.0522


In [95]:
# Report how many rows and columns the raw dataset holds
n_rows, n_columns = data.shape
print("Dataset shape ")
print(f"rows: {n_rows}")
print(f"columns: {n_columns}")

Dataset shape 
rows: 20631
columns: 26


In [96]:
# Replace the integer column labels with descriptive names:
# two identifier columns, three operational settings, then sensors 1..21
sensor_columns = [f'sensor_measurement_{i}' for i in range(1, 22)]

columns = [
    'unit_number',
    'time_in_cycles',
    'operational_setting_1',
    'operational_setting_2',
    'operational_setting_3',
    *sensor_columns,
]

data.columns = columns

# Preview the renamed frame
data.head()

Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_5,...,sensor_measurement_12,sensor_measurement_13,sensor_measurement_14,sensor_measurement_15,sensor_measurement_16,sensor_measurement_17,sensor_measurement_18,sensor_measurement_19,sensor_measurement_20,sensor_measurement_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [97]:
def remaining_useful_life(data, column):
    """Return the remaining useful life (RUL) for every row of ``data``.

    The RUL of a row is the largest value found in ``column`` minus the
    row's own value, so the row holding the maximum gets an RUL of 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the time variable; it is not modified.
    column : str
        Name of the column with the time variable.

    Returns
    -------
    pandas.Series
        Series named ``'RUL'`` that shares the index of ``data``.
    """
    elapsed = data[column]
    cycles_left = elapsed.max() - elapsed
    return cycles_left.rename('RUL')



In [98]:
# Collect the distinct unit identifiers present in the dataset
units_ = data['unit_number'].unique()

print(f"Number of units: {len(units_)}")

Number of units: 100


In [99]:
# Compute the RUL separately for every unit: the reference point is the
# largest `time_in_cycles` value of that unit, so units must not be mixed.
# Iterating over the groupby splits the frame once instead of re-scanning it
# with a boolean mask per unit; sort=False keeps the units in their order of
# first appearance.
subsets = [
    subset.assign(RUL=remaining_useful_life(subset, 'time_in_cycles'))
    for _, subset in data.groupby('unit_number', sort=False)
]

# Stitch the units back together, drop incomplete rows and renumber the index
# (chained calls return new frames, so nothing is mutated in place)
data_processed = pd.concat(subsets).dropna().reset_index(drop=True)

data_processed.head(10)

Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_5,...,sensor_measurement_13,sensor_measurement_14,sensor_measurement_15,sensor_measurement_16,sensor_measurement_17,sensor_measurement_18,sensor_measurement_19,sensor_measurement_20,sensor_measurement_21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187
5,1,6,-0.0043,-0.0001,100.0,518.67,642.1,1584.47,1398.37,14.62,...,2388.03,8132.85,8.4108,0.03,391,2388,100.0,38.98,23.3669,186
6,1,7,0.001,0.0001,100.0,518.67,642.48,1592.32,1397.77,14.62,...,2388.03,8132.32,8.3974,0.03,392,2388,100.0,39.1,23.3774,185
7,1,8,-0.0034,0.0003,100.0,518.67,642.56,1582.96,1400.97,14.62,...,2388.03,8131.07,8.4076,0.03,391,2388,100.0,38.97,23.3106,184
8,1,9,0.0008,0.0001,100.0,518.67,642.12,1590.98,1394.8,14.62,...,2388.05,8125.69,8.3728,0.03,392,2388,100.0,39.05,23.4066,183
9,1,10,-0.0033,0.0001,100.0,518.67,641.71,1591.24,1400.46,14.62,...,2388.06,8129.38,8.4286,0.03,393,2388,100.0,38.95,23.4694,182


In [100]:
# Compare the dimensions of the raw frame (`data`) with those of
# `data_processed`.
# NOTE(review): the labels below mention "rolling statistics", but at this
# point the only difference between the two frames is the added RUL column;
# consider rewording the messages.
print("Dataset shape before adding rolling statistics")
print(f"rows: {data.shape[0]}")
print(f"columns: {data.shape[1]}")
print("Dataset shape after adding rolling statistics")
print(f"rows: {data_processed.shape[0]}")
print(f"columns: {data_processed.shape[1]}")

Dataset shape before adding rolling statistics
rows: 20631
columns: 26
Dataset shape after adding rolling statistics
rows: 20631
columns: 27


In [101]:
# Save the RUL-annotated data (index=False: the row index is not written)
data_processed.to_csv('../data/processed/FD001.csv', index=False)

In [102]:
import pandas as pd

# Location of the processed (RUL-annotated) dataset
PATH = '../data/processed/FD001.csv'

# Reload the processed dataset from disk; the first line of the file is used
# as the header
data_processed = pd.read_csv(PATH)

In [103]:
def make_multiclass_classification(data, column, thresholds, labels, categorical=False):
    """Bin a numeric column into ordered classes.

    The thresholds split the real line into ``len(thresholds) + 1``
    right-closed intervals ``(-inf, t0], (t0, t1], ..., (tn, inf)``; a value
    equal to a threshold therefore falls into the lower class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``; it is not modified.
    column : str
        Name of the numeric column to discretise.
    thresholds : iterable of numbers
        Interior bin edges in increasing order (list, tuple, array, ...).
    labels : dict
        Maps a class key to a class name, listed in the same order as the
        intervals; it needs one entry per interval.
    categorical : bool, default False
        If True the dict *values* (names) are used as class labels,
        otherwise the dict *keys* are used.

    Returns
    -------
    pandas.Series
        Categorical Series named ``column`` sharing the index of ``data``.

    Raises
    ------
    ValueError
        Propagated from ``pandas.cut`` when the thresholds are not increasing
        or the number of labels does not match the number of intervals.
    """
    # Unpacking accepts any iterable of thresholds; `list + tuple` would raise
    # and `list + ndarray` would silently broadcast instead of concatenating.
    bins = [-float('inf'), *thresholds, float('inf')]

    class_labels = list(labels.values()) if categorical else list(labels.keys())

    # pd.cut returns a new Series, so there is no need to copy the whole frame
    return pd.cut(data[column], bins=bins, labels=class_labels)

# RUL class boundaries (bins are right-closed, the pd.cut default):
#   RUL <= 50 -> 'urgent', 50 < RUL <= 125 -> 'medium', RUL > 125 -> 'long'
thresholds = [50, 125]
labels = {0: 'urgent', 1: 'medium', 2: 'long'}

# categorical=True stores the text labels instead of the integer keys.
# The column name's spelling is kept as is because later cells refer to it.
data_processed['maintanance_urgency'] = make_multiclass_classification(data_processed, 'RUL', thresholds, labels, categorical=True)

In [104]:
# Preview the first five rows, now including the `maintanance_urgency` label
data_processed.head()

Unnamed: 0,unit_number,time_in_cycles,operational_setting_1,operational_setting_2,operational_setting_3,sensor_measurement_1,sensor_measurement_2,sensor_measurement_3,sensor_measurement_4,sensor_measurement_5,...,sensor_measurement_14,sensor_measurement_15,sensor_measurement_16,sensor_measurement_17,sensor_measurement_18,sensor_measurement_19,sensor_measurement_20,sensor_measurement_21,RUL,maintanance_urgency
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191,long
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190,long
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189,long
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188,long
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187,long


In [105]:
def rolling_mean(data, window_size: int = 5):
    """Return the rolling mean of every column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame of numeric columns; it is not modified.
    window_size : int, default 5
        Number of consecutive rows averaged per window.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as ``data`` whose columns carry the
        original labels with an ``'_rm'`` suffix. The first
        ``window_size - 1`` rows are NaN because their window is incomplete.
    """
    # DataFrame.rolling averages all columns in one vectorised call, and
    # add_suffix also copes with non-string column labels.
    return data.rolling(window=window_size).mean().add_suffix('_rm')

def ingestion_pipeline(data, window_size: int = 10):
    """Turn the labelled dataset into per-unit rolling-mean features.

    Steps: drop the columns listed in ``constant_sensors``, replace every
    remaining feature column by its rolling mean computed within each unit,
    then discard the rows that contain NaN (this includes the first
    ``window_size - 1`` rows of every unit) and renumber the index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``unit_number``, ``time_in_cycles``, ``RUL``,
        ``maintanance_urgency`` and the columns listed in
        ``constant_sensors``; it is not modified.
    window_size : int, default 10
        Rolling-window length passed to ``rolling_mean``.

    Returns
    -------
    pandas.DataFrame
        Columns in this order: ``unit_number``, ``time_in_cycles``, ``RUL``,
        the ``*_rm`` feature columns, ``maintanance_urgency``.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``data``.
    """
    # Columns removed before the features are built
    constant_sensors = ['operational_setting_1', 'operational_setting_2', 'operational_setting_3', 'sensor_measurement_1', 'sensor_measurement_5', 'sensor_measurement_6', 'sensor_measurement_10', 'sensor_measurement_16', 'sensor_measurement_18', 'sensor_measurement_19']
    data = data.drop(columns=constant_sensors)

    # Columns carried over untouched; everything else is a feature to smooth.
    # Selecting by name (rather than by position) keeps the pipeline correct
    # even when the column order of the input changes.
    passthrough = ['unit_number', 'time_in_cycles', 'RUL', 'maintanance_urgency']
    feature_columns = [col for col in data.columns if col not in passthrough]

    subsets = []

    # The rolling window is applied unit by unit so it never spans two units;
    # sort=False keeps the units in their order of first appearance.
    for _, subset in data.groupby('unit_number', sort=False):
        subset_rm = rolling_mean(subset[feature_columns], window_size=window_size)

        subsets.append(
            pd.concat(
                [
                    subset[['unit_number', 'time_in_cycles', 'RUL']],
                    subset_rm,
                    subset['maintanance_urgency'],
                ],
                axis=1,
            )
        )

    # Concatenate the units, drop incomplete rows and renumber the index
    return pd.concat(subsets).dropna().reset_index(drop=True)

# NOTE(review): this rebinds `data_processed` to the pipeline output, so
# running the cell a second time without reloading the data fails inside
# ingestion_pipeline (the columns it drops are already gone).
data_processed = ingestion_pipeline(data_processed)

# Preview the engineered features
data_processed.head()

Unnamed: 0,unit_number,time_in_cycles,RUL,sensor_measurement_2_rm,sensor_measurement_3_rm,sensor_measurement_4_rm,sensor_measurement_7_rm,sensor_measurement_8_rm,sensor_measurement_9_rm,sensor_measurement_11_rm,sensor_measurement_12_rm,sensor_measurement_13_rm,sensor_measurement_14_rm,sensor_measurement_15_rm,sensor_measurement_17_rm,sensor_measurement_20_rm,sensor_measurement_21_rm,maintanance_urgency
0,1,10,182,642.201,1587.712,1400.84,554.096,2388.049,9049.56,47.272,522.146,2388.044,8132.228,8.40839,391.8,38.984,23.3896,long
1,1,11,181,642.247,1586.917,1400.844,554.114,2388.048,9049.902,47.24,522.12,2388.043,8132.424,8.40984,391.8,38.972,23.39557,long
2,1,12,180,642.238,1586.076,1400.545,554.191,2388.053,9050.432,47.209,522.072,2388.038,8132.7,8.40604,391.7,38.978,23.38981,long
3,1,13,179,642.31,1585.496,1400.208,554.109,2388.057,9049.82,47.22,522.015,2388.043,8132.187,8.40578,392.0,38.976,23.38296,long
4,1,14,178,642.31,1586.512,1399.937,554.112,2388.055,9049.609,47.251,521.896,2388.035,8132.247,8.4086,392.1,39.006,23.38383,long


In [106]:
# Save the feature-engineered data (index=False: the row index is not written)
data_processed.to_csv('../data/production/FD001.csv', index=False)