In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [4]:
train_df = pd.read_csv('train_FD001.txt')
train_df.head()

Unnamed: 0,1 1 -0.0007 -0.0004 100.0 518.67 641.82 1589.70 1400.60 14.62 21.61 554.36 2388.06 9046.19 1.30 47.47 521.66 2388.02 8138.62 8.4195 0.03 392 2388 100.00 39.06 23.4190
0,1 2 0.0019 -0.0003 100.0 518.67 642.15 1591.82...
1,1 3 -0.0043 0.0003 100.0 518.67 642.35 1587.99...
2,1 4 0.0007 0.0000 100.0 518.67 642.35 1582.79 ...
3,1 5 -0.0019 -0.0002 100.0 518.67 642.37 1582.8...
4,1 6 -0.0043 -0.0001 100.0 518.67 642.10 1584.4...


In [5]:
# The FD001 dataset has 26 columns:
# 1: engine_id
# 2: cycle
# 3: setting1 (operational setting 1)
# 4: setting2 (operational setting 2)
# 5: setting3 (operational setting 3)
# 6-26: sensor_1 to sensor_21

column_names = ['engine_id', 'cycle', 'setting1', 'setting2', 'setting3',
                'sensor1', 'sensor2', 'sensor3', 'sensor4', 'sensor5',
                'sensor6', 'sensor7', 'sensor8', 'sensor9', 'sensor10',
                'sensor11', 'sensor12', 'sensor13', 'sensor14', 'sensor15',
                'sensor16', 'sensor17', 'sensor18', 'sensor19', 'sensor20',
                'sensor21']

# Load the training data
train_df = pd.read_csv('train_FD001.txt', sep='\s+', header=None, names=column_names)

# Load the test data (without RUL)
test_df = pd.read_csv('test_FD001.txt', sep='\s+', header=None, names=column_names)

# Load the RUL data for the test set
# This file usually just contains the RUL for the *last* cycle of each engine in the test set
rul_df = pd.read_csv('RUL_FD001.txt', sep='\s+', header=None, names=['RUL'])

print("Training data shape:", train_df.shape)
print("Test data shape:", test_df.shape)
print("RUL data shape:", rul_df.shape)

Training data shape: (20631, 26)
Test data shape: (13096, 26)
RUL data shape: (100, 1)


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20631 entries, 0 to 20630
Data columns (total 26 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   engine_id  20631 non-null  int64  
 1   cycle      20631 non-null  int64  
 2   setting1   20631 non-null  float64
 3   setting2   20631 non-null  float64
 4   setting3   20631 non-null  float64
 5   sensor1    20631 non-null  float64
 6   sensor2    20631 non-null  float64
 7   sensor3    20631 non-null  float64
 8   sensor4    20631 non-null  float64
 9   sensor5    20631 non-null  float64
 10  sensor6    20631 non-null  float64
 11  sensor7    20631 non-null  float64
 12  sensor8    20631 non-null  float64
 13  sensor9    20631 non-null  float64
 14  sensor10   20631 non-null  float64
 15  sensor11   20631 non-null  float64
 16  sensor12   20631 non-null  float64
 17  sensor13   20631 non-null  float64
 18  sensor14   20631 non-null  float64
 19  sensor15   20631 non-null  float64
 20  sensor

In [7]:
train_df.describe().transpose().head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
engine_id,20631.0,51.506568,29.227633,1.0,26.0,52.0,77.0,100.0
cycle,20631.0,108.807862,68.88099,1.0,52.0,104.0,156.0,362.0
setting1,20631.0,-9e-06,0.002187,-0.0087,-0.0015,0.0,0.0015,0.0087
setting2,20631.0,2e-06,0.000293,-0.0006,-0.0002,0.0,0.0003,0.0006
setting3,20631.0,100.0,0.0,100.0,100.0,100.0,100.0,100.0


In [8]:
train_df.isna().sum()

Unnamed: 0,0
engine_id,0
cycle,0
setting1,0
setting2,0
setting3,0
sensor1,0
sensor2,0
sensor3,0
sensor4,0
sensor5,0


In [9]:
column_names_maxcycles = ['engine_id', 'max_cycle']

max_cycles_per_engine = train_df.groupby('engine_id')['cycle'].max().reset_index()
max_cycles_per_engine.columns = column_names_maxcycles

print("\nMaximum cycle for each engine:")
max_cycles_per_engine


Maximum cycle for each engine:


Unnamed: 0,engine_id,max_cycle
0,1,192
1,2,287
2,3,179
3,4,189
4,5,269
...,...,...
95,96,336
96,97,202
97,98,156
98,99,185


In [10]:
# Calculate RUL for the training data
# For each engine, the RUL at a given cycle is the max cycle for that engine minus the current cycle
train_df['RUL'] = train_df.groupby('engine_id')['cycle'].transform(lambda x: x.max() - x)

print("Training data with RUL column:\n")
train_df.head(10)

Training data with RUL column:



Unnamed: 0,engine_id,cycle,setting1,setting2,setting3,sensor1,sensor2,sensor3,sensor4,sensor5,...,sensor13,sensor14,sensor15,sensor16,sensor17,sensor18,sensor19,sensor20,sensor21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187
5,1,6,-0.0043,-0.0001,100.0,518.67,642.1,1584.47,1398.37,14.62,...,2388.03,8132.85,8.4108,0.03,391,2388,100.0,38.98,23.3669,186
6,1,7,0.001,0.0001,100.0,518.67,642.48,1592.32,1397.77,14.62,...,2388.03,8132.32,8.3974,0.03,392,2388,100.0,39.1,23.3774,185
7,1,8,-0.0034,0.0003,100.0,518.67,642.56,1582.96,1400.97,14.62,...,2388.03,8131.07,8.4076,0.03,391,2388,100.0,38.97,23.3106,184
8,1,9,0.0008,0.0001,100.0,518.67,642.12,1590.98,1394.8,14.62,...,2388.05,8125.69,8.3728,0.03,392,2388,100.0,39.05,23.4066,183
9,1,10,-0.0033,0.0001,100.0,518.67,641.71,1591.24,1400.46,14.62,...,2388.06,8129.38,8.4286,0.03,393,2388,100.0,38.95,23.4694,182
