# Data Exploration

### Define column names

In [15]:
import pandas as pd

In [2]:
# Column labels follow the data source documentation, in file order:
# two identifier columns, three operating settings, then 21 sensor channels
# (2 + 3 + 21 = 26 columns in total).
column_names = ["engine_id", "cycle"]
column_names.extend(f"op_setting_{n}" for n in range(1, 4))
column_names.extend(f"sensor_{n}" for n in range(1, 22))

### Loading the training data (FD001)

In [3]:
# Path to the FD001 training file, relative to this notebook.
training_path = "../data/train_FD001.txt"

# The file has no header row, so the labels come from `column_names`.
# The r"\s+" separator treats any run of whitespace as a single delimiter,
# which handles variable spacing safely -- common in industrial logs.
df = pd.read_csv(training_path, names=column_names, header=None, sep=r"\s+")

### Verify shape & sanity

In [4]:
# A notebook cell only renders its LAST bare expression, so the shape and
# head were previously computed and thrown away. Display each check
# explicitly so all three appear in the cell output.
display(df.shape)   # Should be around 20K rows and 26 columns
display(df.head())  # First few rows
display(df.tail())  # Last few rows

Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,...,519.49,2388.26,8137.6,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.5,1433.58,14.62,...,519.68,2388.22,8136.5,8.5139,0.03,395,2388,100.0,38.3,23.1594
20628,100,198,0.0004,0.0,100.0,518.67,643.42,1602.46,1428.18,14.62,...,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,...,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.064
20630,100,200,-0.0032,-0.0005,100.0,518.67,643.85,1600.38,1432.14,14.62,...,519.3,2388.26,8137.33,8.5036,0.03,396,2388,100.0,38.37,23.0522


### Check engine (unit) count and basic structure

In [5]:
# Number of unique engines should be 100. It is printed explicitly because a
# bare expression that is not the cell's last statement is never shown.
print("Unique engines:", df["engine_id"].nunique())
df.info()  # All numeric types
df.describe()  # No unexpected NaNs (count equals row count), Reasonable ranges

<class 'pandas.DataFrame'>
RangeIndex: 20631 entries, 0 to 20630
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   engine_id     20631 non-null  int64  
 1   cycle         20631 non-null  int64  
 2   op_setting_1  20631 non-null  float64
 3   op_setting_2  20631 non-null  float64
 4   op_setting_3  20631 non-null  float64
 5   sensor_1      20631 non-null  float64
 6   sensor_2      20631 non-null  float64
 7   sensor_3      20631 non-null  float64
 8   sensor_4      20631 non-null  float64
 9   sensor_5      20631 non-null  float64
 10  sensor_6      20631 non-null  float64
 11  sensor_7      20631 non-null  float64
 12  sensor_8      20631 non-null  float64
 13  sensor_9      20631 non-null  float64
 14  sensor_10     20631 non-null  float64
 15  sensor_11     20631 non-null  float64
 16  sensor_12     20631 non-null  float64
 17  sensor_13     20631 non-null  float64
 18  sensor_14     20631 non-null  float64

Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
count,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,...,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0,20631.0
mean,51.506568,108.807862,-9e-06,2e-06,100.0,518.67,642.680934,1590.523119,1408.933782,14.62,...,521.41347,2388.096152,8143.752722,8.442146,0.03,393.210654,2388.0,100.0,38.816271,23.289705
std,29.227633,68.88099,0.002187,0.000293,0.0,0.0,0.500053,6.13115,9.000605,5.3292e-15,...,0.737553,0.071919,19.076176,0.037505,3.469531e-18,1.548763,0.0,0.0,0.180746,0.108251
min,1.0,1.0,-0.0087,-0.0006,100.0,518.67,641.21,1571.04,1382.25,14.62,...,518.69,2387.88,8099.94,8.3249,0.03,388.0,2388.0,100.0,38.14,22.8942
25%,26.0,52.0,-0.0015,-0.0002,100.0,518.67,642.325,1586.26,1402.36,14.62,...,520.96,2388.04,8133.245,8.4149,0.03,392.0,2388.0,100.0,38.7,23.2218
50%,52.0,104.0,0.0,0.0,100.0,518.67,642.64,1590.1,1408.04,14.62,...,521.48,2388.09,8140.54,8.4389,0.03,393.0,2388.0,100.0,38.83,23.2979
75%,77.0,156.0,0.0015,0.0003,100.0,518.67,643.0,1594.38,1414.555,14.62,...,521.95,2388.14,8148.31,8.4656,0.03,394.0,2388.0,100.0,38.95,23.3668
max,100.0,362.0,0.0087,0.0006,100.0,518.67,644.53,1616.91,1441.49,14.62,...,523.38,2388.56,8293.72,8.5848,0.03,400.0,2388.0,100.0,39.43,23.6184


### Check cycle progression per engine

In [6]:
# Highest cycle number recorded for each engine (first five engines shown).
# Each engine runs for a different number of cycles,
# so degradation length varies across engines.
(
    df.groupby("engine_id")["cycle"]
    .max()
    .head()
)

engine_id
1    192
2    287
3    179
4    189
5    269
Name: cycle, dtype: int64

In [7]:
# The file already appears to be ordered by engine_id and cycle, but sort
# explicitly so later steps do not depend on that. `ignore_index=True`
# relabels the rows 0..n-1 after sorting.
df = df.sort_values(["engine_id", "cycle"], ignore_index=True)

### Check Operational Settings

In [8]:
# Count the distinct values taken by each operating setting.
df[["op_setting_1", "op_setting_2", "op_setting_3"]].nunique()

# A count of 1 means that setting is constant over the whole dataset.
# A count above 1 does NOT by itself show multiple operating conditions:
# check the spread (std / min / max) of those settings before concluding
# that there is a single operating condition and that sensor trends are
# safe to compare directly across engines.

op_setting_1    158
op_setting_2     13
op_setting_3      1
dtype: int64

In [16]:
# Spread of each operating setting across the whole training set.
df[["op_setting_1", "op_setting_2", "op_setting_3"]].std()

# Standard deviation is exactly zero only for a constant setting
# (op_setting_3 in the recorded output). op_setting_1 and op_setting_2 have
# a small but non-zero spread (about 2e-3 and 3e-4 in the recorded output).
# NOTE(review): that spread looks like small fluctuation around one operating
# point, consistent with a single operating condition -- confirm against the
# dataset documentation.

# Data loaded correctly
# Validated structure
# Clean time-series

op_setting_1    0.002187
op_setting_2    0.000293
op_setting_3    0.000000
dtype: float64

### Write clean data in CSV format

In [17]:
# Persist the sorted frame as CSV; index=False keeps the row index out of the file.
df.to_csv(path_or_buf="../data/train_FD001_clean.csv", index=False)