# 01 — FD001 Data Exploration

Load and explore the NASA C-MAPSS FD001 turbofan engine degradation dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style so every figure below shares the same look.
sns.set_style("whitegrid")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Load the raw data

The C-MAPSS files are space-delimited with no headers. Columns are:
- `engine_id`, `cycle`
- `op_setting_1`, `op_setting_2`, `op_setting_3`
- `sensor_1` through `sensor_21`

In [None]:
# Column names for the headerless C-MAPSS files: two identifiers, three
# operating settings, then sensor_1 .. sensor_21.
columns = [
    "engine_id",
    "cycle",
    *(f"op_setting_{k}" for k in range(1, 4)),
    *(f"sensor_{k}" for k in range(1, 22)),
]

# All three files are whitespace-delimited and carry no header row.
read_opts = {"sep": r"\s+", "header": None}

train_df = pd.read_csv("../data/raw/train_FD001.txt", names=columns, **read_opts)
test_df = pd.read_csv("../data/raw/test_FD001.txt", names=columns, **read_opts)
rul_df = pd.read_csv("../data/raw/RUL_FD001.txt", names=["rul"], **read_opts)

# Report (rows, columns) of each frame as a quick load sanity check.
for template, frame in (
    ("Training data: {}", train_df),
    ("Test data:     {}", test_df),
    ("RUL labels:    {}", rul_df),
):
    print(template.format(frame.shape))

In [None]:
# Peek at the first ten rows to confirm the column names line up with the data.
train_df.head(10)

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
train_df.describe()

## 2. Compute RUL (Remaining Useful Life)

For each row: `RUL = max_cycle_for_that_engine - current_cycle`

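Then cap RUL at 125 cycles, giving a piecewise-linear target: early in an engine's life there is no meaningful degradation to learn from, so RUL is held constant at the cap until it drops below 125.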

In [None]:
# Upper bound applied to the RUL target (in cycles); larger values are
# flattened to this plateau.
RUL_CAP = 125

# Last recorded cycle of each engine, indexed by engine_id.
max_cycles = train_df.groupby("engine_id")["cycle"].max().rename("max_cycle")

# Look up each row's final cycle directly via its engine_id instead of
# merging in (and then dropping) a temporary column. `assign` returns a new
# frame, so re-running this cell always yields the same result.
train_df = train_df.assign(
    rul=lambda df: df["engine_id"].map(max_cycles) - df["cycle"],
    rul_capped=lambda df: df["rul"].clip(upper=RUL_CAP),
)

# Invariant: no row can lie beyond its own engine's last cycle.
assert train_df["rul"].ge(0).all(), "RUL must be non-negative for every row"

train_df[["engine_id", "cycle", "rul", "rul_capped"]].head(10)

## 3. Engine lifetime distribution

In [None]:
# An engine's lifetime is the highest cycle number recorded for it.
engine_lifetimes = train_df.groupby("engine_id")["cycle"].max()
mean_lifetime = engine_lifetimes.mean()

fig, ax = plt.subplots(figsize=(10, 4))
ax.hist(engine_lifetimes, bins=30, edgecolor="black", alpha=0.7)
# Dashed red marker at the mean lifetime, labelled in the legend.
ax.axvline(mean_lifetime, color="red", linestyle="--", label=f"Mean: {mean_lifetime:.0f}")
ax.set(
    xlabel="Lifetime (cycles)",
    ylabel="Number of engines",
    title="Engine Lifetime Distribution (FD001 Training Set)",
)
ax.legend()
fig.tight_layout()
plt.show()

# One-line numeric summary to accompany the histogram.
lifetime_stats = [
    f"Min: {engine_lifetimes.min()}",
    f"Max: {engine_lifetimes.max()}",
    f"Mean: {mean_lifetime:.1f}",
]
print(", ".join(lifetime_stats))

## 4. Sensor overview — identify constant/low-variance sensors

In [None]:
# Standard-deviation cut-off below which a sensor is flagged as near-constant.
# Defined once so the reference line, its legend label, and the printed list
# cannot drift apart.
# NOTE(review): this is an absolute threshold on raw (unscaled) readings, so
# it is not comparable across sensors with different magnitudes — confirm
# flagged sensors visually before dropping them.
STD_THRESHOLD = 0.5

sensor_cols = [c for c in train_df.columns if c.startswith("sensor_")]

# Per-sensor standard deviation over the whole training set, smallest first.
sensor_std = train_df[sensor_cols].std().sort_values()
near_constant = sensor_std[sensor_std < STD_THRESHOLD].index.tolist()

fig, ax = plt.subplots(figsize=(12, 5))
sensor_std.plot(kind="bar", ax=ax, color="steelblue", edgecolor="black")
ax.set_ylabel("Standard Deviation")
ax.set_title("Sensor Standard Deviations — Low values = near-constant (candidates for removal)")
ax.axhline(y=STD_THRESHOLD, color="red", linestyle="--", alpha=0.7, label=f"Threshold = {STD_THRESHOLD}")
ax.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print(f"Near-constant sensors (std < {STD_THRESHOLD}):")
print(near_constant)

## 5. Sensor traces for a few engines

Plot sensor readings over the lifetime of a few engines to see degradation patterns.

In [None]:
plot_sensors = ["sensor_2", "sensor_3", "sensor_4", "sensor_7", "sensor_11", "sensor_12", "sensor_15", "sensor_20", "sensor_21"]
sample_engines = [1, 20, 50, 80]

# Slice each sampled engine's rows once up front; the slice does not depend
# on the sensor, so there is no need to re-filter it for every subplot.
engine_traces = [
    (eng_id, train_df[train_df["engine_id"] == eng_id])
    for eng_id in sample_engines
]

# squeeze=False always yields a 2-D array of Axes, so flattening it keeps
# `axes` a 1-D sequence even when plot_sensors holds a single sensor.
fig, axes = plt.subplots(
    len(plot_sensors),
    1,
    figsize=(14, 3 * len(plot_sensors)),
    sharex=False,
    squeeze=False,
)
axes = axes.ravel()

# One subplot per sensor, one line per sampled engine.
for ax, sensor in zip(axes, plot_sensors):
    for eng_id, eng_data in engine_traces:
        ax.plot(eng_data["cycle"], eng_data[sensor], alpha=0.7, label=f"Engine {eng_id}")
    ax.set_ylabel(sensor)
    ax.legend(loc="upper left", fontsize=8)

axes[0].set_title("Sensor Readings Over Engine Lifetime")
axes[-1].set_xlabel("Cycle")
plt.tight_layout()
plt.show()

## 6. Correlation with RUL

In [None]:
# Correlation of every sensor with the uncapped RUL column, most negative
# first (DataFrame.corr defaults to Pearson).
correlations = train_df[sensor_cols + ["rul"]].corr()["rul"].drop("rul").sort_values()

# A sensor whose correlation is NaN (typically because its signal never
# varies) would show up as a blank row in the chart. Report those sensors
# explicitly and plot only the defined values; `correlations` itself is left
# untouched.
undefined_sensors = correlations[correlations.isna()].index.tolist()
if undefined_sensors:
    print(f"Sensors with undefined correlation (omitted from plot): {undefined_sensors}")
plotted = correlations.dropna()

# Vectorised colour choice: salmon for negative, steelblue otherwise.
bar_colors = np.where(plotted < 0, "salmon", "steelblue").tolist()

fig, ax = plt.subplots(figsize=(10, 6))
plotted.plot(kind="barh", ax=ax, color=bar_colors)
ax.set_xlabel("Correlation with RUL")
ax.set_title("Sensor Correlation with Remaining Useful Life")
ax.axvline(x=0, color="black", linewidth=0.5)
plt.tight_layout()
plt.show()

## 7. RUL target distribution

In [None]:
# Each panel: (column to histogram, title, x-axis label, extra hist kwargs).
panels = [
    ("rul", "Raw RUL Distribution", "RUL", {}),
    ("rul_capped", f"Capped RUL Distribution (cap={RUL_CAP})", "RUL (capped)", {"color": "orange"}),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# Draw the raw and capped targets side by side with identical binning.
for ax, (column, title, xlabel, extra_kwargs) in zip(axes, panels):
    ax.hist(train_df[column], bins=50, edgecolor="black", alpha=0.7, **extra_kwargs)
    ax.set_title(title)
    ax.set_xlabel(xlabel)

plt.tight_layout()
plt.show()

## Summary

**Key findings to carry forward into preprocessing:**
- Several sensors are near-constant and can be dropped
- Some sensors show clear degradation trends over engine lifetime
- RUL capping at 125 gives a cleaner target distribution
- Sensor correlations with RUL will guide feature selection