In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from src.data_utils.feature_engineering import (
    create_cross_system_group_interaction_features,
    create_delta_features,
    create_fuel_air_group_interaction_features,
    create_pressure_group_interaction_features,
    create_rolling_features,
    create_settings_sensor_interactions,
    create_settings_x_settings_interaction_features,
    create_speed_group_interaction_features,
    create_temperature_group_interaction_features,
)
from src.utils.config import config

# Install a blanket "ignore" filter so warnings do not clutter the cell outputs.
# NOTE(review): this hides every warning category, including ones that could
# point at real data or API problems; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# 1. Feature Engineering
Original features: 3 settings + 21 sensors = 24 features

In [None]:
# Folder holding the prepared CSVs, taken from the project configuration.
prepared_folder = config.PREPARED_DATA_PATH

# Resolve both file locations first, then read them with identical options.
# index_col=False tells pandas not to use the first column as the index.
train_csv = prepared_folder / "train-all-prepared.csv"
test_csv = prepared_folder / "test-all-prepared.csv"
train_df = pd.read_csv(train_csv, index_col=False)
test_df = pd.read_csv(test_csv, index_col=False)

In [None]:
# Display the freshly loaded training frame to eyeball its columns and size.
train_df

## 1. Rolling Window Features
Moving averages of sensor readings over last N cycles
sensor_1_rolling_mean_5 = average of sensor 1 over last 5 cycles
Smooths out noise, reveals underlying trends

In [None]:
# Add rolling-window features to both splits using the same window lengths.
# Other window sets that were tried earlier: [3, 5, 10], [2, 3, 5], [2].
print("Creating rolling features...")
window_sizes = [3, 5, 10, 20]
train_df, test_df = (
    create_rolling_features(split, window_sizes=window_sizes) for split in (train_df, test_df)
)

In [None]:
# Rank the rolling-window features by the strength of their monotonic
# (Spearman) relationship with RUL.
rolling_cols = [c for c in train_df.columns if "_roll_" in c]

# corrwith() correlates each feature column with the target directly, so no
# full feature-by-feature matrix is built and RUL's own self-correlation is
# never part of the result (no positional "skip the first row" needed).
rolling_rul_corr = train_df[rolling_cols].corrwith(train_df["RUL"], method="spearman")
correlations = rolling_rul_corr.abs().sort_values(ascending=False)

print("Top 10 rolling features correlated with RUL:")
print(correlations.head(10))

# Check settings rolling features specifically; the signed values are printed
# here, reusing the correlations computed above instead of recomputing them.
setting_rolling = [c for c in rolling_cols if c.startswith("setting")]
print("\nSettings rolling features correlation with RUL:")
for col in setting_rolling:
    corr = rolling_rul_corr[col]
    print(f"{col}: {corr:.4f}")

In [None]:
# Display the training frame again, now that the rolling features were added.
train_df

## 2. Delta/Rate Features
How much each sensor changed from previous cycle
Example: sensor_1_delta = current_value - previous_value
Captures degradation speed/acceleration

In [None]:
# Add the cycle-to-cycle delta features to the train and test frames alike.
print("Creating delta features...")
train_df, test_df = (create_delta_features(split) for split in (train_df, test_df))

In [None]:
# Let correlation with RUL tell us which deltas are useful.
# corrwith() correlates each delta column with the target directly, so RUL's
# self-correlation (always 1.0) no longer occupies one of the ten printed
# slots, and no full feature-by-feature matrix has to be computed.
delta_cols = [c for c in train_df.columns if "_delta" in c]
delta_correlations = (
    train_df[delta_cols].corrwith(train_df["RUL"], method="spearman").abs().sort_values(ascending=False)
)

print("Top 10 delta features correlated with RUL:")
print(delta_correlations.head(10))

## 3. Time-based Features
What: Normalized cycle position (0 to 1 across engine's life)
Example: cycle_norm = current_cycle / max_cycles_for_this_engine

❗❗❗ Won't do - skip this FE ❗❗❗
The training set has data all the way to failure.
The test set has data only up to a certain cycle, and RUL must be predicted from that point onward.
The two sets therefore cannot be normalized the same way; doing so would create issues at prediction time.

## 4. Interaction Features (Medium complexity)
What: Combine operating settings to capture complex conditions
Example: setting_1_x_setting_2 = setting_1 * setting_2
Why important: Equipment might behave differently under combined stress
Computation: Simple multiplication

Operating settings with each other (setting_1 × setting_2 × setting_3)
Settings with key sensors (high temperature + high pressure scenarios)
Physically related sensors (temperature sensors with pressure sensors)

`X` : captures "amplification" effects
`+` : captures "combined stress" effects
`Ratio/Division`: captures "efficiency" or normalized response

### 4.1 Settings × Settings: 3 features (systematic)

In [None]:
# Build the settings-by-settings interaction features on both splits; the list
# of new column names is kept from the training call only.
print("Creating settings x settings features...")
(train_df, settings_interaction_cols), (test_df, _) = (
    create_settings_x_settings_interaction_features(split) for split in (train_df, test_df)
)

In [None]:
# Absolute Spearman correlation of each settings-pair feature with RUL,
# strongest first. The "RUL" row itself is part of the result.
settings_vs_rul = train_df[[*settings_interaction_cols, "RUL"]].corr(method="spearman")["RUL"]
settings_interaction_corr = settings_vs_rul.abs().sort_values(ascending=False)

print("Settings×settings interactions correlated with RUL:")
print(settings_interaction_corr.head(4))

### 4.2 Settings × Sensors: 63 features (systematic)

In [None]:
# Add the settings-by-sensor interaction features to train and test alike.
print("Creating settings×sensors interaction features...")
train_df, test_df = (create_settings_sensor_interactions(split) for split in (train_df, test_df))

In [None]:
# Check correlations with RUL.
# Columns are picked by naming pattern: they start with "setting_" and contain "_x_s_".
settings_sensor_cols = [c for c in train_df.columns if c.startswith("setting_") and "_x_s_" in c]

# corrwith() correlates each interaction column with the target directly, so
# RUL's self-correlation (always 1.0) no longer takes one of the ten printed
# slots, and the full feature-by-feature matrix is not computed.
settings_sensor_corr = (
    train_df[settings_sensor_cols].corrwith(train_df["RUL"], method="spearman").abs().sort_values(ascending=False)
)

print("Top 10 settings×sensors interactions correlated with RUL:")
print(settings_sensor_corr.head(10))

### 4.3 Sensors x sensors:
Instead of all 210 possible combinations, let's group them by what they measure.

##### 4.3.1 Temperature Group

In [None]:
# Build the temperature-group interaction features on both splits; the list of
# new column names is kept from the training call only.
print("Creating temperature features...")
(train_df, interaction_cols), (test_df, _) = (
    create_temperature_group_interaction_features(split) for split in (train_df, test_df)
)

In [None]:
# Check correlations with RUL for the temperature-group interactions.
# The result gets its own name (instead of overwriting the settings×settings
# result) and the printed label now matches what is actually being ranked.
# The "RUL" row itself is part of the result.
temperature_interaction_corr = (
    train_df[interaction_cols + ["RUL"]].corr(method="spearman")["RUL"].abs().sort_values(ascending=False)
)

print("Temperature group interactions correlated with RUL:")
print(temperature_interaction_corr.head(10))

##### 4.3.2 Pressure Group

In [None]:
# Build the pressure-group interaction features on both splits; the list of new
# column names is kept from the training call only.
print("Creating pressure group interaction features...")
(train_df, pressure_interaction_cols), (test_df, _) = (
    create_pressure_group_interaction_features(split) for split in (train_df, test_df)
)

In [None]:
# Absolute Spearman correlation of each pressure-group feature with RUL,
# strongest first (the "RUL" row itself is part of the result).
pressure_vs_rul = train_df[[*pressure_interaction_cols, "RUL"]].corr(method="spearman")["RUL"]
pressure_interaction_corr = pressure_vs_rul.abs().sort_values(ascending=False)

print("Pressure group interactions correlated with RUL:")
print(pressure_interaction_corr.head(10))

##### 4.3.3 Speed Group

In [None]:
# Build the speed-group interaction features on both splits; the list of new
# column names is kept from the training call only.
print("Creating speed group interaction features...")
(train_df, speed_interaction_cols), (test_df, _) = (
    create_speed_group_interaction_features(split) for split in (train_df, test_df)
)

In [None]:
# Absolute Spearman correlation of each speed-group feature with RUL,
# strongest first (the "RUL" row itself is part of the result).
speed_vs_rul = train_df[[*speed_interaction_cols, "RUL"]].corr(method="spearman")["RUL"]
speed_interaction_corr = speed_vs_rul.abs().sort_values(ascending=False)

print("Speed group interactions correlated with RUL:")
print(speed_interaction_corr.head(10))

##### 4.3.4 Fuel & Air Group

In [None]:
# Build the fuel & air group interaction features on both splits; the list of
# new column names is kept from the training call only.
print("Creating fuel & air group interaction features...")
(train_df, fuel_air_interaction_cols), (test_df, _) = (
    create_fuel_air_group_interaction_features(split) for split in (train_df, test_df)
)

In [None]:
# Absolute Spearman correlation of each fuel & air group feature with RUL,
# strongest first (the "RUL" row itself is part of the result).
fuel_air_vs_rul = train_df[[*fuel_air_interaction_cols, "RUL"]].corr(method="spearman")["RUL"]
fuel_air_interaction_corr = fuel_air_vs_rul.abs().sort_values(ascending=False)

print("Fuel & air group interactions correlated with RUL:")
print(fuel_air_interaction_corr.head(10))

##### 4.3.5 Cross-System Group (2 interactions)

In [None]:
# Build the cross-system interaction features on both splits; the list of new
# column names is kept from the training call only.
print("Creating cross-system group interaction features...")
(train_df, cross_system_interaction_cols), (test_df, _) = (
    create_cross_system_group_interaction_features(split) for split in (train_df, test_df)
)

In [None]:
# Absolute Spearman correlation of each cross-system feature with RUL,
# strongest first (the "RUL" row itself is part of the result).
cross_system_vs_rul = train_df[[*cross_system_interaction_cols, "RUL"]].corr(method="spearman")["RUL"]
cross_system_interaction_corr = cross_system_vs_rul.abs().sort_values(ascending=False)

print("Cross-system group interactions correlated with RUL:")
print(cross_system_interaction_corr.head(10))

_____

# Feature Importance

In [None]:
# (rows, columns) of the training frame after all feature-engineering steps above.
train_df.shape

In [None]:
# (rows, columns) of the test frame after the same feature-engineering steps.
test_df.shape

Planning to use Random Forest, should be able to handle ~200 features

In [None]:
# Working aliases for the modelling section. These are references, not copies:
# both names point at the same frames as train_df / test_df.
train_to_use, test_to_use = train_df, test_df

## 1. Random Forest Feature Importance

In [None]:
# Inclusive RUL window kept for each subset id.
rul_thresholds = {
    1: {"max": 145, "min": 6},
    2: {"max": 194, "min": 6},
    3: {"max": 145, "min": 6},
    4: {"max": 194, "min": 6},
}

# For every subset keep only the rows whose RUL lies inside that subset's
# [min, max] window (both ends inclusive). Rows whose subset id has no entry
# in the table are not carried over.
filtered_dfs = [
    train_to_use[
        (train_to_use["subset"] == subset_id) & train_to_use["RUL"].between(bounds["min"], bounds["max"])
    ]
    for subset_id, bounds in rul_thresholds.items()
]

# Stack the per-subset pieces back into one frame with a fresh 0..n-1 index.
train_to_use = pd.concat(filtered_dfs, ignore_index=True)

In [None]:
# Hold out 10% of the distinct unit numbers for validation, so that all rows
# sharing a unit number land on the same side of the split.
# NOTE(review): the split key is `unit_number` alone; if unit numbers repeat
# across `subset` values, same-numbered units from different subsets are
# always assigned together. Confirm that this is intended.
unique_units = train_to_use["unit_number"].unique()
train_units, valid_units = train_test_split(unique_units, test_size=0.1, random_state=45)

# Boolean row selectors (`test_mask` selects the validation rows).
train_mask = train_to_use["unit_number"].isin(train_units)
test_mask = train_to_use["unit_number"].isin(valid_units)

# These three columns are excluded from the model inputs; "RUL" is the target.
non_feature_cols = ["unit_number", "time_cycles", "RUL"]
train_rows = train_to_use.loc[train_mask]
valid_rows = train_to_use.loc[test_mask]

X_train, y_train = train_rows.drop(columns=non_feature_cols), train_rows["RUL"]
X_valid, y_valid = valid_rows.drop(columns=non_feature_cols), valid_rows["RUL"]

In [None]:
# Min-max scale every input column except "subset", which is treated as a
# categorical label and passed through unchanged.
feature_cols = [col for col in X_train.columns if col != "subset"]

# The scaler learns its ranges from the training rows only; the validation
# rows are transformed with those same ranges.
scaler = MinMaxScaler().fit(X_train[feature_cols])

X_train_scaled, X_valid_scaled = X_train.copy(), X_valid.copy()
X_train_scaled[feature_cols] = scaler.transform(X_train[feature_cols])
X_valid_scaled[feature_cols] = scaler.transform(X_valid[feature_cols])

In [None]:
# Hyper-parameters of the baseline forest used to rank features.
rf_params = {
    "n_estimators": 500,  # number of trees in the forest
    "max_depth": None,  # no explicit depth cap; growth is limited by the two settings below
    "min_samples_split": 5,  # a node needs at least 5 samples to be split further
    "min_samples_leaf": 2,  # every leaf keeps at least 2 samples
    "max_features": "sqrt",  # candidate features examined at each split (about 14 for ~200 columns)
    "random_state": 46,  # fixed seed for reproducible trees
    "n_jobs": -1,  # use all available CPU cores (original note: XGBoost would be the GPU option)
}
rf = RandomForestRegressor(**rf_params).fit(X_train_scaled, y_train)

# Every input column is kept at this stage; no top-N cut is applied here.
top_n_features = X_train_scaled.columns

In [None]:
# Pair each input column with the importance reported by the fitted forest,
# then order the table from most to least important.
importance_table = pd.DataFrame({"feature": X_train.columns, "importance": rf.feature_importances_})
feature_importance = importance_table.sort_values(by="importance", ascending=False)

# Show the 20 strongest features.
print("Top 20 Most Important Features:")
print(feature_importance.head(20))

In [None]:
# Horizontal bar chart of the 15 most important features, strongest at the top.
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, top_features["importance"])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features["feature"])
ax.set_xlabel("Feature Importance")
ax.set_title("Top 15 Feature Importances")
ax.invert_yaxis()  # bar 0 (the most important feature) is drawn at the top
fig.tight_layout()
plt.show()

# Root-mean-squared error of the forest on the held-out validation rows.
valid_predictions = rf.predict(X_valid_scaled)
valid_rmse = np.sqrt(((valid_predictions - y_valid) ** 2).mean())
print(f"Baseline performance - RMSE: {valid_rmse:.2f}")