In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- 1. Load Data ---
df = pd.read_csv('../data/train_FD001.csv')

# --- 2. Feature Selection (from EDA) ---
# Drop the constant columns we identified in our EDA
columns_to_drop = [
    'sensor_measurement_1', 'sensor_measurement_5', 'sensor_measurement_6',
    'sensor_measurement_10', 'sensor_measurement_16', 'sensor_measurement_18',
    'sensor_measurement_19'
]
df.drop(columns=columns_to_drop, inplace=True)

# --- 3. Feature Engineering (from EDA) ---
# Create rolling average features for the most promising sensors
window_size = 10
promising_sensors = [
    'sensor_measurement_4', 'sensor_measurement_7', 'sensor_measurement_11', 'sensor_measurement_12'
]
for sensor in promising_sensors:
    df[f'{sensor}_rolling_avg'] = df.groupby('unit_number')[sensor].transform(
        lambda x: x.rolling(window_size, min_periods=1).mean()
    )

# --- 4. Create the Target Variable ---
max_cycles = df.groupby('unit_number')['time_in_cycles'].max()
df = df.merge(max_cycles.to_frame(name='max_cycles'), left_on='unit_number', right_index=True)
df['RUL'] = df['max_cycles'] - df['time_in_cycles']
df['label'] = (df['RUL'] <= 30).astype(int)

# --- 5. Final Feature Selection and Splitting ---
# IMPORTANT: Our feature list now includes the new rolling average features!
feature_cols = [col for col in df.columns if 'op_setting' in col or 'sensor' in col]

# Define X and y
X = df[feature_cols]
y = df['label']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- 6. Scale the Data ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Improved data pipeline is ready for modeling.")
print(f"Number of features: {X_train_scaled.shape[1]}")
print(f"X_train_scaled shape: {X_train_scaled.shape}")

Improved data pipeline is ready for modeling.
Number of features: 21
X_train_scaled shape: (16504, 21)


In [5]:
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import tensorflow as tf

# Set a new experiment name to keep results separate
mlflow.set_experiment("Predictive Maintenance - Feature Engineered")

# --- Run 1: Random Forest on Engineered Features ---
with mlflow.start_run(run_name="Random Forest (Feature Engineered)"):
    # Log parameters
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("features_used", X_train_scaled.shape[1])
    mlflow.log_param("model_type", "RandomForestClassifier")
    
    # Train the model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    rf_model.fit(X_train_scaled, y_train)
    
    # Evaluate and log metrics
    y_pred_rf = rf_model.predict(X_test_scaled)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    mlflow.log_metric("accuracy", accuracy_rf)
    
    # Log the model
    mlflow.sklearn.log_model(rf_model, "random_forest_model_v2")
    
    print(f"\nRandom Forest (Feature Engineered) run logged with accuracy: {accuracy_rf:.4f}")

# --- Run 2: Neural Network on Engineered Features ---
with mlflow.start_run(run_name="Neural Network (Feature Engineered)"):
    # Log parameters
    mlflow.log_param("epochs", 10)
    mlflow.log_param("batch_size", 32)
    mlflow.log_param("features_used", X_train_scaled.shape[1])
    mlflow.log_param("model_type", "TensorFlow/Keras")
    
    # Define and compile the model (using the same architecture as before)
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(32, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Train the model
    model.fit(
        X_train_scaled,
        y_train,
        epochs=10,
        batch_size=32,
        validation_split=0.2,
        verbose=1
    )
    
    # Evaluate and log metrics
    loss, accuracy_nn = model.evaluate(X_test_scaled, y_test, verbose=0)
    mlflow.log_metric("accuracy", accuracy_nn)
    mlflow.log_metric("loss", loss)
    
    # Log the model
    mlflow.tensorflow.log_model(model, "neural_network_model_v2")

    print(f"Neural Network (Feature Engineered) run logged with accuracy: {accuracy_nn:.4f}")

2025/09/19 23:57:07 INFO mlflow.tracking.fluent: Experiment with name 'Predictive Maintenance - Feature Engineered' does not exist. Creating a new experiment.



Random Forest (Feature Engineered) run logged with accuracy: 0.9661
Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9254 - loss: 0.1887 - val_accuracy: 0.9509 - val_loss: 0.1125
Epoch 2/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9473 - loss: 0.1208 - val_accuracy: 0.9576 - val_loss: 0.1028
Epoch 3/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9524 - loss: 0.1110 - val_accuracy: 0.9606 - val_loss: 0.0977
Epoch 4/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9530 - loss: 0.1062 - val_accuracy: 0.9579 - val_loss: 0.0978
Epoch 5/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9566 - loss: 0.1018 - val_accuracy: 0.9627 - val_loss: 0.0915
Epoch 6/10
[1m413/413[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9570 - loss: 0.1000 - val_accuracy: 0.9597 - val_loss: 0.0923
Epoch 7/10
[1m413/413[0m [32m━━━━━━━



Neural Network (Feature Engineered) run logged with accuracy: 0.9639
