# Modeling_clean.ipynb
### Clean & Professional Modeling Pipeline — FD001

## 1. Introduction
This notebook continues from EDA_clean and performs model training using a clean, validated feature set.

## 2. Load Cleaned Dataset

In [1]:
import pandas as pd
df = pd.read_csv("../data/FD001_clean.csv")
df.head()

Unnamed: 0,unit,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_17_rollmean_20,sensor_17_rollstd_20,sensor_18_rollmean_20,sensor_18_rollstd_20,sensor_19_rollmean_20,sensor_19_rollstd_20,sensor_20_rollmean_20,sensor_20_rollstd_20,sensor_21_rollmean_20,sensor_21_rollstd_20
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,,,,,,,,,,
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,,,,,,,,,,
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,,,,,,,,,,
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,,,,,,,,,,
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,,,,,,,,,,


## 3. Select Numeric Features & Drop Non-Model Columns

In [2]:
# Drop identifiers not useful for modeling
drop_cols = ["unit", "cycle"]
df = df.drop(columns=[c for c in drop_cols if c in df.columns])

# Ensure numeric
df = df.apply(pd.to_numeric, errors="coerce").fillna(0)

df.head()

Unnamed: 0,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,...,sensor_17_rollmean_20,sensor_17_rollstd_20,sensor_18_rollmean_20,sensor_18_rollstd_20,sensor_19_rollmean_20,sensor_19_rollstd_20,sensor_20_rollmean_20,sensor_20_rollstd_20,sensor_21_rollmean_20,sensor_21_rollstd_20
0,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4. Train/Validation Split

In [3]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=["RUL"])
y = df["RUL"]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_val.shape

((16504, 108), (4127, 108))

## 5. Scaling

In [4]:
from sklearn.preprocessing import MinMaxScaler
import json
import pickle

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Save feature names
with open("../models/feature_names.json","w") as f:
    json.dump(list(X.columns), f)

# Save scaler
pickle.dump(scaler, open("../models/minmax_scaler.pkl","wb"))

X_train_scaled[:2]

array([[0.55172414, 0.25      , 0.        , 0.        , 0.38109756,
        0.45089998, 0.40462525, 0.        , 1.        , 0.49275362,
        0.38461538, 0.14340842, 0.        , 0.34131737, 0.60554371,
        0.33823529, 0.1712251 , 0.40977299, 0.        , 0.5       ,
        0.        , 0.        , 0.56589147, 0.61190677, 1.        ,
        0.        , 0.99797832, 0.49973957, 0.99015466, 0.36781438,
        0.98391055, 0.78536831, 1.        , 0.        , 1.        ,
        0.        , 0.99632908, 0.3830087 , 0.99992045, 0.19489513,
        0.98133815, 0.22744242, 1.        , 0.        , 0.98284791,
        0.23344458, 0.99695967, 0.32665402, 0.99990872, 0.17774135,
        0.98185035, 0.33164439, 0.98789102, 0.58187847, 1.        ,
        0.        , 0.98943662, 0.47343208, 1.        , 0.        ,
        1.        , 0.        , 0.9903537 , 0.38236914, 0.99166722,
        0.4821912 , 1.        , 0.        , 0.99858086, 0.56750251,
        0.99124965, 0.46122607, 0.98648431, 0.59

## 6. Baseline Models

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

lr = LinearRegression().fit(X_train_scaled, y_train)
rf = RandomForestRegressor(n_estimators=200).fit(X_train_scaled, y_train)
xgbr = xgb.XGBRegressor(n_estimators=300).fit(X_train_scaled, y_train)


## 7. Metrics Function

In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

def evaluate(model, Xv, yv):
    preds = model.predict(Xv)
    return {
        "MAE": mean_absolute_error(yv, preds),
        "RMSE": mean_squared_error(yv, preds, squared=False)
    }

evaluate(lr, X_val_scaled, y_val)

TypeError: got an unexpected keyword argument 'squared'

## 8. Neural Network Model

In [None]:
import tensorflow as tf
from tensorflow import keras

nn = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(1)
])

nn.compile(optimizer='adam', loss='mse')
history = nn.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=20, batch_size=32
)

nn.save("../models/neural_network_rul.keras")

## 9. Ensemble Model

In [None]:
nn_preds = nn.predict(X_val_scaled).flatten()
xgb_preds = xgbr.predict(X_val_scaled)

ensemble_preds = (nn_preds + xgb_preds) / 2

print({
    "MAE": mean_absolute_error(y_val, ensemble_preds),
    "RMSE": mean_squared_error(y_val, ensemble_preds, squared=False)
})

## 10. Save Models

In [None]:
import pickle
pickle.dump(xgbr, open("../models/xgb_model.pkl","wb"))


## 11. Summary
This version fixes non-numeric columns, ensures clean scaling, and provides stable model training.