In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline

import os, sys

# Put the parent of the current working directory at the front of the import
# path (only if it is not there already) so imports can resolve against it.
working_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(working_dir, ".."))
root_on_path = project_root in sys.path
if not root_on_path:
    sys.path.insert(0, project_root)

# Read the processed FD001 table from disk and preview its first rows.
processed_path = "../data/processed/FD001_processed.parquet"
df = pd.read_parquet(processed_path)

df.head()


Unnamed: 0,engine_id,cycle,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12_roll_mean,sensor_13_roll_mean,sensor_14_roll_mean,sensor_15_roll_mean,sensor_16_roll_mean,sensor_17_roll_mean,sensor_18_roll_mean,sensor_19_roll_mean,sensor_20_roll_mean,sensor_21_roll_mean
0,1,1,0.45977,0.166667,0.0,0.0,0.183735,0.406802,0.309757,0.0,...,,,,,,,,,,
1,1,2,0.609195,0.25,0.0,0.0,0.283133,0.453019,0.352633,0.0,...,,,,,,,,,,
2,1,3,0.252874,0.75,0.0,0.0,0.343373,0.369523,0.370527,0.0,...,,,,,,,,,,
3,1,4,0.54023,0.5,0.0,0.0,0.343373,0.256159,0.331195,0.0,...,,,,,,,,,,
4,1,5,0.390805,0.333333,0.0,0.0,0.349398,0.257467,0.404625,0.0,...,0.816337,0.260204,0.172934,0.264082,0.0,0.236842,0.0,0.0,0.734884,0.769768


In [7]:
# Discard every row that still contains a NaN (for example the leading rows
# whose rolling-window features are undefined) and renumber the index from 0.
df_clean = (
    df
    .dropna()
    .reset_index(drop=True)
)

# Model inputs: every column whose name mentions a sensor or an operating
# setting, excluding anything whose name mentions the RUL target.
feature_cols = [
    col
    for col in df_clean.columns
    if "RUL" not in col and any(tag in col for tag in ("sensor", "op_setting"))
]

X = df_clean.loc[:, feature_cols]      # feature matrix
y = df_clean.loc[:, "RUL"]             # regression target
groups = df_clean.loc[:, "engine_id"]  # one group label per row, used for grouped CV


In [8]:
# Grouped K-fold splitter shared by the evaluation cells: rows that share a
# group label are never split between the train and test side of a fold.
N_CV_SPLITS = 5
gkf = GroupKFold(n_splits=N_CV_SPLITS)

def evaluate_model(model, X, y, groups, cv=None):
    """Cross-validate ``model`` and return its mean MAE and mean RMSE.

    The model is fitted on the training side of every fold and scored on the
    held-out side; the per-fold scores are then averaged.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        Estimator to evaluate. It is refitted *in place* on every fold, so
        when this function returns it holds the fit from the last fold.
    X : pandas.DataFrame
        Feature matrix. Rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target values aligned row-by-row with ``X`` (also indexed via ``.iloc``).
    groups : array-like
        Group label for each row, forwarded to the splitter's ``split`` call.
    cv : splitter, optional
        Any object whose ``split(X, y, groups)`` yields
        ``(train_idx, test_idx)`` pairs of positional indices. When omitted,
        the notebook-level ``gkf`` splitter is used, which is the behaviour
        this function had before the parameter existed.

    Returns
    -------
    tuple
        ``(mean_mae, mean_rmse)`` averaged over all folds.
    """
    # Resolve the splitter at call time so an explicit ``cv`` never touches
    # the global, while existing four-argument calls keep working unchanged.
    splitter = gkf if cv is None else cv

    fold_mae = []
    fold_rmse = []

    for train_idx, test_idx in splitter.split(X, y, groups):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)
        preds = model.predict(X_test)

        fold_mae.append(mean_absolute_error(y_test, preds))
        # RMSE is computed per fold and averaged afterwards (not pooled).
        fold_rmse.append(np.sqrt(mean_squared_error(y_test, preds)))

    return np.mean(fold_mae), np.mean(fold_rmse)


In [9]:
# Baseline: a single depth-limited regression tree with a fixed seed.
dt = DecisionTreeRegressor(random_state=0, max_depth=10)

# Grouped cross-validation; the returned pair is (mean MAE, mean RMSE).
dt_scores = evaluate_model(dt, X, y, groups)
dt_mae, dt_rmse = dt_scores
dt_scores


(32.29766551057162, 45.96675581479808)

In [10]:
# Random-forest settings gathered in one place so they are easy to read and tweak.
rf_params = dict(
    n_estimators=300,
    max_depth=20,
    min_samples_leaf=2,
    random_state=0,
    n_jobs=-1,
)
rf = RandomForestRegressor(**rf_params)

# Grouped cross-validation; the returned pair is (mean MAE, mean RMSE).
rf_scores = evaluate_model(rf, X, y, groups)
rf_mae, rf_rmse = rf_scores
rf_scores


(29.47109759814591, 42.0016173984494)

In [11]:
# The scaler sits inside the pipeline, so it is refitted on each fold's
# training rows whenever the pipeline itself is fitted.
svr_steps = [
    ("scaler", MinMaxScaler()),
    ("svr", SVR(kernel="rbf", C=10, gamma=0.01)),
]
svr_pipe = Pipeline(steps=svr_steps)

# Grouped cross-validation; the returned pair is (mean MAE, mean RMSE).
svr_scores = evaluate_model(svr_pipe, X, y, groups)
svr_mae, svr_rmse = svr_scores
svr_scores


(32.346692829423716, 44.52694425239331)

In [12]:
# One row per model: (display name, mean MAE, mean RMSE) from the cells above.
score_rows = [
    ("Decision Tree", dt_mae, dt_rmse),
    ("Random Forest", rf_mae, rf_rmse),
    ("SVR (RBF)", svr_mae, svr_rmse),
]
results = pd.DataFrame(score_rows, columns=["Model", "MAE", "RMSE"])

results


Unnamed: 0,Model,MAE,RMSE
0,Decision Tree,32.297666,45.966756
1,Random Forest,29.471098,42.001617
2,SVR (RBF),32.346693,44.526944
