In [1]:
import json
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the RUL dataset. The default is the path this notebook was originally run
# with; set the RUL_DATA_PATH environment variable to point at a different copy without
# editing the notebook.
DATA_PATH = os.environ.get("RUL_DATA_PATH", "C:/Users/navya/Downloads/rul_hrs.csv")
df = pd.read_csv(DATA_PATH)

In [3]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV — confirm).
# errors='ignore' makes the cell idempotent: re-running it once the column is already
# gone is a no-op instead of raising KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Parse the 'timestamp' column into pandas datetime values.
df = df.assign(timestamp=pd.to_datetime(df['timestamp']))

In [5]:
# Step 1: keep only the numeric columns
numeric_df = df.select_dtypes(include='number')

# Step 2: train-test split FIRST, so every data-dependent step below (feature
# selection and scaling) is fitted on training rows only. Doing those steps on the
# full frame leaks information about the test rows into the model.
# The split depends only on the number of rows and random_state, so the same rows
# land in train/test as when splitting the already-scaled matrix.
train_df, test_df = train_test_split(numeric_df, test_size=0.3, random_state=42)

# Step 3: correlation-based feature selection, computed on the training rows.
# Keep columns whose absolute Pearson correlation with 'rul' exceeds 0.1;
# the target itself is excluded from the feature list.
corr = train_df.corr()['rul']
top_features = [col for col in corr[corr.abs() > 0.1].index if col != 'rul']

X = numeric_df[top_features]
y = numeric_df['rul']
y_train, y_test = train_df['rul'], test_df['rul']

# Step 4: standardization — fit the scaler on the training features only, then
# apply the same transform to the test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df[top_features])
X_test = scaler.transform(test_df[top_features])

# Kept so any cell that still refers to the fully scaled matrix keeps working.
X_scaled = scaler.transform(X)

In [6]:
# Step 5: exhaustive grid search over random-forest hyper-parameters.
# 2 * 3 * 2 * 2 * 2 = 48 candidates, each scored with 5-fold CV on negative MAE.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)

grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,  # use every available core
    verbose=1,
)

grid.fit(X_train, y_train)

# Step 6: score the refitted best estimator on the held-out split
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# MAPE over the rows whose true target is non-zero (a zero target would divide by zero)
non_zero_indices = y_test != 0
y_test_nz = y_test[non_zero_indices]
y_pred_nz = y_pred[non_zero_indices]
mape = np.mean(np.abs((y_test_nz - y_pred_nz) / y_test_nz)) * 100

# SMAPE: absolute error relative to |truth| + |prediction|; the 1e-8 term keeps the
# denominator non-zero when both are 0
smape_num = 2 * np.abs(y_pred - y_test)
smape_den = np.abs(y_test) + np.abs(y_pred) + 1e-8
smape = np.mean(smape_num / smape_den) * 100

print("Best Parameters:", grid.best_params_)
print(f"RandomForest → MAE: {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")
print(f"MAPE: {mape:.2f}%")
print(f"SMAPE: {smape:.2f}%")


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
RandomForest → MAE: 10.3135, RMSE: 27.7187, R²: 0.9849
MAPE: 78.98%
SMAPE: 9.23%


In [8]:
# Persist the fitted scaler next to the model: the forest was trained on standardized
# inputs, so the pickled model alone cannot score raw sensor readings correctly.
joblib.dump(scaler, 'scaler.pkl')

# Model dump stays the last expression so the cell output is still ['rf_model.pkl'].
joblib.dump(best_model, 'rf_model.pkl')

['rf_model.pkl']

In [8]:
# Show which columns passed the correlation filter, and how many there are.
n_selected = len(top_features)
print("Top features selected based on correlation:", top_features, sep="\n")
print("Total number of features used:", n_selected)

Top features selected based on correlation:
['sensor_01', 'sensor_02', 'sensor_05', 'sensor_06', 'sensor_07', 'sensor_09', 'sensor_11', 'sensor_13', 'sensor_17', 'sensor_18', 'sensor_24', 'sensor_29', 'sensor_37', 'sensor_39', 'sensor_41']
Total number of features used: 15


In [9]:
# Hard-coded list of the 15 sensor columns used as model features, built from
# their two-digit sensor numbers.
_SELECTED_SENSOR_IDS = (1, 2, 5, 6, 7, 9, 11, 13, 17, 18, 24, 29, 37, 39, 41)
top_features = [f"sensor_{sensor_id:02d}" for sensor_id in _SELECTED_SENSOR_IDS]

In [10]:
# Write the selected feature names to disk as a JSON array.
# `json` is imported in the notebook's top import cell rather than here, so all
# dependencies are declared in one place.
with open('selected_features.json', 'w') as f:
    json.dump(top_features, f)


In [5]:
# Step 1: keep only numeric features
numeric_df = df.select_dtypes(include='number')

# Step 2: separate features and target — every numeric column except 'rul' is a feature
X = numeric_df.drop(columns=['rul'])
y = numeric_df['rul']

# Step 3: train-test split BEFORE scaling, so the scaler never sees test rows.
# Fitting the scaler on the full matrix leaks test-set means and variances into training.
# The split depends only on the number of rows and random_state, so the same rows
# land in train/test as when splitting the already-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Step 4: standardization — fit on the training features only, then apply the same
# transform to the test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any cell that still refers to the fully scaled matrix keeps working.
X_scaled = scaler.transform(X)

# Step 5: random-forest hyper-parameter search (48 candidates, 5-fold CV, negative MAE)
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)

grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

grid.fit(X_train, y_train)

# Step 6: evaluate the refitted best estimator on the held-out split
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Best Parameters:", grid.best_params_)
print(f"RandomForest (All Features) → MAE: {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
RandomForest (All Features) → MAE: 3.0485, RMSE: 9.4948, R²: 0.9982


In [6]:
# Step 1: move the training targets into log space.
# log1p(y) = log(1 + y), which stays finite when y == 0.
y_train_log = np.log1p(y_train)

# Step 2: same hyper-parameter grid as before, fitted against the log-space targets.
# Because the search is fitted on y_train_log, the MAE used for model selection is
# measured in log space.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)

grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train_log)

# Step 3: predict in log space, then undo the transform.
# expm1(x) = exp(x) - 1 is the inverse of log1p.
best_model = grid.best_estimator_
y_pred_log = best_model.predict(X_test)
y_pred = np.expm1(y_pred_log)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [7]:
# Headline regression metrics on the held-out split
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# WMAPE: total absolute error as a percentage of the total absolute target.
# One ratio of sums, so an individual zero target does not cause a division by zero.
abs_err = np.abs(y_test - y_pred)
wmape = abs_err.sum() / np.abs(y_test).sum() * 100

# SMAPE: per-row absolute error relative to |truth| + |prediction|;
# the 1e-8 term keeps the denominator non-zero when both are 0.
smape_den = np.abs(y_test) + np.abs(y_pred) + 1e-8
smape = np.mean(2 * abs_err / smape_den) * 100

print("Best Parameters:", grid.best_params_)
print(f"MAE: {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")
print(f"WMAPE: {wmape:.2f}%")
print(f"SMAPE: {smape:.2f}%")


Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
MAE: 13.8896, RMSE: 40.9715, R²: 0.9671
WMAPE: 4.84%
SMAPE: 6.10%
