## Introduction
In this notebook, we preprocess the data and train two models:
- **Random Forest**: A traditional machine learning model.
- **Neural Network**: A deep learning model.

The results are logged in MLFlow for tracking and comparison.


In [23]:
# 📌 Set up environment
import sys
import os

# Set working directory to project root
project_root = os.path.abspath(os.path.join(os.path.dirname("__file__"), ".."))
sys.path.append(project_root)
print(f"📌 Project root added to Python path: {project_root}")

📌 Project root added to Python path: f:\Portfolio Projects\fault_prediction_project


In [24]:
# Disable TensorFlow oneDNN optimizations to avoid floating-point rounding issues
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

In [25]:
# 📌 Import libraries
import mlflow
import mlflow.sklearn
import mlflow.keras
import joblib
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split


In [26]:
# ✅ Set MLflow tracking
mlflow.set_tracking_uri("file:F:/Portfolio Projects/fault_prediction_project/mlruns")
mlflow.set_experiment("NEV Fault Prediction")

<Experiment: artifact_location='file:///F:/Portfolio%20Projects/fault_prediction_project/mlruns/560277888509444402', creation_time=1738156492984, experiment_id='560277888509444402', last_update_time=1738156492984, lifecycle_stage='active', name='NEV Fault Prediction', tags={}>

In [27]:
# ✅ Load Data Processor
from src.data_processor import DataProcessor

# 📂 **Load Dataset**
data_file_path = "F:/Portfolio Projects/fault_prediction_project/data/Fault_nev_dataset.csv"
processor = DataProcessor(data_file_path)

print("\n📂 Loading Dataset...")
data = processor.load_data()
print(f"✅ Dataset Loaded: {data.shape}")
print(data.head())

Updated working directory: F:\Portfolio Projects\fault_prediction_project

📂 Loading Dataset...
✅ Dataset Loaded: (1000, 18)
   battery_voltage  battery_current  engine_temperature  motor_efficiency  \
0        12.623620        11.479653           83.085284         93.454060   
1        14.352143        23.966533           82.348940         95.933628   
2        13.695982        35.553104          115.312729         85.009358   
3        13.295975        30.627871           82.477310         92.497482   
4        11.968056        33.229640           83.597486         91.434920   

   tire_pressure  fuel_efficiency       speed  acceleration  driving_distance  \
0      35.719959        13.936355   77.790834      0.116398        360.133962   
1      38.054323        14.734357   20.686363      0.560318        343.641502   
2      37.601609        18.545474  104.687348      2.493737         47.877099   
3      31.538999        13.400044   73.573949      2.300305        461.286202   
4      

In [28]:
# 🛠 **Preprocess Dataset**
print("\n🛠️ Preprocessing Dataset...")
X, y = processor.preprocess(target_column="fault_type", categorical_columns=["road_condition"])
print(f"✅ Features Shape: {X.shape}, Target Shape: {y.shape}")


🛠️ Preprocessing Dataset...
✅ Features Shape: (1628, 17), Target Shape: (1628,)


In [29]:
# 📊 **Split Data**
print("\n📊 Splitting Data...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"✅ Training Set: {X_train.shape}, {y_train.shape}")
print(f"✅ Testing Set: {X_test.shape}, {y_test.shape}")



📊 Splitting Data...
✅ Training Set: (1302, 17), (1302,)
✅ Testing Set: (326, 17), (326,)


In [31]:
# ✅ **Initialize MLflow Experiment**
mlflow.set_experiment("NEV Fault Prediction")

# -----------------------------------
# 🌲 **Train Random Forest Model**
# -----------------------------------
print("\n🌲 Training Random Forest Model...")

with mlflow.start_run(run_name="Random Forest Classifier"):
    try:
        rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)

        # Predictions & Metrics
        y_pred_rf = rf_model.predict(X_test)
        accuracy_rf = accuracy_score(y_test, y_pred_rf)
        f1_rf = f1_score(y_test, y_pred_rf, average="weighted")

        print(f"🎯 Random Forest - Accuracy: {accuracy_rf:.2f}, F1 Score: {f1_rf:.2f}")

        # Log model & metrics to MLflow
        mlflow.log_param("Model", "Random Forest")
        mlflow.log_param("n_estimators", 100)
        mlflow.log_metric("Accuracy", accuracy_rf)
        mlflow.log_metric("F1 Score", f1_rf)
        mlflow.sklearn.log_model(rf_model, "random_forest_model")

        print("✅ Random Forest model logged to MLflow.")

    except Exception as e:
        print(f"❌ Error during Random Forest training: {e}")


🌲 Training Random Forest Model...
🎯 Random Forest - Accuracy: 0.66, F1 Score: 0.66




✅ Random Forest model logged to MLflow.


In [32]:
# -----------------------------------
# 🤖 **Train Neural Network Model**
# -----------------------------------
print("\n🤖 Training Neural Network Model...")

with mlflow.start_run(run_name="Deep Learning Model"):
    try:
        # ✅ Encode target labels
        label_encoder = LabelEncoder()
        y_train_encoded = label_encoder.fit_transform(y_train)
        y_test_encoded = label_encoder.transform(y_test)

        # ✅ One-hot encoding for categorical classification
        y_train_onehot = tf.keras.utils.to_categorical(y_train_encoded, num_classes=len(label_encoder.classes_))
        y_test_onehot = tf.keras.utils.to_categorical(y_test_encoded, num_classes=len(label_encoder.classes_))

        # ✅ Define Neural Network Model
        model = Sequential([
            Dense(128, activation='relu', input_dim=X_train.shape[1]),
            Dropout(0.3),
            Dense(64, activation='relu'),
            Dropout(0.3),
            Dense(32, activation='relu'),
            Dense(len(label_encoder.classes_), activation='softmax')
        ])
        model.compile(optimizer=Adam(learning_rate=0.001),
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

        # ✅ Train the Neural Network
        print("🚀 Training the Neural Network...")
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        model.fit(X_train, y_train_onehot,
                  validation_data=(X_test, y_test_onehot),
                  epochs=100, batch_size=32, callbacks=[early_stopping], verbose=1)

        print("✅ Neural Network training complete.")

        # ✅ Save model locally
        model_dir = "F:/Portfolio Projects/fault_prediction_project/models"
        os.makedirs(model_dir, exist_ok=True)
        model_path = os.path.join(model_dir, "deep_learning_model.keras")
        model.save(model_path)
        print(f"✅ Neural Network model saved at {model_path}")

        # ✅ Evaluate Model
        loss, accuracy_nn = model.evaluate(X_test, y_test_onehot, verbose=0)
        y_pred_nn = model.predict(X_test)
        y_pred_classes_nn = tf.argmax(y_pred_nn, axis=1)
        f1_nn = f1_score(y_test_encoded, y_pred_classes_nn, average="weighted")

        # ✅ Log model & metrics to MLflow
        mlflow.log_param("Model", "Deep Learning")
        mlflow.log_param("Learning Rate", 0.001)
        mlflow.log_param("Batch Size", 32)
        mlflow.log_metric("Accuracy", accuracy_nn)
        mlflow.log_metric("F1 Score", f1_nn)
        mlflow.keras.log_model(model, "deep_learning_model")

        print(f"🎯 Neural Network - Accuracy: {accuracy_nn:.2f}, F1 Score: {f1_nn:.2f}")

    except Exception as e:
        print(f"❌ Error during Neural Network training: {e}")


🤖 Training Neural Network Model...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


🚀 Training the Neural Network...
Epoch 1/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 23ms/step - accuracy: 0.2498 - loss: 1.4312 - val_accuracy: 0.2577 - val_loss: 1.3820
Epoch 2/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.2620 - loss: 1.3855 - val_accuracy: 0.3067 - val_loss: 1.3739
Epoch 3/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3144 - loss: 1.3617 - val_accuracy: 0.2945 - val_loss: 1.3650
Epoch 4/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3460 - loss: 1.3507 - val_accuracy: 0.3436 - val_loss: 1.3516
Epoch 5/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3663 - loss: 1.3276 - val_accuracy: 0.3497 - val_loss: 1.3439
Epoch 6/100
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3671 - loss: 1.3170 - val_accuracy: 0.3466 - val_loss: 1.3366




🎯 Neural Network - Accuracy: 0.61, F1 Score: 0.60


In [33]:
# -----------------------------------
# 🏆 **Select Best Model & Save**
# -----------------------------------
print("\n🏆 Comparing Models & Saving Best One...")

best_model = rf_model if accuracy_rf >= accuracy_nn else model
best_model_name = "best_rf_model.pkl" if accuracy_rf >= accuracy_nn else "best_nn_model.keras"
best_model_path = f"F:/Portfolio Projects/fault_prediction_project/models/{best_model_name}"

if isinstance(best_model, RandomForestClassifier):
    joblib.dump(best_model, best_model_path)
else:
    best_model.save(best_model_path)

print(f"🏆 ✅ Best model saved as {best_model_name} with accuracy: {max(accuracy_rf, accuracy_nn):.2f}")


🏆 Comparing Models & Saving Best One...
🏆 ✅ Best model saved as best_rf_model.pkl with accuracy: 0.66
