In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from mpl_toolkits.mplot3d import Axes3D
import joblib
import warnings
warnings.filterwarnings("ignore");

In [2]:
# Load dataset
df = pd.read_csv("uci-secom.csv")

# Identify and drop timestamp columns
date_columns = [col for col in df.columns if "time" in col.lower()]
df.drop(columns=date_columns, inplace=True, errors='ignore')

# Data Cleaning: Remove columns with >80% missing values
missing_threshold = 0.8
df_cleaned = df.dropna(thresh=int((1 - missing_threshold) * df.shape[0]), axis=1)

# Impute remaining missing values with median
df_cleaned.fillna(df_cleaned.median(), inplace=True)

# Define target variable
df_cleaned.rename(columns={df_cleaned.columns[-1]: "Pass/Fail"}, inplace=True)

# Split features and target
X = df_cleaned.drop(columns=["Pass/Fail"])
y = df_cleaned["Pass/Fail"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE for balancing
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Apply PCA for dimensionality reduction before model training
pca = PCA(n_components=3)  # Keep 3 principal components
X_train_pca = pca.fit_transform(X_train_balanced)
X_test_pca = pca.transform(X_test)

# Standardization after PCA
X_train_pca_scaled = scaler.fit_transform(X_train_pca)
X_test_pca_scaled = scaler.transform(X_test_pca)

# Define models
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "NaiveBayes": GaussianNB()
}
param_grids = {
    "RandomForest": {"n_estimators": [50, 100, 200], "max_depth": [5, 10, 20]},
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
    "NaiveBayes": {}  # No hyperparameters for Naïve Bayes
}

# Train and evaluate models
best_models_pca = {}
classification_reports_pca = {}

for name, model in models.items():
    print(f"Training {name} on PCA data...")
    model.fit(X_train_pca_scaled, y_train_balanced)
    
    y_pred_pca = model.predict(X_test_pca_scaled)
    
    best_models_pca[name] = model
    classification_reports_pca[name] = classification_report(y_test, y_pred_pca)

    print(f"{name} Training on PCA Complete.\n")

# Display classification reports
for name, report in classification_reports_pca.items():
    print(f"{name} Classification Report (After PCA):\n{report}\n")

# Save the best model
joblib.dump(best_models_pca["RandomForest"], "best_model_SMOTE_pca.pkl")

Training RandomForest on PCA data...
RandomForest Training on PCA Complete.

Training SVM on PCA data...
SVM Training on PCA Complete.

Training NaiveBayes on PCA data...
NaiveBayes Training on PCA Complete.

RandomForest Classification Report (After PCA):
              precision    recall  f1-score   support

          -1       0.94      0.83      0.88       293
           1       0.09      0.24      0.13        21

    accuracy                           0.79       314
   macro avg       0.51      0.53      0.50       314
weighted avg       0.88      0.79      0.83       314


SVM Classification Report (After PCA):
              precision    recall  f1-score   support

          -1       0.96      0.51      0.67       293
           1       0.09      0.67      0.16        21

    accuracy                           0.52       314
   macro avg       0.52      0.59      0.41       314
weighted avg       0.90      0.52      0.63       314


NaiveBayes Classification Report (After PCA):
  

['best_model_SMOTE_pca.pkl']

# Model Performance Comparison (After PCA with 3 Components & SMOTE)

## Random Forest:
- **Pass (-1) Class:** Precision (0.94), recall (0.83), meaning it performs well but misses some "Pass" cases.
- **Fail (1) Class:** Precision (0.09), recall (0.24), meaning it struggles to correctly classify "Fail" cases.
- **Overall:** Accuracy (79%)—performs reasonably well but still biased toward the majority class.

## SVM:
- **Pass (-1) Class:** Precision (0.96), recall (0.51), meaning it predicts "Pass" cases well but misses nearly half.
- **Fail (1) Class:** Precision (0.09), recall (0.67), meaning it captures more "Fail" cases but with high false positives.
- **Overall:** Accuracy (52%)—better at capturing "Fail" cases but loses overall reliability.

## Naïve Bayes:
- **Pass (-1) Class:** Precision (0.97), recall (0.29), meaning it struggles to recall most "Pass" cases.
- **Fail (1) Class:** Precision (0.08), recall (0.86), meaning it detects many "Fail" cases but with extremely poor precision.
- **Overall:** Accuracy (33%)—performs poorly overall despite high recall for "Fail" cases.

## Conclusion:
- **Best model:** Random Forest (maintains reasonable balance but still struggles with "Fail" cases).
- **Weakest model:** Naïve Bayes (low overall accuracy and poor balance).
- **SVM:** Improved recall for "Fail" cases but loses too much accuracy to be reliable.
