In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.utils import resample
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
import joblib
import warnings
warnings.filterwarnings("ignore");

In [10]:
# Load dataset
df = pd.read_csv("uci-secom.csv")

# Identify and drop timestamp columns
date_columns = [col for col in df.columns if "time" in col.lower()]
df.drop(columns=date_columns, inplace=True, errors='ignore')

# Data Cleaning: Remove columns with >80% missing values
missing_threshold = 0.8
df_cleaned = df.dropna(thresh=int((1 - missing_threshold) * df.shape[0]), axis=1)

# Impute remaining missing values with median
df_cleaned.fillna(df_cleaned.median(), inplace=True)

# Define target variable
df_cleaned.rename(columns={df_cleaned.columns[-1]: "Pass/Fail"}, inplace=True)

# Split features and target
X = df_cleaned.drop(columns=["Pass/Fail"])
y = df_cleaned["Pass/Fail"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Handle class imbalance with undersampling
train_data = pd.concat([X_train, y_train], axis=1)
majority_class = train_data[train_data["Pass/Fail"] == -1]
minority_class = train_data[train_data["Pass/Fail"] == 1]
majority_downsampled = resample(majority_class, replace=False, n_samples=len(minority_class), random_state=42)
balanced_train_data = pd.concat([majority_downsampled, minority_class])

# Separate features and target after balancing
X_train_balanced = balanced_train_data.drop(columns=["Pass/Fail"])
y_train_balanced = balanced_train_data["Pass/Fail"]

# Standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

In [30]:
# Apply PCA for dimensionality reduction before model training
pca = PCA(n_components=150)  # Keep 150 principal components
X_train_pca = pca.fit_transform(X_train_balanced)
X_test_pca = pca.transform(X_test)

# Standardization after PCA
X_train_pca_scaled = scaler.fit_transform(X_train_pca)
X_test_pca_scaled = scaler.transform(X_test_pca)

# Define models
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "NaiveBayes": GaussianNB()
}
param_grids = {
    "RandomForest": {"n_estimators": [50, 100, 200], "max_depth": [5, 10, 20]},
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
    "NaiveBayes": {}  # No hyperparameters for Naïve Bayes
}

# Train and evaluate models
best_models_pca = {}
classification_reports_pca = {}

for name, model in models.items():
    print(f"Training {name} on PCA data...")
    model.fit(X_train_pca_scaled, y_train_balanced)
    
    y_pred_pca = model.predict(X_test_pca_scaled)
    
    best_models_pca[name] = model
    classification_reports_pca[name] = classification_report(y_test, y_pred_pca)

    print(f"{name} Training on PCA Complete.\n")

# Display classification reports
for name, report in classification_reports_pca.items():
    print(f"{name} Classification Report (After PCA):\n{report}\n")

# Save the best model
joblib.dump(best_models_pca["RandomForest"], "best_model_pca.pkl")

Training RandomForest on PCA data...
RandomForest Training on PCA Complete.

Training SVM on PCA data...
SVM Training on PCA Complete.

Training NaiveBayes on PCA data...
NaiveBayes Training on PCA Complete.

RandomForest Classification Report (After PCA):
              precision    recall  f1-score   support

          -1       0.95      0.78      0.86       293
           1       0.12      0.43      0.19        21

    accuracy                           0.76       314
   macro avg       0.54      0.61      0.53       314
weighted avg       0.90      0.76      0.82       314


SVM Classification Report (After PCA):
              precision    recall  f1-score   support

          -1       1.00      0.00      0.01       293
           1       0.07      1.00      0.13        21

    accuracy                           0.07       314
   macro avg       0.53      0.50      0.07       314
weighted avg       0.94      0.07      0.01       314


NaiveBayes Classification Report (After PCA):
  

['best_model_pca.pkl']

# Model Performance Comparison (After PCA with 150 Components)

## Random Forest:
- **Pass (-1) Class:** Precision (0.95), recall (0.78), indicating strong precision but some missed "Pass" cases.
- **Fail (1) Class:** Precision (0.12), recall (0.43), meaning it identifies some "Fail" cases but with high false positives.
- **Overall:** Accuracy (76%)—a more balanced model but still favors the majority class.

## SVM:
- **Pass (-1) Class:** Precision (1.00), recall (0.00), meaning it predicts all cases as "Pass" but fails to recall any.
- **Fail (1) Class:** Precision (0.07), recall (1.00), meaning it captures all "Fail" cases but at the cost of extreme false positives.
- **Overall:** Accuracy (7%)—completely ineffective for classification.

## Naïve Bayes:
- **Pass (-1) Class:** Precision (0.94), recall (0.95), showing strong classification for "Pass" cases.
- **Fail (1) Class:** Precision (0.17), recall (0.14), meaning it struggles to identify "Fail" cases.
- **Overall:** Accuracy (89%)—performs well overall but poorly for the minority class.

## Conclusion:
- **Best model:** Random Forest (better recall for "Pass" class and reasonable balance).
- **Weakest model:** SVM (fails to classify properly, extreme bias).
- **Naïve Bayes:** Effective for "Pass" cases but weak in identifying "Fail" cases.
