In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
import joblib
import warnings
warnings.filterwarnings("ignore");

# Load dataset
df = pd.read_csv("uci-secom.csv")

# Identify and drop timestamp columns
date_columns = [col for col in df.columns if "time" in col.lower()]
df.drop(columns=date_columns, inplace=True, errors='ignore')

# Data Cleaning: Remove columns with >80% missing values
missing_threshold = 0.8
df_cleaned = df.dropna(thresh=int((1 - missing_threshold) * df.shape[0]), axis=1)

# Impute remaining missing values with median
df_cleaned.fillna(df_cleaned.median(), inplace=True)

# Define target variable
df_cleaned.rename(columns={df_cleaned.columns[-1]: "Pass/Fail"}, inplace=True)

# Split features and target
X = df_cleaned.drop(columns=["Pass/Fail"])
y = df_cleaned["Pass/Fail"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Standardize data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

# Define models
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
    "NaiveBayes": GaussianNB()
}
param_grids = {
    "RandomForest": {"n_estimators": [50, 100, 200], "max_depth": [5, 10, 20]},
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
    "NaiveBayes": {}  # No hyperparameters for Naïve Bayes
}

# Train and evaluate models
best_models = {}
classification_reports = {}

for name, model in models.items():
    print(f"Training {name} with SMOTE applied...")
    model.fit(X_train_scaled, y_train_smote)
    
    y_pred = model.predict(X_test_scaled)
    
    best_models[name] = model
    classification_reports[name] = classification_report(y_test, y_pred)
    
    print(f"{name} Training Complete.\n")

# Display classification reports
for name, report in classification_reports.items():
    print(f"{name} Classification Report (After SMOTE):\n{report}\n")

# Save best model
joblib.dump(best_models["SVM"], "best_model_smote.pkl")


Training RandomForest with SMOTE applied...
RandomForest Training Complete.

Training SVM with SMOTE applied...
SVM Training Complete.

Training NaiveBayes with SMOTE applied...
NaiveBayes Training Complete.

RandomForest Classification Report (After SMOTE):
              precision    recall  f1-score   support

          -1       0.93      1.00      0.97       293
           1       0.00      0.00      0.00        21

    accuracy                           0.93       314
   macro avg       0.47      0.50      0.48       314
weighted avg       0.87      0.93      0.90       314


SVM Classification Report (After SMOTE):
              precision    recall  f1-score   support

          -1       0.94      1.00      0.97       293
           1       0.50      0.05      0.09        21

    accuracy                           0.93       314
   macro avg       0.72      0.52      0.53       314
weighted avg       0.91      0.93      0.91       314


NaiveBayes Classification Report (After SMOT

['best_model_smote.pkl']

# Model Performance Comparison (After SMOTE)

## Random Forest:
- **Pass (-1) Class:** Precision (0.93), recall (1.00), meaning it correctly identifies almost all "Pass" cases.
- **Fail (1) Class:** Precision (0.00), recall (0.00), meaning it completely fails to detect "Fail" cases.
- **Overall:** Accuracy (93%)—high accuracy but ignores the minority class entirely.

## SVM:
- **Pass (-1) Class:** Precision (0.94), recall (1.00), meaning strong classification for the majority class.
- **Fail (1) Class:** Precision (0.50), recall (0.05), meaning it identifies very few "Fail" cases.
- **Overall:** Accuracy (93%)—good accuracy but struggles with minority class detection.

## Naïve Bayes:
- **Pass (-1) Class:** Precision (0.95), recall (0.20), meaning it misclassifies most "Pass" cases.
- **Fail (1) Class:** Precision (0.07), recall (0.86), meaning it captures most "Fail" cases but with extremely poor precision.
- **Overall:** Accuracy (24%)—fails in overall classification despite better recall for "Fail" cases.

## Conclusion:
- **Best model:** SVM (slightly better at detecting "Fail" cases, though still weak).
- **Weakest model:** Naïve Bayes (very poor performance overall).
- **Random Forest:** High accuracy but completely fails at detecting "Fail" cases.

