In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore");

In [10]:
from sklearn.utils import resample

# Load dataset
df = pd.read_csv("uci-secom.csv")

# Identify and drop timestamp columns: any column whose name contains "time"
# (case-insensitive). `drop` returns a new frame, which is rebound to `df`
# instead of mutating the loaded frame in place.
date_columns = [col for col in df.columns if "time" in col.lower()]
df = df.drop(columns=date_columns, errors="ignore")

# Data Cleaning: Remove columns with >80% missing values.
# The missing fraction is compared directly against the threshold. The earlier
# form, dropna(thresh=int((1 - 0.8) * n_rows)), truncates a float product
# ((1 - 0.8) is 0.19999999999999996), which keeps columns that are slightly
# MORE than 80% missing.
missing_threshold = 0.8
missing_fraction = df.isna().mean()
df_cleaned = df.loc[:, missing_fraction <= missing_threshold].copy()

# Impute remaining missing values with each column's median. The result is
# assigned back rather than filled in place, so no chained-assignment warning
# can be triggered on a frame derived from `df`.
# NOTE(review): the medians are computed over all rows, before the train/test
# split performed in a later cell, so test rows influence the imputed values.
# Consider computing the medians on the training split only.
df_cleaned = df_cleaned.fillna(df_cleaned.median(numeric_only=True))

In [11]:
# Target column: give the last column of the cleaned frame the name "Pass/Fail"
last_column = df_cleaned.columns[-1]
df_cleaned.rename(columns={last_column: "Pass/Fail"}, inplace=True)

# Feature matrix and label vector
X = df_cleaned.drop(columns=["Pass/Fail"])
y = df_cleaned["Pass/Fail"]

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Class imbalance: undersample the -1 rows of the training split (without
# replacement) down to the number of 1 rows. The test split is not resampled.
train_data = pd.concat([X_train, y_train], axis=1)
train_labels = train_data["Pass/Fail"]
majority_class = train_data[train_labels == -1]
minority_class = train_data[train_labels == 1]
majority_downsampled = resample(
    majority_class,
    replace=False,
    n_samples=len(minority_class),
    random_state=42,
)
balanced_train_data = pd.concat([majority_downsampled, minority_class])

# Split the balanced training frame back into features and target
X_train_balanced = balanced_train_data.drop(columns=["Pass/Fail"])
y_train_balanced = balanced_train_data["Pass/Fail"]

In [12]:
# Standardize features: the scaler is fitted on the balanced training split
# only, and that same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train_balanced)
X_train_scaled = scaler.transform(X_train_balanced)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Candidate estimators, keyed by the name used in the printed reports
models = {}
models["RandomForest"] = RandomForestClassifier(random_state=42)
models["SVM"] = SVC(probability=True, random_state=42)
models["NaiveBayes"] = GaussianNB()

# Hyperparameter grid per estimator; an empty grid means the estimator is
# fitted directly, without a grid search.
param_grids = {
    "RandomForest": dict(n_estimators=[50, 100, 200], max_depth=[5, 10, 20]),
    "SVM": dict(C=[0.1, 1, 10], kernel=["linear", "rbf"]),
    "NaiveBayes": dict(),  # nothing is tuned for Naive Bayes
}

In [14]:
# Fit every candidate on the balanced, scaled training data and score it on
# the scaled test split. Results are collected per model name.
best_models = {}
classification_reports = {}
for name, model in models.items():
    print(f"Training {name}...")
    grid = param_grids[name]
    if not grid:
        # Empty grid: fit the estimator as configured
        best_model = model.fit(X_train_scaled, y_train_balanced)
    else:
        # 5-fold grid search selected by accuracy; keep the refitted winner
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=grid,
            scoring="accuracy",
            cv=5,
            n_jobs=-1,
        )
        grid_search.fit(X_train_scaled, y_train_balanced)
        best_model = grid_search.best_estimator_

    best_models[name] = best_model
    y_pred = best_model.predict(X_test_scaled)
    classification_reports[name] = classification_report(y_test, y_pred)
    print(f"{name} Training Complete.\n")

Training RandomForest...
RandomForest Training Complete.

Training SVM...
SVM Training Complete.

Training NaiveBayes...
NaiveBayes Training Complete.



In [15]:
# Print each stored report under a heading carrying the model's name
for name in classification_reports:
    report = classification_reports[name]
    print(f"{name} Classification Report:\n{report}\n")

RandomForest Classification Report:
              precision    recall  f1-score   support

          -1       0.97      0.75      0.84       293
           1       0.16      0.67      0.26        21

    accuracy                           0.74       314
   macro avg       0.56      0.71      0.55       314
weighted avg       0.91      0.74      0.80       314


SVM Classification Report:
              precision    recall  f1-score   support

          -1       0.96      0.60      0.74       293
           1       0.11      0.67      0.18        21

    accuracy                           0.60       314
   macro avg       0.53      0.63      0.46       314
weighted avg       0.90      0.60      0.70       314


NaiveBayes Classification Report:
              precision    recall  f1-score   support

          -1       0.94      0.94      0.94       293
           1       0.21      0.24      0.22        21

    accuracy                           0.89       314
   macro avg       0.58      

In [16]:
# Save best model
import joblib

# The forest was fitted on StandardScaler output, so the fitted scaler is
# persisted alongside it; new data must go through the same scaler before
# being passed to the saved model.
# NOTE(review): the imputation medians and the list of retained columns are
# not persisted here; inference code has to reproduce them separately.
joblib.dump(scaler, "scaler.pkl")

# Kept as the last expression so the cell still displays ['best_model.pkl'].
joblib.dump(best_models["RandomForest"], "best_model.pkl")

['best_model.pkl']

# Model Performance Comparison

## Random Forest:
- **Pass (-1) Class:** Good precision (0.97), but recall is only 0.75.
- **Fail (1) Class:** Low precision (0.16), but decent recall (0.67).
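- **Overall:** The most balanced recall across the two classes (0.75 vs. 0.67), but the low "Fail" precision (0.16) means most predicted failures are actually "Pass" cases.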

## SVM:
- **Pass (-1) Class:** Precision (0.96), recall (0.60), meaning about 40% of "Pass" cases are misclassified as "Fail".
- **Fail (1) Class:** Precision (0.11), recall (0.67), meaning it captures many "Fail" cases but with high false positives.
- **Overall:** Poor performance compared to Random Forest.

## Naïve Bayes:
- **Pass (-1) Class:** Excellent precision & recall (0.94 each).
- **Fail (1) Class:** Precision (0.21), recall (0.24), meaning it poorly identifies "Fail" cases.
- **Overall:** High accuracy (89%), but very weak in detecting the minority class.

## Conclusion:
- **Best model:** Random Forest (better balance between classes).
- **Weakest model:** SVM (lowest accuracy, 0.60, and lowest "Fail" F1-score, 0.18).
- **Naïve Bayes:** Works well for the "Pass" class but misses most "Fail" cases (recall 0.24).