In [1]:
import time
import pickle
import os
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sn
from matplotlib.colors import Colormap
import scipy.stats as stats
from numpy import interp
import scikitplot as skplt
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV, 
    cross_val_score, 
    cross_val_predict
)
from sklearn.metrics import (
    classification_report, 
    confusion_matrix,
    r2_score,
    mean_squared_error, 
    root_mean_squared_error,
    mean_absolute_error, 
    mean_absolute_percentage_error,
    accuracy_score,
    matthews_corrcoef,
    brier_score_loss,
    f1_score,
    roc_curve, 
    roc_auc_score
)
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.inspection import PartialDependenceDisplay
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.over_sampling import SMOTE

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from skopt import BayesSearchCV
from skopt.space import Real, Integer

### Import DataFrame from Prepping

In [3]:
df = pd.read_csv('../data/data_threshed50.csv', low_memory=False)
df = df.dropna()
print(F"Dataframe shape: {df.shape}")

Dataframe shape: (349256, 39)


### Converting Column Data Types

#### Object Columns to Categorical

In [6]:
object_columns_to_change_to_category = [        
    'eventDescription',  
    'ecuSoftwareVersion',          
    'ecuSerialNumber',         
    'ecuModel',           
    'ecuMake',                  
    'EquipmentID',                                 
    'LampStatus',
    'next_derate_timestamp',
    'time_until_derate'] 

for column in object_columns_to_change_to_category:
    df[column] = df[column].astype('category')

#### INT Columns to Categorical

In [8]:
int_columns_to_categorical = ['RecordID',
'ESS_Id',                    
'ecuSource',                    
'spn',                       
'fmi',  
'active',       
'MCTNumber',                  
'FaultId']
for column in int_columns_to_categorical:
    df[column] = df[column].astype('category')

#### Date Columns to DateTime

In [10]:
df['EventTimeStamp'] = pd.to_datetime(df['EventTimeStamp'])
df['LocationTimeStamp'] = pd.to_datetime(df['LocationTimeStamp'])

### Splitting Data Train/Test before and After 2019

In [12]:
test_date = '2019-01-01'
df_train = df.sort_values('EventTimeStamp').loc[df['EventTimeStamp'] < test_date]
df_test = df.sort_values('EventTimeStamp').loc[df['EventTimeStamp'] > test_date]

In [13]:
X_train = df_train.drop(['target',
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription',
            'ESS_Id',
            'ecuModel',
            'ecuMake',
            'ecuSource',
            'ecuSoftwareVersion',
            'ecuSerialNumber',
            'MCTNumber',
            'Latitude',
            'Longitude',
            'RecordID',
            'next_derate_timestamp',
            'time_until_derate',
            'FaultId',
            'EngineLoad',
            'TurboBoostPressure',
            'DistanceLtd',
             'LampStatus',
            'Unnamed: 0'], axis=1)

y_train = df_train['target'].values

X_test = df_test.drop(['target',
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription',
            'ESS_Id',
            'ecuModel',
            'ecuMake',
            'ecuSource',
            'ecuSoftwareVersion',
            'ecuSerialNumber',
            'MCTNumber',
            'Latitude',
            'Longitude',
            'RecordID',
            'next_derate_timestamp',
            'time_until_derate',
            'FaultId',
            'EngineLoad',
            'TurboBoostPressure',
            'DistanceLtd',
             'LampStatus',
             'Unnamed: 0'], axis=1)

y_test = df_test['target'].values

In [14]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 327884 entries, 17 to 945556
Data columns (total 17 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   spn                        327884 non-null  category
 1   fmi                        327884 non-null  category
 2   active                     327884 non-null  category
 3   activeTransitionCount      327884 non-null  int64   
 4   EquipmentID                327884 non-null  category
 5   BarometricPressure         327884 non-null  float64 
 6   CruiseControlActive        327884 non-null  bool    
 7   EngineCoolantTemperature   327884 non-null  float64 
 8   EngineOilPressure          327884 non-null  float64 
 9   EngineOilTemperature       327884 non-null  float64 
 10  EngineRpm                  327884 non-null  float64 
 11  FuelLtd                    327884 non-null  float64 
 12  FuelRate                   327884 non-null  float64 
 13  IgnStatus         

### Evaluating What Rows were Dropped in Train/Test Split

In [16]:
print(F"The amount of rows in the df {df.shape} The amount of rows in Training Df {X_train.shape} The amount of rows in Testing Df {X_test.shape}")

The amount of rows in the df (349256, 39) The amount of rows in Training Df (327884, 17) The amount of rows in Testing Df (21372, 17)


In [17]:
X_train.isna().sum()

spn                          0
fmi                          0
active                       0
activeTransitionCount        0
EquipmentID                  0
BarometricPressure           0
CruiseControlActive          0
EngineCoolantTemperature     0
EngineOilPressure            0
EngineOilTemperature         0
EngineRpm                    0
FuelLtd                      0
FuelRate                     0
IgnStatus                    0
IntakeManifoldTemperature    0
ParkingBrake                 0
Speed                        0
dtype: int64

In [18]:
# dropped_rows = df.merge(X_train, how="left", indicator=True).query('_merge == "left_only"').drop("_merge", axis=1)
# print(F"The amount of rows dropped for train/test split: {dropped_rows.shape}")

In [19]:
categorical_features = ['spn', 'fmi', 'EquipmentID'] 
bool_features = ['ParkingBrake', 'IgnStatus', 'active']
numeric_features = [x for x in X_train.columns if x not in categorical_features + bool_features] 
#pca = PCA(n_components=1, svd_solver="arpack")

numeric_pipe = Pipeline(
    steps=[
        ('scale', StandardScaler()),
        ('numeric_impute', SimpleImputer(strategy='most_frequent'))
    ]
)

categorical_pipe = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
        ('categorical_impute', SimpleImputer(strategy='most_frequent')),
    ]
)

bool_pipe = Pipeline(
    steps=[
        ('bool_impute', SimpleImputer(strategy='most_frequent'))
    ]
)

ct = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipe, numeric_features),
        ('categorical', categorical_pipe, categorical_features),
        ('bool', bool_pipe, bool_features)
    ]
)

pipe = Pipeline(
    steps=[
        ('transformer', ct),
        #('pca', pca)
    ]
)

pipe.fit(X_train, y_train)
X_train_transformed = pipe.transform(X_train)
X_test_transformed = pipe.transform(X_test)

### Creating Pickle File for Storing Pipe Object

In [21]:
filename = 'pipe_transformed.pkl'

pickle_list = [pipe, X_train_transformed, X_test_transformed]

with open(filename, 'wb') as file:
    pickle.dump(pickle_list, file)

In [22]:
with open(filename, 'rb') as file:
    pipe, X_train_transformed, X_test_transformed = pickle.load(file)

### Defining Model Evaluation Function

### Smoting To Solve Class Imbalance

In [25]:
smote = SMOTE()
X_trained_smoted, y_trained_smoted = smote.fit_resample(X_train_transformed, y_train)

### LGBMClassifier Model

In [27]:
LGBMClassiferModel = LGBMClassifier()
LGBMClassiferModel = LGBMClassiferModel.fit(X_trained_smoted, y_trained_smoted)
LGBMClassifer_y_pred = LGBMClassiferModel.predict(X_test_transformed)
print(confusion_matrix(y_test, LGBMClassifer_y_pred))
print(classification_report(y_test, LGBMClassifer_y_pred, zero_division=1))

[LightGBM] [Info] Number of positive: 324839, number of negative: 324839
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.491615 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 68379
[LightGBM] [Info] Number of data points in the train set: 649678, number of used features: 872
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[[20161   150]
 [ 1045    16]]
              precision    recall  f1-score   support

       False       0.95      0.99      0.97     20311
        True       0.10      0.02      0.03      1061

    accuracy                           0.94     21372
   macro avg       0.52      0.50      0.50     21372
weighted avg       0.91      0.94      0.92     21372



#### LGBM Hyperparameter Tuning

In [29]:
param_grid = param_ranges = {
    "max_depth": [3, 6, 9],  # Controls tree complexity
    "learning_rate": [0.01, 0.1, 0.3],  # Step size for weight updates
    "subsample": [0.6, 0.8, 1.0],  # Fraction of samples used per boosting iteration
    "colsample_bytree": [0.6, 0.8, 1.0],  # Fraction of features used per tree
    "lambda": [0.1, 1, 10],  # L2 regularization
    "alpha": [0.1, 0.5, 1],  # L1 regularization
    "n_estimators": [100, 500, 1000]  # Number of boosting rounds
}
bayes_search = BayesSearchCV(estimator=LGBMClassiferModel, search_spaces=param_grid, n_iter=25, cv=3, n_jobs=-1, verbose=2, scoring="f1")
bayes_search.fit(X_trained_smoted, y_trained_smoted)

best_params = bayes_search.best_params_
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best score: {bayes_search.best_score_}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

#### LGBM Model Tuned

In [31]:
LGBMClassifierTuned = LGBMClassifier(**best_params)
LGBMClassifierTuned.fit(X_trained_smoted, y_trained_smoted)
LGBMClassiferTuned_y_pred = LGBMClassifierTuned.predict(X_test_transformed)
print(confusion_matrix(y_test, LGBMClassiferTuned_y_pred))
print(classification_report(y_test, LGBMClassiferTuned_y_pred, zero_division=1))

[LightGBM] [Info] Number of positive: 324839, number of negative: 324839
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.673343 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 68379
[LightGBM] [Info] Number of data points in the train set: 649678, number of used features: 872
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[[20297    14]
 [ 1059     2]]
              precision    recall  f1-score   support

       False       0.95      1.00      0.97     20311
        True       0.12      0.00      0.00      1061

    accuracy                           0.95     21372
   macro avg       0.54      0.50      0.49     21372
weighted avg       0.91      0.95      0.93     21372



In [32]:
LGBMClassiferTuned_y_pred

array([False, False, False, ..., False, False, False])

### XGBClassifier Model

In [39]:
XGBClassiferModel = XGBClassifier()
XGBClassiferModel.fit(X_trained_smoted, y_trained_smoted)
XGBClassifer_y_pred = XGBClassiferModel.predict(X_test_transformed)
print(confusion_matrix(y_test, XGBClassifer_y_pred))
print(classification_report(y_test, XGBClassifer_y_pred, zero_division=0))

[[20091   220]
 [ 1035    26]]
              precision    recall  f1-score   support

       False       0.95      0.99      0.97     20311
        True       0.11      0.02      0.04      1061

    accuracy                           0.94     21372
   macro avg       0.53      0.51      0.50     21372
weighted avg       0.91      0.94      0.92     21372



In [55]:
XGBClassifer_y_pred

array([0, 1, 0, ..., 0, 0, 0])

#### XGB Hyperparameter Tuning

In [57]:
param_grid = param_ranges = {
    "max_depth": [3, 6, 9],  # Controls tree complexity
    "learning_rate": [0.01, 0.1, 0.3],  # Step size for weight updates
    "subsample": [0.6, 0.8, 1.0],  # Fraction of samples used per boosting iteration
    "colsample_bytree": [0.6, 0.8, 1.0],  # Fraction of features used per tree
    "min_child_weight": [1, 3, 5],  # Minimum sum of instance weight in child node
    "lambda": [0.1, 1, 10],  # L2 regularization
    "alpha": [0, 0.5, 1],  # L1 regularization
    "n_estimators": [100, 500, 1000]  # Number of boosting rounds
}
bayes_search = BayesSearchCV(estimator=XGBClassiferModel, search_spaces=param_grid, n_iter=25, cv=3, n_jobs=-1, verbose=2, scoring="f1")
bayes_search.fit(X_trained_smoted, y_trained_smoted)

best_params = bayes_search.best_params_
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best score: {bayes_search.best_score_}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

#### XGB Model Tuned 

In [59]:
XGBClassifierTuned = XGBClassifier(**best_params)
XGBClassifierTuned.fit(X_trained_smoted, y_trained_smoted)
XGBClassifierTuned_y_pred = XGBClassifierTuned.predict(X_test_transformed)
print(confusion_matrix(y_test, XGBClassifierTuned_y_pred))
print(classification_report(y_test, XGBClassifierTuned_y_pred, zero_division=1))

[[20221    90]
 [ 1052     9]]
              precision    recall  f1-score   support

       False       0.95      1.00      0.97     20311
        True       0.09      0.01      0.02      1061

    accuracy                           0.95     21372
   macro avg       0.52      0.50      0.49     21372
weighted avg       0.91      0.95      0.93     21372



### Decision Tree Model Classifier

In [61]:
DecisionTreeClassifierModel = DecisionTreeClassifier()
DecisionTreeClassifierModel.fit(X_trained_smoted, y_trained_smoted)
DecisionTreeClassifierModel_y_pred = DecisionTreeClassifierModel.predict(X_test_transformed)
print(confusion_matrix(y_test, DecisionTreeClassifierModel_y_pred))
print(classification_report(y_test, DecisionTreeClassifierModel_y_pred, zero_division=0))

[[19148  1163]
 [  949   112]]
              precision    recall  f1-score   support

       False       0.95      0.94      0.95     20311
        True       0.09      0.11      0.10      1061

    accuracy                           0.90     21372
   macro avg       0.52      0.52      0.52     21372
weighted avg       0.91      0.90      0.91     21372



#### Decision Tree Hyperparameter Tuning

In [63]:
param_grid = {
    "max_depth": [3, 6, 9],  
    "min_samples_split": [2, 5, 10],  
    "min_samples_leaf": [1, 3, 5],  
    "max_features": ["sqrt", "log2"],  # Removed "auto"
    "criterion": ["gini", "entropy"],  
    "max_leaf_nodes": [None, 10, 50]
}
bayes_search = BayesSearchCV(estimator=DecisionTreeClassifierModel, search_spaces=param_grid, n_iter=25, cv=3, n_jobs=-1, verbose=2, scoring="f1")
bayes_search.fit(X_trained_smoted, y_trained_smoted)

best_params = bayes_search.best_params_
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best score: {bayes_search.best_score_}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

#### DecisionTree Model Tuned

In [65]:
DecisionTreeClassifierTuned = DecisionTreeClassifier(**best_params)
DecisionTreeClassifierTuned.fit(X_trained_smoted, y_trained_smoted)
DecisionTreeClassifierTuned_y_pred = DecisionTreeClassifierTuned.predict(X_test_transformed)
print(confusion_matrix(y_test, DecisionTreeClassifierTuned_y_pred))
print(classification_report(y_test, DecisionTreeClassifierTuned_y_pred, zero_division=1))

[[17922  2389]
 [  892   169]]
              precision    recall  f1-score   support

       False       0.95      0.88      0.92     20311
        True       0.07      0.16      0.09      1061

    accuracy                           0.85     21372
   macro avg       0.51      0.52      0.50     21372
weighted avg       0.91      0.85      0.88     21372



### Logistic Regression Model

In [67]:
LogisticRegressionModel = LogisticRegression(max_iter=500)
LogisticRegressionModel.fit(X_trained_smoted, y_trained_smoted)
LogisticRegressionModel_y_pred = LogisticRegressionModel.predict(X_test_transformed)
print(confusion_matrix(y_test, LogisticRegressionModel_y_pred))
print(classification_report(y_test, LogisticRegressionModel_y_pred, zero_division=0))

[[11834  8477]
 [  670   391]]
              precision    recall  f1-score   support

       False       0.95      0.58      0.72     20311
        True       0.04      0.37      0.08      1061

    accuracy                           0.57     21372
   macro avg       0.50      0.48      0.40     21372
weighted avg       0.90      0.57      0.69     21372



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Logisitic Regression Hyperparameter Tuning

In [69]:
param_grid = {
    "penalty": ["l1", "l2"],  # Type of regularization
    "C": [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength (inverse of λ)
    "solver": ["liblinear"],  # Optimizer for l1/l2 penalties
    "max_iter": [100, 200, 500, 1000]  # Iterations for convergence
}

bayes_search = BayesSearchCV(estimator=LogisticRegressionModel, search_spaces=param_grid, n_iter=25, cv=3, n_jobs=-1, verbose=2, scoring="f1")
bayes_search.fit(X_trained_smoted, y_trained_smoted)

best_params = bayes_search.best_params_
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best score: {bayes_search.best_score_}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi



Fitting 3 folds for each of 1 candidates, totalling 3 fits




Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best parameters: OrderedDict({'C': 100, 'max_iter': 500, 'penalty': 'l2', 'solver': 'liblinear'})
Best score: 0.6522604651756612


In [70]:
LogisticRegressionModelTuned = LogisticRegression(**best_params)
LogisticRegressionModelTuned.fit(X_trained_smoted, y_trained_smoted)
LogisticRegressionModelTuned_y_pred = LogisticRegressionModelTuned.predict(X_test_transformed)
print(confusion_matrix(y_test, LogisticRegressionModelTuned_y_pred))
print(classification_report(y_test, LogisticRegressionModelTuned_y_pred, zero_division=1))

[[12318  7993]
 [  742   319]]
              precision    recall  f1-score   support

       False       0.94      0.61      0.74     20311
        True       0.04      0.30      0.07      1061

    accuracy                           0.59     21372
   macro avg       0.49      0.45      0.40     21372
weighted avg       0.90      0.59      0.70     21372



### KNN Classifier Model

In [72]:
KNeighborsClassifierModel = KNeighborsClassifier()
KNeighborsClassifierModel.fit(X_trained_smoted, y_trained_smoted)
KNeighborsClassifierModel_y_pred = KNeighborsClassifierModel.predict(X_test_transformed)
print(confusion_matrix(y_test, KNeighborsClassifierModel_y_pred))
print(classification_report(y_test, KNeighborsClassifierModel_y_pred, zero_division=0))

[[11834  8477]
 [  670   391]]
              precision    recall  f1-score   support

       False       0.95      0.58      0.72     20311
        True       0.04      0.37      0.08      1061

    accuracy                           0.57     21372
   macro avg       0.50      0.48      0.40     21372
weighted avg       0.90      0.57      0.69     21372



##### I've aborted the Hyperparameter tuning for KNN because it doesn't work.

#### KNN Hyperparameter Tuning

In [75]:
# param_grid = {
#     "n_neighbors": [3, 5, 7, 9, 11],  # Number of nearest neighbors
#     "weights": ["uniform", "distance"],  # How neighbors influence prediction
#     "metric": ["euclidean", "manhattan", "minkowski"],  # Distance calculation method
#     "p": [1, 2],  # Minkowski power parameter (1 = Manhattan, 2 = Euclidean)
# }

# bayes_search = BayesSearchCV(estimator=KNeighborsClassifierModel, search_spaces=param_grid, n_iter=25, cv=3, n_jobs=-1, verbose=2, scoring="f1")
# bayes_search.fit(X_trained_smoted, y_trained_smoted)

# best_params = bayes_search.best_params_
# print(f"Best parameters: {bayes_search.best_params_}")
# print(f"Best score: {bayes_search.best_score_}")

#### KNN Model tuned

In [77]:
# KNeighborsClassifierModelTuned = LogisticRegression(**best_params)
# KNeighborsClassifierModelTuned.fit(X_trained_smoted, y_trained_smoted)
# KNeighborsClassifierModelTuned_y_pred = KNeighborsClassifierModelTuned.predict(X_test_transformed)
# print(confusion_matrix(y_test, KNeighborsClassifierModelTuned_y_pred))
# print(classification_report(y_test, KNeighborsClassifierModelTuned_y_pred, zero_division=1))

### Model Evaluation 

In [96]:
def target_pred(df, event, equipment, target, pred):
    df = df[[event, equipment, target]].copy()
    df['predict'] = pred
    df['predict'] = df['predict'].replace({0: 'False', 1: 'True'})
    df['combined'] = df[target].astype(str) + '_' + df['predict'].astype(str)
    df[event] = pd.to_datetime(df[event])
    df = df.sort_values(by=[equipment, event])
    df['time_diff'] = df.groupby(equipment, observed=True)[event].diff().dt.total_seconds() / 3600
    df['valid_group'] = (df['time_diff'].isna()) | (df['time_diff'] <= 2)
    df['temp_group'] = (~df['valid_group']).cumsum()
    result = df.groupby([equipment, 'temp_group'], observed=True)['combined'].value_counts().reset_index()
    result = df.groupby([equipment, 'temp_group'], observed=True)['combined'].value_counts().unstack(fill_value=0).drop_duplicates()
    result = result.rename(columns = {'False_False': 'true negative', 'False_True': 'false positive', 'True_False': 'false negative', 'True_True': 'true positive'})
    
    for index, row in result.iterrows():
        if row['true positive'] >= 1:
            result.loc[index, 'true positive'] = 1
            result.loc[index, ~result.columns.isin(['true positive'])] = 0
        elif row['true positive'] == 0 and row['false positive'] >= 1:
            result.loc[index, 'false positive'] = 1
            result.loc[index, ~result.columns.isin(['false positive', 'true positive'])] = 0
        elif row['true positive'] == 0 and row['false positive'] == 0 and row['false negative'] >= 1:
            result.loc[index, 'false negative'] = 1
            result.loc[index, ~result.columns.isin(['false negative', 'false positive', 'true positive'])] = 0
        else:
            result.loc[index, 'true negative'] = 1
    counts = (result.iloc[:,0].sum() * 0) - (result.iloc[:,1].sum() * 500) + (result.iloc[:,3].sum() * 4000)
    return counts

In [100]:
print(F"The LGBM Model saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', LGBMClassifer_y_pred)}")
print(F"The LGBM Model tuned saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', LGBMClassiferTuned_y_pred)}")
print(F"The XGBoost Model  saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', XGBClassifer_y_pred)}")
print(F"The XGBoost Model tuned saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', XGBClassifierTuned_y_pred)}")
print(F"The Decision Tree Model saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', DecisionTreeClassifierModel_y_pred)}")
print(F"The Decision Tree Model Tuned saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', DecisionTreeClassifierTuned_y_pred)}")
print(F"The Logistic Regression Model saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', LogisticRegressionModel_y_pred)}")
print(F"The Logistic Regression Model Tuned saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', LogisticRegressionModelTuned_y_pred)}")
print(F"The KNN Model saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', KNeighborsClassifierModel_y_pred)}")

The LGBM Model saved: 13000
The LGBM Model tuned saved: 4500
The XGBoost Model  saved: 23500
The XGBoost Model tuned saved: 7000
The Decision Tree Model saved: 42000
The Decision Tree Model Tuned saved: 52500
The Logistic Regression Model saved: 128500
The Logistic Regression Model Tuned saved: 97500
The KNN Model saved: 128500
