In [1]:
import time
import pickle
import os
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sn
from matplotlib.colors import Colormap
import scipy.stats as stats
from numpy import interp
import scikitplot as skplt
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV, 
    cross_val_score, 
    cross_val_predict
)
from sklearn.metrics import (
    classification_report, 
    confusion_matrix,
    r2_score,
    mean_squared_error, 
    root_mean_squared_error,
    mean_absolute_error, 
    mean_absolute_percentage_error,
    accuracy_score,
    matthews_corrcoef,
    brier_score_loss,
    f1_score,
    roc_curve, 
    roc_auc_score
)
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.inspection import PartialDependenceDisplay
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.over_sampling import SMOTE

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from skopt import BayesSearchCV
from skopt.space import Real, Integer

### Import DataFrame from Prepping

In [3]:
df = pd.read_csv('../data/data_threshed70.csv', low_memory=False)
df = df.dropna()
print(F"Dataframe shape: {df.shape}")

Dataframe shape: (176897, 44)


### Converting Column Data Types

#### Object Columns to Categorical

In [6]:
object_columns_to_change_to_category = [        
    'eventDescription',  
    'ecuSoftwareVersion',          
    'ecuSerialNumber',         
    'ecuModel',           
    'ecuMake',                  
    'EquipmentID',                                 
    'LampStatus',
    'next_derate_timestamp',
    'time_until_derate'] 

for column in object_columns_to_change_to_category:
    df[column] = df[column].astype('category')

#### INT Columns to Categorical

In [8]:
int_columns_to_categorical = ['RecordID',
'ESS_Id',                    
'ecuSource',                    
'spn',                       
'fmi',  
'active',       
'MCTNumber',                  
'FaultId']
for column in int_columns_to_categorical:
    df[column] = df[column].astype('category')

#### Date Columns to DateTime

In [10]:
df['EventTimeStamp'] = pd.to_datetime(df['EventTimeStamp'])
df['LocationTimeStamp'] = pd.to_datetime(df['LocationTimeStamp'])

### Splitting Data Train/Test before and After 2019

In [12]:
test_date = '2019-01-01'
df_train = df.sort_values('EventTimeStamp').loc[df['EventTimeStamp'] < test_date]
df_test = df.sort_values('EventTimeStamp').loc[df['EventTimeStamp'] > test_date]

In [13]:
X_train = df_train.drop(['target',
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription',
            'ESS_Id',
            'ecuModel',
            'ecuMake',
            'ecuSource',
            'ecuSoftwareVersion',
            'ecuSerialNumber',
            'MCTNumber',
            'Latitude',
            'Longitude',
            'RecordID',
            'next_derate_timestamp',
            'time_until_derate',
            'FaultId',
            'EngineLoad',
            'TurboBoostPressure',
            'CruiseControlSetSpeed',
            'DistanceLtd',
             'LampStatus',
            'Unnamed: 0'], axis=1)

y_train = df_train['target'].values

X_test = df_test.drop(['target',
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription',
            'ESS_Id',
            'ecuModel',
            'ecuMake',
            'ecuSource',
            'ecuSoftwareVersion',
            'ecuSerialNumber',
            'MCTNumber',
            'Latitude',
            'Longitude',
            'RecordID',
            'next_derate_timestamp',
            'time_until_derate',
            'FaultId',
            'EngineLoad',
            'TurboBoostPressure',
            'CruiseControlSetSpeed',
            'DistanceLtd',
             'LampStatus',
             'Unnamed: 0'], axis=1)

y_test = df_test['target'].values

In [14]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 155901 entries, 17 to 945556
Data columns (total 21 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   spn                        155901 non-null  category
 1   fmi                        155901 non-null  category
 2   active                     155901 non-null  category
 3   activeTransitionCount      155901 non-null  int64   
 4   EquipmentID                155901 non-null  category
 5   AcceleratorPedal           155901 non-null  float64 
 6   BarometricPressure         155901 non-null  float64 
 7   CruiseControlActive        155901 non-null  bool    
 8   EngineCoolantTemperature   155901 non-null  float64 
 9   EngineOilPressure          155901 non-null  float64 
 10  EngineOilTemperature       155901 non-null  float64 
 11  EngineRpm                  155901 non-null  float64 
 12  EngineTimeLtd              155901 non-null  float64 
 13  FuelLevel         

### Evaluating What Rows were Dropped in Train/Test Split

In [16]:
print(F"The amount of rows in the df {df.shape} The amount of rows in Training Df {X_train.shape} The amount of rows in Testing Df {X_test.shape}")

The amount of rows in the df (176897, 44) The amount of rows in Training Df (155901, 21) The amount of rows in Testing Df (20996, 21)


In [17]:
X_train.isna().sum()

spn                          0
fmi                          0
active                       0
activeTransitionCount        0
EquipmentID                  0
AcceleratorPedal             0
BarometricPressure           0
CruiseControlActive          0
EngineCoolantTemperature     0
EngineOilPressure            0
EngineOilTemperature         0
EngineRpm                    0
EngineTimeLtd                0
FuelLevel                    0
FuelLtd                      0
FuelRate                     0
IgnStatus                    0
IntakeManifoldTemperature    0
ParkingBrake                 0
Speed                        0
Throttle                     0
dtype: int64

In [18]:
# dropped_rows = df.merge(X_train, how="left", indicator=True).query('_merge == "left_only"').drop("_merge", axis=1)
# print(F"The amount of rows dropped for train/test split: {dropped_rows.shape}")

In [19]:
categorical_features = ['spn', 'fmi', 'EquipmentID'] 
bool_features = ['CruiseControlActive', 'ParkingBrake', 'IgnStatus', 'active']
numeric_features = [x for x in X_train.columns if x not in categorical_features + bool_features] 
#pca = PCA(n_components=1, svd_solver="arpack")

numeric_pipe = Pipeline(
    steps=[
        ('scale', StandardScaler()),
        ('numeric_impute', SimpleImputer(strategy='most_frequent'))
    ]
)

categorical_pipe = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
        ('categorical_impute', SimpleImputer(strategy='most_frequent')),
    ]
)

bool_pipe = Pipeline(
    steps=[
        ('bool_impute', SimpleImputer(strategy='most_frequent'))
    ]
)

ct = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipe, numeric_features),
        ('categorical', categorical_pipe, categorical_features),
        ('bool', bool_pipe, bool_features)
    ]
)

pipe = Pipeline(
    steps=[
        ('transformer', ct),
        #('pca', pca)
    ]
)

pipe.fit(X_train, y_train)
X_train_transformed = pipe.transform(X_train)
X_test_transformed = pipe.transform(X_test)

### Creating Pickle File for Storing Pipe Object

In [21]:
filename = 'pipe_transformed.pkl'

pickle_list = [pipe, X_train_transformed, X_test_transformed]

with open(filename, 'wb') as file:
    pickle.dump(pickle_list, file)

In [22]:
with open(filename, 'rb') as file:
    pipe, X_train_transformed, X_test_transformed = pickle.load(file)

### Defining Model Evaluation Function

### Smoting To Solve Class Imbalance

In [25]:
smote = SMOTE()
X_trained_smoted, y_trained_smoted = smote.fit_resample(X_train_transformed, y_train)

### Defining The Function to Evaluate Performance. 

### LGBMClassifier Model

In [27]:
LGBMClassiferModel = LGBMClassifier()
LGBMClassiferModel = LGBMClassiferModel.fit(X_trained_smoted, y_trained_smoted)
LGBMClassifer_y_pred = LGBMClassiferModel.predict(X_test_transformed)
print(confusion_matrix(y_test, LGBMClassifer_y_pred))
print(classification_report(y_test, LGBMClassifer_y_pred, zero_division=1))

[LightGBM] [Info] Number of positive: 154277, number of negative: 154277
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.135028 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59558
[LightGBM] [Info] Number of data points in the train set: 308554, number of used features: 556
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[[19811   145]
 [ 1022    18]]
              precision    recall  f1-score   support

       False       0.95      0.99      0.97     19956
        True       0.11      0.02      0.03      1040

    accuracy                           0.94     20996
   macro avg       0.53      0.51      0.50     20996
weighted avg       0.91      0.94      0.92     20996



#### LGBM Hyperparameter Tuning

In [29]:
param_grid = param_ranges = {
    "max_depth": [3, 6, 9],  # Controls tree complexity
    "learning_rate": [0.01, 0.1, 0.3],  # Step size for weight updates
    "subsample": [0.6, 0.8, 1.0],  # Fraction of samples used per boosting iteration
    "colsample_bytree": [0.6, 0.8, 1.0],  # Fraction of features used per tree
    "lambda": [0.1, 1, 10],  # L2 regularization
    "alpha": [0.1, 0.5, 1],  # L1 regularization
    "n_estimators": [100, 500, 1000]  # Number of boosting rounds
}
bayes_search = BayesSearchCV(estimator=LGBMClassiferModel, search_spaces=param_grid, n_iter=25, cv=3, n_jobs=-1, verbose=2, scoring="f1")
bayes_search.fit(X_trained_smoted, y_trained_smoted)

best_params = bayes_search.best_params_
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best score: {bayes_search.best_score_}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

#### LGBM Model Tuned

In [31]:
LGBMClassifierTuned = LGBMClassifier(**best_params)
LGBMClassifierTuned.fit(X_trained_smoted, y_trained_smoted)
LGBMClassiferTuned_y_pred = LGBMClassifierTuned.predict(X_test_transformed)
print(confusion_matrix(y_test, LGBMClassiferTuned_y_pred))
print(classification_report(y_test, LGBMClassiferTuned_y_pred, zero_division=1))

[LightGBM] [Info] Number of positive: 154277, number of negative: 154277
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.123148 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59558
[LightGBM] [Info] Number of data points in the train set: 308554, number of used features: 556
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[[19941    15]
 [ 1037     3]]
              precision    recall  f1-score   support

       False       0.95      1.00      0.97     19956
        True       0.17      0.00      0.01      1040

    accuracy                           0.95     20996
   macro avg       0.56      0.50      0.49     20996
weighted avg       0.91      0.95      0.93     20996



In [32]:
LGBMClassiferTuned_y_pred

array([False, False, False, ..., False, False, False])

### XGBClassifier Model

In [34]:
XGBClassiferModel = XGBClassifier()
XGBClassiferModel.fit(X_trained_smoted, y_trained_smoted)
XGBClassifer_y_pred = XGBClassiferModel.predict(X_test_transformed)
print(confusion_matrix(y_test, XGBClassifer_y_pred))
print(classification_report(y_test, XGBClassifer_y_pred, zero_division=0))

[[19833   123]
 [ 1035     5]]
              precision    recall  f1-score   support

       False       0.95      0.99      0.97     19956
        True       0.04      0.00      0.01      1040

    accuracy                           0.94     20996
   macro avg       0.49      0.50      0.49     20996
weighted avg       0.91      0.94      0.92     20996



#### XGB Hyperparameter Tuning

In [36]:
param_grid = param_ranges = {
    "max_depth": [3, 6, 9],  # Controls tree complexity
    "learning_rate": [0.01, 0.1, 0.3],  # Step size for weight updates
    "subsample": [0.6, 0.8, 1.0],  # Fraction of samples used per boosting iteration
    "colsample_bytree": [0.6, 0.8, 1.0],  # Fraction of features used per tree
    "min_child_weight": [1, 3, 5],  # Minimum sum of instance weight in child node
    "lambda": [0.1, 1, 10],  # L2 regularization
    "alpha": [0, 0.5, 1],  # L1 regularization
    "n_estimators": [100, 500, 1000]  # Number of boosting rounds
}
bayes_search = BayesSearchCV(estimator=XGBClassiferModel, search_spaces=param_grid, n_iter=25, cv=3, n_jobs=-1, verbose=2, scoring="f1")
bayes_search.fit(X_trained_smoted, y_trained_smoted)

best_params = bayes_search.best_params_
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best score: {bayes_search.best_score_}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

#### XGB Model Tuned 

In [38]:
XGBClassifierTuned = XGBClassifier(**best_params)
XGBClassifierTuned.fit(X_trained_smoted, y_trained_smoted)
XGBClassifierTuned_y_pred = XGBClassifierTuned.predict(X_test_transformed)
print(confusion_matrix(y_test, XGBClassifierTuned_y_pred))
print(classification_report(y_test, XGBClassifierTuned_y_pred, zero_division=1))

[[19936    20]
 [ 1036     4]]
              precision    recall  f1-score   support

       False       0.95      1.00      0.97     19956
        True       0.17      0.00      0.01      1040

    accuracy                           0.95     20996
   macro avg       0.56      0.50      0.49     20996
weighted avg       0.91      0.95      0.93     20996



### Decision Tree Model Classifier

In [40]:
DecisionTreeClassifierModel = DecisionTreeClassifier()
DecisionTreeClassifierModel.fit(X_trained_smoted, y_trained_smoted)
DecisionTreeClassifierModel_y_pred = DecisionTreeClassifierModel.predict(X_test_transformed)
print(confusion_matrix(y_test, DecisionTreeClassifierModel_y_pred))
print(classification_report(y_test, DecisionTreeClassifierModel_y_pred, zero_division=0))

[[19204   752]
 [  985    55]]
              precision    recall  f1-score   support

       False       0.95      0.96      0.96     19956
        True       0.07      0.05      0.06      1040

    accuracy                           0.92     20996
   macro avg       0.51      0.51      0.51     20996
weighted avg       0.91      0.92      0.91     20996



#### Decision Tree Hyperparameter Tuning

In [42]:
param_grid = {
    "max_depth": [3, 6, 9],  
    "min_samples_split": [2, 5, 10],  
    "min_samples_leaf": [1, 3, 5],  
    "max_features": ["sqrt", "log2"],  # Removed "auto"
    "criterion": ["gini", "entropy"],  
    "max_leaf_nodes": [None, 10, 50]
}
bayes_search = BayesSearchCV(estimator=DecisionTreeClassifierModel, search_spaces=param_grid, n_iter=25, cv=3, n_jobs=-1, verbose=2, scoring="f1")
bayes_search.fit(X_trained_smoted, y_trained_smoted)

best_params = bayes_search.best_params_
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best score: {bayes_search.best_score_}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi

#### DecisionTree Model Tuned

In [44]:
DecisionTreeClassifierTuned = DecisionTreeClassifier(**best_params)
DecisionTreeClassifierTuned.fit(X_trained_smoted, y_trained_smoted)
DecisionTreeClassifierTuned_y_pred = DecisionTreeClassifierTuned.predict(X_test_transformed)
print(confusion_matrix(y_test, DecisionTreeClassifierTuned_y_pred))
print(classification_report(y_test, DecisionTreeClassifierTuned_y_pred, zero_division=1))

[[19053   903]
 [ 1009    31]]
              precision    recall  f1-score   support

       False       0.95      0.95      0.95     19956
        True       0.03      0.03      0.03      1040

    accuracy                           0.91     20996
   macro avg       0.49      0.49      0.49     20996
weighted avg       0.90      0.91      0.91     20996



### Logistic Regression Model

In [46]:
LogisticRegressionModel = LogisticRegression(max_iter=500)
LogisticRegressionModel.fit(X_trained_smoted, y_trained_smoted)
LogisticRegressionModel_y_pred = LogisticRegressionModel.predict(X_test_transformed)
print(confusion_matrix(y_test, LogisticRegressionModel_y_pred))
print(classification_report(y_test, LogisticRegressionModel_y_pred, zero_division=0))

[[11588  8368]
 [  642   398]]
              precision    recall  f1-score   support

       False       0.95      0.58      0.72     19956
        True       0.05      0.38      0.08      1040

    accuracy                           0.57     20996
   macro avg       0.50      0.48      0.40     20996
weighted avg       0.90      0.57      0.69     20996



#### Logisitic Regression Hyperparameter Tuning

In [48]:
param_grid = {
    "penalty": ["l1", "l2"],  # Type of regularization
    "C": [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization strength (inverse of λ)
    "solver": ["liblinear"],  # Optimizer for l1/l2 penalties
    "max_iter": [100, 200, 500, 1000]  # Iterations for convergence
}

bayes_search = BayesSearchCV(estimator=LogisticRegressionModel, search_spaces=param_grid, n_iter=25, cv=3, n_jobs=-1, verbose=2, scoring="f1")
bayes_search.fit(X_trained_smoted, y_trained_smoted)

best_params = bayes_search.best_params_
print(f"Best parameters: {bayes_search.best_params_}")
print(f"Best score: {bayes_search.best_score_}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi



Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits




Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best parameters: OrderedDict({'C': 100, 'max_iter': 200, 'penalty': 'l1', 'solver': 'liblinear'})
Best score: 0.6612965229582387


In [49]:
LogisticRegressionModelTuned = LogisticRegression(**best_params)
LogisticRegressionModelTuned.fit(X_trained_smoted, y_trained_smoted)
LogisticRegressionModelTuned_y_pred = LogisticRegressionModelTuned.predict(X_test_transformed)
print(confusion_matrix(y_test, LogisticRegressionModelTuned_y_pred))
print(classification_report(y_test, LogisticRegressionModelTuned_y_pred, zero_division=1))

[[11914  8042]
 [  695   345]]
              precision    recall  f1-score   support

       False       0.94      0.60      0.73     19956
        True       0.04      0.33      0.07      1040

    accuracy                           0.58     20996
   macro avg       0.49      0.46      0.40     20996
weighted avg       0.90      0.58      0.70     20996



### KNN Classifier Model

In [371]:
KNeighborsClassifierModel = KNeighborsClassifier()
KNeighborsClassifierModel.fit(X_trained_smoted, y_trained_smoted)
KNeighborsClassifierModel_y_pred = KNeighborsClassifierModel.predict(X_test_transformed)
print(confusion_matrix(y_test, KNeighborsClassifierModel_y_pred))
print(classification_report(y_test, KNeighborsClassifierModel_y_pred, zero_division=0))


KeyboardInterrupt



##### I've aborted the Hyperparameter tuning for KNN because it doesn't work.

#### KNN Hyperparameter Tuning

In [54]:
# param_grid = {
#     "n_neighbors": [3, 5, 7, 9, 11],  # Number of nearest neighbors
#     "weights": ["uniform", "distance"],  # How neighbors influence prediction
#     "metric": ["euclidean", "manhattan", "minkowski"],  # Distance calculation method
#     "p": [1, 2],  # Minkowski power parameter (1 = Manhattan, 2 = Euclidean)
# }

# bayes_search = BayesSearchCV(estimator=KNeighborsClassifierModel, search_spaces=param_grid, n_iter=25, cv=3, n_jobs=-1, verbose=2, scoring="f1")
# bayes_search.fit(X_trained_smoted, y_trained_smoted)

# best_params = bayes_search.best_params_
# print(f"Best parameters: {bayes_search.best_params_}")
# print(f"Best score: {bayes_search.best_score_}")

#### KNN Model tuned

In [56]:
# KNeighborsClassifierModelTuned = LogisticRegression(**best_params)
# KNeighborsClassifierModelTuned.fit(X_trained_smoted, y_trained_smoted)
# KNeighborsClassifierModelTuned_y_pred = KNeighborsClassifierModelTuned.predict(X_test_transformed)
# print(confusion_matrix(y_test, KNeighborsClassifierModelTuned_y_pred))
# print(classification_report(y_test, KNeighborsClassifierModelTuned_y_pred, zero_division=1))

### Model Evaluation 

In [362]:
df_test['predictions'] = XGBClassifer_y_pred
df_test['predictions'] = df_test['predictions'].replace({0: 'False', 1: 'True'})
pred_df = df_test[['EventTimeStamp', 'EquipmentID', 'spn', 'target', 'predictions']]
pred_df['combined'] = pred_df['target'].astype(str) + '_' + pred_df['predictions'].astype(str)
pred_df['EventTimeStamp'] = pd.to_datetime(pred_df['EventTimeStamp'])
pred_df = pred_df.sort_values(by=['EquipmentID', 'EventTimeStamp'])
pred_df['time_diff'] = pred_df.groupby('EquipmentID', observed=True)['EventTimeStamp'].diff().dt.total_seconds() / 3600
pred_df['valid_group'] = (pred_df['time_diff'].isna()) | (pred_df['time_diff'] <= 2)
pred_df['temp_group'] = (~pred_df['valid_group']).cumsum()
result = pred_df.groupby(['EquipmentID', 'temp_group'], observed=True)['combined'].value_counts().reset_index()
result = pred_df.groupby(['EquipmentID', 'temp_group'], observed=True)['combined'].value_counts().unstack(fill_value=0).drop_duplicates()
result
result = result.rename(columns = {'False_False': 'true negative', 'False_True': 'false positive', 'True_False': 'false negative', 'True_True': 'true positive'})
print(result.columns)


for index, row in result.iterrows():
    if row['true positive'] >= 1:
        result.loc[index, 'true positive'] = 1
        result.loc[index, ~result.columns.isin(['true positive'])] = 0
    elif row['true positive'] == 0 and row['false positive'] >= 1:
        result.loc[index, 'false positive'] = 1
        result.loc[index, ~result.columns.isin(['false positive', 'true positive'])] = 0
    elif row['true positive'] == 0 and row['false positive'] == 0 and row['false negative'] >= 1:
        result.loc[index, 'false negative'] = 1
        result.loc[index, ~result.columns.isin(['false negative', 'false positive', 'true positive'])] = 0
    else:
        result.loc[index, 'true negative'] = 1
    counts = (result.iloc[:,0].sum() * 0) - (result.iloc[:,1].sum() * 500) + (result.iloc[:,3].sum() * 4000)
print(result[['true positive', 'false positive', 'false negative', 'true negative']].sum())
print(counts)

Index(['true negative', 'false positive', 'false negative', 'true positive'], dtype='object', name='combined')
combined
true positive      5
false positive    18
false negative    37
true negative     21
dtype: int64
11000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_df['combined'] = pred_df['target'].astype(str) + '_' + pred_df['predictions'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pred_df['EventTimeStamp'] = pd.to_datetime(pred_df['EventTimeStamp'])


In [373]:
def target_pred(df, event, equipment, target, pred):
    df = df[[event, equipment, target]].copy()
    df['predict'] = pred
    df['predict'] = df['predict'].replace({0: 'False', 1: 'True'})
    df['combined'] = df[target].astype(str) + '_' + df['predict'].astype(str)
    df[event] = pd.to_datetime(df[event])
    df = df.sort_values(by=[equipment, event])
    df['time_diff'] = df.groupby(equipment, observed=True)[event].diff().dt.total_seconds() / 3600
    df['valid_group'] = (df['time_diff'].isna()) | (df['time_diff'] <= 2)
    df['temp_group'] = (~df['valid_group']).cumsum()
    result = df.groupby([equipment, 'temp_group'], observed=True)['combined'].value_counts().reset_index()
    result = df.groupby([equipment, 'temp_group'], observed=True)['combined'].value_counts().unstack(fill_value=0).drop_duplicates()
    result = result.rename(columns = {'False_False': 'true negative', 'False_True': 'false positive', 'True_False': 'false negative', 'True_True': 'true positive'})
    
    for index, row in result.iterrows():
        if row['true positive'] >= 1:
            result.loc[index, 'true positive'] = 1
            result.loc[index, ~result.columns.isin(['true positive'])] = 0
        elif row['true positive'] == 0 and row['false positive'] >= 1:
            result.loc[index, 'false positive'] = 1
            result.loc[index, ~result.columns.isin(['false positive', 'true positive'])] = 0
        elif row['true positive'] == 0 and row['false positive'] == 0 and row['false negative'] >= 1:
            result.loc[index, 'false negative'] = 1
            result.loc[index, ~result.columns.isin(['false negative', 'false positive', 'true positive'])] = 0
        else:
            result.loc[index, 'true negative'] = 1
    counts = (result.iloc[:,0].sum() * 0) - (result.iloc[:,1].sum() * 500) + (result.iloc[:,3].sum() * 4000)
    return counts

In [379]:
print(F"The LGBM Model saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', LGBMClassifer_y_pred)}")
print(F"The LGBM Model tuned saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', LGBMClassiferTuned_y_pred)}")
print(F"The XGBoost Model  saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', XGBClassifer_y_pred)}")
print(F"The XGBoost Model tuned saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', XGBClassifierTuned_y_pred)}")
print(F"The Decision Tree Model saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', DecisionTreeClassifierModel_y_pred)}")
print(F"The Decision Tree Model Tuned saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', DecisionTreeClassifierTuned_y_pred)}")
print(F"The Logistic Regression Model saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', LogisticRegressionModel_y_pred)}")
print(F"The Logistic Regression Model Tuned saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', LogisticRegressionModelTuned_y_pred)}")
print(F"The KNN Model saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', KNeighborsClassifierModel_y_pred)}")

The LGBM Model saved: 13500
The LGBM Model tuned saved: 4500
The XGBoost Model  saved: 11000
The XGBoost Model tuned saved: 5000
The Decision Tree Model saved: 33000
The Decision Tree Model Tuned saved: 11000
The Logistic Regression Model saved: 121000
The Logistic Regression Model Tuned saved: 98500
The KNN Model saved: 121000


In [163]:
print(F"The LGBM Model tuned saved: {target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', XGBClassifierTuned_y_pred)}")

The LGBM Model tuned saved: 5000
