In [1]:
pip install scikit-optimize

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, 
    confusion_matrix,
    r2_score,
    mean_squared_error, 
    root_mean_squared_error,
    mean_absolute_error, 
    mean_absolute_percentage_error
)
from sklearn.inspection import PartialDependenceDisplay
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from scipy.spatial import KDTree
from faiss_imputer import FaissImputer

from tqdm.notebook import tqdm, trange
from ipywidgets import interactive
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report, accuracy_score, confusion_matrix, 
    f1_score, fbeta_score, 
    matthews_corrcoef, brier_score_loss
)
from sklearn.decomposition import PCA
import pickle
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

In [3]:
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Integer

In [4]:
df = pd.read_csv('../data/data_clean.csv', low_memory=False)

In [5]:
df = df.drop('Unnamed: 0', axis=1)

In [6]:
object_columns_to_change_to_category = [        
    'eventDescription',  
    'ecuSoftwareVersion',          
    'ecuSerialNumber',         
    'ecuModel',           
    'ecuMake',                  
    'EquipmentID',                                 
    'LampStatus',              
    'ServiceDistance',
    'next_derate_timestamp',
    'time_until_derate'] 

In [7]:
for column in object_columns_to_change_to_category:
    df[column] = df[column].astype('category')

In [8]:
df['EventTimeStamp'] = pd.to_datetime(df['EventTimeStamp'])
df['LocationTimeStamp'] = pd.to_datetime(df['LocationTimeStamp'])

In [9]:
int_columns_to_categorical = ['RecordID',
'ESS_Id',                    
'ecuSource',                    
'spn',                       
'fmi',  
'active',       
'MCTNumber',                  
'FaultId']

In [10]:
for column in int_columns_to_categorical:
    df[column] = df[column].astype('category')

In [11]:
float64_cols = df.select_dtypes(include=['float64']).columns
df[float64_cols] = df[float64_cols].astype('float32')

In [12]:
test_date = '2019-01-01'
df_test = df.sort_values('EventTimeStamp').loc[df['EventTimeStamp'] > test_date]
df_train = df.sort_values('EventTimeStamp').loc[df['EventTimeStamp'] < test_date]

In [13]:
X_train = df_train.drop(['target',
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription',
            'ESS_Id',
            'ecuModel',
            'ecuMake',
            'ecuSource',
            'ecuSoftwareVersion',
            'ecuSerialNumber',
            'MCTNumber',
            'ServiceDistance',
            'Latitude',
            'Longitude',
            'RecordID',
            'next_derate_timestamp',
            'time_until_derate',
            'FaultId',
            'EngineLoad',
            'TurboBoostPressure',
            'CruiseControlSetSpeed',
            'DistanceLtd'], axis=1)

y_train = df_train['target'].values

X_test = df_test.drop(['target',
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription',
            'ESS_Id',
            'ecuModel',
            'ecuMake',
            'ecuSource',
            'ecuSoftwareVersion',
            'ecuSerialNumber',
            'MCTNumber',
            'ServiceDistance',
            'Latitude',
            'Longitude',
            'RecordID',
            'next_derate_timestamp',
            'time_until_derate',
            'FaultId',
            'EngineLoad',
            'TurboBoostPressure',
            'CruiseControlSetSpeed',
            'DistanceLtd'], axis=1)


y_test = df_test['target'].values

X_val = df_test.drop(['target',
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription',
            'ESS_Id',
            'ecuModel',
            'ecuMake',
            'ecuSource',
            'ecuSoftwareVersion',
            'ecuSerialNumber',
            'MCTNumber',
            'ServiceDistance',
            'Latitude',
            'Longitude',
            'RecordID',
            'next_derate_timestamp',
            'time_until_derate',
            'FaultId'], axis=1)

y_val = df_test['target'].values

In [14]:
categorical_features = ['spn', 'fmi', 'EquipmentID', 'LampStatus'] 
bool_features = ['CruiseControlActive', 'ParkingBrake', 'IgnStatus', 'active']
numeric_features = [x for x in X_train.columns if x not in categorical_features + bool_features] 
#pca = PCA(n_components=1, svd_solver="arpack")

numeric_pipe = Pipeline(
    steps=[
        ('scale', StandardScaler()),
        ('numeric_impute', SimpleImputer(strategy='most_frequent'))
    ]
)

categorical_pipe = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True)),
        ('categorical_impute', SimpleImputer(strategy='most_frequent')),
    ]
)

bool_pipe = Pipeline(
    steps=[
        ('bool_impute', SimpleImputer(strategy='most_frequent'))
    ]
)

ct = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipe, numeric_features),
        ('categorical', categorical_pipe, categorical_features),
        ('bool', bool_pipe, bool_features)
    ]
)

pipe = Pipeline(
    steps=[
        ('transformer', ct),
        #('pca', pca)
    ]
)

pipe.fit(X_train, y_train)
X_train_transformed = pipe.transform(X_train)
X_test_transformed = pipe.transform(X_test)

In [15]:
smote = SMOTE()
X_trained_smoted, y_trained_smoted = smote.fit_resample(X_train_transformed, y_train)

In [39]:
XGBClassiferModel = XGBClassifier()
XGBClassiferModel.fit(X_trained_smoted, y_trained_smoted)
XGBClassifer_y_pred = XGBClassiferModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, XGBClassifer_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, XGBClassifer_y_pred)}')
print(confusion_matrix(y_test, XGBClassifer_y_pred))
print(classification_report(y_test, XGBClassifer_y_pred, zero_division=0))

Accuracy: 0.9734956184804154
MCC: 0.18025051451264293
[[108356   2824]
 [   131    180]]
              precision    recall  f1-score   support

       False       1.00      0.97      0.99    111180
        True       0.06      0.58      0.11       311

    accuracy                           0.97    111491
   macro avg       0.53      0.78      0.55    111491
weighted avg       1.00      0.97      0.98    111491



In [None]:
Best parameters: OrderedDict({})

In [61]:
XGBClassiferModel = XGBClassifier(learning_rate=0.1, max_depth=6, n_estimators=500)
XGBClassiferModel.fit(X_trained_smoted, y_trained_smoted)
XGBClassifer_y_pred = XGBClassiferModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, XGBClassifer_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, XGBClassifer_y_pred)}')
print(confusion_matrix(y_test, XGBClassifer_y_pred))
print(classification_report(y_test, XGBClassifer_y_pred, zero_division=0))

Accuracy: 0.9815590496093856
MCC: 0.18052704083569918
[[109287   1893]
 [   163    148]]
              precision    recall  f1-score   support

       False       1.00      0.98      0.99    111180
        True       0.07      0.48      0.13       311

    accuracy                           0.98    111491
   macro avg       0.54      0.73      0.56    111491
weighted avg       1.00      0.98      0.99    111491



In [63]:
XGBClassiferModel = XGBClassifier(objective="binary:logistic",
    colsample_bytree = 0.9179993452534458,
    gamma = 3.55507468819673,
    max_depth = 17,
    min_child_weight = 9,
    reg_alpha = 62,
    n_estimators = 100,
    reg_lambda = 0.5990326545008506,
    scale_pos_weight = 2)
XGBClassiferModel.fit(X_trained_smoted, y_trained_smoted)
XGBClassifer_y_pred = XGBClassiferModel.predict(X_test_transformed)
print(f'Accuracy: {accuracy_score(y_test, XGBClassifer_y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, XGBClassifer_y_pred)}')
print(confusion_matrix(y_test, XGBClassifer_y_pred))
print(classification_report(y_test, XGBClassifer_y_pred, zero_division=0))

Accuracy: 0.9894520633952516
MCC: 0.11251819846189282
[[110250    930]
 [   246     65]]
              precision    recall  f1-score   support

       False       1.00      0.99      0.99    111180
        True       0.07      0.21      0.10       311

    accuracy                           0.99    111491
   macro avg       0.53      0.60      0.55    111491
weighted avg       1.00      0.99      0.99    111491



In [55]:
# XGBClassiferModel = XGBClassifier(alpha=0.5, colsample_bytree=1.0, 
#                                   learning_rate=0.3, max_depth=9, min_child_weight=1, 
#                                   n_estimators=100, subsample=0.6)
# XGBClassiferModel.fit(X_trained_smoted, y_trained_smoted)
# XGBClassifer_y_pred = XGBClassiferModel.predict(X_test_transformed)
# print(f'Accuracy: {accuracy_score(y_test, XGBClassifer_y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, XGBClassifer_y_pred)}')
# print(confusion_matrix(y_test, XGBClassifer_y_pred))
# print(classification_report(y_test, XGBClassifer_y_pred, zero_division=0))

Accuracy: 0.9852095684853486
MCC: 0.16110174782584727
[[109726   1454]
 [   195    116]]
              precision    recall  f1-score   support

       False       1.00      0.99      0.99    111180
        True       0.07      0.37      0.12       311

    accuracy                           0.99    111491
   macro avg       0.54      0.68      0.56    111491
weighted avg       1.00      0.99      0.99    111491



In [41]:
# param_grid = param_ranges = {
#     "max_depth": [3, 6, 9],  # Controls tree complexity
#     "learning_rate": [0.01, 0.1, 0.3],  # Step size for weight updates
#     "subsample": [0.6, 0.8, 1.0],  # Fraction of samples used per boosting iteration
#     "colsample_bytree": [0.6, 0.8, 1.0],  # Fraction of features used per tree
#     "min_child_weight": [1, 3, 5],  # Minimum sum of instance weight in child node
#     "lambda": [0.1, 1, 10],  # L2 regularization
#     "alpha": [0, 0.5, 1],  # L1 regularization
#     "n_estimators": [100, 500, 1000]  # Number of boosting rounds
# }
# # grid_search = GridSearchCV(
# #     estimator = XGBClassiferModel,
# #     param_grid = param_grid,
# #     scoring = 'roc_auc',
# #     cv=3,
# #     verbose=1,
# #     n_jobs=-1
# # )

# # grid_search.fit(X_train_transformed, y_train)
# # print("Best Parameters:", grid_search.best_params_)

In [43]:
# bayes_search = BayesSearchCV(estimator=XGBClassiferModel, search_spaces=param_grid, n_iter=25, cv=3, n_jobs=-1, verbose=2, scoring="f1")
# bayes_search.fit(X_train_transformed, y_train)
# # Print best parameters
# print(f"Best parameters: {bayes_search.best_params_}")
# print(f"Best score: {bayes_search.best_score_}")

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 1 candidates, totalling 3 fi