## Modeling ##

In [1]:

from sklearn.ensemble import GradientBoostingClassifier
import seaborn as sns
import shap
import pickle
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, 
    confusion_matrix,
    r2_score,
    mean_squared_error, 
    root_mean_squared_error,
    mean_absolute_error, 
    mean_absolute_percentage_error
)
from sklearn.inspection import PartialDependenceDisplay, permutation_importance, partial_dependence
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from scipy.spatial import KDTree

#from xgboost import XGBClassifier

from faiss_imputer import FaissImputer

from sklearn.impute import SimpleImputer

from sklearn.neural_network import MLPClassifier

This notebook is dedicated to the feature selection and statistical modeling of our trucking data.

In [4]:
df = pd.read_csv('../data/data_clean_bb.csv', low_memory=False,)
df
df.columns

Index(['Unnamed: 0', 'RecordID', 'ESS_Id', 'EventTimeStamp',
       'eventDescription', 'ecuModel', 'ecuMake', 'ecuSource', 'spn', 'fmi',
       'active', 'activeTransitionCount', 'EquipmentID', 'Latitude',
       'Longitude', 'LocationTimeStamp', 'FaultId', 'AcceleratorPedal',
       'BarometricPressure', 'CruiseControlActive', 'CruiseControlSetSpeed',
       'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad',
       'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm',
       'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature',
       'IgnStatus', 'IntakeManifoldTemperature', 'LampStatus', 'ParkingBrake',
       'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure',
       'next_derate_timestamp', 'time_until_detate', 'target'],
      dtype='object')

In [6]:
df['Throttle'] = df['Throttle'].str.replace(',', '.').astype(np.float64)

In [8]:
df['spn'] = df['spn'].astype(object)
df['fmi'] = df['fmi'].astype(object)
df = df.drop('Unnamed: 0', axis=1)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057049 entries, 0 to 1057048
Data columns (total 42 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   RecordID                   1057049 non-null  int64  
 1   ESS_Id                     1057049 non-null  int64  
 2   EventTimeStamp             1057049 non-null  object 
 3   eventDescription           1006152 non-null  object 
 4   ecuModel                   1001026 non-null  object 
 5   ecuMake                    1001026 non-null  object 
 6   ecuSource                  1057049 non-null  int64  
 7   spn                        1057049 non-null  object 
 8   fmi                        1057049 non-null  object 
 9   active                     1057049 non-null  bool   
 10  activeTransitionCount      1057049 non-null  int64  
 11  EquipmentID                1057049 non-null  object 
 12  Latitude                   1057049 non-null  float64
 13  Longitude   

In [12]:
column_name = df.columns
NAN_percentage = df.apply(lambda col: col.isna().sum()/col.shape[0], axis=0)
pd.DataFrame({'Column_Name': column_name, 'NAN_percentage': NAN_percentage})

Unnamed: 0,Column_Name,NAN_percentage
RecordID,RecordID,0.0
ESS_Id,ESS_Id,0.0
EventTimeStamp,EventTimeStamp,0.0
eventDescription,eventDescription,0.04815
ecuModel,ecuModel,0.052999
ecuMake,ecuMake,0.052999
ecuSource,ecuSource,0.0
spn,spn,0.0
fmi,fmi,0.0
active,active,0.0


In [14]:
test_date = '2019-01-01'
df_test = df.sort_values('EventTimeStamp').loc[df['EventTimeStamp'] > test_date]
df_train = df.sort_values('EventTimeStamp').loc[df['EventTimeStamp'] < test_date]


In [16]:
print(df_test.shape)
print(df_train.shape)

(111491, 42)
(945558, 42)


In [18]:
df = df.sample(n=100000)
df

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuModel,ecuMake,ecuSource,spn,fmi,active,...,IntakeManifoldTemperature,LampStatus,ParkingBrake,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure,next_derate_timestamp,time_until_detate,target
59573,67422,2635389,2015-05-27 10:19:19,Low (Severity Low) Engine Coolant Level,6X1u10D1500000000,CMMNS,0,111,17,True,...,111.2,1023,,49.243670,3276.75,,4.35,2016-02-03 11:37:07,252 days 01:17:48,False
28388,31951,2048000,2015-04-23 10:43:35,Low (Severity Low) Engine Coolant Level,6X1u10D1500000000,CMMNS,0,111,17,True,...,86.0,1023,,43.525110,3276.75,,0.87,,,False
820849,937150,35660556,2017-12-28 07:18:34,Low Voltage (Left Fuel Level Sensor),CECU3B-NAMUX4,PACCR,49,829,4,False,...,,255,,,,,,,,False
697595,793094,16491562,2017-05-24 14:34:46,Abnormal Update Rate Tire Location,CECU3B-NAMUX4,PACCR,49,929,9,False,...,,255,,,,,,,,False
399036,447362,8616388,2016-04-14 10:34:41,Low (Severity Low) Engine Coolant Level,6X1u10D1500000000,CMMNS,0,111,17,True,...,116.6,1023,,9.679798,,,0.87,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
663522,754149,15473553,2017-04-03 09:25:34,Abnormal Update Rate Tire Location,CECU3B-NAMUX4,PACCR,49,929,9,False,...,,255,,,,,,,,False
168836,188063,4364148,2015-08-31 14:57:28,Low (Severity Low) Engine Coolant Level,6X1u10D1500000000,CMMNS,0,111,17,False,...,,17407,,,,,,,,False
308607,343967,6830047,2016-01-13 12:00:37,Low (Severity Low) Engine Coolant Level,6X1u13D1500000000,CMMNS,0,111,17,False,...,,1023,,,,,,2018-11-28 13:35:38,1050 days 01:35:01,False
297648,331950,6629366,2016-01-03 08:24:45,Low (Severity Low) Engine Coolant Level,6X1u10D1500000000,CMMNS,0,111,17,True,...,75.2,1023,,59.457450,,,27.55,2017-03-21 05:41:16,442 days 21:16:31,False


Scaling and encoding features for modeling

In [21]:
X_train = df_train.drop(columns = [
            'FuelTemperature',
            'ParkingBrake',
            'target',
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription',
            'Longitude',
            'Latitude',
            'ESS_Id',
            'RecordID',
            'ecuModel',
            'ecuMake',
            'SwitchedBatteryVoltage',
            'EquipmentID',
            'LampStatus',
            'CruiseControlSetSpeed',
            'EngineLoad',
            'TurboBoostPressure',
            'DistanceLtd',
            'FaultId', 
            'next_derate_timestamp', 
            'time_until_detate'
            ], axis=1)

y_train = df_train['target']

X_test = df_test.drop(columns = [
            'FuelTemperature',
            'ParkingBrake',
            'target',
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription',
            'Longitude',
            'Latitude',
            'ESS_Id',
            'RecordID',
            'ecuModel',
            'ecuMake',
            'SwitchedBatteryVoltage',
            'EquipmentID',
            'LampStatus',
            'CruiseControlSetSpeed',
            'EngineLoad',
            'TurboBoostPressure',
            'DistanceLtd',
            'FaultId', 
            'next_derate_timestamp', 
            'time_until_detate'
            ], axis=1)

y_test = df_test['target']

In [23]:
ohe_features = ['spn',
                'fmi',
                'ecuSource'
                ]
bool_features = ['CruiseControlActive',
                 'IgnStatus',
                 'active'
                ]
scale_features = ['AcceleratorPedal',
                  'BarometricPressure',
                  'EngineCoolantTemperature',
                  'EngineOilPressure',
                  'EngineOilTemperature',
                  'EngineRpm',
                  'FuelLevel',
                  'FuelLtd',
                  'IntakeManifoldTemperature',
                  'Speed',
                  'FuelRate',
                  'EngineTimeLtd',
                  'Throttle',
                  'activeTransitionCount'
                 ]


In [25]:
everything = list(set(ohe_features + bool_features + scale_features))
the_rest = X_train.columns.difference(everything)
the_rest

Index([], dtype='object')

In [27]:
%%time



numerical_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    #('faiss', FaissImputer(n_neighbors=3, strategy='mean')),
    ('simple_imputer', SimpleImputer(strategy='mean'))
])
categorical_pipeline = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False, categories='auto')), 
    ('simple_imputer', SimpleImputer(strategy = 'most_frequent'))
])

bool_pipeline = Pipeline([
    ('simple_imputer', SimpleImputer(strategy = 'most_frequent'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, scale_features),
        ('cat', categorical_pipeline, ohe_features),
        ('bool', bool_pipeline, bool_features)
    ],
    remainder='drop'
)

pipe = Pipeline(steps=[('transformer', preprocessor)])

pipe.fit(X_train)

X_train_transformed = pipe.transform(X_train)
X_test_transformed = pipe.transform(X_test)

CPU times: total: 51 s
Wall time: 53.1 s


In [29]:
smote = SMOTE()
X_trained_balanced, y_trained_balanced = smote.fit_resample(X_train_transformed, y_train)

In [31]:
#filename = 'pipe_transformed.pkl'

#pickle_list = [pipe, X_train_transformed, X_test_transformed]

#with open(filename, 'wb') as file:
    #pickle.dump(pickle_list, file)

In [33]:
#with open(filename, 'rb') as file:
    #pipe, X_train_transformed, X_test_transformed = pickle.load(file)

In [43]:
X_trained_balanced

array([[-7.80364887e-01,  1.97239877e-01,  4.15200147e-01, ...,
         0.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       [-7.80364887e-01,  1.97239877e-01,  4.15200147e-01, ...,
         0.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       [ 1.52617829e-16,  3.98704695e-16, -4.49231980e-16, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       ...,
       [ 1.52617829e-16,  3.98704695e-16, -4.49231980e-16, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [-7.80364887e-01,  2.57932779e-01, -1.60157138e+00, ...,
         0.00000000e+00,  1.00000000e+00,  1.00000000e+00],
       [-7.55796325e-01,  2.24509292e-01,  4.45390934e-01, ...,
         0.00000000e+00,  1.00000000e+00,  1.00000000e+00]])

In [None]:
%%time

feature_names = X_trained_balanced.columns.tolist()
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train_transformed, y_train)

In [None]:
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

In [None]:
feature_names

In [37]:
from sklearn.linear_model import LogisticRegression
LG = LogisticRegression(penalty='l1', solver='liblinear', C=1.0, max_iter=100, random_state=42, class_weight='balanced')
LG.fit(X_trained_balanced, y_trained_balanced)
LG_y_pred = LG.predict(X_test_transformed)
#print(f'Accuracy: {accuracy_score(y_test, RC_y_pred)}')
#print(f'MCC: {matthews_corrcoef(y_test, RC_y_pred)}')
print(confusion_matrix(y_test, LG_y_pred))
print(classification_report(y_test, LG_y_pred))

[[103080   8100]
 [    56    255]]
              precision    recall  f1-score   support

       False       1.00      0.93      0.96    111180
        True       0.03      0.82      0.06       311

    accuracy                           0.93    111491
   macro avg       0.51      0.87      0.51    111491
weighted avg       1.00      0.93      0.96    111491



%%time

gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=4)

gbc.fit(X_trained_balanced, y_trained_balanced)

y_pred = gbc.predict(X_test_transformed)

In [39]:
from sklearn.inspection import PartialDependenceDisplay

In [73]:
X_test_transformed

array([[-0.7803648869992846, 0.13654697529743015, -2.9289857870243003,
        ..., False, True, True],
       [1.5261782929524005e-16, 3.9870469539917076e-16,
        -4.492319801225067e-16, ..., False, True, False],
       [-0.7803648869992846, -0.16691753490684277, -1.7228859419963023,
        ..., False, True, True],
       ...,
       [-0.7803648869992846, 0.13654697529743015, 0.5248455873740583,
        ..., True, True, True],
       [-0.7366195453056467, 0.19723987733828446, 0.360377426688422, ...,
        False, True, True],
       [1.5261782929524005e-16, 3.9870469539917076e-16,
        -4.492319801225067e-16, ..., False, True, False]], dtype=object)

In [75]:
variable_names = X_test.columns.tolist()
pd.DataFrame({
    'variable': variable_names,
    'LG_coeff': LG.coef_.ravel()
})

ValueError: All arrays must be of the same length

In [57]:
pd.DataFrame({
    'variable': variable_names,
    'importance': permutation_importance(LG, X_test_transformed, y_test, random_state = 42)['importances_mean']
}).sort_values('importance', ascending = False)

ValueError: All arrays must be of the same length

In [87]:
#feature_names = X_test.columns

#importance = np.abs(LG.coef_[0])

#top_indices = np.argsort(importance)[-3:][::-1]

#top_features = feature_names[top_indices]

top_features = X_test['EngineOilTemperature']

print(f"Plotting PDP for EngineOilTemperature")
PartialDependenceDisplay.from_estimator(LG, X_test, [top_features])
plt.show()

Plotting PDP for EngineOilTemperature


ValueError: Each entry in features must be either an int, a string, or an iterable of size at most 2.

In [None]:
explainer = shap.TreeExplainer(gbc)
explanation = explainer(X_test)


In [None]:
shap.plots.beeswarm(explanation)

In [None]:
shap.plots.bar(explanation)

In [None]:
pd.DataFrame(X_test).info()

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred, zero_division = 0))

In [None]:
from sklearn.linear_model import RidgeClassifier
RC = RidgeClassifier(alpha=1.0)
RC.fit(X_train_transformed, y_train)
RC_y_pred = RC.predict(X_test_transformed)
#print(f'Accuracy: {accuracy_score(y_test, RC_y_pred)}')
#print(f'MCC: {matthews_corrcoef(y_test, RC_y_pred)}')
print(confusion_matrix(y_test, RC_y_pred))
print(classification_report(y_test, RC_y_pred))

In [None]:
from sklearn.linear_model import LogisticRegression
LG = LogisticRegression(penalty='l1', solver='liblinear')
LG.fit(X_train_transformed, y_train)
LG_y_pred = LG.predict(X_test_transformed)
#print(f'Accuracy: {accuracy_score(y_test, RC_y_pred)}')
#print(f'MCC: {matthews_corrcoef(y_test, RC_y_pred)}')
print(confusion_matrix(y_test, RC_y_pred))
print(classification_report(y_test, RC_y_pred))

In [None]:
%%time

mlp = MLPClassifier(
            hidden_layer_sizes = (17, 15, 13, 11),
            activation = 'relu',
            solver = 'adam',            
            max_iter = 100000, 
            alpha = 0.5,
            learning_rate = 'adaptive'
        ).fit(X_train_transformed, y_train)

y_pred_mlp = mlp.predict(X_test_transformed)

In [None]:
confusion_matrix(y_test, y_pred_mlp)

In [None]:
print(classification_report(y_test, y_pred_mlp, zero_division = 0))

In [None]:
corr_matrix = X_train[scale_features].corr().melt(ignore_index=False)

In [None]:
corr_matrix[corr_matrix['value'] != 1].sort_values(by = 'value', ascending=False).head(12)

In [None]:
sns.heatmap(corr_matrix, annot=True, cmap='YlGnBu', fmt=".2f")
plt.title('Correlation Matrix')
plt.show()