## Modeling ##

In [1]:
import datetime as dt
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.spatial import KDTree
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.inspection import PartialDependenceDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    matthews_corrcoef,
    mean_absolute_error,
    mean_absolute_percentage_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
)
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    LabelEncoder,
    OneHotEncoder,
    StandardScaler,
)
from xgboost import XGBClassifier

from faiss_imputer import FaissImputer

This notebook is dedicated to the feature selection and statistical modeling of our trucking data.

In [2]:
# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

In [3]:
# Read the whole file in one pass (low_memory=False) so pandas infers each
# column's dtype from the full column rather than chunk by chunk.
clean_csv_path = '../data/data_clean_05_01.csv'
df = pd.read_csv(clean_csv_path, low_memory=False)

In [4]:
#df = df.sample(frac=0.50)

In [5]:
# Columns that are stored as numbers but should be handled as object dtype
# rather than as numeric quantities.
columns_to_object = [
    'ecuSource',
    'spn',
    'fmi',
    'MCTNumber',
    'RecordID',
    'ESS_Id',
    'LampStatus',
]

# Cast all of them in a single astype call keyed by column name.
df = df.astype(dict.fromkeys(columns_to_object, object))

In [6]:
# Flag columns to be stored with bool dtype.
# NOTE(review): astype(bool) maps NaN to True, so any missing flag silently
# becomes True here -- confirm these columns have no missing values upstream.
columns_to_bool = [
    'CruiseControlActive',
    'IgnStatus',
    'ParkingBrake',
]

df = df.astype(dict.fromkeys(columns_to_bool, bool))

In [7]:
#float64_cols = df.select_dtypes(include=['float64']).columns
#df[float64_cols] = df[float64_cols].astype('float32')

In [8]:
# Print the frame's column dtypes and non-null counts to check the casts
# applied in the cells above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057049 entries, 0 to 1057048
Data columns (total 45 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   AcceleratorPedal           480660 non-null   float64
 1   BarometricPressure         530902 non-null   float64
 2   CruiseControlActive        1057049 non-null  bool   
 3   CruiseControlSetSpeed      521823 non-null   float64
 4   DistanceLtd                530760 non-null   float64
 5   EngineCoolantTemperature   530890 non-null   float64
 6   EngineLoad                 530421 non-null   float64
 7   EngineOilPressure          531008 non-null   float64
 8   EngineOilTemperature       529370 non-null   float64
 9   EngineRpm                  531324 non-null   float64
 10  EngineTimeLtd              527047 non-null   float64
 11  FuelLevel                  455471 non-null   float64
 12  FuelLtd                    530354 non-null   float64
 13  FuelRate    

Scaling and encoding features for modeling

In [20]:
# Diagnostic: show every row in which at least one column holds exactly this
# string (the value reported by the "could not convert string to float" error).
offending_value = '6X1u10D1500000000'
row_has_value = df.isin([offending_value]).any(axis=1)
df[row_has_value]

Unnamed: 0,AcceleratorPedal,BarometricPressure,CruiseControlActive,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,EngineRpm,EngineTimeLtd,FuelLevel,FuelLtd,FuelRate,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp,time_derate,time_until_derate,target
208,44.4,14.7175,False,42.25324,503177.2,125.6,40.0,35.96,163.2312,1020.625,12263.00,64.4,75593.361024,4.239975,,True,66.2,17407,True,,35.93273,,,5.22,726943,14836854,2009-02-26 00:02:42,Low (Severity Medium) ECU Power Output Supply ...,04993120*00179417*082113134117*07700053*I0*BBZ*,79620769,6X1u10D1500000000,CMMNS,0,3597,18,True,1,1589,105338555,35.137592,-90.093935,2017-03-02 08:36:50,,,False
209,44.4,14.7175,False,42.25324,503177.2,125.6,40.0,35.96,163.2312,1020.625,12263.00,64.4,75593.361024,4.239975,,True,66.2,17407,True,,35.93273,,,5.22,726945,14836856,2009-02-26 00:02:42,Incorrect Data Aftertreatment 1 Outlet Gas Sen...,04993120*00179417*082113134117*07700053*I0*BBZ*,79620769,6X1u10D1500000000,CMMNS,0,3228,2,True,1,1589,105338555,35.137592,-90.093935,2017-03-02 08:36:50,,,False
210,44.4,14.7175,False,42.25324,503177.2,125.6,40.0,35.96,163.2312,1020.625,12263.00,64.4,75593.361024,4.239975,,True,66.2,17407,True,,35.93273,,,5.22,726944,14836855,2009-02-26 00:02:43,Incorrect Data Aftertreatment 1 Intake Gas Sen...,04993120*00179417*082113134117*07700053*I0*BBZ*,79620769,6X1u10D1500000000,CMMNS,0,3218,2,True,1,1589,105338555,35.137592,-90.093935,2017-03-02 08:36:50,,,False
216,,,True,,,,,,,,,,,,,True,,1023,True,,,,,,147625,3814545,2010-12-31 23:02:39,Low (Severity Low) Engine Coolant Level,04993120*00184243*060712145341*07700027*I0*BBZ*,79619432,6X1u10D1500000000,CMMNS,0,111,17,True,1,1576,105411518,38.998842,-83.782407,2015-08-02 12:51:22,,,False
217,,,True,,,,,,,,,,,,,True,,2035,True,,,,,,88374,2970703,2010-12-31 23:02:39,Low (Severity Low) Engine Coolant Level,04993120*00184243*060712145341*07700027*I0*BBZ*,79619432,6X1u10D1500000000,CMMNS,0,111,17,True,1,1576,105411518,39.002407,-83.782222,2015-06-16 00:23:58,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
816383,55.2,14.3550,False,50.33107,552493.0,179.6,42.0,38.28,205.5313,1410.500,13399.95,61.2,87870.228796,6.908121,,True,82.4,17407,True,,46.41837,,,8.70,931301,34905258,2017-12-16 22:44:13,Low Voltage (Engine Crankcase Pressure),04993120*00179142*051215183709*07700066*I0*BBZ*,79623055,6X1u10D1500000000,CMMNS,0,101,4,True,1,1602,105418094,33.629398,-84.311712,2017-12-16 22:44:48,,,False
817192,100.0,14.5000,False,66.48672,530624.4,185.0,56.0,36.54,208.7375,1424.000,11543.80,27.2,78202.588382,8.981877,,True,87.8,2047,True,,66.86536,,,9.57,933091,35056493,2017-12-19 02:24:29,Low (Severity Medium) Engine Coolant Level,04993120*00190983*042216134005*07700071*I0*BBZ*,79623054,6X1u10D1500000000,CMMNS,0,111,18,True,1,1600,105430885,36.283564,-86.806759,2017-12-19 02:25:05,,,False
817194,,,True,,,,,,,,,,,,,True,,1023,True,,,,,,933093,35056514,2017-12-19 02:26:11,Low (Severity Medium) Engine Coolant Level,04993120*00190983*042216134005*07700071*I0*BBZ*,79623054,6X1u10D1500000000,CMMNS,0,111,18,False,1,1600,105430885,36.270648,-86.794444,2017-12-19 02:26:07,,,False
817227,100.0,14.3550,False,66.48672,530774.3,183.2,83.0,35.38,209.9750,1428.875,11546.20,16.4,78226.495952,14.569130,,True,98.6,2047,True,,66.89449,,,20.30,933122,35061613,2017-12-19 04:48:51,Low (Severity Medium) Engine Coolant Level,04993120*00190983*042216134005*07700071*I0*BBZ*,79623054,6X1u10D1500000000,CMMNS,0,111,18,True,1,1600,105430885,34.938750,-85.163750,2017-12-19 04:49:27,,,False


In [9]:
# Columns left out of the feature matrix: the label itself plus the other
# columns listed here (timestamps, free text, identifiers, location, ...).
# NOTE(review): time_derate / time_until_derate are presumably excluded because
# they are derived from the outcome -- confirm.
excluded_columns = [
    'target',
    'LocationTimeStamp',
    'EventTimeStamp',
    'eventDescription',
    'ecuSerialNumber',
    'ecuSoftwareVersion',
    'time_derate',
    'time_until_derate',
    'Longitude',
    'Latitude',
    'ESS_Id',
    'RecordID',
    'ecuSource',
    'ServiceDistance',
    'ecuMake',
    'SwitchedBatteryVoltage',
    'EquipmentID',
    'ecuModel',
]

# Label vector and feature matrix.
y = df['target']
X = df.drop(columns=excluded_columns)

In [10]:
# Step 1: hold out 30% of the rows as the test set, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=27,
)

# Step 2: split the remaining 70% into train and validation, again stratified.
# train_size = 0.6/0.8 (~0.75) of that remainder, i.e. roughly
# 52.5% train / 17.5% validation / 30% test of the full data set.
# NOTE(review): the 0.6/0.8 ratio looks like it was written for a 20% test
# split -- confirm the intended proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    train_size=0.6/0.8,
    stratify=y_train,
    random_state=27,
)

In [12]:
ohe_features = ['spn',
                'fmi',
                ] 

bool_features = X_train.columns.astype(bool)

scale_features = ['AcceleratorPedal',
                  'BarometricPressure',
                  'DistanceLtd',
                  'EngineCoolantTemperature',
                  'EngineLoad',
                  'EngineOilPressure',
                  'EngineOilTemperature',
                  'EngineRpm',
                  'FuelLevel',
                  'FuelLtd',
                  'FuelTemperature',
                  'IntakeManifoldTemperature',
                  'Speed',
                  'TurboBoostPressure',
                  'FuelRate',
                  'activeTransitionCount']

In [15]:
%%time

numerical_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('faiss_imputer', FaissImputer(n_neighbors=3, strategy = 'mean'))
])

categorical_pipeline = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

boolean_pipeline = Pipeline([
    ('si', SimpleImputer(strategy='most_frequent'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, scale_features),
        ('cat', categorical_pipeline, ohe_features),
        ('bool', boolean_pipeline, bool_features)
    ],
    remainder='passthrough'
)

CPU times: total: 0 ns
Wall time: 87.7 μs


In [16]:
%%time

# Fit every preprocessing step on the training split only, then apply the
# already-fitted transformers to the test split.
# Note: X_val is not transformed in this cell.
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

  column_agg = np.nanmean(selected_values, axis=0)
  column_agg = np.nanmean(selected_values, axis=0)


CPU times: total: 23min 11s
Wall time: 23min 38s


In [None]:
#param_grid = {
#    'preprocessor__num__scaler__with_mean': [True, False],
#    'preprocessor__num__scaler__with_std': [True, False],
#    'classifier__C': [0.1, 1, 10],
#    'classifier__solver': ['liblinear', 'newton-cg']
#}

#randomized_search = RandomizedSearchCV(pipeline, param_grid, n_iter=10, cv=3)

In [19]:
%%time

# Gradient-boosted trees with default hyper-parameters, trained on the
# preprocessed training features.
model = XGBClassifier()
model.fit(X_train_encoded, y_train)

# Hard class predictions for the preprocessed test features.
y_pred = model.predict(X_test_encoded)

ValueError: could not convert string to float: '6X1u10D1500000000'

In [None]:
# Scalar scores first, each printed on its own labelled line ...
for label, metric in (('Accuracy', accuracy_score), ('MCC', matthews_corrcoef)):
    print(f'{label}: {metric(y_test, y_pred)}')

# ... followed by the confusion matrix and the per-class report.
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

In [None]:
# The model was fit on the preprocessor's output, so the validation split has
# to go through the same fitted preprocessor before it can be scored
# (previously the raw X_val frame was passed to predict_proba).
X_val_encoded = preprocessor.transform(X_val)

# Predicted probability of the positive class (second column).
y_val_pred_proba = model.predict_proba(X_val_encoded)[:, 1]

In [None]:
# Candidate decision thresholds: 0.10 up to (but excluding) 0.925 in 0.01 steps.
candidate_thresholds = np.arange(0.1, 0.925, 0.01)

# F1 on the validation split when predicting positive above each threshold.
f1_by_threshold = [
    f1_score(y_val, y_val_pred_proba > cutoff)
    for cutoff in candidate_thresholds
]
thresholds = pd.DataFrame({'threshold': candidate_thresholds,
                           'f1': f1_by_threshold})

# Five best thresholds by F1.
thresholds.sort_values('f1', ascending=False).head()

In [None]:
# Decision threshold applied to the positive-class probability.
threshold = 0.30

# Score the *preprocessed* test features: the model was fit on
# X_train_encoded, so the raw X_test frame is not a valid input.
y_pred_proba = model.predict_proba(X_test_encoded)[:, 1]

# Predict positive whenever the probability exceeds the threshold.
y_pred = y_pred_proba > threshold
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
%%time

# Histogram-based gradient boosting. On data sets larger than 10,000 samples
# early stopping is enabled by default and holds out a random validation
# subset, so the seed is fixed (same value as the train/test split) to make
# the fit reproducible.
hgbc = HistGradientBoostingClassifier(random_state=27).fit(X_train_encoded, y_train)

y_pred_hgbc = hgbc.predict(X_test_encoded)

# Name used by the following cells; kept as an alias even though these are
# HistGradientBoosting (not MLP) predictions.
y_pred_mlp = y_pred_hgbc

In [None]:
# Confusion matrix for the predictions stored in y_pred_mlp
# (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred_mlp)

In [None]:
# Per-class precision / recall / F1; undefined ratios are reported as 0
# without emitting a warning.
hgbc_report = classification_report(y_true=y_test, y_pred=y_pred_mlp, zero_division=0)
print(hgbc_report)

In [None]:
%%time

# k-nearest-neighbours classifier with default settings.
# NOTE(review): the preprocessor passes unlisted columns through untouched;
# confirm the encoded matrices contain no NaN before fitting this model.
knc = KNeighborsClassifier()
knc.fit(X_train_encoded, y_train)

y_pred_knc = knc.predict(X_test_encoded)

In [None]:
# Confusion matrix for the k-NN test predictions
# (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred_knc)

In [None]:
# Per-class precision / recall / F1 for k-NN; undefined ratios are reported
# as 0 without emitting a warning.
knc_report = classification_report(y_true=y_test, y_pred=y_pred_knc, zero_division=0)
print(knc_report)