## Modeling ##

In [1]:
import pickle
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer,SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import (
    classification_report, 
    confusion_matrix,
    r2_score,
    mean_squared_error, 
    root_mean_squared_error,
    mean_absolute_error, 
    mean_absolute_percentage_error,
    accuracy_score,
    matthews_corrcoef,
    brier_score_loss,
    f1_score
)
from sklearn.inspection import PartialDependenceDisplay
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from scipy.spatial import KDTree

from sklearn.linear_model import LogisticRegression

from faiss_imputer import FaissImputer

from xgboost import XGBClassifier

from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import HistGradientBoostingClassifier

import time

This notebook is dedicated to the feature selection and statistical modeling of our trucking data.

In [2]:
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [3]:
# Load the cleaned dataset; low_memory=False makes pandas read the file in one
# pass instead of chunk-wise dtype inference.
data_path = '../data/data_clean_05_01.csv'
df = pd.read_csv(data_path, low_memory=False)

In [4]:
# Chronological hold-out boundary: events on or after this date form the test set.
# NOTE(review): EventTimeStamp is read from CSV without date parsing, so this is
# presumably a string comparison; it orders correctly only for ISO-like
# timestamps -- confirm the column format.
test_date = '2019-01-01'

# Use `>=` rather than `>`: the training split keeps rows strictly before
# test_date, so a strict `>` here would silently drop rows falling exactly on
# the boundary from both partitions.
events_sorted = df.sort_values('EventTimeStamp')
df_test = events_sorted.loc[events_sorted['EventTimeStamp'] >= test_date]

In [5]:
# Training partition: every event strictly earlier than test_date, in time order.
# The boolean mask is built on the unsorted frame; .loc aligns it by index label.
is_before_cutoff = df['EventTimeStamp'] < test_date
df_train = df.sort_values('EventTimeStamp').loc[is_before_cutoff]

In [6]:
#df = df.sample(frac=0.50)

In [7]:
# Columns that are cast to the generic object dtype so they are not treated
# as numeric quantities downstream.
columns_to_object = [
    'ecuSource',
    'spn',
    'fmi',
    'MCTNumber',
    'RecordID',
    'ESS_Id',
    'LampStatus',
]

# Single astype call with a per-column mapping instead of a column-by-column loop.
df = df.astype({name: object for name in columns_to_object})

In [8]:
# Flag-like columns that are coerced to plain booleans.
# NOTE(review): astype(bool) maps NaN to True, so any missing readings in these
# columns become True here and are no longer visible to a later imputer --
# confirm this is intended.
columns_to_bool = [
    'CruiseControlActive',
    'IgnStatus',
    'ParkingBrake',
]

df[columns_to_bool] = df[columns_to_bool].astype(bool)

In [9]:
# Re-encode every boolean column as int64 (False -> 0, True -> 1).
# Despite its name, int64_cols holds the columns that are bool *before* the cast.
int64_cols = df.select_dtypes(include='bool').columns
df = df.astype({name: 'int64' for name in int64_cols})

Scaling and encoding features for modeling

In [10]:
# Print the per-column dtype and non-null count summary of df at this point,
# i.e. after the dtype conversions performed in the cells above.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057049 entries, 0 to 1057048
Data columns (total 45 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   AcceleratorPedal           480660 non-null   float64
 1   BarometricPressure         530902 non-null   float64
 2   CruiseControlActive        1057049 non-null  int64  
 3   CruiseControlSetSpeed      521823 non-null   float64
 4   DistanceLtd                530760 non-null   float64
 5   EngineCoolantTemperature   530890 non-null   float64
 6   EngineLoad                 530421 non-null   float64
 7   EngineOilPressure          531008 non-null   float64
 8   EngineOilTemperature       529370 non-null   float64
 9   EngineRpm                  531324 non-null   float64
 10  EngineTimeLtd              527047 non-null   float64
 11  FuelLevel                  455471 non-null   float64
 12  FuelLtd                    530354 non-null   float64
 13  FuelRate    

In [11]:
# Columns removed from the feature matrix: the label itself plus the columns
# this notebook does not feed to the models.
columns_to_drop = [
    'target',
    'LocationTimeStamp',
    'EventTimeStamp',
    'eventDescription',
    'ecuSerialNumber',
    'ecuSoftwareVersion',
    'time_derate',
    'time_until_derate',
    'Longitude',
    'Latitude',
    'ESS_Id',
    'RecordID',
    'ecuSource',
    'ServiceDistance',
    'ecuMake',
    'SwitchedBatteryVoltage',
    'EquipmentID',
    'ecuModel',
    'MCTNumber',
    'LampStatus',
    'activeTransitionCount',
]

# Feature matrix and label vector.
X = df.drop(columns=columns_to_drop)
y = df['target']

In [12]:
# Step 1: stratified 70/30 split into a working set and the held-out test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=27,
)

# Step 2: carve a validation set out of the working set. train_size is 0.6/0.8
# (about 75%) of the remaining 70%, so the overall proportions come out at
# roughly 52.5% train / 17.5% validation / 30% test.
# NOTE(review): the 0.6/0.8 ratio reads like it was written for an 80/20 first
# split (giving 60/20/20) -- confirm the intended proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    train_size=0.6/0.8,
    stratify=y_train,
    random_state=27,
)

In [13]:
# Columns that will be one-hot encoded.
ohe_features = ['spn', 'fmi']

# Columns currently stored as int64 in the training frame; they are routed
# through the "bool" branch of the preprocessing step.
bool_to_features = X_train.select_dtypes(include='int64')
bool_features = bool_to_features.columns.tolist()

# Continuous columns that are standardised and imputed.
scale_features = [
    'AcceleratorPedal',
    'BarometricPressure',
    'DistanceLtd',
    'EngineCoolantTemperature',
    'EngineLoad',
    'EngineOilPressure',
    'EngineOilTemperature',
    'EngineRpm',
    'FuelLevel',
    'FuelLtd',
    'FuelTemperature',
    'IntakeManifoldTemperature',
    'Speed',
    'TurboBoostPressure',
    'FuelRate',
    'EngineTimeLtd',
    'CruiseControlSetSpeed',
    'Throttle',
]

# Bookkeeping: the de-duplicated union of the three groups, and whichever
# training columns are not assigned to any group.
everything = list(set(ohe_features + bool_features + scale_features))
the_rest = X_train.columns.difference(everything)

# Dimensionality reduction down to a single principal component.
pca = PCA(n_components=1)

In [None]:
%%time

# Numeric branch: standardise first, then fill missing values from the
# 3 nearest neighbours in the scaled space.
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn_imputer', KNNImputer(n_neighbors=3)),
])

# Categorical branch: fill gaps with the mode, then one-hot encode into a dense
# array; categories unseen at fit time are encoded as all zeros.
categorical_pipeline = Pipeline(steps=[
    ('si', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Boolean branch: mode imputation only.
boolean_pipeline = Pipeline(steps=[
    ('si', SimpleImputer(strategy='most_frequent')),
])

# Route each feature group to its branch; columns in no group are dropped.
column_branches = [
    ('num', numerical_pipeline, scale_features),
    ('cat', categorical_pipeline, ohe_features),
    ('bool', boolean_pipeline, bool_features),
]
ct = ColumnTransformer(transformers=column_branches, remainder='drop')

# Full preprocessing chain: column-wise transforms followed by PCA.
pipe = Pipeline(steps=[('transformer', ct), ('pca', pca)])

# Fit on the training rows only, then apply the fitted chain to both splits.
pipe.fit(X_train)

X_train_transformed, X_test_transformed = (
    pipe.transform(split) for split in (X_train, X_test)
)

In [None]:
# Persist the fitted preprocessing pipeline together with the transformed
# train/test arrays so the slow preprocessing cell need not be re-run.
filename = 'pipe_transformed.pkl'

pickle_list = [pipe, X_train_transformed, X_test_transformed]

with open(filename, mode='wb') as out_file:
    out_file.write(pickle.dumps(pickle_list))

In [None]:
# Restore the cached pipeline and transformed arrays written by the cell above.
# The stored object is a 3-element list, unpacked in the order it was saved.
with open(filename, mode='rb') as in_file:
    restored = pickle.load(in_file)

pipe, X_train_transformed, X_test_transformed = restored

In [None]:
# k-nearest-neighbours classifier with default settings, trained on the
# preprocessed training features.
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_transformed, y_train)

In [None]:
# Hard class predictions of the fitted KNN model on the preprocessed test features.
y_pred_knn = knn_model.predict(X_test_transformed)

In [None]:
%%time

# Gradient-boosted trees with default hyper-parameters, trained on the
# preprocessed training features. The original referenced `x_train_transformed`
# (lower-case x), which is never defined and raised a NameError; the array
# produced by the preprocessing cell is `X_train_transformed`.
xgb = XGBClassifier().fit(X_train_transformed, y_train)

# Hard class predictions on the preprocessed test features.
y_pred_xgb = xgb.predict(X_test_transformed)

In [None]:
# Test-set evaluation of the XGBoost predictions: scalar scores first,
# then the confusion matrix and the per-class report.
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_mcc = matthews_corrcoef(y_test, y_pred_xgb)
xgb_confusion = confusion_matrix(y_test, y_pred_xgb)
xgb_report = classification_report(y_test, y_pred_xgb)

print(f'Accuracy: {xgb_accuracy}')
print(f'MCC: {xgb_mcc}')
print(xgb_confusion)
print(xgb_report)

In [None]:
# Test-set evaluation of the KNN predictions: scalar scores, confusion matrix
# and per-class report.
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_mcc = matthews_corrcoef(y_test, y_pred_knn)
knn_confusion = confusion_matrix(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn)

print(f'Accuracy: {knn_accuracy}')
print(f'MCC: {knn_mcc}')
print(knn_confusion)
print(knn_report)

# 3-fold cross-validation scores of the same estimator on the training features.
knn_cv_scores = cross_val_score(knn_model, X_train_transformed, y_train, cv=3)
print(knn_cv_scores)

In [None]:
#param_grid = {
#    'preprocessor__num__scaler__with_mean': [True, False],
#    'preprocessor__num__scaler__with_std': [True, False],
#    'classifier__C': [0.1, 1, 10],
#    'classifier__solver': ['liblinear', 'newton-cg']
#}

#randomized_search = RandomizedSearchCV(pipeline, param_grid, n_iter=10, cv=3)

In [None]:
#%%time

#knc = KNeighborsClassifier().fit(X_train_encoded, y_train)
#y_pred_knc = knc.predict(X_test_encoded)

In [None]:
#print(classification_report(y_test, y_pred_knc, zero_division = 0))

In [None]:
#confusion_matrix(y_test, y_pred_knc)

In [None]:
# Positive-class probabilities on the validation split, used for threshold tuning.
# `pipe` ends in PCA, so it is a pure transformer and has no predict_proba;
# calling it directly raised an AttributeError. The validation rows are
# therefore pushed through the fitted preprocessing pipeline and scored with
# the fitted KNN classifier.
# NOTE(review): knn_model is used because it is the classifier fitted on the
# transformed features above; swap in another fitted classifier if a different
# model was meant to be tuned.
X_val_transformed = pipe.transform(X_val)
y_val_pred_proba = knn_model.predict_proba(X_val_transformed)[:, 1]

In [None]:
# Sweep decision thresholds from 0.10 up to (but excluding) 0.925 in 0.01 steps
# and record the validation F1 score obtained at each cut-off.
candidate_thresholds = np.arange(start=0.1, stop=0.925, step=0.01)

thresholds = pd.DataFrame({'threshold': candidate_thresholds})
thresholds['f1'] = [
    f1_score(y_val, y_val_pred_proba > cutoff)
    for cutoff in thresholds['threshold']
]

# Display the five best-scoring thresholds.
thresholds.sort_values('f1', ascending=False).head()

In [None]:
# Decision threshold applied to the positive-class probability.
# NOTE(review): hard-coded; presumably chosen from the threshold sweep on the
# validation split -- confirm it matches the best-scoring row.
threshold = 0.10

# The original called `model.predict_proba(X_test)`, but no `model` is defined
# in this notebook and the classifiers here were trained on the preprocessed
# features, not the raw frame. Score the preprocessed test features with the
# fitted KNN classifier instead.
y_pred_proba = knn_model.predict_proba(X_test_transformed)[:, 1]

# Convert probabilities to hard predictions and report test-set metrics.
y_pred = y_pred_proba > threshold
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
%%time

# Histogram-based gradient boosting with default hyper-parameters.
# The original used `X_train_encoded` / `X_test_encoded`, which are never
# defined in this notebook (NameError); the preprocessed arrays are
# `X_train_transformed` / `X_test_transformed`.
hgbc = HistGradientBoostingClassifier().fit(X_train_transformed, y_train)

y_pred_hgbc = hgbc.predict(X_test_transformed)

# Backward-compatible alias: the evaluation cells below still read the
# predictions under this (historical, MLP-era) name.
y_pred_mlp = y_pred_hgbc

In [None]:
# Confusion matrix for the predictions stored in y_pred_mlp, which (despite the
# name) are produced by the HistGradientBoostingClassifier cell above.
confusion_matrix(y_test, y_pred_mlp)

In [None]:
# Per-class precision/recall/F1 for the predictions stored in y_pred_mlp;
# zero_division=0 reports 0 instead of warning when a class has no predictions.
print(classification_report(y_test, y_pred_mlp, zero_division = 0))