## Modeling ##

In [189]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, 
    confusion_matrix,
    r2_score,
    mean_squared_error, 
    root_mean_squared_error,
    mean_absolute_error, 
    mean_absolute_percentage_error
)
from sklearn.inspection import PartialDependenceDisplay
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from scipy.spatial import KDTree
from faiss_imputer import FaissImputer

from tqdm.notebook import tqdm, trange
from ipywidgets import interactive

from xgboost import XGBClassifier

This notebook is dedicated to the feature selection and statistical modeling of our trucking data.

In [10]:
df = pd.read_csv('../data/data_clean.csv', low_memory=False, parse_dates=[26, 38, 39, 40])

In [52]:
df['spn'] = df['spn'].astype(object)
df = df.drop('Unnamed: 0', axis=1)

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057048 entries, 1 to 1057048
Data columns (total 41 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   AcceleratorPedal           480659 non-null   float64       
 1   BarometricPressure         530901 non-null   float64       
 2   CruiseControlActive        520764 non-null   object        
 3   CruiseControlSetSpeed      521822 non-null   float64       
 4   DistanceLtd                530759 non-null   float64       
 5   EngineCoolantTemperature   530889 non-null   float64       
 6   EngineLoad                 530420 non-null   float64       
 7   EngineOilPressure          531007 non-null   float64       
 8   EngineOilTemperature       529369 non-null   float64       
 9   EngineRpm                  531323 non-null   float64       
 10  EngineTimeLtd              527046 non-null   float64       
 11  FuelLevel                  455470 non

Scaling and encoding features for modeling

In [197]:
X = df.drop(['target', 
            'target_time_max', 
            'target_time_min', 
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription'], axis=1)

y = df['target']

In [201]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=27, stratify=y)

In [203]:
ohe_features = ['spn', 
                'EquipmentID',
                'ecuSoftwareVersion',
                'ecuSerialNumber',
                'Throttle',
                'ParkingBrake',
                'IgnStatus',
                'CruiseControlActive']

scale_features = [col for col in X_train.columns if col not in ohe_features]
                
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('faiss_imputer', FaissImputer(n_neighbors=3, strategy = 'mean'))
])
categorical_pipeline = Pipeline([
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ('faiss_imputer', FaissImputer(n_neighbors=3, strategy = 'mode')) 
    
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, scale_features),
        ('cat', categorical_pipeline, ohe_features)
    ],
    remainder='passthrough'
)

X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

  column_agg = np.nanmean(selected_values, axis=0)
  column_agg = np.nanmean(selected_values, axis=0)


In [205]:
model = RandomForestClassifier(n_estimators=100, random_state=27)
model.fit(X_train_encoded, y_train)

y_pred = model.predict(X_test_encoded)

In [207]:
print(classification_report(y_test, y_pred, zero_division = 0))

              precision    recall  f1-score   support

       False       0.97      1.00      0.99    308095
        True       0.94      0.01      0.03      9020

    accuracy                           0.97    317115
   macro avg       0.96      0.51      0.51    317115
weighted avg       0.97      0.97      0.96    317115



In [259]:
feature_importances = pd.Series(model.feature_importances_, index=X_train.columns)
feature_importances.sort_values(ascending=False)
print(feature_importances)

0       0.014209
1       0.019416
2       0.011422
3       0.026563
4       0.020489
          ...   
5372    0.001061
5373    0.001024
5374    0.001806
5375    0.001289
5376    0.000853
Length: 5377, dtype: float64
