## Modeling ##

In [2]:
# pip install faiss_imputer

In [240]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, 
    confusion_matrix,
    r2_score,
    mean_squared_error, 
    root_mean_squared_error,
    mean_absolute_error, 
    mean_absolute_percentage_error
)
from sklearn.inspection import PartialDependenceDisplay
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from scipy.spatial import KDTree
from faiss_imputer import FaissImputer

from tqdm.notebook import tqdm, trange
from ipywidgets import interactive
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

This notebook is dedicated to the feature selection and statistical modeling of our trucking data.

In [242]:
df = pd.read_csv('../data/data_clean.csv', low_memory=False)

In [244]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057049 entries, 0 to 1057048
Data columns (total 47 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Unnamed: 0                 1057049 non-null  int64  
 1   RecordID                   1057049 non-null  int64  
 2   ESS_Id                     1057049 non-null  int64  
 3   EventTimeStamp             1057049 non-null  object 
 4   eventDescription           1006152 non-null  object 
 5   ecuSoftwareVersion         792713 non-null   object 
 6   ecuSerialNumber            751116 non-null   object 
 7   ecuModel                   1001026 non-null  object 
 8   ecuMake                    1001026 non-null  object 
 9   ecuSource                  1057049 non-null  int64  
 10  spn                        1057049 non-null  int64  
 11  fmi                        1057049 non-null  int64  
 12  active                     1057049 non-null  bool   
 13  activeTransi

In [252]:
df = df.drop('Unnamed: 0', axis=1)

In [246]:
object_columns_to_change_to_category = [        
    'eventDescription',  
    'ecuSoftwareVersion',          
    'ecuSerialNumber',         
    'ecuModel',           
    'ecuMake',                  
    'EquipmentID',                                 
    'LampStatus',              
    'ServiceDistance',            
    'Throttle'] 

In [248]:
for column in object_columns_to_change_to_category:
    df[column] = df[column].astype('category')

In [250]:
df['EventTimeStamp'] = pd.to_datetime(df['EventTimeStamp'])
df['LocationTimeStamp'] = pd.to_datetime(df['LocationTimeStamp'])

In [277]:
int_columns_to_categorical = ['RecordID',
'ESS_Id',                    
'ecuSource',                    
'spn',                       
'fmi',  
'active',                     
'activeTransitionCount',        
'MCTNumber',                  
'FaultId']

In [279]:
for column in int_columns_to_categorical:
    df[column] = df[column].astype('category')

In [272]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057049 entries, 0 to 1057048
Data columns (total 46 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   RecordID                   1057049 non-null  category      
 1   ESS_Id                     1057049 non-null  category      
 2   EventTimeStamp             1057049 non-null  datetime64[ns]
 3   eventDescription           1006152 non-null  category      
 4   ecuSoftwareVersion         792713 non-null   category      
 5   ecuSerialNumber            751116 non-null   category      
 6   ecuModel                   1001026 non-null  category      
 7   ecuMake                    1001026 non-null  category      
 8   ecuSource                  1057049 non-null  category      
 9   spn                        1057049 non-null  category      
 10  fmi                        1057049 non-null  category      
 11  active                     1057049 no

Scaling and encoding features for modeling

In [281]:
X = df.drop(['target',
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription',
            'ESS_Id',
            'ecuModel',
            'ecuMake',
            'ecuSource',
            'ecuSoftwareVersion',
            'ecuSerialNumber',
            'MCTNumber',
            'ServiceDistance',
            'Latitude',
            'Longitude'], axis=1)

y = df['target'].values

In [283]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=27, stratify=y)

In [285]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057049 entries, 0 to 1057048
Data columns (total 34 columns):
 #   Column                     Non-Null Count    Dtype   
---  ------                     --------------    -----   
 0   RecordID                   1057049 non-null  category
 1   spn                        1057049 non-null  category
 2   fmi                        1057049 non-null  category
 3   active                     1057049 non-null  category
 4   activeTransitionCount      1057049 non-null  category
 5   EquipmentID                1057049 non-null  category
 6   Latitude                   1057049 non-null  float64 
 7   Longitude                  1057049 non-null  float64 
 8   FaultId                    1057049 non-null  category
 9   AcceleratorPedal           480660 non-null   float64 
 10  BarometricPressure         530902 non-null   float64 
 11  CruiseControlActive        1057049 non-null  bool    
 12  CruiseControlSetSpeed      521823 non-null   float64 
 1

In [114]:
categorical_features = ['spn', 'EquipmentID', 'CruiseControlActive', 'ParkingBrake', 'IgnStatus', 'Throttle', 'next_derate_timestamp', 'time_until_detate'] 
X_train[categorical_features] = X_train[categorical_features].astype("category")
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 739934 entries, 1035968 to 850462
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype   
---  ------                     --------------   -----   
 0   RecordID                   739934 non-null  int64   
 1   spn                        739934 non-null  category
 2   fmi                        739934 non-null  int64   
 3   active                     739934 non-null  bool    
 4   activeTransitionCount      739934 non-null  int64   
 5   EquipmentID                739934 non-null  category
 6   Latitude                   739934 non-null  float64 
 7   Longitude                  739934 non-null  float64 
 8   FaultId                    739934 non-null  int64   
 9   AcceleratorPedal           336870 non-null  float64 
 10  BarometricPressure         371934 non-null  float64 
 11  CruiseControlActive        364849 non-null  category
 12  CruiseControlSetSpeed      365583 non-null  float64 
 13  DistanceLtd  

In [120]:
numeric_features = [x for x in X_train.columns if x not in ['spn', 'EquipmentID', 'CruiseControlActive', 'IgnStatus', 'Throttle', 'next_derate_timestamp', 'time_until_detate']] 
categorical_features = ['spn', 'EquipmentID', 'CruiseControlActive', 'ParkingBrake', 'IgnStatus', 'Throttle', 'next_derate_timestamp', 'time_until_detate'] 

numeric_pipe = Pipeline(
    steps=[
        ('scale', StandardScaler()),
        ('faiss_imputer', FaissImputer(n_neighbors=3, strategy = 'mean')),
    ]
)

categorical_pipe = Pipeline(
    steps=[
        ('faiss_imputer', FaissImputer(n_neighbors=3, strategy = 'mode')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)


ct = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipe, numeric_features),
        ('categorical', categorical_pipe, categorical_features)
    ]
)

pipe = Pipeline(
    steps=[
        ('transformer', ct),
        ('lightgbm', LGBMClassifier(
        ))
    ]
).fit(X, y)
y_pred = pipe.predict(X_test_encoded)
print(classification_report(y_test, y_pred, zero_division = 0))
feature_importances = pd.Series(model.feature_importances_, index=X_train.columns)
feature_importances.sort_values(ascending=False)
print(feature_importances)

  column_agg = np.nanmean(selected_values, axis=0)


ValueError: could not convert string to float: 'R1762'

In [64]:
np.where(df == 'R1762')

(array([4166, 4335, 4337, 4389, 6250, 6258], dtype=int64),
 array([11, 11, 11, 11, 11, 11], dtype=int64))

In [70]:
df.columns[12]

'Latitude'

In [88]:
df[df.isin(['R1762'])].any(axis=0)

RecordID                     False
ESS_Id                       False
EventTimeStamp               False
eventDescription             False
ecuModel                     False
ecuMake                      False
ecuSource                    False
spn                          False
fmi                          False
active                       False
activeTransitionCount        False
EquipmentID                   True
Latitude                     False
Longitude                    False
LocationTimeStamp            False
FaultId                      False
AcceleratorPedal             False
BarometricPressure           False
CruiseControlActive          False
CruiseControlSetSpeed        False
DistanceLtd                  False
EngineCoolantTemperature     False
EngineLoad                   False
EngineOilPressure            False
EngineOilTemperature         False
EngineRpm                    False
EngineTimeLtd                False
FuelLevel                    False
FuelLtd             

In [104]:
df.loc[df['EquipmentID'] == 'R1762']['EquipmentID'].astype(float)

ValueError: could not convert string to float: 'R1762'