## Modeling ##

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, 
    confusion_matrix,
    r2_score,
    mean_squared_error, 
    root_mean_squared_error,
    mean_absolute_error, 
    mean_absolute_percentage_error
)
from sklearn.inspection import PartialDependenceDisplay
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from scipy.spatial import KDTree

from faiss_imputer import FaissImputer

from xgboost import XGBClassifier

This notebook is dedicated to the feature selection and statistical modeling of our trucking data.

In [2]:
# Load the cleaned dataset. low_memory=False makes pandas parse the file in a
# single pass so column dtypes are inferred from the whole column.
df = pd.read_csv(
    filepath_or_buffer='../data/data_clean.csv',
    low_memory=False,
)

In [3]:
# Store `spn` as object dtype so it is excluded from the numeric columns
# selected later for scaling and can be one-hot encoded instead.
df['spn'] = df['spn'].astype(object)

# Drop the 'Unnamed: 0' column (presumably an index column written out by an
# earlier to_csv -- confirm). errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [4]:
# Summarize the loaded frame: row count plus per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057049 entries, 0 to 1057048
Data columns (total 43 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   AcceleratorPedal           480660 non-null   float64
 1   BarometricPressure         530902 non-null   float64
 2   CruiseControlActive        520765 non-null   object 
 3   CruiseControlSetSpeed      521823 non-null   float64
 4   DistanceLtd                530760 non-null   float64
 5   EngineCoolantTemperature   530890 non-null   float64
 6   EngineLoad                 530421 non-null   float64
 7   EngineOilPressure          531008 non-null   float64
 8   EngineOilTemperature       529370 non-null   float64
 9   EngineRpm                  531324 non-null   float64
 10  EngineTimeLtd              527047 non-null   float64
 11  FuelLevel                  455471 non-null   float64
 12  FuelLtd                    530354 non-null   float64
 13  FuelRate    

Selecting the features and target, splitting into train and test sets, then scaling and encoding the features for modeling

In [5]:
# Label vector.
y = df['target']

# Feature matrix: every column except the label itself and the
# target-window, timestamp and event-description columns listed here.
X = df.drop(
    columns=[
        'target',
        'target_time_max',
        'target_time_min',
        'LocationTimeStamp',
        'EventTimeStamp',
        'eventDescription',
    ]
)

In [6]:
# Hold out 30% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=27,
)

In [None]:
# Columns routed through the one-hot encoding branch.
ohe_features = ['spn', 'EquipmentID']

# Every numeric column of the training frame goes through the scaling branch.
scale_features = X_train.select_dtypes(include=np.number).columns

# Categorical branch: one-hot encode into a dense array (categories not seen
# during fit are ignored rather than raising), then apply the FAISS imputer
# with the 'median' strategy.
categorical_pipeline = Pipeline(steps=[
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ('faiss_imputer', FaissImputer(strategy='median', n_neighbors=3)),
])

# Numeric branch: standardize first, then apply the FAISS imputer with the
# 'mean' strategy over 3 neighbours.
numerical_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('faiss_imputer', FaissImputer(strategy='mean', n_neighbors=3)),
])

# Columns not named in either branch are passed through untouched.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_pipeline, scale_features),
        ('cat', categorical_pipeline, ohe_features),
    ],
    remainder='passthrough',
)

# Fit on the training split only, then reuse the fitted transformer on the
# test split so no test-set statistics influence the preprocessing.
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

  column_agg = np.nanmean(selected_values, axis=0)


In [None]:
# Train a gradient-boosted classifier with library-default hyperparameters
# on the preprocessed training split.
model = XGBClassifier()
model.fit(X_train_encoded, y_train)

# Predicted class labels for the held-out split.
y_pred = model.predict(X_test_encoded)

In [None]:
# Per-class precision / recall / F1 on the held-out split. zero_division=0
# reports 0 (without a warning) for any metric whose denominator is zero.
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        zero_division=0,
    )
)