## Modeling ##

In [2]:
# pip install faiss_imputer

In [42]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report, 
    confusion_matrix,
    r2_score,
    mean_squared_error, 
    root_mean_squared_error,
    mean_absolute_error, 
    mean_absolute_percentage_error
)
from sklearn.inspection import PartialDependenceDisplay
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from scipy.spatial import KDTree
from faiss_imputer import FaissImputer

from tqdm.notebook import tqdm, trange
from ipywidgets import interactive
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report, accuracy_score, confusion_matrix, 
    f1_score, fbeta_score, 
    matthews_corrcoef, brier_score_loss
)

This notebook is dedicated to the feature selection and statistical modeling of our trucking data.

In [5]:
df = pd.read_csv('../data/data_clean.csv', low_memory=False)

In [6]:
#df_sample = df.sample(frac=0.3, random_state=42)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057049 entries, 0 to 1057048
Data columns (total 47 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Unnamed: 0                 1057049 non-null  int64  
 1   RecordID                   1057049 non-null  int64  
 2   ESS_Id                     1057049 non-null  int64  
 3   EventTimeStamp             1057049 non-null  object 
 4   eventDescription           1006152 non-null  object 
 5   ecuSoftwareVersion         792713 non-null   object 
 6   ecuSerialNumber            751116 non-null   object 
 7   ecuModel                   1001026 non-null  object 
 8   ecuMake                    1001026 non-null  object 
 9   ecuSource                  1057049 non-null  int64  
 10  spn                        1057049 non-null  int64  
 11  fmi                        1057049 non-null  int64  
 12  active                     1057049 non-null  bool   
 13  activeTransi

In [8]:
df = df.drop('Unnamed: 0', axis=1)

In [9]:
object_columns_to_change_to_category = [        
    'eventDescription',  
    'ecuSoftwareVersion',          
    'ecuSerialNumber',         
    'ecuModel',           
    'ecuMake',                  
    'EquipmentID',                                 
    'LampStatus',              
    'ServiceDistance',
    'next_derate_timestamp',
    'time_until_detate'] 

In [10]:
for column in object_columns_to_change_to_category:
    df[column] = df[column].astype('category')

In [11]:
df['EventTimeStamp'] = pd.to_datetime(df['EventTimeStamp'])
df['LocationTimeStamp'] = pd.to_datetime(df['LocationTimeStamp'])

In [12]:
int_columns_to_categorical = ['RecordID',
'ESS_Id',                    
'ecuSource',                    
'spn',                       
'fmi',  
'active',       
'MCTNumber',                  
'FaultId']

In [13]:
for column in int_columns_to_categorical:
    df[column] = df[column].astype('category')

In [14]:
float64_cols = df.select_dtypes(include=['float64']).columns
df[float64_cols] = df[float64_cols].astype('float32')

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057049 entries, 0 to 1057048
Data columns (total 46 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   RecordID                   1057049 non-null  category      
 1   ESS_Id                     1057049 non-null  category      
 2   EventTimeStamp             1057049 non-null  datetime64[ns]
 3   eventDescription           1006152 non-null  category      
 4   ecuSoftwareVersion         792713 non-null   category      
 5   ecuSerialNumber            751116 non-null   category      
 6   ecuModel                   1001026 non-null  category      
 7   ecuMake                    1001026 non-null  category      
 8   ecuSource                  1057049 non-null  category      
 9   spn                        1057049 non-null  category      
 10  fmi                        1057049 non-null  category      
 11  active                     1057049 no

Scaling and encoding features for modeling

In [17]:
X = df.drop(['target',
            'LocationTimeStamp',
            'EventTimeStamp',
            'eventDescription',
            'ESS_Id',
            'ecuModel',
            'ecuMake',
            'ecuSource',
            'ecuSoftwareVersion',
            'ecuSerialNumber',
            'MCTNumber',
            'ServiceDistance',
            'Latitude',
            'Longitude',
            'RecordID',
            'next_derate_timestamp',
            'time_until_detate',
            'FaultId'], axis=1)

y = df['target'].values

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 321, train_size = 0.8)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify = y_train, random_state = 321, train_size = 0.6/0.8)

In [19]:
categorical_features = ['spn', 'fmi', 'EquipmentID', 'LampStatus'] 
bool_features = ['CruiseControlActive', 'ParkingBrake', 'IgnStatus', 'active']
numeric_features = [x for x in X_train.columns if x not in categorical_features + bool_features] 

numeric_pipe = Pipeline(
    steps=[
        ('scale', StandardScaler()),
        ('faiss_imputer', FaissImputer(n_neighbors=3, strategy = 'mean')),
    ]
)

categorical_pipe = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
        ('KNN_imputer', SimpleImputer(strategy='most_frequent')),
    ]
)

bool_pipe = Pipeline(
    steps=[
        ('bool_impute', SimpleImputer(strategy='most_frequent'))
    ]
)

ct = ColumnTransformer(
    transformers=[
        ('numeric', numeric_pipe, numeric_features),
        ('categorical', categorical_pipe, categorical_features),
        ('bool', bool_pipe, bool_features)
    ]
)

pipe = Pipeline(
    steps=[
        ('transformer', ct),
        ('lightgbm', LGBMClassifier(
        ))
    ]
).fit(X_train, y_train)

  column_agg = np.nanmean(selected_values, axis=0)


[LightGBM] [Info] Number of positive: 1510, number of negative: 632719
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009574 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6089
[LightGBM] [Info] Number of data points in the train set: 634229, number of used features: 1229
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002381 -> initscore=-6.037917
[LightGBM] [Info] Start training from score -6.037917


In [20]:
# - there rae 1763 positive samples and 738,171 negative samples. This suggests dataset is highly imbalanced, which may require adjustments like class weighting or resampling.
# - LightGBM uses histogram-based learning instead of splitting values directly. This means the dataset has 6531 unique bin partitions, which can improve training efficiency.
# - there are 739,934 rows being used for training. the model is leveraging 1270 features in total.
# - Since it's a binary classification task, LightGBM calculates an initial score (-6.037159). This score is derived from the prior probability (pavg=0.002383), meaning only 0.238% of samples are positive. The low prior probability affects LightGBM's initial weight adjustments to balance the learning process.

# Next steps:
# Given the severe class imbalance, I might: ✔ Use scale_pos_weight in LightGBM to balance classes ✔ Apply oversampling techniques (SMOTE, ADASYN) if necessary ✔ Tune the learning rate for better convergence

In [34]:
y_pred = pipe.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

  column_agg = np.nanmean(selected_values, axis=0)


Accuracy: 0.9884206045125585
MCC: 0.14917750358416457
[[208798   2108]
 [   340    164]]
              precision    recall  f1-score   support

       False       1.00      0.99      0.99    210906
        True       0.07      0.33      0.12       504

    accuracy                           0.99    211410
   macro avg       0.54      0.66      0.56    211410
weighted avg       1.00      0.99      0.99    211410



In [None]:
#feature_importances = pd.Series(pipe.feature_importances_, index=X_train.columns)
#feature_importances.sort_values(ascending=False)
#print(feature_importances)

In [44]:
y_val_pred_proba = pipe.predict_proba(X_val)[:,1]
candidate_thresholds = np.arange(start = 0.1, stop = 0.925, step = 0.01)
thresholds = pd.DataFrame({'threshold': candidate_thresholds})
thresholds['f1'] = thresholds['threshold'].apply(lambda x: f1_score(y_val, y_val_pred_proba > x))
thresholds.sort_values('f1', ascending = False).head()

Unnamed: 0,threshold,f1
2,0.12,0.189059
0,0.1,0.18771
1,0.11,0.187661
3,0.13,0.186441
4,0.14,0.181818


In [None]:
# threshold = 0.31
# y_pred_proba = lr.predict_proba(X_test)[:,1]

# y_pred = y_pred_proba > threshold
# print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
# print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))

In [48]:
threshold = 0.12
y_pred_proba = pipe.predict_proba(X_test)[:,1]

y_pred = y_pred_proba > threshold
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

  column_agg = np.nanmean(selected_values, axis=0)


Accuracy: 0.9818646232439336
MCC: 0.17652521389050582
[[207324   3582]
 [   252    252]]
              precision    recall  f1-score   support

       False       1.00      0.98      0.99    210906
        True       0.07      0.50      0.12       504

    accuracy                           0.98    211410
   macro avg       0.53      0.74      0.55    211410
weighted avg       1.00      0.98      0.99    211410

