## Modeling ##

In [1]:
import time
import pickle
import os
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from matplotlib.colors import Colormap
import scipy.stats as stats
from numpy import interp
import scikitplot as skplt
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, SimpleImputer, IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    RandomizedSearchCV, 
    cross_val_score, 
    cross_val_predict
)
from sklearn.metrics import (
    classification_report, 
    confusion_matrix,
    r2_score,
    mean_squared_error, 
    root_mean_squared_error,
    mean_absolute_error, 
    mean_absolute_percentage_error,
    accuracy_score,
    matthews_corrcoef,
    brier_score_loss,
    f1_score
)
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.inspection import PartialDependenceDisplay
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from lightgbm import LGBMClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from imblearn.over_sampling import SMOTE

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [2]:
# Move up to the project root so relative paths such as 'data/...' resolve.
# Guarded so that re-running this cell does not climb one directory higher
# each time. NOTE(review): assumes the notebook's own folder has no 'data'
# sub-directory - confirm against the repository layout.
if not os.path.isdir('data'):
    os.chdir('..')
print(f'Current working directory is {os.getcwd()}')

Current working directory is C:\Users\cavin\Documents\NSS_Projects\Python\big-g-big-rigs


This notebook is dedicated to the feature selection and statistical modeling of our trucking data.

In [3]:
# Show every column when displaying wide DataFrames instead of truncating them.
pd.set_option('display.max_columns', None)

In [4]:
# Load the cleaned dataset. low_memory=False makes pandas infer each column's
# dtype from the whole file rather than chunk by chunk (avoids mixed dtypes).
df = pd.read_csv('data/data_clean_05_05.csv', low_memory=False)

Cleaning features and reassigning them to the proper dtypes

In [5]:
# Code/identifier columns that are stored as generic Python objects rather
# than as numbers.
columns_to_object = [
    'ecuSource',
    'spn',
    'fmi',
    'MCTNumber',
    'RecordID',
    'ESS_Id',
]

# Convert all listed columns in a single assignment.
df[columns_to_object] = df[columns_to_object].astype(object)

In [6]:
# Status-flag columns that are cast to bool.
columns_to_bool = ['CruiseControlActive',
                   'IgnStatus',
                   'ParkingBrake']

# NOTE(review): astype(bool) maps NaN to True, so missing values in these
# columns would silently become True here - confirm the cleaned data has none.
for column in columns_to_bool:
    df[column] = df[column].astype(bool)

In [7]:
# Despite its name, int64_cols holds the columns that are currently bool;
# they are re-encoded as 0/1 integers.
int64_cols = df.select_dtypes(include='bool').columns
df[int64_cols] = df[int64_cols].astype('int64')

Separating the test and training data.

In [8]:
# Time-based hold-out: rows stamped on or after this date form the test set.
test_date = '2019-01-01'

# >= (not >) so that a row stamped exactly test_date is not left out of both
# splits; the training cell keeps only rows strictly before test_date.
df_test = df.sort_values('EventTimeStamp').loc[df['EventTimeStamp'] >= test_date]

In [9]:
# Training set: rows strictly before test_date, sorted by EventTimeStamp.
df_train = df.sort_values('EventTimeStamp').loc[df['EventTimeStamp'] < test_date]

In [10]:
#df = df.sample(frac=0.50)

Scaling and encoding features for modeling.

In [11]:
# Columns removed before modeling: the label itself plus the other columns
# that are not used as features.
columns_to_drop = [
    'target',
    'LocationTimeStamp',
    'EventTimeStamp',
    'eventDescription',
    'ecuSerialNumber',
    'ecuSoftwareVersion',
    'time_derate',
    'time_until_derate',
    'Longitude',
    'Latitude',
    'ESS_Id',
    'RecordID',
    'ecuModel',
    'ServiceDistance',
    'ecuMake',
    'SwitchedBatteryVoltage',
    'MCTNumber',
    'EquipmentID',
    'LampStatus',
    'CruiseControlSetSpeed',
    'EngineLoad',
    'TurboBoostPressure',
    'DistanceLtd',
]

# Label vector and feature matrix for the training period.
y_train = df_train['target']
X_train = df_train.drop(columns=columns_to_drop)

In [12]:
# Build the test feature matrix from exactly the columns kept for X_train.
# This replaces a second hand-maintained copy of the drop list, so the train
# and test matrices can never drift apart in columns or column order.
X_test = df_test[X_train.columns]

y_test = df_test['target']

# NOTE(review): X and y are not defined anywhere in the visible notebook, so
# this line would raise NameError if executed. It would also replace the
# date-based train/test split with a random stratified one - confirm whether
# this is leftover code.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=27, stratify=y)

# Carves a validation set (X_val, y_val) out of the training portion.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify = y_train, random_state = 27, train_size = 0.6/0.8)

Selection of features for each step of the pipeline. The last few lines are for checking to make sure each feature is accounted for.

In [13]:
# Features routed to the one-hot-encoding branch of the pipeline.
ohe_features = ['spn',
                'fmi',
                'ecuSource'
                ] 

# Features routed to the boolean branch of the pipeline.
bool_features = ['CruiseControlActive',
                 'IgnStatus',
                 'ParkingBrake',
                 'active'
                ]

# Features routed to the scaling branch; commented-out entries are excluded.
scale_features = [
                  #'AcceleratorPedal',
                  'BarometricPressure',
                  'EngineCoolantTemperature',
                  'EngineOilPressure',
                  'EngineOilTemperature',
                  #'EngineRpm',
                  'FuelLevel',
                  'activeTransitionCount',
                  'FuelLtd',
                  'FuelTemperature',
                  #'IntakeManifoldTemperature',
                  'Speed',
                  #'FuelRate',
                  'EngineTimeLtd',
                  'Throttle'
                 ]

# Bookkeeping: `the_rest` holds the X_train columns not assigned to any list
# above. It is computed but not displayed in this cell.
everything = list(set(ohe_features + bool_features + scale_features))
the_rest = X_train.columns.difference(everything)
# NOTE(review): this PCA object is not referenced again in the visible notebook.
pca = PCA(n_components = 5)

In [14]:
%%time

# Numeric branch: standardise, then fill gaps with a model-based imputer.
# sample_posterior=True draws random imputations, so a fixed random_state is
# needed for the fitted pipe (and everything downstream) to be reproducible.
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ii', IterativeImputer(initial_strategy = 'mean',
                           max_iter = 30,
                           sample_posterior = True,
                           random_state = 27))
])

# Categorical branch: impute BEFORE one-hot encoding. Imputing the encoded
# indicator matrix (the previous order) cannot fill a missing category,
# because the missing value is already gone once the column has been encoded.
categorical_pipeline = Pipeline([
    ('si', SimpleImputer(strategy = 'most_frequent')),
    ('ohe', OneHotEncoder(categories='auto',
                          handle_unknown = 'ignore'))
])

# Boolean branch.
# NOTE(review): posterior sampling produces continuous values rather than
# strict 0/1 flags - confirm this is acceptable for the downstream models.
boolean_pipeline = Pipeline([
    ('ii', IterativeImputer(initial_strategy = 'most_frequent',
                           max_iter = 30,
                           sample_posterior = True,
                           random_state = 27))
])

# Route each feature list to its branch; columns in none of the lists are dropped.
ct = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, scale_features),
        ('cat', categorical_pipeline, ohe_features),
        ('bool', boolean_pipeline, bool_features)
    ],
    remainder='drop'
)

pipe = Pipeline(
    steps=[
        ('transformer', ct)
    ])

# Fit on the training data only, so no test-set statistics leak into the
# scaler, imputers or encoder.
pipe.fit(X_train, y_train)

CPU times: total: 6min 31s
Wall time: 3min 43s


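The cell above fits the pipe (about 4 minutes of wall time in the run recorded here), and the cell below applies the fitted pipe to the training and test sets (about 2 minutes). Timings will vary by machine.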

In [15]:
%%time

# Apply the already-fitted pipe to both splits (no refitting on the test data).
X_train_transformed = pipe.transform(X_train)
X_test_transformed = pipe.transform(X_test) 

CPU times: total: 2min 58s
Wall time: 2min


In [16]:
# Oversample the training data with SMOTE; the test set is left untouched.
# A fixed random_state makes the synthetic samples (and every model trained
# on them) reproducible between runs.
smote = SMOTE(random_state=27)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_transformed, y_train)

Saving the fitted pipe together with the transformed (and SMOTE-balanced) data as a pickle object so that the pipe doesn't need to be fitted again.

In [17]:
# Persist the fitted pipe and the prepared matrices so the slow fit/transform
# cells can be skipped in a later session. The load cell must unpack the
# items in this same order.
filename = 'pipe_transformed.pkl'

pickle_list = [pipe, X_train_balanced, X_test_transformed, y_train_balanced]

with open(filename, 'wb') as file:
    pickle.dump(pickle_list, file)

In [18]:
# Restore the fitted pipe and prepared matrices written by the save cell.
# Only load pickle files you created yourself: unpickling can run arbitrary code.
filename = 'pipe_transformed.pkl'

with open(filename, 'rb') as file:
    pipe, X_train_balanced, X_test_transformed, y_train_balanced = pickle.load(file)

Fitting several models on the transformed data to see which model performs best.

%%time

# k-nearest-neighbours classifier with default settings, fit on the balanced training data.
knn_model = KNeighborsClassifier().fit(X_train_balanced, y_train_balanced)

%%time

# Class predictions for the held-out test set.
y_pred_knn = knn_model.predict(X_test_transformed)

filename = 'knn.pkl'

# Saved as [predictions, model]; the load below must unpack in the same order.
pickle_list = [y_pred_knn, knn_model]

with open(filename, 'wb') as file:
    pickle.dump(pickle_list, file)

filename = 'knn.pkl'

# Unpack in the order the list was written. The names were previously
# swapped here, which left knn_model holding the prediction array and
# y_pred_knn holding the fitted classifier.
with open(filename, 'rb') as file:
    y_pred_knn, knn_model = pickle.load(file)

Prediction on KNeighborsClassifier takes a while

# Test-set metrics for KNN: accuracy, Matthews correlation coefficient,
# confusion matrix and per-class report. zero_division=0 sets undefined
# precision/recall values to 0.
print(f'Accuracy: {accuracy_score(y_test, y_pred_knn)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred_knn)}')
print(confusion_matrix(y_test, y_pred_knn))
print(classification_report(y_test, y_pred_knn, zero_division = 0))
#print(cross_val_score(knn_model, X_train_transformed, y_train, cv=3))

In [26]:
# Hyperopt search space for the XGBoost classifier. quniform entries are
# sampled on a grid with step 1 but are returned as floats; 'n_estimators'
# and 'seed' are fixed values rather than sampled ones.
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
    }

In [27]:
def objective(space):
    """Hyperopt objective: fit one XGBoost classifier and score it.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. Must provide 'n_estimators',
        'max_depth', 'gamma', 'reg_alpha', 'min_child_weight' and
        'colsample_bytree'. 'reg_lambda' and 'seed' are used when present;
        otherwise the XGBoost defaults apply.

    Returns
    -------
    dict
        {'loss': -accuracy, 'status': STATUS_OK}. Hyperopt minimises the
        loss, so the accuracy is negated.
    """
    clf = xgb.XGBClassifier(
        n_estimators=space['n_estimators'],
        max_depth=int(space['max_depth']),  # quniform yields floats
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        # reg_lambda and seed were sampled but never passed to the model;
        # they are now forwarded so those dimensions actually take effect.
        reg_lambda=space.get('reg_lambda'),
        min_child_weight=space['min_child_weight'],
        colsample_bytree=space['colsample_bytree'],
        random_state=space.get('seed'),
        eval_metric="auc",
        early_stopping_rounds=10)

    # NOTE(review): both early stopping and the returned score use the test
    # set, so the tuned hyperparameters are selected on the same data later
    # used to report performance. A separate validation split would give an
    # unbiased estimate.
    evaluation = [(X_train_balanced, y_train_balanced), (X_test_transformed, y_test)]

    clf.fit(X_train_balanced, y_train_balanced,
            eval_set=evaluation,
            verbose=False)

    # predict() already returns class labels, so no thresholding is needed.
    pred = clf.predict(X_test_transformed)
    accuracy = accuracy_score(y_test, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
# Run the TPE search for up to 100 evaluations of `objective`; `trials`
# records every evaluation.
# NOTE(review): no random state is passed to fmin, so the search presumably
# differs between runs - confirm whether reproducibility matters here.
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

SCORE:                                                                         
0.9587679723026971                                                             
SCORE:                                                                         
0.9625620005202213                                                             
  2%|▏      | 2/100 [00:27<22:55, 14.03s/trial, best loss: -0.9625620005202213]

In [None]:
# Show the best point found by the search.
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

In [None]:
%%time

# Every tuning argument below is commented out, so the classifier is built
# with XGBoost's default settings.
xgbm = XGBClassifier(
    # learning_rate = 0.2,
    # objective="binary:logistic",
    # colsample_bytree = 0.8,
    # gamma = 0.1,
    # max_depth = 6,
    # min_child_weight = 3,
    # alpha = 10.0,
    # n_estimators = 100,
    # verbosity = 2
    # max_delta_step = 5
    # scale_pos_weight = 3
).fit(X_train_balanced, y_train_balanced)

# Class predictions for the held-out test set.
y_pred_xgbm = xgbm.predict(X_test_transformed)

In [None]:
# Create the sized figure first and draw into its axes. Calling plt.figure()
# after plot_importance() opened a second, empty figure and left the
# importance plot at the default size.
fig, ax = plt.subplots(figsize = (16, 12))
plot_importance(xgbm, ax = ax)
#plt.savefig("graphs/feature_importance.png")
plt.show();

In [None]:
# Persist [predictions, model]; the load cell must unpack in the same order.
filename = 'xgbm.pkl'

pickle_list = [y_pred_xgbm, xgbm]

with open(filename, 'wb') as file:
    pickle.dump(pickle_list, file)

In [None]:
# Restore the predictions and fitted model, in the order they were saved.
filename = 'xgbm.pkl'

with open(filename, 'rb') as file:
    y_pred_xgbm, xgbm = pickle.load(file)

In [None]:
# Test-set metrics for the XGBoost model.
print(f'Accuracy: {accuracy_score(y_test, y_pred_xgbm)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred_xgbm)}')
print(confusion_matrix(y_test, y_pred_xgbm))
print(classification_report(y_test, y_pred_xgbm, zero_division = 0))

In [None]:
# Per-class predicted probabilities for the test set (one column per class).
probas = xgbm.predict_proba(X_test_transformed)

In [None]:
# Precision-recall curves for the XGBoost probabilities.
skplt.metrics.plot_precision_recall(y_test, probas)
plt.show()

In [None]:
def target_pred(df, event, equipment, target, pred,
                false_positive_cost=500, false_negative_cost=4000,
                true_positive_saving=4000, max_gap_hours=2):
    """Score predictions as a net value, counted per episode rather than per row.

    Rows are sorted by equipment and time. Within one piece of equipment, a
    gap of more than ``max_gap_hours`` between consecutive rows starts a new
    episode. Each (target, prediction) outcome is counted at most once per
    episode, and the counts are weighted by the given costs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``event``, ``equipment`` and ``target`` columns.
    event : str
        Timestamp column (parsed with ``pd.to_datetime``).
    equipment : str
        Column identifying the piece of equipment.
    target : str
        Column holding the true 0/1 label.
    pred : array-like
        Predicted labels, aligned with the rows of ``df``. Anything castable
        to int is accepted (int, bool, 0.0/1.0 floats).
    false_positive_cost, false_negative_cost, true_positive_saving : number
        Weights per episode-level outcome. The defaults reproduce the
        previously hard-coded 500 / 4000 / 4000. True negatives are worth 0.
    max_gap_hours : number
        Largest gap, in hours, that keeps two rows in the same episode.

    Returns
    -------
    number
        ``true_positive_saving * TP - false_positive_cost * FP
        - false_negative_cost * FN``.
    """
    df = df[[event, equipment, target]].copy()
    df['predict'] = pred
    df[event] = pd.to_datetime(df[event])
    df = df.sort_values(by=[equipment, event])

    # Hours since the previous row of the same equipment (NaN on its first row).
    time_diff = df.groupby(equipment)[event].diff().dt.total_seconds() / 3600
    # A new episode starts wherever the gap exceeds the limit. NaN compares
    # False, so the first row of each equipment does not start a new episode
    # by itself; the equipment key below keeps episodes apart.
    df['temp_group'] = (time_diff > max_gap_hours).cumsum()

    # Normalise both labels to int so bool or float predictions still yield
    # the '0_1'-style keys used below.
    df['combined'] = (df[target].astype(int).astype(str)
                      + '_' + df['predict'].astype(int).astype(str))

    # One row per (equipment, episode, outcome): each outcome counts once per episode.
    episodes = df[[equipment, 'temp_group', 'combined']].drop_duplicates()

    # Look outcomes up by name and default absent ones to 0. Positional column
    # access mis-assigned the costs (or raised IndexError) whenever one of the
    # four outcomes did not occur in the data.
    outcome_counts = episodes['combined'].value_counts().reindex(
        ['0_0', '0_1', '1_0', '1_1'], fill_value=0)

    counts = (true_positive_saving * outcome_counts['1_1']
              - false_positive_cost * outcome_counts['0_1']
              - false_negative_cost * outcome_counts['1_0'])
    return counts

In [None]:
# Net value of the XGBoost predictions on the test period, as computed by target_pred.
target_pred(df_test, 'EventTimeStamp', 'EquipmentID', 'target', y_pred_xgbm)

In [None]:
%%time

# Small random forest: 5 trees, maximum depth 5, fixed seed.
rfc = RandomForestClassifier(n_estimators=5, max_depth=5, random_state=27).fit(X_train_balanced, y_train_balanced)

# Class predictions for the held-out test set.
y_pred_rfc = rfc.predict(X_test_transformed)

In [None]:
# Persist [predictions, model]; the load cell must unpack in the same order.
filename = 'rfc.pkl'

pickle_list = [y_pred_rfc, rfc]

with open(filename, 'wb') as file:
    pickle.dump(pickle_list, file)

In [None]:
# Restore the predictions and fitted model, in the order they were saved.
filename = 'rfc.pkl'

with open(filename, 'rb') as file:
    y_pred_rfc, rfc = pickle.load(file)

In [None]:
# Test-set metrics for the random forest.
print(f'Accuracy: {accuracy_score(y_test, y_pred_rfc)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred_rfc)}')
print(confusion_matrix(y_test, y_pred_rfc))
print(classification_report(y_test, y_pred_rfc, zero_division = 0))

In [None]:
# Confusion matrix for the random forest, shown as raw counts (normalize=False).
skplt.metrics.plot_confusion_matrix(y_test, y_pred_rfc, normalize=False)
plt.show()

In [None]:
%%time

# Logistic regression; max_iter is raised to 10000 iterations.
logreg = LogisticRegression(max_iter=10000).fit(X_train_balanced, y_train_balanced)

# Class predictions for the held-out test set.
y_pred_logreg = logreg.predict(X_test_transformed)

In [None]:
# Persist [predictions, model]; the load cell must unpack in the same order.
filename = 'logreg.pkl'

pickle_list = [y_pred_logreg, logreg]

with open(filename, 'wb') as file:
    pickle.dump(pickle_list, file)

In [None]:
# Restore the predictions and fitted model, in the order they were saved.
filename = 'logreg.pkl'

with open(filename, 'rb') as file:
    y_pred_logreg, logreg = pickle.load(file)

In [None]:
# Test-set metrics for logistic regression.
print(f'Accuracy: {accuracy_score(y_test, y_pred_logreg)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred_logreg)}')
print(confusion_matrix(y_test, y_pred_logreg))
print(classification_report(y_test, y_pred_logreg, zero_division = 0))

In [None]:
# Attach the logistic-regression predictions to the test frame. This mutates
# df_test in place, so df_test carries the extra column from here on.
df_test['predictions'] = y_pred_logreg

In [None]:
# Narrow view used to inspect the predictions next to the true label.
pred_df = df_test[['EventTimeStamp', 'EquipmentID', 'spn', 'target', 'predictions']]

In [None]:
# Distinct rows where label and prediction are both 1 (true positives).
pred_df[(pred_df['target'] == 1) & (pred_df['predictions'] == 1)].drop_duplicates()

# NOTE(review): `pipe` as built in this notebook contains only the
# ColumnTransformer step, and X_val / y_val come only from the stale
# train_test_split lines - this block looks like leftover code and probably
# does not run as written; confirm before relying on it.
y_val_pred_proba = pipe.predict_proba(X_val)[:,1]

# Evaluate F1 on the validation set for thresholds from 0.10 upwards in steps of 0.01.
candidate_thresholds = np.arange(start = 0.1, stop = 0.925, step = 0.01)
thresholds = pd.DataFrame({'threshold': candidate_thresholds})
thresholds['f1'] = thresholds['threshold'].apply(lambda x: f1_score(y_val, y_val_pred_proba > x))
thresholds.sort_values('f1', ascending = False).head()

# The threshold is set by hand rather than read from the table above.
threshold = 0.10

# NOTE(review): `model` is not defined anywhere in the visible notebook.
y_pred_proba = model.predict_proba(X_test_transformed)[:,1]

# Classify as positive when the predicted probability exceeds the threshold.
y_pred = y_pred_proba > threshold
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
#param_grid = {
#    'preprocessor__num__scaler__with_mean': [True, False],
#    'preprocessor__num__scaler__with_std': [True, False],
#    'classifier__C': [0.1, 1, 10],
#    'classifier__solver': ['liblinear', 'newton-cg']
#}

#randomized_search = RandomizedSearchCV(pipeline, param_grid, n_iter=10, cv=3)

%%time

# Histogram-based gradient boosting with default settings.
hgbc = HistGradientBoostingClassifier().fit(X_train_balanced, y_train_balanced)

# Class predictions for the held-out test set.
y_pred_hgbc = hgbc.predict(X_test_transformed)

# Not the last expression of the cell, so this value is computed but not displayed.
confusion_matrix(y_test, y_pred_hgbc)

print(classification_report(y_test, y_pred_hgbc, zero_division = 0))

In [None]:
%%time

# Decision tree with default settings. random_state is fixed (the same seed
# as the random forest) so that ties between equally good splits are broken
# the same way on every run and the results are reproducible.
dtc = DecisionTreeClassifier(random_state=27).fit(X_train_balanced, y_train_balanced)

# Class predictions for the held-out test set.
y_pred_dtc = dtc.predict(X_test_transformed)

In [None]:
# Persist [predictions, model]; the load cell must unpack in the same order.
filename = 'dtc.pkl'

pickle_list = [y_pred_dtc, dtc]

with open(filename, 'wb') as file:
    pickle.dump(pickle_list, file)

In [None]:
# Restore the predictions and fitted model, in the order they were saved.
filename = 'dtc.pkl'

with open(filename, 'rb') as file:
    y_pred_dtc, dtc = pickle.load(file)

In [None]:
# Test-set metrics for the decision tree.
print(f'Accuracy: {accuracy_score(y_test, y_pred_dtc)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred_dtc)}')
print(confusion_matrix(y_test, y_pred_dtc))
print(classification_report(y_test, y_pred_dtc, zero_division = 0))

%%time

# Linear-kernel SVM; probability=True enables predict_proba.
# NOTE(review): gamma only applies to the 'rbf', 'poly' and 'sigmoid' kernels,
# so gamma='auto' should have no effect with kernel='linear'.
svm = SVC(kernel='linear', C=1.0, gamma='auto', probability=True)

svm.fit(X_train_balanced, y_train_balanced)

# Class predictions for the held-out test set.
y_pred_svm = svm.predict(X_test_transformed)

# Persist [predictions, model]; the load below unpacks in the same order.
filename = 'svm.pkl'

pickle_list = [y_pred_svm, svm]

with open(filename, 'wb') as file:
    pickle.dump(pickle_list, file)

filename = 'svm.pkl'

# Restore the predictions and fitted model.
with open(filename, 'rb') as file:
    y_pred_svm, svm = pickle.load(file)

# Test-set metrics for the SVM.
print(f'Accuracy: {accuracy_score(y_test, y_pred_svm)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred_svm)}')
print(confusion_matrix(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm, zero_division = 0))

In [None]:
%%time

# LightGBM classifier with default settings; note that `lgb` names the fitted model here.
lgb = LGBMClassifier().fit(X_train_balanced, y_train_balanced)

# Class predictions for the held-out test set.
y_pred_lgb = lgb.predict(X_test_transformed)

In [None]:
# Persist [predictions, model]; the load cell must unpack in the same order.
filename = 'lgb.pkl'

pickle_list = [y_pred_lgb, lgb]

with open(filename, 'wb') as file:
    pickle.dump(pickle_list, file)

In [None]:
# Restore the predictions and fitted model, in the order they were saved.
filename = 'lgb.pkl'

with open(filename, 'rb') as file:
    y_pred_lgb, lgb = pickle.load(file)

In [None]:
# Test-set metrics for LightGBM.
print(f'Accuracy: {accuracy_score(y_test, y_pred_lgb)}')
print(f'MCC: {matthews_corrcoef(y_test, y_pred_lgb)}')
print(confusion_matrix(y_test, y_pred_lgb))
print(classification_report(y_test, y_pred_lgb, zero_division = 0))

In [None]:
%%time

# Predicted class probabilities from each fitted model on the test set.
rf_probas = rfc.predict_proba(X_test_transformed)
lr_probas = logreg.predict_proba(X_test_transformed)
dtc_probas = dtc.predict_proba(X_test_transformed)
# Use the fitted classifier `xgbm`. `xgb` is the imported xgboost module and
# has no predict_proba, so the previous call raised AttributeError.
xgb_probas = xgbm.predict_proba(X_test_transformed)
lgb_probas = lgb.predict_proba(X_test_transformed)

# probas_list and clf_names must stay in the same order.
probas_list = [rf_probas,
               lr_probas,
               dtc_probas,
               xgb_probas,
               lgb_probas]
clf_names = ['Random Forest',
             'Logistic Regression',
             'Decision Tree',
             'XGBoost',
             'Light GBM']

# One calibration curve per model, drawn on a shared plot.
skplt.metrics.plot_calibration_curve(y_test,
                                     probas_list,
                                     clf_names)
plt.show()