In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [2]:
# Set view options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [3]:
obs = pd.read_csv("../data/obs_windows/observation_48H_prediction_24H_48H_72H.csv")

  obs = pd.read_csv("../data/obs_windows/observation_48H_prediction_24H_48H_72H.csv")


### 7d observation window with 24hr prediction window

In [4]:
obs_24 = obs.drop(obs.loc[:, 'spn':'full_derate_within_24hr'], axis=1).drop(obs.loc[:, 'partial_derate_within_48hr':'derate_within_72hr'], axis=1).drop(columns='EventTimeStamp')

In [5]:
eq_ids = obs_24.drop_duplicates(subset='EquipmentID')
eq_ids = eq_ids[['EquipmentID']]
eq_ids.shape[0]

1039

In [6]:
np.random.seed(321)

eq_ids['random'] = (np.random.randint(0, 10000, eq_ids.shape[0]))/10000

In [7]:
#create column 'modeling' labeling ~35% of the data for training, ~30% for validation, and ~35% for testing 
eq_ids['modeling'] = np.where(((eq_ids.random <= 0.35)), 'training', np.where(((eq_ids.random <= 0.65)), 'validation', 'testing'))

In [8]:
#merge modeling column onto original dataframe
obs_24 = obs_24.sort_values(by='EquipmentID')
eq_ids = eq_ids.sort_values(by='EquipmentID')

obs_24 = obs_24.merge(eq_ids, on='EquipmentID')
obs_24 = obs_24.drop(columns='EquipmentID')

In [9]:
#creating training, testing, and validation dfs
obs_24_training = obs_24[obs_24['modeling']=='training']
obs_24_training = obs_24_training.drop(columns=['modeling', 'random'])
print(obs_24_training.shape)

obs_24_test = obs_24[obs_24['modeling']=='testing']
obs_24_test = obs_24_test.drop(columns=['modeling', 'random'])
print(obs_24_test.shape)

obs_24_validation = obs_24[obs_24['modeling']=='validation']
obs_24_validation = obs_24_validation.drop(columns=['modeling', 'random'])
print(obs_24_validation.shape)

(200860, 56)
(202749, 56)
(151246, 56)


In [10]:
df_sampling = obs_24_training
df_sampling_non_tenth = df_sampling[df_sampling['derate_within_24hr'] == 0][0:18855]

In [11]:
df_sampling_derate_tenth = pd.concat([df_sampling[df_sampling['derate_within_24hr']==1], df_sampling_non_tenth])

In [12]:
oversampler = SMOTE(k_neighbors=5, random_state=321)

features = df_sampling_derate_tenth.loc[:, 'Aftertreatment System':].columns.tolist()

target = ['derate_within_24hr']

X_train = df_sampling_derate_tenth[features]
y_train = df_sampling_derate_tenth[target]

In [13]:
X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

In [14]:
smote_all = pd.concat([y_smote, X_smote], axis=1)
smote_derate = smote_all[smote_all['derate_within_24hr']==1]
smote_derate.shape

(18855, 56)

In [15]:
und_samp = pd.concat([smote_derate, df_sampling[df_sampling['derate_within_24hr']==0]])
und_samp.shape

(215061, 56)

In [16]:
undersampler = RandomUnderSampler(random_state=321, sampling_strategy=1/2)

features = und_samp.loc[:, 'Aftertreatment System':].columns.tolist()

target =['derate_within_24hr']

X_train_final = und_samp[features]
y_train_final = und_samp[target]

X_test = obs_24_test[features]
y_test = obs_24_test[target]

In [17]:
X_resampled, y_resampled = undersampler.fit_resample(X_train_final, y_train_final)

In [18]:
print(y_resampled.shape)

(56565, 1)


In [19]:
rf_pipeline = Pipeline(
    steps = [
        ('scale', MinMaxScaler()),
        ('rf_clf', RandomForestClassifier(n_estimators = 500, n_jobs = -1, random_state = 69))
    ]
)

In [20]:
rand_forest = rf_pipeline.fit(X_resampled, y_resampled)
rf_pipeline.score(X_test, y_test)

y_pred = rand_forest.predict(X_test)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [21]:
data = {'feature': np.array(X_resampled.columns), 'importance': rf_pipeline.named_steps['rf_clf'].feature_importances_}
pd.DataFrame(data).sort_values('importance', ascending = False)


Unnamed: 0,feature,importance
29,Engine Protection Torque Partial Derate,0.3403705
0,Aftertreatment System,0.2681677
51,Unknown,0.1221221
18,Engine Coolant,0.06203526
22,Engine Fuel,0.03260432
28,Engine Protection Torque Full Derate,0.02456123
41,Intake Manifold,0.02155602
43,J1939 Network,0.02073887
52,Variable Geometry Turbocharger,0.01863601
39,Injector Metering Rail,0.01515714


In [22]:
confusion_matrix(y_test, y_pred)

array([[190672,   7340],
       [  1174,   3563]], dtype=int64)

In [23]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [24]:
(tn, fp, fn, tp)

(190672, 7340, 1174, 3563)

In [25]:
print(classification_report(y_test, y_pred, zero_division = 0))

              precision    recall  f1-score   support

         0.0       0.99      0.96      0.98    198012
         1.0       0.33      0.75      0.46      4737

    accuracy                           0.96    202749
   macro avg       0.66      0.86      0.72    202749
weighted avg       0.98      0.96      0.97    202749



In [26]:
lg_pipeline =  Pipeline(
    steps = [
        ('scale', MinMaxScaler()),
        ('lg_clf', LogisticRegression())
    ]
)

In [27]:
logistic = lg_pipeline.fit(X_resampled, y_resampled)

y_pred2 = logistic.predict(X_test)

y_pred_proba = logistic.predict_proba(X_test)[::,1]

auc = metrics.roc_auc_score(y_test, y_pred_proba)

print(auc)

  y = column_or_1d(y, warn=True)


0.9120225017676337


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 7d observation window with 48hr prediction window

In [28]:
obs_48 = obs.drop(obs.loc[:, 'spn':'full_derate_within_48hr'], axis=1).drop(obs.loc[:, 'partial_derate_within_72hr':'derate_within_72hr'], axis=1).drop(columns='EventTimeStamp')

In [29]:
eq_ids = obs_48.drop_duplicates(subset='EquipmentID')
eq_ids = eq_ids[['EquipmentID']]
eq_ids.shape[0]

1039

In [30]:
np.random.seed(321)

eq_ids['random'] = (np.random.randint(0, 10000, eq_ids.shape[0]))/10000

In [31]:
#create column 'modeling' labeling ~35% of the data for training, ~30% for validation, and ~35% for testing 
eq_ids['modeling'] = np.where(((eq_ids.random <= 0.35)), 'training', np.where(((eq_ids.random <= 0.65)), 'validation', 'testing'))

In [32]:
#merge modeling column onto original dataframe
obs_48 = obs_48.sort_values(by='EquipmentID')
eq_ids = eq_ids.sort_values(by='EquipmentID')

obs_48 = obs_48.merge(eq_ids, on='EquipmentID')
obs_48 = obs_48.drop(columns='EquipmentID')

In [33]:
#creating training, testing, and validation dfs
obs_48_training = obs_48[obs_48['modeling']=='training']
obs_48_training = obs_48_training.drop(columns=['modeling', 'random'])
print(obs_48_training.shape)

obs_48_test = obs_48[obs_48['modeling']=='testing']
obs_48_test = obs_48_test.drop(columns=['modeling', 'random'])
print(obs_48_test.shape)

obs_48_validation = obs_48[obs_48['modeling']=='validation']
obs_48_validation = obs_48_validation.drop(columns=['modeling', 'random'])
print(obs_48_validation.shape)

(200860, 56)
(202749, 56)
(151246, 56)


In [34]:
df_sampling = obs_48_training
df_sampling_non_tenth = df_sampling[df_sampling['derate_within_48hr'] == 0][0:18855]

In [35]:
df_sampling_derate_tenth = pd.concat([df_sampling[df_sampling['derate_within_48hr']==1], df_sampling_non_tenth])

In [36]:
oversampler = SMOTE(k_neighbors=5, random_state=321)

features = df_sampling_derate_tenth.loc[:, 'Aftertreatment System':].columns.tolist()

target = ['derate_within_48hr']

X_train = df_sampling_derate_tenth[features]
y_train = df_sampling_derate_tenth[target]

X_test = obs_48_test[features]
y_test = obs_48_test[target]

In [37]:
X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

In [38]:
smote_all = pd.concat([y_smote, X_smote], axis=1)
smote_derate = smote_all[smote_all['derate_within_48hr']==1]
smote_derate.shape

(18855, 56)

In [39]:
und_samp = pd.concat([smote_derate, df_sampling[df_sampling['derate_within_48hr']==0]])
und_samp.shape

(214184, 56)

In [40]:
undersampler = RandomUnderSampler(random_state=321, sampling_strategy=1/2)

features = und_samp.loc[:, 'Aftertreatment System':].columns.tolist()

target =['derate_within_48hr']

X_train_final = und_samp[features]
y_train_final = und_samp[target]

In [41]:
X_resampled, y_resampled = undersampler.fit_resample(X_train_final, y_train_final)

In [42]:
print(y_resampled.shape)

(56565, 1)


In [43]:
rf_pipeline = Pipeline(
    steps = [
        ('scale', MinMaxScaler()),
        ('rf_clf', RandomForestClassifier(n_estimators = 500, n_jobs = -1, random_state = 69))
    ]
)

In [44]:
rand_forest = rf_pipeline.fit(X_resampled, y_resampled)
rf_pipeline.score(X_test, y_test)

y_pred = rand_forest.predict(X_test)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [45]:
data = {'feature': np.array(X_resampled.columns), 'importance': rf_pipeline.named_steps['rf_clf'].feature_importances_}
pd.DataFrame(data).sort_values('importance', ascending = False)


Unnamed: 0,feature,importance
29,Engine Protection Torque Partial Derate,0.3236371
0,Aftertreatment System,0.2588165
51,Unknown,0.1345544
18,Engine Coolant,0.06072057
22,Engine Fuel,0.03646262
43,J1939 Network,0.02757092
28,Engine Protection Torque Full Derate,0.02295489
41,Intake Manifold,0.0182273
52,Variable Geometry Turbocharger,0.01787327
17,Engine Control Module,0.01723735


In [46]:
confusion_matrix(y_test, y_pred)

array([[189695,   7633],
       [  1645,   3776]], dtype=int64)

In [47]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [48]:
(tn, fp, fn, tp)

(189695, 7633, 1645, 3776)

In [49]:
print(classification_report(y_test, y_pred, zero_division = 0))

              precision    recall  f1-score   support

         0.0       0.99      0.96      0.98    197328
         1.0       0.33      0.70      0.45      5421

    accuracy                           0.95    202749
   macro avg       0.66      0.83      0.71    202749
weighted avg       0.97      0.95      0.96    202749



In [50]:
lg_pipeline =  Pipeline(
    steps = [
        ('scale', MinMaxScaler()),
        ('lg_clf', LogisticRegression())
    ]
)

In [51]:
logistic = lg_pipeline.fit(X_resampled, y_resampled)

y_pred2 = logistic.predict(X_test)

y_pred_proba = logistic.predict_proba(X_test)[::,1]

auc = metrics.roc_auc_score(y_test, y_pred_proba)

print(auc)

  y = column_or_1d(y, warn=True)


0.8853962430040998


### 7d observation window with 72hr prediction window

In [52]:
obs_72 = obs.drop(obs.loc[:, 'spn':'full_derate_within_72hr'], axis=1).drop(columns='EventTimeStamp')

In [53]:
eq_ids = obs_72.drop_duplicates(subset='EquipmentID')
eq_ids = eq_ids[['EquipmentID']]
eq_ids.shape[0]

1039

In [54]:
np.random.seed(321)

eq_ids['random'] = (np.random.randint(0, 10000, eq_ids.shape[0]))/10000

In [55]:
#create column 'modeling' labeling ~35% of the data for training, ~30% for validation, and ~35% for testing 
eq_ids['modeling'] = np.where(((eq_ids.random <= 0.35)), 'training', np.where(((eq_ids.random <= 0.65)), 'validation', 'testing'))

In [56]:
#merge modeling column onto original dataframe
obs_72 = obs_72.sort_values(by='EquipmentID')
eq_ids = eq_ids.sort_values(by='EquipmentID')

obs_72 = obs_72.merge(eq_ids, on='EquipmentID')
obs_72 = obs_72.drop(columns='EquipmentID')

In [57]:
#creating training, testing, and validation dfs
obs_72_training = obs_72[obs_72['modeling']=='training']
obs_72_training = obs_72_training.drop(columns=['modeling', 'random'])
print(obs_72_training.shape)

obs_72_test = obs_72[obs_72['modeling']=='testing']
obs_72_test = obs_72_test.drop(columns=['modeling', 'random'])
print(obs_72_test.shape)

obs_72_validation = obs_72[obs_72['modeling']=='validation']
obs_72_validation = obs_72_validation.drop(columns=['modeling', 'random'])
print(obs_72_validation.shape)

(200860, 56)
(202749, 56)
(151246, 56)


In [58]:
df_sampling = obs_72_training
df_sampling_non_tenth = df_sampling[df_sampling['derate_within_72hr'] == 0][0:18855]

In [59]:
df_sampling_derate_tenth = pd.concat([df_sampling[df_sampling['derate_within_72hr']==1], df_sampling_non_tenth])

In [60]:
oversampler = SMOTE(k_neighbors=5, random_state=321)

features = df_sampling_derate_tenth.loc[:, 'Aftertreatment System':].columns.tolist()

target = ['derate_within_72hr']

X_train = df_sampling_derate_tenth[features]
y_train = df_sampling_derate_tenth[target]

In [61]:
X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

In [62]:
smote_all = pd.concat([y_smote, X_smote], axis=1)
smote_derate = smote_all[smote_all['derate_within_72hr']==1]
smote_derate.shape

(18855, 56)

In [63]:
und_samp = pd.concat([smote_derate, df_sampling[df_sampling['derate_within_72hr']==0]])
und_samp.shape

(213477, 56)

In [64]:
undersampler = RandomUnderSampler(random_state=321, sampling_strategy=1/2)

features = und_samp.loc[:, 'Aftertreatment System':].columns.tolist()

target =['derate_within_72hr']

X_train_final = und_samp[features]
y_train_final = und_samp[target]

X_test = obs_72_test[features]
y_test = obs_72_test[target]

In [65]:
X_resampled, y_resampled = undersampler.fit_resample(X_train_final, y_train_final)

In [66]:
print(y_resampled.shape)

(56565, 1)


In [67]:
rf_pipeline = Pipeline(
    steps = [
        ('scale', MinMaxScaler()),
        ('rf_clf', RandomForestClassifier(n_estimators = 500, n_jobs = -1, random_state = 69))
    ]
)

In [68]:
rand_forest = rf_pipeline.fit(X_resampled, y_resampled)
rf_pipeline.score(X_test, y_test)

y_pred = rand_forest.predict(X_test)

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [69]:
data = {'feature': np.array(X_resampled.columns), 'importance': rf_pipeline.named_steps['rf_clf'].feature_importances_}
pd.DataFrame(data).sort_values('importance', ascending = False)


Unnamed: 0,feature,importance
29,Engine Protection Torque Partial Derate,0.316851
0,Aftertreatment System,0.256301
51,Unknown,0.136391
18,Engine Coolant,0.061466
22,Engine Fuel,0.037242
43,J1939 Network,0.028621
28,Engine Protection Torque Full Derate,0.024568
41,Intake Manifold,0.019558
17,Engine Control Module,0.018392
52,Variable Geometry Turbocharger,0.016485


In [70]:
confusion_matrix(y_test, y_pred)

array([[188151,   8700],
       [  2008,   3890]], dtype=int64)

In [71]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

In [72]:
(tn, fp, fn, tp)

(188151, 8700, 2008, 3890)

In [73]:
print(classification_report(y_test, y_pred, zero_division = 0))

              precision    recall  f1-score   support

         0.0       0.99      0.96      0.97    196851
         1.0       0.31      0.66      0.42      5898

    accuracy                           0.95    202749
   macro avg       0.65      0.81      0.70    202749
weighted avg       0.97      0.95      0.96    202749



In [74]:
lg_pipeline =  Pipeline(
    steps = [
        ('scale', MinMaxScaler()),
        ('lg_clf', LogisticRegression())
    ]
)

In [75]:
logistic = lg_pipeline.fit(X_resampled, y_resampled)

y_pred2 = logistic.predict(X_test)

y_pred_proba = logistic.predict_proba(X_test)[::,1]

auc = metrics.roc_auc_score(y_test, y_pred_proba)

print(auc)

  y = column_or_1d(y, warn=True)


0.8686069880509382


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
