In [145]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

In [146]:
# Set view options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [147]:
obs_7 = pd.read_csv("../data/obs_windows/observation_7D_prediction_24H_48H_72H.csv")

  obs_7 = pd.read_csv("../data/obs_windows/observation_7D_prediction_24H_48H_72H.csv")


### 7d observation window with 24hr prediction window

In [148]:
obs_7_24 = obs_7.drop(obs_7.loc[:, 'spn':'full_derate_within_24hr'], axis=1).drop(obs_7.loc[:, 'partial_derate_within_48hr':'derate_within_72hr'], axis=1).drop(columns='EventTimeStamp')

In [149]:
eq_ids = obs_7_24.drop_duplicates(subset='EquipmentID')
eq_ids = eq_ids[['EquipmentID']]
eq_ids.shape[0]

1017

In [150]:
np.random.seed(321)

eq_ids['random'] = (np.random.randint(0, 10000, eq_ids.shape[0]))/10000

In [151]:
#create column 'modeling' labeling ~35% of the data for training, ~30% for validation, and ~35% for testing 
eq_ids['modeling'] = np.where(((eq_ids.random <= 0.35)), 'training', np.where(((eq_ids.random <= 0.65)), 'validation', 'testing'))

In [152]:
#merge modeling column onto original dataframe
obs_7_24 = obs_7_24.sort_values(by='EquipmentID')
eq_ids = eq_ids.sort_values(by='EquipmentID')

obs_7_24 = obs_7_24.merge(eq_ids, on='EquipmentID')
obs_7_24 = obs_7_24.drop(columns='EquipmentID')

In [153]:
#creating training, testing, and validation dfs
obs_7_24_training = obs_7_24[obs_7_24['modeling']=='training']
obs_7_24_training = obs_7_24_training.drop(columns=['modeling', 'random'])
print(obs_7_24_training.shape)

obs_7_24_test = obs_7_24[obs_7_24['modeling']=='testing']
obs_7_24_test = obs_7_24_test.drop(columns=['modeling', 'random'])
print(obs_7_24_test.shape)

obs_7_24_validation = obs_7_24[obs_7_24['modeling']=='validation']
obs_7_24_validation = obs_7_24_validation.drop(columns=['modeling', 'random'])
print(obs_7_24_validation.shape)

(192999, 56)
(194320, 56)
(144524, 56)


In [154]:
df_sampling = obs_7_24_training
df_sampling_non_tenth = df_sampling[df_sampling['derate_within_24hr'] == 0][0:18855]

In [155]:
df_sampling_derate_tenth = pd.concat([df_sampling[df_sampling['derate_within_24hr']==1], df_sampling_non_tenth])

In [156]:
oversampler = SMOTE(k_neighbors=5, random_state=321)

features = df_sampling_derate_tenth.loc[:, 'Aftertreatment System':].columns.tolist()

target = ['derate_within_24hr']

X_train = df_sampling_derate_tenth[features]
y_train = df_sampling_derate_tenth[target]

In [157]:
X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

In [158]:
smote_all = pd.concat([y_smote, X_smote], axis=1)
smote_derate = smote_all[smote_all['derate_within_24hr']==1]
smote_derate.shape

(18855, 56)

In [159]:
und_samp = pd.concat([smote_derate, df_sampling[df_sampling['derate_within_24hr']==0]])
und_samp.shape

(207411, 56)

In [160]:
undersampler = RandomUnderSampler(random_state=321, sampling_strategy=1/2)

features = und_samp.loc[:, 'Aftertreatment System':].columns.tolist()

target =['derate_within_24hr']

X_train_final = und_samp[features]
y_train_final = und_samp[target]

In [161]:
X_resampled, y_resampled = undersampler.fit_resample(X_train_final, y_train_final)

In [162]:
print(y_resampled.shape)

(56565, 1)


In [163]:
logistic = LogisticRegression()

In [164]:
logreg = logistic.fit(X_resampled, y_resampled)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [165]:
X_test = obs_7_24_test[features]
y_test = obs_7_24_test[target]

In [166]:
y_pred = logreg.predict(X_test)

print(accuracy_score(y_test,y_pred))
print(f1_score(y_test, y_pred, average = 'weighted'))

0.9615788390284068
0.9675843913401689


### 7d observation window with 48hr prediction window

In [246]:
obs_7_48 = obs_7.drop(obs_7.loc[:, 'spn':'full_derate_within_48hr'], axis=1).drop(obs_7.loc[:, 'partial_derate_within_72hr':'derate_within_72hr'], axis=1).drop(columns='EventTimeStamp')

In [247]:
eq_ids = obs_7_48.drop_duplicates(subset='EquipmentID')
eq_ids = eq_ids[['EquipmentID']]
eq_ids.shape[0]

1017

In [248]:
np.random.seed(321)

eq_ids['random'] = (np.random.randint(0, 10000, eq_ids.shape[0]))/10000

In [249]:
#create column 'modeling' labeling ~35% of the data for training, ~30% for validation, and ~35% for testing 
eq_ids['modeling'] = np.where(((eq_ids.random <= 0.35)), 'training', np.where(((eq_ids.random <= 0.65)), 'validation', 'testing'))

In [250]:
#merge modeling column onto original dataframe
obs_7_48 = obs_7_48.sort_values(by='EquipmentID')
eq_ids = eq_ids.sort_values(by='EquipmentID')

obs_7_48 = obs_7_48.merge(eq_ids, on='EquipmentID')
obs_7_48 = obs_7_48.drop(columns='EquipmentID')

In [251]:
#creating training, testing, and validation dfs
obs_7_48_training = obs_7_48[obs_7_48['modeling']=='training']
obs_7_48_training = obs_7_48_training.drop(columns=['modeling', 'random'])
print(obs_7_48_training.shape)

obs_7_48_test = obs_7_48[obs_7_48['modeling']=='testing']
obs_7_48_test = obs_7_48_test.drop(columns=['modeling', 'random'])
print(obs_7_48_test.shape)

obs_7_48_validation = obs_7_48[obs_7_48['modeling']=='validation']
obs_7_48_validation = obs_7_48_validation.drop(columns=['modeling', 'random'])
print(obs_7_48_validation.shape)

(192999, 56)
(194320, 56)
(144524, 56)


In [252]:
df_sampling = obs_7_48_training
df_sampling_non_tenth = df_sampling[df_sampling['derate_within_48hr'] == 0][0:18855]

In [253]:
df_sampling_derate_tenth = pd.concat([df_sampling[df_sampling['derate_within_48hr']==1], df_sampling_non_tenth])

In [254]:
oversampler = SMOTE(k_neighbors=5, random_state=321)

features = df_sampling_derate_tenth.loc[:, 'Aftertreatment System':].columns.tolist()

target = ['derate_within_48hr']

X_train = df_sampling_derate_tenth[features]
y_train = df_sampling_derate_tenth[target]

In [255]:
X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

In [256]:
smote_all = pd.concat([y_smote, X_smote], axis=1)
smote_derate = smote_all[smote_all['derate_within_48hr']==1]
smote_derate.shape

(18855, 56)

In [257]:
und_samp = pd.concat([smote_derate, df_sampling[df_sampling['derate_within_48hr']==0]])
und_samp.shape

(206556, 56)

In [258]:
undersampler = RandomUnderSampler(random_state=321, sampling_strategy=1/2)

features = und_samp.loc[:, 'Aftertreatment System':].columns.tolist()

target =['derate_within_48hr']

X_train_final = und_samp[features]
y_train_final = und_samp[target]

In [259]:
X_resampled, y_resampled = undersampler.fit_resample(X_train_final, y_train_final)

In [260]:
print(y_resampled.shape)

(56565, 1)


In [261]:
logistic = LogisticRegression()

In [262]:
logreg = logistic.fit(X_resampled, y_resampled)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [263]:
X_test = obs_7_48_test[features]
y_test = obs_7_48_test[target]

In [264]:
y_pred = logreg.predict(X_test)

print(accuracy_score(y_test,y_pred))
print(f1_score(y_test, y_pred, average = 'weighted'))

0.961583985179086
0.9661284898078232


### 7d observation window with 72hr prediction window

In [227]:
obs_7_72 = obs_7.drop(obs_7.loc[:, 'spn':'full_derate_within_72hr'], axis=1).drop(columns='EventTimeStamp')

In [228]:
eq_ids = obs_7_72.drop_duplicates(subset='EquipmentID')
eq_ids = eq_ids[['EquipmentID']]
eq_ids.shape[0]

1017

In [229]:
np.random.seed(321)

eq_ids['random'] = (np.random.randint(0, 10000, eq_ids.shape[0]))/10000

In [230]:
#create column 'modeling' labeling ~35% of the data for training, ~30% for validation, and ~35% for testing 
eq_ids['modeling'] = np.where(((eq_ids.random <= 0.35)), 'training', np.where(((eq_ids.random <= 0.65)), 'validation', 'testing'))

In [231]:
#merge modeling column onto original dataframe
obs_7_72 = obs_7_72.sort_values(by='EquipmentID')
eq_ids = eq_ids.sort_values(by='EquipmentID')

obs_7_72 = obs_7_72.merge(eq_ids, on='EquipmentID')
obs_7_72 = obs_7_72.drop(columns='EquipmentID')

In [232]:
#creating training, testing, and validation dfs
obs_7_72_training = obs_7_72[obs_7_72['modeling']=='training']
obs_7_72_training = obs_7_72_training.drop(columns=['modeling', 'random'])
print(obs_7_72_training.shape)

obs_7_72_test = obs_7_72[obs_7_72['modeling']=='testing']
obs_7_72_test = obs_7_72_test.drop(columns=['modeling', 'random'])
print(obs_7_72_test.shape)

obs_7_72_validation = obs_7_72[obs_7_72['modeling']=='validation']
obs_7_72_validation = obs_7_72_validation.drop(columns=['modeling', 'random'])
print(obs_7_72_validation.shape)

(192999, 56)
(194320, 56)
(144524, 56)


In [233]:
df_sampling = obs_7_72_training
df_sampling_non_tenth = df_sampling[df_sampling['derate_within_72hr'] == 0][0:18855]

In [234]:
df_sampling_derate_tenth = pd.concat([df_sampling[df_sampling['derate_within_72hr']==1], df_sampling_non_tenth])

In [235]:
oversampler = SMOTE(k_neighbors=5, random_state=321)

features = df_sampling_derate_tenth.loc[:, 'Aftertreatment System':].columns.tolist()

target = ['derate_within_72hr']

X_train = df_sampling_derate_tenth[features]
y_train = df_sampling_derate_tenth[target]

In [236]:
X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

In [237]:
smote_all = pd.concat([y_smote, X_smote], axis=1)
smote_derate = smote_all[smote_all['derate_within_72hr']==1]
smote_derate.shape

(18855, 56)

In [238]:
und_samp = pd.concat([smote_derate, df_sampling[df_sampling['derate_within_72hr']==0]])
und_samp.shape

(205872, 56)

In [239]:
undersampler = RandomUnderSampler(random_state=321, sampling_strategy=1/2)

features = und_samp.loc[:, 'Aftertreatment System':].columns.tolist()

target =['derate_within_72hr']

X_train_final = und_samp[features]
y_train_final = und_samp[target]

In [240]:
X_resampled, y_resampled = undersampler.fit_resample(X_train_final, y_train_final)

In [241]:
print(y_resampled.shape)

(56565, 1)


In [242]:
logistic = LogisticRegression()

In [243]:
logreg = logistic.fit(X_resampled, y_resampled)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [244]:
X_test = obs_7_72_test[features]
y_test = obs_7_72_test[target]

In [245]:
y_pred = logreg.predict(X_test)

print(accuracy_score(y_test,y_pred))
print(f1_score(y_test, y_pred, average = 'weighted'))

0.9588102099629477
0.9632277153652219
