In [None]:
import pandas as pd
import numpy as np
import pickle

In [None]:
# Load the pickled on_faults frame from the data directory.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
on_faults = pd.read_pickle(filepath_or_buffer='../data/on_faults.pkl')

In [None]:
# Keep only rows whose 'active' flag equals True, i.e. discard the rows where
# the fault light is being turned off, then renumber the index from 0.
on_faults = (
    on_faults[on_faults['active'].eq(True)]
    .reset_index(drop=True)
)

In [None]:
# Drop some unnecessary columns, then summarise the remaining columns and dtypes.
on_faults = on_faults.drop(
    labels=[
        'RecordID', 'ESS_Id', 'active', 'eventDescription',
        'ecuSoftwareVersion', 'ecuSerialNumber', 'ecuModel', 'ecuMake',
        'ecuSource', 'MCTNumber',
        'Latitude', 'Longitude',
        'EventDate', 'EventTime',
        'LocationTimeStamp', 'LocationDate', 'MonthYear', 'LocationTime',
        'dist_A', 'dist_B', 'dist_C',
    ],
    axis='columns',
)

on_faults.info()

In [None]:
# The FuelTemperature, ServiceDistance and SwitchedBatteryVoltage columns
# have significantly more null values than the rest, so they are removed.
on_faults = on_faults.drop(
    labels=['FuelTemperature', 'ServiceDistance', 'SwitchedBatteryVoltage'],
    axis='columns',
)

In [None]:
# Add a 'month' column: EventTimeStamp formatted with '%b' (abbreviated month name).
on_faults = on_faults.assign(
    month=lambda frame: frame['EventTimeStamp'].dt.strftime('%b')
)

In [None]:
# Combined fault code: string forms of spn and fmi joined with a hyphen.
on_faults['spn-fmi'] = (
    on_faults['spn'].astype(str)
    .str.cat(on_faults['fmi'].astype(str), sep='-')
)

In [None]:
# Convert the string literals 'True'/'False' to real booleans in the three
# flag columns; any other values in those columns are left untouched.
on_faults = on_faults.replace(
    {
        flag_col: {'True': True, 'False': False}
        for flag_col in ('CruiseControlActive', 'IgnStatus', 'ParkingBrake')
    }
)

In [None]:
# Inspect the DistanceLtd column.
on_faults.loc[:, 'DistanceLtd']

In [None]:
# Subset of on_faults used to build the rolling-window features:
# the grouping key, the timestamp, the columns that get one-hot encoded
# later, and the sensor/status readings.
rolling = on_faults.loc[:, [
    'EquipmentID', 'EventTimeStamp', 'month', 'spn',
    'activeTransitionCount', 'AcceleratorPedal', 'BarometricPressure',
    'CruiseControlActive', 'CruiseControlSetSpeed', 'DistanceLtd',
    'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure',
    'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd',
    'FuelLevel', 'FuelLtd', 'FuelRate', 'IgnStatus',
    'IntakeManifoldTemperature', 'LampStatus', 'ParkingBrake',
    'Speed', 'Throttle', 'TurboBoostPressure',
]]


In [None]:
# One-hot encode spn, month and the three flag columns, then order the rows
# by equipment and timestamp so each EquipmentID's events are in time order
# for the rolling window computed later. The index is renumbered from 0.
rolling = (
    pd.get_dummies(
        rolling,
        columns=['spn', 'month', 'CruiseControlActive', 'IgnStatus', 'ParkingBrake'],
    )
    .sort_values(by=['EquipmentID', 'EventTimeStamp'])
    .reset_index(drop=True)
)

rolling.info()

In [None]:
# Display the encoded, sorted frame built in the previous cell.
rolling

In [None]:
# For each EquipmentID, sum every column over a 1-day time window keyed on
# EventTimeStamp. The result is indexed by EquipmentID plus the original row index.
rolling_test = (
    rolling
    .groupby(by='EquipmentID')
    .rolling('1D', on='EventTimeStamp')
    .sum()
)

In [None]:
# Discard rows whose 1-day rolling sum of spn_5246 is greater than 1.
# Rows where the sum is NaN are kept, because the comparison is False for NaN.
rolling_test = rolling_test[~rolling_test['spn_5246'].gt(1)]

In [None]:
# Preview the first 20 rows of the rolling-window frame.
rolling_test.iloc[:20]

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler, MinMaxScaler

In [None]:
# ml_data is a second name for the same object as rolling_test (no copy is made here).
ml_data = rolling_test

In [None]:
# Replace every missing value with 0; fillna returns a new frame, so
# rolling_test itself is not modified by this step.
ml_data = ml_data.fillna(value=0)

In [None]:
from sklearn.utils import resample

# Upper bound on the number of majority-class rows kept after downsampling.
# NOTE(review): the original comment said 2000 was chosen "to match minority class";
# confirm that against the value counts shown at the end of this cell.
N_MAJORITY_SAMPLES = 2000

# Separate majority (spn_5246 == 0) and minority (spn_5246 == 1) classes
df_majority = ml_data[ml_data['spn_5246'] == 0]
df_minority = ml_data[ml_data['spn_5246'] == 1]

# Downsample the majority class. Sampling without replacement raises a
# ValueError when more rows are requested than exist, so the request is
# capped at the number of majority rows actually available.
df_majority_downsampled = resample(
    df_majority,
    replace=False,                                        # sample without replacement
    n_samples=min(N_MAJORITY_SAMPLES, len(df_majority)),  # never ask for more than exist
    random_state=123,                                     # reproducible results
)

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_downsampled['spn_5246'].value_counts()

In [None]:
# train_test_split is used below but no earlier cell imports it, so this
# cell raised NameError on a fresh kernel. Import it here, in the same
# in-cell style the notebook already uses for resample.
from sklearn.model_selection import train_test_split

# Features: every column except the target and the timestamp.
X = df_downsampled.drop(columns=['spn_5246', 'EventTimeStamp'])

# Target: the spn_5246 column of the downsampled frame.
y = df_downsampled['spn_5246']

# 75/25 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=321)

In [None]:
# Standardise the features, then fit a linear SVM (iteration cap raised to 50000).
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LinearSVC(max_iter=50000)),
])

pipe.fit(X_train, y_train)

In [None]:
# Per-class precision/recall/F1 of the SVM pipeline on the downsampled test split.
print(
    classification_report(
        y_true=y_test,
        y_pred=pipe.predict(X_test),
    )
)

In [None]:
from cm import plot_confusion_matrix

In [None]:
# Confusion matrix for the SVM pipeline on the downsampled test split.
# plot_confusion_matrix comes from the local `cm` module; presumably the arguments are
# (true labels, predicted labels, display names) — TODO confirm against cm.py.
plot_confusion_matrix(y_test, pipe.predict(X_test), labels = ['No Derate', 'Derate'])

In [None]:
# Build an evaluation split from the full (non-downsampled) data.
# df_downsampled is a subset of ml_data, so splitting ml_data directly would
# put rows the models were fitted on (X_train) into X1_test and inflate the
# reported metrics. Exclude those training rows before splitting.
ml_holdout = ml_data.loc[~ml_data.index.isin(X_train.index)]

# Features: every column except the target and the timestamp.
X1 = ml_holdout.drop(columns=['spn_5246', 'EventTimeStamp'])

# Target: the spn_5246 column.
y1 = ml_holdout['spn_5246']

# Same 75/25 proportions and seed as the downsampled split.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=.25, random_state=321)

In [None]:
# Confusion matrix for the SVM pipeline (fitted on the downsampled data) on the
# test split drawn from the full ml_data frame.
# plot_confusion_matrix comes from the local `cm` module; presumably the arguments are
# (true labels, predicted labels, display names) — TODO confirm against cm.py.
plot_confusion_matrix(y1_test, pipe.predict(X1_test), labels = ['No Derate', 'Derate'])

In [None]:
# Per-class precision/recall/F1 of the SVM pipeline on the full-data test split.
print(
    classification_report(
        y_true=y1_test,
        y_pred=pipe.predict(X1_test),
    )
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Logistic-regression pipeline: a VarianceThreshold step with default settings,
# then feature standardisation, then the classifier with default settings.
log_pipe = Pipeline([
    ('vt', VarianceThreshold()),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression()),
])

log_pipe.fit(X_train, y_train)

In [None]:
# Confusion matrix for the logistic-regression pipeline on the downsampled test split.
# plot_confusion_matrix comes from the local `cm` module; presumably the arguments are
# (true labels, predicted labels, display names) — TODO confirm against cm.py.
plot_confusion_matrix(y_test, log_pipe.predict(X_test), labels = ['No Derate', 'Derate'])

In [None]:
# Confusion matrix for the logistic-regression pipeline (fitted on the downsampled
# data) on the test split drawn from the full ml_data frame.
# plot_confusion_matrix comes from the local `cm` module; presumably the arguments are
# (true labels, predicted labels, display names) — TODO confirm against cm.py.
plot_confusion_matrix(y1_test, log_pipe.predict(X1_test), labels = ['No Derate', 'Derate'])

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
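# Optional export of the rolling-window frame for reuse elsewhere; currently disabled (commented out).
#rolling_test.to_pickle('../data/rolling_df.pkl')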