In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

In [2]:
faults_diagnostics = pd.read_csv('../data/faults_diagnostics.csv',
                                 index_col='RecordID',
                                 parse_dates = ['EventTimeStamp', 'LocationTimeStamp'],
                                 low_memory = False)

In [3]:
faults_diagnostics.loc[faults_diagnostics['spn'] != 1569, 'spn_derate'] = 'neither' 
faults_diagnostics.loc[faults_diagnostics['spn'] != 5246, 'spn_derate'] = 'neither'
faults_diagnostics.loc[faults_diagnostics['spn'] == 5246, 'spn_derate'] = 'full'  
faults_diagnostics.loc[faults_diagnostics['spn'] == 1569, 'spn_derate'] = 'partial'  

faults_diagnostics['spn_derate'] = pd.get_dummies(faults_diagnostics['spn_derate'])

In [7]:
faults_diagnostics.select_dtypes(include=['int64','float64']).columns

Index(['Unnamed: 0', 'ESS_Id', 'spn', 'fmi', 'activeTransitionCount',
       'EquipmentID', 'Latitude', 'Longitude', 'AcceleratorPedal',
       'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd',
       'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure',
       'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel',
       'FuelLtd', 'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature',
       'LampStatus', 'Speed', 'SwitchedBatteryVoltage', 'Throttle',
       'TurboBoostPressure'],
      dtype='object')

In [37]:
faults_diagnostics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 921737 entries, 1211418 to 1161752
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype
---  ------   --------------   -----
 0   full     921737 non-null  uint8
 1   neither  921737 non-null  uint8
 2   partial  921737 non-null  uint8
dtypes: uint8(3)
memory usage: 9.7 MB


In [None]:
faults_diagnostics.groupby('EquipmentID').

In [8]:
variables = ['activeTransitionCount',
             'Latitude', 'Longitude', 'AcceleratorPedal',
             'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd',
             'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure',
             'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel',
             'FuelLtd', 'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature',
             'LampStatus', 'Speed', 'SwitchedBatteryVoltage', 'Throttle',
             'TurboBoostPressure']
X = faults_diagnostics[variables]
y = faults_diagnostics['spn_derate']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 321)

In [10]:
oversampler = SMOTE(k_neighbors=5, random_state=321)

In [11]:
X_smote, y_smote = oversampler.fit_resample(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
fig, ax = plt.subplots(figsize = (8,6))
sns.scatterplot(x = X_smote.x, y = X_smote.y, hue = y_smote, ax = ax);

In [None]:
lr = LogisticRegression()
lr.fit(X_smote, y_smote)

In [None]:
y_pred = lr.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
print(classification_report(y_test, y_pred))