In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import calendar

# Big G notebook Part 3: Feature Selection and Model Preparation
This notebook adds features to the dataset and establishes the encoding needed to analyze the time-series data.

In [2]:
# Load the pickled fault records (1,057,461 entries).
faults_pickle_path = '../data/on_faults.pkl'
on_faults = pd.read_pickle(faults_pickle_path)

Filters

In [3]:
# Keep only the rows where the 'active' column is True, i.e. drop the rows where the
# fault light is being turned off. The filter is applied once (it was previously
# applied twice with the same mask), then the index is renumbered from 0.
# The explicit `== True` comparison is kept on purpose: the dtype of 'active' is not
# visible here, and the comparison yields a clean boolean mask regardless of dtype.
on_faults = on_faults.loc[on_faults['active'] == True].reset_index(drop = True)

# Columns that are deemed unnecessary for the rest of the notebook.
unneeded_columns = ['RecordID', 'ESS_Id', 'active', 'eventDescription',
                    'ecuSoftwareVersion', 'ecuSerialNumber', 'ecuModel', 'ecuMake',
                    'ecuSource', 'MCTNumber', 'Latitude', 'Longitude', 'EventDate', 'EventTime',
                    'LocationTimeStamp', 'LocationDate', 'MonthYear', 'LocationTime', 'dist_A',
                    'dist_B', 'dist_C']

# Columns with high amounts of null values.
mostly_null_columns = ['FuelTemperature', 'ServiceDistance', 'SwitchedBatteryVoltage']

# Drop both groups in a single pass; a missing column still raises KeyError as before.
on_faults = on_faults.drop(columns = unneeded_columns + mostly_null_columns)

Create Features

In [4]:
# String spellings of booleans and the real boolean values they stand for.
bool_from_text = {'True': True, 'False': False}
# Columns whose 'True'/'False' strings are converted to booleans.
flag_columns = ['CruiseControlActive', 'IgnStatus', 'ParkingBrake']

on_faults = (
    on_faults
    .assign(**{
        # 'month': the '%b'-formatted month of 'EventTimeStamp'.
        'month': lambda df: df['EventTimeStamp'].dt.strftime('%b'),
        # 'spn-fmi': the 'spn' and 'fmi' values joined as strings with a hyphen.
        'spn-fmi': lambda df: df['spn'].astype(str) + '-' + df['fmi'].astype(str),
    })
    # Per-column replacement: only the listed flag columns are touched.
    .replace({column: bool_from_text for column in flag_columns})
)



Select Columns and One-Hot Encode Categorical Features

In [5]:
# Subset of on_faults that contains only columns of interest
rolling = on_faults[[ 'EquipmentID', 'EventTimeStamp', 'month', 'spn', 'activeTransitionCount', 'AcceleratorPedal', 
                     'BarometricPressure', 'CruiseControlActive', 'CruiseControlSetSpeed', 'DistanceLtd', 
                     'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm',
                     'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate','IgnStatus', 'IntakeManifoldTemperature', 
                     'LampStatus', 'ParkingBrake', 'Speed',  'Throttle', 'TurboBoostPressure']]

In [6]:
# Encoding columns of interest
too_hot = (pd.get_dummies(data = rolling, columns =['spn', 'month', 'CruiseControlActive',  
                                                       'IgnStatus', 'ParkingBrake'])
           .sort_values(['EquipmentID', 'EventTimeStamp'])
           .reset_index(drop = True)
          )

too_hot

Unnamed: 0,EquipmentID,EventTimeStamp,activeTransitionCount,AcceleratorPedal,BarometricPressure,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,...,month_May,month_Nov,month_Oct,month_Sep,CruiseControlActive_False,CruiseControlActive_True,IgnStatus_False,IgnStatus_True,ParkingBrake_False,ParkingBrake_True
0,301,2015-05-11 13:11:20,127,14.4,13.9200,0.00000,120841.70,185.0,18.0,55.10,...,1,0,0,0,1,0,0,1,1,0
1,301,2015-05-13 08:22:32,3,89.6,14.4275,0.00000,121687.10,186.8,38.0,52.78,...,1,0,0,0,1,0,0,1,1,0
2,301,2015-05-18 09:34:05,6,0.0,14.3550,0.00000,123057.10,185.0,0.0,55.68,...,1,0,0,0,1,0,0,1,1,0
3,301,2015-05-21 13:57:35,127,12.0,14.5725,0.00000,124496.50,183.2,14.0,51.62,...,1,0,0,0,1,0,0,1,1,0
4,301,2015-05-28 13:31:41,4,67.2,13.9200,0.00000,125989.20,183.2,68.0,51.62,...,1,0,0,0,1,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548843,310,2018-09-06 00:34:32,127,0.0,14.2825,64.62260,279964.50,186.8,10.0,22.04,...,0,0,0,1,1,0,0,1,0,1
548844,R1762,2015-02-24 13:45:06,43,0.0,14.0650,66.48672,79796.12,185.0,83.0,42.34,...,0,0,0,0,0,1,0,1,1,0
548845,R1762,2015-02-24 15:31:17,11,0.0,14.1375,66.48672,79913.24,179.6,53.0,42.34,...,0,0,0,0,0,1,0,1,1,0
548846,R1762,2015-02-26 13:12:11,1,0.0,14.2100,66.48672,80838.70,134.6,11.0,38.86,...,0,0,0,0,1,0,0,1,1,0


Create 1-Day Rolling Window Sums per Equipment

In [7]:
# Sort the DataFrame by EventTimeStamp
too_hot = too_hot.sort_values(by='EventTimeStamp')

hot_rolls = (too_hot
                .groupby('EquipmentID')
                .rolling(window = '1D', on = 'EventTimeStamp')
                .sum())

In [8]:
# Keep every row whose one-day rolling sum of 'spn_5246' is NOT greater than 1.
# The mask is built by negation, so rows where the comparison is False (including
# missing values) are kept.
spn_5246_repeated = hot_rolls['spn_5246'].gt(1)
output = hot_rolls.loc[~spn_5246_repeated]

In [9]:
# Display the filtered rolling-sum frame (`output` is the subset of `hot_rolls` kept above).
output

Unnamed: 0_level_0,Unnamed: 1_level_0,EventTimeStamp,AcceleratorPedal,BarometricPressure,CruiseControlActive_False,CruiseControlActive_True,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,...,spn_938,spn_939,spn_94,spn_940,spn_941,spn_95,spn_96,spn_97,spn_976,spn_98
EquipmentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
301,0,2015-05-11 13:11:20,14.4,13.9200,1.0,0.0,0.00000,120841.70,185.0,18.0,55.10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
301,1,2015-05-13 08:22:32,89.6,14.4275,1.0,0.0,0.00000,121687.10,186.8,38.0,52.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
301,2,2015-05-18 09:34:05,0.0,14.3550,1.0,0.0,0.00000,123057.10,185.0,0.0,55.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
301,3,2015-05-21 13:57:35,12.0,14.5725,1.0,0.0,0.00000,124496.50,183.2,14.0,51.62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
301,4,2015-05-28 13:31:41,67.2,13.9200,1.0,0.0,0.00000,125989.20,183.2,68.0,51.62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,548843,2018-09-06 00:34:32,0.0,14.2825,1.0,0.0,64.62260,279964.50,186.8,10.0,22.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
R1762,548844,2015-02-24 13:45:06,0.0,14.0650,0.0,1.0,66.48672,79796.12,185.0,83.0,42.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
R1762,548845,2015-02-24 15:31:17,0.0,28.2025,0.0,2.0,132.97344,159709.36,364.6,136.0,84.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
R1762,548846,2015-02-26 13:12:11,0.0,14.2100,1.0,0.0,66.48672,80838.70,134.6,11.0,38.86,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Pickle the EDA output so that model building can start.

In [10]:
# Persist the filtered rolling-sum frame for the modeling notebooks.
spn_only_path = "../data/spn_only.pkl"
output.to_pickle(spn_only_path)