In [None]:
import pandas as pd
import numpy as np
import pickle

from sklearn.impute import SimpleImputer

In [None]:
# Load the pickled on_faults DataFrame (path is relative to the notebook's working directory)
on_faults = pd.read_pickle(filepath_or_buffer='../data/on_faults.pkl')

In [None]:
# Keep only the rows whose `active` flag is True, i.e. drop the rows where
# the fault light is being turned off, then renumber the index from 0
is_active = on_faults['active'] == True
on_faults = on_faults[is_active].reset_index(drop=True)

In [None]:
# Columns removed before feature engineering:
#   * FuelTemperature, ServiceDistance and SwitchedBatteryVoltage are more than 50% nulls
#   * the remaining columns are ones that we won't aggregate via rolling window
mostly_null_cols = ['FuelTemperature', 'ServiceDistance', 'SwitchedBatteryVoltage']

not_aggregated_cols = [
    'RecordID', 'ESS_Id', 'active', 'eventDescription', 'MCTNumber',
    'Latitude', 'Longitude', 'EventDate', 'EventTime', 'LocationTimeStamp',
    'LocationDate', 'MonthYear', 'LocationTime', 'dist_A', 'dist_B', 'dist_C',
]

on_faults = on_faults.drop(columns=mostly_null_cols + not_aggregated_cols)

In [None]:
# Fill nulls in the binary Cruise Control and Parking Brake columns
# with each column's most common value
binary_cols = ['CruiseControlActive', 'ParkingBrake']
mode_imputer = SimpleImputer(strategy='most_frequent')
on_faults[binary_cols] = mode_imputer.fit_transform(on_faults[binary_cols])

In [None]:
# Recode the string values 'True'/'False' as 1/0 in the three flag columns
flag_codes = {'True': 1, 'False': 0}
on_faults = on_faults.replace(
    {flag_col: dict(flag_codes)
     for flag_col in ('CruiseControlActive', 'IgnStatus', 'ParkingBrake')}
)

In [None]:
# Downcast the integer-valued columns to int16 to reduce memory use.
# NOTE(review): int16 only holds -32768..32767. The negative SPN codes listed in
# `sum_cols` further down suggest that large `spn` values are wrapping around
# here -- confirm whether `spn` needs a wider dtype.
int_cols = ['ecuSource', 'spn', 'fmi', 'activeTransitionCount', 'IgnStatus', 'CruiseControlActive', 'ParkingBrake']

on_faults = on_faults.astype({int_col: 'int16' for int_col in int_cols})

In [None]:
# Add the abbreviated month name ('%b') of each event, as we suspect
# some seasonal contribution to derates
on_faults = on_faults.assign(month=on_faults['EventTimeStamp'].dt.strftime('%b'))

In [None]:
#on_faults['spn-fmi'] = on_faults['spn'].astype(str) + '-' + on_faults['fmi'].astype(str)

In [None]:
# Clean up entries in the ecuMake column

# Step 1: literal (non-regex) substring repairs, applied in exactly this order
make_repairs = [
    ('?MMNS', 'CMMNS'),
    ('??MNS', 'CMMNS'),
    ('????R', 'PACCR'),
    ('?ACCR', 'PACCR'),
    ('???CR', 'PACCR'),
    ('?CAR', 'PCAR'),
    ('?NDWS', 'BNDWS'),
    ('?ATON', 'EATON'),
]

cleaned_make = on_faults['ecuMake']
for garbled, repaired in make_repairs:
    cleaned_make = cleaned_make.str.replace(garbled, repaired, regex=False)
on_faults['ecuMake'] = cleaned_make

# Step 2: placeholder makes that can only be resolved by looking at the ecuModel
missing_cmmns = ['6X1u13D1500000000', '6X1u17D1500000000']
missing_bndws = ['EC60-adv', 'EC80ESP']

# (placeholder ecuMake, ecuModel values that identify the make, make to assign)
model_rules = [
    ('????S', missing_cmmns, 'CMMNS'),
    ('?????', missing_cmmns, 'CMMNS'),
    ('?????', ['CECU3B-NAMUX4'], 'PACCR'),
    ('?????', missing_bndws, 'BNDWS'),
    ('????S', missing_bndws, 'BNDWS'),
    ('?????', ['0USA13_13_0415_2238A'], 'VOLVO'),
    ('?????', ['EEO-xxF112C'], 'EATON'),
]

for placeholder, models, resolved_make in model_rules:
    needs_fix = (on_faults['ecuMake'] == placeholder) & on_faults['ecuModel'].isin(models)
    on_faults.loc[needs_fix, 'ecuMake'] = resolved_make

In [None]:
# Print column dtypes and non-null counts to check the state of on_faults after the cleaning cells above
on_faults.info()

In [None]:
# Order rows by EquipmentID and then by event time, renumbering the index from 0
sort_keys = ['EquipmentID', 'EventTimeStamp']
on_faults = on_faults.sort_values(by=sort_keys).reset_index(drop=True)

In [None]:
def _fill_within_truck(truck_rows):
    """Forward fill, then back fill, the null values in one truck's rows.

    Rows are expected to already be sorted by time, so the forward fill carries
    the previous reading forward and the back fill then copies the first
    available reading into any leading nulls.
    """
    return truck_rows.ffill().bfill()


# Fill what null values we can using only the same truck's (EquipmentID's) data
on_faults = on_faults.groupby('EquipmentID', group_keys=False).apply(_fill_within_truck)

In [None]:
# If we decide to drop instances when the same truck has the same fault in the same second,
# that would happen here (keeping the last occurrence of each).
# The result is only displayed, not assigned, so on_faults itself is unchanged.
(
    on_faults
    .drop_duplicates(subset=['EquipmentID', 'EventTimeStamp', 'spn', 'fmi'], keep='last')
    .reset_index(drop=True)
)

### Shift to prep for rolling window function

In [None]:
# Pull out just the columns wanted for the rolling function.
# Take an explicit copy: the median-imputation cell further down assigns back
# into roll_prep, and writing into a bare column selection of on_faults
# triggers pandas' SettingWithCopyWarning.
roll_prep = on_faults[[ 'EquipmentID', 'EventTimeStamp', 'ecuMake', 'month', 'spn', 'AcceleratorPedal', 
                     'BarometricPressure', 'CruiseControlActive', 'CruiseControlSetSpeed', 'DistanceLtd', 
                     'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 
                     'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'IgnStatus', 
                     'IntakeManifoldTemperature', 'LampStatus', 'Speed',  'Throttle', 'TurboBoostPressure']].copy()


In [None]:
# Numerical columns whose remaining nulls are replaced with the column's overall median
cols_to_impute = [
    'AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd',
    'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature',
    'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'IntakeManifoldTemperature',
    'Speed', 'Throttle', 'TurboBoostPressure',
]

median_imputer = SimpleImputer(strategy='median')
roll_prep[cols_to_impute] = median_imputer.fit_transform(roll_prep[cols_to_impute])



In [None]:
# Only column with some remaining nulls (fewer than 100 rows) is the ecuMake column
# (see the non-null counts printed by info())
roll_prep.info()

In [None]:
# One-hot encode the categorical features, then re-sort by truck and event time
categorical_cols = ['ecuMake', 'spn', 'month']
roll_prep = pd.get_dummies(data=roll_prep, columns=categorical_cols)
roll_prep = roll_prep.sort_values(['EquipmentID', 'EventTimeStamp']).reset_index(drop=True)

roll_prep.info()

In [None]:
# Column groups that tell the rolling function how to aggregate each column

# SPN codes that have a one-hot `spn_<code>` column; these fault columns are
# summed to get a total number of each fault type
spn_codes = [
    -32642, -30009, -29519, -28271, -26443, -23346, -22448, -19274,
    -18252, -16209, -15183, -13613, -11578, -11058, -9033, -4087,
    -4085, -4017, -4015, -4014, -3990, -3986, -3958, -3875, -3335,
    -3256, -758, -757, -745, -255, -251, -249, -234, -233,
    -217, -1, 0, 16, 27, 33, 37, 38, 51, 70, 74,
    75, 77, 78, 81, 84, 91, 92, 94, 95, 96, 97,
    98, 100, 101, 102, 103, 105, 107, 108, 110, 111,
    116, 118, 125, 127, 153, 157, 158, 160, 167, 168,
    171, 173, 174, 175, 177, 184, 188, 190, 191, 228,
    235, 236, 237, 245, 247, 248, 251, 252, 255, 256,
    411, 412, 441, 442, 444, 512, 522, 525, 558, 560,
    563, 576, 577, 578, 583, 596, 603, 609, 611, 612,
    614, 624, 627, 628, 629, 630, 632, 633, 636, 639,
    641, 647, 649, 651, 652, 653, 654, 655, 656, 677,
    705, 709, 723, 729, 751, 752, 767, 768, 781, 788,
    789, 790, 791, 792, 793, 794, 795, 796, 797, 798,
    799, 800, 801, 802, 803, 805, 806, 807, 810, 811,
    829, 830, 862, 886, 904, 905, 906, 907, 917, 929,
    932, 933, 934, 937, 938, 939, 940, 941, 976, 1023,
    1024, 1028, 1043, 1045, 1056, 1059, 1067, 1068, 1071,
    1072, 1075, 1078, 1081, 1127, 1172, 1176, 1209, 1213,
    1231, 1235, 1236, 1239, 1247, 1279, 1321, 1322, 1323,
    1324, 1325, 1326, 1327, 1328, 1347, 1349, 1464, 1481,
    1482, 1483, 1487, 1569, 1612, 1659, 1668, 1675, 1761,
    1787, 1807, 1808, 1809, 1815, 2000, 2017, 2023, 2029,
    2579, 2623, 2629, 2630, 2659, 2791, 2795, 2863, 2866,
    2912, 2917, 3031, 3058, 3060, 3064, 3216, 3217, 3218,
    3222, 3226, 3227, 3228, 3241, 3242, 3245, 3246, 3249,
    3251, 3253, 3360, 3361, 3362, 3363, 3364, 3464, 3480,
    3482, 3490, 3509, 3510, 3511, 3512, 3513, 3514, 3515,
    3521, 3556, 3583, 3584, 3597, 3605, 3610, 3663, 3695,
    3696, 3697, 3698, 3703, 3720, 3821, 3936, 4094, 4095,
    4096, 4219, 4220, 4276, 4331, 4334, 4339, 4340, 4342,
    4344, 4346, 4349, 4354, 4356, 4360, 4363, 4364, 4375,
    4376, 4380, 4382, 4607, 4752, 4765, 4766, 4792, 4794,
    4795, 4796, 4811, 4812, 4813, 5018, 5019, 5024, 5031,
    5052, 5109, 5110, 5111, 5112, 5113, 5114, 5115, 5116,
    5117, 5245, 5246, 5298, 5319, 5321, 5357, 5392, 5394,
    5395, 5396, 5397, 5442, 5443, 5444, 5485, 5491, 5569,
    5571, 5579, 5585, 5614, 5615, 5616, 5625, 5742, 5743,
    5745, 5746, 5835, 5848, 5851, 5853, 5862, 5902, 5903,
    5909, 5939, 5941, 5942, 5953, 6145, 6146, 6147, 6148,
    6713, 6773, 6780, 6802, 7321, 7323, 7360, 7827, 7847,
    7854, 8224, 9295, 10803, 12596, 13600, 17096, 17590, 22585,
    22859, 25780, 29902, 32000,
]
sum_cols = [f'spn_{code}' for code in spn_codes]

# Most diagnostic parameters are averaged over the window
mean_cols = [
    'AcceleratorPedal',
    'BarometricPressure',
    'CruiseControlSetSpeed',
    'EngineCoolantTemperature',
    'EngineLoad',
    'EngineOilPressure',
    'EngineOilTemperature',
    'EngineRpm',
    'FuelLevel',
    'FuelLtd',
    'FuelRate',
    'IntakeManifoldTemperature',
    'Speed',
    'Throttle',
    'TurboBoostPressure',
]

# Counting the timestamps gives the number of timestamps present in the time frame
count_cols = ['EventTimeStamp']

# The max is taken to indicate the month and make, and to get the final value
# for binary parameters and engine time/mileage
ecu_make_dummies = [
    'ecuMake_' + make_label
    for make_label in ('5516014', '?????MX', '?????MX16U13D13', 'BNDWS', 'CMMNS',
                       'EATON', 'PACCR', 'PCAR', 'VOLVO', 'unknown')
]
status_and_lifetime_cols = ['IgnStatus', 'LampStatus', 'CruiseControlActive', 'EngineTimeLtd', 'DistanceLtd']
month_dummies = [
    'month_' + month_abbr
    for month_abbr in ('Apr', 'Aug', 'Dec', 'Feb', 'Jan', 'Jul',
                       'Jun', 'Mar', 'May', 'Nov', 'Oct', 'Sep')
]
max_cols = ecu_make_dummies + status_and_lifetime_cols + month_dummies

In [None]:
# Map every column name to the aggregation it should receive
d1 = {col: 'sum' for col in sum_cols}
d2 = {col: 'mean' for col in mean_cols}
d3 = {col: 'count' for col in count_cols}
d4 = {col: 'max' for col in max_cols}

# pre_roll_dict leaves out the count columns; agg_dict includes them
pre_roll_dict = dict(d1)
pre_roll_dict.update(d2)
pre_roll_dict.update(d4)

agg_dict = dict(d1)
agg_dict.update(d2)
agg_dict.update(d3)
agg_dict.update(d4)

In [None]:
# Faults that occurred in the same truck at the exact same time are grouped
# together into one row, aggregated column by column as directed by pre_roll_dict
same_instant = roll_prep.groupby(['EquipmentID', 'EventTimeStamp'])
roll_prep = same_instant.agg(pre_roll_dict).reset_index()

In [None]:
# Within each truck, aggregate the columns over a 1-day rolling window
# keyed on EventTimeStamp, as directed by agg_dict
per_truck = roll_prep.groupby('EquipmentID')
one_day_window = per_truck.rolling(window='1D', on='EventTimeStamp')
rolling_df = one_day_window.agg(agg_dict)

In [None]:
# The column labelled EventTimeStamp is renamed to faults_in_period,
# then the index is moved back into ordinary columns
rolling_df = (
    rolling_df
    .rename(columns={'EventTimeStamp': 'faults_in_period'})
    .reset_index()
)

In [None]:
# Display the rolling-window feature table for a visual check
rolling_df

In [None]:
# Timestamps of the spn 5246 rows (the rows this notebook treats as derates),
# one row per truck and timestamp.
# Duplicates are dropped because on_faults can hold several rows for the same
# truck and timestamp, while the rolling table has exactly one row per
# (EquipmentID, EventTimeStamp); a duplicated derate row would otherwise
# duplicate rolling rows in the left merge that follows.
derate_times = (
    on_faults
    .loc[on_faults['spn'] == 5246, ['EquipmentID', 'EventTimeStamp']]
    .drop_duplicates()
    .rename(columns = {'EventTimeStamp' : 'derateTime'})
)

In [None]:
# Attach derateTime to the rolling rows whose truck and timestamp match a
# derate; rows with no match keep a null derateTime (left join)
rolling_df1 = rolling_df.merge(
    derate_times[['EquipmentID', 'derateTime']],
    how='left',
    left_on=['EquipmentID', 'EventTimeStamp'],
    right_on=['EquipmentID', 'derateTime'],
)

In [None]:
# Calendar date (without the time of day) of each event
rolling_df1 = rolling_df1.assign(date=rolling_df1['EventTimeStamp'].dt.date)

In [None]:
# Carry each derate time forward within its truck, so every row holds the
# latest derateTime seen at or above it in that truck's rows
derate_time_by_truck = rolling_df1.groupby('EquipmentID')['derateTime']
rolling_df1['derateTime_ff'] = derate_time_by_truck.ffill()

In [None]:
# Carry each derate time backward within its truck, so every row holds the
# next derateTime found at or below it in that truck's rows
truck_derate_times = rolling_df1.groupby('EquipmentID')['derateTime']
rolling_df1['derateTime_bf'] = truck_derate_times.bfill()

In [None]:
# Spot-check the derate helper columns on rows 4440-4499
derate_check_cols = ['EquipmentID', 'date', 'EventTimeStamp', 'spn_5246',
                     'derateTime', 'derateTime_ff', 'derateTime_bf']
rolling_df1[derate_check_cols].iloc[4440:4500]

We will need to calculate both a time-until-next-derate column and a time-since-last-derate column. From there, we need to decide which rows, if any, to throw out when the values in these columns fall within a certain time frame. Or do we just throw out subsequent derates that occur within a certain time period of the first?

We also, more broadly, need to decide how to classify rows that fall just after the first derate of the day. Will these still be considered "hits" for our target variable? How shortly after the first derate should they be?

Target variable hit/miss will be determined by time until (and since?) derate.

In [None]:
# Rows whose rolling spn_5246 value is at most 1 (displayed only, not assigned)
at_most_one_derate = rolling_df1['spn_5246'] <= 1
rolling_df1[at_most_one_derate].reset_index(drop=True)

In [None]:
# Elapsed time between each event and the forward-filled derate time
rolling_df1['timeSinceDerate'] = rolling_df1['EventTimeStamp'].sub(rolling_df1['derateTime_ff'])

In [None]:
# Time remaining between each event and the back-filled derate time
rolling_df1['timeUntilDerate'] = rolling_df1['derateTime_bf'].sub(rolling_df1['EventTimeStamp'])

In [None]:
# Drop the helper columns that were only needed to derive the time-to/from-derate columns
derate_helper_cols = ['derateTime', 'date', 'derateTime_bf', 'derateTime_ff']
rolling_df2 = rolling_df1.drop(columns=derate_helper_cols)

In [None]:
#rolling_df2.to_pickle('../data/model_data.pkl')