In [159]:
import pandas as pd
from geopy.distance import geodesic
import lightgbm as lgb

Notes:

ECM = Electronic Control Module (monitors engine sensors + operating conditions)
SPN = Suspect Parameter Number, identifies what system or component is having the issue
FMI = Failure Mode Identifier, explains how the system is failing (too high, too low, short circuit, etc.)

active = fault light (True = ON, False = OFF)
EquipmentID = truck number

75% derate (SPN = 1569, FMI = 31) reduces engine torque by 25%
idle level derate (SPN = 5246) will require a tow

In [None]:
# Load the J1939 fault records. low_memory=False makes pandas parse the file in one pass,
# so each column gets a single inferred dtype instead of chunk-by-chunk (possibly mixed) inference.
faults = pd.read_csv(
    "../data/J1939Faults.csv",
    low_memory=False,
)

In [17]:
#MH note: ESS_Id, actionDescription, ecuSoftwareVersion, ecuSerialNumber, ecuModel, ecuMake, ecuSource, faultValue, 
#and MCTNumber are unlikely to provide any predictive value.
non_predictive_cols = [
    'ESS_Id',
    'actionDescription',
    'ecuSoftwareVersion',
    'ecuSerialNumber',
    'ecuModel',
    'ecuMake',
    'ecuSource',
    'faultValue',
    'MCTNumber',
]

# This cell overwrites `faults` with its own output, so a second run would otherwise raise
# KeyError (the columns are already gone). errors='ignore' keeps the cell safely re-runnable.
# Trade-off: a misspelled column name here is skipped silently instead of raising.
faults = faults.drop(columns=non_predictive_cols, errors='ignore')

In [117]:
# Load the onboard diagnostic readings (long format: the FaultId / Name / Value columns
# are pivoted wide further down).
diagnostics = pd.read_csv(
    "../data/VehicleDiagnosticOnboardData.csv",
)

In [119]:
# Drop the 'Id' column, which is not used by the pivot below.
# errors='ignore' keeps this self-overwriting cell re-runnable: on a second run 'Id' is
# already gone and a plain drop would raise KeyError.
diagnostics = diagnostics.drop(columns='Id', errors='ignore')

In [123]:
# Reshape long -> wide: one row per FaultId, one column per diagnostic Name holding its Value.
diagnostics_pivot = (
    diagnostics
    .pivot(index='FaultId', columns='Name', values='Value')
    .reset_index()
    .rename_axis(None, axis='columns')  # clear the leftover 'Name' label on the column index
)

In [167]:
# List the distinct indicator names; each one becomes a column in the pivoted frame.
diagnostics['Name'].unique()

array(['IgnStatus', 'EngineOilPressure', 'EngineOilTemperature',
       'TurboBoostPressure', 'EngineLoad', 'AcceleratorPedal',
       'IntakeManifoldTemperature', 'FuelRate', 'FuelLtd', 'EngineRpm',
       'LampStatus', 'BarometricPressure', 'FuelLevel', 'Speed',
       'EngineTimeLtd', 'CruiseControlSetSpeed', 'CruiseControlActive',
       'EngineCoolantTemperature', 'ParkingBrake',
       'SwitchedBatteryVoltage', 'DistanceLtd', 'Throttle',
       'FuelTemperature', 'ServiceDistance'], dtype=object)

In [133]:
#check shape to make sure pivoted correctly
n_fault_ids = diagnostics['FaultId'].nunique()
n_indicators = diagnostics['Name'].nunique()

print(diagnostics_pivot.shape)
print(n_fault_ids)
print(n_indicators)

# Enforce the invariant instead of relying on eyeballing the printed numbers:
# one row per FaultId, and one column per indicator plus the FaultId key column.
assert diagnostics_pivot.shape == (n_fault_ids, n_indicators + 1), (
    "pivot shape does not match unique FaultId / Name counts"
)

(1187335, 25)
1187335
24


In [257]:
# Attach the wide diagnostics to each fault record (RecordID <-> FaultId), keeping
# unmatched rows from either side, then discard the now-redundant right-hand key.
full_df = (
    faults
    .merge(diagnostics_pivot, how='outer', left_on='RecordID', right_on='FaultId')
    .drop(columns='FaultId')
)

In [259]:
#check shape to make sure nothing hinky in the merge
print(faults.shape)
print(full_df.shape)

# Enforce the check rather than eyeballing it: the outer merge must neither add rows
# (unmatched diagnostics) nor multiply them (duplicate keys).
assert len(full_df) == len(faults), "merge changed the number of fault records"

(1187335, 11)
(1187335, 35)


In [195]:
#MH note: There are service locations at (36.0666667, -86.4347222), (35.5883333, -86.4438888), and (36.1950, -83.174722), 
#so you should remove any records in the vicinity of these locations, as fault codes may be tripped when working on the vehicles.

#jeff's code (w/ edits):

# (latitude, longitude) pairs for the three service locations listed in the MH note above.
service_stations = [
    (36.0666667, -86.4347222),
    (35.5883333, -86.4438888),
    (36.1950, -83.174722)
]

threshold_distance = 0.5  # radius in miles (compared against geodesic(...).miles below)

def is_near_service_station(lat, lon):
    """Return True if (lat, lon) is within `threshold_distance` miles of any service station.

    The distance to each entry of `service_stations` is the geopy geodesic distance
    in miles; evaluation stops at the first station that is close enough.
    """
    here = (lat, lon)
    return any(
        geodesic(here, depot).miles <= threshold_distance
        for depot in service_stations
    )

In [261]:
# Flag every record that was logged within the service-station radius.
# Row-wise DataFrame.apply(axis=1) builds a Series for each of the ~1.2M rows, which is very
# slow; walking the two coordinate columns directly yields the same flags in the same row order
# without that per-row overhead (and also behaves on an empty frame).
full_df['IsServiceStation'] = [
    is_near_service_station(lat, lon)
    for lat, lon in zip(full_df['Latitude'], full_df['Longitude'])
]

In [263]:
# Records logged away from every service station.
full_df_noser = full_df.loc[~full_df['IsServiceStation']]

In [265]:
# Records logged within the service-station radius.
full_df_ser = full_df.loc[full_df['IsServiceStation']]

In [267]:
# Inspect which columns are available on the service-station subset.
full_df_ser.columns

Index(['RecordID', 'EventTimeStamp', 'eventDescription', 'spn', 'fmi',
       'active', 'activeTransitionCount', 'EquipmentID', 'Latitude',
       'Longitude', 'LocationTimeStamp', 'AcceleratorPedal',
       'BarometricPressure', 'CruiseControlActive', 'CruiseControlSetSpeed',
       'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad',
       'EngineOilPressure', 'EngineOilTemperature', 'EngineRpm',
       'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 'FuelRate', 'FuelTemperature',
       'IgnStatus', 'IntakeManifoldTemperature', 'LampStatus', 'ParkingBrake',
       'ServiceDistance', 'Speed', 'SwitchedBatteryVoltage', 'Throttle',
       'TurboBoostPressure', 'IsServiceStation'],
      dtype='object')

In [295]:
# Summarise dtypes and non-null counts of the merged frame.
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1187335 entries, 0 to 1187334
Data columns (total 36 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   RecordID                   1187335 non-null  int64  
 1   EventTimeStamp             1187335 non-null  object 
 2   eventDescription           1126490 non-null  object 
 3   spn                        1187335 non-null  int64  
 4   fmi                        1187335 non-null  int64  
 5   active                     1187335 non-null  bool   
 6   activeTransitionCount      1187335 non-null  int64  
 7   EquipmentID                1187335 non-null  object 
 8   Latitude                   1187335 non-null  float64
 9   Longitude                  1187335 non-null  float64
 10  LocationTimeStamp          1187335 non-null  object 
 11  AcceleratorPedal           531889 non-null   object 
 12  BarometricPressure         585976 non-null   object 
 13  CruiseContro

In [317]:
# Idle-level (full) derates are SPN 5246; keep only the rows where the fault is active,
# taken from the records logged away from service stations.
is_active_full_derate = (full_df_noser['spn'] == 5246) & (full_df_noser['active'] == True)

# Columns not carried forward into the derate table.
dropped_cols = [
    'RecordID', 'eventDescription', 'spn', 'LocationTimeStamp', 'Latitude', 'Longitude',
    'ServiceDistance', 'CruiseControlActive', 'CruiseControlSetSpeed', 'ParkingBrake',
    'IsServiceStation',
]

derates = full_df_noser[is_active_full_derate].drop(columns=dropped_cols)
derates

Unnamed: 0,EventTimeStamp,fmi,active,activeTransitionCount,EquipmentID,AcceleratorPedal,BarometricPressure,DistanceLtd,EngineCoolantTemperature,EngineLoad,...,FuelLtd,FuelRate,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
2089,2015-02-23 05:05:44.000,0,True,1,1630,,,,,,...,33470.466902374,,,False,,22527,,,,
2971,2015-02-23 15:54:22.000,0,True,1,1487,,,,,,...,,,,True,,22527,,,,
5713,2015-02-25 13:53:08.000,0,True,1,1329,,,,,,...,,,,True,,22527,,,,
6534,2015-02-26 22:24:29.000,0,True,1,1419,,14.5,441699.6,185,10,...,69605.769379298,0.6340149,,True,140,22527,0,3276.75,,0.58
6628,2015-02-27 09:09:56.000,0,True,1,1486,,,,,,...,,,,True,,22527,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1177700,2011-01-01 00:03:23.000,0,True,1,2175,0,14.4275,216319.8,120.2,0,...,30812.896059254,0,,True,93.2,6143,0,,100,0
1178571,2020-02-03 15:46:46.000,16,True,1,2211,0,14.2825,126087.4,179.6,11,...,16791.303969224,0.4226766,,True,98.6,18431,0.4757373,,100,0.29
1179414,2020-02-06 07:45:08.000,0,True,1,1854,0,13.92,434329.4,185,12,...,59349.025288346,0.5811803,,True,136.4,22527,0,,100,0.29
1181700,2020-02-13 13:32:39.000,0,True,1,1872,0,13.9925,477486.8,197.6,10,...,63966.356499228,0.4623025,,True,120.2,22527,0,,100,0.58


In [319]:
# Distinct trucks among the non-service-station records, then among the active full derates.
for frame in (full_df_noser, derates):
    print(frame['EquipmentID'].nunique())

1065
193


When evaluating the performance of your model, assume that the cost associated with a missed full derate is approximately $4000 in towing and repairs, and the cost of a false positive prediction is about $500 due to having the truck off the road and serviced unnecessarily. While high accuracy or F1 is nice, we are most interested here in saving the company money, so the final metric to evaluate your model should be the cost savings.