In [1]:
import pandas as pd
from geopy.distance import geodesic
import geopandas as gpd
import lightgbm as lgb

Notes:

ECM = Electronic Control Module (monitors engine sensors + operating conditions)
SPN = Suspect Parameter Number, identifies what system or component is having the issue
FMI = Failure Mode Identifier, explains how the system is failing (too high, too low, short circuit, etc.)

active = fault light (True = ON, False = OFF)
Equpment_Id = truck number

75% derate (SPN = 1569, FMI = 31) reduces engine torque by 25%
idle level derate (SPN = 5246) will require a tow

In [3]:
faults = pd.read_csv("../data/J1939Faults.csv", low_memory=False)

In [5]:
faults['EventTimeStamp'] = pd.to_datetime(faults['EventTimeStamp'])

In [7]:
#MH note: ESS_Id, actionDescription, ecuSoftwareVersion, ecuSerialNumber, ecuModel, ecuMake, ecuSource, faultValue, 
#and MCTNumber are unlikely to provide any predictive value.
faults = faults.drop(columns = ['ESS_Id'
                                , 'actionDescription'
                                , 'ecuSoftwareVersion'
#                                , 'ecuSerialNumber'
                                , 'ecuModel'
                                , 'ecuMake'
                                , 'ecuSource'
                                , 'faultValue'
                                , 'MCTNumber'])

In [9]:
diagnostics = pd.read_csv("../data/VehicleDiagnosticOnboardData.csv")

In [11]:
diagnostics = diagnostics.drop(columns = 'Id')

In [13]:
#pivot wider so each column is a status indicator
diagnostics_pivot = diagnostics.pivot(columns = 'Name', index = 'FaultId', values = 'Value').reset_index()
diagnostics_pivot.columns.name = None

In [None]:
#check shape to make sure pivoted correctly
print(diagnostics_pivot.shape)
print(diagnostics['FaultId'].nunique())
print(diagnostics['Name'].nunique())

In [15]:
#merge faults and diagnostics
full_df = pd.merge(faults, diagnostics_pivot, how = 'outer', left_on = 'RecordID', right_on = 'FaultId')
full_df = full_df.drop(columns = 'FaultId')

In [77]:
#check shape to make sure nothing hinky in the merge
print(faults.shape)
print(full_df.shape)

(1187335, 12)
(1187335, 37)


In [83]:
diagnostics_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1187335 entries, 0 to 1187334
Data columns (total 25 columns):
 #   Column                     Non-Null Count    Dtype 
---  ------                     --------------    ----- 
 0   FaultId                    1187335 non-null  int64 
 1   AcceleratorPedal           531889 non-null   object
 2   BarometricPressure         585976 non-null   object
 3   CruiseControlActive        574916 non-null   object
 4   CruiseControlSetSpeed      576458 non-null   object
 5   DistanceLtd                585819 non-null   object
 6   EngineCoolantTemperature   586071 non-null   object
 7   EngineLoad                 585621 non-null   object
 8   EngineOilPressure          586244 non-null   object
 9   EngineOilTemperature       583912 non-null   object
 10  EngineRpm                  586921 non-null   object
 11  EngineTimeLtd              581366 non-null   object
 12  FuelLevel                  502795 non-null   object
 13  FuelLtd                    

In [None]:
#MH note: There are service locations at (36.0666667, -86.4347222), (35.5883333, -86.4438888), and (36.1950, -83.174722), 
#so you should remove any records in the vicinity of these locations, as fault codes may be tripped when working on the vehicles.

#jeff's code (w/ edits):

service_stations = [
    (36.0666667, -86.4347222),
    (35.5883333, -86.4438888),
    (36.1950, -83.174722)
]

threshold_distance = 0.5  

def is_near_service_station(lat, lon):
    point = (lat, lon)
    for station in service_stations:
        distance = geodesic(point, station).miles
        if distance <= threshold_distance:
            return True
    return False

full_df['IsServiceStation'] = full_df.apply(lambda row: is_near_service_station(row['Latitude'], row['Longitude']), axis=1)

In [17]:
#andrew's code (w/ edits): 
#(lightnight fast, noice!)

# label near service stations

stations = pd.DataFrame(
    {
        "lat": [36.0666667, 35.5883333, 36.1950],
        "lon": [-86.4347222, -86.4438888, -83.174722],
    }
)
threshold_miles = 0.5
threshold_meters = threshold_miles * 1609.34
# create geodataframes with geopandas
gdf_full_df = gpd.GeoDataFrame(
    full_df,
    geometry=gpd.points_from_xy(full_df.Latitude, full_df.Longitude),
    crs="EPSG:4326",  # WGS84 coord ref sys (lat/lon)
)
gdf_stations = gpd.GeoDataFrame(
    stations,
    geometry=gpd.points_from_xy(stations.lat, stations.lon),
    crs="EPSG:4326",
)
target_crs = "EPSG:9311"
# reproject onto new crs for better distance measurement
gdf_full_df_proj = gdf_full_df.to_crs(target_crs)
gdf_stations_proj = gdf_stations.to_crs(target_crs)
# create buffers around stations
station_buf = gdf_stations_proj.geometry.buffer(threshold_meters)
combined_buffer = (
    station_buf.union_all()
)  # turns into single geometry which helps with efficiency
is_within = gdf_full_df_proj.geometry.within(combined_buffer)
full_df["nearStation"] = is_within.values

In [19]:
full_df_notservice = full_df[full_df['nearStation'] == False]

In [None]:
full_df_notservice.columns

In [91]:
full_df_notservice.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1058051 entries, 0 to 1187334
Data columns (total 37 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   RecordID                   1058051 non-null  int64         
 1   EventTimeStamp             1058051 non-null  datetime64[ns]
 2   eventDescription           1007093 non-null  object        
 3   ecuSerialNumber            751879 non-null   object        
 4   spn                        1058051 non-null  int64         
 5   fmi                        1058051 non-null  int64         
 6   active                     1058051 non-null  bool          
 7   activeTransitionCount      1058051 non-null  int64         
 8   EquipmentID                1058051 non-null  object        
 9   Latitude                   1058051 non-null  float64       
 10  Longitude                  1058051 non-null  float64       
 11  LocationTimeStamp          1058051 non-nul

In [None]:
full_df_notservice = full_df_notservice.drop(columns = ['ESS_Id'
                                , 'actionDescription'
                                , 'ecuSoftwareVersion'
#                                , 'ecuSerialNumber'
                                , 'ecuModel'
                                , 'ecuMake'
                                , 'ecuSource'
                                , 'faultValue'
                                , 'MCTNumber'])

In [45]:
derates = full_df_notservice[full_df_notservice['spn'] == 5246]
derates = derates.drop(columns = ['RecordID', 'eventDescription', 'spn', 'LocationTimeStamp', #'Latitude', 'Longitude', 
                                  'AcceleratorPedal', 'CruiseControlActive', 'CruiseControlSetSpeed',
                                  'ServiceDistance', 'CruiseControlActive', 'CruiseControlSetSpeed', 'ParkingBrake', 'nearStation'])
derates = derates[derates['active'] == True]
#derates[derates['fmi'] == True]

In [49]:
pre2019 = derates.loc[(derates['EventTimeStamp'] < '2019-01-01')]

In [69]:
pre2019.groupby('EquipmentID')['fmi'].count().sort_values(ascending=False)#.head(5)

EquipmentID
1524    32
1535    23
1525    15
305     14
1539    14
        ..
1575     1
1571     1
1567     1
1564     1
306      1
Name: fmi, Length: 163, dtype: int64

In [89]:
pre2019[pre2019['EquipmentID'] == '1524']#.head(10)

Unnamed: 0,EventTimeStamp,ecuSerialNumber,fmi,active,activeTransitionCount,EquipmentID,Latitude,Longitude,BarometricPressure,DistanceLtd,...,FuelLtd,FuelRate,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
302632,2015-12-10 10:58:47,unknown,16,True,6,1524,36.067731,-86.429305,14.5725,412388.4,...,59929.625,9.312094,183.2,True,86.0,1279,63.5,13.85,,13.34
331209,2016-01-06 14:07:24,,16,True,1,1524,35.873379,-87.859537,14.7175,412663.1,...,59974.375,9.536641,183.2,True,104.0,1279,63.0,13.75,,19.72
331648,2016-01-06 19:45:54,,0,True,1,1524,35.031481,-89.891018,14.79,,...,59991.75,1.083109,,True,122.0,1279,0.0,13.35,,
331682,2016-01-06 20:23:45,,0,True,2,1524,35.031435,-89.891111,14.79,,...,59991.875,0.0,136.4,True,129.2,1279,0.0,12.75,,0.0
344278,2016-01-18 15:57:16,,15,True,1,1524,36.121574,-85.777685,14.5725,415091.0,...,60353.375,15.79754,181.4,True,89.6,1279,63.5,13.85,,28.42
344348,2016-01-18 16:57:14,,16,True,1,1524,35.986342,-85.013148,13.9925,415140.4,...,60360.875,4.041845,212.0,True,161.6,1279,0.0,13.95,,4.93
350087,2016-01-22 15:05:25,,16,True,2,1524,35.270694,-85.88949,13.92,415686.6,...,60451.0,13.84266,213.8,True,123.8,1279,31.0,13.8,,30.16
352286,2016-01-25 13:19:28,,15,True,1,1524,35.557731,-82.548935,13.8475,415968.4,...,60505.375,5.36271,186.8,True,102.2,1279,58.5,13.8,,13.92
353900,2016-01-26 18:21:14,,15,True,2,1524,35.992222,-80.484675,14.4275,416538.1,...,60586.75,17.8845,186.8,True,107.6,1279,60.5,13.85,,31.32
353916,2016-01-26 18:41:52,,16,True,1,1524,35.870462,-80.707962,14.5,416554.3,...,60589.0,0.7000582,185.0,True,140.0,1279,0.0,13.85,,0.0


In [87]:
full_df_notservice.loc[(full_df_notservice['EquipmentID'] == '306') & (full_df_notservice['EventTimeStamp'] > '2018-09-01') & (full_df_notservice['EventTimeStamp'] < '2018-10-10')]

Unnamed: 0,RecordID,EventTimeStamp,eventDescription,ecuSerialNumber,spn,fmi,active,activeTransitionCount,EquipmentID,Latitude,...,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure,nearStation
1014670,1052764,2018-09-04 10:58:10,Data Drifted High Aftertreatment 1 Outlet NOx,79731935,3226,20,True,2,306,35.995879,...,True,114.8,17407,False,,35.00068,,0.0,0.87,False
1014754,1052848,2018-09-04 13:51:55,Incorrect Data J1939 Network #1 Primary Vehicl...,S051400287,639,2,True,127,306,36.010601,...,True,140.0,1279,False,,4.417561,,0.0,2.32,False
1014759,1052853,2018-09-04 13:57:00,Incorrect Data J1939 Network #1 Primary Vehicl...,S051400287,639,2,False,127,306,36.01199,...,,,255,,,,,,,False
1015193,1053287,2018-09-06 06:23:04,Incorrect Data J1939 Network #1 Primary Vehicl...,S051400287,639,2,False,127,306,36.015462,...,,,255,,,,,,,False
1015333,1053427,2018-09-06 10:50:43,Condition Exists NOx limits exceeded due to In...,79731935,4094,31,True,1,306,36.038055,...,True,131.0,2047,False,,64.88474,,0.0,16.24,False
1015526,1053620,2018-09-07 05:45:39,Data Drifted High Aftertreatment 1 Outlet NOx,79731935,3226,20,True,1,306,36.167777,...,True,104.0,17407,False,,65.39931,,30.4,0.87,False
1015528,1053622,2018-09-07 05:48:43,Condition Exists NOx limits exceeded due to In...,79731935,4094,31,True,1,306,36.169907,...,True,116.6,18431,False,,64.7294,,0.0,21.46,False
1015548,1053642,2018-09-07 06:47:05,Condition Exists Engine Protection Torque Derate,79731935,1569,31,True,1,306,36.17324,...,True,145.4,18431,True,,0.0,,0.0,0.58,False
1015659,1053753,2018-09-07 11:22:40,,79731935,5246,0,True,1,306,35.9975,...,True,149.0,22527,False,,0.0,,0.0,0.29,False
1021242,1060237,2018-09-25 12:39:12,Incorrect Data J1939 Network #1 Primary Vehicl...,S051400287,639,2,True,127,306,36.696898,...,True,125.6,1279,False,,3.602011,,0.0,2.03,False


useful codes:

sometimes notice early, sometimes right before derate
1569 / 31 = torque derate (limp mode)
5394 / 17 = aftertreatment diesel exhaust system
4094 / 18 = NOx limits exceeded


When evaluating the performance of your model, assume that the cost associated with a missed full derate is approximately $4000 in towing and repairs, and the cost of a false positive prediction is about $500 due to having the truck off the road and serviced unnecessarily. While high accuracy or F1 is nice, we are most interested here in saving the company money, so the final metric to evaulate your model should be the cost savings.