I will be working with fault code data and vehicle onboard diagnostic data to try and predict an upcoming full derate. These are indicated by an SPN 5246.

In [1]:
import pandas as pd
from datetime import datetime
import geopandas as gpd
from geopy.distance import distance
import numpy as np
from sklearn.impute import SimpleImputer

In [2]:
parse_dates=['EventTimeStamp']

In [3]:
faults = pd.read_csv('data/J1939Faults.csv', low_memory=False, parse_dates=['EventTimeStamp'])
service_fault = pd.read_excel('data/ServiceFaultCodes.xlsx')
vehicle_Diagnostics = pd.read_csv('data/VehicleDiagnosticOnboardData.csv')

  for idx, row in parser.parse():


In [4]:
vehicle_Diagnostics = pd.DataFrame(vehicle_Diagnostics)

In [5]:
# filter out rows that have more than 5 characters in the 'EquipmentID' column
faults =faults[faults['EquipmentID'].str.len() <= 5]

Basic EDA to check how the data looks like:

In [6]:
#get rid of faultValue and actionDescription since they haven't been filled in. 
#FaultId = RecordID

In [7]:
faults = faults.drop(columns = ["actionDescription", "faultValue"])

In [8]:
faults.shape
#service_fault.shape
#vehicle_Diagnostics.shape

(1185166, 18)

In [9]:
faults.isna().sum()
#service_fault.isna().sum()
#vehicle_Diagnostics.isna().sum()

RecordID                      0
ESS_Id                        0
EventTimeStamp                0
eventDescription          60366
ecuSoftwareVersion       295827
ecuSerialNumber          342772
ecuModel                  64649
ecuMake                   64649
ecuSource                     0
spn                           0
fmi                           0
active                        0
activeTransitionCount         0
EquipmentID                   0
MCTNumber                     0
Latitude                      0
Longitude                     0
LocationTimeStamp             0
dtype: int64

Remove faults occurring in the vicinity of the service locations at (36.0666667, -86.4347222), (35.5883333, -86.4438888), and (36.1950, -83.174722)

In [10]:
faults = faults[(faults['Latitude'] != '36.0666667') & (faults['Longitude'] != '-86.4347222')]
faults = faults[(faults['Latitude'] != '35.5883333') & (faults['Longitude'] != '-86.4438888')]
faults = faults[(faults['Latitude'] != '36.1950') & (faults['Longitude'] != '-83.174722')]

To filter out the events near the service stations:

In [11]:
for lat, lon in [(36.0666667, -86.4347222), (35.5883333, -86.4438888), (36.1950, -83.174722)]:
    
    faults = faults.loc[~((abs(lat - faults['Latitude']) <= 0.01) &
                          (abs(lon - faults['Longitude']) <= 0.01))]

In [12]:
faults.loc[faults['spn'] == 5246]

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp
2089,2090,1011009,2015-02-23 05:05:44,,05290170*03015749*051914190353*09400015*G1*BDR*,79642446,6X1u13D1500000000,CMMNS,0,5246,0,True,1,1630,105329900,40.733009,-74.087777,2015-02-23 05:08:23.000
2971,2972,1026305,2015-02-23 15:54:22,,unknown,unknown,unknown,unknown,0,5246,0,True,1,1487,105369355,28.077361,-81.897083,2015-02-23 15:54:58.000
5713,5714,1070646,2015-02-25 13:53:08,,unknown,unknown,unknown,unknown,0,5246,0,True,1,1329,105400037,39.399583,-82.974768,2015-02-25 13:56:31.000
5809,5810,1071907,2015-02-25 14:47:00,,unknown,unknown,unknown,unknown,0,5246,0,False,1,1329,105400037,39.399629,-82.974814,2015-02-25 14:46:56.000
6534,6535,1097942,2015-02-26 22:24:29,,04993120*00021657*082113134117*07700053*I0*BBZ*,79466573,6X1u10D1500000000,CMMNS,0,5246,0,True,1,1419,105355995,37.596805,-85.865555,2015-02-26 22:25:05.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1179423,1239564,119961467,2020-02-06 08:03:09,,04358814*06005963*051718174436*09401683*G1*BDR*,79897320,6X1u13D1500000000,CMMNS,0,5246,0,False,1,1854,105385876,35.943472,-83.823240,2020-02-06 08:03:05.000
1181700,1241841,120905759,2020-02-13 13:32:39,,04358814*06026985*051718174436*09401683*G1*BDR*,79903054,6X1u13D1500000000,CMMNS,0,5246,0,True,1,1872,105301976,35.707268,-81.397037,2020-02-13 13:33:15.000
1181717,1241858,120910417,2020-02-13 14:01:40,,04358814*06026985*051718174436*09401683*G1*BDR*,79903054,6X1u13D1500000000,CMMNS,0,5246,0,False,1,1872,105301976,35.708101,-81.395648,2020-02-13 13:59:51.000
1181996,1242137,121038018,2020-02-14 11:21:54,,,,,,49,5246,19,True,88,302,105418777,38.349490,-85.708425,2020-02-14 11:22:30.000


selected unique trucks with partial derate and complete derate (I can change the name, currently I went with ‘total’ derate) and then I compared them.

In [13]:
all_trucks = faults['EquipmentID'].unique()
partial_derate = faults.loc[(faults['spn'] == 1569) & (faults['fmi'] == 31)]['EquipmentID'].unique()
total_derate = faults.loc[faults['spn'] == 5246]['EquipmentID'].unique()

partial_derate_only = partial_derate[np.isin(partial_derate, total_derate, invert=True)]
total_derate_only = total_derate[np.isin(total_derate, partial_derate, invert=True)]
partial_and_total_derate = np.intersect1d(partial_derate, total_derate)
no_derate = all_trucks[np.isin(all_trucks, partial_derate_only, invert=True) | np.isin(all_trucks, total_derate_only, invert=True)]

Dropping dates after 2011 

In [14]:
faults = faults.loc[faults['active'] != False]

In [15]:
faults = faults.loc[faults['EventTimeStamp'].dt.year > 2011]

In [16]:
faults

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp
0,1,990349,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,unknown,unknown,unknown,unknown,0,111,17,True,2,1439,105354361,38.857638,-84.626851,2015-02-21 11:34:25.000
1,2,990360,2015-02-21 11:34:34,,unknown,unknown,unknown,unknown,11,629,12,True,127,1439,105354361,38.857638,-84.626851,2015-02-21 11:35:10.000
3,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,unknown,unknown,unknown,unknown,11,1807,2,True,127,1369,105336226,41.421018,-87.767361,2015-02-21 11:36:08.000
5,6,990431,2015-02-21 11:40:22,Low (Severity Low) Engine Coolant Level,04993120*00025921*082113134117*07700053*I0*BBZ*,79466580,6X1u10D1500000000,CMMNS,0,111,17,True,1,1417,105438630,33.043564,-96.179722,2015-02-21 11:40:59.000
6,7,990439,2015-02-21 11:40:52,Low (Severity Low) Engine Coolant Level,unknown,unknown,unknown,unknown,0,111,17,True,2,1597,105344243,36.902916,-86.436481,2015-02-21 11:41:29.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187324,1248448,123899434,2020-03-06 13:12:43,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,49,96,3,True,126,1936,105355619,30.376851,-81.744953,2020-03-06 13:29:33.000
1187328,1248452,123901805,2020-03-06 13:42:48,Low (Severity Medium) Engine Coolant Level,04358814*06030918*051718174436*09401683*G1*BDR*,79904453,6X1u13D1500000000,CMMNS,0,111,18,True,93,1886,105351219,39.015694,-77.031157,2020-03-06 13:43:24.000
1187331,1248455,123905139,2020-03-06 14:04:23,Condition Exists Engine Protection Torque Derate,04358814*06099720*030816202706*09400153*G1*BDR*,79932020,6X1u13D1500000000,CMMNS,0,1569,31,True,5,1994,105354084,34.390740,-79.461805,2020-03-06 14:04:59.000
1187332,1248456,123905996,2020-03-06 14:13:38,Abnormal Rate of Change Aftertreatment 1 Intak...,05317106*05100987*050719120655*09401585*G1*BDR*,79880653,6X1u13D1500000000,CMMNS,0,3216,10,True,1,1850,105336308,34.430370,-84.920509,2020-03-06 14:14:14.000


- Creating a copy of Faults dataframe to be able to make changes without affecting the original data. 

In [17]:
faults_copy = faults 
Diagnostics = vehicle_Diagnostics

Changing the shape of Diagnostics to be able to merge it. 

In [18]:
# Create a wider table from long table, and drop the Name
Diagnostics = Diagnostics.pivot(index="FaultId", columns="Name", values="Value").reset_index()
Diagnostics

Name,FaultId,AcceleratorPedal,BarometricPressure,CruiseControlActive,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,...,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
0,1,0,14.21,False,66.48672,423178.7,100.4,11,0,96.74375,...,,False,78.8,1023,True,,0,3276.75,,0
1,2,,,,,,,,,,...,,True,,1279,,,,,,
2,3,,,,,,,,,,...,,,,1279,,,,,,
3,4,,,,,,,,,,...,,True,,1279,,,,,,
4,5,,,,,,,,,,...,,,,16639,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187330,1248454,,,,,,,,,,...,,,,1023,,,,,,
1187331,1248455,100,14.5,True,64.6226,423937.9,185,51,37.12,211.4937,...,32,True,98.6,18431,False,,65.01096,,73.2,7.83
1187332,1248456,0,14.355,True,66.48672,465925.4,186.8,62,41.18,212.8438,...,,True,91.4,17407,,,66.5741,,100,6.96
1187333,1248457,1.6,14.4275,False,67.72946,28606.65625,181.4,0,27.26,221.7312,...,,True,100.4,1023,False,,11.84489,14.1,100,1.74


In [19]:
Diagnostics.isna().sum()

Name
FaultId                            0
AcceleratorPedal              655446
BarometricPressure            601359
CruiseControlActive           612419
CruiseControlSetSpeed         610877
DistanceLtd                   601516
EngineCoolantTemperature      601264
EngineLoad                    601714
EngineOilPressure             601091
EngineOilTemperature          603423
EngineRpm                     600414
EngineTimeLtd                 605969
FuelLevel                     684540
FuelLtd                       602140
FuelRate                      602098
FuelTemperature               888225
IgnStatus                     578881
IntakeManifoldTemperature     601044
LampStatus                         0
ParkingBrake                  787363
ServiceDistance              1187120
Speed                         603419
SwitchedBatteryVoltage       1073276
Throttle                      766832
TurboBoostPressure            603984
dtype: int64

In [20]:
#Diagnostics['CruiseControlActive'].astype('bool')
#Diagnostics['IgnStatus'].astype('bool')


#Diagnostics['CruiseControlActive'] = Diagnostics.CruiseControlActive.astype(bool)
#Diagnostics['IgnStatus'] = Diagnostics.IgnStatus.astype(bool)



In [21]:
# convert columns types
#cols = ['AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed', 'DistanceLtd', 
#        'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 'EngineOilTemperature', 
#        'FuelTemperature',
#        'IntakeManifoldTemperature','ServiceDistance', 'Speed', 'SwitchedBatteryVoltage', 
#        'Throttle', 'TurboBoostPressure']

Finding commas in columns using the following code:

In [22]:
#Diagnostics['AcceleratorPedal'] = Diagnostics['AcceleratorPedal'].fillna('')

# Find rows with commas in the 'col1' column
#comma_rows = Diagnostics['AcceleratorPedal'].str.contains(',')

# Print the rows with commas
#print(Diagnostics[comma_rows])

In [23]:
# Remove commas from all 11 columns
for col in Diagnostics.columns[:21]:
    Diagnostics[col] = Diagnostics[col].astype(str).str.replace(',', '')

# Convert all columns to numeric
Diagnostics = Diagnostics.apply(pd.to_numeric, errors='coerce')

Diagnostics

Name,FaultId,AcceleratorPedal,BarometricPressure,CruiseControlActive,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,...,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
0,1,0.0,14.2100,,66.48672,423178.70000,100.4,11.0,0.00,96.74375,...,,,78.8,1023,,,0.00000,3276.75,,0.00
1,2,,,,,,,,,,...,,,,1279,,,,,,
2,3,,,,,,,,,,...,,,,1279,,,,,,
3,4,,,,,,,,,,...,,,,1279,,,,,,
4,5,,,,,,,,,,...,,,,16639,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1187330,1248454,,,,,,,,,,...,,,,1023,,,,,,
1187331,1248455,100.0,14.5000,,64.62260,423937.90000,185.0,51.0,37.12,211.49370,...,32.0,,98.6,18431,,,65.01096,,73.2,7.83
1187332,1248456,0.0,14.3550,,66.48672,465925.40000,186.8,62.0,41.18,212.84380,...,,,91.4,17407,,,66.57410,,100.0,6.96
1187333,1248457,1.6,14.4275,,67.72946,28606.65625,181.4,0.0,27.26,221.73120,...,,,100.4,1023,,,11.84489,14.10,100.0,1.74


In [24]:
Diagnostics = Diagnostics.drop(columns=['CruiseControlActive', 'IgnStatus', 'ParkingBrake'])

The following code will allow the transformation of the nan values to the mean per truck in the columns. 

In [25]:
#imputer =  SimpleImputer(strategy='median', fill_value=None)

#Impute the missing values using the median strategy
#imputed_Diagnostics = imputer.fit_transform(Diagnostics)


#imputed_Diagnostics

In [26]:
#[x for x in Diagnostics.columns if x not in
#    imputer.get_feature_names_out()]

In [27]:
# Convert the imputed array back to dataframe
#copy_imputer = pd.DataFrame(imputed_Diagnostics, columns=Diagnostics.columns)

# Replace NaN values with the median value
#copy_imputer

In [28]:
#Merge faults and vehicle dignostic tables
#faults_copy = pd.merge(faults_copy, copy_imputer, left_on='RecordID', right_on='FaultId')
#faults_copy

Repeating the process to check for better results. 

In [29]:
#The following is a copy to avoid data contamination.
Diagnostics_improved = Diagnostics

In [57]:
#Merge faults and vehicle dignostic tables
faults_improved = faults_copy.merge(Diagnostics_improved, left_on='RecordID', right_on='FaultId')
faults_improved

Unnamed: 0,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,...,FuelLtd,FuelRate,FuelTemperature,IntakeManifoldTemperature,LampStatus,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure
0,1,990349,2015-02-21 10:47:13,Low (Severity Low) Engine Coolant Level,unknown,unknown,unknown,unknown,0,111,...,12300.907429,0.000000,,78.8,1023,,0.000000,3276.75,,0.00
1,2,990360,2015-02-21 11:34:34,,unknown,unknown,unknown,unknown,11,629,...,,,,,1279,,,,,
2,4,990370,2015-02-21 11:35:33,Incorrect Data Steering Wheel Angle,unknown,unknown,unknown,unknown,11,1807,...,,,,,1279,,,,,
3,6,990431,2015-02-21 11:40:22,Low (Severity Low) Engine Coolant Level,04993120*00025921*082113134117*07700053*I0*BBZ*,79466580,6X1u10D1500000000,CMMNS,0,111,...,70349.809964,4.583399,,111.2,1023,,13.602200,3276.75,,6.67
4,7,990439,2015-02-21 11:40:52,Low (Severity Low) Engine Coolant Level,unknown,unknown,unknown,unknown,0,111,...,40961.065437,14.291750,,78.8,1023,,41.534780,3276.75,,20.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
546669,1248448,123899434,2020-03-06 13:12:43,High Voltage (Fuel Level),,,CECU3B-NAMUX4,PACCR,49,96,...,51466.131257,0.620806,,120.2,1279,,0.941766,,100.0,1.16
546670,1248452,123901805,2020-03-06 13:42:48,Low (Severity Medium) Engine Coolant Level,04358814*06030918*051718174436*09401683*G1*BDR*,79904453,6X1u13D1500000000,CMMNS,0,111,...,64491.926797,0.515137,,104.0,2047,,5.932153,,100.0,0.58
546671,1248455,123905139,2020-03-06 14:04:23,Condition Exists Engine Protection Torque Derate,04358814*06099720*030816202706*09400153*G1*BDR*,79932020,6X1u13D1500000000,CMMNS,0,1569,...,58979.184416,7.647805,32.0,98.6,18431,,65.010960,,73.2,7.83
546672,1248456,123905996,2020-03-06 14:13:38,Abnormal Rate of Change Aftertreatment 1 Intak...,05317106*05100987*050719120655*09401585*G1*BDR*,79880653,6X1u13D1500000000,CMMNS,0,3216,...,65080.105870,8.995086,,91.4,17407,,66.574100,,100.0,6.96


In [58]:
grouped = faults_improved['EquipmentID'].reset_index(drop=True)

grouped 

0         1439
1         1439
2         1369
3         1417
4         1597
          ... 
546669    1936
546670    1886
546671    1994
546672    1850
546673    2377
Name: EquipmentID, Length: 546674, dtype: object

In [62]:
# Define the columns to include in the grouping
cols = ['ESS_Id', 'EventTimeStamp','spn', 'fmi', 
        'activeTransitionCount','MCTNumber', 'LocationTimeStamp', 
        'FaultId', 'AcceleratorPedal', 'BarometricPressure', 'CruiseControlSetSpeed',
        'DistanceLtd', 'EngineCoolantTemperature', 'EngineLoad', 'EngineOilPressure', 
        'EngineOilTemperature', 'EngineRpm', 'EngineTimeLtd', 'FuelLevel', 'FuelLtd', 
        'FuelRate', 'FuelTemperature', 'IntakeManifoldTemperature', 'LampStatus', 
        'ServiceDistance', 'Speed', 'SwitchedBatteryVoltage', 'Throttle', 'TurboBoostPressure']

# Creating a new DataFrame to store the imputed values
imputed_values = pd.DataFrame(columns=cols)

# Loop over the rows of the DataFrame
for idx, row in faults_improved.iterrows():

    # Group the current row by EquipmentID and select the columns
    grouped = faults_improved.loc[idx, cols].groupby(faults_improved.loc[idx, 'EquipmentID']).reset_index()
    
    # Impute the missing values using the mean of each group
    group_means = grouped.transform(lambda x: x.mean())
    imputed_row = grouped.transform(lambda x: x.fillna(x.mean()))
    
    # Append the imputed values for the current row to the imputed values DataFrame
    imputed_values = group_means.append(imputed_row, ignore_index=True)
    
# Replace the missing values in the original DataFrame with the imputed values
faults_improved[cols] = imputed_values

KeyError: '1439'

In [69]:
column = 'Speed' 

imputer = SimpleImputer(strategy='mean')

equipment_imputed = faults_improved.groupby('EquipmentID')[column].apply(lambda x: imputer.fit_transform(x.values.reshape(-1, 1)))

equipment_imputed

EquipmentID
1327     [[64.68085], [0.0], [27.56364], [30.2602900000...
1328     [[32.994568], [32.994568], [0.0], [66.51585], ...
1329     [[21.51397357310345], [21.51397357310345], [21...
1330     [[34.05826046666667], [34.05826046666667], [34...
1331     [[65.53524], [59.23415], [40.61667157045394], ...
                               ...                        
308      [[49.09803], [0.0], [0.0], [66.74886], [0.0], ...
309      [[26.31119], [29.461274841417275], [0.0], [29....
310      [[1.339832], [1.339832], [8.2105047035044], [0...
R1762                 [[65.96243], [66.83624], [2.058292]]
R1764                                         [[4.378725]]
Name: Speed, Length: 1042, dtype: object

In [86]:
equipment_imputed.apply(lambda x : x.flatten()).explode()

for Id in equipment_imputed.index :

    faults_improved.loc[faults_improved['EquipmentID']== Id, column]= equipment_imputed.loc[Id].flatten()


In [88]:
faults_improved[column].isna().sum()

0

In [65]:
# Loop over the rows of the DataFrame
#for i in range(len(faults_improved)):
    # Get the current row by row number i
 #   row = faults_improved.iloc[i]

    # Group the current row by EquipmentID and select the desired columns
  #  grouped = row[cols].groupby(row['EquipmentID'])
    
    # Impute the missing values using the mean of each group
   # imputed_values = grouped.apply(lambda x: imputer.fit_transform(x))
    
    # Replace the missing values in the current row with the imputed values
    #faults_improved.loc[i, cols] = imputed_values.unstack()

In [40]:
# Define the columns to impute
#cols = ['IntakeManifoldTemperature', 'FuelTemperature']

# Create a SimpleImputer object with the desired strategy
imputer = SimpleImputer(strategy='mean')

**Rolling window of data - options to use EventTimeStamp to find patterns in the data.**

In [None]:
faults_improved[faults_improved['EquipmentID']== '1327']