In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import timedelta
import numpy as np
import time
import io

In [2]:
# Load the raw inputs from the shared data directory.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk (avoids mixed-dtype columns in the fault log).
faults = pd.read_csv("../data/J1939Faults.csv", low_memory = False)
# Long-format diagnostics: pivoted below on FaultId / Name / Value.
diagnostics = pd.read_csv("../data/VehicleDiagnosticOnboardData.csv")
# NOTE(review): `sfc` is not referenced in any cell visible here — confirm it is still needed.
sfc = pd.read_excel("../data/Service Fault Codes_1_0_0_167.xlsx")

  warn(msg)


In [3]:
# Show every column when a DataFrame is displayed (no "..." truncation of wide frames).
pd.set_option('display.max_columns', None)

In [4]:
# Strip out columns, then rows, in which every single value is missing.
faults_prepped = (
    faults
    .dropna(how='all', axis='columns')
    .dropna(how='all', axis='index')
)

In [5]:
# (latitude, longitude) pairs in decimal degrees. Fault records logged within
# `threshold` of any of these points are removed further down in this cell.
# NOTE(review): presumably service/maintenance sites — confirm with the data owner.
exclude_coords = [
    (36.0666667, -86.4347222),
    (35.5883333, -86.4438888),
    (36.1950, -83.174722)
]
def haversine(lat1, lon1, lat2, lon2, radius=3958.7):
    """Great-circle distance between two points given in decimal degrees.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees; broadcast
        against the first point(s).
    radius : float, default 3958.7
        Radius of the sphere. The result is expressed in the same unit as
        ``radius``; the default is the Earth's mean radius in **miles**
        (use 6371.0 for kilometres).

    Returns
    -------
    float or array-like
        Arc length along the sphere, in the unit of ``radius``. NaN wherever
        an input coordinate is NaN.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    # Haversine term: squared half-chord length between the two points.
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Rounding can push `a` marginally above 1 for near-antipodal points, which
    # would make sqrt(1 - a) NaN; clamp it (NaN inputs still propagate).
    a = np.minimum(a, 1.0)
    # Central angle between the two points, in radians.
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return radius * c
# haversine() works with an Earth radius of 3958.7 *miles*, so every distance
# below -- and therefore this threshold -- is in miles (0.5 mi is about 0.8 km).
# NOTE(review): this value was previously described as km / "about 3.1 miles";
# confirm the intended exclusion radius.
threshold = 0.5
# One row per excluded coordinate, one column per fault record.
distances = np.array([
    haversine(faults_prepped['Latitude'], faults_prepped['Longitude'], lat, lon)
    for lat, lon in exclude_coords
])
# True where a record is farther than `threshold` from every excluded point.
# Records with a missing Latitude/Longitude produce NaN distances; NaN > threshold
# is False, so those records are removed as well.
outside_vicinity = (distances > threshold).all(axis=0)
# .copy() makes the result an independent frame, so later column assignments on
# faults_prepped do not trigger SettingWithCopyWarning.
faults_prepped = faults_prepped[outside_vicinity].copy()

In [6]:
# Reshape the long-format diagnostics to wide format: one row per FaultId, one
# column per diagnostic Name, each cell holding the reported Value.
# pivot() raises ValueError if any (FaultId, Name) pair occurs more than once.
diagnostics_pivot = diagnostics.pivot(index=['FaultId'], columns='Name', values='Value')

In [7]:
# Sanity check: (number of FaultIds, number of distinct diagnostic names).
diagnostics_pivot.shape

(1187335, 24)

In [8]:
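# Disabled: diagnostics_pivot has only 24 columns (see the shape above), so
# thresh=30 could never be satisfied and every row would be dropped.
#diagnostics_pivot = diagnostics_pivot.dropna(thresh=30)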

In [9]:
# Attach the diagnostic readings to their fault record: 'FaultId' (the index of
# diagnostics_pivot) is matched against the RecordID column of faults_prepped.
# The inner join keeps only ids present in both frames, so diagnostics belonging
# to faults removed by the location filter are dropped here.
data = diagnostics_pivot.merge(faults_prepped, left_on = 'FaultId', right_on = 'RecordID', how = 'inner')

In [10]:
# Inspect dtypes and non-null counts of the merged frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057049 entries, 0 to 1057048
Data columns (total 42 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   AcceleratorPedal           480660 non-null   object 
 1   BarometricPressure         530902 non-null   object 
 2   CruiseControlActive        520765 non-null   object 
 3   CruiseControlSetSpeed      521823 non-null   object 
 4   DistanceLtd                530760 non-null   object 
 5   EngineCoolantTemperature   530890 non-null   object 
 6   EngineLoad                 530421 non-null   object 
 7   EngineOilPressure          531008 non-null   object 
 8   EngineOilTemperature       529370 non-null   object 
 9   EngineRpm                  531324 non-null   object 
 10  EngineTimeLtd              527047 non-null   object 
 11  FuelLevel                  455471 non-null   object 
 12  FuelLtd                    530354 non-null   object 
 13  FuelRate    

In [11]:
# Diagnostic columns holding numbers stored as text (object dtype after the
# pivot); they are converted to float64 in the next cell.
# NOTE(review): ServiceDistance and SwitchedBatteryVoltage are not listed here --
# confirm whether they should be converted as well.
columns_to_change = ['AcceleratorPedal', 
                     'BarometricPressure',
                     'CruiseControlSetSpeed',
                     'DistanceLtd',
                     'EngineCoolantTemperature',
                     'EngineLoad',
                     'EngineOilPressure',
                     'EngineOilTemperature',
                     'EngineRpm',
                     'EngineTimeLtd',
                     'FuelLevel',
                     'FuelLtd',
                     'FuelRate',
                     'FuelTemperature',
                     'IntakeManifoldTemperature',
                     'Speed',
                     'TurboBoostPressure',
                     'Throttle'
                    ]

In [12]:
def _to_float(text_column):
    """Swap decimal commas for decimal points, then cast the column to float64."""
    return text_column.str.replace(',', '.').astype(np.float64)


# Convert each text-encoded numeric column, one column at a time.
for numeric_name in columns_to_change:
    data[numeric_name] = _to_float(data[numeric_name])

In [13]:
# Identifier-like columns that should be handled as labels, not as quantities.
columns_to_object = ['ecuSource',
                     'MCTNumber',
                     'RecordID',
                     'ESS_Id',
                     'LampStatus'
                    ]

# Cast each column to object dtype.
# The cast used to be followed by .apply() with a lambda that returned every
# value unchanged. Series.apply re-infers the dtype of its result, so integer
# columns were silently converted straight back to int64 and the cell had no
# effect on them. A plain astype(object) is what actually sticks.
for column in columns_to_object:
    data[column] = data[column].astype(object)

In [14]:
# spn and fmi are compared and concatenated as text later on, so store them as str.
to_string = ['spn', 'fmi']

data = data.astype({code_column: 'str' for code_column in to_string})

In [15]:
# Composite fault key of the form "<spn>_<fmi>" (both parts are already str).
data['spn_fmi'] = data['spn'] + '_' + data['fmi']

In [16]:
# Parse both timestamp columns from text into datetime64 values.
for timestamp_column in ('LocationTimeStamp', 'EventTimeStamp'):
    data[timestamp_column] = pd.to_datetime(data[timestamp_column])

In [35]:
# Timestamp of each derate event (spn 5246); NaT on every other row.
derate_mask = data['spn'] == '5246'
data['time_derate'] = data['EventTimeStamp'].where(derate_mask)

# Chronological order, so that a backward fill means "next derate in time".
data = data.sort_values('EventTimeStamp')

# For every record: time of the next derate of the same EquipmentID
# (NaT when no derate follows), and how far away that derate is.
next_derate = data.groupby('EquipmentID')['time_derate'].bfill()
data['time_derate'] = next_derate
data['time_until_derate'] = next_derate - data['EventTimeStamp']

# Positive class: the record falls strictly between 2 and 24 hours before a derate.
lead_time = data['time_until_derate']
data['target'] = (lead_time > '2 hour') & (lead_time < '24 hour')

In [36]:
# Re-check dtypes and non-null counts now that the derate columns exist.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 530965 entries, 1027932 to 1057047
Data columns (total 46 columns):
 #   Column                     Non-Null Count   Dtype          
---  ------                     --------------   -----          
 0   AcceleratorPedal           480317 non-null  float64        
 1   BarometricPressure         530749 non-null  float64        
 2   CruiseControlActive        520744 non-null  object         
 3   CruiseControlSetSpeed      521575 non-null  float64        
 4   DistanceLtd                530506 non-null  float64        
 5   EngineCoolantTemperature   530695 non-null  float64        
 6   EngineLoad                 530246 non-null  float64        
 7   EngineOilPressure          530818 non-null  float64        
 8   EngineOilTemperature       529334 non-null  float64        
 9   EngineRpm                  530959 non-null  float64        
 10  EngineTimeLtd              526450 non-null  float64        
 11  FuelLevel                  454988 non

In [45]:
# Distinct derate records (spn 5246).
data[data['spn'].eq('5246')].drop_duplicates()

Unnamed: 0,AcceleratorPedal,BarometricPressure,CruiseControlActive,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,EngineRpm,EngineTimeLtd,FuelLevel,FuelLtd,FuelRate,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp,spn_fmi,time_derate,time_until_derate,target
1048749,0.0,14.4275,False,0.00000,216319.8,120.2,0.0,0.00,116.48750,0.000,4829.05,93.6,30812.896059,0.000000,,True,93.2,6143,True,,0.000000,,100.0,0.00,1237840,119019218,2011-01-01 00:03:22,,04384413*22115617*090617144354*60701715*G1*BGT*,80035307,6X1u17D1500000000,CMMNS,0,5246,16,True,1,2175,105465629,33.259027,-84.096666,2020-01-28 05:15:35,5246_16,2011-01-01 00:03:22,0 days,False
1048750,0.0,14.4275,False,0.00000,216319.8,120.2,0.0,0.00,116.48750,0.000,4829.05,93.6,30812.896059,0.000000,,True,93.2,6143,True,,0.000000,,100.0,0.00,1237841,119019219,2011-01-01 00:03:23,,04384413*22115617*090617144354*60701715*G1*BGT*,80035307,6X1u17D1500000000,CMMNS,0,5246,0,True,1,2175,105465629,33.259027,-84.096666,2020-01-28 05:15:35,5246_0,2011-01-01 00:03:23,0 days,False
961200,100.0,14.4275,False,64.00124,505000.5,177.8,7.0,38.86,184.15630,1012.625,10191.40,73.6,68730.171113,0.964231,32.0,True,96.8,22527,False,,9.689507,,100.0,1.45,1120933,77308358,2011-01-01 00:03:34,,05317106*04101156*020516150530*09400053*G1*BDR*,79743733,6X1u13D1500000000,CMMNS,0,5246,0,True,1,1751,105411909,38.344490,-85.711712,2019-02-13 18:30:45,5246_0,2011-01-01 00:03:34,0 days,False
336962,0.0,14.2100,False,54.68066,496853.5,120.2,16.0,36.54,134.26250,645.875,9731.35,,72012.905117,0.554763,,True,77.0,22527,True,,0.000000,,,0.00,378085,7367914,2011-01-01 00:04:57,,04993120*00041643*042114185815*07700062*I0*BBZ*,79475366,6X1u10D1500000000,CMMNS,0,5246,0,True,1,1452,105304126,39.616250,-84.231666,2016-02-08 14:17:33,5246_0,2011-01-01 00:04:57,0 days,False
336959,0.0,14.2100,False,54.68066,496853.5,141.8,9.0,0.00,142.36250,0.000,9731.30,,72012.905117,0.000000,,True,102.2,22527,,,0.000000,,,0.00,378082,7367911,2011-01-01 00:16:06,,04993120*00041643*042114185815*07700062*I0*BBZ*,79475366,6X1u10D1500000000,CMMNS,0,5246,0,True,1,1452,105304126,39.616250,-84.231666,2016-02-08 14:17:33,5246_0,2011-01-01 00:16:06,0 days,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044609,8.0,14.6450,False,0.00000,333667.1,147.2,9.0,61.48,149.00000,941.000,11492.35,49.6,229.433427,1.307656,80.6,True,87.8,255,False,,7.296257,,8.0,2.61,1232200,116794344,2020-01-13 13:18:31,,,,,,49,5246,19,True,46,302,105418777,38.192824,-85.859490,2020-01-13 13:19:06,5246_19,2020-01-13 13:18:31,0 days,False
1049534,0.0,14.2825,False,66.48672,126087.4,179.6,11.0,17.98,223.64380,600.500,2978.40,15.2,16791.303969,0.422677,,True,98.6,18431,False,,0.475737,,100.0,0.29,1238712,119571469,2020-02-03 15:46:46,,04384413*22246857*121817205924*60701721*G1*BGT*,80092582,6X1u17D1500000000,CMMNS,0,5246,16,True,1,2211,105329862,35.833935,-86.410925,2020-02-03 15:47:23,5246_16,2020-02-03 15:46:46,0 days,False
1050256,0.0,13.9200,False,66.48672,434329.4,185.0,12.0,24.36,193.38130,599.500,8664.90,66.4,59349.025288,0.581180,,True,136.4,22527,True,,0.000000,,100.0,0.29,1239555,119959276,2020-02-06 07:45:08,,04358814*06005963*051718174436*09401683*G1*BDR*,79897320,6X1u13D1500000000,CMMNS,0,5246,0,True,1,1854,105385876,35.943611,-83.823009,2020-02-06 07:45:44,5246_0,2020-02-06 07:45:08,0 days,False
1052278,0.0,13.9925,False,57.78752,477486.8,197.6,10.0,21.46,204.96880,599.500,9344.30,72.4,63966.356499,0.462303,,True,120.2,22527,True,,0.000000,,100.0,0.58,1241841,120905759,2020-02-13 13:32:39,,04358814*06026985*051718174436*09401683*G1*BDR*,79903054,6X1u13D1500000000,CMMNS,0,5246,0,True,1,1872,105301976,35.707268,-81.397037,2020-02-13 13:33:15,5246_0,2020-02-13 13:32:39,0 days,False


In [44]:
# Distinct records labelled as positive ('target' is a boolean column).
data[data['target']].drop_duplicates()

Unnamed: 0,AcceleratorPedal,BarometricPressure,CruiseControlActive,CruiseControlSetSpeed,DistanceLtd,EngineCoolantTemperature,EngineLoad,EngineOilPressure,EngineOilTemperature,EngineRpm,EngineTimeLtd,FuelLevel,FuelLtd,FuelRate,FuelTemperature,IgnStatus,IntakeManifoldTemperature,LampStatus,ParkingBrake,ServiceDistance,Speed,SwitchedBatteryVoltage,Throttle,TurboBoostPressure,RecordID,ESS_Id,EventTimeStamp,eventDescription,ecuSoftwareVersion,ecuSerialNumber,ecuModel,ecuMake,ecuSource,spn,fmi,active,activeTransitionCount,EquipmentID,MCTNumber,Latitude,Longitude,LocationTimeStamp,spn_fmi,time_derate,time_until_derate,target
5691,,14.2100,False,64.62260,441341.5,204.8,10.0,17.40,215.9937,649.375,9071.80,39.2,69540.650968,0.660432,,True,150.8,17407,,,0.00000,3276.75,,0.58,6274,1078467,2015-02-26 01:18:18,Low Voltage (Aftertreatment 1 Intake NOx),04993120*00021657*082113134117*07700053*I0*BBZ*,79466573,6X1u10D1500000000,CMMNS,0,3216,4,True,1,1419,105355995,41.745370,-84.999444,2015-02-26 01:18:53,3216_4,2015-02-26 22:24:29,0 days 21:06:11,True
5769,,14.4275,False,64.62260,441545.1,186.8,57.0,35.96,220.2688,1386.000,9078.35,68.0,69575.521679,9.497015,,True,84.2,17407,,,65.11776,3276.75,,12.76,6352,1082488,2015-02-26 08:00:14,Low (Severity Low) Engine Coolant Level,04993120*00021657*082113134117*07700053*I0*BBZ*,79466573,6X1u10D1500000000,CMMNS,0,111,17,True,1,1419,105355995,39.496898,-86.017916,2015-02-26 08:00:51,111_17,2015-02-26 22:24:29,0 days 14:24:15,True
5806,,14.5000,False,64.62260,441634.8,141.8,22.0,33.64,153.6125,648.125,9080.00,67.2,69588.466110,2.509642,,True,64.4,17407,,,0.00000,3276.75,,2.32,6389,1086025,2015-02-26 10:44:55,Low (Severity Low) Engine Coolant Level,04993120*00021657*082113134117*07700053*I0*BBZ*,79466573,6X1u10D1500000000,CMMNS,0,111,17,True,1,1419,105355995,38.341018,-85.825324,2015-02-26 10:45:32,111_17,2015-02-26 22:24:29,0 days 11:39:34,True
5858,,14.5725,False,64.62260,441634.8,111.2,30.0,38.86,109.6813,650.625,9080.20,64.0,69588.730282,1.624663,,True,50.0,17407,,,0.00000,3276.75,,2.32,6441,1089722,2015-02-26 13:19:13,Low (Severity Low) Engine Coolant Level,04993120*00021657*082113134117*07700053*I0*BBZ*,79466573,6X1u10D1500000000,CMMNS,0,111,17,True,1,1419,105355995,38.340833,-85.824768,2015-02-26 13:19:49,111_17,2015-02-26 22:24:29,0 days 09:05:16,True
5886,,14.5725,False,64.62260,441652.6,183.2,23.0,35.96,215.8250,1257.250,9081.55,64.4,69593.221207,3.407830,,True,71.6,17407,,,43.33093,3276.75,,3.19,6469,1092590,2015-02-26 15:25:52,Low (Severity Low) Engine Coolant Level,04993120*00021657*082113134117*07700053*I0*BBZ*,79466573,6X1u10D1500000000,CMMNS,0,111,17,True,1,1419,105355995,38.192361,-85.731296,2015-02-26 15:26:28,111_17,2015-02-26 22:24:29,0 days 06:58:37,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1052160,0.0,14.1375,False,57.78752,477435.4,185.0,11.0,23.20,193.8313,598.500,9339.30,86.8,63955.393359,0.501928,,True,69.8,18431,False,,10.83516,,100.0,0.29,1241705,120852584,2020-02-13 08:32:46,Abnormal Update Rate Catalyst Tank Level,04358814*06026985*051718174436*09401683*G1*BDR*,79903054,6X1u13D1500000000,CMMNS,0,1761,9,True,1,1872,105301976,35.303148,-81.613148,2020-02-13 08:33:22,1761_9,2020-02-13 13:32:39,0 days 04:59:53,True
1052162,0.0,14.1375,False,57.78752,477435.4,185.0,11.0,23.20,193.8313,598.500,9339.30,86.8,63955.393359,0.501928,,True,69.8,18431,False,,10.83516,,100.0,0.29,1241707,120852587,2020-02-13 08:32:46,Abnormal Update Rate Catalyst Tank Temperature,04358814*06026985*051718174436*09401683*G1*BDR*,79903054,6X1u13D1500000000,CMMNS,0,3031,9,True,1,1872,105301976,35.303148,-81.613148,2020-02-13 08:33:22,3031_9,2020-02-13 13:32:39,0 days 04:59:53,True
1052163,0.0,14.1375,False,57.78752,477435.4,185.0,11.0,23.20,193.8313,598.500,9339.30,86.8,63955.393359,0.501928,,True,69.8,18431,False,,10.83516,,100.0,0.29,1241708,120852588,2020-02-13 08:32:46,Abnormal Update Rate Aftertreatment 1 Intake NOx,04358814*06026985*051718174436*09401683*G1*BDR*,79903054,6X1u13D1500000000,CMMNS,0,3216,9,True,1,1872,105301976,35.303148,-81.613148,2020-02-13 08:33:22,3216_9,2020-02-13 13:32:39,0 days 04:59:53,True
1052232,68.0,14.0650,False,57.78752,477449.6,185.0,64.0,38.86,220.2688,1351.000,9342.25,80.4,63959.091768,8.189359,,True,78.8,18431,False,,48.69026,,100.0,8.99,1241779,120884619,2020-02-13 11:32:18,,04358814*06026985*051718174436*09401683*G1*BDR*,79903054,6X1u13D1500000000,CMMNS,0,5835,9,True,1,1872,105301976,35.377638,-81.573518,2020-02-13 11:32:54,5835_9,2020-02-13 13:32:39,0 days 02:00:21,True


In [41]:
# Keep only rows with at least this many populated cells.
# NOTE(review): presumably meant to discard fault rows without diagnostic
# readings. The count covers every column present at this point, derived ones
# included, so revisit the value whenever columns are added or removed.
MIN_NON_NULL_VALUES = 32
data = data.dropna(axis='index', thresh=MIN_NON_NULL_VALUES)

In [22]:
# Confirm the row count and non-null counts after dropping sparse rows.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 530965 entries, 1027932 to 1057047
Data columns (total 46 columns):
 #   Column                     Non-Null Count   Dtype          
---  ------                     --------------   -----          
 0   AcceleratorPedal           480317 non-null  float64        
 1   BarometricPressure         530749 non-null  float64        
 2   CruiseControlActive        520744 non-null  object         
 3   CruiseControlSetSpeed      521575 non-null  float64        
 4   DistanceLtd                530506 non-null  float64        
 5   EngineCoolantTemperature   530695 non-null  float64        
 6   EngineLoad                 530246 non-null  float64        
 7   EngineOilPressure          530818 non-null  float64        
 8   EngineOilTemperature       529334 non-null  float64        
 9   EngineRpm                  530959 non-null  float64        
 10  EngineTimeLtd              526450 non-null  float64        
 11  FuelLevel                  454988 non

In [23]:
%%time

# Persist the cleaned, labelled frame; index=False leaves the row index out of the file.
# CSV carries no dtype information: datetime/timedelta columns are written as
# text and must be parsed again when the file is reloaded.
data.to_csv('../data/data_clean_05_13.csv', index=False)

CPU times: total: 26.6 s
Wall time: 26.7 s


In [24]:
# Derate labelling for the fault records alone (no diagnostics attached).
faults_prepped['LocationTimeStamp'] = pd.to_datetime(faults_prepped['LocationTimeStamp'])
faults_prepped['EventTimeStamp'] = pd.to_datetime(faults_prepped['EventTimeStamp'])

# Timestamp of each derate event; NaT on every other row. spn is compared as an
# integer here because the str cast was applied to `data`, not to this frame.
faults_prepped['time_derate'] = faults_prepped.loc[faults_prepped['spn'] == 5246, 'EventTimeStamp']
faults_prepped = faults_prepped.sort_values('EventTimeStamp')

# Back-fill within each EquipmentID only, so every record points at that
# equipment's next derate (NaT when none follows).
# A trailing .ffill() used to follow the grouped bfill(); it ran on the whole,
# ungrouped Series, so records after an equipment's last derate inherited the
# derate time of whichever other equipment preceded them chronologically and
# could be labelled as positive by mistake.
faults_prepped['time_derate'] = faults_prepped.groupby('EquipmentID')['time_derate'].bfill()
faults_prepped['time_until_derate'] = faults_prepped['time_derate'] - faults_prepped['EventTimeStamp']

# Positive class: strictly between 2 and 14 hours before a derate.
# NOTE(review): the equivalent label on `data` uses a 2-24 hour window -- confirm
# which upper bound is intended.
faults_prepped['target'] = (faults_prepped['time_until_derate'] > '2 hour') & (faults_prepped['time_until_derate'] < '14 hour')

In [25]:
# Persist the labelled fault records; index=False leaves the row index out of the file.
faults_prepped.to_csv('../data/faults_prepped.csv', index=False)

In [26]:
%%time

# Reload a previously saved cleaned dataset.
# NOTE(review): this reads data_clean_05_05.csv, whereas the save cell above
# writes data_clean_05_13.csv -- confirm the older file is really the one wanted.
df = pd.read_csv('../data/data_clean_05_05.csv', low_memory=False)

CPU times: total: 12.5 s
Wall time: 12.7 s


In [27]:
# Inspect dtypes and non-null counts of the reloaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1057049 entries, 0 to 1057048
Data columns (total 45 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   AcceleratorPedal           480660 non-null   float64
 1   BarometricPressure         530902 non-null   float64
 2   CruiseControlActive        520765 non-null   object 
 3   CruiseControlSetSpeed      521823 non-null   float64
 4   DistanceLtd                530760 non-null   float64
 5   EngineCoolantTemperature   530890 non-null   float64
 6   EngineLoad                 530421 non-null   float64
 7   EngineOilPressure          531008 non-null   float64
 8   EngineOilTemperature       529370 non-null   float64
 9   EngineRpm                  531324 non-null   float64
 10  EngineTimeLtd              527047 non-null   float64
 11  FuelLevel                  455471 non-null   float64
 12  FuelLtd                    530354 non-null   float64
 13  FuelRate    