In [2]:
# imports
from numpy import diag
import pandas as pd
from geopy.distance import geodesic

# Read the two raw CSV exports: the J1939 fault events and the vehicle
# on-board diagnostic data.
# NOTE(review): the saved output shows a warning raised by the faults read
# (its message is cut off) — possibly a mixed-dtype warning; confirm and pin
# dtypes explicitly if so.
faults = pd.read_csv("../data/J1939Faults.csv")
diagnostics = pd.read_csv("../data/VehicleDiagnosticOnboardData.csv")

# Echo the shapes and the (pandas-truncated) frames as a quick sanity check.
print("----Raw Data----")
print(
    f"faults.shape: {faults.shape}",
    f"diagnostics.shape: {diagnostics.shape}",
)
print(
    f"\nfaults: {faults}",
    f"\ndiagnostics: {diagnostics}",
)


  faults = pd.read_csv("../data/J1939Faults.csv")


----Raw Data----
faults.shape: (1187335, 20) diagnostics.shape: (12821626, 4)

faults:          RecordID     ESS_Id           EventTimeStamp  \
0               1     990349  2015-02-21 10:47:13.000   
1               2     990360  2015-02-21 11:34:34.000   
2               3     990364  2015-02-21 11:35:31.000   
3               4     990370  2015-02-21 11:35:33.000   
4               5     990416  2015-02-21 11:39:41.000   
...           ...        ...                      ...   
1187330   1248454  123904424  2020-03-06 14:00:26.000   
1187331   1248455  123905139  2020-03-06 14:04:23.000   
1187332   1248456  123905996  2020-03-06 14:13:38.000   
1187333   1248457  123906113  2020-03-06 14:14:13.000   
1187334   1248458  123906131  2020-03-06 14:15:34.000   

                                          eventDescription  actionDescription  \
0                  Low (Severity Low) Engine Coolant Level                NaN   
1                                                      NaN        

In [3]:
# Drop columns that are not needed for the rest of the analysis.
drop_list = [
    "ESS_Id",
    "actionDescription",
    "ecuSoftwareVersion",
    "ecuSerialNumber",
    "ecuModel",
    "ecuMake",
    "ecuSource",
    "faultValue",
    "LocationTimeStamp",
    "MCTNumber",
]
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
faults = faults.drop(columns=drop_list, errors="ignore")

# Print the shape and the preview separately. Interpolating the tuple
# (faults.head(), faults.shape) into one f-string rendered the frame as a
# tuple repr and broke the column alignment.
print(f"\nfaults.shape: {faults.shape}")
print(f"faults.head():\n{faults.head()}")

# Maximum geodesic distance, in kilometers, at which a fault still counts as
# having been recorded "at" a service station.
threshold_distance = 1.0

# Known service-station locations as (latitude, longitude) pairs.
service_stations = [
    (36.0666667, -86.4347222),
    (35.5883333, -86.4438888),
    (36.195, -83.174722),
]


def is_near_service_station(lat, lon):
    """Return True when (lat, lon) lies within range of any service station.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the point to test.

    Returns
    -------
    bool
        True if the geodesic distance from the point to at least one entry of
        the module-level ``service_stations`` list is at most
        ``threshold_distance`` kilometers. False otherwise, including when
        either coordinate is missing (NaN / None).
    """
    # A point with a missing coordinate cannot be matched to a station, so
    # answer False before handing it to geopy.
    # NOTE(review): recent geopy releases are understood to reject non-finite
    # coordinates with ValueError — confirm for the installed version.
    if pd.isna(lat) or pd.isna(lon):
        return False

    point = (lat, lon)
    # any() stops at the first station that is close enough.
    return any(
        geodesic(point, station).kilometers <= threshold_distance
        for station in service_stations
    )


# Flag every fault that was recorded close to a service station.
# The two coordinate columns are iterated directly instead of using
# DataFrame.apply(axis=1): apply builds a Series object for every row, which is
# needlessly slow on a frame this size. The same function is called with the
# same coordinates, so the resulting values are unchanged.
faults["IsServiceStation"] = [
    is_near_service_station(lat, lon)
    for lat, lon in zip(faults["Latitude"], faults["Longitude"])
]
print(f'\nfaults["IsServiceStation"]: {faults["IsServiceStation"].value_counts(normalize=True)}')  # proportion near service stations.
# Turn the literal strings "TRUE"/"FALSE" in the Value column into real
# booleans; every other value is left as it is.
value_flag_map = {"FALSE": False, "TRUE": True}
diagnostics["Value"] = diagnostics["Value"].replace(value_flag_map)



faults.head(), shape: (   RecordID           EventTimeStamp                         eventDescription  \
0         1  2015-02-21 10:47:13.000  Low (Severity Low) Engine Coolant Level   
1         2  2015-02-21 11:34:34.000                                      NaN   
2         3  2015-02-21 11:35:31.000      Incorrect Data Steering Wheel Angle   
3         4  2015-02-21 11:35:33.000      Incorrect Data Steering Wheel Angle   
4         5  2015-02-21 11:39:41.000                                      NaN   

    spn  fmi  active  activeTransitionCount EquipmentID   Latitude  Longitude  
0   111   17    True                      2        1439  38.857638 -84.626851  
1   629   12    True                    127        1439  38.857638 -84.626851  
2  1807    2   False                    127        1369  41.421250 -87.767361  
3  1807    2    True                    127        1369  41.421018 -87.767361  
4  4364   17   False                      2        1674  38.416481 -89.442638  , (1187335

In [4]:

# Pivot diagnostics from long format (one row per FaultId/Name reading) to
# wide format (one row per FaultId, one column per diagnostic Name).
diagnostics_w = diagnostics.pivot(index="FaultId", columns="Name", values="Value")

# Bring FaultId back as an ordinary column and clear the leftover "Name" label
# on the column axis so the frame looks like a plain table.
# NOTE: `features` is rebound to a list of column names further down in this cell.
features = diagnostics_w.reset_index()
features.columns = features.columns.rename(None)

print("\n----Features after pivoting diagnostics----")
print(f"features.head(): {features.head()}")
# Attach the wide diagnostic readings to each fault. The left join keeps
# faults that have no matching diagnostics row.
combined = pd.merge(
    faults, features, left_on="RecordID", right_on="FaultId", how="left"
)

# Add the full-derate label (spn 5246) BEFORE deriving the filtered frame.
# Previously the filter ran first, so combined_filtered was created without
# the IsDerateFull column.
combined["IsDerateFull"] = combined["spn"] == 5246

# Faults that were NOT recorded near a service station.
combined_filtered = combined[~combined["IsServiceStation"]]

print(f"\ncombined['IsDerateFull']: { combined['IsDerateFull'].value_counts(normalize=True) }")
print("\n----Combined features and faults----")
print(f"combined.head(): {combined.head()}")

# Diagnostic columns to inspect. This rebinds `features` from the pivoted
# DataFrame used above to a plain list of column names.
features = [
    "AcceleratorPedal",
    "DistanceLtd",
    "EngineOilTemperature",
    "TurboBoostPressure",
    "FuelRate",
    "EngineLoad",
    "EngineOilPressure",
    "EngineCoolantTemperature",
    "BarometricPressure",
    "EngineRpm",
    "IntakeManifoldTemperature",
    "FuelTemperature",
    "SwitchedBatteryVoltage",
]
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
combined[features].info()
print(combined[features].head())


----Features after pivoting diagnostics----
features.head():    FaultId AcceleratorPedal BarometricPressure CruiseControlActive  \
0        1                0              14.21               False   
1        2              NaN                NaN                 NaN   
2        3              NaN                NaN                 NaN   
3        4              NaN                NaN                 NaN   
4        5              NaN                NaN                 NaN   

  CruiseControlSetSpeed DistanceLtd EngineCoolantTemperature EngineLoad  \
0              66.48672    423178.7                    100.4         11   
1                   NaN         NaN                      NaN        NaN   
2                   NaN         NaN                      NaN        NaN   
3                   NaN         NaN                      NaN        NaN   
4                   NaN         NaN                      NaN        NaN   

  EngineOilPressure EngineOilTemperature  ... FuelTemperature IgnS