In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

### Read in the faults data

In [None]:
# Load the raw J1939 fault records, then discard the two columns
# that contain only null values.
faults = (
    pd.read_csv('../data/J1939Faults.csv')
    .drop(columns = ['actionDescription', 'faultValue'])
)

In [None]:
# Display the faults frame as loaded (cell output).
faults

In [None]:
# Column dtypes and non-null counts for the faults frame.
faults.info()

In [None]:
# Parse both timestamp columns into datetime values.
# `assign` overwrites the existing columns in place (same position) and
# returns a new frame, so the cell can be re-run safely.
faults = faults.assign(
    EventTimeStamp = pd.to_datetime(faults['EventTimeStamp']),
    LocationTimeStamp = pd.to_datetime(faults['LocationTimeStamp']),
)

In [None]:
# Derive separate date and time-of-day columns from each timestamp.
# `.dt.normalize()` keeps a datetime value with the time set to midnight;
# `.dt.time` yields plain `datetime.time` objects.
event_ts = faults['EventTimeStamp']
location_ts = faults['LocationTimeStamp']

faults['EventDate'] = event_ts.dt.normalize()
faults['EventTime'] = event_ts.dt.time

faults['LocationDate'] = location_ts.dt.normalize()
faults['LocationTime'] = location_ts.dt.time

faults

### Read in the onboard diagnostic info

In [None]:
# Load the onboard diagnostic readings. The Id column is not meaningful
# for this analysis, so it is dropped straight away.
onboard = (
    pd.read_csv('../data/VehicleDiagnosticOnboardData.csv')
    .drop(columns = 'Id')
)

In [None]:
# Display the onboard frame (still in long form at this point: it is pivoted in a later cell).
onboard

In [None]:
# Column dtypes and non-null counts for the onboard frame.
onboard.info()

In [None]:
# Reshape from long to wide: one row per FaultId, one column per distinct
# Name, with the matching Value in each cell.
# Note: this overwrites `onboard`, so re-running the cell without re-reading
# the CSV fails because the Name/Value columns no longer exist.
onboard = (
    onboard
    .pivot(index = ['FaultId'], columns = ['Name'], values = 'Value')
    .reset_index()
)

onboard.info()

### Read in the Service Fault Codes detailed info

In [None]:
# Load the service fault code reference sheet from the Excel workbook.
codes = pd.read_excel('../data/ServiceFaultCodes.xlsx')

In [None]:
# Display the fault code reference table.
codes

### Pull together the faults and onboard data tables

In [None]:
# Attach the pivoted onboard readings to each fault record by matching
# faults.RecordID against onboard.FaultId. The default (inner) join keeps
# only faults that have onboard data.
on_faults = faults.merge(onboard, left_on = 'RecordID', right_on = 'FaultId')

# FaultId duplicates RecordID after the join, so it is redundant.
on_faults = on_faults.drop(columns = 'FaultId')
on_faults

In [None]:
# What is the date range of this data?
# NOTE(review): `datetime_is_numeric` appears to exist only in pandas 1.1-1.5
# (removed in 2.0) - confirm against the pinned pandas version.
event_dates = on_faults['EventDate']
event_dates.describe(datetime_is_numeric=True)

# 3/18/2000 - 3/6/2020, but it looks like we have 3 rows that have a future date attached to them

In [None]:
# Keep only events dated on or before 2020-03-06; anything later is a
# future-dated row and is discarded. The index is renumbered afterwards.
on_or_before_cutoff = on_faults['EventDate'] <= '2020-03-06'
on_faults = on_faults[on_or_before_cutoff].reset_index(drop = True)

In [None]:
# Per the stakeholder, get rid of the rows where the EquipmentID is more
# than 5 characters. IDs are compared as strings so the length check works
# whatever dtype the column was read as.
has_short_id = on_faults['EquipmentID'].astype(str).str.len() <= 5

# Renumber the index after filtering, matching the date filter above, so the
# frame does not carry gaps in its row labels into later cells.
on_faults = on_faults.loc[has_short_id].reset_index(drop = True)

In [None]:
# Look for repeated RecordID values: every occurrence after the first is
# flagged, and the flagged rows are displayed.
repeated_record = on_faults.duplicated(subset = 'RecordID')
on_faults[repeated_record]

# There are none

In [None]:
# Plot total faults by date: count the fault records per EventDate and
# scatter the daily totals.
df = (
    on_faults
    .groupby('EventDate')['RecordID']
    .count()
    .rename('total_faults')
    .reset_index()
)

fig = px.scatter(data_frame=df, x='EventDate', y='total_faults')
fig.show()

In [None]:
# Plot unique trucks having issues by date.
# The aggregate counts distinct EquipmentID values per day, so it is named
# `unique_trucks` (it was previously labelled `total_faults`, which made the
# y-axis read as a fault count).
df = (
    on_faults
    .groupby('EventDate')
    .agg(unique_trucks = ('EquipmentID', 'nunique'))
    .reset_index()
)

fig = px.scatter(df, x='EventDate', y='unique_trucks')
fig.show()

In [None]:
# Inspect the rows whose EventDate falls before 2015-01-01.
on_faults.loc[on_faults['EventDate'] < '2015-01-01']

In [None]:
# Per-truck summary: fault volume, fault variety (distinct spn values),
# first/last fault date and the number of distinct days with a fault.
per_truck = on_faults.groupby('EquipmentID')

truck_overview = per_truck.agg(
    total_faults = pd.NamedAgg(column = 'RecordID', aggfunc = 'count'),
    unique_faults = pd.NamedAgg(column = 'spn', aggfunc = 'nunique'),
    min_date = pd.NamedAgg(column = 'EventDate', aggfunc = 'min'),
    max_date = pd.NamedAgg(column = 'EventDate', aggfunc = 'max'),
    unique_dates = pd.NamedAgg(column = 'EventDate', aggfunc = 'nunique'),
).reset_index()

# Elapsed time between each truck's first and last fault date.
truck_overview = truck_overview.assign(
    timespan = lambda t: t['max_date'] - t['min_date']
)

# NOTE(review): `datetime_is_numeric` appears to exist only in pandas 1.1-1.5
# (removed in 2.0) - confirm against the pinned pandas version.
truck_overview.describe(datetime_is_numeric=True)

In [None]:
# Distribution of total fault count per truck, with a box plot in the margin.
df = truck_overview
fig = px.histogram(
    data_frame=df,
    x="total_faults",
    marginal="box",
    hover_data=df.columns,
)
fig.show()

In [None]:
# Distribution of distinct fault codes (spn) per truck, drawn in red.
df = truck_overview
fig = px.histogram(
    data_frame=df,
    x="unique_faults",
    marginal="box",
    hover_data=df.columns,
)
# update_traces modifies the figure in place and returns it.
fig = fig.update_traces(marker={'color': 'red'})
fig.show()

In [None]:
# Distribution of the number of distinct fault dates per truck, drawn in green.
df = truck_overview
fig = px.histogram(
    data_frame=df,
    x="unique_dates",
    marginal="box",
    hover_data=df.columns,
)
# update_traces modifies the figure in place and returns it.
fig = fig.update_traces(marker={'color': 'green'})
fig.show()

In [None]:
# Distribution of the time between each truck's first and last fault.
# `timespan` is a timedelta column (max_date - min_date). It is converted to
# whole days before plotting so the x-axis reads in days rather than in raw
# timedelta values.
# NOTE(review): plotly is not known to offer a native timedelta axis - confirm
# against the installed plotly version.
# `assign` returns a copy, so `truck_overview` itself is left unchanged.
df = truck_overview.assign(timespan_days = truck_overview['timespan'].dt.days)
fig = px.histogram(df,
                   x="timespan_days",
                   marginal="box",
                   hover_data=df.columns).update_traces(marker=dict(color='yellow'))
fig.show()

### Taking a look at full derates

In [None]:
# Filter down to only include full derates (spn 5246) ...
is_full_derate = on_faults['spn'] == 5246

# ... and filter out faults where the light is going off (keep active == True).
is_active = on_faults['active'] == True

# Both masks are built from `on_faults` and applied to `on_faults` in a single
# step. Previously the second filter indexed `full_derates` with a mask taken
# from `on_faults`, which only worked through implicit index alignment.
full_derates = on_faults.loc[is_full_derate & is_active].reset_index(drop = True)


full_derates

There are 603 instances of full derates that fit the criteria: a truck ID of 5 or fewer characters, and the fault light turning on rather than off.

In [None]:
# For every truck that has had a full derate, count how many distinct
# event timestamps it is associated with, largest count first.
# (Append `.head(12)` to show only the first twelve rows.)
(
    full_derates
    .groupby('EquipmentID')
    .agg(date_count = ('EventTimeStamp', 'nunique'))
    .reset_index()
    .sort_values('date_count', ascending = False)
)

217 trucks experienced full derates, 12 of those trucks experienced 10 or more derates

In [None]:
# Taking a look at the timestamps for the truck with the largest number of unique timestamps.
# The truck is looked up from the data rather than hard-coded (the original
# analysis inspected EquipmentID 1524), so the cell stays correct if the data
# changes and does not depend on the dtype EquipmentID was read as.
timestamps_per_truck = full_derates.groupby('EquipmentID')['EventTimeStamp'].nunique()
top_truck_id = timestamps_per_truck.idxmax()

# Single .loc call (row mask + column) instead of chained indexing.
full_derates.loc[full_derates['EquipmentID'] == top_truck_id, 'EventTimeStamp'].unique()

# It appears that this truck had multiple instances of multiple derates on the same day