In [None]:
import pandas as pd
import numpy as np
import pickle

import plotly.express as px

### Read in the Service Fault Codes detailed info and do some EDA

In [None]:
# Load the service fault code reference sheet from the project's data directory.
codes = pd.read_excel('../data/ServiceFaultCodes.xlsx')

In [None]:
# Display the loaded fault-code table as this cell's output.
codes

In [None]:
# List the column labels available in the fault-code table.
codes.columns

In [None]:
# Look at codes relating to exhaust.
# regex=False matches the keyword literally; na=False treats rows with a missing
# description as non-matches so the mask stays strictly boolean for .loc.
codes.loc[codes['Cummins Description'].str.contains('Exhaust', regex=False, na=False)]

# there are 915(!) of them

In [None]:
# Look at codes relating to fuel.
# regex=False matches the keyword literally; na=False treats rows with a missing
# description as non-matches so the mask stays strictly boolean for .loc.
codes.loc[codes['Cummins Description'].str.contains('Fuel', regex=False, na=False)]

# there are 574 of them

In [None]:
# Look at codes relating to fuel filter.
# regex=False matches the phrase literally; na=False treats rows with a missing
# description as non-matches so the mask stays strictly boolean for .loc.
codes.loc[codes['Cummins Description'].str.contains('Fuel Filter', regex=False, na=False)]

# there are 15 of them

### Read in the pickle file of combined and filtered data and do some EDA

In [None]:
# Load the combined and filtered fault records from the project's pickle file.
# NOTE(review): unpickling executes whatever the file contains, so only load
# a pickle that was produced by a trusted step of this project.
on_faults = pd.read_pickle('../data/on_faults.pkl')

In [None]:
# Print the column dtypes and non-null counts of the fault records.
on_faults.info()

#### Looking at fault events over time

In [None]:
# Count the fault records logged on each event date, then scatter the daily totals
df = (
    on_faults
    .groupby('EventDate')['RecordID']
    .count()
    .rename('total_faults')
    .reset_index()
)

fig = px.scatter(data_frame=df, x='EventDate', y='total_faults')
fig.show()

In [None]:
# Plot the number of distinct trucks reporting at least one fault on each date.
# The aggregate is labelled 'unique_trucks' because it counts distinct
# EquipmentID values (not fault records), so the y-axis describes what is plotted.
df = (
    on_faults
    .groupby('EventDate')
    .agg(unique_trucks=('EquipmentID', 'nunique'))
    .reset_index()
)

fig = px.scatter(df, x='EventDate', y='unique_trucks')
fig.show()

#### Now doing some aggregations by fault type

In [None]:
# List the column labels of the fault records before aggregating by fault type.
on_faults.columns

In [None]:
# Summarise every (spn, fmi) combination in on_faults: record count, number of
# distinct trucks, first/last event date, number of distinct dates, and the
# time between the first and last date.
fault_overview = (
    on_faults
    .groupby(['spn', 'fmi'])
    .agg(total_occurrences=('RecordID', 'count'),
         unique_trucks=('EquipmentID', 'nunique'),
         min_date=('EventDate', 'min'),
         max_date=('EventDate', 'max'),
         unique_dates=('EventDate', 'nunique'))
    .reset_index()
    .assign(timespan=lambda faults: faults['max_date'] - faults['min_date'])
)

fault_summary_cols = fault_overview[['total_occurrences', 'unique_trucks',
                                     'min_date', 'max_date',
                                     'unique_dates', 'timespan']]

# `datetime_is_numeric` only exists in pandas 1.1-1.5. pandas 2.x removed the
# keyword (datetime columns are always summarised numerically there), so fall
# back to the bare call when the keyword is rejected.
try:
    fault_summary = fault_summary_cols.describe(datetime_is_numeric=True)
except TypeError:
    fault_summary = fault_summary_cols.describe()

fault_summary

In [None]:
# Five (spn, fmi) combinations with the most recorded occurrences
(
    fault_overview
    .sort_values(by='total_occurrences', ascending=False)
    .iloc[:5]
)

In [None]:
# Rows belonging to either of the two fault combinations under inspection
is_spn929_fmi9 = (on_faults['spn'] == 929) & (on_faults['fmi'] == 9)
is_spn111_fmi17 = (on_faults['spn'] == 111) & (on_faults['fmi'] == 17)

on_faults.loc[is_spn929_fmi9 | is_spn111_fmi17]

# There are 2 faults that account for nearly half of the data set

In [None]:
# Drop the two dominant fault combinations (spn 929 / fmi 9 and spn 111 / fmi 17),
# since they are likely just to cause a lot of "noise" in the analysis
noisy_929_9 = (on_faults['spn'] == 929) & (on_faults['fmi'] == 9)
noisy_111_17 = (on_faults['spn'] == 111) & (on_faults['fmi'] == 17)

final_faults = (
    on_faults
    .loc[~(noisy_929_9 | noisy_111_17)]
    .reset_index(drop=True)
)

In [None]:
# Print the column dtypes and non-null counts of the records left after filtering.
final_faults.info()

In [None]:
# Redo the faults overview on final_faults: per (spn, fmi) combination, the
# record count, number of distinct trucks, first/last event date, number of
# distinct dates, and the time between the first and last date.
fault_overview = (
    final_faults
    .groupby(['spn', 'fmi'])
    .agg(total_occurrences=('RecordID', 'count'),
         unique_trucks=('EquipmentID', 'nunique'),
         min_date=('EventDate', 'min'),
         max_date=('EventDate', 'max'),
         unique_dates=('EventDate', 'nunique'))
    .reset_index()
    .assign(timespan=lambda faults: faults['max_date'] - faults['min_date'])
)

fault_summary_cols = fault_overview[['total_occurrences', 'unique_trucks',
                                     'min_date', 'max_date',
                                     'unique_dates', 'timespan']]

# `datetime_is_numeric` only exists in pandas 1.1-1.5. pandas 2.x removed the
# keyword (datetime columns are always summarised numerically there), so fall
# back to the bare call when the keyword is rejected.
try:
    fault_summary = fault_summary_cols.describe(datetime_is_numeric=True)
except TypeError:
    fault_summary = fault_summary_cols.describe()

fault_summary

#### Doing some aggregations by truck ID

In [None]:
# Per-truck summary of final_faults: record count, distinct fault codes,
# first/last event date, number of distinct dates, and the time between the
# first and last date.
truck_overview = (
    final_faults
    .groupby('EquipmentID')
    .agg(total_faults=('RecordID', 'count'),
         # NOTE(review): counts distinct spn values only, whereas the fault
         # overview groups by (spn, fmi) pairs - confirm this is intended.
         unique_faults=('spn', 'nunique'),
         min_date=('EventDate', 'min'),
         max_date=('EventDate', 'max'),
         unique_dates=('EventDate', 'nunique'))
    .reset_index()
    .assign(timespan=lambda trucks: trucks['max_date'] - trucks['min_date'])
)

# `datetime_is_numeric` only exists in pandas 1.1-1.5. pandas 2.x removed the
# keyword (datetime columns are always summarised numerically there), so fall
# back to the bare call when the keyword is rejected.
try:
    truck_summary = truck_overview.describe(datetime_is_numeric=True)
except TypeError:
    truck_summary = truck_overview.describe()

truck_summary

In [None]:
# Distribution of total fault records per truck, with a box plot in the margin
df = truck_overview
fig = px.histogram(
    data_frame=df,
    x='total_faults',
    marginal='box',
    hover_data=df.columns,
)
fig.show()

In [None]:
# Distribution of distinct fault codes per truck, drawn in red
df = truck_overview
fig = px.histogram(
    data_frame=df,
    x='unique_faults',
    marginal='box',
    hover_data=df.columns,
)
# update_traces modifies the figure in place, so no reassignment is needed
fig.update_traces(marker={'color': 'red'})
fig.show()

In [None]:
# Distribution of distinct fault dates per truck, drawn in green
df = truck_overview
fig = px.histogram(
    data_frame=df,
    x='unique_dates',
    marginal='box',
    hover_data=df.columns,
)
# update_traces modifies the figure in place, so no reassignment is needed
fig.update_traces(marker={'color': 'green'})
fig.show()

In [None]:
# Distribution of the first-to-last fault timespan per truck, drawn in yellow
# NOTE(review): 'timespan' is a difference of two date columns - confirm the
# histogram axis renders it in a readable unit.
df = truck_overview
fig = px.histogram(
    data_frame=df,
    x='timespan',
    marginal='box',
    hover_data=df.columns,
)
# update_traces modifies the figure in place, so no reassignment is needed
fig.update_traces(marker={'color': 'yellow'})
fig.show()

### Taking a look at full derates

In [None]:
# Filter down to only include full derates (spn 5246) whose `active` flag is
# True, i.e. drop the records where the light is going off.
# Both conditions are evaluated on on_faults and combined into a single mask,
# so the filter never indexes one frame with a boolean Series built from another.
is_full_derate = on_faults['spn'] == 5246
# .eq(True) rather than using the column directly, so a non-bool dtype in
# `active` still produces a clean boolean mask.
is_active = on_faults['active'].eq(True)

full_derates = on_faults.loc[is_full_derate & is_active].reset_index(drop=True)

full_derates

There are 493 instances of full derates that fit all the filtering criteria.

In [None]:
# For each truck that has had a full derate, count the unique timestamps it is
# associated with and list the trucks from most to fewest.
# The 'date_count' column holds the number of distinct EventTimeStamp values.
(
    full_derates
    .groupby('EquipmentID')
    .agg(date_count=('EventTimeStamp', 'nunique'))
    .reset_index()
    .sort_values(by='date_count', ascending=False)
)

194 trucks experienced full derates; 8 of those trucks experienced 10 or more derates.

In [None]:
# Full-derate rows for truck 1539, the truck with the largest number of unique timestamps
is_truck_1539 = full_derates['EquipmentID'] == 1539
full_derates.loc[is_truck_1539]

# It appears that this truck had multiple instances of multiple derates on the same day