In [None]:
import pandas as pd
import numpy as np
import pickle

import plotly.express as px

In [None]:
# Load the pickled on_faults object from the data folder (the path is relative to the
# notebook's working directory). Unpickling can execute arbitrary code, so only load
# pickle files that come from a trusted source.
on_faults = pd.read_pickle('../data/on_faults.pkl')

In [None]:
# Keep only the rows where the fault is flagged active (i.e. discard the rows that
# record a fault light being turned off), then renumber the index from 0.
on_faults = on_faults[on_faults['active'].eq(True)].reset_index(drop=True)

In [None]:
# To help with amount of memory used, shrink the integer columns to the smallest
# integer dtype that can hold every value actually present in each column.
# A blanket astype('int16') is unsafe: values outside -32,768..32,767 wrap around
# silently instead of raising, which would corrupt the codes without warning.
# NOTE(review): spn is the most likely column to exceed the int16 range — confirm
# against the data.
int_cols = ['ecuSource', 'spn', 'fmi', 'activeTransitionCount']

# pd.to_numeric(..., downcast='integer') picks int8/int16/int32/int64 per column and
# never truncates or wraps a value.
on_faults[int_cols] = on_faults[int_cols].apply(pd.to_numeric, downcast='integer')

In [None]:
# Full derates are the fault rows whose spn equals 5246; keep those in their own
# frame with a fresh 0-based index.
full_derates = on_faults[on_faults['spn'].eq(5246)].reset_index(drop=True)

In [None]:
# Create indicator column for trucks that did/did not experience at least one derate.
# A truck counts as 'Derate' when its EquipmentID appears anywhere in full_derates.
had_derate = on_faults['EquipmentID'].isin(full_derates['EquipmentID'])
on_faults['derate'] = had_derate.map({True: 'Derate', False: 'No Derate'})

In [None]:
# Look at instances of faults that occurred in the same second for the same truck.
# keep='last' flags every occurrence except the final one within each
# (EquipmentID, EventTimeStamp) group, so the rows shown are the earlier repeats.
same_second = on_faults.duplicated(subset=['EquipmentID', 'EventTimeStamp'], keep='last')

on_faults[same_second].reset_index(drop=True)

In [None]:
# Clean up entries in the ecuMake column.
#
# Step 1: literal (non-regex) substring replacements for values where '?' stands in
# for some of the characters. The pairs are applied in exactly this order.
literal_fixes = [
    ('?MMNS', 'CMMNS'),
    ('??MNS', 'CMMNS'),
    ('????R', 'PACCR'),
    ('?ACCR', 'PACCR'),
    ('???CR', 'PACCR'),
    ('?CAR', 'PCAR'),
    ('?NDWS', 'BNDWS'),
    ('?ATON', 'EATON'),
]

cleaned_make = on_faults['ecuMake']
for garbled, repaired in literal_fixes:
    cleaned_make = cleaned_make.str.replace(garbled, repaired, regex=False)
on_faults['ecuMake'] = cleaned_make

# Step 2: values that are (almost) entirely '?' cannot be repaired from the text
# itself, so the make is assigned based on the ecuModel recorded on the same row.
missing_cmmns = ['6X1u13D1500000000', '6X1u17D1500000000']
missing_bndws = ['EC60-adv', 'EC80ESP']

# (placeholder currently in ecuMake, ecuModel values it applies to, make to assign)
# Rules run top to bottom, each one against the column as left by the previous rule.
model_based_fixes = [
    ('????S', missing_cmmns, 'CMMNS'),
    ('?????', missing_cmmns, 'CMMNS'),
    ('?????', ['CECU3B-NAMUX4'], 'PACCR'),
    ('?????', missing_bndws, 'BNDWS'),
    ('????S', missing_bndws, 'BNDWS'),
    ('?????', ['0USA13_13_0415_2238A'], 'VOLVO'),
    ('?????', ['EEO-xxF112C'], 'EATON'),
]

for placeholder, models, make in model_based_fixes:
    needs_fix = on_faults['ecuMake'].eq(placeholder) & on_faults['ecuModel'].isin(models)
    on_faults.loc[needs_fix, 'ecuMake'] = make

In [None]:
# Timestamps of the full-derate rows (spn 5246), with EventTimeStamp renamed to
# derateTime. The following cell rebuilds this same frame before using it.
derate_times = (
    on_faults
    .loc[on_faults['spn'] == 5246, ['EquipmentID', 'EventTimeStamp', 'spn']]
    .rename(columns={'EventTimeStamp': 'derateTime'})
)

In [None]:
# For every fault, work out how long it has been since the same truck's previous
# derate and how long it is until that truck's next derate.

# One row per distinct (EquipmentID, derateTime, spn) full-derate event.
derate_times = (
    on_faults
    .loc[on_faults['spn'].eq(5246), ['EquipmentID', 'EventTimeStamp', 'spn']]
    .rename(columns={'EventTimeStamp': 'derateTime'})
    .drop_duplicates()
)

# Left-join so that only the derate rows themselves receive a derateTime; every other
# fault row gets a missing value that is filled in per truck below.
final_df = (
    on_faults
    .merge(
        derate_times,
        how='left',
        left_on=['EquipmentID', 'EventTimeStamp', 'spn'],
        right_on=['EquipmentID', 'derateTime', 'spn'],
    )
    .sort_values(['EquipmentID', 'EventTimeStamp'])
    .reset_index(drop=True)
)

# With rows in time order per truck, a forward fill carries the most recent derate
# time onward and a backward fill pulls the next derate time back.
derate_by_truck = final_df.groupby('EquipmentID')['derateTime']
final_df['derateTime_ff'] = derate_by_truck.ffill()
final_df['derateTime_bf'] = derate_by_truck.bfill()

# Both differences are missing when the truck has no earlier / later derate.
final_df['timeSinceDerate'] = final_df['EventTimeStamp'] - final_df['derateTime_ff']
final_df['timeUntilDerate'] = final_df['derateTime_bf'] - final_df['EventTimeStamp']

In [None]:
# Summarize final_df: column dtypes, non-null counts and memory usage
final_df.info()

In [None]:
# Count how many rows carry each (spn, fmi) combination, most frequent first
final_df[['spn', 'fmi']].value_counts()

In [None]:
# Drop the intermediate derate-time helper columns and the dist_* columns, which are
# not referenced again in this notebook.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a second
# time raises KeyError because the columns are already gone.
final_df = final_df.drop(
    columns=['derateTime', 'derateTime_bf', 'derateTime_ff', 'dist_A', 'dist_B', 'dist_C'],
    errors='ignore',
)

In [None]:
# Make some pickles
#final_df.to_pickle('../data/final_data.pkl')
#truck_derate_status.to_pickle('../data/truck_derate_status.pkl')

In [None]:
# Summary data for trucks that did or did not experience at least one derate:
# one row per (EquipmentID, derate) pair with the number of fault rows, the number of
# distinct spn codes and the number of distinct EventDate values.
overview_spec = {
    'total_faults': ('RecordID', 'count'),
    'unique_faults': ('spn', 'nunique'),
    'unique_dates': ('EventDate', 'nunique'),
}

truck_overview = (
    on_faults
    .groupby(['EquipmentID', 'derate'])
    .agg(**overview_spec)
    .reset_index()
)

truck_overview

### Create visualizations

In [None]:
import plotly.graph_objects as go

In [None]:
# Total faults by derate status: sum the per-truck fault counts within each value of
# the 'derate' column
truck_overview.groupby('derate')['total_faults'].sum()

In [None]:
# Create column to indicate whether a derate was the first for a given truck on a given day.
# duplicated(keep='first') is False for the first row of each (EquipmentID, EventDate)
# pair and True for every later row of that pair.
repeat_same_day = full_derates.duplicated(['EquipmentID', 'EventDate'], keep='first')
full_derates['first?'] = repeat_same_day.map({False: 'First', True: 'Subsequent'})

In [None]:
# Get the number of faults and the number of trucks with faults for each month in the dataset.
# Both series are stacked into one long frame with a 'Category' label.
equipment_by_month = on_faults.groupby('MonthYear')['EquipmentID']

# Distinct trucks that logged at least one fault in the month
df1 = (
    equipment_by_month
    .nunique()
    .rename('total')
    .reset_index()
    .assign(Category='Trucks')
)

# Number of fault rows in the month (rows with a non-null EquipmentID)
df2 = (
    equipment_by_month
    .count()
    .rename('total')
    .reset_index()
    .assign(Category='Faults')
)

totals = pd.concat([df1, df2])

totals

In [None]:
# Plot line graph of the number of faults and the number of trucks with faults over time
# (one line per Category).
df = totals

axis_labels = {'MonthYear': 'Time Period', 'total': 'Total Number'}
line_colors = {'Trucks': '#ff9900', 'Faults': '#00e6ff'}

fig = px.line(
    df,
    x='MonthYear',
    y='total',
    color='Category',
    markers=True,
    labels=axis_labels,
    color_discrete_map=line_colors,
    template="plotly_white",
)

# Axis titles are blanked; the figure title carries the description.
fig.update_layout(
    title_text="Total Faults and Total Trucks with Faults by Month",
    xaxis_title='',
    yaxis_title='',
)

fig.show()

In [None]:
#fig.write_html("../data/total_faults.html")

In [None]:
# Create a dataframe with the total number of derates and trucks with derates for each
# month in the dataset (derates are the rows with spn 5246).
derate_equipment_by_month = (
    on_faults
    .loc[on_faults['spn'].eq(5246)]
    .groupby('MonthYear')['EquipmentID']
)

# Distinct trucks with at least one derate in the month
td_df1 = (
    derate_equipment_by_month
    .nunique()
    .rename('total')
    .reset_index()
    .assign(Category='Trucks')
)

# Number of derate rows in the month
td_df2 = (
    derate_equipment_by_month
    .count()
    .rename('total')
    .reset_index()
    .assign(Category='Derates')
)

td_totals = pd.concat([td_df1, td_df2])

td_totals

In [None]:
# Line graph of monthly derates and monthly trucks with derates (one line per Category)
df = td_totals

axis_labels = {'MonthYear': 'Time Period', 'total': 'Total'}
line_colors = {'Trucks': '#ff9900', 'Derates': '#00ff99'}

fig = px.line(
    df,
    x='MonthYear',
    y='total',
    color='Category',
    markers=True,
    labels=axis_labels,
    color_discrete_map=line_colors,
    template="plotly_white",
)

# Axis titles are blanked; the figure title carries the description.
fig.update_layout(
    title_text="Total Derates and Total Trucks with Derates by Month",
    xaxis_title='',
    yaxis_title='',
)

fig.show()

In [None]:
#fig.write_html("../data/total_derates.html")

In [None]:
# Fault volume per truck, split by derate status. All trucks are included here
# (a filter excluding EquipmentID 1692 was previously tried and left disabled).
comparisons = (
    on_faults
    .groupby('derate')
    .agg(
        total_faults=('RecordID', 'count'),
        total_trucks=('EquipmentID', 'nunique'),
    )
    .reset_index()
)

# Average number of fault rows per distinct truck, rounded to a whole number
comparisons['faults_per_truck'] = (
    comparisons['total_faults']
    .div(comparisons['total_trucks'])
    .round(0)
)

comparisons

In [None]:
# Bar chart of faults per truck, one bar per derate status
df = comparisons

bar_labels = {'faults_per_truck': 'Faults per Truck', 'derate': 'Derate Status'}
bar_colors = {'No Derate': '#9900ff', 'Derate': '#00ff99'}

fig = px.bar(
    df,
    x='derate',
    y='faults_per_truck',
    color='derate',
    labels=bar_labels,
    color_discrete_map=bar_colors,
    template="plotly_white",
    text_auto=True,
)

# The x axis is hidden; the color legend identifies the bars instead.
fig.update_layout(
    title_text="Total Faults per Truck by Derate Status",
    xaxis={'visible': False, 'showticklabels': False},
)

fig.show()

In [None]:
#fig.write_image("../images/faults_per_truck.jpeg")

In [None]:
# Number of derate rows that were the first vs. a subsequent derate for the same
# truck on the same day
firsts = (
    full_derates
    .groupby('first?')['RecordID']
    .count()
    .rename('totals')
    .reset_index()
)

firsts

In [None]:
# Bar chart of first vs. subsequent same-day derates
df = firsts

bar_labels = {'totals': 'Total Derates', 'first?': 'Derate Sequence'}
bar_colors = {'First': '#00ff59', 'Subsequent': '#00ffd9'}

fig = px.bar(
    df,
    x='first?',
    y='totals',
    color='first?',
    labels=bar_labels,
    color_discrete_map=bar_colors,
    template="plotly_white",
    text_auto=True,
)

# NOTE(review): the "Nearly 1/4" in the title is hard-coded text, not computed from
# `firsts` — re-check it whenever the underlying data changes.
# The x axis is hidden; the color legend identifies the bars instead.
fig.update_layout(
    title_text="Nearly 1/4 of All Derate Faults Occur In Sequence (Same Truck, Same Day)",
    xaxis={'visible': False, 'showticklabels': False},
)

fig.show()

In [None]:
#fig.write_image("../data/derate_sequence.jpeg")

In [None]:
# Re-display the per-truck summary (one row per EquipmentID / derate pair) that feeds
# the histogram comparison in the next cell
truck_overview

In [None]:
# Comparing distribution of total faults for trucks that did have a derate as opposed
# to those that did not. Each histogram is normalized to percent within its own group.
df = truck_overview.loc[truck_overview['derate'].eq('Derate')]
df1 = truck_overview.loc[truck_overview['derate'].eq('No Derate')]

# (per-truck rows, name used in legend and hover labels, bar color)
histogram_specs = [
    (df, 'Derate', '#00ff99'),
    (df1, 'No Derate', '#9900ff'),
]

fig = go.Figure()
for trucks, legend_name, bar_color in histogram_specs:
    fig.add_trace(
        go.Histogram(
            x=trucks['total_faults'],
            histnorm='percent',
            name=legend_name,
            marker_color=bar_color,
            opacity=0.75,
        )
    )

fig.update_layout(
    template='plotly_white',
    title_text='Distribution of Trucks Experiencing a Given Number of Faults, Derate Status Comparison',
    xaxis_title_text='Total Faults',  # xaxis label
    bargap=0.2,  # gap between bars of adjacent location coordinates
    bargroupgap=0.05,  # gap between bars of the same location coordinates
)

# Fixed viewing window: up to 10,000 faults on x and 0-70% on y
fig.update_xaxes(range=[-10, 10000])
fig.update_yaxes(ticksuffix="%", range=[0, 70])

fig.show()

In [None]:
#fig.write_html("../images/fault_distribution_comparison.html")

In [None]:
# Express timeUntilDerate as a decimal number of days (86,400 seconds per day),
# rounded to 2 places. Uses the vectorized .dt accessor instead of a Python-level loop
# over every row; rows with no upcoming derate (missing timedelta) stay NaN.
final_df['timeUntilDerate_dec'] = (
    final_df['timeUntilDerate']
    .dt.total_seconds()
    .div(86400)
    .round(2)
)

In [None]:
# Faults on derate trucks logged strictly between 0 and 24 hours after the truck's
# most recent derate (both bounds exclusive, so the derate row itself is left out)
lower_bound = pd.Timedelta('00:00:00')
upper_bound = pd.Timedelta('24:00:00')

shortly_after_derate = (
    final_df['derate'].eq('Derate')
    & (final_df['timeSinceDerate'] < upper_bound)
    & (final_df['timeSinceDerate'] > lower_bound)
)

final_df.loc[shortly_after_derate]

In [None]:
# Histogram of how many hours before the next derate each fault occurred, limited to
# the final 48 hours, for trucks that had a derate.
# NOTE(review): EquipmentID 1692 is excluded; the reason is not recorded here — confirm.
df = final_df.loc[
    final_df['derate'].eq('Derate')
    & final_df['EquipmentID'].ne(1692)
]

# timeUntilDerate_dec is in days; convert to hours for the x axis
hours_until_derate = df['timeUntilDerate_dec'] * 24

fig = px.histogram(
    df,
    x=hours_until_derate,
    histnorm='percent',
    labels={'x': 'Hours Before Derate'},
    color_discrete_sequence=['#00E6FF'],
    template="plotly_white",
)

# 4-hour bins covering 0-48 hours
fig.update_traces(xbins={'start': 0, 'end': 48, 'size': 4})

# Reword the hover text: show "Proportion of Faults" in place of "percent"
fig.for_each_trace(
    lambda t: t.update(
        hovertemplate=t.hovertemplate.replace("percent", "Proportion of Faults")
    )
)

fig.update_xaxes(range=[-1, 48])
fig.update_yaxes(ticksuffix="%", range=[0, 20])

fig.update_layout(
    title_text="Distribution of Faults in the 2 Days Leading Up to Derates",
    yaxis_title="",
)

fig.show()

In [None]:
# Histogram of how many days before the next derate each fault occurred, in weekly
# bins, for trucks that had a derate.
# NOTE(review): EquipmentID 1692 is excluded; the reason is not recorded here — confirm.
df = final_df.loc[
    final_df['derate'].eq('Derate')
    & final_df['EquipmentID'].ne(1692)
]

fig = px.histogram(
    df,
    x=df['timeUntilDerate_dec'],
    histnorm='percent',
    labels={'timeUntilDerate_dec': 'Days Before Derate'},
    color_discrete_sequence=['#00ff99'],
    template="plotly_white",
)

# 7-day bins covering 0-525 days
fig.update_traces(xbins={'start': 0, 'end': 525, 'size': 7})

# Reword the hover text: show "Proportion of Faults" in place of "percent"
fig.for_each_trace(
    lambda t: t.update(
        hovertemplate=t.hovertemplate.replace("percent", "Proportion of Faults")
    )
)

fig.update_xaxes(range=[-1, 525])
fig.update_yaxes(ticksuffix="%", range=[0, 6])

fig.update_layout(
    title_text="Distribution of Faults With Respect to Time Ahead of Next Derate",
    yaxis_title="",
)

fig.show()

In [None]:
#fig.write_html("../images/derate_fault_distribution_pre.html")

In [None]:
# Rows that have a timeUntilDerate value, i.e. faults followed by a later derate
# (or the derate row itself) on the same truck
final_df.loc[final_df['timeUntilDerate'].notna()]