In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime as dt


In [2]:
# Read in J1939Faults. low_memory=False because of a warning about mixed-type columns.
faults = pd.read_csv('../data/J1939Faults.csv',
                     index_col='RecordID',
                     parse_dates=['EventTimeStamp', 'LocationTimeStamp'],
                     low_memory=False)

# (Latitude, Longitude) of the service locations whose faults are removed.
# The last digit was removed from each coordinate because the rows don't have it.
# NOTE(review): these are exact float comparisons, so a row only matches when its
# parsed value equals the literal exactly -- confirm no service-location rows are missed.
service_coords = [
    (36.066666, -86.434722),
    (35.588333, -86.443888),
    (36.1950, -83.174722),
]

# One positional boolean mask per service location.
service_masks = [
    ((faults["Latitude"] == lat) & (faults["Longitude"] == lon)).to_numpy()
    for lat, lon in service_coords
]

# Keep the per-location frames and their concatenation available under the original names.
service_location1, service_location2, service_location3 = (
    faults[mask] for mask in service_masks
)
service_locations = pd.concat([service_location1, service_location2, service_location3])

# Drop the service-location rows by position rather than by index label, so a
# duplicated RecordID can never remove a row that is not at a service location.
at_service_location = np.logical_or.reduce(service_masks)
faults_cleaned = faults[~at_service_location]

# Remove EquipmentIDs that are longer than 5 characters, per the README.
# .str.len() yields NaN for a missing EquipmentID, so such rows are dropped
# instead of raising a TypeError as len() would.
faults_cleaned = faults_cleaned[faults_cleaned['EquipmentID'].str.len() <= 5]

# Remove columns that only contained null values or would not be valuable
# because they're unique to the truck. reset_index() turns RecordID back into a column.
faults_cleaned = faults_cleaned.drop(
    columns=['actionDescription', 'faultValue', 'ecuSerialNumber', 'ecuSource', 'MCTNumber']
).reset_index()


  mask |= (ar1 == a)


In [3]:
# Number of fault events per calendar year, most frequent year first.
(
    faults_cleaned
    .loc[:, 'EventTimeStamp']
    .dt.to_period('Y')
    .value_counts()
)

2016    332175
2015    325536
2017    254680
2018    143289
2019    111321
2020     17179
2011       227
2000       219
2010        47
2009        24
2026         3
2002         1
Freq: A-DEC, Name: EventTimeStamp, dtype: int64

### Based on the distribution of years I think it would be best to limit the dataset to years 2015-2020 

#### 2015-2020 are consecutive, whereas before 2015 the data skips 2014, 2013, and 2012 and picks back up at 2011. It's possible that some trucks from 2014 carry over into 2015, but it's unlikely that any from 2011 carry over, since they limit their leases to 4 years.

In [11]:
# Limit to events from calendar years 2015-2020 (both inclusive).
# Filtering on the year keeps every event on 31 Dec 2020; comparing the timestamp
# with <= '2020-12-31' cuts off at midnight at the start of that day and would
# drop any event recorded later on it.
faults_cleaned = faults_cleaned.loc[faults_cleaned['EventTimeStamp'].dt.year.between(2015, 2020)]

# In the full dataset
### How many trucks have a full derate/partial derate? 

#### 210 have full derates 

#### 498 have partial derates

#### 182 have both. 

Interestingly, after filtering down to the years 2015-2020, the number of trucks with full derates went down from 211 to 210, while the number with partial derates remained unchanged.

In [12]:
# Unique EquipmentIDs with at least one fault whose SPN is 5246
# (the code this notebook counts as a full derate).
full = faults_cleaned.loc[faults_cleaned['spn'] == 5246, 'EquipmentID'].unique()

len(full)

210

In [13]:
# Unique EquipmentIDs with at least one fault whose SPN is 1569
# (the code this notebook counts as a partial derate).
partial = faults_cleaned.loc[faults_cleaned['spn'] == 1569, 'EquipmentID'].unique()

len(partial)

498

In [14]:
# EquipmentIDs that appear in both the full-derate and the partial-derate arrays.
intersection = np.intersect1d(full, partial)

# Number of trucks with both kinds of derate.
intersection.size

182

### Filter down to the trucks that have both

In [15]:
# Keep only the fault rows belonging to trucks listed in `intersection`.
both = faults_cleaned.loc[lambda frame: frame['EquipmentID'].isin(intersection)]

In [16]:
# Print the columns, non-null counts and dtypes of the filtered frame.
both.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357259 entries, 5 to 1184681
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   RecordID               357259 non-null  int64         
 1   ESS_Id                 357259 non-null  int64         
 2   EventTimeStamp         357259 non-null  datetime64[ns]
 3   eventDescription       346792 non-null  object        
 4   ecuSoftwareVersion     287748 non-null  object        
 5   ecuModel               347522 non-null  object        
 6   ecuMake                347522 non-null  object        
 7   spn                    357259 non-null  int64         
 8   fmi                    357259 non-null  int64         
 9   active                 357259 non-null  bool          
 10  activeTransitionCount  357259 non-null  int64         
 11  EquipmentID            357259 non-null  object        
 12  Latitude               357259 non-null  flo

In [17]:
# Number of fault events per calendar year for the trucks kept in `both`.
(
    both
    .loc[:, 'EventTimeStamp']
    .dt.to_period('Y')
    .value_counts()
)

2016    117106
2015    105387
2017     74788
2018     37820
2019     18624
2020      3534
Freq: A-DEC, Name: EventTimeStamp, dtype: int64

In [18]:
# Dimensions of the filtered frame as (rows, columns).
both.shape

(357259, 15)

In [20]:
# NOTE(review): same call as the earlier both.info() cell; no visible cell in
# between modifies `both`, so this output repeats that one.
both.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 357259 entries, 5 to 1184681
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   RecordID               357259 non-null  int64         
 1   ESS_Id                 357259 non-null  int64         
 2   EventTimeStamp         357259 non-null  datetime64[ns]
 3   eventDescription       346792 non-null  object        
 4   ecuSoftwareVersion     287748 non-null  object        
 5   ecuModel               347522 non-null  object        
 6   ecuMake                347522 non-null  object        
 7   spn                    357259 non-null  int64         
 8   fmi                    357259 non-null  int64         
 9   active                 357259 non-null  bool          
 10  activeTransitionCount  357259 non-null  int64         
 11  EquipmentID            357259 non-null  object        
 12  Latitude               357259 non-null  flo