In [None]:
import pandas as pd
import numpy as np

from geopy.distance import geodesic

### Read in the faults data

In [None]:
# Load the raw J1939 fault records and, in the same step, discard the two
# columns that contain only null values.
faults = (
    pd.read_csv('../data/J1939Faults.csv')
    .drop(columns=['actionDescription', 'faultValue'])
)

In [None]:
# Display the faults table as loaded above (no state is changed here)
faults

In [None]:
# Print the column dtypes and non-null counts of the faults table
faults.info()

In [None]:
# Parse the two raw timestamp columns into datetime64 values.
faults['EventTimeStamp'] = pd.to_datetime(faults['EventTimeStamp'])
faults['LocationTimeStamp'] = pd.to_datetime(faults['LocationTimeStamp'])

# Split each timestamp into a date (normalized to midnight, still datetime64)
# and a time-of-day (python datetime.time objects).
faults['EventDate'] = faults['EventTimeStamp'].dt.normalize()
faults['EventTime'] = faults['EventTimeStamp'].dt.time

faults['LocationDate'] = faults['LocationTimeStamp'].dt.normalize()
faults['LocationTime'] = faults['LocationTimeStamp'].dt.time

# Month/year bucket: the first day of the month in which the event occurred.
# Computed through a monthly Period rather than by formatting to a
# '%B-%Y' string and re-parsing it, which depends on the locale's month
# names and on pandas guessing the string format (and is much slower).
faults['MonthYear'] = faults['EventDate'].dt.to_period('M').dt.to_timestamp()

faults

In [None]:
# Inspect the fault records whose event date falls before 2015
faults[faults['EventDate'].lt('2015-01-01')]

In [None]:
# Inspect the fault records whose EquipmentID, rendered as text, is longer
# than 5 characters
faults[faults['EquipmentID'].astype(str).str.len().gt(5)]

### Read in the onboard diagnostic info

In [None]:
# Load the onboard diagnostic readings; the Id column is not meaningful for
# this analysis, so it is dropped straight away.
onboard = (
    pd.read_csv('../data/VehicleDiagnosticOnboardData.csv')
    .drop(columns=['Id'])
)

In [None]:
# Print the column dtypes and non-null counts of the onboard table
onboard.info()

In [None]:
# A few readings use ',' where a decimal point is intended; swap every comma
# for '.' (plain-text replacement, the values remain strings).
onboard['Value'] = onboard['Value'].str.replace(',', '.', regex=False)

onboard

In [None]:
# Reshape from long to wide: one row per FaultId, one column per reading Name,
# with FaultId restored as an ordinary column afterwards.
onboard = (
    onboard
    .pivot(index=['FaultId'], columns=['Name'], values='Value')
    .reset_index()
)

onboard.info()

### Pull together the faults and onboard data tables

In [None]:
# Inner-join each fault record to its pivoted onboard readings
# (faults.RecordID == onboard.FaultId), then drop the redundant join key.
on_faults = faults.merge(
    onboard,
    how='inner',
    left_on='RecordID',
    right_on='FaultId',
).drop(columns=['FaultId'])
on_faults

In [None]:
# What is the date range of this data?
# min/max is used instead of describe(datetime_is_numeric=True): that keyword
# only exists in pandas 1.1-1.5 and raises TypeError on pandas 2.x.
on_faults['EventDate'].agg(['min', 'max'])

# 3/18/2000 - 3/6/2020, but it looks like we have 3 rows that have a future date attached to them
# and the dates prior to 2015 were errors

In [None]:
# Keep only events dated from 2015-01-01 through 2020-03-06 (both inclusive),
# which removes the future-dated rows and the pre-2015 rows.
on_faults = on_faults[on_faults['EventDate'].between('2015-01-01', '2020-03-06')]

In [None]:
# Per the stakeholder: discard rows whose EquipmentID, rendered as text,
# is longer than 5 characters.
on_faults = on_faults[on_faults['EquipmentID'].astype(str).str.len().le(5)]

In [None]:
# Print the column dtypes and non-null counts of the filtered, merged table
on_faults.info()

In [None]:
# Drop rows where either coordinate is exactly 0 (6 rows), then renumber the
# index from 0.
on_faults = (
    on_faults[~(on_faults['Latitude'].eq(0) | on_faults['Longitude'].eq(0))]
    .reset_index(drop=True)
)

In [None]:
# Coordinates of the 3 service locations, one tuple per location.
# Each tuple is ordered (latitude, longitude), matching the order in which
# the fault coordinates are passed to geodesic() further down.
A_point = (36.0666667, -86.4347222)

B_point = (35.5883333, -86.4438888)

C_point = (36.1950, -83.174722)

In [None]:
# Display the merged and filtered table before adding the distance columns
on_faults

In [None]:
# Calculate the distance (in miles) between the location where each fault
# occurred and service location A.
# The coordinates are iterated positionally with zip, so the result does not
# depend on on_faults having a fresh 0..n-1 index and avoids two label
# lookups per row. A_point is passed as the (lat, lon) tuple itself rather
# than wrapped in a list.
on_faults['dist_A'] = [
    geodesic(A_point, (lat, lon)).mi
    for lat, lon in zip(on_faults['Latitude'], on_faults['Longitude'])
]

In [None]:
# Distance (in miles) from each fault location to service locations B and C.
# The coordinates are iterated positionally with zip, so the result does not
# depend on on_faults having a fresh 0..n-1 index and avoids two label
# lookups per row. Each service point is passed as the (lat, lon) tuple
# itself rather than wrapped in a list.
on_faults['dist_B'] = [
    geodesic(B_point, (lat, lon)).mi
    for lat, lon in zip(on_faults['Latitude'], on_faults['Longitude'])
]

on_faults['dist_C'] = [
    geodesic(C_point, (lat, lon)).mi
    for lat, lon in zip(on_faults['Latitude'], on_faults['Longitude'])
]

In [None]:
# How many faults occurred within 1/4 mile of at least one service location?
on_faults[on_faults[['dist_A', 'dist_B', 'dist_C']].le(0.25).any(axis=1)]

# 127,178 (about 10%)

In [None]:
# Keep only the faults that are more than 1/4 mile from every one of the three
# service locations, then renumber the index from 0.
on_faults = (
    on_faults[on_faults[['dist_A', 'dist_B', 'dist_C']].gt(0.25).all(axis=1)]
    .reset_index(drop=True)
)

on_faults

In [None]:
# Check for repeated RecordID values: show every row whose RecordID has
# already appeared earlier in the table.
on_faults[on_faults['RecordID'].duplicated(keep='first')]

# There are none

In [None]:
# Number of remaining rows whose `active` flag equals True
int(on_faults['active'].eq(True).sum())

In [None]:
import pickle
# Save of the cleaned table to ../data/on_faults.pkl, currently disabled;
# uncomment the line below to write the file.
# NOTE(review): DataFrame.to_pickle is a pandas method, so the `pickle` import
# above looks unnecessary for this line - confirm before removing it.
#on_faults.to_pickle('../data/on_faults.pkl')