In [1]:

import pandas as pd 
import numpy as np 
import gc
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def trim_categories(df, cols_list):
    """Strip leading/trailing whitespace from the category labels of categorical columns.

    The listed columns are replaced on ``df`` itself; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to clean.
    cols_list : iterable of str
        Names of columns that have a categorical dtype with string categories.
    """
    for col in cols_list:
        stripped = df[col].cat.categories.str.strip()
        if stripped.is_unique:
            # Pure relabel: codes, category order and unused categories are kept.
            # The result is assigned back because `inplace=True` is no longer
            # accepted by rename_categories in current pandas.
            df[col] = df[col].cat.rename_categories(stripped)
        else:
            # Labels that differ only by padding (e.g. 'A' and 'A ') collapse to
            # the same value, which rename_categories rejects as duplicates.
            # Rebuild the categorical from the stripped values instead (this
            # yields a fresh, unordered categorical of the observed values).
            df[col] = df[col].astype(object).str.strip().astype('category')

In [3]:
# Columns read from the tank-level extract and the dtype requested for each one.
levels_keep_cols = ['TankId', 'TankLevelPercent', 'TankLevelGallons', 'ExactFuelEventId']
levels_col_types = {
    'TankId': 'category',
    'ExactFuelEventId': 'uint32',
    'TankLevelPercent': 'float32',
    'TankLevelGallons': 'float32',
}

# The skipped file rows were identified by hand as having TankId == '2', so they
# are dropped while reading. TankId is discarded afterwards and the frame is
# keyed by ExactFuelEventId.
levels_fueltank_df = (
    pd.read_csv(
        'data/Extranet2.ExactFuelTankLevels.csv',
        usecols=levels_keep_cols,
        dtype=levels_col_types,
        skiprows=[48671, 1824800, 4579415, 5656381],
    )
    .set_index('ExactFuelEventId')
    .drop('TankId', axis='columns')
)

In [4]:
# Tank capacity per truck: indexed by TRUCK, with the remaining column name
# lower-cased (TANKCAP -> tankcap). The index name itself is left as 'TRUCK'.
optimization_fuel_df = pd.read_csv(
    'data/BGEIDSC.EF2EFFP.csv',
    usecols=['TRUCK', 'TANKCAP'],
    dtype={'TRUCK': 'uint32', 'TANKCAP': 'uint16'},
    index_col='TRUCK',
).rename(columns=str.lower)

In [5]:
#trim_categories(optimization_fuel_df, ['truck'])
#optimization_fuel_df.set_index('truck', inplace=True)

In [6]:
# Summarise the index, column dtypes, non-null counts and memory use of the frame.
optimization_fuel_df.info()

<class 'pandas.core.frame.DataFrame'>
UInt64Index: 836 entries, 1 to 2176
Data columns (total 1 columns):
tankcap    836 non-null uint16
dtypes: uint16(1)
memory usage: 8.2 KB


In [7]:
# Preview the first five rows (index TRUCK, column tankcap).
optimization_fuel_df.head(n=5)

Unnamed: 0_level_0,tankcap
TRUCK,Unnamed: 1_level_1
1,200
243,200
244,200
245,200
246,200


In [8]:
# Load the units file and give its three columns working names (assigned by position).
units_df = pd.read_csv('data/IBGEFILE.UNITS.csv',
                       dtype={'UNMAKE': 'category'})
units_df.columns = ['unit_number', 'model_year', 'make']

# Remove invalid entries.
# Non-numeric unit numbers are coerced to NaN so that dropna() below discards
# them (an earlier, manual version of this filter listed 'ZEBE', 'CCTRK',
# 'BGXL' and '1'). Note that dropna() also removes rows with a missing value
# in any other column.
units_df['unit_number'] = pd.to_numeric(units_df['unit_number'], errors='coerce')
units_df = units_df[units_df['unit_number'] != 1].dropna().set_index('unit_number')
# NOTE(review): the index stays floating point (e.g. 535.0) because of the NaN
# coercion above - cast it if integer unit numbers are needed downstream.

# Trim white space from category strings
trim_categories(units_df, ['make'])

# Fix typos.
# Use a single .loc assignment: the chained form `units_df.make[mask] = ...`
# may write to a temporary object and leave units_df unchanged.
if 'KENWORTH' not in units_df['make'].cat.categories:
    # Assigning a label that is not yet a category would raise.
    units_df['make'] = units_df['make'].cat.add_categories(['KENWORTH'])
units_df.loc[units_df['make'].isin(['KENOWRTH', 'KEWORTH']), 'make'] = 'KENWORTH'

In [9]:
# Row/column count after the cleaning above (author's note: 5 rows deleted).
units_df.shape

(704, 2)

In [10]:
# Preview the first five cleaned unit records (indexed by unit_number).
units_df.head(n=5)

Unnamed: 0_level_0,model_year,make
unit_number,Unnamed: 1_level_1,Unnamed: 2_level_1
535.0,1996,PETERBILT
534.0,1996,PETERBILT
533.0,1996,PETERBILT
532.0,1996,PETERBILT
531.0,1996,PETERBILT


In [43]:
# Working column names for the transaction extract. They replace the file's own
# header row (header=0 together with names=...). Each of the six item slots
# contributes a code column followed by a quantity column.
transaction_df_cols = (
    ['tr_id', 'tr_date', 'tr_time', 'tr_station_id', 'tr_station_name',
     'tr_station_cty', 'tr_station_st', 'tr_drvr_code', 'tr_unit_id']
    + [prefix + str(slot)
       for slot in range(1, 7)
       for prefix in ('tr_item_cd', 'tr_item_qty')]
)

# dtype requested per column; tr_station_id is not listed and is left to inference.
tr_dtypes = dict(
    [('tr_id', 'uint32'),
     ('tr_date', 'uint32'),
     ('tr_time', 'uint32'),
     ('tr_station_name', 'object'),
     ('tr_station_cty', 'object'),
     ('tr_station_st', 'object'),
     ('tr_drvr_code', 'category'),
     ('tr_unit_id', 'category')]
    + [(prefix + str(slot), item_dtype)
       for slot in range(1, 7)
       for prefix, item_dtype in (('tr_item_cd', 'category'), ('tr_item_qty', 'float32'))]
)

act_fuel_transactions = ['ULSD', 'FUEL', 'CDSL', 'DSL1', 'BDSL']

transaction_df = pd.read_csv(
    'data/BGETCHDATA.PTCHTRANH.csv',
    header=0,
    names=transaction_df_cols,
    dtype=tr_dtypes,
)

# Trim white space from the labels of every categorical column.
trim_categories(
    transaction_df,
    ['tr_drvr_code', 'tr_unit_id'] + ['tr_item_cd' + str(slot) for slot in range(1, 7)],
)

In [44]:
# Preview the first five transactions.
transaction_df.head(n=5)

Unnamed: 0,tr_id,tr_date,tr_time,tr_station_id,tr_station_name,tr_station_cty,tr_station_st,tr_drvr_code,tr_unit_id,tr_item_cd1,...,tr_item_cd2,tr_item_qty2,tr_item_cd3,tr_item_qty3,tr_item_cd4,tr_item_qty4,tr_item_cd5,tr_item_qty5,tr_item_cd6,tr_item_qty6
0,42574765,20170101,20,520277,FLYING J VANDALIA 97,VANDALIA,OH,ULAL,1564,SCLE,...,,0.0,,0.0,,0.0,,0.0,,0.0
1,42575331,20170101,48,516202,PILOT NEW BRAUNFELS 330,NEW BRAUNFELS,TX,MANRI,1584,ULSD,...,DEFD,7.71,CADV,1.0,,0.0,,0.0,,0.0
2,42578678,20170101,134,516277,PILOT 421,DALTON,GA,RUSR,1713,ULSD,...,DEFD,4.12,,0.0,,0.0,,0.0,,0.0
3,42578722,20170101,137,231100,FJ-BRUNSWICK 627,BRUNSWICK,GA,FOSJA,1711,ULSD,...,,0.0,,0.0,,0.0,,0.0,,0.0
4,42579695,20170101,302,523050,BIG G EXPRESS TERMINAL,GLADEVILLE,TN,HICWIL,1837,ULSD,...,,0.0,,0.0,,0.0,,0.0,,0.0


In [46]:
# Combine tr_date and tr_time into a single datetime column, tr_datetime.

# Confirm there is no junk data in tr_time. These checks inspect the column's
# VALUES; `x in series` would only test the index labels.
assert transaction_df['tr_time'].notnull().all(), 'tr_time contains missing values'

# Work on string versions of both columns.
tr_date_text = transaction_df['tr_date'].astype(str)
tr_time_text = transaction_df['tr_time'].astype(str)

# Check for blanks and ensure no string is over 4 digits in length.
assert (tr_time_text != '').all(), 'tr_time contains blank entries'
assert (tr_time_text.str.len() < 5).all(), 'tr_time has entries longer than 4 digits'

# Zero-pad on the left to four characters so the value is interpreted as
# military time, e.g. '20' -> '0020' -> 00:20.
tr_time_text = tr_time_text.str.zfill(4)

# Parse date + time in one vectorised call, attach the result and drop the two
# source columns. A new frame is built, so nothing is mutated half-way if
# parsing fails.
transaction_df = (
    transaction_df
    .assign(tr_datetime=pd.to_datetime(tr_date_text + tr_time_text, format='%Y%m%d%H%M'))
    .drop(['tr_date', 'tr_time'], axis='columns')
)
transaction_df.head()

Unnamed: 0,tr_id,tr_station_id,tr_station_name,tr_station_cty,tr_station_st,tr_drvr_code,tr_unit_id,tr_item_cd1,tr_item_qty1,tr_item_cd2,tr_item_qty2,tr_item_cd3,tr_item_qty3,tr_item_cd4,tr_item_qty4,tr_item_cd5,tr_item_qty5,tr_item_cd6,tr_item_qty6,tr_datetime
0,42574765,520277,FLYING J VANDALIA 97,VANDALIA,OH,ULAL,1564,SCLE,1.0,,0.0,,0.0,,0.0,,0.0,,0.0,2017-01-01 00:20:00
1,42575331,516202,PILOT NEW BRAUNFELS 330,NEW BRAUNFELS,TX,MANRI,1584,ULSD,137.399994,DEFD,7.71,CADV,1.0,,0.0,,0.0,,0.0,2017-01-01 00:48:00
2,42578678,516277,PILOT 421,DALTON,GA,RUSR,1713,ULSD,137.929993,DEFD,4.12,,0.0,,0.0,,0.0,,0.0,2017-01-01 01:34:00
3,42578722,231100,FJ-BRUNSWICK 627,BRUNSWICK,GA,FOSJA,1711,ULSD,45.0,,0.0,,0.0,,0.0,,0.0,,0.0,2017-01-01 01:37:00
4,42579695,523050,BIG G EXPRESS TERMINAL,GLADEVILLE,TN,HICWIL,1837,ULSD,107.360001,,0.0,,0.0,,0.0,,0.0,,0.0,2017-01-01 03:02:00


In [47]:
# Row/column count of the transaction frame.
transaction_df.shape

(147861, 20)

In [48]:
# Inspect the dtype of every transaction column.
transaction_df.dtypes


tr_id                      uint32
tr_station_id               int64
tr_station_name            object
tr_station_cty             object
tr_station_st              object
tr_drvr_code             category
tr_unit_id               category
tr_item_cd1              category
tr_item_qty1              float32
tr_item_cd2              category
tr_item_qty2              float32
tr_item_cd3              category
tr_item_qty3              float32
tr_item_cd4              category
tr_item_qty4              float32
tr_item_cd5              category
tr_item_qty5              float32
tr_item_cd6              category
tr_item_qty6              float32
tr_datetime        datetime64[ns]
dtype: object

In [15]:
# converting date & time columns to datetime objs 
#transaction_df['tr_date'] = pd.to_datetime(transaction_df['tr_date'],format = '%Y%m%d')
#transaction_df['tr_time'] = pd.to_datetime(transaction_df['tr_time'])

In [55]:
# List the distinct unit ids that occur in the transactions.
transaction_df.tr_unit_id.unique()

[1564, 1584, 1713, 1711, 1837, ..., 2149, 2155, 2148, 2151, 2154]
Length: 655
Categories (655, object): [1564, 1584, 1713, 1711, ..., 2155, 2148, 2151, 2154]

In [53]:
# Transactions booked against 'OFFICE' or a blank unit id. These are the rows
# that get dropped (author's count: 85 rows).
transaction_df.loc[transaction_df['tr_unit_id'].isin(['OFFICE', ''])]

Unnamed: 0,tr_id,tr_station_id,tr_station_name,tr_station_cty,tr_station_st,tr_drvr_code,tr_unit_id,tr_item_cd1,tr_item_qty1,tr_item_cd2,tr_item_qty2,tr_item_cd3,tr_item_qty3,tr_item_cd4,tr_item_qty4,tr_item_cd5,tr_item_qty5,tr_item_cd6,tr_item_qty6,tr_datetime
1174,42977478,523050,BIG G EXPRESS TERMINAL,GLADEVILLE,TN,SPED,OFFICE,ULSD,50.000000,,0.0,,0.0,,0.0,,0.0,,0.0,2017-01-05 07:40:00
1383,43037746,523050,BIG G EXPRESS TERMINAL,GLADEVILLE,TN,SPED,OFFICE,ULSD,130.570007,,0.0,,0.0,,0.0,,0.0,,0.0,2017-01-05 14:57:00
6812,44866479,523050,BIG G EXPRESS TERMINAL,GLADEVILLE,TN,HILB,,ULSD,60.790001,,0.0,,0.0,,0.0,,0.0,,0.0,2017-01-21 06:08:00
6955,44908231,523050,BIG G EXPRESS TERMINAL,GLADEVILLE,TN,HILB,,ULSD,30.440001,,0.0,,0.0,,0.0,,0.0,,0.0,2017-01-21 14:33:00
13493,47176337,523050,BIG G EXPRESS TERMINAL,GLADEVILLE,TN,SMIFR,,ULSD,96.269997,,0.0,,0.0,,0.0,,0.0,,0.0,2017-02-09 09:11:00
15203,47747773,523050,BIG G EXPRESS TERMINAL,GLADEVILLE,TN,SMIFR,,ULSD,62.840000,,0.0,,0.0,,0.0,,0.0,,0.0,2017-02-14 10:46:00
16503,48174487,523050,BIG G EXPRESS TERMINAL,GLADEVILLE,TN,SMIFR,,ULSD,82.480003,,0.0,,0.0,,0.0,,0.0,,0.0,2017-02-17 11:53:00
16583,48198811,523050,BIG G EXPRESS TERMINAL,GLADEVILLE,TN,SPED,OFFICE,ULSD,50.000000,,0.0,,0.0,,0.0,,0.0,,0.0,2017-02-17 15:01:00
17310,48478526,523050,BIG G EXPRESS TERMINAL,GLADEVILLE,TN,1974,,ULSD,10.060000,,0.0,,0.0,,0.0,,0.0,,0.0,2017-02-20 12:12:00
18077,48727766,523050,BIG G EXPRESS TERMINAL,GLADEVILLE,TN,SMIFR,,ULSD,88.010002,,0.0,,0.0,,0.0,,0.0,,0.0,2017-02-22 09:42:00


In [17]:
# All transactions recorded for driver code 'DYET'.
transaction_df.loc[transaction_df.tr_drvr_code == 'DYET']

Unnamed: 0,tr_id,tr_date,tr_time,tr_station_id,tr_station_name,tr_station_cty,tr_station_st,tr_drvr_code,tr_unit_id,tr_item_cd1,...,tr_item_cd2,tr_item_qty2,tr_item_cd3,tr_item_qty3,tr_item_cd4,tr_item_qty4,tr_item_cd5,tr_item_qty5,tr_item_cd6,tr_item_qty6
108,42611972,2017-01-01,1628,516152,PILOT 270,KNOXVILLE,TN,DYET,1918,ULSD,...,CADV,1.00,,0.0,,0.0,,0.0,,0.0
336,42694694,2017-01-02,2141,516271,PILOT WHITE PINE 412,WHITE PINE,TN,DYET,1918,SCLE,...,,0.00,,0.0,,0.0,,0.0,,0.0
821,42865095,2017-01-04,1052,516255,PILOT WAYNESVILLE 393,WAYNESVILLE,NC,DYET,1918,ULSD,...,DEFD,10.36,,0.0,,0.0,,0.0,,0.0
1296,43013074,2017-01-05,1209,501585,LONDON AUTO TRUCK,LONDON,KY,DYET,1918,SCLE,...,,0.00,,0.0,,0.0,,0.0,,0.0
1598,43124081,2017-01-06,841,524132,PFJ SOUTHEAST 6955,HAW RIVER,NC,DYET,1918,ULSD,...,,0.00,,0.0,,0.0,,0.0,,0.0
2028,43260218,2017-01-07,1142,519109,TA MOCKSVILLE #6251,MOCKSVILLE,NC,DYET,1918,SCLE,...,,0.00,,0.0,,0.0,,0.0,,0.0
2062,43270068,2017-01-07,1322,522919,PILOT MARION 1063,MARION,NC,DYET,1918,ULSD,...,DEFD,0.01,,0.0,,0.0,,0.0,,0.0
2128,43296008,2017-01-07,1835,516152,PILOT 270,KNOXVILLE,TN,DYET,1918,ULSD,...,,0.00,,0.0,,0.0,,0.0,,0.0
2399,43385986,2017-01-08,2133,516152,PILOT 270,KNOXVILLE,TN,DYET,1918,ULSD,...,CADV,1.00,,0.0,,0.0,,0.0,,0.0
2922,43566210,2017-01-10,1131,514632,PETRO CARLISLE,CARLISLE,PA,DYET,1918,SCLE,...,,0.00,,0.0,,0.0,,0.0,,0.0


In [18]:
# Drop transactions whose unit id is 'OFFICE' or blank.
# The ids are compared after stripping whitespace: trim_categories has already
# reduced blank ids to '', so a padded literal such as '      ' never matches.
# Stripping here catches blanks whether or not they are still padded.
transaction_df = transaction_df[
    ~transaction_df['tr_unit_id'].astype(str).str.strip().isin(['OFFICE', ''])
].copy()

In [19]:
# Preview the first five transactions after the unit-id filter.
transaction_df.head(n=5)

Unnamed: 0,tr_id,tr_date,tr_time,tr_station_id,tr_station_name,tr_station_cty,tr_station_st,tr_drvr_code,tr_unit_id,tr_item_cd1,...,tr_item_cd2,tr_item_qty2,tr_item_cd3,tr_item_qty3,tr_item_cd4,tr_item_qty4,tr_item_cd5,tr_item_qty5,tr_item_cd6,tr_item_qty6
0,42574765,2017-01-01,20,520277,FLYING J VANDALIA 97,VANDALIA,OH,ULAL,1564,SCLE,...,,0.0,,0.0,,0.0,,0.0,,0.0
1,42575331,2017-01-01,48,516202,PILOT NEW BRAUNFELS 330,NEW BRAUNFELS,TX,MANRI,1584,ULSD,...,DEFD,7.71,CADV,1.0,,0.0,,0.0,,0.0
2,42578678,2017-01-01,134,516277,PILOT 421,DALTON,GA,RUSR,1713,ULSD,...,DEFD,4.12,,0.0,,0.0,,0.0,,0.0
3,42578722,2017-01-01,137,231100,FJ-BRUNSWICK 627,BRUNSWICK,GA,FOSJA,1711,ULSD,...,,0.0,,0.0,,0.0,,0.0,,0.0
4,42579695,2017-01-01,302,523050,BIG G EXPRESS TERMINAL,GLADEVILLE,TN,HICWIL,1837,ULSD,...,,0.0,,0.0,,0.0,,0.0,,0.0


In [20]:
# Columns read from the ExactFuel events extract, and the dtype requested for
# the non-timestamp ones (the two timestamp columns go through parse_dates).
events_keep_cols = [
    'Id', 'EventTimeStamp', 'EquipmentID', 'DriverID',
    'Latitude', 'Longitude', 'LocationTimeStamp',
    'Speed', 'Odometer', 'IgnitionStatus', 'EFReportReason',
]
events_col_types = {
    'DriverID': 'category',
    'EFReportReason': 'category',
    'EquipmentID': 'uint32',
    'Id': 'uint32',
    'IgnitionStatus': 'category',
    'Latitude': 'float64',
    'Longitude': 'float64',
    'Odometer': 'float64',
    'Speed': 'uint16',
}

# Read the events keyed by Id, inner-join the tank levels on that index, then
# left-join the tank capacity by matching EquipmentID to the TRUCK index.
events_levels_df = (
    pd.read_csv(
        'data/Extranet2.ExactFuelEvents.csv',
        index_col='Id',
        usecols=events_keep_cols,
        dtype=events_col_types,
        parse_dates=['EventTimeStamp', 'LocationTimeStamp'],
        infer_datetime_format=True,
    )
    .merge(levels_fueltank_df, how='inner', left_index=True, right_index=True, copy=False)
    .merge(optimization_fuel_df, how='left', left_on='EquipmentID', right_index=True, copy=False)
)

# Strip padding from the labels of the categorical columns.
trim_categories(events_levels_df, ['DriverID', 'EFReportReason', 'IgnitionStatus'])

  mask |= (ar1 == a)


In [21]:
# Replace the index with a uint64 version of itself, then force a garbage-collection pass.
# NOTE(review): the original note says this saves ~220 MB of memory; presumably
# swapping the index object releases state cached on the old one - TODO confirm.
levels_fueltank_df.index = levels_fueltank_df.index.astype('uint64')
gc.collect()

3232

In [22]:
# Row/column count of the merged events + tank-level frame.
events_levels_df.shape

(7908406, 13)

In [23]:
# Load the QC performance extracts with the four timestamp columns parsed as
# datetimes. The EquipmentID < 9999 truck-id filter is not applied in this cell.
performance_df = pd.read_csv(
    'data/Extranet2.QCPerformanceExtracts.csv',
    infer_datetime_format=True,
    parse_dates=['EventTimeStamp', 'LocationTimeStamp',
                 'DataStartTimeStamp', 'DataEndTimeStamp'],
)

In [24]:
# Preview the first five performance extract rows.
performance_df.head(n=5)

Unnamed: 0,Id,ESS_Id,EventTimeStamp,EquipmentID,MCTNumber,EquipmentType,DriverID,Driver2ID,Latitude,Longitude,...,ExcessSpeedTime,TotalFuelUsed,IdleFuelUsed,FaultFlag,RegisteredDriver,CruiseControlTime,TopGearTime,GearDataSource,SpeedUnits,SpeedMatrix
0,76818,13641356,2017-01-01 07:29:49,1995,105356577,tractor,LYNS,,36.066249,-86.434814,...,0,0.3,0.3,0,0,0,0,3,MPH,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
1,76819,13641357,2017-01-01 07:29:50,1995,105356577,tractor,LYNS,,36.066249,-86.434814,...,0,239.7,21.2,0,1,49,1399,3,MPH,"301,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
2,76820,13641432,2017-01-01 07:49:44,105420005,105420005,tractor,LOTG,,36.195138,-83.174583,...,0,246.1,3.2,0,1,1014,1362,3,MPH,"62,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
3,76821,13643644,2017-01-01 13:58:17,1972,105363528,tractor,DINE,,36.194444,-83.175185,...,0,0.2,0.1,0,0,0,0,3,MPH,"110,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
4,76822,13643645,2017-01-01 13:58:18,1972,105363528,tractor,DINE,,36.194444,-83.175185,...,0,202.1,1.5,1,1,12,1298,3,MPH,"112,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."


In [25]:
# Summarise the index, column dtypes and memory use of the merged frame.
events_levels_df.info()

<class 'pandas.core.frame.DataFrame'>
UInt64Index: 7908406 entries, 12649083 to 21150506
Data columns (total 13 columns):
EventTimeStamp       datetime64[ns]
EquipmentID          uint32
DriverID             category
Latitude             float64
Longitude            float64
LocationTimeStamp    datetime64[ns]
Speed                uint16
Odometer             float64
IgnitionStatus       category
EFReportReason       category
TankLevelPercent     float32
TankLevelGallons     float32
tankcap              uint16
dtypes: category(3), datetime64[ns](2), float32(2), float64(3), uint16(2), uint32(1)
memory usage: 512.9 MB


In [26]:
# Count missing values per column.
events_levels_df.isnull().sum()

EventTimeStamp            0
EquipmentID               0
DriverID             109883
Latitude                  0
Longitude                 0
LocationTimeStamp         0
Speed                     0
Odometer                  0
IgnitionStatus            0
EFReportReason            0
TankLevelPercent          0
TankLevelGallons          0
tankcap                   0
dtype: int64

In [27]:
# Preview the first five merged event rows.
events_levels_df.head(n=5)

Unnamed: 0,EventTimeStamp,EquipmentID,DriverID,Latitude,Longitude,LocationTimeStamp,Speed,Odometer,IgnitionStatus,EFReportReason,TankLevelPercent,TankLevelGallons,tankcap
12649083,2017-01-01 00:00:32.387,1992,BATK,30.34642,-81.70724,2017-01-01 00:00:30,0,39895.5,2,1,50.700001,101.400002,200
12649084,2017-01-01 00:01:24.867,1712,TAYANT,36.93953,-84.09541,2017-01-01 00:01:27,0,360378.8,1,2,78.0,156.0,200
12649092,2017-01-01 00:04:15.053,1713,RUSR,35.58664,-84.52786,2017-01-01 00:04:17,67,377577.4,1,2,31.700001,63.400002,200
12649102,2017-01-01 00:04:10.260,1585,MCAD,35.1543,-90.14263,2017-01-01 00:04:10,0,513931.8,1,2,43.200001,108.0,250
12649103,2017-01-01 00:10:13.187,1564,ULAL,39.90268,-84.19254,2017-01-01 00:10:11,3,498691.0,1,2,50.299999,125.75,250


In [28]:
# All fuel events recorded for driver 'DYET'.
events_levels_df.loc[events_levels_df.DriverID == 'DYET']

Unnamed: 0,EventTimeStamp,EquipmentID,DriverID,Latitude,Longitude,LocationTimeStamp,Speed,Odometer,IgnitionStatus,EFReportReason,TankLevelPercent,TankLevelGallons,tankcap
12649149,2017-01-01 00:51:47.470,1918,DYET,36.06672,-86.43406,2017-01-01 00:51:46,0,90818.7,1,0,51.200001,102.400002,200
12649194,2017-01-01 01:06:47.660,1918,DYET,36.06672,-86.43408,2017-01-01 01:06:46,0,90818.7,1,2,51.200001,102.400002,200
12649213,2017-01-01 01:21:47.707,1918,DYET,36.06671,-86.43411,2017-01-01 01:21:46,0,90818.7,1,2,51.200001,102.400002,200
12649234,2017-01-01 01:36:47.847,1918,DYET,36.06671,-86.43411,2017-01-01 01:36:47,0,90818.7,1,2,51.200001,102.400002,200
12649245,2017-01-01 01:51:47.877,1918,DYET,36.06674,-86.43410,2017-01-01 01:51:47,0,90818.7,1,2,51.200001,102.400002,200
12651765,2017-01-01 02:36:58.003,1918,DYET,36.06671,-86.43410,2017-01-01 02:36:58,0,90818.7,1,2,51.200001,102.400002,200
12651766,2017-01-01 02:51:58.127,1918,DYET,36.06675,-86.43405,2017-01-01 02:51:54,0,90818.7,1,2,51.200001,102.400002,200
12651767,2017-01-01 03:06:58.160,1918,DYET,36.06679,-86.43406,2017-01-01 03:06:57,0,90818.7,1,2,51.200001,102.400002,200
12651768,2017-01-01 03:21:58.220,1918,DYET,36.06668,-86.43411,2017-01-01 03:21:58,0,90818.7,1,2,51.200001,102.400002,200
12651769,2017-01-01 03:36:58.330,1918,DYET,36.06671,-86.43411,2017-01-01 03:36:58,0,90818.7,1,2,51.200001,102.400002,200


In [29]:
# All performance extracts recorded for driver 'DYET'.
performance_df.loc[performance_df.DriverID == 'DYET']

Unnamed: 0,Id,ESS_Id,EventTimeStamp,EquipmentID,MCTNumber,EquipmentType,DriverID,Driver2ID,Latitude,Longitude,...,ExcessSpeedTime,TotalFuelUsed,IdleFuelUsed,FaultFlag,RegisteredDriver,CruiseControlTime,TopGearTime,GearDataSource,SpeedUnits,SpeedMatrix
318,77136,13660247,2017-01-03 04:25:40,1918,105381868,tractor,DYET,,36.194999,-83.174999,...,0,96.3,2.5,1,1,85,563,3,MPH,"1796,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
771,77589,13803686,2017-01-10 04:27:51,1918,105381868,tractor,DYET,,40.373703,-76.67662,...,0,437.1,5.2,0,1,707,2485,3,MPH,"3941,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
1582,78400,13942767,2017-01-17 04:19:42,1918,105381868,tractor,DYET,,37.932685,-79.233981,...,0,436.2,2.6,1,1,527,2484,3,MPH,"4117,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
2137,78955,14083105,2017-01-24 04:18:59,1918,105381868,tractor,DYET,,33.457314,-90.62949,...,0,248.5,2.6,0,1,415,1359,3,MPH,"3546,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
2815,79633,14219049,2017-01-31 04:14:19,1918,105381868,tractor,DYET,,35.992962,-87.488703,...,0,271.8,2.5,0,1,661,1617,3,MPH,"1872,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
3378,80196,14357014,2017-02-07 04:14:19,1918,105381868,tractor,DYET,,40.030509,-81.447453,...,0,305.6,4.8,0,1,452,1949,3,MPH,"3208,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
3958,80776,14499475,2017-02-14 04:15:56,1918,105381868,tractor,DYET,,38.705046,-80.66287,...,0,463.8,4.0,0,1,862,2761,3,MPH,"1975,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
4579,81397,14642654,2017-02-21 04:07:53,1918,105381868,tractor,DYET,,37.932453,-79.23412,...,0,353.1,4.6,1,1,500,2231,3,MPH,"1580,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
5224,82042,14784556,2017-02-28 04:21:00,1918,105381868,tractor,DYET,,36.467638,-86.685833,...,0,347.2,4.3,0,1,318,1840,3,MPH,"3199,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."
5856,82674,14923633,2017-03-07 04:18:03,1918,105381868,tractor,DYET,,36.066435,-86.435277,...,0,392.7,5.5,0,1,632,2325,3,MPH,"3433,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0..."


In [30]:
# Number of events per driver. DriverID is categorical, so categories that have
# no rows in this frame are still listed, with a count of 0.
events_levels_df['DriverID'].value_counts()

DYET          33866
SIMJIM        27161
VAUJAM        25712
ULAL          25449
ECKS          24105
CHAJO         23888
BULR          22974
BROD03        22253
CUNW          22195
PERT          21992
STEGA         21950
NARJ          21551
GALJU         21513
CHRN          21337
ANTR          20995
SWEJ          20839
FARMI         20808
TAYJON        20780
EDWTI         20767
SAVE          20310
DINE          20227
MORJAM        20193
CRES          19952
RUTP          19832
PRUH          19723
MARWI         19672
DEPK          19665
SIMJE         19398
SUTD          19355
WIET          19267
              ...  
36728             1
DAVKEN            1
KUNDAV            1
FURLA             1
BEAT              1
TAYDO1            1
RYAC              0
DUNG              0
BRAMAT            0
EDNJ              0
OWEJE             0
SHOP!             0
WORJON            0
BIVD              0
HARJI             0
KANR              0
NESD              0
JOHLO             0
CUMR              0


In [31]:
# Reload the QC performance extracts (timestamps parsed as datetimes) and
# remove invalid truck ids by keeping only rows with EquipmentID below 9999.
performance_df = (
    pd.read_csv(
        'data/Extranet2.QCPerformanceExtracts.csv',
        infer_datetime_format=True,
        parse_dates=['EventTimeStamp', 'LocationTimeStamp',
                     'DataStartTimeStamp', 'DataEndTimeStamp'],
    )
    .loc[lambda frame: frame['EquipmentID'] < 9999]
)

In [32]:
# Registry of the working frames by name, followed by a one-line shape report for each.
df_names_list = [
    'events_levels_df',
    'levels_fueltank_df',
    'optimization_fuel_df',
    'units_df',
    'transaction_df',
    'performance_df',
]
df_list = [
    events_levels_df,
    levels_fueltank_df,
    optimization_fuel_df,
    units_df,
    transaction_df,
    performance_df,
]
df_dict = {frame_name: frame for frame_name, frame in zip(df_names_list, df_list)}

for frame_name, frame in df_dict.items():
    print('{} shape: {}'.format(frame_name, frame.shape))

events_levels_df shape: (7908406, 13)
levels_fueltank_df shape: (7908824, 2)
optimization_fuel_df shape: (836, 1)
units_df shape: (704, 2)
transaction_df shape: (147813, 21)
performance_df shape: (32253, 32)


In [39]:

# Preview the first rows of the merged events frame.
events_levels_df.head()

Unnamed: 0,EventTimeStamp,EquipmentID,DriverID,Latitude,Longitude,LocationTimeStamp,Speed,Odometer,IgnitionStatus,EFReportReason,TankLevelPercent,TankLevelGallons,tankcap
12649083,2017-01-01 00:00:32.387,1992,BATK,30.34642,-81.70724,2017-01-01 00:00:30,0,39895.5,2,1,50.700001,101.400002,200
12649084,2017-01-01 00:01:24.867,1712,TAYANT,36.93953,-84.09541,2017-01-01 00:01:27,0,360378.8,1,2,78.0,156.0,200
12649092,2017-01-01 00:04:15.053,1713,RUSR,35.58664,-84.52786,2017-01-01 00:04:17,67,377577.4,1,2,31.700001,63.400002,200
12649102,2017-01-01 00:04:10.260,1585,MCAD,35.1543,-90.14263,2017-01-01 00:04:10,0,513931.8,1,2,43.200001,108.0,250
12649103,2017-01-01 00:10:13.187,1564,ULAL,39.90268,-84.19254,2017-01-01 00:10:11,3,498691.0,1,2,50.299999,125.75,250
