In [1]:
import pandas as pd 
import numpy as np 
#import gc
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns 

# Read in the merged data 2.0

In [2]:
col_names = ['idx1', 'idx2', 'Id', 'EventTimeStamp', 'EquipmentID', 'DriverID', 'Latitude',
       'Longitude', 'LocationTimeStamp', 'Speed', 'Odometer', 'IgnitionStatus',
       'EFReportReason', 'TankLevelPercent', 'TankLevelGallons',
       'ExactFuelEventId', 'truck', 'tankcap', 'refuel_datetime',
       'refuel_tr_sum', 'refuel_tr_id']
date_cols = ['EventTimeStamp', 'LocationTimeStamp', 'refuel_datetime' ]

In [3]:
dtype_dict = {
    'Id' : 'uint64', 
    'EquipmentID' : 'category', 
    'DriverID' : 'category', 
    'Latitude' : 'float64', 
    'Longitude' : 'float64', 
    'Speed' : 'uint16', 
    'Odometer' : 'float64', 
    'IgnitionStatus' : 'category', 
    'EFReportReason' : 'category', 
    'TankLevelPercent' : 'float32', 
    'TankLevelGallons' : 'float32', 
    'ExactFuelEventId' : 'uint64', 
    'truck' : 'category', 
    'tankcap' : 'uint16', 
    'refuel_tr_sum' : 'float32', 
    'refuel_tr_id' : 'category' }

In [4]:
## Read the csv after creating the dtype_dict, col_names, and date_cols objects
events_levels_df = pd.read_csv('/Users/paulomartinez/Downloads/events_levels_df.gz', skiprows=1, names=col_names,
                               dtype=dtype_dict, usecols=np.arange(2,19), 
                              parse_dates = date_cols, infer_datetime_format = True)
#9:29 pm - 9:30 pm

In [5]:
events_levels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7908406 entries, 0 to 7908405
Data columns (total 17 columns):
Id                   uint64
EventTimeStamp       datetime64[ns]
EquipmentID          category
DriverID             category
Latitude             float64
Longitude            float64
LocationTimeStamp    datetime64[ns]
Speed                uint16
Odometer             float64
IgnitionStatus       category
EFReportReason       category
TankLevelPercent     float32
TankLevelGallons     float32
ExactFuelEventId     uint64
truck                category
tankcap              uint16
refuel_datetime      datetime64[ns]
dtypes: category(5), datetime64[ns](3), float32(2), float64(3), uint16(2), uint64(2)
memory usage: 633.6 MB


# Drop the noisy rows

In [6]:
#list of noisy row ids
bad_row_Id = [12657126,
 12657127,
 12746275,
 12946311,
 13039741,
 13092342,
 13092343,
 13092344,
 13092345,
 13092346,
 13092347,
 13092555,
 13156996,
 13156997,
 13174091,
 13313530,
 13381098,
 13381099,
 13381100,
 13381101,
 13381102,
 13381103,
 13555164,
 13555165,
 13566460,
 13636409,
 13729945,
 13729946,
 13780017,
 13810682,
 13838489,
 13912193,
 13912251,
 13914679,
 13914680,
 13935785,
 14083809,
 14089445,
 14160586,
 14237432,
 14237433,
 14237448,
 14237449,
 14238192,
 14242725,
 14261769,
 14261770,
 14261771,
 14328059,
 14328060,
 14328061,
 14328062,
 14351176,
 14351177,
 14359863,
 14359864,
 14417642,
 14417643,
 14417644,
 14417645,
 14447049,
 14544626,
 14544627,
 14557081,
 14557505,
 14564960,
 14564961,
 14585667,
 14668575,
 14730292,
 14730293,
 14730294,
 14730295,
 14781469,
 14781470,
 14871213,
 14871214,
 14871215,
 14871237,
 14898651,
 14898652,
 14898653,
 14898654,
 14898655,
 14898656,
 14898657,
 14898658,
 14898659,
 14898660,
 14898661,
 14898662,
 14898663,
 14898664,
 14898665,
 14898666,
 14898667,
 14898668,
 14898669,
 14898670,
 14898671,
 14898672,
 14898673,
 14898674,
 14898675,
 14898676,
 14940092,
 14988050,
 14988052,
 14988053,
 14988054,
 15213067,
 15272473,
 15272474,
 15272475,
 15272476,
 15272477,
 15338152,
 15338153,
 15338154,
 15338155,
 15338156,
 15409791,
 15409792,
 15409793,
 15409794,
 15434374,
 15434375,
 15459392,
 15513384,
 15513385,
 15513386,
 15529682,
 15530029,
 15530030,
 15552006,
 15670484,
 15670485,
 15670486,
 15686110,
 15686111,
 15686139,
 15686140,
 15686141,
 15686142,
 15686143,
 15686144,
 15686145,
 15970716,
 15970719,
 15970720,
 16121249,
 16206897,
 16285004,
 16297322,
 16297323,
 16297324,
 16297325,
 16297326,
 16297327,
 16297328,
 16297329,
 16297330,
 16297331,
 16297332,
 16349951,
 16414146,
 16414147,
 16521380,
 16521381,
 16521382,
 16773389,
 16946873,
 16946874,
 16946875,
 17029642,
 17044451,
 17048531,
 17048532,
 17048533,
 17052293,
 17079226,
 17091437,
 17091438,
 17135503,
 17141248,
 17141249,
 17239056,
 17306363,
 17306364,
 17306365,
 17306366,
 17353645,
 17353647,
 17523789,
 17558391,
 17600517,
 17600518,
 17600519,
 17600520,
 17600521,
 17625382,
 17848054,
 18034883,
 18077107,
 18077136,
 18219769,
 18219770,
 18286629,
 18287766,
 18287797,
 18372956,
 18397269,
 18442627,
 18442628,
 18442629,
 18442630,
 18479608,
 18571739,
 18571740,
 18600441,
 18600444,
 18600445,
 18604899,
 18769793,
 18769794,
 18769795,
 18769796,
 18778132,
 18778133,
 18878427,
 18943054,
 18943055,
 18943056,
 18986682,
 19064535,
 19129470,
 19129471,
 19151185,
 19151187,
 19431189,
 19431190,
 19473744,
 19473747,
 19473748,
 19473749,
 19473750,
 19516211,
 19585856,
 19647247,
 19647248,
 19647249,
 19647307,
 19647308,
 19647309,
 19659265,
 19667110,
 19682036,
 19682037,
 19771347,
 19805675,
 19865184,
 19865185,
 19865186,
 19865187,
 19865188,
 19865189,
 19865190,
 19865191,
 19865192,
 19865193,
 19865194,
 19865195,
 19865196,
 19865197,
 19865198,
 19865199,
 19866576,
 19866577,
 19867559,
 19867560,
 19867561,
 19867562,
 19867563,
 19867564,
 19867891,
 19867892,
 19867893,
 19867894,
 19867895,
 19867896,
 19867897,
 19867898,
 19867899,
 19867900,
 19867901,
 19868038,
 19869152,
 19869153,
 19869463,
 19869464,
 19869465,
 19869466,
 19869467,
 19869468,
 19869469,
 19869587,
 19869588,
 19869589,
 19869590,
 19870675,
 19870676,
 19870677,
 19871176,
 19871177,
 19872846,
 19872847,
 19872848,
 19872849,
 19872850,
 19882277,
 19882278,
 19882279,
 19882280,
 19882281,
 19917072,
 19917073,
 19917074,
 19917075,
 19917076,
 19950534,
 19950535,
 19953297,
 19953300,
 19965259,
 19970595,
 20002163,
 20008440,
 20008443,
 20052156,
 20052455,
 20052456,
 20131852,
 20162872,
 20169350,
 20169351,
 20207146,
 20246645,
 20248564,
 20270484,
 20270487,
 20270488,
 20383219,
 20432075,
 20432104,
 20475553,
 20482760,
 20502251,
 20526997,
 20526998,
 20526999,
 20527000,
 20527090,
 20527091,
 20527092,
 20527093,
 20534852,
 20534853,
 20586712,
 20587078,
 20587079,
 20667306,
 20667307,
 20667932,
 20735846,
 20752293,
 20794736,
 20794737,
 20794738,
 20794739,
 20794740,
 20795223,
 20795224,
 20795225,
 20795226,
 20795227,
 20795545,
 20795546,
 20795547,
 20795550,
 20795551,
 20795553,
 20796460,
 20796461,
 20796462,
 20802387,
 20802388,
 20826624,
 20833078,
 20833079,
 20895567,
 20895568,
 20895569,
 20895570,
 20895571,
 20896364,
 20974377,
 20975272,
 20975273,
 20975274,
 20994196,
 21093100]

In [7]:
type(bad_row_Id[0])

int

In [12]:
bs = [i not in bad_row_Id for i in events_levels_df.Id]
bs
#this list comprehension will take aproximately 1.5 hours

#started running anew 12:32 am
    #try again for all of it. 8Mil is 8 times more so, 
    #expect 80 minutes...
    
    # 1,000,000 rows began at 12:17 am - 12:28 am
        #I suppose that if 100K took 2 minutes, then ten times that
        #would take 20 minutes
    
    #100,000 rows took 12:15 am - 12:17 am

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 Tru

In [13]:
#redefine the df
t0 = datetime.now()
print('started at ', datetime.now())
events_levels_df = events_levels_df[bs]
tf = datetime.now()
print('finished at ', tf)
print('whole process took ', tf-t0)
events_levels_df.info()

started at  2018-05-04 02:03:07.371353
finished at  2018-05-04 02:03:09.292286


NameError: name 't0' is not defined

In [14]:
events_levels_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7907995 entries, 0 to 7908405
Data columns (total 17 columns):
Id                   uint64
EventTimeStamp       datetime64[ns]
EquipmentID          category
DriverID             category
Latitude             float64
Longitude            float64
LocationTimeStamp    datetime64[ns]
Speed                uint16
Odometer             float64
IgnitionStatus       category
EFReportReason       category
TankLevelPercent     float32
TankLevelGallons     float32
ExactFuelEventId     uint64
truck                category
tankcap              uint16
refuel_datetime      datetime64[ns]
dtypes: category(5), datetime64[ns](3), float32(2), float64(3), uint16(2), uint64(2)
memory usage: 693.9 MB


# Row cleaning was successful ✅ 

#### Now I need to drop all rows with timestamps inside of the time range of consecutive refuelings
- to do this, I'll need to identify each trucks fueling purchases

It looks like Jarrod’s merge produced a litany of duplicate refuel_datetime values. ⛽️ ⛽️ ...⛽️ 

- ✅ This could make sense since every fuel level measurement that happened between refuel_datetime times might have received the preceding/merged refuel_datetime time-stamp.

Assuming this is the case, I should be able to run a modified version of my flag function to flag "inter-fueling-rows". 🇲🇽
- leaving us with a "feather-light" 89,976 rows

##### attempt to flag inter-fueling-rows

    🦂 
    Q: (is it possible that I would have dropped a noisy row that also happened to identify the next fueling event? 
    
    
    A: Perhaps, (although the odds of having this problem are only 411/7.9Million). Regardless, we should be fine, since the following row will have a sufficiently close timestamp and odometer reading.) 
    So, we're good to go 
    😁 🚀 

In [20]:
#sort by truck and date
t0 = datetime.now()
print('started at ', t0)
events_levels_df.sort_values(['truck', 'EventTimeStamp'],
                                     inplace = True
                                         )
tf = datetime.now()
print('finished at ', tf)
print('whole process took ', tf-t0)

events_levels_df
#started sort at 12:10 am - 12:10 am

started at  2018-05-04 02:40:12.298870
finished at  2018-05-04 02:40:19.543207
whole process took  0:00:07.244337


Unnamed: 0,Id,EventTimeStamp,EquipmentID,DriverID,Latitude,Longitude,LocationTimeStamp,Speed,Odometer,IgnitionStatus,EFReportReason,TankLevelPercent,TankLevelGallons,ExactFuelEventId,truck,tankcap,refuel_datetime
2441,12804900,2017-01-09 08:03:30.687,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575630.9,1,1,25.200001,57.959999,12804900,1508,230,NaT
2443,12804901,2017-01-09 08:05:30.670,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575630.9,1,0,25.200001,57.959999,12804901,1508,230,NaT
2449,12804902,2017-01-09 08:17:10.670,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,25.200001,57.959999,12804902,1508,230,NaT
2512,12804904,2017-01-09 12:24:31.170,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,24.000000,55.200001,12804904,1508,230,NaT
2513,12804903,2017-01-09 12:25:41.203,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,0,24.000000,55.200001,12804903,1508,230,NaT
2528,12804905,2017-01-09 14:04:41.767,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,24.700001,56.810001,12804905,1508,230,NaT
2529,12804906,2017-01-09 14:06:31.780,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,0,24.700001,56.810001,12804906,1508,230,NaT
2533,12804907,2017-01-09 14:26:11.797,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,1,1,22.900000,52.669998,12804907,1508,230,NaT
2534,12804908,2017-01-09 14:28:21.813,1508,,36.06612,-86.43410,2016-12-10 14:29:31,4,575631.0,1,0,23.000000,52.900002,12804908,1508,230,NaT
2541,12804899,2017-01-09 14:36:01.877,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575632.3,2,1,16.799999,38.639999,12804899,1508,230,NaT


In [21]:
#reset the index now that we have dropped the noisy odometer rows
t0 = datetime.now()
print('started at ', t0)
events_levels_df.reset_index(inplace = True, drop = True)
events_levels_df
tf = datetime.now()
print('finished at ', tf)
print('whole process took ', tf-t0)

started at  2018-05-04 02:40:40.961868
finished at  2018-05-04 02:40:40.970311
whole process took  0:00:00.008443


In [22]:
events_levels_df

Unnamed: 0,Id,EventTimeStamp,EquipmentID,DriverID,Latitude,Longitude,LocationTimeStamp,Speed,Odometer,IgnitionStatus,EFReportReason,TankLevelPercent,TankLevelGallons,ExactFuelEventId,truck,tankcap,refuel_datetime
0,12804900,2017-01-09 08:03:30.687,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575630.9,1,1,25.200001,57.959999,12804900,1508,230,NaT
1,12804901,2017-01-09 08:05:30.670,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575630.9,1,0,25.200001,57.959999,12804901,1508,230,NaT
2,12804902,2017-01-09 08:17:10.670,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,25.200001,57.959999,12804902,1508,230,NaT
3,12804904,2017-01-09 12:24:31.170,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,24.000000,55.200001,12804904,1508,230,NaT
4,12804903,2017-01-09 12:25:41.203,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,0,24.000000,55.200001,12804903,1508,230,NaT
5,12804905,2017-01-09 14:04:41.767,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,24.700001,56.810001,12804905,1508,230,NaT
6,12804906,2017-01-09 14:06:31.780,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,0,24.700001,56.810001,12804906,1508,230,NaT
7,12804907,2017-01-09 14:26:11.797,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,1,1,22.900000,52.669998,12804907,1508,230,NaT
8,12804908,2017-01-09 14:28:21.813,1508,,36.06612,-86.43410,2016-12-10 14:29:31,4,575631.0,1,0,23.000000,52.900002,12804908,1508,230,NaT
9,12804899,2017-01-09 14:36:01.877,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575632.3,2,1,16.799999,38.639999,12804899,1508,230,NaT


In [26]:
def flag_droppable_fuel_event_rows(df):#!!!! requires a dataframe with a reset index!!!!
    '''takes a dataframe with a (fueling) datetime column and returns a boolean index 
    identifying which rows are inter-fueling readings and can be dropped.
    '''
    t0 = datetime.now()
    print('started at ', t0)
    
    flags = [] #initialize boolean index as a list
    flag = True #set first flag to keep
    for i in range(len(df.refuel_datetime)-1):#loop through the n-1 rows of the df (we'll assign the nth row below, once the loop is done).
        flags.append(flag) #We'll keep the first row (i = 0)
        #if the next row has the same refuel_datetime timestamp, then we want to flag it for dropping
        if (df.refuel_datetime[i + 1] == df.refuel_datetime[i]):
            flag = False #turn the flag state from keep to drop
        else: #if the next refuel_datetime is different, then we want to keep it. Notice that this will keep the last refuel_datetime row for any given truck because the next row will have a different time stamp
            flag = True #turn the flag state from drop to keep 
            #since we sorted by truck and EventTimeStamp it should be extremely unlikely that the next truck's first recorded fueling event is at the exact same time-stamp as the former truck's last fueling timestamp, so we'll just assume that we don't need to verify we're working with the same truck 
    flags.append(True)#make sure to keep the last row (which will probably have the same timestamp as it's predecessor but, since it is the last record, we need to keep it to determine the last odometer reading of the truck at the last row)
    
    tf = datetime.now()
    print('finished at ', tf)
    print('whole process took ', tf-t0)
    
    return flags
            

In [27]:
events_levels_df

Unnamed: 0,Id,EventTimeStamp,EquipmentID,DriverID,Latitude,Longitude,LocationTimeStamp,Speed,Odometer,IgnitionStatus,EFReportReason,TankLevelPercent,TankLevelGallons,ExactFuelEventId,truck,tankcap,refuel_datetime
0,12804900,2017-01-09 08:03:30.687,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575630.9,1,1,25.200001,57.959999,12804900,1508,230,NaT
1,12804901,2017-01-09 08:05:30.670,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575630.9,1,0,25.200001,57.959999,12804901,1508,230,NaT
2,12804902,2017-01-09 08:17:10.670,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,25.200001,57.959999,12804902,1508,230,NaT
3,12804904,2017-01-09 12:24:31.170,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,24.000000,55.200001,12804904,1508,230,NaT
4,12804903,2017-01-09 12:25:41.203,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,0,24.000000,55.200001,12804903,1508,230,NaT
5,12804905,2017-01-09 14:04:41.767,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,24.700001,56.810001,12804905,1508,230,NaT
6,12804906,2017-01-09 14:06:31.780,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,0,24.700001,56.810001,12804906,1508,230,NaT
7,12804907,2017-01-09 14:26:11.797,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,1,1,22.900000,52.669998,12804907,1508,230,NaT
8,12804908,2017-01-09 14:28:21.813,1508,,36.06612,-86.43410,2016-12-10 14:29:31,4,575631.0,1,0,23.000000,52.900002,12804908,1508,230,NaT
9,12804899,2017-01-09 14:36:01.877,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575632.3,2,1,16.799999,38.639999,12804899,1508,230,NaT


In [28]:
#Ok, lets test the flag_droppable_fuel_event_rows function on a toy slice
toydf = events_levels_df[0:1000]
toydf
#visual inspection confirms we have more than one truck in there

Unnamed: 0,Id,EventTimeStamp,EquipmentID,DriverID,Latitude,Longitude,LocationTimeStamp,Speed,Odometer,IgnitionStatus,EFReportReason,TankLevelPercent,TankLevelGallons,ExactFuelEventId,truck,tankcap,refuel_datetime
0,12804900,2017-01-09 08:03:30.687,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575630.9,1,1,25.200001,57.959999,12804900,1508,230,NaT
1,12804901,2017-01-09 08:05:30.670,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575630.9,1,0,25.200001,57.959999,12804901,1508,230,NaT
2,12804902,2017-01-09 08:17:10.670,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,25.200001,57.959999,12804902,1508,230,NaT
3,12804904,2017-01-09 12:24:31.170,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,24.000000,55.200001,12804904,1508,230,NaT
4,12804903,2017-01-09 12:25:41.203,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,0,24.000000,55.200001,12804903,1508,230,NaT
5,12804905,2017-01-09 14:04:41.767,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,24.700001,56.810001,12804905,1508,230,NaT
6,12804906,2017-01-09 14:06:31.780,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,0,24.700001,56.810001,12804906,1508,230,NaT
7,12804907,2017-01-09 14:26:11.797,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,1,1,22.900000,52.669998,12804907,1508,230,NaT
8,12804908,2017-01-09 14:28:21.813,1508,,36.06612,-86.43410,2016-12-10 14:29:31,4,575631.0,1,0,23.000000,52.900002,12804908,1508,230,NaT
9,12804899,2017-01-09 14:36:01.877,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575632.3,2,1,16.799999,38.639999,12804899,1508,230,NaT


In [29]:
flags = flag_droppable_fuel_event_rows(toydf)
flags
#this took .06 seconds for 1,000 rows

started at  2018-05-04 03:01:38.910555
finished at  2018-05-04 03:01:38.972743
whole process took  0:00:00.062188


[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 Fa

In [30]:
#let's try it on 10,000 rows
flags = flag_droppable_fuel_event_rows(events_levels_df[0:10000])
flags
#this only took .4 seconds

started at  2018-05-04 03:04:03.065515
finished at  2018-05-04 03:04:03.470719
whole process took  0:00:00.405204


[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 Fa

In [31]:
#let's try it on 100,000 rows
flags = flag_droppable_fuel_event_rows(events_levels_df[0:100000])
flags
#this only took 3.5 seconds

started at  2018-05-04 03:04:36.784410
finished at  2018-05-04 03:04:40.343165
whole process took  0:00:03.558755


[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 Fa

In [32]:
#let's try it on 1,000,000 rows
flags = flag_droppable_fuel_event_rows(events_levels_df[0:1000000])
flags
#this should only take about 30 seconds. But it actually took 51 seconds.
    #so 8 times longer should be under 8 minutes

started at  2018-05-04 03:05:15.717123
finished at  2018-05-04 03:06:07.374676
whole process took  0:00:51.657553


[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 Fa

In [33]:
#let's try it on the whole shebang
flags = flag_droppable_fuel_event_rows(events_levels_df)
flags
#expecting 8 minute turnaround. (Actually took 7 minutes and 9 seconds)

started at  2018-05-04 03:07:15.920177
finished at  2018-05-04 03:14:25.610831
whole process took  0:07:09.690654


[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 Fa

In [35]:
#alright, redefine the df by dropping the noisy rows
t0 = datetime.now()
print('started at ', datetime.now())
truck_performance_df = events_levels_df[flags]
tf = datetime.now()
print('finished at ', tf)
print('whole process took ', tf-t0)
truck_performance_df.info()

started at  2018-05-04 03:15:50.633119
finished at  2018-05-04 03:15:50.970853
whole process took  0:00:00.337769
<class 'pandas.core.frame.DataFrame'>
Int64Index: 219156 entries, 0 to 7907994
Data columns (total 17 columns):
Id                   219156 non-null uint64
EventTimeStamp       219156 non-null datetime64[ns]
EquipmentID          219156 non-null category
DriverID             109320 non-null category
Latitude             219156 non-null float64
Longitude            219156 non-null float64
LocationTimeStamp    219156 non-null datetime64[ns]
Speed                219156 non-null uint16
Odometer             219156 non-null float64
IgnitionStatus       219156 non-null category
EFReportReason       219156 non-null category
TankLevelPercent     219156 non-null float32
TankLevelGallons     219156 non-null float32
ExactFuelEventId     219156 non-null uint64
truck                219156 non-null category
tankcap              219156 non-null uint16
refuel_datetime      106630 non-null da

In [36]:
#let's reset the index for abstemiousness
#alright, redefine the df by dropping the noisy rows
t0 = datetime.now()
print('started at ', datetime.now())
truck_performance_df.reset_index(inplace = True, drop = True)
tf = datetime.now()
print('finished at ', tf)
print('whole process took ', tf-t0)
truck_performance_df.info()

started at  2018-05-04 03:27:51.400248
finished at  2018-05-04 03:27:51.400942
whole process took  0:00:00.000742
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 219156 entries, 0 to 219155
Data columns (total 17 columns):
Id                   219156 non-null uint64
EventTimeStamp       219156 non-null datetime64[ns]
EquipmentID          219156 non-null category
DriverID             109320 non-null category
Latitude             219156 non-null float64
Longitude            219156 non-null float64
LocationTimeStamp    219156 non-null datetime64[ns]
Speed                219156 non-null uint16
Odometer             219156 non-null float64
IgnitionStatus       219156 non-null category
EFReportReason       219156 non-null category
TankLevelPercent     219156 non-null float32
TankLevelGallons     219156 non-null float32
ExactFuelEventId     219156 non-null uint64
truck                219156 non-null category
tankcap              219156 non-null uint16
refuel_datetime      106630 non-null dat

In [37]:
truck_performance_df

Unnamed: 0,Id,EventTimeStamp,EquipmentID,DriverID,Latitude,Longitude,LocationTimeStamp,Speed,Odometer,IgnitionStatus,EFReportReason,TankLevelPercent,TankLevelGallons,ExactFuelEventId,truck,tankcap,refuel_datetime
0,12804900,2017-01-09 08:03:30.687,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575630.9,1,1,25.200001,57.959999,12804900,1508,230,NaT
1,12804901,2017-01-09 08:05:30.670,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575630.9,1,0,25.200001,57.959999,12804901,1508,230,NaT
2,12804902,2017-01-09 08:17:10.670,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,25.200001,57.959999,12804902,1508,230,NaT
3,12804904,2017-01-09 12:24:31.170,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,24.000000,55.200001,12804904,1508,230,NaT
4,12804903,2017-01-09 12:25:41.203,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,0,24.000000,55.200001,12804903,1508,230,NaT
5,12804905,2017-01-09 14:04:41.767,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,24.700001,56.810001,12804905,1508,230,NaT
6,12804906,2017-01-09 14:06:31.780,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,0,24.700001,56.810001,12804906,1508,230,NaT
7,12804907,2017-01-09 14:26:11.797,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,1,1,22.900000,52.669998,12804907,1508,230,NaT
8,12804908,2017-01-09 14:28:21.813,1508,,36.06612,-86.43410,2016-12-10 14:29:31,4,575631.0,1,0,23.000000,52.900002,12804908,1508,230,NaT
9,12804899,2017-01-09 14:36:01.877,1508,,36.06612,-86.43410,2016-12-10 14:29:31,0,575632.3,2,1,16.799999,38.639999,12804899,1508,230,NaT


I'm seeing a lot of NaT values in the date time column and NaN values  in the DriverID column... Why are there so many null values?

In [38]:
truck_performance_df.refuel_datetime.value_counts()

2017-06-06 11:12:00    27
2017-07-25 21:02:00    24
2017-06-06 19:08:00    21
2017-11-07 01:11:00    21
2017-12-28 02:41:00    17
2018-01-22 21:52:00    15
2017-04-04 01:56:00    15
2017-06-25 13:48:00    14
2017-02-20 19:50:00    12
2017-01-04 15:47:00    10
2017-11-28 03:31:00     9
2017-08-12 10:19:00     9
2018-02-19 23:53:00     9
2017-07-27 15:25:00     8
2017-08-23 11:40:00     8
2017-09-25 14:22:00     8
2018-03-14 08:38:00     8
2017-06-29 04:05:00     8
2017-03-02 07:22:00     7
2017-01-04 07:09:00     7
2017-08-13 03:50:00     7
2017-01-04 07:52:00     7
2017-01-05 06:39:00     7
2017-07-04 17:51:00     7
2018-03-17 15:47:00     7
2017-08-07 21:23:00     7
2017-05-27 21:19:00     7
2017-08-21 01:13:00     7
2017-07-27 09:54:00     7
2017-12-17 01:07:00     7
                       ..
2017-02-14 04:19:00     1
2017-11-07 07:10:00     1
2017-08-13 13:57:00     1
2017-11-14 01:50:00     1
2017-04-02 10:03:00     1
2017-01-20 04:30:00     1
2017-08-27 15:10:00     1
2017-07-02 1

This value_counts call is alarming. I didn't expect to have any fueling events time-stamped on the exact same time. 

In [41]:
truck_performance_df.refuel_datetime.isnull().sum()
#112,526 of my rows are null.... that's over half of what I began with

112526

In [42]:
len(truck_performance_df.refuel_datetimeuel_datetime)

219156

## It does not look like the process wen't according to plan.

# Why do we have so many NA values?