In [13]:
import pandas as pd 
import numpy as np 
#import gc
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns 
import timeit

In [15]:
# Sanity-check that timeit works: time sorting a copy of a fixed list of 1000 floats.
# The setup snippet runs inside timeit's own namespace, so it seeds and builds
# the list itself and binds list.sort to a short name.
setup = '''
import random

random.seed('slartibartfast')
s = [random.random() for i in range(1000)]
timesort = list.sort
'''

# 7 repeats of 1000 executions each; report the fastest repeat (total seconds for 1000 runs).
sort_timer = timeit.Timer(stmt='a = s[:]; timesort(a)', setup=setup)
run_times = sort_timer.repeat(repeat=7, number=1000)
print(min(run_times))

0.14642716399976052


# Read in the merged data 2.0

In [2]:
# Column labels for the merged CSV, in file order (21 columns).
# idx1/idx2 look like leftover index columns from earlier exports -- TODO confirm.
col_names = [
    'idx1', 'idx2',
    'Id', 'EventTimeStamp', 'EquipmentID', 'DriverID',
    'Latitude', 'Longitude', 'LocationTimeStamp',
    'Speed', 'Odometer', 'IgnitionStatus', 'EFReportReason',
    'TankLevelPercent', 'TankLevelGallons', 'ExactFuelEventId',
    'truck', 'tankcap',
    'refuel_datetime', 'refuel_tr_sum', 'refuel_tr_id',
]

# Columns handed to read_csv(parse_dates=...) so they load as datetimes.
date_cols = [
    'EventTimeStamp',
    'LocationTimeStamp',
    'refuel_datetime',
]

In [3]:
# Explicit dtypes for read_csv: narrow numeric types and categoricals keep the
# multi-million-row frame small. Date columns are not listed here; they are
# handled through parse_dates instead.
dtype_dict = dict(
    Id='uint64',
    EquipmentID='category',
    DriverID='category',
    Latitude='float64',
    Longitude='float64',
    Speed='uint16',
    Odometer='float64',
    IgnitionStatus='category',
    EFReportReason='category',
    TankLevelPercent='float32',
    TankLevelGallons='float32',
    ExactFuelEventId='uint64',
    truck='category',
    tankcap='uint16',
    refuel_tr_sum='float32',
    refuel_tr_id='category',
)

In [4]:
# Load the merged events/levels CSV. Requires col_names, dtype_dict and date_cols
# to have been defined by the cells above.
events_levels_path = '/Users/paulomartinez/Downloads/events_levels_df.gz'
read_options = dict(
    skiprows=1,                # skip the first line of the file (presumably its own header); names= supplies the labels
    names=col_names,
    dtype=dtype_dict,
    usecols=np.arange(2,19),   # positions 2..18 of col_names: leaves out idx1, idx2, refuel_tr_sum, refuel_tr_id
    parse_dates=date_cols,
    infer_datetime_format=True,
)
events_levels_df = pd.read_csv(events_levels_path, **read_options)
# Wall-clock notes from two earlier runs of this cell:
#   11:20 pm - 11:21 pm
#   11:30 pm - 11:31 pm

In [5]:
# Confirm the load: print the row count, per-column dtypes and memory footprint.
events_levels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7908406 entries, 0 to 7908405
Data columns (total 17 columns):
Id                   uint64
EventTimeStamp       datetime64[ns]
EquipmentID          category
DriverID             category
Latitude             float64
Longitude            float64
LocationTimeStamp    datetime64[ns]
Speed                uint16
Odometer             float64
IgnitionStatus       category
EFReportReason       category
TankLevelPercent     float32
TankLevelGallons     float32
ExactFuelEventId     uint64
truck                category
tankcap              uint16
refuel_datetime      datetime64[ns]
dtypes: category(5), datetime64[ns](3), float32(2), float64(3), uint16(2), uint64(2)
memory usage: 633.6 MB


In [6]:
# Display the first rows of the freshly loaded frame as a quick visual check.
events_levels_df.head()

Unnamed: 0,Id,EventTimeStamp,EquipmentID,DriverID,Latitude,Longitude,LocationTimeStamp,Speed,Odometer,IgnitionStatus,EFReportReason,TankLevelPercent,TankLevelGallons,ExactFuelEventId,truck,tankcap,refuel_datetime
0,12649676,2017-01-01 00:19:45.357,1559,,36.06712,-86.43476,2017-01-01 00:01:05,0,505629.3,1,2,96.0,240.0,12649676,1559,250,NaT
1,12649677,2017-01-01 00:34:45.370,1559,,36.06712,-86.43476,2017-01-01 00:01:05,0,505629.3,1,2,96.0,240.0,12649677,1559,250,NaT
2,12649678,2017-01-01 00:49:45.417,1559,,36.06712,-86.43476,2017-01-01 00:01:05,0,505629.3,1,2,96.0,240.0,12649678,1559,250,NaT
3,12649679,2017-01-01 01:04:45.463,1559,,36.06712,-86.43476,2017-01-01 00:01:05,0,505629.3,1,2,96.0,240.0,12649679,1559,250,NaT
4,12649680,2017-01-01 01:19:45.480,1559,,36.06712,-86.43476,2017-01-01 00:01:05,0,505629.3,1,2,96.0,240.0,12649680,1559,250,NaT


# Drop the noisy rows

Ok, let's try an alternative.
1. drop superfluous columns
2. sort by truck and datetime
3. run flag function
4. define slice
    - remove noisy df

## --- Let's first make sure the df isn't corrupted somehow

In [7]:
# Show the column labels as a plain Python list.
events_levels_df.columns.tolist()

['Id',
 'EventTimeStamp',
 'EquipmentID',
 'DriverID',
 'Latitude',
 'Longitude',
 'LocationTimeStamp',
 'Speed',
 'Odometer',
 'IgnitionStatus',
 'EFReportReason',
 'TankLevelPercent',
 'TankLevelGallons',
 'ExactFuelEventId',
 'truck',
 'tankcap',
 'refuel_datetime']

In [8]:
# For every column, print how many distinct values it holds (size of the set of its values).
for column_label in events_levels_df.columns:
    column_name = str(column_label)
    distinct_values = set(events_levels_df[column_name])
    print('len of set of col:', column_name, len(distinct_values))

len of set of col: Id 7908406
len of set of col: EventTimeStamp 7527560
len of set of col: EquipmentID 629
len of set of col: DriverID 925
len of set of col: Latitude 1032388
len of set of col: Longitude 1286842
len of set of col: LocationTimeStamp 6551558
len of set of col: Speed 88
len of set of col: Odometer 3002046
len of set of col: IgnitionStatus 2
len of set of col: EFReportReason 3
len of set of col: TankLevelPercent 1000
len of set of col: TankLevelGallons 4333
len of set of col: ExactFuelEventId 7908406
len of set of col: truck 629
len of set of col: tankcap 5
len of set of col: refuel_datetime 89976


In [9]:
# Step 1: drop superfluous columns.
# EquipmentID and truck show the same distinct count (629) in the check above,
# so EquipmentID looks redundant -- TODO confirm they are value-for-value equal.
events_levels_df.drop(columns=['EquipmentID'], inplace=True)
# Re-print the schema to confirm the column is gone.
events_levels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7908406 entries, 0 to 7908405
Data columns (total 16 columns):
Id                   uint64
EventTimeStamp       datetime64[ns]
DriverID             category
Latitude             float64
Longitude            float64
LocationTimeStamp    datetime64[ns]
Speed                uint16
Odometer             float64
IgnitionStatus       category
EFReportReason       category
TankLevelPercent     float32
TankLevelGallons     float32
ExactFuelEventId     uint64
truck                category
tankcap              uint16
refuel_datetime      datetime64[ns]
dtypes: category(4), datetime64[ns](3), float32(2), float64(3), uint16(2), uint64(2)
memory usage: 618.5 MB


In [10]:
# Display the first rows again to check the frame after dropping EquipmentID.
events_levels_df.head()

Unnamed: 0,Id,EventTimeStamp,DriverID,Latitude,Longitude,LocationTimeStamp,Speed,Odometer,IgnitionStatus,EFReportReason,TankLevelPercent,TankLevelGallons,ExactFuelEventId,truck,tankcap,refuel_datetime
0,12649676,2017-01-01 00:19:45.357,,36.06712,-86.43476,2017-01-01 00:01:05,0,505629.3,1,2,96.0,240.0,12649676,1559,250,NaT
1,12649677,2017-01-01 00:34:45.370,,36.06712,-86.43476,2017-01-01 00:01:05,0,505629.3,1,2,96.0,240.0,12649677,1559,250,NaT
2,12649678,2017-01-01 00:49:45.417,,36.06712,-86.43476,2017-01-01 00:01:05,0,505629.3,1,2,96.0,240.0,12649678,1559,250,NaT
3,12649679,2017-01-01 01:04:45.463,,36.06712,-86.43476,2017-01-01 00:01:05,0,505629.3,1,2,96.0,240.0,12649679,1559,250,NaT
4,12649680,2017-01-01 01:19:45.480,,36.06712,-86.43476,2017-01-01 00:01:05,0,505629.3,1,2,96.0,240.0,12649680,1559,250,NaT


In [17]:
# Step 2: order rows by truck, then chronologically within each truck, so that
# consecutive rows belong to the same truck's timeline. Sorting is done in place.
sort_keys = ['truck', 'EventTimeStamp']
events_levels_df.sort_values(by=sort_keys, inplace=True)
# Timing note from an earlier run: started sort at 12:10 am - 12:10 am
events_levels_df

Unnamed: 0,Id,EventTimeStamp,DriverID,Latitude,Longitude,LocationTimeStamp,Speed,Odometer,IgnitionStatus,EFReportReason,TankLevelPercent,TankLevelGallons,ExactFuelEventId,truck,tankcap,refuel_datetime
2443,12804900,2017-01-09 08:03:30.687,,36.06612,-86.43410,2016-12-10 14:29:31,0,575630.9,1,1,25.200001,57.959999,12804900,1508,230,NaT
2445,12804901,2017-01-09 08:05:30.670,,36.06612,-86.43410,2016-12-10 14:29:31,0,575630.9,1,0,25.200001,57.959999,12804901,1508,230,NaT
2451,12804902,2017-01-09 08:17:10.670,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,25.200001,57.959999,12804902,1508,230,NaT
2514,12804904,2017-01-09 12:24:31.170,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,24.000000,55.200001,12804904,1508,230,NaT
2515,12804903,2017-01-09 12:25:41.203,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,0,24.000000,55.200001,12804903,1508,230,NaT
2530,12804905,2017-01-09 14:04:41.767,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,24.700001,56.810001,12804905,1508,230,NaT
2531,12804906,2017-01-09 14:06:31.780,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,0,24.700001,56.810001,12804906,1508,230,NaT
2535,12804907,2017-01-09 14:26:11.797,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,1,1,22.900000,52.669998,12804907,1508,230,NaT
2536,12804908,2017-01-09 14:28:21.813,,36.06612,-86.43410,2016-12-10 14:29:31,4,575631.0,1,0,23.000000,52.900002,12804908,1508,230,NaT
2543,12804899,2017-01-09 14:36:01.877,,36.06612,-86.43410,2016-12-10 14:29:31,0,575632.3,2,1,16.799999,38.639999,12804899,1508,230,NaT


In [18]:
# Reset the index: after the sort the row labels are scrambled, so replace them
# with a fresh 0..n-1 range. drop=True discards the old labels instead of
# keeping them as a new column.
# (Fixed: the method is `reset_index`; `reset_indext_index` raises AttributeError.)
events_levels_df.reset_index(inplace = True, drop = True)
events_levels_df

Unnamed: 0,Id,EventTimeStamp,DriverID,Latitude,Longitude,LocationTimeStamp,Speed,Odometer,IgnitionStatus,EFReportReason,TankLevelPercent,TankLevelGallons,ExactFuelEventId,truck,tankcap,refuel_datetime
0,12804900,2017-01-09 08:03:30.687,,36.06612,-86.43410,2016-12-10 14:29:31,0,575630.9,1,1,25.200001,57.959999,12804900,1508,230,NaT
1,12804901,2017-01-09 08:05:30.670,,36.06612,-86.43410,2016-12-10 14:29:31,0,575630.9,1,0,25.200001,57.959999,12804901,1508,230,NaT
2,12804902,2017-01-09 08:17:10.670,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,25.200001,57.959999,12804902,1508,230,NaT
3,12804904,2017-01-09 12:24:31.170,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,24.000000,55.200001,12804904,1508,230,NaT
4,12804903,2017-01-09 12:25:41.203,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,0,24.000000,55.200001,12804903,1508,230,NaT
5,12804905,2017-01-09 14:04:41.767,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,1,24.700001,56.810001,12804905,1508,230,NaT
6,12804906,2017-01-09 14:06:31.780,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,2,0,24.700001,56.810001,12804906,1508,230,NaT
7,12804907,2017-01-09 14:26:11.797,,36.06612,-86.43410,2016-12-10 14:29:31,0,575631.0,1,1,22.900000,52.669998,12804907,1508,230,NaT
8,12804908,2017-01-09 14:28:21.813,,36.06612,-86.43410,2016-12-10 14:29:31,4,575631.0,1,0,23.000000,52.900002,12804908,1508,230,NaT
9,12804899,2017-01-09 14:36:01.877,,36.06612,-86.43410,2016-12-10 14:29:31,0,575632.3,2,1,16.799999,38.639999,12804899,1508,230,NaT


In [22]:
def flag_anomal_odom_rows_2(df, threshold=200):
    '''Build a keep/drop mask that marks noisy Odometer readings.

    The frame is expected to be ordered so that each truck's rows are
    contiguous and in time order. Walking down the rows, the state starts at
    "keep" for every truck. Each time the Odometer changes by more than
    `threshold` (in absolute value) between two consecutive rows of the same
    truck, the state flips: the first jump starts a noisy stretch, the next
    jump (the correction) ends it. A change of truck always resets the state
    to "keep".

    Rows are processed by position, so the index of `df` does not need to be
    reset first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'Odometer' column (numeric) and a 'truck' column.
    threshold : number, default 200
        Minimum absolute row-to-row Odometer change treated as an anomaly,
        in the same units as the Odometer column.

    Returns
    -------
    list of bool
        One entry per row of `df`: True means keep the row, False means drop
        it. An empty frame yields an empty list.
    '''
    n_rows = len(df)
    if n_rows == 0:
        # Nothing to flag; the mask must have the same length as the frame.
        return []

    # Pull plain arrays out once; per-row label lookups on the frame are what
    # made the row-by-row version impractically slow.
    odometer = np.asarray(df['Odometer'], dtype='float64')
    truck = np.asarray(df['truck'])

    # Entry k describes the transition from row k to row k + 1.
    truck_changed = np.asarray(truck[1:] != truck[:-1], dtype=bool)
    with np.errstate(invalid='ignore'):
        # A missing Odometer gives a NaN difference, which never exceeds the threshold.
        big_jump = np.abs(np.diff(odometer)) > threshold

    # toggles[j]: the state flips on arriving at row j.
    toggles = np.zeros(n_rows, dtype=bool)
    toggles[1:] = ~truck_changed & big_jump

    # starts[j]: row j is the first row of a truck's run, where the state resets.
    starts = np.ones(n_rows, dtype=bool)
    starts[1:] = truck_changed

    # Count the flips seen since the start of the current truck's run; an even
    # count means the state is back at "keep".
    flips_so_far = np.cumsum(toggles)
    run_start = np.maximum.accumulate(np.where(starts, np.arange(n_rows), 0))
    flips_in_run = flips_so_far - flips_so_far[run_start]
    keep = (flips_in_run % 2) == 0

    return keep.tolist()

In [None]:
# Run the odometer-noise flagger over the whole frame and report wall-clock time.
# `flags` (one boolean per row) is consumed by the next cell to filter the frame.
t0 = datetime.now()
print('started at ', datetime.now())
flags = flag_anomal_odom_rows_2(events_levels_df)
tf = datetime.now()
print('finished at ', tf)
print('whole process took ', tf-t0)
    # Run log:
    # - removed the print statement and some empty lines from the function
    # - ran the second attempt on 5/4/18 at 00:41 am CST

    # The print statement inside the function was taking too long.
    # Observed throughput was about 3 rows/second
        # => 2,650,000 seconds = 44,167 minutes = 736 hours = 31 days... 😒 

started at  2018-05-04 03:43:05.333591


In [None]:
# Assuming the flagging worked, redefine the df by keeping only the rows whose
# flag is True (i.e. dropping the noisy odometer rows).
# Start a fresh clock here: reusing t0 from the flagging cell would fold that
# cell's runtime into this cell's "whole process took" figure.
t0 = datetime.now()
print('started at ', t0)
events_levels_df = events_levels_df[flags]
tf = datetime.now()
print('finished at ', tf)
print('whole process took ', tf-t0)
# Print the schema and row count to see how many rows survived the filter.
events_levels_df.info()

# Assuming row cleaning was successful 🤞

#### Now I need to drop all rows with timestamps inside of the time range of consecutive refuelings
- To do this, I'll need to identify each truck's fueling purchases

It looks like Jarrod’s merge produced a litany of duplicate refuel_datetime values. ⛽️ ⛽️ ...⛽️ 

- ✅ This could make sense since every fuel level measurement that happened between refuel_datetime times might have received the preceding refuel_datetime time-stamp.

Assuming this is the case, I should be able to run a modified version of my flag function to flag "inter-fueling-rows". 🇲🇽 
- leaving us with a "feather-light" 89,976 rows


##### attempt to flag inter-fueling-rows

    🦂 
    (Is it possible that I dropped a noisy row that also happened to identify the next fueling event? Perhaps, although the odds of having this problem are only 411/7.9 million. Regardless, we should be fine, since the following row will have the appropriate timestamp and odometer reading.)
    So, we're good to go 
    😁 🚀 

In [None]:
#reset the index now that we have dropped the noisy odometer rows
events_levels_df.reset_indext_index(inplace = True, drop = True)
events_levels_df