In [1]:
import pandas as pd 
import numpy as np 
#import gc
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns 

# Read in the merged data 2.0

In [2]:
# Names assigned to every column of the CSV, in file order. They are passed to
# read_csv via `names=`, so positions here must match the file's column positions.
# (idx1/idx2 are presumably leftover index columns from earlier exports - confirm.)
col_names = [
    'idx1',
    'idx2',
    'Id',
    'EventTimeStamp',
    'EquipmentID',
    'DriverID',
    'Latitude',
    'Longitude',
    'LocationTimeStamp',
    'Speed',
    'Odometer',
    'IgnitionStatus',
    'EFReportReason',
    'TankLevelPercent',
    'TankLevelGallons',
    'ExactFuelEventId',
    'truck',
    'tankcap',
    'refuel_datetime',
    'refuel_tr_sum',
    'refuel_tr_id',
]

# Columns that read_csv should parse as datetimes.
date_cols = [
    'EventTimeStamp',
    'LocationTimeStamp',
    'refuel_datetime',
]

In [3]:
# Column name -> dtype mapping handed to read_csv via `dtype=`.
# Narrow numeric types and `category` are used for the non-datetime columns;
# the datetime columns are handled separately through `parse_dates`.
dtype_dict = dict(
    Id='uint64',
    EquipmentID='category',
    DriverID='category',
    Latitude='float64',
    Longitude='float64',
    Speed='uint16',
    Odometer='float64',
    IgnitionStatus='category',
    EFReportReason='category',
    TankLevelPercent='float32',
    TankLevelGallons='float32',
    ExactFuelEventId='uint64',
    truck='category',
    tankcap='uint16',
    refuel_tr_sum='float32',
    refuel_tr_id='category',
)

In [4]:
# Load the merged events/levels extract. Run this only after col_names,
# date_cols and dtype_dict have been defined in the cells above.
events_levels_df = pd.read_csv(
    '/Users/paulomartinez/Downloads/events_levels_df.gz',
    # Skip the file's first row (presumably its own header) and apply col_names instead.
    skiprows=1,
    names=col_names,
    # Keep positional columns 2..18, i.e. 'Id' through 'refuel_datetime';
    # this drops idx1/idx2 and the trailing refuel_tr_sum/refuel_tr_id.
    usecols=np.arange(2, 19),
    dtype=dtype_dict,
    parse_dates=date_cols,
    infer_datetime_format=True,
)
# Observed load time: 9:29 pm - 9:30 pm

In [5]:
# Summarise the freshly loaded frame (row count, per-column dtypes, memory usage)
# to confirm that the requested dtypes and datetime parsing were applied.
events_levels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7908406 entries, 0 to 7908405
Data columns (total 17 columns):
Id                   uint64
EventTimeStamp       datetime64[ns]
EquipmentID          category
DriverID             category
Latitude             float64
Longitude            float64
LocationTimeStamp    datetime64[ns]
Speed                uint16
Odometer             float64
IgnitionStatus       category
EFReportReason       category
TankLevelPercent     float32
TankLevelGallons     float32
ExactFuelEventId     uint64
truck                category
tankcap              uint16
refuel_datetime      datetime64[ns]
dtypes: category(5), datetime64[ns](3), float32(2), float64(3), uint16(2), uint64(2)
memory usage: 633.6 MB


# Drop the noisy rows

In [6]:
# Ids of the rows flagged as noisy, listed in ascending order as plain Python ints.
# They are matched against the `Id` column of events_levels_df to build the
# mask of rows to keep. (How these ids were identified is not shown here.)
bad_row_Id = [12657126,
 12657127,
 12746275,
 12946311,
 13039741,
 13092342,
 13092343,
 13092344,
 13092345,
 13092346,
 13092347,
 13092555,
 13156996,
 13156997,
 13174091,
 13313530,
 13381098,
 13381099,
 13381100,
 13381101,
 13381102,
 13381103,
 13555164,
 13555165,
 13566460,
 13636409,
 13729945,
 13729946,
 13780017,
 13810682,
 13838489,
 13912193,
 13912251,
 13914679,
 13914680,
 13935785,
 14083809,
 14089445,
 14160586,
 14237432,
 14237433,
 14237448,
 14237449,
 14238192,
 14242725,
 14261769,
 14261770,
 14261771,
 14328059,
 14328060,
 14328061,
 14328062,
 14351176,
 14351177,
 14359863,
 14359864,
 14417642,
 14417643,
 14417644,
 14417645,
 14447049,
 14544626,
 14544627,
 14557081,
 14557505,
 14564960,
 14564961,
 14585667,
 14668575,
 14730292,
 14730293,
 14730294,
 14730295,
 14781469,
 14781470,
 14871213,
 14871214,
 14871215,
 14871237,
 14898651,
 14898652,
 14898653,
 14898654,
 14898655,
 14898656,
 14898657,
 14898658,
 14898659,
 14898660,
 14898661,
 14898662,
 14898663,
 14898664,
 14898665,
 14898666,
 14898667,
 14898668,
 14898669,
 14898670,
 14898671,
 14898672,
 14898673,
 14898674,
 14898675,
 14898676,
 14940092,
 14988050,
 14988052,
 14988053,
 14988054,
 15213067,
 15272473,
 15272474,
 15272475,
 15272476,
 15272477,
 15338152,
 15338153,
 15338154,
 15338155,
 15338156,
 15409791,
 15409792,
 15409793,
 15409794,
 15434374,
 15434375,
 15459392,
 15513384,
 15513385,
 15513386,
 15529682,
 15530029,
 15530030,
 15552006,
 15670484,
 15670485,
 15670486,
 15686110,
 15686111,
 15686139,
 15686140,
 15686141,
 15686142,
 15686143,
 15686144,
 15686145,
 15970716,
 15970719,
 15970720,
 16121249,
 16206897,
 16285004,
 16297322,
 16297323,
 16297324,
 16297325,
 16297326,
 16297327,
 16297328,
 16297329,
 16297330,
 16297331,
 16297332,
 16349951,
 16414146,
 16414147,
 16521380,
 16521381,
 16521382,
 16773389,
 16946873,
 16946874,
 16946875,
 17029642,
 17044451,
 17048531,
 17048532,
 17048533,
 17052293,
 17079226,
 17091437,
 17091438,
 17135503,
 17141248,
 17141249,
 17239056,
 17306363,
 17306364,
 17306365,
 17306366,
 17353645,
 17353647,
 17523789,
 17558391,
 17600517,
 17600518,
 17600519,
 17600520,
 17600521,
 17625382,
 17848054,
 18034883,
 18077107,
 18077136,
 18219769,
 18219770,
 18286629,
 18287766,
 18287797,
 18372956,
 18397269,
 18442627,
 18442628,
 18442629,
 18442630,
 18479608,
 18571739,
 18571740,
 18600441,
 18600444,
 18600445,
 18604899,
 18769793,
 18769794,
 18769795,
 18769796,
 18778132,
 18778133,
 18878427,
 18943054,
 18943055,
 18943056,
 18986682,
 19064535,
 19129470,
 19129471,
 19151185,
 19151187,
 19431189,
 19431190,
 19473744,
 19473747,
 19473748,
 19473749,
 19473750,
 19516211,
 19585856,
 19647247,
 19647248,
 19647249,
 19647307,
 19647308,
 19647309,
 19659265,
 19667110,
 19682036,
 19682037,
 19771347,
 19805675,
 19865184,
 19865185,
 19865186,
 19865187,
 19865188,
 19865189,
 19865190,
 19865191,
 19865192,
 19865193,
 19865194,
 19865195,
 19865196,
 19865197,
 19865198,
 19865199,
 19866576,
 19866577,
 19867559,
 19867560,
 19867561,
 19867562,
 19867563,
 19867564,
 19867891,
 19867892,
 19867893,
 19867894,
 19867895,
 19867896,
 19867897,
 19867898,
 19867899,
 19867900,
 19867901,
 19868038,
 19869152,
 19869153,
 19869463,
 19869464,
 19869465,
 19869466,
 19869467,
 19869468,
 19869469,
 19869587,
 19869588,
 19869589,
 19869590,
 19870675,
 19870676,
 19870677,
 19871176,
 19871177,
 19872846,
 19872847,
 19872848,
 19872849,
 19872850,
 19882277,
 19882278,
 19882279,
 19882280,
 19882281,
 19917072,
 19917073,
 19917074,
 19917075,
 19917076,
 19950534,
 19950535,
 19953297,
 19953300,
 19965259,
 19970595,
 20002163,
 20008440,
 20008443,
 20052156,
 20052455,
 20052456,
 20131852,
 20162872,
 20169350,
 20169351,
 20207146,
 20246645,
 20248564,
 20270484,
 20270487,
 20270488,
 20383219,
 20432075,
 20432104,
 20475553,
 20482760,
 20502251,
 20526997,
 20526998,
 20526999,
 20527000,
 20527090,
 20527091,
 20527092,
 20527093,
 20534852,
 20534853,
 20586712,
 20587078,
 20587079,
 20667306,
 20667307,
 20667932,
 20735846,
 20752293,
 20794736,
 20794737,
 20794738,
 20794739,
 20794740,
 20795223,
 20795224,
 20795225,
 20795226,
 20795227,
 20795545,
 20795546,
 20795547,
 20795550,
 20795551,
 20795553,
 20796460,
 20796461,
 20796462,
 20802387,
 20802388,
 20826624,
 20833078,
 20833079,
 20895567,
 20895568,
 20895569,
 20895570,
 20895571,
 20896364,
 20974377,
 20975272,
 20975273,
 20975274,
 20994196,
 21093100]

In [7]:
# Sanity check: show the Python type of the first listed id (only the first element is inspected).
type(bad_row_Id[0])

int

In [None]:
# Boolean mask over events_levels_df: True for every row to KEEP, i.e. every
# row whose Id is NOT in bad_row_Id.
#
# Series.isin builds the whole mask in one vectorised pass. It replaces the
# earlier pure-Python version,
#     [i not in bad_row_Id for i in events_levels_df.Id]
# which re-scanned the bad-id list once per row. Timing notes from those runs:
#     100,000 rows   -> about 2 minutes  (12:15 am - 12:17 am)
#     1,000,000 rows -> about 11 minutes (12:17 am - 12:28 am)
#     all rows       -> started 11:02 pm, still running at 12:13 am
# so the full frame was projected to need roughly 80 minutes or more.
bs = ~events_levels_df['Id'].isin(bad_row_Id)

# Show how many rows are kept (True) vs. dropped (False) rather than echoing
# the entire multi-million-element mask into the cell output.
bs.value_counts()

In [None]:
# Drop the noisy rows: keep only the rows flagged True in the mask `bs`, and
# report how long the filter took.
# NOTE: this rebinds events_levels_df to the filtered frame, so `bs` must be
# rebuilt (re-run the mask cell) before this cell is run a second time.
t0 = datetime.now()  # start time; it was never assigned before, so `tf-t0` raised NameError
print('started at ', t0)
events_levels_df = events_levels_df[bs]
tf = datetime.now()
print('finished at ', tf)
print('whole process took ', tf-t0)
# Re-inspect the frame to confirm the reduced row count.
events_levels_df.info()