In [1]:
from datetime import datetime
from io import StringIO

import boto3
import numpy as np
import pandas as pd
import vaex
from scipy.stats import entropy

# Bucket that holds the benchmark CSV files used throughout this notebook.
S3_BUCKET = 'dmm-microbench'

# SECURITY: credentials must never be embedded in a notebook. With no explicit keys,
# boto3 resolves them through its default provider chain (the AWS_ACCESS_KEY_ID /
# AWS_SECRET_ACCESS_KEY environment variables, the shared ~/.aws/credentials file,
# or an attached IAM role).
# The access key that used to be hardcoded on this line must be treated as leaked:
# deactivate and rotate it in IAM.
s3 = boto3.client('s3')

def download_s3_file(file_name, destination_file_name):
    """Download object ``file_name`` from ``S3_BUCKET`` to the local path ``destination_file_name``.

    Uses the module-level ``s3`` client; returns nothing.
    """
    transfer_args = dict(
        Bucket=S3_BUCKET,
        Key=file_name,
        Filename=destination_file_name,
    )
    s3.download_file(**transfer_args)

def get_content(file_name, expression):
    """Run the SQL ``expression`` against the CSV object ``file_name`` in ``S3_BUCKET``.

    The object is read as CSV with ``FileHeaderInfo`` set to ``"Use"`` and the result
    is requested as CSV with default options. Returns whatever
    ``s3.select_object_content`` returns.
    """
    csv_in = {'CSV': {"FileHeaderInfo": "Use"}}
    csv_out = {'CSV': {}}
    request = {
        'Bucket': S3_BUCKET,
        'Key': file_name,
        'ExpressionType': 'SQL',
        'Expression': expression,
        'InputSerialization': csv_in,
        'OutputSerialization': csv_out,
    }
    return s3.select_object_content(**request)


def convert_data_to_df(data, record_header):
    """Assemble streamed CSV record chunks into a pandas DataFrame.

    Parameters
    ----------
    data : mapping
        Object whose ``'Payload'`` entry iterates over event dicts (presumably the
        response returned by ``get_content`` -- confirm against the caller). Only
        events containing a ``'Records'`` key contribute bytes, taken from
        ``event['Records']['Payload']``; all other events are ignored.
    record_header : list of bytes
        Byte chunks placed in front of the record bytes (presumably the CSV header
        row -- confirm against the caller). The list is left unmodified.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV content. A short summary (row count and ``DataFrame.info``
        output) is printed as a side effect.

    Raises
    ------
    UnicodeDecodeError
        If the combined bytes are not valid UTF-8.
    """
    # Work on a copy so the caller's header list does not grow with every call.
    chunks = list(record_header)
    chunks.extend(
        event['Records']['Payload'] for event in data['Payload'] if 'Records' in event
    )

    # Decode once over the joined bytes: a multi-byte UTF-8 character can be split
    # across two streamed chunks, which would make per-chunk decoding fail.
    csv_content = b''.join(chunks).decode('utf-8').replace("\r", "")
    csv_pd = pd.read_csv(StringIO(csv_content))

    # DataFrame.info() writes its report and returns None, so capture the report in a
    # buffer instead of interpolating the return value.
    info_buffer = StringIO()
    csv_pd.info(memory_usage='deep', buf=info_buffer)

    print('\n##################################')
    print(f"Length of dataframe: {len(csv_pd)}")
    print(f"Memory usage of dataframe: \n {info_buffer.getvalue()}")
    print('\n##################################')

    return csv_pd


In [6]:
%%time
for i in range(1,9):
    download_s3_file(f"yellow_tripdata_2019-0{i}.csv", f"yellow_tripdata_2019-0{i}.csv")

CPU times: user 16.5 s, sys: 20.1 s, total: 36.5 s
Wall time: 34.2 s


In [4]:
%%time
import vaex
vaex_df = vaex.open("2Mn_200Cols.csv", convert=True)

CPU times: user 1min 8s, sys: 21 s, total: 1min 29s
Wall time: 1min 35s


In [11]:
%%time
import vaex
vaex_df = vaex.open("2Mn_200Cols.csv.hdf5")

CPU times: user 68 ms, sys: 0 ns, total: 68 ms
Wall time: 107 ms


In [15]:
# Preview the frame; the rendered output shows the first and last rows only.
vaex_df

#,VendorID_10,tpep_10pickup_10datetime_10,tpep_10dropoff_10datetime_10,passenger_10count_10,trip_10distance_10,RatecodeID_10,store_10and_10fwd_10flag_10,PULocationID_10,DOLocationID_10,payment_10type_10,fare_10amount_10,extra_10,mta_10tax_10,tip_10amount_10,tolls_10amount_10,improvement_10surcharge_10,total_10amount_10,VendorID_11,tpep_11pickup_11datetime_11,tpep_11dropoff_11datetime_11,passenger_11count_11,trip_11distance_11,RatecodeID_11,store_11and_11fwd_11flag_11,PULocationID_11,DOLocationID_11,payment_11type_11,fare_11amount_11,extra_11,mta_11tax_11,tip_11amount_11,tolls_11amount_11,improvement_11surcharge_11,total_11amount_11,VendorID_12,tpep_12pickup_12datetime_12,tpep_12dropoff_12datetime_12,passenger_12count_12,trip_12distance_12,RatecodeID_12,store_12and_12fwd_12flag_12,PULocationID_12,DOLocationID_12,payment_12type_12,fare_12amount_12,extra_12,mta_12tax_12,tip_12amount_12,tolls_12amount_12,improvement_12surcharge_12,total_12amount_12,VendorID_2,tpep_2pickup_2datetime_2,tpep_2dropoff_2datetime_2,passenger_2count_2,trip_2distance_2,RatecodeID_2,store_2and_2fwd_2flag_2,PULocationID_2,DOLocationID_2,payment_2type_2,fare_2amount_2,extra_2,mta_2tax_2,tip_2amount_2,tolls_2amount_2,improvement_2surcharge_2,total_2amount_2,VendorID_3,tpep_3pickup_3datetime_3,tpep_3dropoff_3datetime_3,passenger_3count_3,trip_3distance_3,RatecodeID_3,store_3and_3fwd_3flag_3,PULocationID_3,DOLocationID_3,payment_3type_3,fare_3amount_3,extra_3,mta_3tax_3,tip_3amount_3,tolls_3amount_3,improvement_3surcharge_3,total_3amount_3,VendorID_4,tpep_4pickup_4datetime_4,tpep_4dropoff_4datetime_4,passenger_4count_4,trip_4distance_4,RatecodeID_4,store_4and_4fwd_4flag_4,PULocationID_4,DOLocationID_4,payment_4type_4,fare_4amount_4,extra_4,mta_4tax_4,tip_4amount_4,tolls_4amount_4,improvement_4surcharge_4,total_4amount_4,VendorID_5,tpep_5pickup_5datetime_5,tpep_5dropoff_5datetime_5,passenger_5count_5,trip_5distance_5,RatecodeID_5,store_5and_5fwd_5flag_5,PULocationID_5,DOLocationID_5,payment_5type_5,fa
re_5amount_5,extra_5,mta_5tax_5,tip_5amount_5,tolls_5amount_5,improvement_5surcharge_5,total_5amount_5,VendorID_6,tpep_6pickup_6datetime_6,tpep_6dropoff_6datetime_6,passenger_6count_6,trip_6distance_6,RatecodeID_6,store_6and_6fwd_6flag_6,PULocationID_6,DOLocationID_6,payment_6type_6,fare_6amount_6,extra_6,mta_6tax_6,tip_6amount_6,tolls_6amount_6,improvement_6surcharge_6,total_6amount_6,VendorID_7,tpep_7pickup_7datetime_7,tpep_7dropoff_7datetime_7,passenger_7count_7,trip_7distance_7,RatecodeID_7,store_7and_7fwd_7flag_7,PULocationID_7,DOLocationID_7,payment_7type_7,fare_7amount_7,extra_7,mta_7tax_7,tip_7amount_7,tolls_7amount_7,improvement_7surcharge_7,total_7amount_7,VendorID_8,tpep_8pickup_8datetime_8,tpep_8dropoff_8datetime_8,passenger_8count_8,trip_8distance_8,RatecodeID_8,store_8and_8fwd_8flag_8,PULocationID_8,DOLocationID_8,payment_8type_8,fare_8amount_8,extra_8,mta_8tax_8,tip_8amount_8,tolls_8amount_8,improvement_8surcharge_8,total_8amount_8,VendorID_9,tpep_9pickup_9datetime_9,tpep_9dropoff_9datetime_9,passenger_9count_9,trip_9distance_9,RatecodeID_9,store_9and_9fwd_9flag_9,PULocationID_9,DOLocationID_9,payment_9type_9,fare_9amount_9,extra_9,mta_9tax_9,tip_9amount_9,tolls_9amount_9,improvement_9surcharge_9,total_9amount_9,VendorID_1,tpep_1pickup_1datetime_1,tpep_1dropoff_1datetime_1,passenger_1count_1,trip_1distance_1,RatecodeID_1,store_1and_1fwd_1flag_1,PULocationID_1,DOLocationID_1,payment_1type_1,fare_1amount_1,extra_1,mta_1tax_1,tip_1amount_1,tolls_1amount_1,improvement_1surcharge_1,total_1amount_1
0,2,12/13/2018 10:18:24 PM,12/13/2018 10:39:31 PM,1,2.41,1,N,48,107,1,14.5,0.5,0.5,3.0,0.0,0.3,18.8,2,12/13/2018 10:18:24 PM,12/13/2018 10:39:31 PM,1,2.41,1,N,48,107,1,14.5,0.5,0.5,3.0,0.0,0.3,18.8,2,12/13/2018 10:18:24 PM,12/13/2018 10:39:31 PM,1,2.41,1,N,48,107,1,14.5,0.5,0.5,3.0,0.0,0.3,18.8,2,12/13/2018 10:18:24 PM,12/13/2018 10:39:31 PM,1,2.41,1,N,48,107,1,14.5,0.5,0.5,3.0,0.0,0.3,18.8,2,12/13/2018 10:18:24 PM,12/13/2018 10:39:31 PM,1,2.41,1,N,48,107,1,14.5,0.5,0.5,3.0,0.0,0.3,18.8,2,12/13/2018 10:18:24 PM,12/13/2018 10:39:31 PM,1,2.41,1,N,48,107,1,14.5,0.5,0.5,3.0,0.0,0.3,18.8,2,12/13/2018 10:18:24 PM,12/13/2018 10:39:31 PM,1,2.41,1,N,48,107,1,14.5,0.5,0.5,3.0,0.0,0.3,18.8,2,12/13/2018 10:18:24 PM,12/13/2018 10:39:31 PM,1,2.41,1,N,48,107,1,14.5,0.5,0.5,3.0,0.0,0.3,18.8,2,12/13/2018 10:18:24 PM,12/13/2018 10:39:31 PM,1,2.41,1,N,48,107,1,14.5,0.5,0.5,3.0,0.0,0.3,18.8,2,12/13/2018 10:18:24 PM,12/13/2018 10:39:31 PM,1,2.41,1,N,48,107,1,14.5,0.5,0.5,3.0,0.0,0.3,18.8,2,12/13/2018 10:18:24 PM,12/13/2018 10:39:31 PM,1,2.41,1,N,48,107,1,14.5,0.5,0.5,3.0,0.0,0.3,18.8,2,12/13/2018 10:18:24 PM,12/13/2018 10:39:31 PM,1,2.41,1,N,48,107,1,14.5,0.5,0.5,3.0,0.0,0.3,18.8
1,2,12/13/2018 10:41:23 PM,12/13/2018 10:47:10 PM,1,0.73,1,N,107,79,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,2,12/13/2018 10:41:23 PM,12/13/2018 10:47:10 PM,1,0.73,1,N,107,79,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,2,12/13/2018 10:41:23 PM,12/13/2018 10:47:10 PM,1,0.73,1,N,107,79,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,2,12/13/2018 10:41:23 PM,12/13/2018 10:47:10 PM,1,0.73,1,N,107,79,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,2,12/13/2018 10:41:23 PM,12/13/2018 10:47:10 PM,1,0.73,1,N,107,79,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,2,12/13/2018 10:41:23 PM,12/13/2018 10:47:10 PM,1,0.73,1,N,107,79,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,2,12/13/2018 10:41:23 PM,12/13/2018 10:47:10 PM,1,0.73,1,N,107,79,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,2,12/13/2018 10:41:23 PM,12/13/2018 10:47:10 PM,1,0.73,1,N,107,79,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,2,12/13/2018 10:41:23 PM,12/13/2018 10:47:10 PM,1,0.73,1,N,107,79,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,2,12/13/2018 10:41:23 PM,12/13/2018 10:47:10 PM,1,0.73,1,N,107,79,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,2,12/13/2018 10:41:23 PM,12/13/2018 10:47:10 PM,1,0.73,1,N,107,79,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16,2,12/13/2018 10:41:23 PM,12/13/2018 10:47:10 PM,1,0.73,1,N,107,79,1,5.5,0.5,0.5,1.36,0.0,0.3,8.16
2,2,12/13/2018 10:48:52 PM,12/13/2018 11:20:13 PM,1,6.54,1,N,79,188,1,24.5,0.5,0.5,4.0,0.0,0.3,29.8,2,12/13/2018 10:48:52 PM,12/13/2018 11:20:13 PM,1,6.54,1,N,79,188,1,24.5,0.5,0.5,4.0,0.0,0.3,29.8,2,12/13/2018 10:48:52 PM,12/13/2018 11:20:13 PM,1,6.54,1,N,79,188,1,24.5,0.5,0.5,4.0,0.0,0.3,29.8,2,12/13/2018 10:48:52 PM,12/13/2018 11:20:13 PM,1,6.54,1,N,79,188,1,24.5,0.5,0.5,4.0,0.0,0.3,29.8,2,12/13/2018 10:48:52 PM,12/13/2018 11:20:13 PM,1,6.54,1,N,79,188,1,24.5,0.5,0.5,4.0,0.0,0.3,29.8,2,12/13/2018 10:48:52 PM,12/13/2018 11:20:13 PM,1,6.54,1,N,79,188,1,24.5,0.5,0.5,4.0,0.0,0.3,29.8,2,12/13/2018 10:48:52 PM,12/13/2018 11:20:13 PM,1,6.54,1,N,79,188,1,24.5,0.5,0.5,4.0,0.0,0.3,29.8,2,12/13/2018 10:48:52 PM,12/13/2018 11:20:13 PM,1,6.54,1,N,79,188,1,24.5,0.5,0.5,4.0,0.0,0.3,29.8,2,12/13/2018 10:48:52 PM,12/13/2018 11:20:13 PM,1,6.54,1,N,79,188,1,24.5,0.5,0.5,4.0,0.0,0.3,29.8,2,12/13/2018 10:48:52 PM,12/13/2018 11:20:13 PM,1,6.54,1,N,79,188,1,24.5,0.5,0.5,4.0,0.0,0.3,29.8,2,12/13/2018 10:48:52 PM,12/13/2018 11:20:13 PM,1,6.54,1,N,79,188,1,24.5,0.5,0.5,4.0,0.0,0.3,29.8,2,12/13/2018 10:48:52 PM,12/13/2018 11:20:13 PM,1,6.54,1,N,79,188,1,24.5,0.5,0.5,4.0,0.0,0.3,29.8
3,2,12/13/2018 10:09:31 PM,12/13/2018 10:14:42 PM,1,1.0,1,N,238,142,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,2,12/13/2018 10:09:31 PM,12/13/2018 10:14:42 PM,1,1.0,1,N,238,142,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,2,12/13/2018 10:09:31 PM,12/13/2018 10:14:42 PM,1,1.0,1,N,238,142,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,2,12/13/2018 10:09:31 PM,12/13/2018 10:14:42 PM,1,1.0,1,N,238,142,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,2,12/13/2018 10:09:31 PM,12/13/2018 10:14:42 PM,1,1.0,1,N,238,142,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,2,12/13/2018 10:09:31 PM,12/13/2018 10:14:42 PM,1,1.0,1,N,238,142,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,2,12/13/2018 10:09:31 PM,12/13/2018 10:14:42 PM,1,1.0,1,N,238,142,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,2,12/13/2018 10:09:31 PM,12/13/2018 10:14:42 PM,1,1.0,1,N,238,142,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,2,12/13/2018 10:09:31 PM,12/13/2018 10:14:42 PM,1,1.0,1,N,238,142,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,2,12/13/2018 10:09:31 PM,12/13/2018 10:14:42 PM,1,1.0,1,N,238,142,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,2,12/13/2018 10:09:31 PM,12/13/2018 10:14:42 PM,1,1.0,1,N,238,142,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3,2,12/13/2018 10:09:31 PM,12/13/2018 10:14:42 PM,1,1.0,1,N,238,142,2,6.0,0.5,0.5,0.0,0.0,0.3,7.3
4,2,12/13/2018 10:32:06 PM,12/13/2018 10:51:36 PM,1,2.66,1,N,143,238,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76,2,12/13/2018 10:32:06 PM,12/13/2018 10:51:36 PM,1,2.66,1,N,143,238,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76,2,12/13/2018 10:32:06 PM,12/13/2018 10:51:36 PM,1,2.66,1,N,143,238,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76,2,12/13/2018 10:32:06 PM,12/13/2018 10:51:36 PM,1,2.66,1,N,143,238,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76,2,12/13/2018 10:32:06 PM,12/13/2018 10:51:36 PM,1,2.66,1,N,143,238,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76,2,12/13/2018 10:32:06 PM,12/13/2018 10:51:36 PM,1,2.66,1,N,143,238,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76,2,12/13/2018 10:32:06 PM,12/13/2018 10:51:36 PM,1,2.66,1,N,143,238,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76,2,12/13/2018 10:32:06 PM,12/13/2018 10:51:36 PM,1,2.66,1,N,143,238,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76,2,12/13/2018 10:32:06 PM,12/13/2018 10:51:36 PM,1,2.66,1,N,143,238,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76,2,12/13/2018 10:32:06 PM,12/13/2018 10:51:36 PM,1,2.66,1,N,143,238,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76,2,12/13/2018 10:32:06 PM,12/13/2018 10:51:36 PM,1,2.66,1,N,143,238,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76,2,12/13/2018 10:32:06 PM,12/13/2018 10:51:36 PM,1,2.66,1,N,143,238,1,13.5,0.5,0.5,2.96,0.0,0.3,17.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1999995,2,12/20/2018 05:34:07 PM,12/20/2018 05:57:01 PM,1,1.91,1,N,158,224,1,14.0,1.0,0.5,3.16,0.0,0.3,18.96,2,12/20/2018 05:34:07 PM,12/20/2018 05:57:01 PM,1,1.91,1,N,158,224,1,14.0,1.0,0.5,3.16,0.0,0.3,18.96,2,12/20/2018 05:34:07 PM,12/20/2018 05:57:01 PM,1,1.91,1,N,158,224,1,14.0,1.0,0.5,3.16,0.0,0.3,18.96,2,12/20/2018 05:34:07 PM,12/20/2018 05:57:01 PM,1,1.91,1,N,158,224,1,14.0,1.0,0.5,3.16,0.0,0.3,18.96,2,12/20/2018 05:34:07 PM,12/20/2018 05:57:01 PM,1,1.91,1,N,158,224,1,14.0,1.0,0.5,3.16,0.0,0.3,18.96,2,12/20/2018 05:34:07 PM,12/20/2018 05:57:01 PM,1,1.91,1,N,158,224,1,14.0,1.0,0.5,3.16,0.0,0.3,18.96,2,12/20/2018 05:34:07 PM,12/20/2018 05:57:01 PM,1,1.91,1,N,158,224,1,14.0,1.0,0.5,3.16,0.0,0.3,18.96,2,12/20/2018 05:34:07 PM,12/20/2018 05:57:01 PM,1,1.91,1,N,158,224,1,14.0,1.0,0.5,3.16,0.0,0.3,18.96,2,12/20/2018 05:34:07 PM,12/20/2018 05:57:01 PM,1,1.91,1,N,158,224,1,14.0,1.0,0.5,3.16,0.0,0.3,18.96,2,12/20/2018 05:34:07 PM,12/20/2018 05:57:01 PM,1,1.91,1,N,158,224,1,14.0,1.0,0.5,3.16,0.0,0.3,18.96,2,12/20/2018 05:34:07 PM,12/20/2018 05:57:01 PM,1,1.91,1,N,158,224,1,14.0,1.0,0.5,3.16,0.0,0.3,18.96,2,12/20/2018 05:34:07 PM,12/20/2018 05:57:01 PM,1,1.91,1,N,158,224,1,14.0,1.0,0.5,3.16,0.0,0.3,18.96
1999996,2,12/20/2018 05:47:18 PM,12/20/2018 06:02:49 PM,1,0.84,1,N,161,161,2,10.0,1.0,0.5,0.0,0.0,0.3,11.8,2,12/20/2018 05:47:18 PM,12/20/2018 06:02:49 PM,1,0.84,1,N,161,161,2,10.0,1.0,0.5,0.0,0.0,0.3,11.8,2,12/20/2018 05:47:18 PM,12/20/2018 06:02:49 PM,1,0.84,1,N,161,161,2,10.0,1.0,0.5,0.0,0.0,0.3,11.8,2,12/20/2018 05:47:18 PM,12/20/2018 06:02:49 PM,1,0.84,1,N,161,161,2,10.0,1.0,0.5,0.0,0.0,0.3,11.8,2,12/20/2018 05:47:18 PM,12/20/2018 06:02:49 PM,1,0.84,1,N,161,161,2,10.0,1.0,0.5,0.0,0.0,0.3,11.8,2,12/20/2018 05:47:18 PM,12/20/2018 06:02:49 PM,1,0.84,1,N,161,161,2,10.0,1.0,0.5,0.0,0.0,0.3,11.8,2,12/20/2018 05:47:18 PM,12/20/2018 06:02:49 PM,1,0.84,1,N,161,161,2,10.0,1.0,0.5,0.0,0.0,0.3,11.8,2,12/20/2018 05:47:18 PM,12/20/2018 06:02:49 PM,1,0.84,1,N,161,161,2,10.0,1.0,0.5,0.0,0.0,0.3,11.8,2,12/20/2018 05:47:18 PM,12/20/2018 06:02:49 PM,1,0.84,1,N,161,161,2,10.0,1.0,0.5,0.0,0.0,0.3,11.8,2,12/20/2018 05:47:18 PM,12/20/2018 06:02:49 PM,1,0.84,1,N,161,161,2,10.0,1.0,0.5,0.0,0.0,0.3,11.8,2,12/20/2018 05:47:18 PM,12/20/2018 06:02:49 PM,1,0.84,1,N,161,161,2,10.0,1.0,0.5,0.0,0.0,0.3,11.8,2,12/20/2018 05:47:18 PM,12/20/2018 06:02:49 PM,1,0.84,1,N,161,161,2,10.0,1.0,0.5,0.0,0.0,0.3,11.8
1999997,2,12/20/2018 05:57:21 PM,12/20/2018 06:15:46 PM,2,3.87,1,N,13,246,2,16.0,1.0,0.5,0.0,0.0,0.3,17.8,2,12/20/2018 05:57:21 PM,12/20/2018 06:15:46 PM,2,3.87,1,N,13,246,2,16.0,1.0,0.5,0.0,0.0,0.3,17.8,2,12/20/2018 05:57:21 PM,12/20/2018 06:15:46 PM,2,3.87,1,N,13,246,2,16.0,1.0,0.5,0.0,0.0,0.3,17.8,2,12/20/2018 05:57:21 PM,12/20/2018 06:15:46 PM,2,3.87,1,N,13,246,2,16.0,1.0,0.5,0.0,0.0,0.3,17.8,2,12/20/2018 05:57:21 PM,12/20/2018 06:15:46 PM,2,3.87,1,N,13,246,2,16.0,1.0,0.5,0.0,0.0,0.3,17.8,2,12/20/2018 05:57:21 PM,12/20/2018 06:15:46 PM,2,3.87,1,N,13,246,2,16.0,1.0,0.5,0.0,0.0,0.3,17.8,2,12/20/2018 05:57:21 PM,12/20/2018 06:15:46 PM,2,3.87,1,N,13,246,2,16.0,1.0,0.5,0.0,0.0,0.3,17.8,2,12/20/2018 05:57:21 PM,12/20/2018 06:15:46 PM,2,3.87,1,N,13,246,2,16.0,1.0,0.5,0.0,0.0,0.3,17.8,2,12/20/2018 05:57:21 PM,12/20/2018 06:15:46 PM,2,3.87,1,N,13,246,2,16.0,1.0,0.5,0.0,0.0,0.3,17.8,2,12/20/2018 05:57:21 PM,12/20/2018 06:15:46 PM,2,3.87,1,N,13,246,2,16.0,1.0,0.5,0.0,0.0,0.3,17.8,2,12/20/2018 05:57:21 PM,12/20/2018 06:15:46 PM,2,3.87,1,N,13,246,2,16.0,1.0,0.5,0.0,0.0,0.3,17.8,2,12/20/2018 05:57:21 PM,12/20/2018 06:15:46 PM,2,3.87,1,N,13,246,2,16.0,1.0,0.5,0.0,0.0,0.3,17.8
1999998,2,12/20/2018 05:13:11 PM,12/20/2018 05:54:21 PM,1,2.41,1,N,226,141,2,23.5,1.0,0.5,0.0,0.0,0.3,25.3,2,12/20/2018 05:13:11 PM,12/20/2018 05:54:21 PM,1,2.41,1,N,226,141,2,23.5,1.0,0.5,0.0,0.0,0.3,25.3,2,12/20/2018 05:13:11 PM,12/20/2018 05:54:21 PM,1,2.41,1,N,226,141,2,23.5,1.0,0.5,0.0,0.0,0.3,25.3,2,12/20/2018 05:13:11 PM,12/20/2018 05:54:21 PM,1,2.41,1,N,226,141,2,23.5,1.0,0.5,0.0,0.0,0.3,25.3,2,12/20/2018 05:13:11 PM,12/20/2018 05:54:21 PM,1,2.41,1,N,226,141,2,23.5,1.0,0.5,0.0,0.0,0.3,25.3,2,12/20/2018 05:13:11 PM,12/20/2018 05:54:21 PM,1,2.41,1,N,226,141,2,23.5,1.0,0.5,0.0,0.0,0.3,25.3,2,12/20/2018 05:13:11 PM,12/20/2018 05:54:21 PM,1,2.41,1,N,226,141,2,23.5,1.0,0.5,0.0,0.0,0.3,25.3,2,12/20/2018 05:13:11 PM,12/20/2018 05:54:21 PM,1,2.41,1,N,226,141,2,23.5,1.0,0.5,0.0,0.0,0.3,25.3,2,12/20/2018 05:13:11 PM,12/20/2018 05:54:21 PM,1,2.41,1,N,226,141,2,23.5,1.0,0.5,0.0,0.0,0.3,25.3,2,12/20/2018 05:13:11 PM,12/20/2018 05:54:21 PM,1,2.41,1,N,226,141,2,23.5,1.0,0.5,0.0,0.0,0.3,25.3,2,12/20/2018 05:13:11 PM,12/20/2018 05:54:21 PM,1,2.41,1,N,226,141,2,23.5,1.0,0.5,0.0,0.0,0.3,25.3,2,12/20/2018 05:13:11 PM,12/20/2018 05:54:21 PM,1,2.41,1,N,226,141,2,23.5,1.0,0.5,0.0,0.0,0.3,25.3


In [3]:
# List the column names of the opened frame.
list(vaex_df.columns.keys())

['DOLocationID_1',
 'DOLocationID_10',
 'DOLocationID_11',
 'DOLocationID_12',
 'DOLocationID_2',
 'DOLocationID_3',
 'DOLocationID_4',
 'DOLocationID_5',
 'DOLocationID_6',
 'DOLocationID_7',
 'DOLocationID_8',
 'DOLocationID_9',
 'PULocationID_1',
 'PULocationID_10',
 'PULocationID_11',
 'PULocationID_12',
 'PULocationID_2',
 'PULocationID_3',
 'PULocationID_4',
 'PULocationID_5',
 'PULocationID_6',
 'PULocationID_7',
 'PULocationID_8',
 'PULocationID_9',
 'RatecodeID_1',
 'RatecodeID_10',
 'RatecodeID_11',
 'RatecodeID_12',
 'RatecodeID_2',
 'RatecodeID_3',
 'RatecodeID_4',
 'RatecodeID_5',
 'RatecodeID_6',
 'RatecodeID_7',
 'RatecodeID_8',
 'RatecodeID_9',
 'VendorID_1',
 'VendorID_10',
 'VendorID_11',
 'VendorID_12',
 'VendorID_2',
 'VendorID_3',
 'VendorID_4',
 'VendorID_5',
 'VendorID_6',
 'VendorID_7',
 'VendorID_8',
 'VendorID_9',
 'extra_1',
 'extra_10',
 'extra_11',
 'extra_12',
 'extra_2',
 'extra_3',
 'extra_4',
 'extra_5',
 'extra_6',
 'extra_7',
 'extra_8',
 'extra_9',
 

In [7]:
# Column names without the numeric suffix carried by the columns of the wide frame.
# NOTE(review): the following cell rebinds `columns` to every column of `vaex_df`,
# so on a top-to-bottom run this list is overwritten -- confirm which dataset it
# was meant for.
columns = [
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "congestion_surcharge",
]

In [4]:
# Take every column name present in the frame, then display the resulting list.
columns = [name for name in vaex_df.columns.keys()]
columns

['DOLocationID_1',
 'DOLocationID_10',
 'DOLocationID_11',
 'DOLocationID_12',
 'DOLocationID_2',
 'DOLocationID_3',
 'DOLocationID_4',
 'DOLocationID_5',
 'DOLocationID_6',
 'DOLocationID_7',
 'DOLocationID_8',
 'DOLocationID_9',
 'PULocationID_1',
 'PULocationID_10',
 'PULocationID_11',
 'PULocationID_12',
 'PULocationID_2',
 'PULocationID_3',
 'PULocationID_4',
 'PULocationID_5',
 'PULocationID_6',
 'PULocationID_7',
 'PULocationID_8',
 'PULocationID_9',
 'RatecodeID_1',
 'RatecodeID_10',
 'RatecodeID_11',
 'RatecodeID_12',
 'RatecodeID_2',
 'RatecodeID_3',
 'RatecodeID_4',
 'RatecodeID_5',
 'RatecodeID_6',
 'RatecodeID_7',
 'RatecodeID_8',
 'RatecodeID_9',
 'VendorID_1',
 'VendorID_10',
 'VendorID_11',
 'VendorID_12',
 'VendorID_2',
 'VendorID_3',
 'VendorID_4',
 'VendorID_5',
 'VendorID_6',
 'VendorID_7',
 'VendorID_8',
 'VendorID_9',
 'extra_1',
 'extra_10',
 'extra_11',
 'extra_12',
 'extra_2',
 'extra_3',
 'extra_4',
 'extra_5',
 'extra_6',
 'extra_7',
 'extra_8',
 'extra_9',
 

In [5]:
%%time
for col in columns:
    try:
        limits = vaex_df.limits(vaex_df[col])
        bins = vaex_df.bin_edges(expression=col, limits=limits)
        histogram = vaex_df.count(expression="*", binby=vaex_df[col], edges=bins.all())
        print(histogram, bins)
    except Exception as e:
        print(e)

[     0      0   4106   9529   8317    180   1007  18764   1830   2817
    414    455    522  10544    537    671    114   5895   1881   2216
   2662  21236  26579   5206  52072  23671   2006    683    773     93
    160   3843    338   6205  45119   1147    490  40874    889  44928
   3062   2270    384  15468   9015  28603    943   2496   3413    191
  29446      0    973  43182     25   4416  46178   5360    187    453
    396    533   9708   1503   3848  17469   1526  27801  23643  89088
  83941  23340   2612  19520  17817   3279    340    622  20263  74357
 106707  40276  15925  60884    322    872    167    600   2551   6576
    136  50542   1455   2190    462   1914   1371   1539    743   1064
    363    174   5738  16790    664    322   1392    767    898   3744
  10032   5073  37837  90492  88772  89145 126519  52888    592  10202
  30438    955  33036    266    283   8964   1360   2286  37124  75581
   4563] [  1.       3.0625   5.125    7.1875   9.25    11.3125  13.375   15.

In [20]:
## Compute histograms for categorical columns

In [23]:
# Categorical column names without a numeric suffix.
# NOTE(review): the following cell rebuilds `categorical_columns` from an empty list
# using suffixed names, so on a top-to-bottom run this value is overwritten --
# confirm whether this cell is still needed.
categorical_columns = ["store_and_fwd_flag", "payment_type"]

In [17]:
# Build the suffixed categorical column names, two per suffix, for suffixes 1 to 9.
# NOTE(review): the frame preview also shows columns with suffixes 10, 11 and 12;
# they are not covered by this range -- confirm whether that is intentional.
categorical_columns = []
for i in range(1, 10):
    categorical_columns.extend(
        (f"store_{i}and_{i}fwd_{i}flag_{i}", f"payment_{i}type_{i}")
    )

In [18]:
%%time
for col in categorical_columns:
    try:
        bins = vaex_df[col].unique()
        histograms = vaex_df.groupby(vaex_df[col], agg='count')
        print(bins, histograms)
    except Exception as e:
        print(e)

['N' 'Y']   #  store_1and_1fwd_1flag_1              count
  0  N                              1.98752e+06
  1  Y                          12484
[1 2 3 4]   #    payment_1type_1             count
  0                  2  562487
  1                  1       1.42497e+06
  2                  3    9442
  3                  4    3097
['N' 'Y']   #  store_2and_2fwd_2flag_2              count
  0  N                              1.98752e+06
  1  Y                          12484
[2 1 3 4]   #    payment_2type_2             count
  0                  2  562487
  1                  1       1.42497e+06
  2                  3    9442
  3                  4    3097
['N' 'Y']   #  store_3and_3fwd_3flag_3              count
  0  N                              1.98752e+06
  1  Y                          12484
[2 1 3 4]   #    payment_3type_3             count
  0                  2  562487
  1                  1       1.42497e+06
  2                  3    9442
  3                  4    3097
['N' 'Y']   #

In [16]:
%%time
# Calls dropnan() on every column named in `columns` and times the whole loop.
# NOTE(review): the value returned by dropnan() is discarded, so nothing from this
# loop is kept -- presumably the cell exists only for the timing; confirm.
for col in columns:
    vaex_df[col].dropnan()

CPU times: user 3.22 s, sys: 12 ms, total: 3.23 s
Wall time: 3.23 s
