In [26]:
%config Completer.use_jedi = False

import io
import requests
import pandas as pd
import dask.dataframe as dd

int(128e6)

128000000

### NYC Taxi Data - [Data Page](http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml)

---

Yellow Taxi File

In [29]:
def clean_df(df) -> pd.DataFrame:
    """
    Clean the dataframe to have self.columns

    Parameters
    ----------
    df: dataframe coming in from load_from_url() method

    Returns
    -------
    pd.DataFrame: cleaned to only have self.columns
    """

    columns = [
        'pickup_datetime',
        'dropoff_datetime',
        'tip_amount',
        'fare_amount',
        'total_amount',
        'vendor_id',
        'passenger_count',
        'trip_distance',
        'payment_type',
        'tolls_amount',
    ]
    
    df.columns = [col.lower() for col in df.columns]
    df = df.rename(columns={'vendor_name': 'vendor_id',
                            'total_amt': 'total_amount',
                            'tolls_amt': 'tolls_amount',
                            'fare_amt': 'fare_amount',
                            'tip_amt': 'tip_amount',
                            'trip_pickup_datetime': 'pickup_datetime',
                            'trip_dropoff_datetime': 'dropoff_datetime'
                           })
    df.columns = map(lambda col: 
                     col.replace('_', '').replace('tpep', ''),
                     df.columns
                    )
    df = df.rename(columns={col.replace('_', ''): col for col in columns})
    return df.loc[:, columns]

In [43]:
def read_chunk(url: str, n_kb: int=int(1e5)) -> pd.DataFrame:
    """
    Read a chunk of data from url in kb
    """
    tries = 0
    while True:
        try:
            with requests.get(url, stream=True) as response:
                chunk = next(response.iter_content(chunk_size=n_kb))
            break
        except Exception as exc:
            tries += 1
            if tries > 5: 
                raise exc
    return pd.read_csv(io.BytesIO(chunk))

url = 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2013-11.csv'
df1 = read_chunk(url)
print(df1.shape)
df = clean_df(df1)
df1.head()

(582, 18)


Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,CMT,2013-11-25 15:53:33,2013-11-25 16:00:51,1,0.6,-73.978104,40.752966,1.0,N,-73.985756,40.762685,CRD,6.0,1.0,0.5,1.0,0.0,8.5
1,CMT,2013-11-25 15:24:41,2013-11-25 15:30:18,1,0.5,-73.982313,40.764827,1.0,N,-73.982129,40.758889,CRD,5.5,0.0,0.5,3.0,0.0,9.0
2,CMT,2013-11-25 09:43:42,2013-11-25 10:02:57,1,3.3,-73.982013,40.762507,1.0,N,-74.006854,40.719582,CRD,15.0,0.0,0.5,2.0,0.0,17.5
3,CMT,2013-11-25 06:49:58,2013-11-25 07:04:22,1,3.8,-73.976005,40.744481,1.0,N,-74.016063,40.717298,CRD,14.0,0.0,0.5,2.9,0.0,17.4
4,CMT,2013-11-25 10:02:12,2013-11-25 10:17:15,1,2.2,-73.952625,40.780962,1.0,N,-73.98163,40.777978,CRD,12.0,0.0,0.5,2.0,0.0,14.5


In [5]:
url = 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-06.csv'
df = read_chunk(url)
print(df.shape)
df.head()

(5, 19)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2016-06-09 21:06:36,2016-06-09 21:13:08,2,0.79,-73.98336,40.760937,1,N,-73.977463,40.753979,2.0,6.0,0.5,0.5,0.0,0.0,0.3,7.3
1,2,2016-06-09 21:06:36,2016-06-09 21:35:11,1,5.22,-73.98172,40.736668,1,N,-73.981636,40.670242,1.0,22.0,0.5,0.5,4.0,0.0,0.3,27.3
2,2,2016-06-09 21:06:36,2016-06-09 21:13:10,1,1.26,-73.994316,40.751072,1,N,-74.004234,40.742168,1.0,6.5,0.5,0.5,1.56,0.0,0.3,9.36
3,2,2016-06-09 21:06:36,2016-06-09 21:36:10,1,7.39,-73.982361,40.773891,1,N,-73.929466,40.85154,1.0,26.0,0.5,0.5,1.0,0.0,0.3,28.3
4,2,2016-06-09 21:06:36,2016-06-09 21:23:23,1,3.1,-73.987106,40.733173,1,,,,,,,,,,,


In [27]:
url = 'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2012-06.csv'
df = read_chunk(url)
print(df.shape)
df.rename(columns={'vendor_id': 'VendorID'}).head()

(4, 18)


Unnamed: 0,VendorID,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,rate_code,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,surcharge,mta_tax,tip_amount,tolls_amount,total_amount
0,VTS,2012-06-12 09:44:00,2012-06-12 09:45:00,1,0.17,-73.977357,40.761192,1,,-73.978863,40.762155,CSH,2.9,0,0.5,0.0,0.0,3.4
1,CMT,2012-06-12 11:50:25,2012-06-12 12:18:53,1,2.8,-73.978159,40.751348,1,N,-73.945857,40.775246,CSH,15.3,0,0.5,0.0,0.0,15.8
2,CMT,2012-06-12 11:29:12,2012-06-12 11:46:59,1,3.7,-73.993988,40.761407,1,N,-74.007421,40.72652,CRD,12.5,0,0.5,2.6,0.0,15.6
3,VTS,2012-06-12 11:29:00,2012-06-12 12:03:00,1,10.0,-73.87341,40.774012,1,,-74.002347,40.740632,CRD,27.7,0,0.5,6.92,4.799999,


In [7]:
from distributed import Client
from distributed import progress
from dask import delayed

client = Client()
client

0,1
Client  Scheduler: tcp://127.0.0.1:40061  Dashboard: http://127.0.0.1:8787,Cluster  Workers: 8  Cores: 8  Memory: 10.03 GB


In [32]:
def clean_df(df):
    df.columns = [col.lower().replace('_', '') for col in df.columns]
    return df

def write_df(df,fn):
    import random
    df.to_csv('/tmp/{}.csv'.format(fn), index=False)
    return True

urls = [
    'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-01.csv',
    'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-02.csv',
    'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-03.csv',
    'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-04.csv',
    'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-05.csv',
    'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-06.csv',
    'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-07.csv',
    'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-08.csv',
    'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-09.csv',
    'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-10.csv',
    'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-11.csv',
    'https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-12.csv',
]

raw_dfs = [delayed(read_chunk)(url, int(1e5)) for url in urls]
cleaned_dfs = [delayed(clean_df)(df) for df in raw_dfs]
writes = [delayed(write_df)(df, i) for i, df in enumerate(cleaned_dfs)]
writes

[Delayed('write_df-2d79f2d6-0081-40ef-9380-2c1eef4a635a'),
 Delayed('write_df-dd6b7865-0508-4ad4-a846-e72b09582ee8'),
 Delayed('write_df-fdf1269d-c81d-4f16-8010-119e2d37be3f'),
 Delayed('write_df-5bfe309b-3671-4e57-8114-a21c9ed1af56'),
 Delayed('write_df-554655dc-dcd3-4cd4-a5e9-b1f54ec542b3'),
 Delayed('write_df-dd414fbb-cdb7-43e1-80d4-fa7bf5533af2'),
 Delayed('write_df-b41f55d3-4d49-4da0-8de5-7b9413f9ca0d'),
 Delayed('write_df-b50fd4de-a5f1-4fc7-8170-d765912e1554'),
 Delayed('write_df-714a0d3e-c5ed-4ead-bb71-16876f730e18'),
 Delayed('write_df-ffd3bfb0-bd14-4387-8b4b-fd387f52b0f2'),
 Delayed('write_df-fe518d97-ab45-4d1e-82a6-a906b72e07db'),
 Delayed('write_df-62e5154b-9f1d-4e9d-84b1-b1afbd486450')]

In [33]:
writes = client.compute(writes)
progress(writes)

In [45]:
import dask.dataframe as dd

df = dd.read_csv('/tmp/*.csv', dtype={'tripdistance': 'object', 
                                      'vendorid': 'object',
                                      'passengercount': 'float64'})
df.head(5)

Unnamed: 0,vendorid,tpeppickupdatetime,tpepdropoffdatetime,passengercount,tripdistance,pickuplongitude,pickuplatitude,ratecodeid,storeandfwdflag,dropofflongitude,dropofflatitude,paymenttype,fareamount,extra,mtatax,tipamount,tollsamount,improvementsurcharge,totalamount
0,2,2016-01-01 00:00:00,2016-01-01 00:00:00,2.0,1.1,-73.990372,40.734695,1.0,N,-73.981842,40.732407,2.0,7.5,0.5,0.5,0.0,0.0,0.3,8.8
1,2,2016-01-01 00:00:00,2016-01-01 00:00:00,5.0,4.9,-73.980782,40.729912,1.0,N,-73.944473,40.716679,1.0,18.0,0.5,0.5,0.0,0.0,0.3,19.3
2,2,2016-01-01 00:00:00,2016-01-01 00:00:00,1.0,10.54,-73.98455,40.679565,1.0,N,-73.950272,40.788925,1.0,33.0,0.5,0.5,0.0,0.0,0.3,34.3
3,2,2016-01-01 00:00:00,2016-01-01 00:00:00,1.0,4.75,-73.993469,40.71899,1.0,N,-73.962242,40.657333,2.0,16.5,0.0,0.5,0.0,0.0,0.3,17.3
4,2,2016-01-01 00:00:00,2016-01-01 00:00:00,3.0,1.76,-73.960625,40.78133,1.0,N,-73.977264,40.758514,2.0,8.0,0.0,0.5,0.0,0.0,0.3,8.8


In [46]:
df.tpepdropoffdatetime.astype('object').max().compute()

ValueError: Length mismatch: Expected axis has 17 elements, new values have 19 elements