In [1]:
import requests
import io
import os
from queue import Queue
import dask.dataframe as dd
import pandas as pd
import numpy as np
from dask import delayed
from distributed import Client
from distributed import progress, wait

# Address of the dask scheduler this notebook talks to.
SCHEDULER_ADDRESS = 'tcp://ec2-34-200-221-251.compute-1.amazonaws.com:8786'

# Connect, then restart so the run begins from a clean cluster state.
client = Client(SCHEDULER_ADDRESS)
client.restart()

# Bare expression: the notebook renders the client summary as the cell output.
client

0,1
Client  Scheduler: tcp://ec2-34-200-221-251.compute-1.amazonaws.com:8786  Dashboard: http://ec2-34-200-221-251.compute-1.amazonaws.com:8787,Cluster  Workers: 16  Cores: 64  Memory: 269.26 GB


In [2]:
def clean_df(df) -> pd.DataFrame:
    """
    Normalise one raw taxi-trip frame to a fixed set of canonical columns.

    Header spellings differ between input files (casing, ``*_amt`` vs
    ``*_amount``, ``trip_`` / ``tpep`` prefixes, missing underscores,
    whitespace padding around the name). Every header is mapped onto its
    canonical name and the result is restricted to the canonical columns,
    in a fixed order.

    Parameters
    ----------
    df: pd.DataFrame
        Raw frame whose column labels are strings. It is NOT modified;
        the renaming is done on a new frame.

    Returns
    -------
    pd.DataFrame: new frame holding only the canonical columns, in the
        order listed in ``columns`` below.

    Notes
    -----
    The final selection uses ``.loc``; how a missing canonical column is
    handled is therefore whatever the installed pandas does for missing
    labels (recent versions raise ``KeyError``).
    """

    columns = [
        'pickup_datetime',
        'dropoff_datetime',
        'tip_amount',
        'fare_amount',
        'total_amount',
        'vendor_id',
        'passenger_count',
        'trip_distance',
        'payment_type',
        'tolls_amount',
    ]

    # Older header spellings -> the spelling used in `columns`.
    legacy_names = {
        'vendor_name': 'vendor_id',
        'total_amt': 'total_amount',
        'tolls_amt': 'tolls_amount',
        'fare_amt': 'fare_amount',
        'tip_amt': 'tip_amount',
        'trip_pickup_datetime': 'pickup_datetime',
        'trip_dropoff_datetime': 'dropoff_datetime',
    }

    # Underscore-free spelling -> canonical spelling, so that
    # 'pickupdatetime', 'pickup_datetime' and 'tpep_pickup_datetime'
    # all end up on the same name.
    canonical_by_key = {col.replace('_', ''): col for col in columns}

    def _canonical(label):
        # Strip padding first: a header such as ' pickup_datetime' would
        # otherwise never match and the column would be lost.
        name = label.strip().lower()
        name = legacy_names.get(name, name)
        key = name.replace('_', '').replace('tpep', '')
        return canonical_by_key.get(key, key)

    # rename() returns a new frame, so the caller's frame keeps its headers
    # (assigning to df.columns would rewrite them in place).
    renamed = df.rename(columns=_canonical)
    return renamed.loc[:, columns]

import s3fs
import glob
# Filesystem handle used for every listing of the trip-data bucket.
fs = s3fs.S3FileSystem()

# Yellow-cab files whose path mentions 2016.
files = [
    path
    for path in fs.ls('nyc-tlc/trip data/')
    if 'yellow' in path and '2016' in path
]

In [3]:
import sys
# Running list of per-year counts; the progress line prints their sum.
counts = []

# List the bucket once: the listing is the same for every year, so there is
# no reason to ask S3 for it again on each iteration.
yellow_files = [f for f in fs.ls('nyc-tlc/trip data/') if 'yellow' in f]

# Loop through years 2009-2017
for year in range(2009, 2018):

    # Get only files pertaining to this year
    files = [f for f in yellow_files if str(year) in f]
    if not files:
        # Without this guard the frame built for the previous year would be
        # written again under this year's name and counted a second time
        # (or, for the first year, the name would not exist at all).
        continue

    # Process files in parallel. (client is asynchronous)
    monthly = []
    for file in files:
        # NOTE(review): error_bad_lines was removed in pandas 2.0 (its
        # replacement is on_bad_lines='skip'); it is kept as-is so the cell
        # still runs on the older stack it was written for.
        df = dd.read_csv('s3://' + file,
                         dtype='object',
                         error_bad_lines=False,
                         blocksize=int(128e6))
        df = df.map_partitions(clean_df)
        monthly.append(client.persist(df))

    # Yearly dataframe merging: one concat over all of the year's files
    # instead of growing the frame with repeated append() calls.
    main_df = dd.concat(monthly)

    # Write year's df to S3
    main_df.to_csv('s3://milesg-taxi-data-east/yellow-{year}-*.csv.gz'.format(year=year), compression='gzip')

    # count() skips nulls, so this is the number of non-null passenger_count
    # values for the year.
    counts.append(main_df.passenger_count.count().compute())

    # Report the loop's own year rather than a value parsed from a filename.
    sys.stdout.write('\rYear: {} - Total {}'.format(year, sum(counts)))

    # Clear from cluster memory
    client.cancel(main_df)


Year: 2017 - Total 1204768604

In [4]:
# Restart via the client once the yearly export has finished.
# NOTE(review): presumably done to drop anything the workers still hold from
# the previous cell before shutting down — confirm this is still needed.
client.restart()

0,1
Client  Scheduler: tcp://ec2-34-200-221-251.compute-1.amazonaws.com:8786  Dashboard: http://ec2-34-200-221-251.compute-1.amazonaws.com:8787,Cluster  Workers: 16  Cores: 64  Memory: 269.26 GB


In [5]:
# Last step of the notebook: call shutdown on the client.
# NOTE(review): in distributed this is expected to stop the scheduler and
# workers as well, not only this connection — confirm before running against
# a cluster that other people are using.
client.shutdown()