In [1]:
# Plotting setup
import matplotlib.pyplot as plt

# Package imports
import numpy as np
import pandas as pd

In [2]:
from urllib.error import HTTPError

In [17]:
def load_taxi_data(fleets=['yellow'], years=[2021], months=[1]):
    '''
    Load TLC Trip Record Data for New York taxis
    --------------------------------------------
    INPUTS:
    
    fleets (string[]): 'yellow', 'green', 'fhv'
                           List of taxi companies whose data will be retrieved.
                           'fhv' = Uber, Lyft, etc.
    years (int[])    : 2009...2021
                           Array containing all years to be retrieved.
    months (int[])   : 1 ... 12
                           Array containing the months to be retrieved coded as integers.
    
    OUTPUT:
    
    df_trips: pandas dataframe with columns features_common (see list below).
    '''
    url_prefix = 'https://nyc-tlc.s3.amazonaws.com/trip+data/'
    
    # A small subset of features is selected when FHV data is requested.
    if 'fhv' in fleets:
        features_common = ['pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID', 'fleet']
        # Empty dataframe
        df_trips = pd.DataFrame(columns=features_common)

        for fleet in fleets:
            for year in years:
                for month in months:
                    # Create file name
                    if month < 10:
                        url_data = fleet + '_tripdata_' + str(year) + '-0' + str(month) + '.csv' # Caution: Leading 0
                    else:
                        url_data = fleet + '_tripdata_' + str(year) + '-' + str(month) + '.csv'
                    # Try to download file
                    try:
                        print('Will download... ' + url_data)
                        df = pd.read_csv(url_prefix + url_data, low_memory=False)
                    except HTTPError:
                        print('ERROR: There is no data available for fleet={}, years={}, months={}!'.format(fleet, years, months))
                        continue
                    # Set Yellow taxi dataframe columns
                    if fleet == 'yellow':
                        # Features that all Yellow fleet data sets have in common
                        features = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'PULocationID', 'DOLocationID']
                    # Set Green taxi dataframe columns
                    if fleet == 'green':
                        # Features that all Green fleet data sets have in common
                        features = ['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'PULocationID', 'DOLocationID']
                    # Set FHV dataframe columns
                    if fleet == 'fhv':
                        # Features that all FHV data sets have in common
                        features = ['pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID']
                    # Create dataframe => Only use relevant features and drop rest
                    df.drop(columns=df.columns.difference(features), inplace=True)
                    # Add column to identify fleet
                    df['fleet'] = fleet
                    # Standardize pick-up and drop-off columns for all fleets, i.e., rename them
                    df.columns = features_common
                    # Aggregate all dataframes
                    df_trips = pd.concat([df_trips, df], axis=0)
        return df_trips
    # If no FHV: Use large set of features.
    else:
        features_common = ['pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID', 'passenger_count', 'trip_distance', 'tip_amount', 'total_amount', 'fleet']
        # Empty dataframe
        df_trips = pd.DataFrame(columns=features_common)
        
        for fleet in fleets:
            for year in years:
                for month in months:
                    # Create file name
                    if month < 10:
                        url_data = fleet + '_tripdata_' + str(year) + '-0' + str(month) + '.csv' # Caution: Leading 0
                    else:
                        url_data = fleet + '_tripdata_' + str(year) + '-' + str(month) + '.csv'
                    # Try to download file
                    try:
                        print('Will download... ' + url_data)
                        df = pd.read_csv(url_prefix + url_data, low_memory=False)
                    except HTTPError:
                        print('ERROR: There is no data available for fleet={}, years={}, months={}!'.format(fleet, years, months))
                        continue
                    # Set Yellow taxi dataframe columns
                    if fleet == 'yellow':
                        # Features that all Yellow fleet data sets have in common
                        features = ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'PULocationID', 'DOLocationID', 'passenger_count', 'trip_distance', 'tip_amount', 'total_amount']
                    # Set Green taxi dataframe columns
                    if fleet == 'green':
                        # Features that all Green fleet data sets have in common
                        features = ['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'PULocationID', 'DOLocationID', 'passenger_count', 'trip_distance', 'tip_amount', 'total_amount']
                    # Create dataframe => Only use relevant features and drop rest
                    df.drop(columns=df.columns.difference(features), inplace=True)
                    # Add column to identify fleet
                    df['fleet'] = fleet
                    # Standardize pick-up and drop-off columns for all fleets, i.e., rename them
                    df.columns = features_common
                    # Aggregate all dataframes
                    df_trips = pd.concat([df_trips, df], axis=0)
        return df_trips

In [18]:
df_taxi_2019_03_yg = load_taxi_data(['yellow', 'green'], [2019], [3])

Will download... yellow_tripdata_2019-03.csv
Will download... green_tripdata_2019-03.csv


Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,passenger_count,trip_distance,tip_amount,total_amount,fleet
0,2019-03-01 00:24:41,2019-03-01 00:25:31,1,0.0,145,145,0.00,3.80,yellow
1,2019-03-01 00:25:27,2019-03-01 00:36:37,2,3.7,95,130,0.70,15.00,yellow
2,2019-03-01 00:05:21,2019-03-01 00:38:23,1,14.1,249,28,10.10,60.66,yellow
3,2019-03-01 00:48:55,2019-03-01 01:06:03,1,9.6,138,98,0.00,28.30,yellow
4,2019-03-01 00:11:42,2019-03-01 00:16:40,1,0.8,48,48,3.00,12.30,yellow
...,...,...,...,...,...,...,...,...,...
601097,2019-03-31 23:35:06,2019-03-31 23:42:59,42,247.0,1,1.61,2.75,12.05,green
601098,2019-03-31 23:30:07,2019-03-31 23:45:36,42,238.0,1,3.22,0.00,16.55,green
601099,2019-03-31 23:54:16,2019-04-01 00:03:35,41,244.0,1,2.11,2.06,12.36,green
601100,2019-03-31 23:51:29,2019-03-31 23:58:33,7,7.0,1,1.08,0.00,7.80,green


In [19]:
df_taxi_2019_03_yg.to_csv('../dat/df_taxi_2019_03_yg.csv', encoding='utf-8')

In [20]:
df_taxi_2019_03_yg

Unnamed: 0,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,passenger_count,trip_distance,tip_amount,total_amount,fleet
0,2019-03-01 00:24:41,2019-03-01 00:25:31,1,0.0,145,145,0.00,3.80,yellow
1,2019-03-01 00:25:27,2019-03-01 00:36:37,2,3.7,95,130,0.70,15.00,yellow
2,2019-03-01 00:05:21,2019-03-01 00:38:23,1,14.1,249,28,10.10,60.66,yellow
3,2019-03-01 00:48:55,2019-03-01 01:06:03,1,9.6,138,98,0.00,28.30,yellow
4,2019-03-01 00:11:42,2019-03-01 00:16:40,1,0.8,48,48,3.00,12.30,yellow
...,...,...,...,...,...,...,...,...,...
601097,2019-03-31 23:35:06,2019-03-31 23:42:59,42,247.0,1,1.61,2.75,12.05,green
601098,2019-03-31 23:30:07,2019-03-31 23:45:36,42,238.0,1,3.22,0.00,16.55,green
601099,2019-03-31 23:54:16,2019-04-01 00:03:35,41,244.0,1,2.11,2.06,12.36,green
601100,2019-03-31 23:51:29,2019-03-31 23:58:33,7,7.0,1,1.08,0.00,7.80,green
