### This notebook does the following:
- processes raw ridership data
- performs feature selection and cleaning
- performs hourly aggregation
- saves results in one file

In [40]:
import pandas as pd
import glob
import os
from datetime import date, timedelta
import itertools

In [2]:
dataDir = '/home/urwa/Documents/NYU/Data/'
processedFile = '/home/urwa/Documents/NYU/nycTaxi.csv'

In [3]:
files = glob.glob(dataDir+'*csv')

In [4]:
zones = pd.read_csv('taxi_zones.csv')
zones.head(2)

Unnamed: 0,OBJECTID,Shape_Leng,the_geom,Shape_Area,zone,LocationID,borough
0,1,0.116357,MULTIPOLYGON (((-74.18445299999996 40.69499599...,0.000782,Newark Airport,1,EWR
1,2,0.43347,MULTIPOLYGON (((-73.82337597260663 40.63898704...,0.004866,Jamaica Bay,2,Queens


In [10]:
zones[zones.zone.apply(lambda x: 'Airport' in x)][['zone','LocationID']]

Unnamed: 0,zone,LocationID
0,Newark Airport,1
136,JFK Airport,132
145,LaGuardia Airport,138


In [6]:
JfK_zone = 132
validDestZones = list(set([z for z in zones.LocationID if z != JfK_zone]))
len(validDestZones)

259

In [9]:
for file in files:
    print("Processing "+str(file).split('/')[-1])
    
    vehicleType = str(file).split('/')[-1].split('_')[0]
    df = pd.read_csv(file)
    print("DataFrame Shape: "+str(df.shape))
    
    # rename columns for consistency
    # set passenger count to 1 for fhv
    if vehicleType == 'fhv':
        df.rename(columns={'Pickup_DateTime': 'tpep_pickup_datetime', \
                           'PUlocationID':'PULocationID', 'DOlocationID':'DOLocationID' },inplace=True)
        df['passenger_count'] = 1

    # treat for na values
    df = df.dropna(subset=['tpep_pickup_datetime','PULocationID', 'DOLocationID'])
    df.fillna(value={'passenger_count':1}, inplace = True)
    
    # correct data types
    df['PULocationID'] = df['PULocationID'].astype('int')
    df['DOLocationID'] = df['DOLocationID'].astype('int')
    
    # filter to get outgoing traffic from JFK
    df = df[(df['PULocationID'] == JfK_zone) & (df['DOLocationID'].apply(lambda x: x in validDestZones))]
    print("JFK out DataFrame Shape: "+str(df.shape))
    
    # treat datetime
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['Date'] = df['tpep_pickup_datetime'].dt.date
    df['Hour'] = df['tpep_pickup_datetime'].dt.hour
    
    df['vehicle_type'] = vehicleType
    
    # select rquired columns
    df = df[['vehicle_type', 'Date', 'Hour', 'DOLocationID','passenger_count']]
    
    # Hourly aggregation
    df_count = df.groupby(['vehicle_type', 'Date', 'Hour', 'DOLocationID']).count().reset_index()
    df_count.rename(columns={'passenger_count': 'vehicle_count'},inplace=True)

    df_sum = df.groupby(['vehicle_type', 'Date', 'Hour', 'DOLocationID']).sum().reset_index()

    aggregatedDf = pd.merge(df_count,df_sum, on=['vehicle_type', 'Date', 'Hour', 'DOLocationID'], how='inner')
    print("Aggregated DataFrame Shape: "+str(aggregatedDf.shape))
    print(aggregatedDf.head(3))
    # save file
    if os.path.exists(processedFile):
        print('yo')
        aggregatedDf.to_csv(processedFile,index=False, header=False, mode='a+')      
    else:
        aggregatedDf.to_csv(processedFile,index=False)
    print('file saved..')
    print("------------------------------------------------")

Processing yellow_tripdata_2018-02.csv
DataFrame Shape: (8492076, 17)
JFK out DataFrame Shape: (162101, 17)
Aggregated DataFrame Shape: (63667, 6)
  vehicle_type        Date  Hour  DOLocationID  vehicle_count  passenger_count
0       yellow  2008-12-31    14           137              1                1
1       yellow  2018-01-16    19            10              1                1
2       yellow  2018-01-16    22            28              1                1
file saved..
------------------------------------------------
Processing fhv_tripdata_2017-10.csv
DataFrame Shape: (17890689, 6)
JFK out DataFrame Shape: (182087, 7)
Aggregated DataFrame Shape: (74067, 6)
  vehicle_type        Date  Hour  DOLocationID  vehicle_count  passenger_count
0          fhv  2017-10-01     0             7              2                2
1          fhv  2017-10-01     0            10              1                1
2          fhv  2017-10-01     0            14              1                1
yo
file saved..


  interactivity=interactivity, compiler=compiler, result=result)


DataFrame Shape: (13657212, 5)
JFK out DataFrame Shape: (0, 6)
Aggregated DataFrame Shape: (0, 6)
Empty DataFrame
Columns: [vehicle_count, vehicle_type, Date, Hour, DOLocationID, passenger_count]
Index: []
yo
file saved..
------------------------------------------------
Processing yellow_tripdata_2018-10.csv
DataFrame Shape: (8821105, 17)
JFK out DataFrame Shape: (212255, 17)
Aggregated DataFrame Shape: (74624, 6)
  vehicle_type        Date  Hour  DOLocationID  vehicle_count  passenger_count
0       yellow  2008-12-31    23            50              1                1
1       yellow  2008-12-31    23           162              1                5
2       yellow  2009-01-01     5           186              1                1
yo
file saved..
------------------------------------------------
Processing fhv_tripdata_2017-05.csv
DataFrame Shape: (15397388, 5)
JFK out DataFrame Shape: (1118, 6)
Aggregated DataFrame Shape: (1017, 6)
  vehicle_type        Date  Hour  DOLocationID  vehicle_count

DataFrame Shape: (18048534, 6)
JFK out DataFrame Shape: (184807, 7)
Aggregated DataFrame Shape: (72760, 6)
  vehicle_type        Date  Hour  DOLocationID  vehicle_count  passenger_count
0          fhv  2017-11-01     0             4              2                2
1          fhv  2017-11-01     0             7              2                2
2          fhv  2017-11-01     0            10              2                2
yo
file saved..
------------------------------------------------
Processing fhv_tripdata_2018-09.csv
DataFrame Shape: (22147421, 7)
JFK out DataFrame Shape: (250515, 8)
Aggregated DataFrame Shape: (87358, 6)
  vehicle_type        Date  Hour  DOLocationID  vehicle_count  passenger_count
0          fhv  2018-09-01     0             4              3                3
1          fhv  2018-09-01     0             5              1                1
2          fhv  2018-09-01     0             7              4                4
yo
file saved..
-------------------------------------

file saved..
------------------------------------------------


### Further processing

In [96]:
def getcCompleteGridDf(v_types,minDate,maxDate, locations):
    minDate = [int(x) for x in minDate.split('-')]
    maxDate = [int(x) for x in maxDate.split('-')]
    sdate = date(minDate[0], minDate[1], minDate[2])   
    edate = date(maxDate[0], maxDate[1], maxDate[2])    

    delta = edate - sdate       
    days = []
    for i in range(delta.days + 1):
        days.append(sdate + timedelta(days=i))
    hours = list(range(24))
    print(len(days))
    print(len(hours))
    
    combList = list(itertools.product(*[v_types,days,hours,locations]))
    dfList = [{'vehicle_type':v, 'Date':d, 'Hour':h, 'DOLocationID':l} for v,d,h,l in combList]
 
    dateHourDf = pd.DataFrame(dfList)
    dateHourDf['Date'] = pd.to_datetime(dateHourDf['Date']).dt.date
    return dateHourDf

In [97]:
processedDf = pd.read_csv(processedFile)
processedDf.head(2)

Unnamed: 0,vehicle_type,Date,Hour,DOLocationID,vehicle_count,passenger_count
0,yellow,2008-12-31,14,137,1,1
1,yellow,2018-01-16,19,10,1,1


In [98]:
processedDf.shape

(3340823, 6)

In [99]:
# ensuring proper grouping sine files were grouped by independently
processedDf = processedDf.groupby(['vehicle_type', 'Date', 'Hour', 'DOLocationID']).sum().reset_index()
processedDf.shape

(3340641, 6)

In [100]:
# sanity checks
validYears = [2017,2018]
processedDf = processedDf[processedDf.Date.apply(lambda x: int(x.split('-')[0]) in validYears)]

validMonths = list(range(1,13))
processedDf = processedDf[processedDf.Date.apply(lambda x: int(x.split('-')[1]) in validMonths)]

processedDf.shape    

(3340596, 6)

In [101]:
minDate, maxDate = (processedDf.Date.min(), processedDf.Date.max()) 
v_types = list(set(processedDf.vehicle_type))
locations = list(set(processedDf.DOLocationID))

print(len(v_types))
print(len(locations))

dateHourDf = getcCompleteGridDf(v_types,minDate,maxDate,locations)
processedDf = pd.merge(dateHourDf,processedDf, on=['vehicle_type', 'Date', 'Hour', 'DOLocationID'], how='left')
processedDf.fillna(0, inplace=True)
processedDf.shape

2
258
730
24


(9040320, 6)

In [94]:
# sanity check for size of new dataframe
730*2*258*24

9040320

In [102]:
processedDf.head()

Unnamed: 0,vehicle_type,Date,Hour,DOLocationID,vehicle_count,passenger_count
0,yellow,2017-01-01,0,1,0.0,0.0
1,yellow,2017-01-01,0,2,0.0,0.0
2,yellow,2017-01-01,0,3,0.0,0.0
3,yellow,2017-01-01,0,4,0.0,0.0
4,yellow,2017-01-01,0,5,0.0,0.0


In [103]:
# fraction of combinations that have data. Data is very scarce !!!
3340596/9040320

0.36952187533184666

In [104]:
processedDf.to_csv(processedFile,index=False)