### This notebook does the following:
- processes raw ridership data
- performs feature selection and cleaning
- performs hourly aggregation
- saves results in one file

In [1]:
import pandas as pd
import glob
import os
from datetime import date, timedelta
import itertools

In [2]:
# raw data directory
dataDir = '/home/urwa/Documents/Projects/NYU Remote/project/data/rawData/'

In [5]:
files = glob.glob(dataDir+'*csv')
len(files)

36

### identify zones of interest

In [6]:
zones = pd.read_csv('../Data/taxi_zones.csv')
zones.head(2)

Unnamed: 0,OBJECTID,Shape_Leng,the_geom,Shape_Area,zone,LocationID,borough
0,1,0.116357,MULTIPOLYGON (((-74.18445299999996 40.69499599...,0.000782,Newark Airport,1,EWR
1,2,0.43347,MULTIPOLYGON (((-73.82337597260663 40.63898704...,0.004866,Jamaica Bay,2,Queens


In [9]:
zones[zones.zone.apply(lambda x: 'Airport' in x)][['zone','LocationID']]

Unnamed: 0,zone,LocationID
0,Newark Airport,1
136,JFK Airport,132
145,LaGuardia Airport,138


In [10]:
zones[zones.zone.apply(lambda x: 'Station' in x)][['zone','LocationID']]

Unnamed: 0,zone,LocationID
185,Penn Station/Madison Sq West,186


In [15]:
zone_dict = {'Jfk':  132,
'Lga' : 138,
'Penn' : 186}
zone_dict

{'Jfk': 132, 'Lga': 138, 'Penn': 186}

### Run following script for each hub

In [63]:
hub = 'Penn'
zone = zone_dict[hub]
zone

186

In [64]:
processedFileDir = "/home/urwa/Documents/Projects/NYU Remote/project/data/processedData/"
processedFile = processedFileDir+hub+"VehiceByHour.csv"

In [65]:
validDestZones = list(set([z for z in zones.LocationID if z != zone]))
len(validDestZones)

259

In [66]:
for file in files:
    print("Processing "+str(file).split('/')[-1])
    
    vehicleType = str(file).split('/')[-1].split('_')[0]
    df = pd.read_csv(file)
    print("DataFrame Shape: "+str(df.shape))
    
    # rename columns for consistency
    # set passenger count to 1 for fhv
    if vehicleType == 'fhv':
        df.rename(columns={'Pickup_DateTime': 'tpep_pickup_datetime', \
                           'PUlocationID':'PULocationID', 'DOlocationID':'DOLocationID' },inplace=True)
        df['passenger_count'] = 1
        
    if vehicleType == 'green':
        df.rename(columns={'lpep_pickup_datetime': 'tpep_pickup_datetime'},inplace=True)


    # treat for na values
    df = df.dropna(subset=['tpep_pickup_datetime','PULocationID', 'DOLocationID'])
    df.fillna(value={'passenger_count':1}, inplace = True)
    
    # correct data types
    df['PULocationID'] = df['PULocationID'].astype('int')
    df['DOLocationID'] = df['DOLocationID'].astype('int')
    
    # filter to get outgoing traffic from selected hub
    df = df[(df['PULocationID'] == zone) & (df['DOLocationID'].apply(lambda x: x in validDestZones))]
    print("JFK out DataFrame Shape: "+str(df.shape))
    
    # treat datetime
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    df['Date'] = df['tpep_pickup_datetime'].dt.date
    df['Hour'] = df['tpep_pickup_datetime'].dt.hour
    
    #df['vehicle_type'] = vehicleType
    
    # select rquired columns
    #df = df[['vehicle_type', 'Date', 'Hour', 'DOLocationID','passenger_count']]
    df = df[['Date', 'Hour', 'DOLocationID','passenger_count']]
    
    # Hourly aggregation
    #df_count = df.groupby(['vehicle_type', 'Date', 'Hour', 'DOLocationID']).count().reset_index()
    df_count = df.groupby(['Date', 'Hour', 'DOLocationID']).count().reset_index()
    df_count.rename(columns={'passenger_count': 'vehicle_count'},inplace=True)

    #df_sum = df.groupby(['vehicle_type', 'Date', 'Hour', 'DOLocationID']).sum().reset_index()

    #aggregatedDf = pd.merge(df_count,df_sum, on=['vehicle_type', 'Date', 'Hour', 'DOLocationID'], how='inner')
    aggregatedDf = df_count
    
    print("Aggregated DataFrame Shape: "+str(aggregatedDf.shape))
    print(aggregatedDf.head(3))
    # save file
    if os.path.exists(processedFile):
        print('append to results...')
        aggregatedDf.to_csv(processedFile,index=False, header=False, mode='a+')      
    else:
        print('create results file...')
        aggregatedDf.to_csv(processedFile,index=False)
    print('file saved..')
    print("------------------------------------------------")

Processing yellow_tripdata_2018-11.csv
DataFrame Shape: (8145164, 17)
JFK out DataFrame Shape: (273052, 17)
Aggregated DataFrame Shape: (44396, 4)
         Date  Hour  DOLocationID  vehicle_count
0  2008-12-31    23           230              1
1  2018-10-31    23           162              2
2  2018-10-31    23           230              1
create results file...
file saved..
------------------------------------------------
Processing green_tripdata_2018-06.csv
DataFrame Shape: (739373, 19)
JFK out DataFrame Shape: (39, 19)
Aggregated DataFrame Shape: (39, 4)
         Date  Hour  DOLocationID  vehicle_count
0  2018-06-02    21           164              1
1  2018-06-04     8           113              1
2  2018-06-04     9           138              1
append to results...
file saved..
------------------------------------------------
Processing green_tripdata_2018-09.csv
DataFrame Shape: (666708, 19)
JFK out DataFrame Shape: (38, 19)
Aggregated DataFrame Shape: (37, 4)
         Date  Ho

DataFrame Shape: (22116391, 7)
JFK out DataFrame Shape: (122868, 8)
Aggregated DataFrame Shape: (47285, 4)
         Date  Hour  DOLocationID  vehicle_count
0  2018-08-01     0             3              1
1  2018-08-01     0             4              1
2  2018-08-01     0             7              1
append to results...
file saved..
------------------------------------------------
Processing yellow_tripdata_2018-02.csv
DataFrame Shape: (8492076, 17)
JFK out DataFrame Shape: (279480, 17)
Aggregated DataFrame Shape: (41352, 4)
         Date  Hour  DOLocationID  vehicle_count
0  2008-12-31    22           138              1
1  2018-01-17    18           170              1
2  2018-01-19     1           230              1
append to results...
file saved..
------------------------------------------------
Processing green_tripdata_2018-07.csv
DataFrame Shape: (684455, 19)
JFK out DataFrame Shape: (45, 19)
Aggregated DataFrame Shape: (44, 4)
         Date  Hour  DOLocationID  vehicle_count
0

### Further processing

In [67]:
def getcCompleteGridDf(minDate,maxDate, locations):
    minDate = [int(x) for x in minDate.split('-')]
    maxDate = [int(x) for x in maxDate.split('-')]
    sdate = date(minDate[0], minDate[1], minDate[2])   
    edate = date(maxDate[0], maxDate[1], maxDate[2])    

    delta = edate - sdate       
    days = []
    for i in range(delta.days + 1):
        days.append(sdate + timedelta(days=i))
    hours = list(range(24))
    print(len(days))
    print(len(hours))
    
    combList = list(itertools.product(*[days,hours,locations]))
    dfList = [{'Date':d, 'Hour':h, 'DOLocationID':l} for d,h,l in combList]
 
    dateHourDf = pd.DataFrame(dfList)
    dateHourDf['Date'] = pd.to_datetime(dateHourDf['Date']).dt.date
    return dateHourDf

In [68]:
processedDf = pd.read_csv(processedFile)
processedDf.head(2)

Unnamed: 0,Date,Hour,DOLocationID,vehicle_count
0,2008-12-31,23,230,1
1,2018-10-31,23,162,2


In [69]:
processedDf.shape

(1074188, 4)

In [70]:
# ensuring proper grouping since files were grouped by independently
processedDf = processedDf.groupby(['Date', 'Hour', 'DOLocationID']).sum().reset_index()
processedDf.shape

(712596, 4)

In [71]:
# sanity checks
validYears = [2018]
processedDf = processedDf[processedDf.Date.apply(lambda x: int(x.split('-')[0]) in validYears)]

validMonths = list(range(1,13))
processedDf = processedDf[processedDf.Date.apply(lambda x: int(x.split('-')[1]) in validMonths)]

processedDf.shape    

(712570, 4)

In [72]:
minDate, maxDate = (processedDf.Date.min(), processedDf.Date.max()) 
#v_types = list(set(processedDf.vehicle_type))
locations = list(set(processedDf.DOLocationID))

#print(len(v_types))
print(len(locations))

dateHourDf = getcCompleteGridDf(minDate,maxDate,locations)
dateHourDf.shape

257
365
24


(2251320, 3)

In [73]:
dateHourDf['Date'] = pd.to_datetime(dateHourDf['Date'])
processedDf['Date'] = pd.to_datetime(processedDf['Date'])

In [74]:
mergedDf = pd.merge(dateHourDf,processedDf, on=['Date', 'Hour', 'DOLocationID'], how='left')
mergedDf.fillna(0, inplace=True)
mergedDf['Date'] = mergedDf['Date'].dt.date
print(mergedDf.shape)
mergedDf.head(3)

(2251320, 4)


Unnamed: 0,DOLocationID,Date,Hour,vehicle_count
0,1,2018-01-01,0,0.0
1,2,2018-01-01,0,0.0
2,3,2018-01-01,0,0.0


In [75]:
# sanity check
print(processedDf.vehicle_count.sum())
print(mergedDf.vehicle_count.sum())

4835787
4835787.0


In [76]:
# sanity check for size of new dataframe
366*258*24

2266272

In [77]:
# fraction of combinations that have data. Data is very scarce !!!
2332298/4520160

0.5159768680754664

In [78]:
mergedDf.to_csv(processedFile,index=False)