In [1]:
import pandas as pd

In [2]:
# load the CSV file exported from SPSS, and calculate modal split percentage for JFK passengers.
df = pd.read_csv('2014 TxT CSS O-D Dep SPSS Database_NYU w vars.csv')
jfk_tot = df.query('AIRPORT=="JFK"').WEIGHT.sum()
jfk_splits = df.query('AIRPORT=="JFK"').groupby('MAINMODE')['WEIGHT'].sum()/jfk_tot

In [3]:
# label each value of MAINMODE with a category
modegroups = {'A':'train',
'E':'train',
'J':'train',
'Z':'train',
'LIRR Connecting to JFK AirTrain':'train',
'Other (NYC Subway, including Connection to JFK AirTrain)':'train',
'Airlink New York':'van',
'All County Express':'van',
'Connecticut Limousine':'van',
"Dave's Best Limousine":'van',
'ETS Airport Shuttle':'van',
'Other (Shared-Ride Van/Service)':'van',
'Prime Time Shuttle of Connecticut':'van',
'State Shuttle':'van',
'SuperShuttle from Long Island':'van',
'SuperShuttle from Manhattan':'van',
'Limo/Executive Car/Town Car Service':'limo',
'Chartered/Tour Bus':'bus',
'NYC Airporter Bus from JFK/LaGuardia Airports':'bus',
'NYC Airporter Bus from Manhattan':'bus',
'Public/City Bus (that is, a local bus)':'bus',
'Transbridge Bus':'bus',
'Taxi':'taxi',
'Air Park':'local_van',
'Dollar Airport Parking':'local_van',
'Hilton Hotel':'local_van',
'Holiday Inn FSP':'local_van', 
'Hotel/Motel Shuttle/Van':'local_van',
'JFK Marriott Hotel':'local_van',
'National Airport Parking':'local_van',
'Other (Off-Airport Parking Shuttle/Van)':'local_van',
'Park Plus':'local_van',
'Sky Park/Central Parking':'local_van',
'The Parking Spot JFK/LGA':'local_van',
'U-Save':'local_van',
'Vista Airport Parking':'local_van',
'Drove Your Own Car':'private',
'Passenger in Car Parked at Airport':'private',
'Passenger in Car and Dropped Off at Airport':'private',
'Avis':'rental',
'Budget':'rental',
'Dollar':'rental',
'Enterprise':'rental',
'Hertz':'rental',
'National':'rental',
'Other (Drove Rental Car)':'rental'}

In [4]:
# add groupings
grouped_df = pd.DataFrame.from_dict(modegroups,orient='index').rename(columns={0:'ModeGroup'})
# summarize the mode-split percentages by mode-group
grouped_df.join(jfk_splits).groupby('ModeGroup').sum()

Unnamed: 0_level_0,WEIGHT
ModeGroup,Unnamed: 1_level_1
bus,0.052655
limo,0.12461
local_van,0.0302
private,0.309572
rental,0.026468
taxi,0.28586
train,0.112113
van,0.05212


In [5]:
df.groupby('DepCheckinWaittime')['WEIGHT'].sum()

DepCheckinWaittime
                      12868.353
1-4 mins.             11002.083
10-14 mins.            5429.572
15-19 mins.            2771.467
20-29 mins.            1741.629
30-45 mins.             944.949
5-9 mins.             11050.100
< 1 minute             2480.515
More than 45 mins.      741.019
Name: WEIGHT, dtype: float64

In [6]:
df.columns[58:90]

Index(['TripOriginOD', 'TripOriginLocationOD', 'TOManhattan', 'TONYC', 'TONY',
       'TONJ', 'TOCT', 'TOPA', 'TOOtherUS', 'TOTotalUS', 'TOZip', 'TOTownCity',
       'TOLocalAirport', 'VisitorVsResident', 'NightsVisited', 'NightsAway',
       'ResidentialRegion', 'PrimaryResidentialLocation', 'ResManhattan',
       'ResNYC', 'ResNY', 'ResNJ', 'ResCT', 'ResPA', 'ResOtherUS',
       'ResTotalUS', 'ResOutsideUS', 'ResZip', 'ResTownCity', 'Android',
       'BlackBerry', 'WindowsBasedOS'],
      dtype='object')

In [18]:
print ('Average number of records per zip code, when reported: ')
print (df.query('AIRPORT=="JFK" & TOZip > "10000" & TOZip < "13000"').groupby('TOZip').size().mean())
print ('Show the list of zips, truncated...')
df.query('AIRPORT=="JFK" & TOZip > "10000" & TOZip < "13000"').groupby('TOZip').size()

Average number of records per zip code, when reported: 
2.2879377431906613
Show the list of zips, truncated...


TOZip
10001    12
10002     5
10003     9
10005     4
10006     1
10007     1
10009     3
1001      1
10010     4
10011     9
10012     6
10013     3
10014     2
10015     1
10016     7
10017     9
10018     2
10019     9
10021     5
10022     6
10023     8
10024     3
10025     3
10026     4
10027    10
10028     5
10029     4
1003      1
10031     4
10032     1
         ..
11778     1
11779     1
11780     1
11783     2
11790     5
11791     1
11793     1
11801     2
11803     2
11914     1
11967     1
12002     1
12047     1
12054     1
12065     1
12110     1
12175     1
12208     1
12222     1
1226      1
12309     2
12345     1
12401     1
12513     1
12533     1
12534     1
12589     1
12603     1
12837     1
12901     1
dtype: int64

In [19]:
print ('Median number of records per zip code, when reported')
df.query('AIRPORT=="JFK" & TOZip > "10000" & TOZip < "13000"').groupby('TOZip')['WEIGHT'].size().median()

Median number of records per zip code, when reported


1.0