In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import datetime
import seaborn as sns

In [2]:
# Import local CSV files and change name of EXITS column

df_dec19 = pd.read_csv('v1december_turnstiles.csv', index_col = 0)
df_dec19 = df_dec19.rename(columns={'EXITS                                                               ':'EXITS'})
df_mar20 = pd.read_csv('v1march_turnstiles.csv', index_col = 0)
df_mar20 = df_mar20.rename(columns={'EXITS                                                               ':'EXITS'})
df_june20 = pd.read_csv('v1june_turnstiles.csv', index_col = 0)
df_june20 = df_june20.rename(columns={'EXITS                                                               ':'EXITS'})

In [5]:
#df_dec19_dailyfoottraffic.groupby(['STATION']).DIFF_FOOTTRAFFIC.agg(['mean','sum']).sort_values('sum', ascending = False).head(10)

In [57]:
def hr4FootTraffic(dataframe):
    """
    ADD SOME COMMENTS LATER BRIAN SO YOU REMEMBER ALL THE DETAILS
    """
    dataframe['TIMESTAMP'] = pd.to_datetime(dataframe['DATE'] + ' ' + dataframe['TIME'])
    dataframe['FOOTTRAFFIC'] = dataframe['ENTRIES'] + dataframe['EXITS']
    
    # This aggregates FOOTTRAFFIC by STATION & TIME, it also resets the indez
    hour4_counts_df = dataframe.groupby(["C/A", "UNIT", "SCP","STATION","DATE","TIME","TIMESTAMP"]).FOOTTRAFFIC.agg(['sum', 'count']).reset_index()

    # This creates a new column of DIFF_FOOTRAFFIC by taking the difference in 'sum'
    hour4_counts_df['DIFF_FOOTTRAFFIC'] = hour4_counts_df['sum'].diff()

    # For practical purposes, we will rename 'sum' to TOTAL_FOOTTRAFFIC
    hour4_counts_df = hour4_counts_df.rename(columns={'sum':'TOTAL_FOOTTRAFFIC'})

    # Now we'll drop out NaN values and reset the index
    hour4_counts_df = hour4_counts_df.dropna().reset_index(drop = True)
    
    # Below are steps for cleaning the data and removing outliers:
    
    # This removes all negative values in DIFF_FOOTTRAFFIC to ignore resets that occur
    hour4_counts_df = hour4_counts_df[hour4_counts_df['DIFF_FOOTTRAFFIC'] > 0]
    
    # This will create an upper bound of the 90th percentile, so we remove resets that result in DIFF high values
    hour4_counts_df = hour4_counts_df[hour4_counts_df['DIFF_FOOTTRAFFIC'] <\
                                      np.percentile(hour4_counts_df['DIFF_FOOTTRAFFIC'], 90)]

    
    station_counts_df = hour4_counts_df.groupby(["STATION"], as_index=False)["DIFF_FOOTTRAFFIC"].sum()
    
    return hour4_counts_df, station_counts_df
    

df_dec19_4hrfoottraffic, df_dec19_maxfoottraffic  = hr4FootTraffic(df_dec19)
df_mar20_4hrfoottraffic, df_mar20_maxfoottraffic = hr4FootTraffic(df_mar20)
df_june20_4hrfoottraffic, df_june20_maxfoottraffic = hr4FootTraffic(df_june20)

In [58]:
total_foottraffic = df_dec19_maxfoottraffic.set_index('STATION').\
    join(df_mar20_maxfoottraffic.set_index('STATION'), lsuffix = '_DEC').\
    join(df_june20_maxfoottraffic.set_index('STATION'), lsuffix = '_MAR', rsuffix = '_JUNE')

total_foottraffic['DIFF_FOOTTRAFFIC_TOTAL'] = total_foottraffic['DIFF_FOOTTRAFFIC_DEC'] +\
    total_foottraffic['DIFF_FOOTTRAFFIC_JUNE'] + total_foottraffic['DIFF_FOOTTRAFFIC_MAR']

total_foottraffic = total_foottraffic.sort_values('DIFF_FOOTTRAFFIC_TOTAL', ascending=False).reset_index()

total_foottraffic.head(20)

Unnamed: 0,STATION,DIFF_FOOTTRAFFIC_DEC,DIFF_FOOTTRAFFIC_MAR,DIFF_FOOTTRAFFIC_JUNE,DIFF_FOOTTRAFFIC_TOTAL
0,34 ST-PENN STA,3112290.0,989306.0,531701.0,4633297.0
1,FULTON ST,2098500.0,696159.0,375922.0,3170581.0
2,23 ST,2084589.0,654641.0,350989.0,3090219.0
3,86 ST,1805578.0,642971.0,320817.0,2769366.0
4,GRD CNTRL-42 ST,1658208.0,650954.0,219837.0,2528999.0
5,59 ST,1636729.0,493127.0,242443.0,2372299.0
6,125 ST,1498478.0,569846.0,253598.0,2321922.0
7,34 ST-HERALD SQ,1478523.0,554080.0,280564.0,2313167.0
8,CANAL ST,1641083.0,447500.0,224297.0,2312880.0
9,42 ST-PORT AUTH,1575823.0,469706.0,150255.0,2195784.0


In [59]:
total_foottraffic[total_foottraffic['STATION']=='GRD CNTRL-42 ST']

Unnamed: 0,STATION,DIFF_FOOTTRAFFIC_DEC,DIFF_FOOTTRAFFIC_MAR,DIFF_FOOTTRAFFIC_JUNE,DIFF_FOOTTRAFFIC_TOTAL
4,GRD CNTRL-42 ST,1658208.0,650954.0,219837.0,2528999.0


In [60]:
df_dec19_maxfoottraffic.sort_values(['DIFF_FOOTTRAFFIC'], ascending = False).head(20)

Unnamed: 0,STATION,DIFF_FOOTTRAFFIC
61,34 ST-PENN STA,3112290.0
226,FULTON ST,2098500.0
46,23 ST,2084589.0
110,86 ST,1805578.0
233,GRD CNTRL-42 ST,1658208.0
173,CANAL ST,1641083.0
85,59 ST,1636729.0
68,42 ST-PORT AUTH,1575823.0
9,125 ST,1498478.0
59,34 ST-HERALD SQ,1478523.0
