In [1]:
from dateutil import parser
from datetime import datetime
import numpy as np
import pandas as pd
import pickle

# Read the pickled object into `mta` and preview its first rows.
# NOTE: unpickling can execute arbitrary code, so only open pickle files
# that come from a trusted source.
with open('date_import.pickle', 'rb') as pickle_handle:
    mta = pickle.load(pickle_handle)
mta.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/08/2019,00:00:00,REGULAR,7089463,2401758
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/08/2019,04:00:00,REGULAR,7089491,2401764
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/08/2019,08:00:00,REGULAR,7089511,2401807
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/08/2019,12:00:00,REGULAR,7089598,2401889
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/08/2019,16:00:00,REGULAR,7089801,2401946


In [2]:
# Remove leading/trailing whitespace from every column label.
mta.columns = mta.columns.map(str.strip)

In [3]:
# Combine the DATE and TIME strings into one datetime column.
# pd.to_datetime with an explicit format parses the whole column in a single
# vectorized call (instead of invoking dateutil once per row) and raises on
# any value that is not MM/DD/YYYY HH:MM:SS rather than silently guessing.
mta['date_time'] = pd.to_datetime(
    mta['DATE'] + ' ' + mta['TIME'],
    format='%m/%d/%Y %H:%M:%S',
)
mta.dtypes

C/A                  object
UNIT                 object
SCP                  object
STATION              object
LINENAME             object
DIVISION             object
DATE                 object
TIME                 object
DESC                 object
ENTRIES               int64
EXITS                 int64
date_time    datetime64[ns]
dtype: object

In [4]:
# Columns that together identify one reading of one turnstile.
keys = ['C/A', 'UNIT', 'SCP', 'STATION', 'date_time']

# Count rows per key combination and show the largest counts first;
# any count above 1 means the same turnstile/timestamp was reported twice.
readings_per_key = mta.groupby(keys)['ENTRIES'].count().reset_index()
readings_per_key.sort_values('ENTRIES', ascending=False).head(15)

Unnamed: 0,C/A,UNIT,SCP,STATION,date_time,ENTRIES
577401,R174,R034,00-00-03,125 ST,2019-06-08 00:00:00,2
193247,N045,R187,01-00-02,81 ST-MUSEUM,2019-06-01 17:00:00,2
577061,R174,R034,00-00-01,125 ST,2019-06-08 00:00:00,2
193475,N045,R187,01-06-00,81 ST-MUSEUM,2019-06-11 21:00:00,2
577231,R174,R034,00-00-02,125 ST,2019-06-08 00:00:00,2
193085,N045,R187,01-00-01,81 ST-MUSEUM,2019-06-02 17:00:00,2
193252,N045,R187,01-00-02,81 ST-MUSEUM,2019-06-02 17:00:00,2
193420,N045,R187,01-06-00,81 ST-MUSEUM,2019-06-02 17:00:00,2
193080,N045,R187,01-00-01,81 ST-MUSEUM,2019-06-01 17:00:00,2
192917,N045,R187,01-00-00,81 ST-MUSEUM,2019-06-02 17:00:00,2


In [5]:
# There appear to be several duplicates; inspect one affected turnstile
# (R174 / R034 / 00-00-03 at 125 ST) on 2019-06-08 as an example.
same_turnstile = (
    mta["C/A"].eq("R174")
    & mta["UNIT"].eq("R034")
    & mta["SCP"].eq("00-00-03")
    & mta["STATION"].eq("125 ST")
)
# Normalizing to midnight compares on the calendar day only.
on_target_day = mta["date_time"].dt.normalize() == pd.Timestamp("2019-06-08")
mask = same_turnstile & on_target_day

mta[mask].head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,date_time
145455,R174,R034,00-00-03,125 ST,1,IRT,06/08/2019,00:00:00,REGULAR,3828801,2853462,2019-06-08 00:00:00
145456,R174,R034,00-00-03,125 ST,1,IRT,06/08/2019,00:00:00,RECOVR AUD,3828804,2853463,2019-06-08 00:00:00
145457,R174,R034,00-00-03,125 ST,1,IRT,06/08/2019,08:00:00,REGULAR,3828931,2853610,2019-06-08 08:00:00
145458,R174,R034,00-00-03,125 ST,1,IRT,06/08/2019,12:00:00,REGULAR,3829164,2853787,2019-06-08 12:00:00
145459,R174,R034,00-00-03,125 ST,1,IRT,06/08/2019,16:00:00,REGULAR,3829514,2854060,2019-06-08 16:00:00


In [6]:
# Keep only the first row encountered for each turnstile/timestamp key.
mta = mta.drop_duplicates(subset=keys)

# Repeat the duplicate check; the highest count per key should now be 1.
remaining_counts = mta.groupby(keys, as_index=False)['ENTRIES'].count()
remaining_counts.sort_values('ENTRIES', ascending=False).head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,date_time,ENTRIES
0,A002,R051,02-00-00,59 ST,2019-05-18 00:00:00,1
544720,R142,R293,01-00-01,34 ST-PENN STA,2019-05-20 10:00:00,1
544722,R142,R293,01-00-01,34 ST-PENN STA,2019-05-20 18:00:00,1
544723,R142,R293,01-00-01,34 ST-PENN STA,2019-05-20 22:00:00,1
544724,R142,R293,01-00-01,34 ST-PENN STA,2019-05-21 02:00:00,1


In [29]:
# For every turnstile and calendar day, keep the counter values of the
# earliest reading of that day.
# - Rows are ordered by timestamp first so that `.first()` really returns the
#   earliest reading regardless of the order the rows arrived in.
# - The value columns are selected with a list ([[...]]); selecting several
#   columns from a groupby with a bare tuple is deprecated and raises in
#   recent pandas versions.
mta_by_date = (
    mta
    .sort_values('date_time')
    .groupby(['C/A', 'UNIT', 'SCP', 'STATION', 'DATE'], as_index=False)[['ENTRIES', 'EXITS']]
    .first()
)
mta_by_date.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,05/18/2019,7062481,2393489
1,A002,R051,02-00-00,59 ST,05/19/2019,7063280,2393719
2,A002,R051,02-00-00,59 ST,05/20/2019,7063983,2393895
3,A002,R051,02-00-00,59 ST,05/21/2019,7065493,2394422
4,A002,R051,02-00-00,59 ST,05/22/2019,7067101,2394950


In [30]:
# For each turnstile, attach the previous row's date and counter readings.
# GroupBy.shift(1) moves values down one row *within* each turnstile, so the
# first row of every turnstile gets missing values instead of borrowing data
# from a different turnstile.
# The columns are selected with a list ([[...]]); tuple-style selection on a
# groupby is deprecated and raises in recent pandas versions.
turnstile_cols = ["C/A", "UNIT", "SCP", "STATION"]
mta_by_date[["PREV_DATE", "PREV_ENTRIES", "PREV_EXITS"]] = (
    mta_by_date
    .groupby(turnstile_cols)[["DATE", "ENTRIES", "EXITS"]]
    .shift(1)
)

mta_by_date.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,EXITS,PREV_DATE,PREV_ENTRIES,PREV_EXITS
0,A002,R051,02-00-00,59 ST,05/18/2019,7062481,2393489,,,
1,A002,R051,02-00-00,59 ST,05/19/2019,7063280,2393719,05/18/2019,7062481.0,2393489.0
2,A002,R051,02-00-00,59 ST,05/20/2019,7063983,2393895,05/19/2019,7063280.0,2393719.0
3,A002,R051,02-00-00,59 ST,05/21/2019,7065493,2394422,05/20/2019,7063983.0,2393895.0
4,A002,R051,02-00-00,59 ST,05/22/2019,7067101,2394950,05/21/2019,7065493.0,2394422.0


In [31]:
# The first day of each turnstile has no previous day to compare against
# (PREV_DATE is missing), so discard those rows.
mta_by_date = mta_by_date.dropna(subset=["PREV_DATE"])

In [32]:
# A counter reading lower than the previous day's reading would give a
# negative daily total (likely a reset or data error), so drop those rows.
# Report how many rows fail each check before filtering.
print(mta_by_date[mta_by_date["ENTRIES"] < mta_by_date["PREV_ENTRIES"]].count())
print(mta_by_date[mta_by_date["EXITS"] < mta_by_date["PREV_EXITS"]].count())

# Evaluate both conditions once and filter in a single step. `.copy()` makes
# the result an independent frame, so adding or replacing columns on it later
# does not raise SettingWithCopyWarning.
entries_ok = mta_by_date["ENTRIES"] >= mta_by_date["PREV_ENTRIES"]
exits_ok = mta_by_date["EXITS"] >= mta_by_date["PREV_EXITS"]
mta_by_date = mta_by_date[entries_ok & exits_ok].copy()
mta_by_date.head()

C/A             1618
UNIT            1618
SCP             1618
STATION         1618
DATE            1618
ENTRIES         1618
EXITS           1618
PREV_DATE       1618
PREV_ENTRIES    1618
PREV_EXITS      1618
dtype: int64
C/A             1401
UNIT            1401
SCP             1401
STATION         1401
DATE            1401
ENTRIES         1401
EXITS           1401
PREV_DATE       1401
PREV_ENTRIES    1401
PREV_EXITS      1401
dtype: int64


Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,EXITS,PREV_DATE,PREV_ENTRIES,PREV_EXITS
1,A002,R051,02-00-00,59 ST,05/19/2019,7063280,2393719,05/18/2019,7062481.0,2393489.0
2,A002,R051,02-00-00,59 ST,05/20/2019,7063983,2393895,05/19/2019,7063280.0,2393719.0
3,A002,R051,02-00-00,59 ST,05/21/2019,7065493,2394422,05/20/2019,7063983.0,2393895.0
4,A002,R051,02-00-00,59 ST,05/22/2019,7067101,2394950,05/21/2019,7065493.0,2394422.0
5,A002,R051,02-00-00,59 ST,05/23/2019,7068769,2395465,05/22/2019,7067101.0,2394950.0


In [33]:
# Drop the old date column and make "PREV_DATE" the new "DATE": each row's
# counter difference is thereby attributed to the earlier of the two days.
# `rename(index=str)` also converts the row labels to strings, as before.
# Building a new frame (rather than mutating in place) avoids
# SettingWithCopyWarning when the columns below are assigned.
mta_by_date = (
    mta_by_date
    .drop(columns='DATE')
    .rename(index=str, columns={'PREV_DATE': 'DATE'})
)
mta_by_date['DATE'] = pd.to_datetime(mta_by_date['DATE'], format='%m/%d/%Y')

# Add a column with the English day-of-week name.
# `.dt.dayofweek` uses the same numbering as datetime.weekday (Monday == 0)
# but is computed for the whole column at once instead of row by row.
dayOfWeek = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday',
             4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
mta_by_date['weekday'] = mta_by_date['DATE'].dt.dayofweek.map(dayOfWeek)

mta_by_date.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,ENTRIES,EXITS,DATE,PREV_ENTRIES,PREV_EXITS,weekday
1,A002,R051,02-00-00,59 ST,7063280,2393719,2019-05-18,7062481.0,2393489.0,Saturday
2,A002,R051,02-00-00,59 ST,7063983,2393895,2019-05-19,7063280.0,2393719.0,Sunday
3,A002,R051,02-00-00,59 ST,7065493,2394422,2019-05-20,7063983.0,2393895.0,Monday
4,A002,R051,02-00-00,59 ST,7067101,2394950,2019-05-21,7065493.0,2394422.0,Tuesday
5,A002,R051,02-00-00,59 ST,7068769,2395465,2019-05-22,7067101.0,2394950.0,Wednesday
6,A002,R051,02-00-00,59 ST,7070289,2395978,2019-05-23,7068769.0,2395465.0,Thursday
7,A002,R051,02-00-00,59 ST,7071830,2396411,2019-05-24,7070289.0,2395978.0,Friday
8,A002,R051,02-00-00,59 ST,7072639,2396608,2019-05-25,7071830.0,2396411.0,Saturday
9,A002,R051,02-00-00,59 ST,7073233,2396759,2019-05-26,7072639.0,2396608.0,Sunday
10,A002,R051,02-00-00,59 ST,7073967,2396939,2019-05-27,7073233.0,2396759.0,Monday


In [34]:
# Daily traffic is the difference between a row's counter reading and the
# previous reading for the same turnstile.
mta_by_date['daily_entries'] = mta_by_date['ENTRIES'].sub(mta_by_date['PREV_ENTRIES'])
mta_by_date['daily_exits'] = mta_by_date['EXITS'].sub(mta_by_date['PREV_EXITS'])
mta_by_date.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,ENTRIES,EXITS,DATE,PREV_ENTRIES,PREV_EXITS,weekday,daily_entries,daily_exits
1,A002,R051,02-00-00,59 ST,7063280,2393719,2019-05-18,7062481.0,2393489.0,Saturday,799.0,230.0
2,A002,R051,02-00-00,59 ST,7063983,2393895,2019-05-19,7063280.0,2393719.0,Sunday,703.0,176.0
3,A002,R051,02-00-00,59 ST,7065493,2394422,2019-05-20,7063983.0,2393895.0,Monday,1510.0,527.0
4,A002,R051,02-00-00,59 ST,7067101,2394950,2019-05-21,7065493.0,2394422.0,Tuesday,1608.0,528.0
5,A002,R051,02-00-00,59 ST,7068769,2395465,2019-05-22,7067101.0,2394950.0,Wednesday,1668.0,515.0


In [35]:
# Persist the de-duplicated readings and the per-day table as pickle files.
for frame, pickle_path in (
    (mta, 'turnstiles_full.pickle'),
    (mta_by_date, 'turnstiles_by_day.pickle'),
):
    frame.to_pickle(pickle_path)