In [1]:
from dateutil import parser
from datetime import datetime
import numpy as np
import pandas as pd
import pickle

# Read the previously pickled turnstile table back into memory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted source.
with open('data_import.pickle', 'rb') as pickle_handle:
    mta = pickle.load(pickle_handle)

# Preview the first rows to confirm the load worked.
mta.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/08/2019,00:00:00,REGULAR,7089463,2401758
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/08/2019,04:00:00,REGULAR,7089491,2401764
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/08/2019,08:00:00,REGULAR,7089511,2401807
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/08/2019,12:00:00,REGULAR,7089598,2401889
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/08/2019,16:00:00,REGULAR,7089801,2401946


In [2]:
# Remove leading/trailing whitespace from every column label so later
# column lookups by exact name succeed.
trimmed_labels = mta.columns.str.strip()
mta.columns = trimmed_labels

In [3]:
# Combine the DATE and TIME string columns into a single datetime column.
# The strings look like '06/08/2019 00:00:00' (month/day/year), so parse them
# in one vectorized call with an explicit format instead of guessing the
# format row by row with dateutil.
mta['date_time'] = pd.to_datetime(
    mta['DATE'] + ' ' + mta['TIME'],
    format='%m/%d/%Y %H:%M:%S',
)

# Confirm that date_time is now datetime64[ns].
mta.dtypes

C/A                  object
UNIT                 object
SCP                  object
STATION              object
LINENAME             object
DIVISION             object
DATE                 object
TIME                 object
DESC                 object
ENTRIES               int64
EXITS                 int64
date_time    datetime64[ns]
dtype: object

In [4]:
# Columns that identify one turnstile at one timestamp; every combination
# is expected to appear only once.
keys = ['C/A', 'UNIT', 'SCP', 'STATION', 'date_time']

# Count the rows recorded for each key combination and list the most
# repeated combinations first to expose duplicates.
rows_per_key = mta.groupby(keys)['ENTRIES'].count().reset_index()
rows_per_key.sort_values('ENTRIES', ascending=False).head(15)

Unnamed: 0,C/A,UNIT,SCP,STATION,date_time,ENTRIES
386057,N045,R187,01-00-01,81 ST-MUSEUM,2019-06-02 17:00:00,2
1158130,R174,R034,00-00-02,125 ST,2019-06-08 00:00:00,2
386394,N045,R187,01-00-02,81 ST-MUSEUM,2019-06-02 17:00:00,2
386389,N045,R187,01-00-02,81 ST-MUSEUM,2019-06-01 17:00:00,2
386787,N045,R187,01-06-00,81 ST-MUSEUM,2019-06-11 21:00:00,2
432507,N071,R013,00-00-04,34 ST-PENN STA,2019-04-20 08:00:00,2
236238,G009,R151,02-00-03,CONEY IS-STILLW,2019-05-16 17:00:00,2
386732,N045,R187,01-06-00,81 ST-MUSEUM,2019-06-02 17:00:00,2
431152,N071,R013,00-00-00,34 ST-PENN STA,2019-04-20 08:00:00,2
1158468,R174,R034,00-00-03,125 ST,2019-06-08 00:00:00,2


In [5]:
# There appear to be several duplicates. As an example, look at every reading
# of one turnstile at 125 ST on 2019-06-08.
example_day = datetime(2019, 6, 8).date()
mask = (
    mta["C/A"].eq("R174")
    & mta["UNIT"].eq("R034")
    & mta["SCP"].eq("00-00-03")
    & mta["STATION"].eq("125 ST")
    & mta["date_time"].dt.date.eq(example_day)
)

mta[mask].head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,date_time
145455,R174,R034,00-00-03,125 ST,1,IRT,06/08/2019,00:00:00,REGULAR,3828801,2853462,2019-06-08 00:00:00
145456,R174,R034,00-00-03,125 ST,1,IRT,06/08/2019,00:00:00,RECOVR AUD,3828804,2853463,2019-06-08 00:00:00
145457,R174,R034,00-00-03,125 ST,1,IRT,06/08/2019,08:00:00,REGULAR,3828931,2853610,2019-06-08 08:00:00
145458,R174,R034,00-00-03,125 ST,1,IRT,06/08/2019,12:00:00,REGULAR,3829164,2853787,2019-06-08 12:00:00
145459,R174,R034,00-00-03,125 ST,1,IRT,06/08/2019,16:00:00,REGULAR,3829514,2854060,2019-06-08 16:00:00


In [6]:
# Drop duplicate readings so each turnstile/timestamp combination occurs once.
# Reassigning (instead of inplace=True) avoids mutating a frame that earlier
# cells already displayed.
# NOTE(review): this keeps whichever row appears first for a key; in the
# example above that is the REGULAR reading rather than RECOVR AUD -- confirm
# that this is the intended choice.
mta = mta.drop_duplicates(subset=keys)

# Guard against silent regressions: no key combination may repeat any more.
assert not mta.duplicated(subset=keys).any(), "duplicate turnstile/date_time rows remain"

# Re-run the per-key count; the maximum should now be 1.
(mta
 .groupby(keys)
 .ENTRIES.count()
 .reset_index()
 .sort_values("ENTRIES", ascending=False)).head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,date_time,ENTRIES
0,A002,R051,02-00-00,59 ST,2019-04-20 00:00:00,1
1093705,R142,R293,01-00-03,34 ST-PENN STA,2019-04-26 18:00:00,1
1093715,R142,R293,01-00-03,34 ST-PENN STA,2019-04-28 10:00:00,1
1093714,R142,R293,01-00-03,34 ST-PENN STA,2019-04-28 06:00:00,1
1093713,R142,R293,01-00-03,34 ST-PENN STA,2019-04-28 02:00:00,1


In [7]:
# For every turnstile/date combination take the first ENTRIES and EXITS values
# in row order.
# NOTE(review): this is "the first reported time" only if the rows of `mta`
# are in chronological order within each turnstile and date -- confirm.
# The columns are selected with a list (double brackets): selecting several
# columns from a groupby with a bare tuple is deprecated and fails on recent
# pandas versions.
mta_by_date = mta.groupby(['C/A', 'UNIT', 'SCP', 'STATION', 'DATE'],
                          as_index=False)[['ENTRIES', 'EXITS']].first()
mta_by_date.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,04/20/2019,7026702,2382234
1,A002,R051,02-00-00,59 ST,04/21/2019,7027457,2382455
2,A002,R051,02-00-00,59 ST,04/22/2019,7028053,2382642
3,A002,R051,02-00-00,59 ST,04/23/2019,7029313,2383025
4,A002,R051,02-00-00,59 ST,04/24/2019,7030821,2383462


In [None]:
# Add the previous row's date and counter values for the same turnstile as
# PREV_DATE / PREV_ENTRIES / PREV_EXITS.
# groupby(...).shift(1) moves each turnstile's rows down by one, so the first
# row of every turnstile gets missing values.
# NOTE(review): "previous" means the previous row for that turnstile, which is
# the previous calendar day only if no day is missing -- confirm.
# Columns are selected with a list; a bare tuple of column names is
# deprecated and fails on recent pandas versions.
previous_readings = (
    mta_by_date
    .groupby(["C/A", "UNIT", "SCP", "STATION"])[["DATE", "ENTRIES", "EXITS"]]
    .shift(1)
)
mta_by_date[["PREV_DATE", "PREV_ENTRIES", "PREV_EXITS"]] = previous_readings

mta_by_date.head()

In [None]:
# The first row of each turnstile has no earlier reading to compare against
# (its PREV_DATE is missing), so discard those rows.
mta_by_date.dropna(axis="index", subset=["PREV_DATE"], inplace=True)

In [None]:
# Remove rows whose cumulative counters went down relative to the previous
# reading (likely errors), after reporting how many rows are affected.
print(mta_by_date[mta_by_date["ENTRIES"] < mta_by_date["PREV_ENTRIES"]].count())
print(mta_by_date[mta_by_date["EXITS"] < mta_by_date["PREV_EXITS"]].count())

# Keep only rows where both counters are non-decreasing. Taking an explicit
# copy makes the result an independent frame, so later column assignments do
# not act on a slice of the unfiltered data.
entries_non_decreasing = mta_by_date["ENTRIES"] >= mta_by_date["PREV_ENTRIES"]
exits_non_decreasing = mta_by_date["EXITS"] >= mta_by_date["PREV_EXITS"]
mta_by_date = mta_by_date[entries_non_decreasing & exits_non_decreasing].copy()
mta_by_date.head()

In [None]:
# Lookup from pandas weekday numbers (Monday=0 ... Sunday=6) to English names.
dayOfWeek={0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}

# Drop the old DATE column, relabel PREV_DATE as the new DATE, parse it into a
# real datetime and derive the weekday name from it.
# The chain builds a new frame instead of mutating in place, so it does not
# modify a filtered slice, and the weekday is computed with the vectorized
# .dt accessor rather than a per-row apply.
# NOTE(review): index=str converts the row labels to strings; kept because the
# original did so -- confirm whether anything relies on it.
mta_by_date = (
    mta_by_date
    .drop(columns='DATE')
    .rename(index=str, columns={'PREV_DATE': 'DATE'})
    .assign(DATE=lambda frame: pd.to_datetime(frame['DATE'], format='%m/%d/%Y'))
    .assign(weekday=lambda frame: frame['DATE'].dt.weekday.map(dayOfWeek))
)

mta_by_date.head()

In [None]:
# Daily traffic is the difference between a reading and the previous reading
# of the same cumulative counter; store it for entries and for exits.
mta_by_date['daily_entries'] = mta_by_date['ENTRIES'].sub(mta_by_date['PREV_ENTRIES'])
mta_by_date['daily_exits'] = mta_by_date['EXITS'].sub(mta_by_date['PREV_EXITS'])

mta_by_date.head()

In [None]:
# Inspect suspiciously large daily entry counts: take rows above 2000 and
# show the ten largest.
# NOTE(review): the removal step uses a cutoff of 20000, not 2000 -- confirm
# the threshold here is intentional.
exceeds_threshold = mta_by_date['daily_entries'] > 2000
outliers = mta_by_date.loc[exceeds_threshold].sort_values(by='daily_entries', ascending=False)
outliers.head(10)

In [None]:
# Keep only rows whose daily entries AND daily exits are both strictly below
# the cutoff; everything else is treated as an outlier and removed.
max_daily_count = 20000
entries_in_range = mta_by_date['daily_entries'].lt(max_daily_count)
exits_in_range = mta_by_date['daily_exits'].lt(max_daily_count)
mta_by_date = mta_by_date[entries_in_range & exits_in_range]

In [None]:
# Persist both tables: the de-duplicated readings and the per-day summary.
for frame_to_save, pickle_path in (
    (mta, 'turnstiles_full.pickle'),
    (mta_by_date, 'turnstiles_by_day.pickle'),
):
    frame_to_save.to_pickle(pickle_path)