In [53]:
import numpy as np
import pandas as pd
import matplotlib as plt

from matplotlib import pylab, mlab, pyplot
def _one_decimal(value):
    """Render a float with exactly one digit after the decimal point."""
    return '%.1f' % value


# Notebook-wide pandas display settings: floats are printed through
# _one_decimal, precision is set to 3, at most 100 rows are printed, and the
# number of displayed columns is unlimited.
pd.set_option("display.float_format", _one_decimal)
pd.set_option("display.precision", 3)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None)

In [3]:
# Load one sample week of turnstile readings (file turnstile_190907.txt)
# directly from the MTA developer site into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="http://web.mta.info/developers/data/nyct/turnstile/turnstile_190907.txt"
)

In [4]:
# Preview the first five rows of the freshly loaded data.
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,00:00:00,REGULAR,7183242,2433142
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,04:00:00,REGULAR,7183258,2433149
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,08:00:00,REGULAR,7183278,2433176
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,12:00:00,REGULAR,7183393,2433262
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/31/2019,16:00:00,REGULAR,7183572,2433312


## cleaning data: 

In [5]:
# Build a single timestamp column from the separate DATE and TIME strings.
# The raw file stores dates as MM/DD/YYYY and times as HH:MM:SS (see the
# preview above); stating the format explicitly avoids format inference and
# removes any day/month ambiguity. A value that does not match raises instead
# of being parsed by guesswork.
df["DATETIME"] = pd.to_datetime(
    df["DATE"] + " " + df["TIME"], format="%m/%d/%Y %H:%M:%S"
)

In [12]:
# Replace the DATE strings with datetime values that carry only the date
# (time component at midnight), which makes later date-based filtering easier.
# The raw dates are MM/DD/YYYY, so the format is given explicitly rather than
# inferred.
df["DATE"] = pd.to_datetime(df["DATE"], format="%m/%d/%Y")

In [16]:
# Preview the first rows again to check the converted DATE column and the new
# DATETIME column.
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-08-31,00:00:00,REGULAR,7183242,2433142,2019-08-31 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-08-31,04:00:00,REGULAR,7183258,2433149,2019-08-31 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-08-31,08:00:00,REGULAR,7183278,2433176,2019-08-31 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-08-31,12:00:00,REGULAR,7183393,2433262,2019-08-31 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-08-31,16:00:00,REGULAR,7183572,2433312,2019-08-31 16:00:00


In [18]:
# Clean up the column headers: the raw file pads the EXITS header with
# trailing spaces. Stripping whitespace from every column name fixes this
# without depending on the exact number of padding characters (a rename keyed
# on the exact padded string silently does nothing if the padding differs).
# The operation is idempotent, so re-running the cell is harmless.
df.columns = df.columns.str.strip()

In [19]:
# The ENTRIES/EXITS readings are cumulative, so the traffic in each period is
# the difference between consecutive readings of the same turnstile.
# Group once by the turnstile key (C/A, UNIT, SCP) and call GroupBy.diff
# directly: diff is a row-wise transformation, not an aggregation, so it does
# not belong in .agg(). The first reading of each turnstile has no predecessor
# and therefore comes out as NaN.
# The double-bracket selection keeps each result a one-column DataFrame, so it
# can still be indexed as real_entries["ENTRIES"] / real_exits["EXITS"].
turnstile_readings = df.groupby(["C/A", "UNIT", "SCP"])
real_entries = turnstile_readings[["ENTRIES"]].diff()
real_exits = turnstile_readings[["EXITS"]].diff()

The real entries and exits above were found by applying the pandas function "diff" within each group. The entries and exits are cumulative counter readings for all time, so only the difference between each value and the value before it gives the entries in that time period. Per the pandas documentation for 'diff': "Calculates the difference of a DataFrame element compared with another element in the DataFrame (default is the element in the same column of the previous row)." This is exactly what we wanted to do. We broke the data up by individual turnstile first, because we only want each reading to be subtracted from the entry/exit data of its own turnstile. Therefore, the first value for each turnstile, which has nothing to subtract from (no preceding value), returns NaN.

In [20]:
# Store the per-period differences on the main DataFrame as two new columns.
df["REAL_ENTRIES"], df["REAL_EXITS"] = real_entries["ENTRIES"], real_exits["EXITS"]

## exploring data: 

In [30]:
# Look for outliers: the 25 largest per-period entry counts.
# nlargest returns the top values in descending order without sorting the
# whole column first.
df["REAL_ENTRIES"].nlargest(25)

73163     718560745.0
116104        92258.0
116895         7625.0
116877         7308.0
3095           2872.0
3089           2746.0
3101           2719.0
96607          2599.0
3173           2598.0
3185           2559.0
184356         2532.0
50270          2520.0
3107           2512.0
3179           2482.0
96613          2479.0
3137           2475.0
96601          2467.0
50276          2454.0
50264          2441.0
116408         2438.0
50318          2403.0
1373           2398.0
4873           2379.0
50312          2372.0
4915           2367.0
Name: REAL_ENTRIES, dtype: float64

In [31]:
# Inspect the full row holding the highest REAL_ENTRIES value
# (index 73163, taken from the top of the sorted output above).
df.loc[73163]

C/A                            N205
UNIT                           R195
SCP                        02-00-00
STATION             161/YANKEE STAD
LINENAME                        BD4
DIVISION                        IND
DATE            2019-09-06 00:00:00
TIME                       12:22:00
DESC                        REGULAR
ENTRIES                   721441289
EXITS                    1895802233
DATETIME        2019-09-06 12:22:00
REAL_ENTRIES            7.18561e+08
REAL_EXITS              1.88641e+09
Name: 73163, dtype: object

Even though there is probably a lot of foot traffic near Yankee Stadium,
over 700 million people in 4 hours is beyond unrealistic.
Its exits value is also absurdly high, indicating a malfunction at this time.
We are deciding to remove this data point.

In [32]:
# Inspect the row with the second-highest REAL_ENTRIES value
# (index 116104, roughly 92,000 entries in one reading).
df.loc[116104]

C/A                           PTH03
UNIT                           R552
SCP                        00-00-00
STATION              JOURNAL SQUARE
LINENAME                          1
DIVISION                        PTH
DATE            2019-09-01 00:00:00
TIME                       15:50:02
DESC                        REGULAR
ENTRIES                      126651
EXITS                        126869
DATETIME        2019-09-01 15:50:02
REAL_ENTRIES                  92258
REAL_EXITS                   110916
Name: 116104, dtype: object

In [33]:
# Inspect the row with the third-highest REAL_ENTRIES value (index 116895).
df.loc[116895]

C/A                           PTH03
UNIT                           R552
SCP                        00-01-08
STATION              JOURNAL SQUARE
LINENAME                          1
DIVISION                        PTH
DATE            2019-09-06 00:00:00
TIME                       12:03:31
DESC                        REGULAR
ENTRIES                        7626
EXITS                          6509
DATETIME        2019-09-06 12:03:31
REAL_ENTRIES                   7625
REAL_EXITS                     6509
Name: 116895, dtype: object

In [36]:
# Inspect the row with the fourth-highest REAL_ENTRIES value (index 116877).
df.loc[116877]

C/A                           PTH03
UNIT                           R552
SCP                        00-01-08
STATION              JOURNAL SQUARE
LINENAME                          1
DIVISION                        PTH
DATE            2019-09-03 00:00:00
TIME                       12:39:31
DESC                        REGULAR
ENTRIES                        7309
EXITS                          6264
DATETIME        2019-09-03 12:39:31
REAL_ENTRIES                   7308
REAL_EXITS                     6264
Name: 116877, dtype: object

All 3 of the above data points are for the same station, with single-reading (4-hour) riderships of ~90,000, ~7,000 and ~7,000 respectively, far above the next-highest readings, which are all in the range of ~2,000. Furthermore, they jump up by this much all at once -- for the two ~7,000 readings, the total entries/exits for that turnstile are essentially equal to the real_entries/real_exits (the 4-hour change), hence the turnstile probably reset just before that. These data points should also be eliminated.

In [67]:
# Same outlier check for exits: the 25 largest per-period exit counts.
# nlargest returns the top values in descending order without sorting the
# whole column first.
df["REAL_EXITS"].nlargest(25)

73163    1886405893.0
116104       110916.0
116895         6509.0
116877         6264.0
97422          5598.0
52968          4876.0
53010          4477.0
52950          4263.0
52962          4218.0
52956          4203.0
125525         4019.0
125536         3907.0
159575         3813.0
52998          3772.0
52992          3691.0
53004          3565.0
159574         3506.0
116569         3348.0
98533          3280.0
53052          3242.0
159029         3215.0
76941          3210.0
195022         3161.0
195034         3152.0
195016         3151.0
Name: REAL_EXITS, dtype: float64

The rows with the highest exits are the same as those with the highest entries above. The first 2 definitely seem anomalous. However, here the next 2 (rows 116895 and 116877) are not far off from the rest of the data at all. Looking at the rows before and after the 2nd-highest value (row 116104):

In [74]:
# Show row 116104 together with the readings immediately before and after it.
df.loc[116103:116105]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,REAL_ENTRIES,REAL_EXITS
116103,PTH03,R552,00-00-00,JOURNAL SQUARE,1,PTH,2019-09-01,13:05:39,REGULAR,34393,15953,2019-09-01 13:05:39,73.0,17.0
116104,PTH03,R552,00-00-00,JOURNAL SQUARE,1,PTH,2019-09-01,15:50:02,REGULAR,126651,126869,2019-09-01 15:50:02,92258.0,110916.0
116105,PTH03,R552,00-00-00,JOURNAL SQUARE,1,PTH,2019-09-01,17:17:39,REGULAR,34450,15975,2019-09-01 17:17:39,-92201.0,-110894.0


It's clear that row 116104 should be removed, as it causes the next entry to dip into the negative, which is impossible. Obviously, negative data points will need to be removed as well, but more on that later.

looking at rows surrounding 3rd highest (row 116895):

In [73]:
# Show row 116895 together with the readings immediately before and after it.
df.loc[116894:116896]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,REAL_ENTRIES,REAL_EXITS
116894,PTH03,R552,00-01-08,JOURNAL SQUARE,1,PTH,2019-09-06,09:36:51,REGULAR,1,0,2019-09-06 09:36:51,-7577.0,-6487.0
116895,PTH03,R552,00-01-08,JOURNAL SQUARE,1,PTH,2019-09-06,12:03:31,REGULAR,7626,6509,2019-09-06 12:03:31,7625.0,6509.0
116896,PTH03,R552,00-01-08,JOURNAL SQUARE,1,PTH,2019-09-06,16:15:31,REGULAR,7639,6536,2019-09-06 16:15:31,13.0,27.0


So, this indicates that the high reading of ~7,000 at row 116895 occurred only because the row that preceded it was negative (the cumulative counter had dropped to 1 entry and 0 exits in row 116894). Hence it should also be removed.

In [71]:
# List the first 100 rows recorded for JOURNAL SQUARE to see what its typical
# per-reading values look like.
df[df["STATION"] == "JOURNAL SQUARE"].head(100)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,REAL_ENTRIES,REAL_EXITS
116095,PTH03,R552,00-00-00,JOURNAL SQUARE,1,PTH,2019-08-31,03:29:39,REGULAR,34042,15843,2019-08-31 03:29:39,,
116096,PTH03,R552,00-00-00,JOURNAL SQUARE,1,PTH,2019-08-31,07:41:39,REGULAR,34062,15866,2019-08-31 07:41:39,20.0,23.0
116097,PTH03,R552,00-00-00,JOURNAL SQUARE,1,PTH,2019-08-31,11:53:39,REGULAR,34132,15886,2019-08-31 11:53:39,70.0,20.0
116098,PTH03,R552,00-00-00,JOURNAL SQUARE,1,PTH,2019-08-31,16:05:39,REGULAR,34217,15904,2019-08-31 16:05:39,85.0,18.0
116099,PTH03,R552,00-00-00,JOURNAL SQUARE,1,PTH,2019-08-31,20:17:39,REGULAR,34279,15916,2019-08-31 20:17:39,62.0,12.0
116100,PTH03,R552,00-00-00,JOURNAL SQUARE,1,PTH,2019-09-01,00:29:39,REGULAR,34298,15925,2019-09-01 00:29:39,19.0,9.0
116101,PTH03,R552,00-00-00,JOURNAL SQUARE,1,PTH,2019-09-01,04:41:39,REGULAR,34298,15929,2019-09-01 04:41:39,0.0,4.0
116102,PTH03,R552,00-00-00,JOURNAL SQUARE,1,PTH,2019-09-01,08:53:39,REGULAR,34320,15936,2019-09-01 08:53:39,22.0,7.0
116103,PTH03,R552,00-00-00,JOURNAL SQUARE,1,PTH,2019-09-01,13:05:39,REGULAR,34393,15953,2019-09-01 13:05:39,73.0,17.0
116104,PTH03,R552,00-00-00,JOURNAL SQUARE,1,PTH,2019-09-01,15:50:02,REGULAR,126651,126869,2019-09-01 15:50:02,92258.0,110916.0


Looking at general ridership for Journal Square, it is a station with very low ridership, with numbers usually not exceeding double digits. The values of ~7,000 must be deleted.

As a rule, then, based on the last several test cases, even if a data point looks to fall within a normal range at first glance, it may be anomalous relative to its station's normal values and caused only by a turnstile reset/error. The context that comes from looking at surrounding data points makes this clear.

One solution could be to remove negative data points, and then remove the two adjacent entries. As shown in the two examples above, sometimes a negative value is preceded by a super high value that caused its error, and sometimes it causes a super high value after it, because it is itself the error or was caused by a turnstile reset. Deleting all 3 points could be a reasonable move just to account for all possible cases. The amount of data lost will be insignificant compared to how drastically the error values would offset the means.

In [78]:
# Inspect the next-highest REAL_EXITS row after the Journal Square rows
# (index 97422, from the sorted exits output above).
df.loc[97422]

C/A                            N505
UNIT                           R022
SCP                        02-00-02
STATION             34 ST-HERALD SQ
LINENAME                   BDFMNQRW
DIVISION                        IND
DATE            2019-09-06 00:00:00
TIME                       16:00:00
DESC                        REGULAR
ENTRIES                     7515309
EXITS                       3880256
DATETIME        2019-09-06 16:00:00
REAL_ENTRIES                  503.0
REAL_EXITS                   5598.0
Name: 97422, dtype: object

In [84]:
# Show the rows labelled 97415 through 97430 to see the readings surrounding
# row 97422.
df.loc[97415:97430]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATETIME,REAL_ENTRIES,REAL_EXITS
97415,N505,R022,02-00-02,34 ST-HERALD SQ,BDFMNQRW,IND,2019-08-31,16:00:00,REGULAR,7513913,3874097,2019-08-31 16:00:00,190.0,190.0
97416,N505,R022,02-00-02,34 ST-HERALD SQ,BDFMNQRW,IND,2019-08-31,20:00:00,REGULAR,7514166,3874253,2019-08-31 20:00:00,253.0,156.0
97417,N505,R022,02-00-02,34 ST-HERALD SQ,BDFMNQRW,IND,2019-09-01,00:00:00,REGULAR,7514394,3874341,2019-09-01 00:00:00,228.0,88.0
97418,N505,R022,02-00-02,34 ST-HERALD SQ,BDFMNQRW,IND,2019-09-01,04:00:00,REGULAR,7514460,3874366,2019-09-01 04:00:00,66.0,25.0
97419,N505,R022,02-00-02,34 ST-HERALD SQ,BDFMNQRW,IND,2019-09-01,08:00:00,REGULAR,7514488,3874395,2019-09-01 08:00:00,28.0,29.0
97420,N505,R022,02-00-02,34 ST-HERALD SQ,BDFMNQRW,IND,2019-09-01,12:00:00,REGULAR,7514641,3874506,2019-09-01 12:00:00,153.0,111.0
97421,N505,R022,02-00-02,34 ST-HERALD SQ,BDFMNQRW,IND,2019-09-01,16:00:00,REGULAR,7514806,3874658,2019-09-01 16:00:00,165.0,152.0
97422,N505,R022,02-00-02,34 ST-HERALD SQ,BDFMNQRW,IND,2019-09-06,16:00:00,REGULAR,7515309,3880256,2019-09-06 16:00:00,503.0,5598.0
97423,N505,R022,02-00-02,34 ST-HERALD SQ,BDFMNQRW,IND,2019-09-06,20:00:00,REGULAR,7516273,3880501,2019-09-06 20:00:00,964.0,245.0
97424,N505,R022,02-00-03,34 ST-HERALD SQ,BDFMNQRW,IND,2019-08-31,00:00:00,REGULAR,9247728,5598915,2019-08-31 00:00:00,,


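So, ~5000 is high for this station, but not super (orders of magnitude) far off from its regular ridership, which gets up into the thousands. It is not preceded or followed by any negative values or obvious errors, and the entry data given by the turnstile at the same time was a normal value. Note, however, that the reading before it (row 97421) is from 2019-09-01 16:00, so this difference covers a gap of about five days of missing readings rather than a single 4-hour period. So, I'd make the case to leave this data point alone.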

In [88]:
# Check the other end of the distribution: the 25 lowest per-period entry and
# exit counts. nsmallest returns them in ascending order without sorting the
# whole column first.
df["REAL_ENTRIES"].nsmallest(25), df["REAL_EXITS"].nsmallest(25)

(193155   -12409434.0
 193323    -8327252.0
 20539     -6346214.0
 193407    -5369163.0
 81155     -4584498.0
 19507     -1599300.0
 36983      -480406.0
 101571     -350664.0
 46403      -202825.0
 65480      -101194.0
 116105      -92201.0
 37108       -86120.0
 37006       -74424.0
 122458      -66281.0
 71539       -61315.0
 131361      -41994.0
 15624       -36550.0
 120801      -18344.0
 121523       -9925.0
 125926       -8925.0
 141693       -7688.0
 116894       -7577.0
 116876       -7271.0
 11542        -1665.0
 11530        -1651.0
 Name: REAL_ENTRIES, dtype: float64,
 193407   -14768030.0
 193155    -9983375.0
 193323    -8464033.0
 81155     -1161890.0
 19507     -1109372.0
 20539     -1010106.0
 46403      -421614.0
 15624      -251664.0
 101571     -242022.0
 36983      -141218.0
 71539      -137066.0
 131361     -129645.0
 37108      -114594.0
 116105     -110894.0
 122458      -35934.0
 65480       -21533.0
 37006       -15190.0
 141693      -13386.0
 125926      -100

negative data points need to be deleted, as well as the row that immediately precedes and immediately follows each. 

## filtering data: 

To mitigate error, we have chosen to simply delete rows with erroneous entry or exit data, as well as the 2 surrounding rows, one of which will also hold erroneous data. Given that the scope of data we are looking at is the sum of ridership over 3 months, the effect of losing these occasional entries is minute.

In [93]:
# List the rows whose per-period entries or exits are negative. These rows,
# plus the rows adjacent to them, are the candidates for removal. This cell
# only displays them; nothing is dropped from df here.

df[df["REAL_ENTRIES"].lt(0)], df[df["REAL_EXITS"].lt(0)]

(         C/A  UNIT       SCP          STATION LINENAME DIVISION       DATE  \
 1597    A011  R080  01-03-00       57 ST-7 AV     NQRW      BMT 2019-08-31   
 1598    A011  R080  01-03-00       57 ST-7 AV     NQRW      BMT 2019-08-31   
 1599    A011  R080  01-03-00       57 ST-7 AV     NQRW      BMT 2019-08-31   
 1600    A011  R080  01-03-00       57 ST-7 AV     NQRW      BMT 2019-08-31   
 1601    A011  R080  01-03-00       57 ST-7 AV     NQRW      BMT 2019-08-31   
 ...      ...   ...       ...              ...      ...      ...        ...   
 202994  R730  R431  00-00-04  EASTCHSTER/DYRE        5      IRT 2019-09-06   
 202995  R730  R431  00-00-04  EASTCHSTER/DYRE        5      IRT 2019-09-06   
 202996  R730  R431  00-00-04  EASTCHSTER/DYRE        5      IRT 2019-09-06   
 202997  R730  R431  00-00-04  EASTCHSTER/DYRE        5      IRT 2019-09-06   
 202998  R730  R431  00-00-04  EASTCHSTER/DYRE        5      IRT 2019-09-06   
 
             TIME     DESC     ENTRIES       EXITS