In [1]:
import sys
print('Python version: ', sys.version)

Python version:  3.6.10 |Anaconda, Inc.| (default, Mar 25 2020, 18:53:43) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium

## Load Data

In [3]:
def load_data(files):
    raw = pd.concat(pd.read_csv(f) for f in files)
    raw.columns = [c.strip() for c in raw.columns] # remove white spaces
    return raw
raw_df = load_data([
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200418.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200411.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200404.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200328.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200321.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200314.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200307.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200229.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200222.txt'])
raw_df

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/11/2020,00:00:00,REGULAR,7414097,2517286
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/11/2020,04:00:00,REGULAR,7414098,2517286
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/11/2020,08:00:00,REGULAR,7414106,2517296
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/11/2020,12:00:00,REGULAR,7414115,2517310
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/11/2020,16:00:00,REGULAR,7414134,2517316
...,...,...,...,...,...,...,...,...,...,...,...
206178,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,02/21/2020,04:00:00,REGULAR,5554,420
206179,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,02/21/2020,08:00:00,REGULAR,5554,420
206180,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,02/21/2020,12:00:00,REGULAR,5554,420
206181,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,02/21/2020,16:00:00,REGULAR,5554,420


## Data Cleaning

Column names cleaned

In [4]:
raw_df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

 C/A and SCP defines a turnstile

In [5]:
# count unique UNIT and STATION under same C/A and SCP
unique = raw_df[['C/A', 'SCP','UNIT','STATION']].groupby(['C/A', 'SCP']).nunique()
unique.loc[(unique['STATION'] != 1) | (unique['UNIT'] != 1)] # any unexpected value?

Unnamed: 0_level_0,Unnamed: 1_level_0,C/A,SCP,UNIT,STATION
C/A,SCP,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


Any duplication?

In [6]:
raw_df.loc[raw_df.duplicated(['DATE', 'TIME', 'C/A', 'SCP'])]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
66192,N120A,R153,01-00-00,UTICA AV,AC,IND,04/17/2020,05:00:00,RECOVR AUD,3377209,2331552
32401,H009,R235,00-03-00,BEDFORD AV,L,BMT,03/22/2020,12:00:00,RECOVR AUD,19676995,30044802
54409,N070,R012,04-00-02,34 ST-PENN STA,ACE,IND,02/25/2020,15:00:00,RECOVR AUD,9501259,10080888


The turnstiles submit “Regular” readings every four hours. The exact time is staggered

In [7]:
regular_df = raw_df.loc[raw_df['DESC'] == 'REGULAR']
readingtime = regular_df.groupby(['TIME']).size().reset_index(name='count')
readingtime.sort_values('count',ascending=False).head(10)

Unnamed: 0,TIME,count
37279,20:00:00,143102
29795,16:00:00,143076
22341,12:00:00,142895
14816,08:00:00,142881
7333,04:00:00,142859
0,00:00:00,142047
24207,13:00:00,78146
39172,21:00:00,78097
16718,09:00:00,78096
31658,17:00:00,78085


### Cleaning Daily entrances and exits

Compute counter difference between each reading time

for example:

DATE        TIME     ENTRIES ENTRIES_DIFF

03/28/2020	00:00:00 e0      e1-e0

03/28/2020	04:00:00 e1      e2-e1

03/28/2020	08:00:00 e2      e3-e2

03/28/2020	12:00:00 e3      e4-e3

03/28/2020	16:00:00 e4      e5-e4

03/28/2020	20:00:00 e5      e6-e5

03/29/2020	00:00:00 e6      e7-e6

The entries of 03/28/2020 is sum(ENTRIES')where DATE is 03/28/2020

If the next row is records another turnstile, then drop the last term

In [8]:
reading_diffs = regular_df.sort_values(['C/A', 'SCP','UNIT','STATION','DATE', 'TIME'])[[
    'C/A', 'SCP','UNIT','STATION','DATE', 'TIME', 'ENTRIES', 'EXITS']]
reading_diffs['ENTRIES_DIFF'] = reading_diffs['ENTRIES'].diff(periods=-1)*-1 # Difference with following row
reading_diffs['EXITS_DIFF'] = reading_diffs['EXITS'].diff(periods=-1)*-1 # Difference with following row
reading_diffs = reading_diffs.fillna(0) # Fill last line with 0
# turnstile ID = C/A + SCP
reading_diffs['ID'] = reading_diffs['C/A'] + reading_diffs['SCP']
reading_diffs['NEXT_ID'] = reading_diffs['ID'].shift(-1) # turnstile ID in next row
# turnstile changed, drop this period of difference
reading_diffs['ENTRIES_DIFF'] = np.where(
    reading_diffs['ID'] == reading_diffs['NEXT_ID'], reading_diffs['ENTRIES_DIFF'], 0)
reading_diffs['EXITS_DIFF'] = np.where(
    reading_diffs['ID'] == reading_diffs['NEXT_ID'], reading_diffs['EXITS_DIFF'], 0)
reading_diffs.head(10)

Unnamed: 0,C/A,SCP,UNIT,STATION,DATE,TIME,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,ID,NEXT_ID
2,A002,02-00-00,R051,59 ST,02/15/2020,11:00:00,7379740,2503012,188.0,47.0,A00202-00-00,A00202-00-00
3,A002,02-00-00,R051,59 ST,02/15/2020,15:00:00,7379928,2503059,308.0,73.0,A00202-00-00,A00202-00-00
4,A002,02-00-00,R051,59 ST,02/15/2020,19:00:00,7380236,2503132,136.0,44.0,A00202-00-00,A00202-00-00
5,A002,02-00-00,R051,59 ST,02/15/2020,23:00:00,7380372,2503176,31.0,7.0,A00202-00-00,A00202-00-00
6,A002,02-00-00,R051,59 ST,02/16/2020,03:00:00,7380403,2503183,6.0,9.0,A00202-00-00,A00202-00-00
7,A002,02-00-00,R051,59 ST,02/16/2020,07:00:00,7380409,2503192,42.0,58.0,A00202-00-00,A00202-00-00
8,A002,02-00-00,R051,59 ST,02/16/2020,11:00:00,7380451,2503250,165.0,70.0,A00202-00-00,A00202-00-00
9,A002,02-00-00,R051,59 ST,02/16/2020,15:00:00,7380616,2503320,191.0,61.0,A00202-00-00,A00202-00-00
10,A002,02-00-00,R051,59 ST,02/16/2020,19:00:00,7380807,2503381,156.0,21.0,A00202-00-00,A00202-00-00
11,A002,02-00-00,R051,59 ST,02/16/2020,23:00:00,7380963,2503402,26.0,12.0,A00202-00-00,A00202-00-00


#### Cleaning negative counts
There are still negative differences, for some reasons

In [9]:
negative = reading_diffs[(reading_diffs['ENTRIES_DIFF']<0) | (reading_diffs['EXITS_DIFF']<0)]
negative_bar = negative[negative['ENTRIES_DIFF']<0].sort_values(
    ['ENTRIES_DIFF']).groupby('ENTRIES_DIFF').size().reset_index(name='cnt')
negative_bar['ENTRIES_DIFF'] = negative_bar['ENTRIES_DIFF'] *-1;
negative_bar.head(30)

Unnamed: 0,ENTRIES_DIFF,cnt
0,1054866000.0,1
1,168628000.0,1
2,134858500.0,1
3,56297020.0,1
4,23172680.0,1
5,15209550.0,1
6,14787090.0,1
7,14336130.0,1
8,13681620.0,1
9,13559560.0,1


An example

In [10]:
reading_diffs.loc[reading_diffs['ENTRIES_DIFF'] == -1731]

Unnamed: 0,C/A,SCP,UNIT,STATION,DATE,TIME,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,ID,NEXT_ID
116254,PTH03,00-00-07,R552,JOURNAL SQUARE,03/30/2020,07:10:25,1845,369,-1731.0,-354.0,PTH0300-00-07,PTH0300-00-07


In [11]:
reading_diffs.loc[(reading_diffs['ID'] == 'PTH0300-00-07') & (
    (reading_diffs['DATE'] == '03/31/2020') | (reading_diffs['DATE'] == '03/30/2020'))]

Unnamed: 0,C/A,SCP,UNIT,STATION,DATE,TIME,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,ID,NEXT_ID
116254,PTH03,00-00-07,R552,JOURNAL SQUARE,03/30/2020,07:10:25,1845,369,-1731.0,-354.0,PTH0300-00-07,PTH0300-00-07
116255,PTH03,00-00-07,R552,JOURNAL SQUARE,03/30/2020,11:20:03,114,15,100.0,22.0,PTH0300-00-07,PTH0300-00-07
116256,PTH03,00-00-07,R552,JOURNAL SQUARE,03/30/2020,15:32:03,214,37,77.0,33.0,PTH0300-00-07,PTH0300-00-07
116257,PTH03,00-00-07,R552,JOURNAL SQUARE,03/30/2020,19:44:03,291,70,33.0,23.0,PTH0300-00-07,PTH0300-00-07
116258,PTH03,00-00-07,R552,JOURNAL SQUARE,03/30/2020,23:56:03,324,93,8.0,12.0,PTH0300-00-07,PTH0300-00-07
116259,PTH03,00-00-07,R552,JOURNAL SQUARE,03/31/2020,04:08:03,332,105,-302.0,-105.0,PTH0300-00-07,PTH0300-00-07
116260,PTH03,00-00-07,R552,JOURNAL SQUARE,03/31/2020,08:14:52,30,0,69.0,14.0,PTH0300-00-07,PTH0300-00-07


In some stations, counters are growing negatively

In [12]:
reading_diffs.loc[(reading_diffs['ENTRIES_DIFF']<0) & (reading_diffs['ENTRIES_DIFF']>-1000)].head(10)

Unnamed: 0,C/A,SCP,UNIT,STATION,DATE,TIME,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,ID,NEXT_ID
1608,A011,01-03-00,R080,57 ST-7 AV,02/15/2020,03:00:00,885696627,489935366,-14.0,-94.0,A01101-03-00,A01101-03-00
1609,A011,01-03-00,R080,57 ST-7 AV,02/15/2020,07:00:00,885696613,489935272,-72.0,-385.0,A01101-03-00,A01101-03-00
1610,A011,01-03-00,R080,57 ST-7 AV,02/15/2020,11:00:00,885696541,489934887,-118.0,-507.0,A01101-03-00,A01101-03-00
1611,A011,01-03-00,R080,57 ST-7 AV,02/15/2020,15:00:00,885696423,489934380,-231.0,-399.0,A01101-03-00,A01101-03-00
1612,A011,01-03-00,R080,57 ST-7 AV,02/15/2020,19:00:00,885696192,489933981,-142.0,-223.0,A01101-03-00,A01101-03-00
1613,A011,01-03-00,R080,57 ST-7 AV,02/15/2020,23:00:00,885696050,489933758,-68.0,-69.0,A01101-03-00,A01101-03-00
1614,A011,01-03-00,R080,57 ST-7 AV,02/16/2020,03:00:00,885695982,489933689,-7.0,-69.0,A01101-03-00,A01101-03-00
1615,A011,01-03-00,R080,57 ST-7 AV,02/16/2020,07:00:00,885695975,489933620,-36.0,-342.0,A01101-03-00,A01101-03-00
1616,A011,01-03-00,R080,57 ST-7 AV,02/16/2020,11:00:00,885695939,489933278,-102.0,-405.0,A01101-03-00,A01101-03-00
1617,A011,01-03-00,R080,57 ST-7 AV,02/16/2020,15:00:00,885695837,489932873,-172.0,-397.0,A01101-03-00,A01101-03-00


In [13]:
reading_diffs[(reading_diffs['ENTRIES_DIFF']<0) | (
    reading_diffs['EXITS_DIFF']<0)].groupby(['STATION']).size().reset_index(name='cnt').sort_values(
    'cnt', ascending=False).head(5)

Unnamed: 0,STATION,cnt
25,42 ST-PORT AUTH,1417
77,GRD CNTRL-42 ST,822
23,34 ST-HERALD SQ,761
4,14 ST,759
9,161/YANKEE STAD,738


Remove negative values (threshold may depend on time)

In [14]:
# Entry
reading_diffs['ENTRIES_DIFF'] = np.where(
    reading_diffs['ENTRIES_DIFF'] < -200, # resetted after full
    0, reading_diffs['ENTRIES_DIFF'])
reading_diffs['ENTRIES_DIFF'] = np.where(
    reading_diffs['ENTRIES_DIFF'] < 0, # negative turstiles
    reading_diffs['ENTRIES_DIFF'] * -1, reading_diffs['ENTRIES_DIFF'])
# Exit
reading_diffs['EXITS_DIFF'] = np.where(
    reading_diffs['EXITS_DIFF'] < -200, # resetted after full
    0, reading_diffs['EXITS_DIFF'])
reading_diffs['EXITS_DIFF'] = np.where(
    reading_diffs['EXITS_DIFF'] < 0, # negative turstiles
    reading_diffs['EXITS_DIFF'] * -1, reading_diffs['EXITS_DIFF'])
reading_diffs

Unnamed: 0,C/A,SCP,UNIT,STATION,DATE,TIME,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,ID,NEXT_ID
2,A002,02-00-00,R051,59 ST,02/15/2020,11:00:00,7379740,2503012,188.0,47.0,A00202-00-00,A00202-00-00
3,A002,02-00-00,R051,59 ST,02/15/2020,15:00:00,7379928,2503059,308.0,73.0,A00202-00-00,A00202-00-00
4,A002,02-00-00,R051,59 ST,02/15/2020,19:00:00,7380236,2503132,136.0,44.0,A00202-00-00,A00202-00-00
5,A002,02-00-00,R051,59 ST,02/15/2020,23:00:00,7380372,2503176,31.0,7.0,A00202-00-00,A00202-00-00
6,A002,02-00-00,R051,59 ST,02/16/2020,03:00:00,7380403,2503183,6.0,9.0,A00202-00-00,A00202-00-00
...,...,...,...,...,...,...,...,...,...,...,...,...
206171,TRAM2,00-05-01,R469,RIT-ROOSEVELT,04/17/2020,05:00:00,5554,514,-0.0,-0.0,TRAM200-05-01,TRAM200-05-01
206172,TRAM2,00-05-01,R469,RIT-ROOSEVELT,04/17/2020,09:00:00,5554,514,-0.0,-0.0,TRAM200-05-01,TRAM200-05-01
206173,TRAM2,00-05-01,R469,RIT-ROOSEVELT,04/17/2020,13:00:00,5554,514,-0.0,-0.0,TRAM200-05-01,TRAM200-05-01
206174,TRAM2,00-05-01,R469,RIT-ROOSEVELT,04/17/2020,17:00:00,5554,514,-0.0,-0.0,TRAM200-05-01,TRAM200-05-01


#### Cleaning very large counts

In [15]:
reading_diffs.sort_values('ENTRIES_DIFF', ascending=False).head(10)

Unnamed: 0,C/A,SCP,UNIT,STATION,DATE,TIME,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,ID,NEXT_ID
103805,N534,01-00-02,R220,CARROLL ST,03/14/2020,20:00:00,4685084,107992,2038596000.0,1831885000.0,N53401-00-02,N53401-00-02
137869,R142,01-00-01,R293,34 ST-PENN STA,02/17/2020,13:00:00,13402588,8682442,1140868000.0,1107288000.0,R14201-00-01,R14201-00-01
180611,R416,00-03-01,R245,ST LAWRENCE AV,02/26/2020,11:00:00,3932619,1238537,1002752000.0,300758800.0,R41600-03-01,R41600-03-01
190478,R532,00-00-03,R328,METS-WILLETS PT,04/13/2020,05:00:00,34982,21401,823359400.0,212053100.0,R53200-00-03,R53200-00-03
144653,R162,00-00-02,R166,79 ST,03/03/2020,00:00:00,16784351,2491213,552536000.0,566828900.0,R16200-00-02,R16200-00-02
30583,H001,00-00-00,R175,8 AV,03/11/2020,13:00:00,7481508,6471421,468477200.0,1393622000.0,H00100-00-00,H00100-00-00
103765,N534,01-00-01,R220,CARROLL ST,03/15/2020,00:00:00,349318,11640,414428000.0,0.0,N53401-00-01,N53401-00-01
50259,N056,01-00-03,R188,50 ST,03/13/2020,20:00:00,5646398,9437946,315537900.0,311746300.0,N05601-00-03,N05601-00-03
173220,R314,00-00-02,R406,PROSPECT AV,03/26/2020,09:00:00,325850,96766,66844460.0,16685190.0,R31400-00-02,R31400-00-02
3638,A025,01-06-00,R023,34 ST-HERALD SQ,03/04/2020,15:00:00,11523931,29347119,56298830.0,0.0,A02501-06-00,A02501-06-00


example

In [16]:
reading_diffs.loc[(reading_diffs['ID'] == 'N31600-00-01') & (reading_diffs['DATE'] == '04/04/2020')]

Unnamed: 0,C/A,SCP,UNIT,STATION,DATE,TIME,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,ID,NEXT_ID
81523,N316,00-00-01,R267,46 ST,04/04/2020,01:00:00,158857,40906,-0.0,-0.0,N31600-00-01,N31600-00-01
81524,N316,00-00-01,R267,46 ST,04/04/2020,05:00:00,158857,40906,170683.0,216986.0,N31600-00-01,N31600-00-01
81525,N316,00-00-01,R267,46 ST,04/04/2020,09:00:00,329540,257892,11.0,8.0,N31600-00-01,N31600-00-01
81526,N316,00-00-01,R267,46 ST,04/04/2020,13:00:00,329551,257900,9.0,3.0,N31600-00-01,N31600-00-01
81527,N316,00-00-01,R267,46 ST,04/04/2020,17:00:00,329560,257903,8.0,4.0,N31600-00-01,N31600-00-01
81528,N316,00-00-01,R267,46 ST,04/04/2020,21:00:00,329568,257907,1.0,3.0,N31600-00-01,N31600-00-01


In [17]:
reading_diffs = reading_diffs[reading_diffs.ENTRIES_DIFF < 10000]
reading_diffs = reading_diffs[reading_diffs.EXITS_DIFF < 10000]

### Get Daily Count by Turnstile

In [18]:
daily_diffs = reading_diffs.groupby(['C/A', 'SCP','UNIT','STATION','DATE']).agg(
    {'ENTRIES_DIFF':'sum', 'EXITS_DIFF':'sum'}).reset_index()

In [19]:
daily_diffs.sort_values('ENTRIES_DIFF', ascending=False).head(10)

Unnamed: 0,C/A,SCP,UNIT,STATION,DATE,ENTRIES_DIFF,EXITS_DIFF
240247,R238A,02-00-03,R046,GRD CNTRL-42 ST,02/27/2020,11518.0,164.0
240246,R238A,02-00-03,R046,GRD CNTRL-42 ST,02/26/2020,11320.0,124.0
240239,R238A,02-00-03,R046,GRD CNTRL-42 ST,02/19/2020,11268.0,173.0
240245,R238A,02-00-03,R046,GRD CNTRL-42 ST,02/25/2020,11112.0,154.0
240365,R238A,02-03-00,R046,GRD CNTRL-42 ST,02/19/2020,10921.0,129.0
240371,R238A,02-03-00,R046,GRD CNTRL-42 ST,02/25/2020,10794.0,100.0
240372,R238A,02-03-00,R046,GRD CNTRL-42 ST,02/26/2020,10762.0,110.0
240367,R238A,02-03-00,R046,GRD CNTRL-42 ST,02/21/2020,10689.0,120.0
240238,R238A,02-00-03,R046,GRD CNTRL-42 ST,02/18/2020,10653.0,154.0
240373,R238A,02-03-00,R046,GRD CNTRL-42 ST,02/27/2020,10629.0,109.0


In [20]:
daily_diffs.sort_values('EXITS_DIFF', ascending=False).head(10)

Unnamed: 0,C/A,SCP,UNIT,STATION,DATE,ENTRIES_DIFF,EXITS_DIFF
79146,N063A,00-00-00,R011,42 ST-PORT AUTH,03/06/2020,1505.0,11549.0
79138,N063A,00-00-00,R011,42 ST-PORT AUTH,02/27/2020,1833.0,11202.0
240550,R240,00-00-00,R047,GRD CNTRL-42 ST,02/19/2020,1288.0,11071.0
79139,N063A,00-00-00,R011,42 ST-PORT AUTH,02/28/2020,1618.0,11070.0
79136,N063A,00-00-00,R011,42 ST-PORT AUTH,02/25/2020,1876.0,11056.0
79131,N063A,00-00-00,R011,42 ST-PORT AUTH,02/20/2020,1881.0,10700.0
240558,R240,00-00-00,R047,GRD CNTRL-42 ST,02/27/2020,1295.0,10589.0
240549,R240,00-00-00,R047,GRD CNTRL-42 ST,02/18/2020,1373.0,10578.0
79137,N063A,00-00-00,R011,42 ST-PORT AUTH,02/26/2020,1822.0,10568.0
79129,N063A,00-00-00,R011,42 ST-PORT AUTH,02/18/2020,1746.0,10558.0


### Get Daily Count by Station

In [21]:
station_daily_diffs = daily_diffs.groupby(['C/A', 'UNIT', 'STATION', 'DATE']).agg(
    {'ENTRIES_DIFF':'sum', 'EXITS_DIFF':'sum'}).reset_index()

In [22]:
station_daily_diffs.sort_values('ENTRIES_DIFF', ascending=False).head(10)

Unnamed: 0,C/A,UNIT,STATION,DATE,ENTRIES_DIFF,EXITS_DIFF
28626,PTH22,R540,PATH NEW WTC,02/25/2020,64222.0,56914.0
28627,PTH22,R540,PATH NEW WTC,02/26/2020,63387.0,56626.0
28620,PTH22,R540,PATH NEW WTC,02/19/2020,63229.0,56727.0
28633,PTH22,R540,PATH NEW WTC,03/03/2020,62883.0,55022.0
28628,PTH22,R540,PATH NEW WTC,02/27/2020,62789.0,56632.0
28634,PTH22,R540,PATH NEW WTC,03/04/2020,62209.0,55523.0
28621,PTH22,R540,PATH NEW WTC,02/20/2020,62006.0,56528.0
28625,PTH22,R540,PATH NEW WTC,02/24/2020,61872.0,55771.0
28632,PTH22,R540,PATH NEW WTC,03/02/2020,61051.0,54017.0
28619,PTH22,R540,PATH NEW WTC,02/18/2020,61045.0,54846.0


In [23]:
station_daily_diffs.sort_values('EXITS_DIFF', ascending=False).head(10)

Unnamed: 0,C/A,UNIT,STATION,DATE,ENTRIES_DIFF,EXITS_DIFF
28635,PTH22,R540,PATH NEW WTC,03/05/2020,60315.0,57367.0
28626,PTH22,R540,PATH NEW WTC,02/25/2020,64222.0,56914.0
28620,PTH22,R540,PATH NEW WTC,02/19/2020,63229.0,56727.0
28628,PTH22,R540,PATH NEW WTC,02/27/2020,62789.0,56632.0
28627,PTH22,R540,PATH NEW WTC,02/26/2020,63387.0,56626.0
28621,PTH22,R540,PATH NEW WTC,02/20/2020,62006.0,56528.0
28625,PTH22,R540,PATH NEW WTC,02/24/2020,61872.0,55771.0
28634,PTH22,R540,PATH NEW WTC,03/04/2020,62209.0,55523.0
28633,PTH22,R540,PATH NEW WTC,03/03/2020,62883.0,55022.0
28619,PTH22,R540,PATH NEW WTC,02/18/2020,61045.0,54846.0


In [27]:
station_daily_diffs.to_csv('../data/turnstile/station_daily_diffs.csv', index=False)