In [1]:
import sys
print('Python version: ', sys.version)

Python version:  3.6.10 |Anaconda, Inc.| (default, Mar 25 2020, 18:53:43) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium

## Load Data

In [3]:
def load_data(files):
    raw = pd.concat(pd.read_csv(f) for f in files)
    raw.columns = [c.strip() for c in raw.columns] # remove white spaces
    return raw
raw_df = load_data([
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200418.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200411.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200404.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200328.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200321.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200314.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200307.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200229.txt',
'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200222.txt'])
raw_df

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/11/2020,00:00:00,REGULAR,7414097,2517286
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/11/2020,04:00:00,REGULAR,7414098,2517286
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/11/2020,08:00:00,REGULAR,7414106,2517296
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/11/2020,12:00:00,REGULAR,7414115,2517310
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/11/2020,16:00:00,REGULAR,7414134,2517316
...,...,...,...,...,...,...,...,...,...,...,...
206178,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,02/21/2020,04:00:00,REGULAR,5554,420
206179,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,02/21/2020,08:00:00,REGULAR,5554,420
206180,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,02/21/2020,12:00:00,REGULAR,5554,420
206181,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,02/21/2020,16:00:00,REGULAR,5554,420


## Data Cleaning

Column names cleaned

In [4]:
raw_df.columns

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')

 C/A and SCP defines a turnstile

In [5]:
# count unique UNIT and STATION under same C/A and SCP
unique = raw_df[['C/A', 'SCP','UNIT','STATION']].groupby(['C/A', 'SCP']).nunique()
unique.loc[(unique['STATION'] != 1) | (unique['UNIT'] != 1)] # any unexpected value?

Unnamed: 0_level_0,Unnamed: 1_level_0,C/A,SCP,UNIT,STATION
C/A,SCP,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


The turnstiles submit “Regular” readings every four hours. The exact time is staggered

In [6]:
regular_df = raw_df.loc[raw_df['DESC'] == 'REGULAR']
readingtime = regular_df.groupby(['TIME']).size().reset_index(name='count')
readingtime.sort_values('count',ascending=False).head(10)

Unnamed: 0,TIME,count
37279,20:00:00,143102
29795,16:00:00,143076
22341,12:00:00,142895
14816,08:00:00,142881
7333,04:00:00,142859
0,00:00:00,142047
24207,13:00:00,78146
39172,21:00:00,78097
16718,09:00:00,78096
31658,17:00:00,78085


Any duplication?

In [7]:
regular_df.loc[regular_df.duplicated(['DATE', 'TIME', 'C/A', 'SCP'],keep=False)]

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS


### Cleaning Daily entrances and exits

Compute counter difference between each reading time

for example:

DATE        TIME     ENTRIES ENTRIES_DIFF

03/28/2020	00:00:00 e0      e1-e0

03/28/2020	04:00:00 e1      e2-e1

03/28/2020	08:00:00 e2      e3-e2

03/28/2020	12:00:00 e3      e4-e3

03/28/2020	16:00:00 e4      e5-e4

03/28/2020	20:00:00 e5      e6-e5

03/29/2020	00:00:00 e6      e7-e6

The entries of 03/28/2020 is sum(ENTRIES')where DATE is 03/28/2020

If the next row is records another turnstile, then drop the last term

In [8]:
keys = ['C/A', 'SCP','UNIT','STATION', 'LINENAME', 'DATE', 'TIME']
reading_diffs = regular_df.sort_values(keys)[[
    'C/A', 'SCP','UNIT','STATION','LINENAME', 'DATE', 'TIME', 'ENTRIES', 'EXITS']]
reading_diffs['ENTRIES_DIFF'] = reading_diffs['ENTRIES'].diff(periods=-1)*-1 # Difference with following row
reading_diffs['EXITS_DIFF'] = reading_diffs['EXITS'].diff(periods=-1)*-1 # Difference with following row
reading_diffs = reading_diffs.fillna(0) # Fill last line with 0
# turnstile ID = C/A + SCP
reading_diffs['ID'] = reading_diffs['C/A'] + reading_diffs['SCP']
reading_diffs['NEXT_ID'] = reading_diffs['ID'].shift(-1) # turnstile ID in next row
# turnstile changed, drop this period of difference
reading_diffs['ENTRIES_DIFF'] = np.where(
    reading_diffs['ID'] == reading_diffs['NEXT_ID'], reading_diffs['ENTRIES_DIFF'], 0)
reading_diffs['EXITS_DIFF'] = np.where(
    reading_diffs['ID'] == reading_diffs['NEXT_ID'], reading_diffs['EXITS_DIFF'], 0)
reading_diffs.head(10)

Unnamed: 0,C/A,SCP,UNIT,STATION,LINENAME,DATE,TIME,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,ID,NEXT_ID
2,A002,02-00-00,R051,59 ST,NQR456W,02/15/2020,11:00:00,7379740,2503012,188.0,47.0,A00202-00-00,A00202-00-00
3,A002,02-00-00,R051,59 ST,NQR456W,02/15/2020,15:00:00,7379928,2503059,308.0,73.0,A00202-00-00,A00202-00-00
4,A002,02-00-00,R051,59 ST,NQR456W,02/15/2020,19:00:00,7380236,2503132,136.0,44.0,A00202-00-00,A00202-00-00
5,A002,02-00-00,R051,59 ST,NQR456W,02/15/2020,23:00:00,7380372,2503176,31.0,7.0,A00202-00-00,A00202-00-00
6,A002,02-00-00,R051,59 ST,NQR456W,02/16/2020,03:00:00,7380403,2503183,6.0,9.0,A00202-00-00,A00202-00-00
7,A002,02-00-00,R051,59 ST,NQR456W,02/16/2020,07:00:00,7380409,2503192,42.0,58.0,A00202-00-00,A00202-00-00
8,A002,02-00-00,R051,59 ST,NQR456W,02/16/2020,11:00:00,7380451,2503250,165.0,70.0,A00202-00-00,A00202-00-00
9,A002,02-00-00,R051,59 ST,NQR456W,02/16/2020,15:00:00,7380616,2503320,191.0,61.0,A00202-00-00,A00202-00-00
10,A002,02-00-00,R051,59 ST,NQR456W,02/16/2020,19:00:00,7380807,2503381,156.0,21.0,A00202-00-00,A00202-00-00
11,A002,02-00-00,R051,59 ST,NQR456W,02/16/2020,23:00:00,7380963,2503402,26.0,12.0,A00202-00-00,A00202-00-00


#### Cleaning negative counts
There are still negative differences, for some reasons

In [9]:
negative = reading_diffs[(reading_diffs['ENTRIES_DIFF']<0) | (reading_diffs['EXITS_DIFF']<0)]
negative_bar = negative[negative['ENTRIES_DIFF']<0].sort_values(
    ['ENTRIES_DIFF']).groupby('ENTRIES_DIFF').size().reset_index(name='cnt')
negative_bar['ENTRIES_DIFF'] = negative_bar['ENTRIES_DIFF'] *-1;
negative_bar.head(10)

Unnamed: 0,ENTRIES_DIFF,cnt
0,1054866000.0,1
1,168628000.0,1
2,134858500.0,1
3,56297020.0,1
4,23172680.0,1
5,15209550.0,1
6,14787090.0,1
7,14336130.0,1
8,13681620.0,1
9,13559560.0,1


An example

In [10]:
reading_diffs.loc[reading_diffs['ENTRIES_DIFF'] == -1731]

Unnamed: 0,C/A,SCP,UNIT,STATION,LINENAME,DATE,TIME,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,ID,NEXT_ID
116254,PTH03,00-00-07,R552,JOURNAL SQUARE,1,03/30/2020,07:10:25,1845,369,-1731.0,-354.0,PTH0300-00-07,PTH0300-00-07


In [11]:
reading_diffs.loc[(reading_diffs['ID'] == 'PTH0300-00-07') & (
    (reading_diffs['DATE'] == '03/31/2020') | (reading_diffs['DATE'] == '03/30/2020'))]

Unnamed: 0,C/A,SCP,UNIT,STATION,LINENAME,DATE,TIME,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,ID,NEXT_ID
116254,PTH03,00-00-07,R552,JOURNAL SQUARE,1,03/30/2020,07:10:25,1845,369,-1731.0,-354.0,PTH0300-00-07,PTH0300-00-07
116255,PTH03,00-00-07,R552,JOURNAL SQUARE,1,03/30/2020,11:20:03,114,15,100.0,22.0,PTH0300-00-07,PTH0300-00-07
116256,PTH03,00-00-07,R552,JOURNAL SQUARE,1,03/30/2020,15:32:03,214,37,77.0,33.0,PTH0300-00-07,PTH0300-00-07
116257,PTH03,00-00-07,R552,JOURNAL SQUARE,1,03/30/2020,19:44:03,291,70,33.0,23.0,PTH0300-00-07,PTH0300-00-07
116258,PTH03,00-00-07,R552,JOURNAL SQUARE,1,03/30/2020,23:56:03,324,93,8.0,12.0,PTH0300-00-07,PTH0300-00-07
116259,PTH03,00-00-07,R552,JOURNAL SQUARE,1,03/31/2020,04:08:03,332,105,-302.0,-105.0,PTH0300-00-07,PTH0300-00-07
116260,PTH03,00-00-07,R552,JOURNAL SQUARE,1,03/31/2020,08:14:52,30,0,69.0,14.0,PTH0300-00-07,PTH0300-00-07


In some stations, counters are growing negatively

In [12]:
reading_diffs.loc[(reading_diffs['ENTRIES_DIFF']<0) & (reading_diffs['ENTRIES_DIFF']>-1000)].head(10)

Unnamed: 0,C/A,SCP,UNIT,STATION,LINENAME,DATE,TIME,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,ID,NEXT_ID
1608,A011,01-03-00,R080,57 ST-7 AV,NQRW,02/15/2020,03:00:00,885696627,489935366,-14.0,-94.0,A01101-03-00,A01101-03-00
1609,A011,01-03-00,R080,57 ST-7 AV,NQRW,02/15/2020,07:00:00,885696613,489935272,-72.0,-385.0,A01101-03-00,A01101-03-00
1610,A011,01-03-00,R080,57 ST-7 AV,NQRW,02/15/2020,11:00:00,885696541,489934887,-118.0,-507.0,A01101-03-00,A01101-03-00
1611,A011,01-03-00,R080,57 ST-7 AV,NQRW,02/15/2020,15:00:00,885696423,489934380,-231.0,-399.0,A01101-03-00,A01101-03-00
1612,A011,01-03-00,R080,57 ST-7 AV,NQRW,02/15/2020,19:00:00,885696192,489933981,-142.0,-223.0,A01101-03-00,A01101-03-00
1613,A011,01-03-00,R080,57 ST-7 AV,NQRW,02/15/2020,23:00:00,885696050,489933758,-68.0,-69.0,A01101-03-00,A01101-03-00
1614,A011,01-03-00,R080,57 ST-7 AV,NQRW,02/16/2020,03:00:00,885695982,489933689,-7.0,-69.0,A01101-03-00,A01101-03-00
1615,A011,01-03-00,R080,57 ST-7 AV,NQRW,02/16/2020,07:00:00,885695975,489933620,-36.0,-342.0,A01101-03-00,A01101-03-00
1616,A011,01-03-00,R080,57 ST-7 AV,NQRW,02/16/2020,11:00:00,885695939,489933278,-102.0,-405.0,A01101-03-00,A01101-03-00
1617,A011,01-03-00,R080,57 ST-7 AV,NQRW,02/16/2020,15:00:00,885695837,489932873,-172.0,-397.0,A01101-03-00,A01101-03-00


In [13]:
keys = ['C/A', 'SCP','UNIT','STATION', 'LINENAME']
negative_cnt = reading_diffs[(reading_diffs['ENTRIES_DIFF']<0) & (
    reading_diffs['EXITS_DIFF']<0)].groupby(keys).size().reset_index(name='negative cnt')
total_cnt = reading_diffs.groupby(keys).size().reset_index(name='total cnt')
nratio = pd.merge(negative_cnt, total_cnt)
nratio['negative ratio'] = nratio['negative cnt'] / nratio['total cnt']
nratio.sort_values('negative ratio', ascending=False).head(30)

Unnamed: 0,C/A,SCP,UNIT,STATION,LINENAME,negative cnt,total cnt,negative ratio
111,R127,00-00-00,R105,14 ST,123FLM,380,381,0.997375
24,N006A,00-00-00,R280,190 ST,A,375,376,0.99734
140,R322,00-00-02,R386,174 ST,25,375,376,0.99734
64,N601,00-00-01,R319,LEXINGTON AV/63,F,374,375,0.997333
45,N203,00-00-01,R195,161/YANKEE STAD,BD4,375,377,0.994695
146,R523,00-00-04,R147,61 ST WOODSIDE,7,374,377,0.992042
144,R401,00-00-00,R445,3 AV 138 ST,6,374,378,0.989418
3,A011,01-03-00,R080,57 ST-7 AV,NQRW,372,376,0.989362
51,N327,00-06-01,R254,GRAND-NEWTOWN,MR,368,374,0.983957
23,K026,00-00-01,R100,METROPOLITAN AV,M,369,377,0.97878


In [14]:
reading_diffs = pd.merge(reading_diffs, nratio)

Remove negative values (threshold may depend on time)

In [15]:
# Entry
reading_diffs['ENTRIES_DIFF'] = np.where(
    (reading_diffs['ENTRIES_DIFF'] < 0) & (reading_diffs['negative ratio'] < 0.9), # resetted after full, set to 0
    0, reading_diffs['ENTRIES_DIFF'])
reading_diffs['ENTRIES_DIFF'] = np.where(
    reading_diffs['ENTRIES_DIFF'] < 0, # negative turstiles, flip
    reading_diffs['ENTRIES_DIFF'] * -1, reading_diffs['ENTRIES_DIFF'])
# Exit
reading_diffs['EXITS_DIFF'] = np.where(
    (reading_diffs['EXITS_DIFF'] < 0) & (reading_diffs['negative ratio'] < 0.9), # resetted after full, set to 0
    0, reading_diffs['EXITS_DIFF'])
reading_diffs['EXITS_DIFF'] = np.where(
    reading_diffs['EXITS_DIFF'] < 0, # negative turstiles, filp
    reading_diffs['EXITS_DIFF'] * -1, reading_diffs['EXITS_DIFF'])
reading_diffs

Unnamed: 0,C/A,SCP,UNIT,STATION,LINENAME,DATE,TIME,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,ID,NEXT_ID,negative cnt,total cnt,negative ratio
0,A006,00-00-04,R079,5 AV/59 ST,NQRW,02/15/2020,01:00:00,7876580,3408084,5.0,1.0,A00600-00-04,A00600-00-04,1,372,0.002688
1,A006,00-00-04,R079,5 AV/59 ST,NQRW,02/15/2020,05:00:00,7876585,3408085,7.0,23.0,A00600-00-04,A00600-00-04,1,372,0.002688
2,A006,00-00-04,R079,5 AV/59 ST,NQRW,02/15/2020,09:00:00,7876592,3408108,69.0,53.0,A00600-00-04,A00600-00-04,1,372,0.002688
3,A006,00-00-04,R079,5 AV/59 ST,NQRW,02/15/2020,13:00:00,7876661,3408161,180.0,59.0,A00600-00-04,A00600-00-04,1,372,0.002688
4,A006,00-00-04,R079,5 AV/59 ST,NQRW,02/15/2020,17:00:00,7876841,3408220,133.0,8.0,A00600-00-04,A00600-00-04,1,372,0.002688
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58386,TRAM1,00-00-01,R468,RIT-MANHATTAN,R,04/17/2020,05:00:00,705,33,38.0,3.0,TRAM100-00-01,TRAM100-00-01,1,373,0.002681
58387,TRAM1,00-00-01,R468,RIT-MANHATTAN,R,04/17/2020,09:00:00,743,36,49.0,2.0,TRAM100-00-01,TRAM100-00-01,1,373,0.002681
58388,TRAM1,00-00-01,R468,RIT-MANHATTAN,R,04/17/2020,13:00:00,792,38,59.0,2.0,TRAM100-00-01,TRAM100-00-01,1,373,0.002681
58389,TRAM1,00-00-01,R468,RIT-MANHATTAN,R,04/17/2020,17:00:00,851,40,67.0,1.0,TRAM100-00-01,TRAM100-00-01,1,373,0.002681


#### Cleaning very large counts

In [16]:
reading_diffs.sort_values('ENTRIES_DIFF', ascending=False).head()

Unnamed: 0,C/A,SCP,UNIT,STATION,LINENAME,DATE,TIME,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,ID,NEXT_ID,negative cnt,total cnt,negative ratio
5027,H001,00-00-00,R175,8 AV,ACEL,03/11/2020,13:00:00,7481508,6471421,468477241.0,1393622000.0,H00100-00-00,H00100-00-00,223,383,0.582245
11510,N056,01-00-03,R188,50 ST,CE,03/13/2020,20:00:00,5646398,9437946,315537888.0,311746300.0,N05601-00-03,N05601-00-03,197,373,0.52815
1612,A025,01-00-00,R023,34 ST-HERALD SQ,BDFMNQRW,03/04/2020,15:00:00,5274455,10034932,23174512.0,1893966.0,A02501-00-00,A02501-00-00,1,375,0.002667
1987,A025,01-00-01,R023,34 ST-HERALD SQ,BDFMNQRW,03/04/2020,15:00:00,13797098,5914687,14338474.0,5746474.0,A02501-00-01,A02501-00-01,1,375,0.002667
2738,A025,01-06-01,R023,34 ST-HERALD SQ,BDFMNQRW,03/04/2020,19:00:00,5554054,8852506,9965078.0,23686500.0,A02501-06-01,A02501-06-01,1,375,0.002667


example

In [17]:
reading_diffs.loc[(reading_diffs['ID'] == 'N31600-00-01') & (reading_diffs['DATE'] == '04/04/2020')]

Unnamed: 0,C/A,SCP,UNIT,STATION,LINENAME,DATE,TIME,ENTRIES,EXITS,ENTRIES_DIFF,EXITS_DIFF,ID,NEXT_ID,negative cnt,total cnt,negative ratio


In [18]:
reading_diffs = reading_diffs[reading_diffs.ENTRIES_DIFF < 20000]
reading_diffs = reading_diffs[reading_diffs.EXITS_DIFF < 20000]

### Get Daily Count by Turnstile

In [19]:
daily_diffs = reading_diffs.groupby(['C/A', 'SCP','UNIT','STATION', 'LINENAME', 'DATE']).agg(
    {'ENTRIES_DIFF':'sum', 'EXITS_DIFF':'sum'}).reset_index()

In [20]:
daily_diffs.sort_values('ENTRIES_DIFF', ascending=False).head(10)

Unnamed: 0,C/A,SCP,UNIT,STATION,LINENAME,DATE,ENTRIES_DIFF,EXITS_DIFF
1087,H009,00-03-03,R235,BEDFORD AV,L,03/02/2020,4708.0,971.0
326,A025,01-00-01,R023,34 ST-HERALD SQ,BDFMNQRW,02/26/2020,4535.0,1767.0
327,A025,01-00-01,R023,34 ST-HERALD SQ,BDFMNQRW,02/27/2020,4450.0,1716.0
1084,H009,00-03-03,R235,BEDFORD AV,L,02/28/2020,4406.0,1082.0
7770,R183,00-00-04,R260,181 ST,1,03/05/2020,4358.0,3238.0
332,A025,01-00-01,R023,34 ST-HERALD SQ,BDFMNQRW,03/03/2020,4331.0,1600.0
318,A025,01-00-01,R023,34 ST-HERALD SQ,BDFMNQRW,02/18/2020,4325.0,1683.0
319,A025,01-00-01,R023,34 ST-HERALD SQ,BDFMNQRW,02/19/2020,4325.0,1724.0
334,A025,01-00-01,R023,34 ST-HERALD SQ,BDFMNQRW,03/05/2020,4294.0,1725.0
328,A025,01-00-01,R023,34 ST-HERALD SQ,BDFMNQRW,02/28/2020,4278.0,1680.0


In [21]:
daily_diffs.sort_values('EXITS_DIFF', ascending=False).head(10)

Unnamed: 0,C/A,SCP,UNIT,STATION,LINENAME,DATE,ENTRIES_DIFF,EXITS_DIFF
452,A025,01-06-01,R023,34 ST-HERALD SQ,BDFMNQRW,02/26/2020,2902.0,7952.0
453,A025,01-06-01,R023,34 ST-HERALD SQ,BDFMNQRW,02/27/2020,2759.0,7177.0
447,A025,01-06-01,R023,34 ST-HERALD SQ,BDFMNQRW,02/21/2020,2799.0,7114.0
446,A025,01-06-01,R023,34 ST-HERALD SQ,BDFMNQRW,02/20/2020,2639.0,7103.0
454,A025,01-06-01,R023,34 ST-HERALD SQ,BDFMNQRW,02/28/2020,3146.0,7055.0
460,A025,01-06-01,R023,34 ST-HERALD SQ,BDFMNQRW,03/05/2020,2760.0,6950.0
445,A025,01-06-01,R023,34 ST-HERALD SQ,BDFMNQRW,02/19/2020,2675.0,6944.0
444,A025,01-06-01,R023,34 ST-HERALD SQ,BDFMNQRW,02/18/2020,2621.0,6919.0
451,A025,01-06-01,R023,34 ST-HERALD SQ,BDFMNQRW,02/25/2020,2478.0,6912.0
450,A025,01-06-01,R023,34 ST-HERALD SQ,BDFMNQRW,02/24/2020,2497.0,6894.0


### Get Daily Count by Station

In [22]:
station_daily_diffs = daily_diffs.groupby(['C/A', 'UNIT', 'STATION','LINENAME','DATE']).agg(
    {'ENTRIES_DIFF':'sum', 'EXITS_DIFF':'sum'}).reset_index()

In [23]:
station_daily_diffs.sort_values('ENTRIES_DIFF', ascending=False).head(10)

Unnamed: 0,C/A,UNIT,STATION,LINENAME,DATE,ENTRIES_DIFF,EXITS_DIFF
3953,PTH22,R540,PATH NEW WTC,1,03/03/2020,24277.0,19008.0
3955,PTH22,R540,PATH NEW WTC,1,03/05/2020,23774.0,18604.0
3946,PTH22,R540,PATH NEW WTC,1,02/25/2020,23573.0,19968.0
3945,PTH22,R540,PATH NEW WTC,1,02/24/2020,23490.0,18967.0
3948,PTH22,R540,PATH NEW WTC,1,02/27/2020,23409.0,19562.0
3940,PTH22,R540,PATH NEW WTC,1,02/19/2020,23061.0,19590.0
3939,PTH22,R540,PATH NEW WTC,1,02/18/2020,22853.0,19207.0
3952,PTH22,R540,PATH NEW WTC,1,03/02/2020,22726.0,19040.0
3947,PTH22,R540,PATH NEW WTC,1,02/26/2020,22667.0,19431.0
3954,PTH22,R540,PATH NEW WTC,1,03/04/2020,22615.0,19123.0


In [24]:
station_daily_diffs.sort_values('EXITS_DIFF', ascending=False).head(10)

Unnamed: 0,C/A,UNIT,STATION,LINENAME,DATE,ENTRIES_DIFF,EXITS_DIFF
3946,PTH22,R540,PATH NEW WTC,1,02/25/2020,23573.0,19968.0
3940,PTH22,R540,PATH NEW WTC,1,02/19/2020,23061.0,19590.0
3948,PTH22,R540,PATH NEW WTC,1,02/27/2020,23409.0,19562.0
3947,PTH22,R540,PATH NEW WTC,1,02/26/2020,22667.0,19431.0
3939,PTH22,R540,PATH NEW WTC,1,02/18/2020,22853.0,19207.0
3954,PTH22,R540,PATH NEW WTC,1,03/04/2020,22615.0,19123.0
3941,PTH22,R540,PATH NEW WTC,1,02/20/2020,22169.0,19074.0
3952,PTH22,R540,PATH NEW WTC,1,03/02/2020,22726.0,19040.0
3953,PTH22,R540,PATH NEW WTC,1,03/03/2020,24277.0,19008.0
3945,PTH22,R540,PATH NEW WTC,1,02/24/2020,23490.0,18967.0


In [25]:
station_daily_diffs.to_csv('../data/turnstile/station_daily_diffs.csv', index=False)