In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import sys
print('Python version: ', sys.version)

Python version:  3.6.10 |Anaconda, Inc.| (default, Mar 25 2020, 18:53:43) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


In [3]:
import numpy as np
import pandas as pd
import requests
import lxml.html

# Scraping data

In [4]:
# from https://github.com/PrattSAVI/MTA_TurnstileAnalysis/blob/master/SubwayTrips_RateChange.ipynb
def get_link_dataframe():
    mta_url = r'http://web.mta.info/developers/turnstile.html'
    response = requests.get( mta_url) #Get the html content of the website

    doc = lxml.html.fromstring(response.content)
    #All links are stored in a single div. for each a, get the hrefs.
    data_path = doc.xpath('//div[@class="span-84 last"]/a/@href') 

    df_link = pd.DataFrame( data_path , columns = ['url'] ) #Pandas dataframe
    df_link['url'] = ['http://web.mta.info/developers/' + row['url'] for index,row in df_link.iterrows() ] #Complete url

    #Extract date and format, mark start and end day to filter easier in the future
    df_link['date'] = [row['url'].split('_')[1].split('.txt')[0] for index,row in df_link.iterrows() ] #Complete url
    df_link['date'] = pd.to_datetime( df_link.date , format = '%y%m%d')
    df_link['start_date'] = df_link.date - pd.Timedelta(days=7) # the first day is 7 days before the uploading date
    df_link['end_date'] = df_link.date - pd.Timedelta(days=1) # the last day is 1 days before the uploading date
    df_link = df_link.sort_values('date', ascending=True) # sort by date

    print( len(data_path) , ' weeks data is available' )
    return df_link
df_link = get_link_dataframe()

522  weeks data is available


In [5]:
df_link.tail()

Unnamed: 0,url,date,start_date,end_date
4,http://web.mta.info/developers/data/nyct/turns...,2020-04-04,2020-03-28,2020-04-03
3,http://web.mta.info/developers/data/nyct/turns...,2020-04-11,2020-04-04,2020-04-10
2,http://web.mta.info/developers/data/nyct/turns...,2020-04-18,2020-04-11,2020-04-17
1,http://web.mta.info/developers/data/nyct/turns...,2020-04-25,2020-04-18,2020-04-24
0,http://web.mta.info/developers/data/nyct/turns...,2020-05-02,2020-04-25,2020-05-01


In [6]:
def get_links_by_date(df_link, from_date, to_date):
    links = df_link.loc[(df_link['start_date'] <= to_date) & (df_link['end_date'] >= from_date)]
    return links
test_links = get_links_by_date(df_link, '2020-4-4', '2020-4-10')

In [7]:
def load_data(files):
    print('loading', len(files),'files:', files)
    raw = pd.concat(pd.read_csv(f, parse_dates={'datetime':['DATE', 'TIME']}) for f in files)
    raw.columns = [c.strip() for c in raw.columns] # remove white spaces
    # parse time
    raw['date'] =raw['datetime'].dt.date
    raw['time'] =raw['datetime'].dt.time
    print(len(raw), ' records loaded')
    # basic cleaning
    raw = raw.dropna() # remove n/a
    raw = raw.loc[raw['DESC'] == 'REGULAR'] # keep regular readings
    print(len(raw), ' regular readings')
    return raw

test_raw = load_data(test_links['url'].tolist()) # Testing using one.
test_raw.head(5)

loading 1 files: ['http://web.mta.info/developers/data/nyct/turnstile/turnstile_200411.txt']
205981  records loaded
205424  regular readings


Unnamed: 0,datetime,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,ENTRIES,EXITS,date,time
0,2020-04-04 00:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7413507,2516927,2020-04-04,00:00:00
1,2020-04-04 04:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7413507,2516927,2020-04-04,04:00:00
2,2020-04-04 08:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7413511,2516935,2020-04-04,08:00:00
3,2020-04-04 12:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7413521,2516944,2020-04-04,12:00:00
4,2020-04-04 16:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7413541,2516953,2020-04-04,16:00:00


# Data cleaning
## Compute counter changes

In [8]:
def get_counter_diff(raw, from_date=None, to_date=None):
    keys = ['C/A','UNIT','SCP','STATION','LINENAME','DIVISION','datetime']
    reading_diffs = raw.sort_values(keys)[keys + ['ENTRIES', 'EXITS']]
    reading_diffs['entries_bk'] = reading_diffs['ENTRIES'].diff(periods=-1)*-1 # Difference with following row
    reading_diffs['exits_bk'] = reading_diffs['EXITS'].diff(periods=-1)*-1 # Difference with following row
    reading_diffs['entries'] = reading_diffs['entries_bk']
    reading_diffs['exits'] = reading_diffs['exits_bk']
    #reading_diffs = reading_diffs.fillna(0) # Fill last line with 0
    reading_diffs.dropna(subset=['entries','exits']) # drop n/a
    # turnstile ID = C/A + SCP
    reading_diffs['id'] = reading_diffs['C/A'] + reading_diffs['SCP']
    reading_diffs['next_id'] = reading_diffs['id'].shift(-1) # turnstile ID in next row
    #reading_diffs['prev_id'] = reading_diffs['id'].shift(1) # turnstile ID in previous row
    # turnstile changed, drop this period of difference
    reading_diffs = reading_diffs.loc[reading_diffs['id'] == reading_diffs['next_id']]
    # remove dates out of range
    if from_date:
        reading_diffs = reading_diffs.loc[reading_diffs['datetime'] >= from_date]
    if to_date:
        reading_diffs = reading_diffs.loc[reading_diffs['datetime'] <= to_date]
    # date should be continuous
    print('datetime range:', reading_diffs['datetime'].min(), '-', reading_diffs['datetime'].max())
    # log
    print(len(reading_diffs), ' records loaded')
    print(len(reading_diffs[reading_diffs['entries']<0]), ' negative entry records')
    print(len(reading_diffs[reading_diffs['exits']<0]), ' negative exit records')
    return reading_diffs

test_diff = get_counter_diff(test_raw,'2020-4-1', '2020-5-1')
test_diff.head()

datetime range: 2020-04-04 00:00:00 - 2020-04-10 23:46:36
200493  records loaded
1656  negative entry records
1357  negative exit records


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,datetime,ENTRIES,EXITS,entries_bk,exits_bk,entries,exits,id,next_id
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 00:00:00,7413507,2516927,-0.0,-0.0,-0.0,-0.0,A00202-00-00,A00202-00-00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 04:00:00,7413507,2516927,4.0,8.0,4.0,8.0,A00202-00-00,A00202-00-00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 08:00:00,7413511,2516935,10.0,9.0,10.0,9.0,A00202-00-00,A00202-00-00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 12:00:00,7413521,2516944,20.0,9.0,20.0,9.0,A00202-00-00,A00202-00-00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 16:00:00,7413541,2516953,26.0,5.0,26.0,5.0,A00202-00-00,A00202-00-00


## Clean invalid counter values
### 1. negative counters

In [9]:
def flip_negative_diff(reading_diffs, threshold=0.9):
    keys = ['C/A','SCP','UNIT']
    # calculate portion of negative value for each turnstile
    # 0s are also counted, turnstile that are scarcely used also have high 'negative_cnt'
    # but that's okay because we are not flipping positive values
    negative_entry_cnt = reading_diffs[reading_diffs['entries']<=0].groupby(
        keys).size().reset_index(name='nonpos_entries_cnt')
    negative_exit_cnt = reading_diffs[reading_diffs['exits']<=0].groupby(
        keys).size().reset_index(name='nonpos_exits_cnt')
    total_cnt = reading_diffs.groupby(keys).size().reset_index(name='total_cnt')
    nratio = pd.merge(total_cnt, negative_entry_cnt, how='outer').fillna(0)
    nratio = pd.merge(nratio, negative_exit_cnt, how='outer').fillna(0)
    nratio['nonpos_entries_ratio'] = nratio['nonpos_entries_cnt'] / nratio['total_cnt']
    nratio['nonpos_exits_ratio'] = nratio['nonpos_exits_cnt'] / nratio['total_cnt']
    #print(nratio.loc[nratio['negative_ratio'] < 0.95].sort_values('negative_ratio', ascending=False).head(20))
    # merge with original data
    reading_diffs = pd.merge(reading_diffs, nratio)
    # log info
    entries_flipped = reading_diffs.loc[(reading_diffs['entries'] < 0) & (
        reading_diffs['nonpos_entries_ratio'] > threshold)][[
        'C/A', 'SCP', 'STATION', 'nonpos_entries_ratio']].drop_duplicates().sort_values('nonpos_entries_ratio')
    exits_flipped = reading_diffs.loc[(reading_diffs['exits'] < 0) & (
        reading_diffs['nonpos_exits_ratio'] > threshold)][[
        'C/A', 'SCP', 'STATION', 'nonpos_exits_ratio']].drop_duplicates().sort_values('nonpos_exits_ratio')
    print('flipped', len(entries_flipped), 'entry counters:\n', entries_flipped.tail().to_string(index=False))
    print('flipped', len(exits_flipped), 'exit counters:\n', exits_flipped.tail().to_string(index=False))
    # filp counter differences if ratio > threshold
    reading_diffs['entries'] = np.where(
        (reading_diffs['entries'] < 0) & (reading_diffs['nonpos_entries_ratio'] > threshold), # negative turstiles, flip
        reading_diffs['entries'] * -1, reading_diffs['entries'])
    reading_diffs['exits'] = np.where(
        (reading_diffs['exits'] < 0) & (reading_diffs['nonpos_exits_ratio'] > threshold), # negative turstiles, flip
        reading_diffs['exits'] * -1, reading_diffs['exits'])
    print(len(reading_diffs[reading_diffs['entries']<0]), ' negative entry records remains')
    print(len(reading_diffs[reading_diffs['exits']<0]), ' negative exit records remains')
    return reading_diffs

test_flipped = flip_negative_diff(test_diff)
test_flipped.head()

flipped 46 entry counters:
   C/A       SCP          STATION  nonpos_entries_ratio
 N203  00-00-01  161/YANKEE STAD                   1.0
 N207  00-00-00           167 ST                   1.0
 N215  00-00-02      182-183 STS                   1.0
 N056  01-00-03            50 ST                   1.0
 R730  00-00-04  EASTCHSTER/DYRE                   1.0
flipped 37 exit counters:
    C/A       SCP          STATION  nonpos_exits_ratio
  N205  02-00-00  161/YANKEE STAD                 1.0
 N325A  00-05-00      ELMHURST AV                 1.0
 N325A  00-06-01      ELMHURST AV                 1.0
 N334B  00-06-02            75 AV                 1.0
  R622  00-00-00      FRANKLIN AV                 1.0
4  negative entry records remains
4  negative exit records remains


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,datetime,ENTRIES,EXITS,entries_bk,exits_bk,entries,exits,id,next_id,total_cnt,nonpos_entries_cnt,nonpos_exits_cnt,nonpos_entries_ratio,nonpos_exits_ratio
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 00:00:00,7413507,2516927,-0.0,-0.0,-0.0,-0.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 04:00:00,7413507,2516927,4.0,8.0,4.0,8.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 08:00:00,7413511,2516935,10.0,9.0,10.0,9.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 12:00:00,7413521,2516944,20.0,9.0,20.0,9.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 16:00:00,7413541,2516953,26.0,5.0,26.0,5.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951


### 2. negative values caused by resets or other cases

In [10]:
def remove_negative_diff(reading_diffs, by_value=True):
    if by_value:
        reading_diffs = reading_diffs.copy(deep=True) # don't modify the parameter passed in
    # log info
    negative_entries = reading_diffs[reading_diffs['entries']<0][[
        'C/A', 'SCP', 'STATION', 'nonpos_entries_ratio']].drop_duplicates().sort_values('nonpos_entries_ratio')
    negative_exits = reading_diffs[reading_diffs['exits']<0][[
        'C/A', 'SCP', 'STATION', 'nonpos_exits_ratio']].drop_duplicates().sort_values('nonpos_exits_ratio')
    print('set', len(reading_diffs[reading_diffs['exits']<0]), 'negative exit records to 0')
    print(len(negative_entries), 'turnstiles affected:\n', negative_entries.to_string(index=False))
    print('set', len(reading_diffs[reading_diffs['entries']<0]), 'negative entry records to 0')
    print(len(negative_exits), 'turnstiles affected:\n', negative_exits.to_string(index=False))
    # set to 0
    reading_diffs['entries'] = np.where(reading_diffs['entries'] < 0, 0, reading_diffs['entries'])
    reading_diffs['exits'] = np.where(reading_diffs['exits'] < 0, 0, reading_diffs['exits'])
    return reading_diffs

test_nonegative = remove_negative_diff(test_flipped)
test_nonegative.head()

set 4 negative exit records to 0
4 turnstiles affected:
    C/A       SCP       STATION  nonpos_entries_ratio
 N339A  00-00-00  PARSONS BLVD              0.024390
 R217A  00-03-02   BLEECKER ST              0.024390
  N316  00-00-00         46 ST              0.048780
  A007  01-06-03    5 AV/59 ST              0.219512
set 4 negative entry records to 0
4 turnstiles affected:
    C/A       SCP       STATION  nonpos_exits_ratio
 N339A  00-00-00  PARSONS BLVD            0.024390
  N316  00-00-00         46 ST            0.048780
 R217A  00-03-02   BLEECKER ST            0.121951
  A007  01-06-03    5 AV/59 ST            0.560976


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,datetime,ENTRIES,EXITS,entries_bk,exits_bk,entries,exits,id,next_id,total_cnt,nonpos_entries_cnt,nonpos_exits_cnt,nonpos_entries_ratio,nonpos_exits_ratio
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 00:00:00,7413507,2516927,-0.0,-0.0,-0.0,-0.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 04:00:00,7413507,2516927,4.0,8.0,4.0,8.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 08:00:00,7413511,2516935,10.0,9.0,10.0,9.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 12:00:00,7413521,2516944,20.0,9.0,20.0,9.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 16:00:00,7413541,2516953,26.0,5.0,26.0,5.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951


### 3. Values too large

In [11]:
def remove_large_diff(reading_diffs, threshold = 15000, by_value=True):
    if by_value:
        reading_diffs = reading_diffs.copy(deep=True) # don't modify the parameter passed in
    # log info
    large_entries = reading_diffs[reading_diffs['entries']>threshold][[
        'C/A', 'SCP', 'STATION', 'entries']].sort_values('entries').drop_duplicates([
        'C/A', 'SCP', 'STATION']) # log the smallest invalid value of this turnstile
    large_exits = reading_diffs[reading_diffs['exits']>threshold][[
        'C/A', 'SCP', 'STATION', 'exits']].sort_values('exits').drop_duplicates([
        'C/A', 'SCP', 'STATION']) # log the smallest invalid value of this turnstile
    print('set', len(reading_diffs[reading_diffs['entries']>threshold]), 'large entry records to 0')
    print(len(large_entries), 'turnstiles affected:\n', large_entries)
    print('set', len(reading_diffs[reading_diffs['exits']>threshold]), 'large exit records to 0')
    print(len(large_exits), 'turnstiles affected:\n', large_exits)
    # set to 0
    reading_diffs['entries'] = np.where(reading_diffs['entries']>threshold, 0, reading_diffs['entries'])
    reading_diffs['exits'] = np.where(reading_diffs['exits']>threshold, 0, reading_diffs['exits'])
    return reading_diffs

test_cleaned = remove_large_diff(test_nonegative)
test_cleaned.head()

set 2 large entry records to 0
2 turnstiles affected:
           C/A       SCP        STATION   entries
79291    N316  00-00-01          46 ST  170683.0
145609  R200A  01-05-01  BOWLING GREEN  524288.0
set 1 large exit records to 0
1 turnstiles affected:
         C/A       SCP STATION     exits
79291  N316  00-00-01   46 ST  216986.0


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,datetime,ENTRIES,EXITS,entries_bk,exits_bk,entries,exits,id,next_id,total_cnt,nonpos_entries_cnt,nonpos_exits_cnt,nonpos_entries_ratio,nonpos_exits_ratio
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 00:00:00,7413507,2516927,-0.0,-0.0,-0.0,-0.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 04:00:00,7413507,2516927,4.0,8.0,4.0,8.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 08:00:00,7413511,2516935,10.0,9.0,10.0,9.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 12:00:00,7413521,2516944,20.0,9.0,20.0,9.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2020-04-04 16:00:00,7413541,2516953,26.0,5.0,26.0,5.0,A00202-00-00,A00202-00-00,41,3.0,5.0,0.073171,0.121951


In [17]:
def get_cleaned_diff(raw, from_date=None, to_date=None):
    df = get_counter_diff(raw, from_date, to_date)
    print('---')
    df = flip_negative_diff(df) # flip counters that counts negatively
    print('---')
    df = remove_negative_diff(df) # set negative counts to 0
    print('---')
    df = remove_large_diff(df) # set large counts to 0
    print('---')
    df = df[['C/A','UNIT','SCP','STATION','LINENAME','DIVISION','datetime', 'entries', 'exits']]
    print(len(df), ' records cleaned')
    return df

test_cleaned = get_cleaned_diff(test_raw)
test_cleaned.to_csv('../data/turnstile/turnstile_diffs_test.csv.zip',
                    index=False, compression=dict(method='zip',archive_name='turnstile_diffs_test.csv'))

datetime range: 2020-04-04 00:00:00 - 2020-04-10 23:46:36
200493  records loaded
1656  negative entry records
1357  negative exit records
---
flipped 46 entry counters:
   C/A       SCP          STATION  nonpos_entries_ratio
 N203  00-00-01  161/YANKEE STAD                   1.0
 N207  00-00-00           167 ST                   1.0
 N215  00-00-02      182-183 STS                   1.0
 N056  01-00-03            50 ST                   1.0
 R730  00-00-04  EASTCHSTER/DYRE                   1.0
flipped 37 exit counters:
    C/A       SCP          STATION  nonpos_exits_ratio
  N205  02-00-00  161/YANKEE STAD                 1.0
 N325A  00-05-00      ELMHURST AV                 1.0
 N325A  00-06-01      ELMHURST AV                 1.0
 N334B  00-06-02            75 AV                 1.0
  R622  00-00-00      FRANKLIN AV                 1.0
4  negative entry records remains
4  negative exit records remains
---
set 4 negative exit records to 0
4 turnstiles affected:
    C/A       SCP     

## Loading and logging cleaned counter difference

In [13]:
%%time
links2020 = get_links_by_date(df_link, '2020-1-1', '2020-5-31')
links2019 = get_links_by_date(df_link, '2019-1-1', '2019-5-31')
print('loading 2020 data:')
raw2020 = load_data(links2020['url'].tolist())
print('loading 2019 data:')
raw2019 = load_data(links2019['url'].tolist())

loading 2020 data:
loading 18 files: ['http://web.mta.info/developers/data/nyct/turnstile/turnstile_200104.txt', 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200111.txt', 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200118.txt', 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200125.txt', 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200201.txt', 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200208.txt', 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200215.txt', 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200222.txt', 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200229.txt', 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200307.txt', 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200314.txt', 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200321.txt', 'http://web.mta.info/developers/data/nyct/turnstile/turnstile_

In [14]:
%%time
cleaned2020 = get_cleaned_diff(raw2020,'2020-1-1', '2020-5-31')
cleaned2019 = get_cleaned_diff(raw2019,'2019-1-1', '2019-5-31')

datetime range: 2020-01-01 00:00:00 - 2020-05-01 19:47:31
3571314  records loaded
30665  negative entry records
24754  negative exit records
---
flipped 58 entry counters:
    C/A       SCP          STATION  nonpos_entries_ratio
  N327  00-06-01    GRAND-NEWTOWN                   1.0
  N342  01-03-02   JAMAICA 179 ST                   1.0
 N400A  02-00-03         COURT SQ                   1.0
 N063A  00-00-08  42 ST-PORT AUTH                   1.0
  R730  00-00-04  EASTCHSTER/DYRE                   1.0
flipped 43 exit counters:
    C/A       SCP        STATION  nonpos_exits_ratio
  N327  00-06-01  GRAND-NEWTOWN                 1.0
 N334B  00-06-02          75 AV                 1.0
 N400A  02-00-03       COURT SQ                 1.0
  N551  00-00-00       AVENUE I                 1.0
  R622  00-00-00    FRANKLIN AV                 1.0
1587  negative entry records remains
2332  negative exit records remains
---
set 2332 negative exit records to 0
308 turnstiles affected:
    C/A       

set 155 large entry records to 0
81 turnstiles affected:
            C/A       SCP          STATION       entries
431414    D002  00-00-02             8 AV  1.564300e+04
2026762  PTH03  00-01-08   JOURNAL SQUARE  1.565400e+04
432064    D002  00-03-00             8 AV  1.655300e+04
432713    D002  00-03-01             8 AV  1.900600e+04
2126679  PTH16  01-01-08       LACKAWANNA  1.917800e+04
...        ...       ...              ...           ...
1650792   N500  00-00-03   47-50 STS ROCK  1.224788e+09
1708068   N506  00-05-03  34 ST-HERALD SQ  1.225866e+09
1085544   N102  01-00-05  JAY ST-METROTEC  1.245969e+09
2997213   R311  00-00-03      3 AV-149 ST  1.785334e+09
1802072   N534  01-00-02       CARROLL ST  2.038596e+09

[81 rows x 4 columns]
set 120 large exit records to 0
62 turnstiles affected:
            C/A       SCP          STATION         exits
2026883  PTH03  00-01-08   JOURNAL SQUARE  1.559400e+04
433361    D002  00-03-02             8 AV  1.577300e+04
432713    D002  00-03-

---
set 235 large entry records to 0
169 turnstiles affected:
            C/A       SCP          STATION       entries
2649025  PTH21  01-00-01       PATH WTC 2  1.514200e+04
2047516   N503  00-00-05  42 ST-BRYANT PK  1.519200e+04
3276467   R226  02-00-00            23 ST  1.538300e+04
3203134  R205A  04-02-02        FULTON ST  1.563800e+04
2038147   N502  01-00-00  42 ST-BRYANT PK  1.573200e+04
...        ...       ...              ...           ...
3644343   R311  00-00-03      3 AV-149 ST  1.869546e+09
2133267  N519A  01-05-02  B'WAY-LAFAYETTE  1.928603e+09
3744674   R405  01-00-00       CYPRESS AV  2.011374e+09
4303749   R729  00-00-02    BAYCHESTER AV  2.055526e+09
3498461   R252  00-03-02           103 ST  2.088503e+09

[169 rows x 4 columns]
set 195 large exit records to 0
132 turnstiles affected:
            C/A       SCP          STATION         exits
2453967  PTH03  00-00-0A   JOURNAL SQUARE  1.510100e+04
2044968   N503  00-00-02  42 ST-BRYANT PK  1.570200e+04
2051775   N504 

In [15]:
%%time
cleaned2020.to_csv('../data/turnstile/turnstile_diffs_2020.csv.zip',
                   index=False, compression=dict(method='zip',archive_name='turnstile_diffs_2020.csv'))
cleaned2019.to_csv('../data/turnstile/turnstile_diffs_2019.csv.zip',
                   index=False, compression=dict(method='zip',archive_name='turnstile_diffs_2019.csv'))

CPU times: user 46.9 s, sys: 917 ms, total: 47.8 s
Wall time: 48.2 s
