In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension; mode 2 reloads imported modules before each cell runs,
# so edits to local .py files are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import sys
# Record the interpreter version next to the results so the run can be reproduced.
print('Python version: ', sys.version)

Python version:  3.6.10 |Anaconda, Inc.| (default, Mar 25 2020, 18:53:43) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import requests
import lxml.html

In [4]:
# Adapted from https://github.com/PrattSAVI/MTA_TurnstileAnalysis/blob/master/SubwayTrips_RateChange.ipynb
mta_url = r'http://web.mta.info/developers/turnstile.html'
# Fetch the index page that lists the weekly turnstile files.
# A timeout keeps the cell from hanging forever on a dead connection.
response = requests.get(mta_url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page into zero links.
response.raise_for_status()

doc = lxml.html.fromstring(response.content)
# All links are stored in a single div; collect the href of every <a> inside it.
data_path = doc.xpath('//div[@class="span-84 last"]/a/@href')

df_link = pd.DataFrame(data_path, columns=['url'])
# The hrefs are relative to the developers/ directory: prefix them to get complete URLs.
df_link['url'] = 'http://web.mta.info/developers/' + df_link['url']

# The file name ends in '_<yymmdd>.txt': pull that token out and parse it,
# then split it into year/month/day so filtering is easier later on.
df_link['date'] = [url.split('_')[1].split('.txt')[0] for url in df_link['url']]
df_link['date'] = pd.to_datetime(df_link['date'], format='%y%m%d')
df_link['year'] = df_link['date'].dt.year
df_link['month'] = df_link['date'].dt.month
df_link['day'] = df_link['date'].dt.day
df_link = df_link.sort_values('date', ascending=True)  # oldest first, most recent last

print( len(data_path) , ' weeks data is available' )

522  weeks data is available


In [5]:
df_link['url'].iloc[-1]  # most recent weekly file (df_link is sorted by date, ascending)

'http://web.mta.info/developers/data/nyct/turnstile/turnstile_200502.txt'

In [6]:
def load_data(files):
    """Load weekly turnstile files into one DataFrame of REGULAR readings.

    Parameters
    ----------
    files : iterable
        Paths, URLs or buffers accepted by ``pd.read_csv``. Every file must
        provide ``DATE``, ``TIME`` and ``DESC`` columns; whitespace around the
        header names is ignored.

    Returns
    -------
    pandas.DataFrame
        All files stacked under a fresh 0..n-1 index, with rows containing any
        missing value removed and only ``DESC == 'REGULAR'`` rows kept.
        ``DATETIME`` is the first column (DATE and TIME combined); ``DATE`` and
        ``TIME`` are re-derived from it and appended as the last two columns.

    Raises
    ------
    ValueError
        If ``files`` is empty (``pd.concat`` has nothing to concatenate).
    """
    frames = [pd.read_csv(f) for f in files]
    # ignore_index: without it every file contributes its own 0..n index and the
    # combined frame ends up with duplicated index labels.
    raw = pd.concat(frames, ignore_index=True)
    # Strip the headers *before* touching any column by name.
    raw.columns = [c.strip() for c in raw.columns] # remove white spaces
    # Build the timestamp explicitly instead of read_csv(parse_dates={...}),
    # which is deprecated in recent pandas releases.
    stamp = pd.to_datetime(raw['DATE'] + ' ' + raw['TIME'])
    # Keep the historical column layout: DATETIME first, DATE/TIME last.
    raw = raw.drop(columns=['DATE', 'TIME'])
    raw.insert(0, 'DATETIME', stamp)
    raw['DATE'] = raw['DATETIME'].dt.date
    raw['TIME'] = raw['DATETIME'].dt.time
    print(len(raw), ' records loaded')
    # basic cleaning
    raw = raw.dropna() # remove rows with any missing value
    raw = raw.loc[raw['DESC'] == 'REGULAR'] # keep regular readings
    print(len(raw), ' regular readings')
    return raw

# Smoke test: load a single weekly file (the 20th most recent link) and preview it.
test_raw = load_data([df_link['url'].tolist()[-20]])
test_raw.head(5)

207605  records loaded
206689  regular readings


Unnamed: 0,DATETIME,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DESC,ENTRIES,EXITS,DATE,TIME
0,2019-12-14 03:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7309003,2477349,2019-12-14,03:00:00
1,2019-12-14 07:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7309008,2477362,2019-12-14,07:00:00
2,2019-12-14 11:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7309080,2477433,2019-12-14,11:00:00
3,2019-12-14 15:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7309289,2477498,2019-12-14,15:00:00
4,2019-12-14 19:00:00,A002,R051,02-00-00,59 ST,NQR456W,BMT,REGULAR,7309595,2477541,2019-12-14,19:00:00


In [7]:
def get_counter_diff(raw):
    """Turn cumulative counter readings into per-interval entry/exit counts.

    Rows are sorted by turnstile and time; each row then receives the
    difference between the *following* reading and its own reading, i.e. the
    traffic of the interval that starts at that row's timestamp.

    Parameters
    ----------
    raw : pandas.DataFrame
        Readings with columns C/A, UNIT, SCP, STATION, LINENAME, DIVISION,
        DATE, TIME, ENTRIES and EXITS.

    Returns
    -------
    pandas.DataFrame
        The sorted key columns plus ENTRIES/EXITS, the raw differences
        (``entries_bk``/``exits_bk``), working copies of them
        (``entries``/``exits``) and the ``id``/``next_id`` helper columns.
        The last reading of every turnstile is removed because it has no
        following reading to diff against.
    """
    keys = ['C/A','UNIT','SCP','STATION','LINENAME','DIVISION','DATE','TIME']
    reading_diffs = raw.sort_values(keys)[keys + ['ENTRIES', 'EXITS']].copy()
    # Following reading minus current reading. Written as shift(-1) - x rather
    # than diff(-1) * -1 so that an idle interval yields 0.0 and not -0.0.
    reading_diffs['entries_bk'] = reading_diffs['ENTRIES'].shift(-1) - reading_diffs['ENTRIES']
    reading_diffs['exits_bk'] = reading_diffs['EXITS'].shift(-1) - reading_diffs['EXITS']
    # Working copies; the *_bk columns keep the untouched differences.
    reading_diffs['entries'] = reading_diffs['entries_bk']
    reading_diffs['exits'] = reading_diffs['exits_bk']
    # turnstile ID = C/A + SCP
    # NOTE(review): UNIT is part of the sort keys but not of this id -- confirm
    # that C/A + SCP alone identifies a turnstile.
    reading_diffs['id'] = reading_diffs['C/A'] + reading_diffs['SCP']
    reading_diffs['next_id'] = reading_diffs['id'].shift(-1) # turnstile ID in next row
    # The difference on the last row of a turnstile spans two different
    # counters (or has no following row at all): drop it.
    reading_diffs = reading_diffs.loc[reading_diffs['id'] == reading_diffs['next_id']]
    # Drop differences that could not be computed. This must come after the
    # boundary filter: dropping the trailing NaN row first would make the
    # previous row look like the last one and discard a valid interval.
    reading_diffs = reading_diffs.dropna(subset=['entries', 'exits'])
    print(len(reading_diffs), ' turnstile * time slots')
    print(len(reading_diffs[reading_diffs['entries']<0]), ' negative entry records')
    print(len(reading_diffs[reading_diffs['exits']<0]), ' negative exit records')
    return reading_diffs

# Compute per-interval differences on the sample week and preview the result.
test_diff = get_counter_diff(test_raw)
test_diff.head()

201765  turnstile * time slots
1961  negative entry records
1405  negative exit records


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,ENTRIES,EXITS,entries_bk,exits_bk,entries,exits,id,next_id
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,03:00:00,7309003,2477349,5.0,13.0,5.0,13.0,A00202-00-00,A00202-00-00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,07:00:00,7309008,2477362,72.0,71.0,72.0,71.0,A00202-00-00,A00202-00-00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,11:00:00,7309080,2477433,209.0,65.0,209.0,65.0,A00202-00-00,A00202-00-00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,15:00:00,7309289,2477498,306.0,43.0,306.0,43.0,A00202-00-00,A00202-00-00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,19:00:00,7309595,2477541,211.0,23.0,211.0,23.0,A00202-00-00,A00202-00-00


In [8]:
def flip_negative_diff(reading_diffs, threshold=0.9):
    """Flip the sign of negative differences on turnstiles that count backwards.

    For every (C/A, SCP, UNIT) turnstile the share of non-positive entry and
    exit differences is computed. Zeros are counted too, so rarely used
    turnstiles also get a high share; that is harmless because only negative
    values are ever flipped. Where the share exceeds ``threshold``, the
    negative values of that turnstile are multiplied by -1.

    Parameters
    ----------
    reading_diffs : pandas.DataFrame
        Needs the columns C/A, SCP, UNIT, STATION, entries and exits.
    threshold : float
        Share of non-positive differences above which a turnstile is flipped.

    Returns
    -------
    pandas.DataFrame
        ``reading_diffs`` merged with the per-turnstile statistics
        (total_cnt, nonpos_*_cnt, nonpos_*_ratio), with the qualifying
        entries/exits flipped.
    """
    turnstile_cols = ['C/A','SCP','UNIT']

    def count_rows(frame, count_name):
        # number of rows per turnstile, as a flat frame
        return frame.groupby(turnstile_cols).size().reset_index(name=count_name)

    # per-turnstile statistics: total rows, then non-positive entries / exits
    stats = count_rows(reading_diffs, 'total_cnt')
    for value_col, count_name in (('entries', 'nonpos_entries_cnt'),
                                  ('exits', 'nonpos_exits_cnt')):
        nonpos_cnt = count_rows(reading_diffs[reading_diffs[value_col] <= 0], count_name)
        # turnstiles without any non-positive value are absent from nonpos_cnt -> 0
        stats = pd.merge(stats, nonpos_cnt, how='outer').fillna(0)
    stats['nonpos_entries_ratio'] = stats['nonpos_entries_cnt'] / stats['total_cnt']
    stats['nonpos_exits_ratio'] = stats['nonpos_exits_cnt'] / stats['total_cnt']

    # attach the statistics to every reading
    merged = pd.merge(reading_diffs, stats)

    # rows to flip: negative value on a turnstile whose ratio is above the threshold
    flip_entries = (merged['entries'] < 0) & (merged['nonpos_entries_ratio'] > threshold)
    flip_exits = (merged['exits'] < 0) & (merged['nonpos_exits_ratio'] > threshold)

    # log which turnstiles are affected
    entry_cols = ['C/A', 'SCP', 'STATION', 'nonpos_entries_ratio']
    exit_cols = ['C/A', 'SCP', 'STATION', 'nonpos_exits_ratio']
    entries_flipped = merged.loc[flip_entries][entry_cols].drop_duplicates()
    entries_flipped = entries_flipped.sort_values('nonpos_entries_ratio')
    exits_flipped = merged.loc[flip_exits][exit_cols].drop_duplicates()
    exits_flipped = exits_flipped.sort_values('nonpos_exits_ratio')
    print('flipped', len(entries_flipped), 'entry counters:\n', entries_flipped.tail().to_string(index=False))
    print('flipped', len(exits_flipped), 'exit counters:\n', exits_flipped.tail().to_string(index=False))

    # flip the selected differences, leave everything else untouched
    merged['entries'] = np.where(flip_entries, merged['entries'] * -1, merged['entries'])
    merged['exits'] = np.where(flip_exits, merged['exits'] * -1, merged['exits'])

    remaining_entries = merged[merged['entries'] < 0]
    remaining_exits = merged[merged['exits'] < 0]
    print(len(remaining_entries), ' negative entry records remains')
    print(len(remaining_exits), ' negative exit records remains')
    return merged

# Flip backwards-counting turnstiles on the sample week and preview the result.
test_flipped = flip_negative_diff(test_diff)
test_flipped.head()

flipped 46 entry counters:
    C/A       SCP          STATION  nonpos_entries_ratio
  N207  00-00-00           167 ST                   1.0
  N215  00-00-02      182-183 STS                   1.0
  N305  01-03-04  LEXINGTON AV/53                   1.0
 N063A  00-00-04  42 ST-PORT AUTH                   1.0
  R730  00-00-04  EASTCHSTER/DYRE                   1.0
flipped 34 exit counters:
   C/A       SCP          STATION  nonpos_exits_ratio
 D011  01-06-00         BAY PKWY                 1.0
 D008  00-03-00            18 AV                 1.0
 B012  00-00-00    PROSPECT PARK                 1.0
 R622  00-00-00      FRANKLIN AV                 1.0
 R646  01-00-01  FLATBUSH AV-B.C                 1.0
28  negative entry records remains
28  negative exit records remains


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,ENTRIES,EXITS,...,exits_bk,entries,exits,id,next_id,total_cnt,nonpos_entries_cnt,nonpos_exits_cnt,nonpos_entries_ratio,nonpos_exits_ratio
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,03:00:00,7309003,2477349,...,13.0,5.0,13.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,07:00:00,7309008,2477362,...,71.0,72.0,71.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,11:00:00,7309080,2477433,...,65.0,209.0,65.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,15:00:00,7309289,2477498,...,43.0,306.0,43.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,19:00:00,7309595,2477541,...,23.0,211.0,23.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0


In [9]:
# NOTE: unfinished draft, kept for reference only -- it is not used anywhere in the pipeline.
# def remove_jumping_diff(reading_diffs, by_value=True):
#     if by_value:
#         reading_diffs = reading_diffs.copy(deep=True) # don't modify the parameter passed in
#     reading_diffs['prev_entries'] = reading_diffs['entries'].shift(1)
#     reading_diffs['next_entries'] = reading_diffs['entries'].shift(-1)
#     reading_diffs['prev_exits'] = reading_diffs['exits'].shift(1)
#     reading_diffs['next_exits'] = reading_diffs['exits'].shift(-1)
#     reading_diffs['entries'] = np.where(reading_diffs['entries'] < 0, 0, reading_diffs['entries'])

In [10]:
def remove_negative_diff(reading_diffs, by_value=True):
    """Set every remaining negative entry/exit difference to 0.

    Parameters
    ----------
    reading_diffs : pandas.DataFrame
        Needs the columns C/A, SCP, STATION, entries, exits,
        nonpos_entries_ratio and nonpos_exits_ratio (the ratio columns are
        only used for the log output).
    by_value : bool
        When True (default) the function works on a deep copy; when False the
        frame passed in is modified in place.

    Returns
    -------
    pandas.DataFrame
        The frame with negative ``entries`` and ``exits`` replaced by 0.
    """
    if by_value:
        reading_diffs = reading_diffs.copy(deep=True) # don't modify the parameter passed in
    is_negative_entry = reading_diffs['entries'] < 0
    is_negative_exit = reading_diffs['exits'] < 0
    # log info: one line per affected turnstile, ordered by its non-positive ratio
    negative_entries = reading_diffs[is_negative_entry][[
        'C/A', 'SCP', 'STATION', 'nonpos_entries_ratio']].drop_duplicates().sort_values('nonpos_entries_ratio')
    negative_exits = reading_diffs[is_negative_exit][[
        'C/A', 'SCP', 'STATION', 'nonpos_exits_ratio']].drop_duplicates().sort_values('nonpos_exits_ratio')
    # Each record count is printed right above the table of the SAME kind
    # (entry count with entry turnstiles, exit count with exit turnstiles).
    print('set', int(is_negative_entry.sum()), 'negative entry records to 0')
    print(len(negative_entries), 'turnstiles affected:\n', negative_entries.to_string(index=False))
    print('set', int(is_negative_exit.sum()), 'negative exit records to 0')
    print(len(negative_exits), 'turnstiles affected:\n', negative_exits.to_string(index=False))
    # set to 0
    reading_diffs['entries'] = np.where(is_negative_entry, 0, reading_diffs['entries'])
    reading_diffs['exits'] = np.where(is_negative_exit, 0, reading_diffs['exits'])
    return reading_diffs

# Zero out the negative differences that survived the flip step and preview the result.
test_nonegative = remove_negative_diff(test_flipped)
test_nonegative.head()

set 28 negative exit records to 0
28 turnstiles affected:
    C/A       SCP          STATION  nonpos_entries_ratio
  R132  01-00-01            23 ST              0.024390
  R132  01-00-00            23 ST              0.024390
  N206  01-00-02           167 ST              0.024390
  R610  00-04-01   ATL AV-BARCLAY              0.025000
  R174  00-00-00           125 ST              0.025000
  N062  01-03-03  42 ST-PORT AUTH              0.025000
 PTH03  00-01-08   JOURNAL SQUARE              0.025000
  R180  00-00-01           157 ST              0.025641
 PTH22  00-02-05     PATH NEW WTC              0.025641
 PTH03  00-02-01   JOURNAL SQUARE              0.025641
 PTH22  00-00-06     PATH NEW WTC              0.027778
  R533  00-03-02    FLUSHING-MAIN              0.028571
  N062  01-03-04  42 ST-PORT AUTH              0.050000
 PTH22  00-03-05     PATH NEW WTC              0.051282
  N503  00-00-01  42 ST-BRYANT PK              0.051282
  R261  00-00-02   149/GRAND CONC            

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,ENTRIES,EXITS,...,exits_bk,entries,exits,id,next_id,total_cnt,nonpos_entries_cnt,nonpos_exits_cnt,nonpos_entries_ratio,nonpos_exits_ratio
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,03:00:00,7309003,2477349,...,13.0,5.0,13.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,07:00:00,7309008,2477362,...,71.0,72.0,71.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,11:00:00,7309080,2477433,...,65.0,209.0,65.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,15:00:00,7309289,2477498,...,43.0,306.0,43.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,19:00:00,7309595,2477541,...,23.0,211.0,23.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0


In [13]:
def remove_large_diff(reading_diffs, threshold = 15000, by_value=True):
    """Set implausibly large entry/exit differences to 0.

    Parameters
    ----------
    reading_diffs : pandas.DataFrame
        Needs the columns C/A, SCP, STATION, entries and exits.
    threshold : number
        Differences strictly greater than this value are treated as invalid.
    by_value : bool
        When True (default) the function works on a deep copy; when False the
        frame passed in is modified in place.

    Returns
    -------
    pandas.DataFrame
        The frame with ``entries``/``exits`` above ``threshold`` replaced by 0.
    """
    if by_value:
        reading_diffs = reading_diffs.copy(deep=True) # don't modify the parameter passed in
    is_large_entry = reading_diffs['entries'] > threshold
    is_large_exit = reading_diffs['exits'] > threshold
    turnstile_cols = ['C/A', 'SCP', 'STATION']
    # log info: per turnstile, keep the smallest value that was judged invalid
    large_entries = reading_diffs[is_large_entry][
        turnstile_cols + ['entries']].sort_values('entries').drop_duplicates(turnstile_cols)
    large_exits = reading_diffs[is_large_exit][
        turnstile_cols + ['exits']].sort_values('exits').drop_duplicates(turnstile_cols)
    # Each record count is printed right above the table of the SAME kind
    # (entry count with entry turnstiles, exit count with exit turnstiles).
    print('set', int(is_large_entry.sum()), 'large entry records to 0')
    print(len(large_entries), 'turnstiles affected:\n', large_entries)
    print('set', int(is_large_exit.sum()), 'large exit records to 0')
    print(len(large_exits), 'turnstiles affected:\n', large_exits)
    # set to 0
    reading_diffs['entries'] = np.where(is_large_entry, 0, reading_diffs['entries'])
    reading_diffs['exits'] = np.where(is_large_exit, 0, reading_diffs['exits'])
    return reading_diffs

# Zero out implausibly large differences (default threshold) and preview the result.
test_cleaned = remove_large_diff(test_nonegative)
test_cleaned.head()

set 3 large exit records to 0
3 turnstiles affected:
           C/A       SCP        STATION       entries
128599  R108A  05-03-00  WTC-CORTLANDT  2.288100e+04
166135   R289  00-00-01     FORDHAM RD  1.671883e+07
172542   R331  00-00-03    GUN HILL RD  1.219988e+09
set 3 large entry records to 0
3 turnstiles affected:
           C/A       SCP        STATION        exits
73564    N206  01-00-02         167 ST      41613.0
128599  R108A  05-03-00  WTC-CORTLANDT     384796.0
172542   R331  00-00-03    GUN HILL RD  902382897.0


Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,ENTRIES,EXITS,...,exits_bk,entries,exits,id,next_id,total_cnt,nonpos_entries_cnt,nonpos_exits_cnt,nonpos_entries_ratio,nonpos_exits_ratio
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,03:00:00,7309003,2477349,...,13.0,5.0,13.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,07:00:00,7309008,2477362,...,71.0,72.0,71.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,11:00:00,7309080,2477433,...,65.0,209.0,65.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,15:00:00,7309289,2477498,...,43.0,306.0,43.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,19:00:00,7309595,2477541,...,23.0,211.0,23.0,A00202-00-00,A00202-00-00,41,0.0,0.0,0.0,0.0


In [14]:
def get_cleaned_diff(raw):
    """Run the whole cleaning pipeline on raw turnstile readings.

    Stages, in order: per-interval differences, sign flip for counters that
    count negatively, remaining negative counts set to 0, large counts set to
    0. A '---' line is printed after every stage to separate their logs.

    Returns a frame holding only the identifying columns plus the cleaned
    ``entries`` and ``exits``.
    """
    output_columns = ['C/A','UNIT','SCP','STATION','LINENAME','DIVISION','DATE','TIME', 'entries', 'exits']
    cleaning_stages = (
        flip_negative_diff,    # flip counters that counts negatively
        remove_negative_diff,  # set negative counts to 0
        remove_large_diff,     # set large counts to 0
    )
    cleaned = get_counter_diff(raw)
    print('---')
    for stage in cleaning_stages:
        cleaned = stage(cleaned)
        print('---')
    cleaned = cleaned[output_columns]
    print(len(cleaned), ' records cleaned')
    return cleaned
# Run the full pipeline on the sample week; note that this rebinds test_cleaned.
test_cleaned = get_cleaned_diff(test_raw)
test_cleaned.head()

201765  turnstile * time slots
1961  negative entry records
1405  negative exit records
---
flipped 46 entry counters:
    C/A       SCP          STATION  nonpos_entries_ratio
  N207  00-00-00           167 ST                   1.0
  N215  00-00-02      182-183 STS                   1.0
  N305  01-03-04  LEXINGTON AV/53                   1.0
 N063A  00-00-04  42 ST-PORT AUTH                   1.0
  R730  00-00-04  EASTCHSTER/DYRE                   1.0
flipped 34 exit counters:
   C/A       SCP          STATION  nonpos_exits_ratio
 D011  01-06-00         BAY PKWY                 1.0
 D008  00-03-00            18 AV                 1.0
 B012  00-00-00    PROSPECT PARK                 1.0
 R622  00-00-00      FRANKLIN AV                 1.0
 R646  01-00-01  FLATBUSH AV-B.C                 1.0
28  negative entry records remains
28  negative exit records remains
---
set 28 negative exit records to 0
28 turnstiles affected:
    C/A       SCP          STATION  nonpos_entries_ratio
  R132  01-

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,entries,exits
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,03:00:00,5.0,13.0
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,07:00:00,72.0,71.0
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,11:00:00,209.0,65.0
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,15:00:00,306.0,43.0
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-12-14,19:00:00,211.0,23.0
