In [1]:
import pandas as pd
import numpy as np
import os

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Data source: http://web.mta.info/developers/turnstile.html
#Download all desired weeks from website, store them in a folder called ../data/mta_turnstiles
# This will load combined CSV files that are in this folder into an appended dataframe
datafiles = ['data/mta_turnstiles/' + x for x in os.listdir('data/mta_turnstiles/')]

list_ = []
for file_ in datafiles:
    df = pd.read_csv(file_)
    list_.append(df)
df = pd.concat(list_)

In [3]:
df.columns #notice the whitespace on EXITS

Index([u'C/A', u'UNIT', u'SCP', u'STATION', u'LINENAME', u'DIVISION', u'DATE',
       u'TIME', u'DESC', u'ENTRIES',
       u'EXITS                                                               '],
      dtype='object')

In [4]:
df.columns = df.columns.str.strip() #strip whitespace
df.head() #preview first five rows

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456,BMT,05/21/2016,00:00:00,REGULAR,5672118,1920707
1,A002,R051,02-00-00,59 ST,NQR456,BMT,05/21/2016,04:00:00,REGULAR,5672183,1920719
2,A002,R051,02-00-00,59 ST,NQR456,BMT,05/21/2016,08:00:00,REGULAR,5672214,1920761
3,A002,R051,02-00-00,59 ST,NQR456,BMT,05/21/2016,12:00:00,REGULAR,5672330,1920867
4,A002,R051,02-00-00,59 ST,NQR456,BMT,05/21/2016,16:00:00,REGULAR,5672640,1920936


In [5]:
df.describe() #basic descriptive statistics

Unnamed: 0,ENTRIES,EXITS
count,777253.0,777253.0
mean,36196270.0,29460010.0
std,197587000.0,179129400.0
min,0.0,0.0
25%,587378.0,297954.0
50%,2575607.0,1501834.0
75%,6546763.0,4657076.0
max,2147483000.0,2087387000.0


#  Basic Cleaning

## Make a datetime obj timestamp

In [6]:
df['TIMESTAMP'] = pd.to_datetime((df.DATE + ' ' + df.TIME), format='%m/%d/%Y %H:%M:%S')

## Make unique identifiers for stations

In [7]:
df = df.reset_index()

In [11]:
l = [''.join(sorted(a)) for a in df['LINENAME']] #sort each linename, since subway lines aren't listed in a consistent order

In [12]:
df['STATID']=df['STATION']+pd.Series(l)

## Turnstiles capture cumulative counts, but we want noncumulative counts
Get the row difference in order to get a count per time period. Assign this a new column.

In [24]:
df['ENTRY_DIFF']=df.groupby(['STATID','UNIT','SCP'],as_index=False)['ENTRIES'].transform(pd.Series.diff)['ENTRIES']

## Bin timestamps by hour
We have data at four-hour intervals, but it is not consistent across lines (sometimes 12am, 4am, etc.. sometimes 1am, 5am, etc...)
Note this does not work if some lines take measurements more frequently than every four hours

In [None]:
df['HOD'] = [r.hour for r in df.TIMESTAMP] #hod = "hour of day"

# Data exploration
Evaluate max entries to identify outliers.

In [29]:
df.sort(['ENTRY_DIFF']).groupby(['STATID'])['STATID','ENTRY_DIFF'].max()

  if __name__ == '__main__':


Unnamed: 0_level_0,STATID,ENTRY_DIFF
STATID,Unnamed: 1_level_1,Unnamed: 2_level_1
1 AVL,1 AVL,2321.0
103 ST-CORONA7,103 ST-CORONA7,2027.0
103 ST1,103 ST1,9003.0
103 ST6,103 ST6,1301.0
103 STBC,103 STBC,841.0
104 STA,104 STA,355.0
104 STJZ,104 STJZ,431.0
110 ST6,110 ST6,1272.0
111 ST7,111 ST7,1242.0
111 STA,111 STA,695.0


It looks like we might have some crazy outliers. What is going on with Wall St45? Let's take a look.