In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime

%config InlineBackend.figure_format = 'svg'
%matplotlib inline 

# Download and load MTA turnstile data

In [6]:
def all_saturdays(start_date, end_date):
    """Return the Saturdays needed to cover ``start_date`` .. ``end_date``.

    The search window is extended one week past ``end_date`` so that the
    first Saturday after ``end_date`` is always part of the result.

    Parameters
    ----------
    start_date, end_date : str or datetime-like
        Range bounds. Both are parsed by pandas, so month-first strings with
        a four- or two-digit year (``'5/31/2019'``, ``'5/31/19'``) as well as
        ``datetime``/``Timestamp`` objects are accepted.

    Returns
    -------
    list of str
        Saturdays formatted as ``'MM/DD/YYYY'``, in chronological order.
    """
    # Parse the end bound the same way pd.date_range parses the start bound.
    # The strict '%m/%d/%Y' parse used before rejected inputs such as
    # '5/31/19' even though the same spelling worked for start_date.
    search_end = pd.Timestamp(end_date) + pd.Timedelta(weeks=1)
    saturdays = pd.date_range(start=start_date, end=search_end, freq='W-SAT')
    return saturdays.strftime('%m/%d/%Y').tolist()

In [7]:
def import_mta(date):
    """Download one weekly MTA turnstile file and return it as a DataFrame.

    Parameters
    ----------
    date : str or datetime-like
        Date stamp of the weekly file, month first, e.g. ``'05/04/2019'``.

    Returns
    -------
    pandas.DataFrame
        Contents of ``turnstile_<YYMMDD>.txt`` as parsed by ``pd.read_csv``.
    """
    # Parse the date instead of slicing fixed character positions: slicing
    # silently produced a wrong file name for unpadded input such as
    # '5/4/2019'.
    file_stamp = pd.Timestamp(date).strftime('%y%m%d')
    base_url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    return pd.read_csv(base_url.format(file_stamp))

In [8]:
def mta_to_df(saturday_list):
    """Download every weekly file named in ``saturday_list`` and stack them.

    Parameters
    ----------
    saturday_list : iterable of str
        File dates, each passed unchanged to ``import_mta``. A date that
        appears more than once is downloaded and included only once.

    Returns
    -------
    pandas.DataFrame
        The weekly frames concatenated in first-seen order. Each frame keeps
        its own row labels, so the index of the result is not unique. An
        empty ``saturday_list`` yields an empty DataFrame.
    """
    # dict.fromkeys de-duplicates while preserving order, matching the
    # date-keyed dict used previously without fetching a repeated date twice.
    unique_dates = list(dict.fromkeys(saturday_list))
    if not unique_dates:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()
    # A single concat over all frames avoids re-copying the accumulated rows
    # on every iteration and avoids seeding the result with an empty
    # DataFrame, which recent pandas releases deprecate for dtype inference.
    weekly_frames = [import_mta(day) for day in unique_dates]
    return pd.concat(weekly_frames)

In [9]:
# Both dates are entered month first, e.g. 05/01/2019, and are reused below
# to build the file list and to filter rows. Surrounding whitespace is
# stripped so a stray space cannot reach the downstream date parsing.
start_date = input("Enter start date (X/X/XXXX): ").strip()
end_date = input("Enter end date (X/X/XXXX): ").strip()

Enter start date (X/X/XXXX): 5/1/19
Enter end date: 5/31/2019


In [10]:
# Saturday date stamps spanning the requested range, one per weekly file.
saturday_list = all_saturdays(start_date=start_date, end_date=end_date)

In [11]:
# print(saturday_list)

In [12]:
# Fetch every weekly file listed in saturday_list and stack them into one frame.
mta = mta_to_df(saturday_list)

In [13]:
# Preview the first rows of the combined download.
mta.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,00:00:00,REGULAR,7035249,2384833
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,04:00:00,REGULAR,7035269,2384840
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,08:00:00,REGULAR,7035292,2384875
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,12:00:00,REGULAR,7035392,2384951
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,16:00:00,REGULAR,7035651,2385020


# Save a local copy in case of later breakage

In [None]:
# Snapshot the download so it can be reloaded with pd.read_csv('mta.csv').
# index=False: the row labels are per-file counters repeated for every week,
# and writing them would add an extra unnamed column when the file is reloaded.
mta.to_csv('mta.csv', index=False)

In [None]:
# mta = pd.read_csv('mta.csv')

# Cleaning

In [None]:
# Summarise the columns of the raw frame before any cleaning.
mta.info()

In [None]:
# Column names exactly as loaded, before they are normalised in the next cell.
mta.columns

In [None]:
# Normalise the header: trim surrounding whitespace, lower-case, and turn
# '/' into '_' so every column can be used as an attribute (e.g. mta.c_a).
mta.columns = (
    mta.columns
    .str.strip()
    .str.lower()
    .str.replace('/', "_")
)

In [None]:
# Confirm the normalised column names.
mta.columns

In [None]:
# Combine the text `date` and `time` columns into one timestamp column.
# Both inputs must still be strings here, so run this before `date` is
# converted and before `time` is dropped.
mta["datetime"] = pd.to_datetime(
    mta["date"] + " " + mta["time"],
    format="%m/%d/%Y %H:%M:%S",
)

In [None]:
# Check the frame now that the datetime column has been added.
mta.head()

In [None]:
# Number of rows recorded per date. `date` is still text at this point, so
# sort_index orders the MM/DD/YYYY labels as strings rather than as dates.
(
    mta["date"]
    .value_counts()
    .sort_index()
)

In [None]:
# Replace the text dates with real datetimes so they can be range-filtered.
mta["date"] = pd.to_datetime(mta["date"], format="%m/%d/%Y")

In [None]:
# Discard rows dated before the requested start date.
mta = mta.loc[lambda frame: frame["date"] >= start_date]

In [None]:
# Discard rows dated after the requested end date.
mta = mta.loc[lambda frame: frame["date"] <= end_date]

In [None]:
# `time` is already folded into the `datetime` column, so drop it.
# Rebinding instead of inplace=True avoids mutating a frame that was itself
# produced by boolean filtering.
mta = mta.drop(columns=['time'])

In [None]:
# Display the cleaned frame.
mta

## Daily entries by turnstile

In [None]:
# Look at the first ten rows before aggregating per turnstile.
mta.head(10)

In [None]:
# One group per turnstile (c_a, unit, scp, station) and calendar date.
by_turnstile_date = mta.groupby(
    by=['c_a', 'unit', 'scp', 'station', 'date'],
)

In [None]:
# Minimum `entries` value per turnstile per date, flattened back into
# ordinary columns.
mta_daily = (
    by_turnstile_date['entries']
    .min()
    .reset_index()
)

In [None]:
# Summary statistics, including the 95th and 99.7th percentiles.
mta_daily.describe(percentiles=[0.95, 0.997])

In [None]:
# daily_entries for a date = the same turnstile's next recorded reading minus
# this date's reading. The shift is done inside the groupby so a turnstile
# can never pick up a value from a neighbouring turnstile's rows; the earlier
# diff().shift(-1) shifted the whole column and was only correct while each
# turnstile's rows formed one contiguous run. The last date of every
# turnstile has no successor and stays NaN.
mta_daily['daily_entries'] = (
    mta_daily.groupby(['c_a', 'unit', 'scp', 'station'])['entries'].shift(-1)
    - mta_daily['entries']
)

In [None]:
# Keep only rows whose daily_entries is zero or positive. Every row failing
# the comparison is removed, so NaN rows go as well as negative ones.
# NOTE(review): the last date of each turnstile has NaN daily_entries and is
# therefore dropped here too - confirm that losing the final day is intended.
mta_daily = mta_daily.loc[lambda frame: frame['daily_entries'] >= 0]

In [None]:
# Trim outliers: keep rows strictly below the 99.7th percentile of
# daily_entries, i.e. discard the top 0.3%.
mta_daily = mta_daily.loc[
    lambda frame: frame['daily_entries'] < frame['daily_entries'].quantile(q =.997)
]

In [None]:
# Display the filtered per-turnstile daily entries.
mta_daily