In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime

%config InlineBackend.figure_format = 'svg'
%matplotlib inline 

# Download and load MTA turnstile data

In [2]:
def all_saturdays(start_date, end_date):
    modified_end_date = datetime.datetime.strptime(end_date,'%m/%d/%Y')+ datetime.timedelta(weeks = 1)
    saturday_list = pd.date_range(start =start_date, end =modified_end_date,
                              freq='W-SAT').strftime('%m/%d/%Y').tolist()
    return saturday_list

In [3]:
def import_mta(date):
    formatted_date = date[-2:]+date[:2]+date[3:5]
    base_url = "http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"
    date_url = base_url.format(formatted_date)
    date_data = pd.read_csv(date_url)
    return date_data

In [4]:
def mta_to_df(saturday_list):
    dict_of_dfs = {}
    for saturday in saturday_list:
        dict_of_dfs[saturday] = pd.DataFrame(import_mta(saturday))
    mta =pd.DataFrame()
    for val in dict_of_dfs.values():
        mta = pd.concat([mta, val])
    return mta

In [5]:
start_date = input("Enter start date (X/X/XXXX): ")
end_date = input("Enter end date: ")

Enter start date (X/X/XXXX): 5/1/2019
Enter end date: 5/31/2019


In [6]:
saturday_list = all_saturdays(start_date,end_date)

In [7]:
# print(saturday_list)

In [11]:
mta = mta_to_df(saturday_list)

In [12]:
mta.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,00:00:00,REGULAR,7035249,2384833
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,04:00:00,REGULAR,7035269,2384840
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,08:00:00,REGULAR,7035292,2384875
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,12:00:00,REGULAR,7035392,2384951
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,16:00:00,REGULAR,7035651,2385020


# In case of later breakage

In [13]:
mta.to_csv(r'mta.csv')

In [70]:
# mta = pd.read_csv('mta.csv')

# Cleaning

In [71]:
mta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1026784 entries, 0 to 1026783
Data columns (total 12 columns):
Unnamed: 0                                                              1026784 non-null int64
C/A                                                                     1026784 non-null object
UNIT                                                                    1026784 non-null object
SCP                                                                     1026784 non-null object
STATION                                                                 1026784 non-null object
LINENAME                                                                1026784 non-null object
DIVISION                                                                1026784 non-null object
DATE                                                                    1026784 non-null object
TIME                                                                    1026784 non-null object
DESC                    

In [72]:
mta.columns

Index(['Unnamed: 0', 'C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION',
       'DATE', 'TIME', 'DESC', 'ENTRIES',
       'EXITS                                                               '],
      dtype='object')

In [73]:
mta.columns = mta.columns.str.strip().str.lower().str.replace('/',"_")

In [74]:
mta.columns

Index(['unnamed: 0', 'c_a', 'unit', 'scp', 'station', 'linename', 'division',
       'date', 'time', 'desc', 'entries', 'exits'],
      dtype='object')

In [75]:
mta["datetime"] = pd.to_datetime(mta.date + " " + mta.time, format="%m/%d/%Y %H:%M:%S")

In [76]:
mta.head()

Unnamed: 0,unnamed: 0,c_a,unit,scp,station,linename,division,date,time,desc,entries,exits,datetime
0,0,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,00:00:00,REGULAR,7035249,2384833,2019-04-27 00:00:00
1,1,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,04:00:00,REGULAR,7035269,2384840,2019-04-27 04:00:00
2,2,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,08:00:00,REGULAR,7035292,2384875,2019-04-27 08:00:00
3,3,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,12:00:00,REGULAR,7035392,2384951,2019-04-27 12:00:00
4,4,A002,R051,02-00-00,59 ST,NQR456W,BMT,04/27/2019,16:00:00,REGULAR,7035651,2385020,2019-04-27 16:00:00


In [77]:
mta.date.value_counts().sort_index()

04/27/2019    28914
04/28/2019    28982
04/29/2019    30972
04/30/2019    28982
05/01/2019    29624
05/02/2019    29871
05/03/2019    29512
05/04/2019    28963
05/05/2019    28954
05/06/2019    29008
05/07/2019    30355
05/08/2019    29280
05/09/2019    28793
05/10/2019    28735
05/11/2019    28807
05/12/2019    28922
05/13/2019    29877
05/14/2019    30074
05/15/2019    30589
05/16/2019    29940
05/17/2019    30472
05/18/2019    29052
05/19/2019    29225
05/20/2019    29501
05/21/2019    30146
05/22/2019    27745
05/23/2019    28021
05/24/2019    29673
05/25/2019    28855
05/26/2019    29063
05/27/2019    29068
05/28/2019    28873
05/29/2019    29095
05/30/2019    29190
05/31/2019    29651
Name: date, dtype: int64

In [78]:
mta.date = pd.to_datetime(mta.date,format="%m/%d/%Y")

In [46]:
mta = mta[(mta.date >= start_date)]

In [47]:
mta = mta[(mta.date <= end_date)]

In [79]:
mta.drop(columns = ['time'], inplace = True)

In [80]:
mta

Unnamed: 0,unnamed: 0,c_a,unit,scp,station,linename,division,date,desc,entries,exits,datetime
0,0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-27,REGULAR,7035249,2384833,2019-04-27 00:00:00
1,1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-27,REGULAR,7035269,2384840,2019-04-27 04:00:00
2,2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-27,REGULAR,7035292,2384875,2019-04-27 08:00:00
3,3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-27,REGULAR,7035392,2384951,2019-04-27 12:00:00
4,4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-27,REGULAR,7035651,2385020,2019-04-27 16:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
1026779,203790,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2019-05-31,REGULAR,5554,378,2019-05-31 09:00:00
1026780,203791,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2019-05-31,REGULAR,5554,378,2019-05-31 12:10:52
1026781,203792,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2019-05-31,REGULAR,5554,378,2019-05-31 13:00:00
1026782,203793,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,2019-05-31,REGULAR,5554,378,2019-05-31 17:00:00


### Group the data so that it represents daily entries for each turnstile (hint: pd.groupby or DataFrame.groupby)

In [81]:
mta.head(10)

Unnamed: 0,unnamed: 0,c_a,unit,scp,station,linename,division,date,desc,entries,exits,datetime
0,0,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-27,REGULAR,7035249,2384833,2019-04-27 00:00:00
1,1,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-27,REGULAR,7035269,2384840,2019-04-27 04:00:00
2,2,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-27,REGULAR,7035292,2384875,2019-04-27 08:00:00
3,3,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-27,REGULAR,7035392,2384951,2019-04-27 12:00:00
4,4,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-27,REGULAR,7035651,2385020,2019-04-27 16:00:00
5,5,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-27,REGULAR,7035930,2385070,2019-04-27 20:00:00
6,6,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-28,REGULAR,7036100,2385087,2019-04-28 00:00:00
7,7,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-28,REGULAR,7036119,2385088,2019-04-28 04:00:00
8,8,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-28,REGULAR,7036125,2385103,2019-04-28 08:00:00
9,9,A002,R051,02-00-00,59 ST,NQR456W,BMT,2019-04-28,REGULAR,7036197,2385155,2019-04-28 12:00:00


In [112]:
by_turnstile_date = mta.groupby(['c_a', 'unit', 'scp', 'station', 'date'])

In [113]:
mta_daily = by_turnstile_date['entries'].min().reset_index()

In [114]:
mta_daily.head()

Unnamed: 0,c_a,unit,scp,station,date,entries
0,A002,R051,02-00-00,59 ST,2019-04-27,7035249
1,A002,R051,02-00-00,59 ST,2019-04-28,7036100
2,A002,R051,02-00-00,59 ST,2019-04-29,7036746
3,A002,R051,02-00-00,59 ST,2019-04-30,7038242
4,A002,R051,02-00-00,59 ST,2019-05-01,7039729


In [116]:
mta_daily['daily_entries'] = mta_daily.groupby(['c_a', 'unit', 'scp', 'station'])['entries'].diff().shift(-1)

In [121]:
mta_daily.head(50)

Unnamed: 0,c_a,unit,scp,station,date,entries,daily_entries
0,A002,R051,02-00-00,59 ST,2019-04-27,7035249,851.0
1,A002,R051,02-00-00,59 ST,2019-04-28,7036100,646.0
2,A002,R051,02-00-00,59 ST,2019-04-29,7036746,1496.0
3,A002,R051,02-00-00,59 ST,2019-04-30,7038242,1487.0
4,A002,R051,02-00-00,59 ST,2019-05-01,7039729,1574.0
5,A002,R051,02-00-00,59 ST,2019-05-02,7041303,1577.0
6,A002,R051,02-00-00,59 ST,2019-05-03,7042880,1588.0
7,A002,R051,02-00-00,59 ST,2019-05-04,7044468,746.0
8,A002,R051,02-00-00,59 ST,2019-05-05,7045214,535.0
9,A002,R051,02-00-00,59 ST,2019-05-06,7045749,1566.0
