# Notebook 2

## This notebook contains the more recent work, done after some preliminary scratchwork (in 'MTA-brian') and after completing Challenge 1.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import datetime

In [2]:
# Function to collect data

def get_data(week_nums, url_template="http://web.mta.info/developers/data/nyct/turnstile/turnstile_{}.txt"):
    """Read one turnstile file per week and stack them into a single DataFrame.

    Parameters
    ----------
    week_nums : iterable
        Week identifiers; each one is substituted into ``url_template``.
    url_template : str, optional
        ``str.format`` template with one ``{}`` placeholder for the week
        identifier. Defaults to the MTA turnstile URL. Anything
        ``pd.read_csv`` accepts works, e.g. a local path template for
        cached copies.

    Returns
    -------
    pandas.DataFrame
        The weekly frames concatenated in the order given. Each file's own
        row index is kept, so index labels repeat across weeks.

    Raises
    ------
    ValueError
        If ``week_nums`` is empty.
    """
    # Materialize first so generators work and emptiness can be checked.
    week_nums = list(week_nums)
    if not week_nums:
        raise ValueError("week_nums must contain at least one week identifier")
    frames = [pd.read_csv(url_template.format(week_num)) for week_num in week_nums]
    return pd.concat(frames)

# Weekly turnstile files to combine; each number is the YYMMDD value
# substituted into the file name
week_nums = [
    200627,
    200620,
    200613,
]

In [47]:
# Pull functions that Lauren Fields created

def combinedTraffic(dataframe):
    '''
    Adds a FOOTTRAFFIC column equal to ENTRIES + EXITS.
    The frame is modified in place and also returned, so the call
    can be chained like the other helpers in this cell.
    '''
    dataframe["FOOTTRAFFIC"] = dataframe["ENTRIES"] + dataframe["EXITS"]
    return dataframe
    
def turnstileColumn(dataframe):
    '''
    Takes an MTA dataframe & returns a copy with a new TURNSTILE column
    holding an index number (1..N) for each distinct
    (C/A, UNIT, SCP, STATION) combination in the frame.

    Numbers follow the sorted order of the key combinations. The input
    frame is not modified; the result has a fresh 0..n-1 row index.
    Only the four key columns are required.
    '''
    keys = ["C/A", "UNIT", "SCP", "STATION"]
    # One row per distinct turnstile, in sorted key order (groupby sorts).
    eachTS = dataframe.groupby(keys).size().index.to_frame(index=False)
    eachTS["TURNSTILE"] = range(1, len(eachTS) + 1)
    return pd.merge(dataframe, eachTS, how="left", on=keys)

def trafficFix(dataframe):
    '''
    Replaces the FOOTTRAFFIC values with the row-to-row change
    within each TURNSTILE, as integers.

    The first row of a turnstile has no previous reading, so it takes the
    change of the next row of the same turnstile. A turnstile with a single
    row has no change to borrow and gets 0.
    The frame is modified in place and returned.
    NOTE(review): assumes rows of each turnstile are already in time order.
    '''
    by_turnstile = dataframe['TURNSTILE']
    deltas = dataframe.groupby('TURNSTILE')['FOOTTRAFFIC'].diff()
    # Back-fill inside each turnstile only, so one turnstile's first row never
    # borrows a value from the next turnstile.
    deltas = deltas.groupby(by_turnstile).bfill().fillna(0)
    dataframe['FOOTTRAFFIC'] = deltas.astype(int)
    return dataframe

In [40]:
# Define 'df' as variable for exported data
df = get_data(week_nums)
df = df.reset_index(drop=True)

# The raw EXITS header is padded with trailing spaces; stripping every
# column name normalizes it to 'EXITS', so no separate rename is needed
df.columns = [column.strip().upper() for column in df.columns]

In [41]:
# Build one timestamp per reading from the separate DATE and TIME text columns
date_time_text = df["DATE"] + " " + df["TIME"]
df["DATE_TIME"] = pd.to_datetime(date_time_text, format="%m/%d/%Y %H:%M:%S")
df.head(5)

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS,DATE_TIME
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/20/2020,00:00:00,REGULAR,7424218,2522558,2020-06-20 00:00:00
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/20/2020,04:00:00,REGULAR,7424220,2522559,2020-06-20 04:00:00
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/20/2020,08:00:00,REGULAR,7424231,2522572,2020-06-20 08:00:00
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/20/2020,12:00:00,REGULAR,7424265,2522590,2020-06-20 12:00:00
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,06/20/2020,16:00:00,REGULAR,7424340,2522604,2020-06-20 16:00:00


In [42]:
# Group data by daily counts: keep the first ENTRIES reading of each
# turnstile on each date
turnstile_keys = ["C/A", "UNIT", "SCP", "STATION"]

daily_counts_df = df.groupby(turnstile_keys + ["DATE"], as_index=False)["ENTRIES"].first()

# Pair every row with the previous date's reading of the same turnstile.
# The column subset must be a list (a tuple is rejected by current pandas),
# and GroupBy.shift keeps the original row index so the assignment aligns.
daily_counts_df[["PREV_DATE", "PREV_ENTRIES"]] = (
    daily_counts_df.groupby(turnstile_keys)[["DATE", "ENTRIES"]].shift(1)
)

# The first date of each turnstile has no previous reading; drop it
daily_counts_df.dropna(subset=["PREV_DATE"], inplace=True)

  


In [43]:
# This function calculates daily entries and returns 0 for outliers that still exceed max_counter (100,000 in the call below)

def get_daily_counts(row, max_counter):
    """Return the entries counted between the previous reading and this one.

    ``row`` must provide "ENTRIES" and "PREV_ENTRIES". The absolute
    difference is used, so a counter running backwards still yields a
    positive count. If that difference exceeds ``max_counter`` the smaller of
    the two readings is tried instead (as if the counter had been reset to
    0); if that is also above ``max_counter`` the result is 0.
    """
    current = row["ENTRIES"]
    previous = row["PREV_ENTRIES"]

    # Magnitude of the change, regardless of counting direction.
    delta = abs(current - previous)

    # Written as "not (x > cap)" so a NaN difference is passed through as-is.
    if not (delta > max_counter):
        return delta

    # Implausibly large jump: maybe the counter was reset to 0.
    after_reset = min(current, previous)
    if after_reset > max_counter:
        # Still too big to be believable.
        return 0
    return after_reset

# Compute each row's daily entries; jumps above 100,000 are handled as
# outliers by get_daily_counts
daily_counts_df["DAILY_ENTRIES"] = daily_counts_df.apply(get_daily_counts, axis=1, max_counter=100000)

# Convert DATE field to datetime to use in chart. The explicit format matches
# the MM/DD/YYYY strings in the data and the format used for DATE_TIME.
daily_counts_df["DATE"] = pd.to_datetime(daily_counts_df["DATE"], format="%m/%d/%Y")

daily_counts_df

Unnamed: 0,C/A,UNIT,SCP,STATION,DATE,ENTRIES,PREV_DATE,PREV_ENTRIES,DAILY_ENTRIES
1,A002,R051,02-00-00,59 ST,2020-06-07,7421025,06/06/2020,7420920.0,105.0
2,A002,R051,02-00-00,59 ST,2020-06-08,7421148,06/07/2020,7421025.0,123.0
3,A002,R051,02-00-00,59 ST,2020-06-09,7421389,06/08/2020,7421148.0,241.0
4,A002,R051,02-00-00,59 ST,2020-06-10,7421639,06/09/2020,7421389.0,250.0
5,A002,R051,02-00-00,59 ST,2020-06-11,7421889,06/10/2020,7421639.0,250.0
...,...,...,...,...,...,...,...,...,...
103671,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2020-06-22,5554,06/21/2020,5554.0,0.0
103672,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2020-06-23,5554,06/22/2020,5554.0,0.0
103673,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2020-06-24,5554,06/23/2020,5554.0,0.0
103674,TRAM2,R469,00-05-01,RIT-ROOSEVELT,2020-06-25,5554,06/24/2020,5554.0,0.0


In [44]:
# Total the daily entries per station name, then show the ten largest totals.
# NOTE(review): grouping on STATION alone combines any distinct stations that
# share a name - confirm whether LINENAME should be part of the key.
station_counts_df = daily_counts_df.groupby("STATION")["DAILY_ENTRIES"].sum().reset_index()
station_counts_df.sort_values("DAILY_ENTRIES", ascending=False).head(10)

Unnamed: 0,STATION,DAILY_ENTRIES
61,34 ST-PENN STA,301034.0
9,125 ST,288210.0
68,42 ST-PORT AUTH,239571.0
110,86 ST,235921.0
59,34 ST-HERALD SQ,231693.0
14,14 ST-UNION SQ,216125.0
257,JKSN HT-ROOSVLT,209839.0
233,GRD CNTRL-42 ST,208612.0
46,23 ST,208375.0
217,FLUSHING-MAIN,198310.0
