In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import re
from bs4 import BeautifulSoup

In [2]:
# Get a list of the files that we're grabbing
def get_list_of_files():
    """Scrape the MTA turnstile page and return the URL of every file it links to.

    Returns:
        list[str]: Absolute file URLs in the order the links appear on the page.
        The list is empty when the page contains no matching links.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    base_url = "http://web.mta.info/developers/"
    # A timeout keeps the notebook from hanging forever on an unresponsive site.
    res = requests.get(base_url + "turnstile.html", timeout=30)
    # Fail loudly here instead of handing None to the next cell.
    res.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    page = BeautifulSoup(res.content, "html.parser")
    mydivs = page.find_all("div", {"class": "span-84 last"})

    # Collect the links of every matching div (not only the last one) and
    # skip anchors that carry no href attribute.
    return [
        base_url + link["href"]
        for div in mydivs
        for link in div.find_all("a", href=True)
    ]



In [3]:
#Getting files from a certain date onward for x amount of weeks

def get_files_from_date(file_list, date = '221008', weeks=10):
    """Return up to `weeks` consecutive entries of `file_list`, starting at the file for `date`.

    Args:
        file_list: Sequence of file URLs/names, one per weekly file.
        date: Date stamp in YYMMDD format; the first entry whose name contains
            it is used as the starting point.
        weeks: Maximum number of entries to return, counted from the start entry.

    Returns:
        A new list with the selected entries, taken in the order they appear in
        `file_list` (so the direction in time depends on how that list is
        sorted). Fewer than `weeks` entries are returned when the list runs out.
        Returns None, after printing a message, when no entry contains `date`.
    """
    candidates = list(file_list or [])
    stamp = str(date)

    # Plain substring search: the date is a literal, not a regular expression.
    start = next((i for i, name in enumerate(candidates) if stamp in name), None)

    if start is None:
        print(f'File for date: {date} not found')
        return None

    # Slicing (unlike indexing) simply stops at the end of the list; clamping
    # at zero keeps a negative `weeks` from turning into a from-the-end slice.
    stop = start + max(weeks, 0)
    return candidates[start:stop]
    
            

In [4]:
def get_data_from_station(file_list, station):
    """Load every file in `file_list` and return the rows of one station as a single DataFrame.

    Args:
        file_list: Iterable of paths, URLs or file-like objects readable by
            `pd.read_csv`; each file must contain a 'STATION' column.
        station: Value the 'STATION' column has to equal for a row to be kept.

    Returns:
        pd.DataFrame: The matching rows of all files stacked in file order.
        Each file's original row index is preserved. An empty DataFrame is
        returned when `file_list` is empty.
    """
    station_frames = []

    for file in file_list:
        df = pd.read_csv(file)
        station_frames.append(df[df['STATION'] == station])

    if not station_frames:
        # pd.concat refuses an empty list; keep the "empty in, empty out" contract.
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the growing frame
    # on every iteration.
    return pd.concat(station_frames)

In [5]:
# Configuration: how many weekly files to pull and which station to keep.
# `station` is the exact value matched against the STATION column of the files.
weeks = 26
station = '125 ST'

In [6]:
# Getting the list of all file URLs from the website (performs an HTTP request)
file_list = get_list_of_files()

In [7]:
# Taking `weeks` files (26 weekly files is approximately 6 months of data),
# starting from the file whose name contains the date stamp 221008
files = get_files_from_date(file_list, '221008', weeks)

In [8]:
# Getting the data for the specified station, combined across all selected files
df = get_data_from_station(files, station) 
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
49191,N025,R102,01-00-00,125 ST,ACBD,IND,10/01/2022,00:00:00,REGULAR,553673,658829
49192,N025,R102,01-00-00,125 ST,ACBD,IND,10/01/2022,04:00:00,REGULAR,553681,658891
49193,N025,R102,01-00-00,125 ST,ACBD,IND,10/01/2022,08:00:00,REGULAR,553717,658918
49194,N025,R102,01-00-00,125 ST,ACBD,IND,10/01/2022,12:00:00,REGULAR,553801,658980
49195,N025,R102,01-00-00,125 ST,ACBD,IND,10/01/2022,16:00:00,REGULAR,553884,659125


In [9]:
# Sanity check: number of rows and columns in the combined station data
df.shape

(46932, 11)

In [10]:
def reformat_df(df):
    """Return a cleaned copy of the raw turnstile data, indexed by turnstile and timestamp.

    Steps:
      * lowercases the column names and strips spaces from them
      * builds a `datetime` column from the `date` and `time` columns
      * builds a `turnstile_id` column as "<scp> <c/a>"
      * indexes by (`turnstile_id`, `datetime`) and sorts the index
      * drops the columns date, c/a, unit, scp, linename, division and desc

    Args:
        df: Raw DataFrame whose columns (ignoring case and spaces) include
            date, time, scp, c/a, unit, linename, division and desc.

    Returns:
        pd.DataFrame: A new, reformatted frame. The input `df` is left
        untouched, so the calling cell can be re-run on the same raw data.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    out = df.copy()

    # Normalise the column names (lowercase, no spaces)
    out.columns = out.columns.str.lower().str.replace(" ", "")

    # Combine the date and time strings and convert them to real timestamps
    out["datetime"] = pd.to_datetime(out["date"] + " " + out["time"])
    out["turnstile_id"] = out["scp"] + " " + out["c/a"]

    out = out.set_index(["turnstile_id", "datetime"]).sort_index()

    return out.drop(columns=['date', 'c/a', 'unit', 'scp', 'linename', 'division', 'desc'])

In [11]:
def _replace_counter_jumps(deltas, times, threshold):
    """Replace deltas above `threshold` with the typical delta seen at the same `time` value."""
    outliers = deltas > threshold
    if not outliers.any():
        return deltas

    # Mean of the plausible deltas for each `time` value. Flooring matches
    # int() truncation because abs() already made every delta non-negative.
    typical = np.floor(
        deltas.where(deltas < threshold).groupby(times.to_numpy()).mean()
    )

    # A time value without any plausible delta maps to NaN instead of raising.
    return deltas.mask(outliers, times.map(typical))


def calc_entries_exits(temp, threshold=15000):
    """Add per-interval `entries_abs` / `exits_abs` columns for one turnstile.

    The cumulative `entries` and `exits` counters are differenced row to row.
    abs() is applied because some turnstiles count backwards. A difference
    larger than `threshold` is treated as a counter reset and replaced with
    the truncated mean of the plausible differences (below `threshold`)
    recorded at the same `time` value within `temp`.

    Args:
        temp: DataFrame for a single turnstile, ordered in time, with the
            columns `entries`, `exits` and `time`.
        threshold: Largest difference still considered a real reading.

    Returns:
        pd.DataFrame: A copy of `temp` with `entries_abs` and `exits_abs`
        added. The first row is NaN in both columns because it has no
        predecessor to diff against. A replaced value is NaN when no
        plausible reading exists for that `time` value.
    """
    # Copy so the group handed in by groupby.apply is not modified in place.
    result = temp.copy()

    for counter, target in (("entries", "entries_abs"), ("exits", "exits_abs")):
        deltas = result[counter].diff().abs()
        result[target] = _replace_counter_jumps(deltas, result["time"], threshold)

    return result

In [12]:
# Reformatting the DataFrame: index by (turnstile_id, datetime), keep only the needed columns.
# NOTE: this rebinds `df`, so re-running the cell feeds the already-reformatted frame back in.
df = reformat_df(df)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,station,time,entries,exits
turnstile_id,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00-00-00 N026,2022-04-09 00:00:00,125 ST,00:00:00,794141,1125196
00-00-00 N026,2022-04-09 04:00:00,125 ST,04:00:00,794179,1125311
00-00-00 N026,2022-04-09 08:00:00,125 ST,08:00:00,794291,1125483
00-00-00 N026,2022-04-09 12:00:00,125 ST,12:00:00,794549,1125884
00-00-00 N026,2022-04-09 16:00:00,125 ST,16:00:00,794919,1126390


In [13]:
# Remove rows with missing values, printing the shape before and after
print(df.shape)
df = df.dropna()
print(df.shape)

(46932, 4)
(46932, 4)


In [14]:
# Calculating the total entries and exits for each turnstile at a given time.
# group_keys=False keeps the (turnstile_id, datetime) index as it is: pandas >= 2.0
# would otherwise prepend the group key as an extra, duplicated index level.
df = df.groupby(level=0, group_keys=False).apply(calc_entries_exits)

In [15]:
# Checking for outliers in entries: rows with the largest per-interval entry counts first
df.sort_values(by='entries_abs', ascending = False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,station,time,entries,exits,entries_abs,exits_abs
turnstile_id,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00-00-07 N026,2022-05-25 20:00:00,125 ST,20:00:00,20978802,23682439,866.0,444.0
00-00-07 N026,2022-05-11 20:00:00,125 ST,20:00:00,20950872,23657238,816.0,413.0
00-00-07 N026,2022-05-24 20:00:00,125 ST,20:00:00,20976216,23680350,800.0,398.0
00-00-07 N026,2022-05-17 20:00:00,125 ST,20:00:00,20962224,23667851,794.0,369.0
00-00-07 N026,2022-05-23 20:00:00,125 ST,20:00:00,20973911,23678370,779.0,323.0


In [16]:
# Checking for outliers in exits: rows with the largest per-interval exit counts first
df.sort_values(by='exits_abs', ascending = False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,station,time,entries,exits,entries_abs,exits_abs
turnstile_id,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00-00-04 R304,2022-09-06 08:35:34,125 ST,08:35:34,8014289,4910935,0.0,1115.0
00-00-00 R258,2022-09-02 17:00:00,125 ST,17:00:00,139444,634597,175.0,1098.0
00-00-00 R258,2022-05-11 09:00:00,125 ST,09:00:00,74715,341702,136.0,989.0
00-00-00 R258,2022-09-03 17:00:00,125 ST,17:00:00,140044,637521,138.0,957.0
00-00-00 R258,2022-05-11 17:00:00,125 ST,17:00:00,75078,343260,232.0,940.0


In [20]:
# Save the result as a CSV file for later use. The path is relative to the notebook's
# working directory; the file name embeds the station (including any spaces) and week count.
# NOTE(review): presumably ../data must already exist - confirm before a fresh run.
df.to_csv(f'../data/{station}_{weeks}_weeks.csv')