In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import re
from bs4 import BeautifulSoup

In [2]:
# Build the list of weekly turnstile data file URLs published on the MTA page
def get_list_of_files():
    """Scrape the MTA turnstile page and return the URL of every linked data file.

    Returns:
        list[str]: absolute URLs, in the order the links appear on the page
        (an empty list when the page contains no matching links), or
        ``None`` when the page does not answer with HTTP 200.
    """
    base_url = "http://web.mta.info/developers/"
    # A timeout keeps the notebook from hanging forever on an unresponsive server.
    res = requests.get(base_url + "turnstile.html", timeout=30)

    if res.status_code != 200:
        # Same return value as before (None), but say why instead of failing silently.
        print(f'Could not fetch file list: HTTP {res.status_code}')
        return None

    # Name the parser explicitly so results do not depend on which parsers are installed.
    page = BeautifulSoup(res.content, "html.parser")

    file_list = []
    # Collect the links of *every* matching div; previously only the links of the
    # last div were kept, and a page without any matching div raised an error.
    for div in page.find_all("div", {"class": "span-84 last"}):
        for link in div.find_all("a", href=True):
            file_list.append(base_url + link["href"])
    return file_list



In [3]:
# Select a run of consecutive weekly files starting at a given date
def get_files_from_date(file_list, date = '221008', weeks=10):
    """Return up to ``weeks`` consecutive entries of ``file_list`` starting at ``date``.

    Args:
        file_list: sequence of file names/URLs, in the order they should be walked.
        date: date tag in YYMMDD format that identifies the first file; it is
            matched literally anywhere inside each entry.
        weeks: maximum number of files to return, counting the first file.

    Returns:
        list: the selected entries. Fewer than ``weeks`` entries are returned
        (with a printed note) when the list runs out. ``None`` is returned
        (with a printed note) when no entry contains ``date``.

    NOTE(review): the function walks ``file_list`` in list order. The saved
    outputs of this notebook show dates earlier than the start date, so the
    scraped listing appears to be newest-first - confirm before relying on
    the direction.
    """
    # Escape the tag so it is matched literally rather than as a regex.
    first_file_pattern = re.compile(re.escape(str(date)))

    # Index of the first entry containing the date tag, or -1 when absent.
    first_file_index = next(
        (i for i, name in enumerate(file_list) if first_file_pattern.search(name)),
        -1,
    )

    if first_file_index == -1:
        print(f'File for date: {date} not found')
        return None

    # Slicing (unlike indexing) cannot run past the end of the list; max() keeps
    # a negative `weeks` from turning into a from-the-end slice bound.
    stop = first_file_index + max(weeks, 0)
    new_file_list = list(file_list[first_file_index:stop])

    if len(new_file_list) < weeks:
        print(f'Only {len(new_file_list)} of {weeks} requested files available from date: {date}')
    return new_file_list
    
            

In [4]:
def get_data_from_station(file_list, station):
    """Read every file in ``file_list`` and return the rows of one station.

    Args:
        file_list: iterable of paths, URLs or file-like objects accepted by
            ``pd.read_csv``; each file must contain a ``STATION`` column.
        station: value of the ``STATION`` column to keep.

    Returns:
        pd.DataFrame: the matching rows of all files stacked in file order,
        keeping each file's original row index. An empty DataFrame is
        returned when ``file_list`` is empty.
    """
    station_frames = []

    for file in file_list:
        week_df = pd.read_csv(file)
        station_frames.append(week_df[week_df['STATION'] == station])

    # pd.concat raises on an empty list, so keep the original "empty frame" result.
    if not station_frames:
        return pd.DataFrame()

    # Concatenate once at the end instead of re-copying the growing frame per file.
    return pd.concat(station_frames)

In [5]:
# Station to analyse and the number of weekly files to pull for it
station = 'WOODHAVEN BLVD'
weeks = 26

In [6]:
# Getting the list of all data file URLs from the website (performs an HTTP request)
file_list = get_list_of_files()

In [7]:
# Keep `weeks` consecutive files beginning at the 221008 file (26 weeks is roughly 6 months)
files = get_files_from_date(file_list, date='221008', weeks=weeks)

In [8]:
# Read each selected weekly file and keep only the rows of the chosen station
df = get_data_from_station(file_list=files, station=station)
df.head()

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
43436,J031,R006,00-00-00,WOODHAVEN BLVD,JZ,BMT,10/01/2022,00:00:00,REGULAR,5412654,6797858
43437,J031,R006,00-00-00,WOODHAVEN BLVD,JZ,BMT,10/01/2022,04:00:00,REGULAR,5412659,6797868
43438,J031,R006,00-00-00,WOODHAVEN BLVD,JZ,BMT,10/01/2022,08:00:00,REGULAR,5412700,6797895
43439,J031,R006,00-00-00,WOODHAVEN BLVD,JZ,BMT,10/01/2022,12:00:00,REGULAR,5412765,6797939
43440,J031,R006,00-00-00,WOODHAVEN BLVD,JZ,BMT,10/01/2022,16:00:00,REGULAR,5412827,6798026


In [9]:
# (rows, columns) of the combined station data
df.shape

(19660, 11)

In [20]:
def fix_times(time):
    """Snap an ``HH:MM:SS`` time string onto the 4-hour grid 00/04/08/12/16/20.

    The hour and minute are read from the string (seconds are ignored) and
    rounded to the nearest multiple of four hours, with exact halfway points
    (e.g. 02:00, 06:00, 10:00) always rounded up. Anything that would round
    to 24 is clamped down to 20 so the result stays on the same day.

    Args:
        time: time string whose characters 0-1 are the hour and 3-4 the minutes.

    Returns:
        str: the snapped time formatted as ``HH:00:00``.
    """
    hour = int(time[:2])
    minutes = int(time[3:5])

    # int(x + 0.5) rounds halves up. The built-in round() rounds halves to the
    # nearest even number, which mapped 06:00 and 10:00 to the same slot (08:00)
    # while 02:00 went down to 00:00.
    new_time = int((hour + minutes / 60) / 4 + 0.5) * 4

    # If time rounds to 24 need to round down to 20
    new_time = min(new_time, 20)

    # :02d adds the leading zero for single-digit hours
    return f"{new_time:02d}:00:00"


In [11]:
def reformat_df(df):
    """Return a cleaned copy of the raw turnstile frame, indexed by turnstile and time.

    Steps:
       Lowercases all column names and removes spaces from them
       Builds a ``datetime`` column from ``date`` + ``time``
       Builds a ``turnstile_id`` column from ``scp`` + ``c/a``
       Sets (``turnstile_id``, ``datetime``) as a sorted index
       Drops the columns that are no longer needed

    Args:
        df: raw frame with (case-insensitive) columns C/A, UNIT, SCP, LINENAME,
            DIVISION, DATE, TIME and DESC; it is not modified.

    Returns:
        pd.DataFrame: new frame with the remaining columns.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    df = df.copy()

    # lower-case column names and strip spaces (e.g. a padded "EXITS   " header)
    df.columns = df.columns.str.lower().str.replace(" ", "")

    # converting date + time strings to a single datetime column
    df["datetime"] = pd.to_datetime(df["date"] + " " + df["time"])
    df["turnstile_id"] = df["scp"] + " " + df["c/a"]

    df = df.set_index(["turnstile_id", "datetime"]).sort_index()

    return df.drop(columns=['date', 'c/a', 'unit', 'scp', 'linename', 'division', 'desc'])

In [12]:
def _replace_counter_resets(frame, column, threshold):
    """Overwrite, in place, values of ``column`` above ``threshold`` with a typical value."""
    is_reset = (frame[column] > threshold).to_numpy()
    if not is_reset.any():
        return

    # Same filter as before: strictly below the threshold (this also excludes NaN).
    is_normal = (frame[column] < threshold).to_numpy()
    normal_rows = frame.loc[is_normal]

    # Mean of the normal readings per time slot, plus an overall mean used when a
    # slot has no normal reading at all (int(NaN) used to raise in that case).
    slot_means = normal_rows.groupby('time')[column].mean()
    fallback = normal_rows[column].mean()

    fill = frame.loc[is_reset, 'time'].map(slot_means).fillna(fallback)

    # np.trunc mirrors the earlier int() truncation; assigning a plain array is
    # positional, so it also works when index labels repeat.
    frame.loc[is_reset, column] = np.trunc(fill.to_numpy(dtype=float))


def calc_entries_exits(temp, threshold=15000):
    """Add per-interval ``entries_abs`` / ``exits_abs`` columns to one turnstile's readings.

    The cumulative ``entries`` and ``exits`` counters are differenced row to row.
    The absolute value is taken since some of the turnstiles are operating in a
    backwards fashion. A difference above ``threshold`` is treated as a counter
    reset and replaced by the (truncated) mean of the below-threshold
    differences that share the same ``time`` value; if there are none, the mean
    of all below-threshold differences is used, and NaN is left when no such
    reading exists at all.

    Args:
        temp: frame with ``entries``, ``exits`` and ``time`` columns, ordered
            in time; it is not modified.
        threshold: largest difference still considered a real count.

    Returns:
        pd.DataFrame: copy of ``temp`` with the two new columns. The first row
        holds NaN in both because it has no predecessor to diff against.
    """
    # Copy so the frame handed in by groupby.apply is never mutated.
    temp = temp.copy()

    # Calculates the total entries / exits per interval
    temp['entries_abs'] = temp['entries'].diff().abs()
    temp['exits_abs'] = temp['exits'].diff().abs()

    # Recalculates the entry or exit if the turnstile was reset
    for column in ('entries_abs', 'exits_abs'):
        _replace_counter_resets(temp, column, threshold)

    return temp

In [13]:
# Snap every reading's time onto the 00/04/08/12/16/20 hour grid
df['TIME'] = df['TIME'].apply(fix_times)

# Lower-case the columns, index by (turnstile_id, datetime) and drop unused columns
df = reformat_df(df)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,station,time,entries,exits
turnstile_id,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00-00-00 J031,2022-04-09 00:00:00,WOODHAVEN BLVD,00:00:00,5337040,6727440
00-00-00 J031,2022-04-09 04:00:00,WOODHAVEN BLVD,04:00:00,5337049,6727466
00-00-00 J031,2022-04-09 08:00:00,WOODHAVEN BLVD,08:00:00,5337113,6727500
00-00-00 J031,2022-04-09 12:00:00,WOODHAVEN BLVD,12:00:00,5337196,6727569
00-00-00 J031,2022-04-09 16:00:00,WOODHAVEN BLVD,16:00:00,5337270,6727666


In [14]:
# Calculating the entries and exits per interval separately for each turnstile.
# group_keys=False stops newer pandas versions from prepending the group key to the
# result, which would duplicate the turnstile_id index level and break the later
# reset_index().
df = df.groupby(level=0, group_keys=False).apply(calc_entries_exits)

In [15]:
# Remove the rows holding NaN (the first reading of each turnstile has no diff),
# printing the shape before and after to show how many rows were lost
print(df.shape)
df = df.dropna()
print(df.shape)

(19660, 6)
(19642, 6)


In [16]:
# Largest per-interval entry counts first, to eyeball any remaining outliers
df.sort_values('entries_abs', ascending=False).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,station,time,entries,exits,entries_abs,exits_abs
turnstile_id,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00-00-01 N329,2022-05-19 08:00:00,WOODHAVEN BLVD,08:00:00,5792852,2221304,1146.0,55.0
00-00-01 N329,2022-04-14 08:00:00,WOODHAVEN BLVD,08:00:00,5731374,2195341,1122.0,38.0
00-00-01 N329,2022-08-05 08:00:00,WOODHAVEN BLVD,08:00:00,5910441,2285564,1087.0,29.0
00-00-01 N329,2022-05-12 08:00:00,WOODHAVEN BLVD,08:00:00,5780852,2216134,1009.0,69.0
00-00-00 N329,2022-07-20 08:00:00,WOODHAVEN BLVD,08:00:00,4091259,5529847,994.0,102.0


In [17]:
# Largest per-interval exit counts first, to eyeball any remaining outliers
df.sort_values('exits_abs', ascending=False).head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,station,time,entries,exits,entries_abs,exits_abs
turnstile_id,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00-00-00 N329,2022-09-24 16:00:00,WOODHAVEN BLVD,16:00:00,4188321,5630991,303.0,801.0
00-00-00 N329,2022-09-23 16:00:00,WOODHAVEN BLVD,16:00:00,4187200,5628827,355.0,799.0
00-00-00 N329,2022-05-20 16:00:00,WOODHAVEN BLVD,16:00:00,4002524,5430643,357.0,780.0
00-00-00 N329,2022-06-17 16:00:00,WOODHAVEN BLVD,16:00:00,4043602,5477286,418.0,769.0
00-00-00 N329,2022-06-03 16:00:00,WOODHAVEN BLVD,16:00:00,4021419,5453238,353.0,769.0


In [18]:
# Grouping by the datetime index level to sum the entries and exits of the entire
# station at each time.
# Only the two per-interval count columns are selected: summing the whole frame
# depends on pandas silently dropping the text columns (station, time), which newer
# versions no longer do, and the cumulative entries/exits counters are not wanted
# in the result anyway.
df = df.groupby(level='datetime')[['entries_abs', 'exits_abs']].sum()

df.head()

Unnamed: 0_level_0,entries_abs,exits_abs
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-04-09 04:00:00,74.0,254.0
2022-04-09 08:00:00,1222.0,500.0
2022-04-09 12:00:00,1661.0,1839.0
2022-04-09 16:00:00,2082.0,2948.0
2022-04-09 20:00:00,1662.0,2379.0


In [19]:
# Save the station-wide totals as ../data/<station>_<weeks>_weeks.csv
# (path is relative to the working directory; to_csv does not create missing folders)
df.to_csv(f'../data/{station}_{weeks}_weeks.csv')