In [2]:
import  requests
from bs4 import BeautifulSoup
import pandas as pd


# MTA Turnstile Data Analysis

### 1. Define the route and location of the data

In [3]:
#Location of the data
url = 'http://web.mta.info/developers/turnstile.html'
#relative path of the data
#will be formated as http://web.mta.info/developers/data/nyct/turnstile/[file name].txt
route = 'data/nyct/turnstile/'

## 2. Access location, get and parse html content 

In [4]:
#perform a get request on the url
page = requests.get(url)
#parse its html to query and access the page elements
content = BeautifulSoup(page.content, "html.parser")

#query all 'a' element 
links = content.find_all('a')

## 3. Extrct relevant links

In [5]:
data_links = []
for link in links:
    #try accessing the href attribute of the tag, otherwise continue since it doesn't have one
    try:
        #if file is stored in the rout 'data/nyct/turnstile/' then we will save it, else continue
        if route in link['href']:
            data_links.append("http://web.mta.info/developers/" + link["href"])
        else:
            continue
    except KeyError:
        continue
print(len(data_links))#->591

591


## 4. Load data, create dataframe

In [6]:
#save a subset of the links
required_links = data_links[0:12]
#load and read data
dfs = [pd.read_csv(link) for link in required_links]
#combine all data into a single dataframe
df = pd.concat(dfs)

## 5. Preview data

In [7]:
print(df.columns)

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES',
       'EXITS                                                               '],
      dtype='object')


In [8]:
#remove white space in last column 
df.rename(columns = {df.columns[-1]:df.columns[-1].strip()},inplace = True)
print(df.columns)

Index(['C/A', 'UNIT', 'SCP', 'STATION', 'LINENAME', 'DIVISION', 'DATE', 'TIME',
       'DESC', 'ENTRIES', 'EXITS'],
      dtype='object')


In [9]:
df

Unnamed: 0,C/A,UNIT,SCP,STATION,LINENAME,DIVISION,DATE,TIME,DESC,ENTRIES,EXITS
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/21/2021,00:00:00,REGULAR,7622548,2607689
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/21/2021,04:00:00,REGULAR,7622561,2607697
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/21/2021,08:00:00,REGULAR,7622573,2607718
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/21/2021,12:00:00,REGULAR,7622604,2607766
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,08/21/2021,16:00:00,REGULAR,7622715,2607802
...,...,...,...,...,...,...,...,...,...,...,...
209501,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,06/11/2021,05:00:00,REGULAR,5554,584
209502,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,06/11/2021,09:00:00,REGULAR,5554,584
209503,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,06/11/2021,13:00:00,REGULAR,5554,584
209504,TRAM2,R469,00-05-01,RIT-ROOSEVELT,R,RIT,06/11/2021,17:00:00,REGULAR,5554,584
