In [107]:
#import relevant libraries
from bs4 import BeautifulSoup #parsing html
import requests #working with urls
import pandas as pd #working with datasets

In [108]:
def get_options(soup, tag):
    """
    Collect the ``value`` attribute of every <option> found under one element.

    soup: parsed page object; looked up with ``soup.find(id=tag)``
    tag: id of the element whose options are wanted
         (the notebook passes 'CarrierList' and 'AirportList')
    returns: list of the option values, in the order ``find_all`` yields them
    """
    selector = soup.find(id=tag)
    return [choice['value'] for choice in selector.find_all('option')]

In [109]:
def print_list(label, codes):
    """
    Print a heading for the list followed by one element per line.

    label: heading text; printed after a blank line and followed by a colon
    codes: iterable of items to show, each rendered the way ``print`` would
    """
    # Build the whole output first, then emit it with a single print call.
    lines = ["\n%s:" % label]
    lines.extend(str(item) for item in codes)
    print("\n".join(lines))

In [110]:
def extract_data(soup):
    """
    this helper function gets 2 of the required elements for making requests
    to the transtats website for carrier-airport combinations: the values of
    the ``__VIEWSTATE`` and ``__EVENTVALIDATION`` elements of the page.

    soup: parsed page object; looked up with ``soup.find(id=...)``
    returns: tuple ``(viewstate, eventvalidation)`` -- note the order
    raises: ValueError if either element is not present in the page
    """
    values = []
    for field_id in ("__VIEWSTATE", "__EVENTVALIDATION"):
        field = soup.find(id=field_id)
        if field is None:
            # name the missing field instead of failing on None["value"]
            raise ValueError("form field %r not found in page" % field_id)
        values.append(field["value"])

    viewstate, eventvalidation = values
    return viewstate, eventvalidation

In [111]:
def make_request(carrier, airport):
    """
    request html containing data for specified carrier-airport combination from transtats website
    some elements required for making request are hard-coded, others are given as inputs
    writes the obtained data to a html file
    data obtained from this function will still require further parsing to obtain the required table of information

    carrier: carrier code sent as the 'CarrierList' form value
    airport: airport code sent as the 'AirportList' form value
    returns: name of the file written, "<carrier>-<airport>.html", in the current directory
    raises: requests.HTTPError if the server answers with an error status

    NOTE: relies on two module-level names defined elsewhere in the notebook:
    ``soup`` (the parsed landing page, source of the viewstate/eventvalidation
    values) and ``s`` (the requests session used to post the form).
    """
    viewstate, eventvalidation = extract_data(soup)
    url = "https://www.transtats.bts.gov/Data_Elements.aspx?Data=2"
    request_elements = (('__EVENTTARGET', ""),
                        ('__EVENTARGUMENT', ""),
                        ('__VIEWSTATE', viewstate),
                        ('__VIEWSTATEGENERATOR', "8E3A4798"),
                        ('__EVENTVALIDATION', eventvalidation),
                        ('CarrierList', carrier),
                        ('AirportList', airport),
                        ('Submit', "Submit"))

    r = s.post(url, data=request_elements)
    # stop here on an HTTP error rather than saving an error page that
    # would only fail later, and less clearly, when it is parsed
    r.raise_for_status()

    filename = carrier + "-" + airport + ".html"
    with open(filename, "w") as f:
        f.write(r.text)
    return filename

In [112]:
def get_table(html_file):
    """
    from html containing data for specified carrier-airport combination, this function extracts the core data (table)
    writes the contents of the table (monthly flight volumes) into python dictionaries and returns a pandas dataframe
    of the data.

    html_file: name of a file of the form "<carrier>-<airport>.html"; the two
               parts of the name fill the "courier" and "airport" columns
    returns: pandas DataFrame with one row per table row that is neither a
             header ('MONTH') nor a 'TOTAL' row, with keys courier, airport,
             year, month, domestic, international
    raises: ValueError if the page has no element with id 'DataGrid1', or if
            the file name does not split into exactly two parts on "-"
    """

    with open(html_file, "r") as html:
        soup = BeautifulSoup(html, "lxml")

    table = soup.find(id='DataGrid1')
    if table is None:
        raise ValueError("no element with id 'DataGrid1' found in %s" % html_file)

    # the name is the same for every row, so split it once up front
    # ([:-5] drops the ".html" suffix)
    courier, airport = html_file[:-5].split("-")

    data = []
    for entry in table.find_all('tr'):
        # drop thousands separators and surrounding whitespace from each cell
        details = [cell.get_text().replace(',', '').strip()
                   for cell in entry.find_all('td')]
        if len(details) < 4:
            # rows without the four expected <td> cells carry no data
            continue
        if details[1].upper() in ('TOTAL', 'MONTH'):
            continue
        data.append({
            "courier": courier,
            "airport": airport,
            "year": int(details[0]),
            "month": int(details[1]),
            "domestic": int(details[2]),
            "international": int(details[3]),
        })
    df = pd.DataFrame(data)
    return df

In [113]:
#one session is shared by the initial GET and by the POST in make_request
s = requests.Session()

#get transtats html from url
r = s.get('https://www.transtats.bts.gov/Data_Elements.aspx?Data=2')
#stop on an HTTP error instead of parsing an error page
r.raise_for_status()
#parse html text
soup = BeautifulSoup(r.text, "lxml")

#get list of carriers and airports using the get_options function
carrier_codes = get_options(soup, 'CarrierList')
airport_codes = get_options(soup, 'AirportList')

#print_list("Carriers", carrier_codes)
#print_list("Airports", airport_codes)

#fetch and parse the first carrier-airport combination as a sample
html_file = make_request(carrier_codes[0], airport_codes[0])
get_table(html_file)

Unnamed: 0,airport,courier,domestic,international,month,year
0,All,All,815489,92565,10,2002
1,All,All,766775,91342,11,2002
2,All,All,782175,96881,12,2002
3,All,All,785651,98053,1,2003
4,All,All,690750,85965,2,2003
5,All,All,797634,97929,3,2003
6,All,All,766639,89398,4,2003
7,All,All,789857,87671,5,2003
8,All,All,798841,95435,6,2003
9,All,All,832075,102795,7,2003
