In [172]:
import json
import re
import sys
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Methods

In [158]:
# Chrome command-line argument that overrides the browser's User-Agent header.
# Written as adjacent string literals, which Python joins into one string.
user_agent = (
    "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
    "83.0.4103.97 Safari/537.36"
)

In [164]:
def get_school_url(unitid, year=2020):
    """Build the NCES IPEDS Data Center facsimile-view URL for one institution.

    Args:
        unitid: IPEDS unit ID of the institution; interpolated into the
            query string as-is.
        year: Value for the ``year`` query parameter. Defaults to 2020, the
            value that used to be hard-coded, so existing calls are unchanged.

    Returns:
        The URL as a string. ``goToReportId=6`` and ``surveyNumber=9`` stay
        fixed.

    NOTE(review): whether other years use the same report/survey numbers has
    not been verified -- confirm before scraping a different year.
    """
    return (
        "https://nces.ed.gov/ipeds/datacenter/facsimileView.aspx"
        f"?unitid={unitid}&goToReportId=6&year={year}&surveyNumber=9"
    )

In [2]:
def get_driver():
    """Create and return a Chrome WebDriver configured for scraping.

    The browser is started with the module-level ``user_agent`` argument and
    the ``start-maximized`` flag. The chromedriver binary is obtained through
    ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance. The caller owns it; nothing in this
        function closes it.
    """
    chrome_options = Options()
    for flag in (user_agent, "start-maximized"):
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

In [None]:
def get_soup(url, driver, wait_seconds=5):
    """Load ``url`` in the browser and return the parsed ``<body>`` of the page.

    Args:
        url: Address to open.
        driver: Browser driver exposing ``get(url)`` and ``page_source``
            (e.g. the object returned by ``get_driver``).
        wait_seconds: Seconds to pause after navigation before reading the
            page source. Defaults to 5, the delay that used to be hard-coded.

    Returns:
        The ``body`` element of the BeautifulSoup tree built from the page
        source with the ``html5lib`` parser.
    """
    driver.get(url)
    # Fixed pause rather than an explicit wait condition -- presumably to give
    # the page time to finish rendering before the source is captured.
    time.sleep(wait_seconds)
    return BeautifulSoup(driver.page_source, 'html5lib').body

In [108]:
# Faculty ranks, in the same order as the first three <input> cells that
# get_info reads from each data row.
ranks = [
    "professors",
    "associate professors",
    "assistant professors",
]

# Row label on the survey page -> short category key used in the output.
# "Asian" and "Native Hawaiian or Other Pacific Islander" both map to "api",
# so their counts are summed into one bucket.
race_dict = {
    "Hispanic/Latino": "hispanic",
    "American Indian or Alaska Native": "aian",
    "Asian": "api",
    "Black or African American": "black",
    "Native Hawaiian or Other Pacific Islander": "api",
    "White": "white",
}

# Row labels that get_info skips entirely.
ignored = [
    "Nonresident alien",
    "Two or more races",
    "Race and ethnicity unknown",
]

In [159]:
def get_tenured_tenure_track_data(soup):
    """Collect per-race, per-rank faculty counts for men and women from a page.

    Every ``<table class="sc survey-t">`` in ``soup`` is located and the tables
    at positions 1 through 6 are processed (position 0 is skipped). Within each
    table the first 16 ``<tr>`` rows are passed to ``get_info`` as the male
    rows and the remaining rows as the female rows, so counts accumulate
    across all six tables.

    NOTE(review): the table positions and the 16-row split are fixed layout
    assumptions about the page -- confirm against the live form.

    Args:
        soup: Parsed page (or element) supporting ``find_all``.

    Returns:
        ``{"male": {...}, "female": {...}}`` where each inner dict maps a
        category key from ``race_dict`` to its counts per rank.

    Raises:
        IndexError: If fewer than seven matching tables are present.
    """
    rank_names = ("professors", "associate professors", "assistant professors")

    def zeroed_counts():
        # A fresh, independent zero-filled table: one entry per category key.
        return {code: dict.fromkeys(rank_names, 0) for code in race_dict.values()}

    counts_by_sex = {"male": zeroed_counts(), "female": zeroed_counts()}

    tables = soup.find_all("table", class_="sc survey-t")

    for table_index in range(1, 7):
        table_rows = tables[table_index].find_all('tr')
        male_rows, female_rows = table_rows[:16], table_rows[16:]
        get_info(male_rows, counts_by_sex["male"])
        get_info(female_rows, counts_by_sex["female"])

    return counts_by_sex

In [167]:
def get_info(rows, dictionary):
    """Add the counts found in ``rows`` to the running totals in ``dictionary``.

    A row is treated as a data row only if it contains exactly six ``<input>``
    elements. Its label is the row text reduced to letters, ``_``, ``/`` and
    spaces; labels listed in ``ignored`` are skipped and every other label is
    translated through ``race_dict``. The first three inputs correspond, in
    order, to the entries of ``ranks``; the remaining three are not read.

    Inputs without a ``value`` attribute, or whose value is blank, contribute
    nothing. Thousands separators (``"1,234"``) are accepted.

    Args:
        rows: Iterable of row elements supporting ``find_all`` and ``.text``.
        dictionary: Mapping of category key -> {rank: count}; updated in place.

    Raises:
        KeyError: If a data row's label is neither in ``ignored`` nor a key of
            ``race_dict``.
        ValueError: If a non-blank value is not an integer.
    """
    for row in rows:
        input_tags = row.find_all("input")
        if len(input_tags) != 6:
            continue

        race = re.sub("[^a-zA-Z_/ ]", "", row.text).strip()
        if race in ignored:
            continue

        target_dict = dictionary[race_dict[race]]
        for rank, tag in zip(ranks, input_tags[:3]):
            if not tag.has_attr("value"):
                continue
            # A present-but-empty value means the cell was left blank; treat it
            # like a missing value instead of letting int("") raise.
            raw_value = str(tag["value"]).replace(",", "").strip()
            if not raw_value:
                continue
            target_dict[rank] += int(raw_value)

# Application

In [161]:
# School name -> IPEDS unit ID. The IDs can be looked up at https://nces.ed.gov/
# Insertion order is the order in which the schools are scraped.
school_unitid = dict(
    columbia=190150,
    dartmouth=182670,
    yale=130794,
    princeton=186131,
    brown=217156,
    cornell=190415,
    upenn=215062,
    harvard=166027,
)

In [162]:
# Start the single Chrome session that the scraping cell passes to get_soup.
# NOTE(review): this launches a real browser via get_driver(); run this cell
# again if the session has been closed.
driver = get_driver()




In [168]:
# Scrape every school in school_unitid with the shared browser session and
# collect the parsed counts as {school: {"male": ..., "female": ...}}.
schools = {}
try:
    for school, unitid in school_unitid.items():
        school_url = get_school_url(unitid)
        soup = get_soup(school_url, driver)
        data = get_tenured_tenure_track_data(soup)
        schools[school] = data
finally:
    # Always shut the browser down, even if a page fails to parse, so no
    # Chrome/chromedriver process is left running. Create a new driver with
    # get_driver() before re-running this cell.
    driver.quit()

In [171]:
# Display the scraped counts as the cell output for a visual sanity check.
schools

{'columbia': {'male': {'hispanic': {'professors': 27,
    'associate professors': 17,
    'assistant professors': 40},
   'aian': {'professors': 1,
    'associate professors': 0,
    'assistant professors': 0},
   'api': {'professors': 137,
    'associate professors': 82,
    'assistant professors': 123},
   'black': {'professors': 36,
    'associate professors': 14,
    'assistant professors': 30},
   'white': {'professors': 758,
    'associate professors': 264,
    'assistant professors': 400}},
  'female': {'hispanic': {'professors': 13,
    'associate professors': 15,
    'assistant professors': 75},
   'aian': {'professors': 1,
    'associate professors': 1,
    'assistant professors': 1},
   'api': {'professors': 57,
    'associate professors': 65,
    'assistant professors': 212},
   'black': {'professors': 21,
    'associate professors': 13,
    'assistant professors': 46},
   'white': {'professors': 321,
    'associate professors': 203,
    'assistant professors': 504}}},
 'da

In [174]:
# Write each school's counts to ../data/<school>/<school>_nces.json, resolved
# relative to sys.path[0].
# sys.path[0] can be an empty string (Python's marker for the current
# directory); plain string concatenation would then produce the absolute path
# "/../data/...". Falling back to "." keeps the path relative in that case.
base_dir = Path(sys.path[0] or ".")
for school, data in schools.items():
    path = base_dir / ".." / "data" / school / f"{school}_nces.json"
    # Create the per-school folder on first run instead of failing with
    # FileNotFoundError.
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(data, file)