# Mallory Drake, Luke Weeklund & Paige Turner — Data Wrangling Project: Scraping

In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By # used to import different ways to access data in the XML or HTML file
from selenium.webdriver.chrome.service import Service # no longer need to download a driver file, use service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException

from webdriver_manager.chrome import ChromeDriverManager # used to manage the Chrome driver to emulate a Chrome web browser

import time
import random

#### Scrape the 61 pages of table data from the website to collect the second dataset

In [4]:
# Scrape the paginated PayScale bachelor's salary table into one list per column.
# Page 1 is served at BASE_URL; page N (N >= 2) is served at BASE_URL + "/page/N".
BASE_URL = "https://www.payscale.com/college-salary-report/bachelors"
LAST_PAGE = 61            # number of table pages to scrape
TABLE_WAIT_SECONDS = 15   # upper bound on waiting for the table to appear on a page

# start empty lists for each column of data
rank = []
school_name = []
school_type = []
early_career_pay = []
mid_career_pay = []
pct_high_meaning = []
pct_stem_degrees = []

browser = webdriver.Chrome()
try:
    browser.maximize_window()

    # Load and scrape each page in turn; no page is fetched without being scraped.
    for page_counter in range(1, LAST_PAGE + 1):
        if page_counter == 1:
            url = BASE_URL
        else:
            url = BASE_URL + "/page/" + str(page_counter)
        browser.get(url)

        # Random pause between page loads so requests are not fired back-to-back.
        time.sleep(random.uniform(1, 4))

        # Wait explicitly for the table instead of assuming the pause was long enough;
        # raises TimeoutException if it never shows up.
        table = WebDriverWait(browser, TABLE_WAIT_SECONDS).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'data-table'))
        )
        rows = table.find_elements(By.CLASS_NAME, 'data-table__row')

        for row in rows:
            # The cells of a row are read by position, in the order of the lists above.
            values = row.find_elements(By.CLASS_NAME, 'data-table__value')

            rank.append(values[0].text)
            school_name.append(values[1].text)
            school_type.append(values[2].text)
            early_career_pay.append(values[3].text)
            mid_career_pay.append(values[4].text)
            pct_high_meaning.append(values[5].text)
            pct_stem_degrees.append(values[6].text)
finally:
    # Always close the browser, even when a page fails part-way through the scrape.
    browser.quit()

print('done')


done


#### Make a DataFrame from the scraped data

In [6]:
# Pair each output column label with the list of scraped values that fills it;
# the order of the labels is the column order of the resulting table.
# NOTE(review): pct_high_meaning is not part of this table — confirm that is intentional.
column_labels = [
    'Rank',
    'School Name',
    'School Type',
    'Early Career Pay',
    'Mid Career Pay',
    'Percent of STEM Degrees',
]
column_values = [
    rank,
    school_name,
    school_type,
    early_career_pay,
    mid_career_pay,
    pct_stem_degrees,
]
scraped_table = pd.DataFrame(dict(zip(column_labels, column_values)))
scraped_table

Unnamed: 0,Rank,School Name,School Type,Early Career Pay,Mid Career Pay,Percent of STEM Degrees
0,1,Massachusetts Institute of Technology,"Engineering, Private School, Research University","$110,200","$196,900",66%
1,2,Princeton University,"Engineering, Ivy League, Private School, Resea...","$95,600","$194,100",53%
2,3,United States Naval Academy,"Engineering, Liberal Arts School, Sober School...","$96,700","$187,800",54%
3,4,Harvey Mudd College,"Engineering, Liberal Arts School, Private School","$115,000","$185,900",74%
4,5,Babson College,"Business, Private School","$90,600","$181,400",3%
...,...,...,...,...,...,...
1498,1499,Morris College,"Private School, Religious","$40,000","$68,600",1%
1499,1500,Texas College,"Private School, Religious","$42,700","$68,300",21%
1500,1501,Claflin University,"Liberal Arts School, Private School, Religious","$47,900","$68,000",22%
1501,1501,The Baptist College of Florida,"Private School, Religious","$41,900","$68,000",0%


#### Strip leading and trailing whitespace from the school name column so it can be merged with the other DataFrame

In [11]:
# Remove leading and trailing whitespace from every school name.
name_column = 'School Name'
scraped_table[name_column] = scraped_table[name_column].str.strip()
scraped_table

Unnamed: 0,Rank,School Name,School Type,Early Career Pay,Mid Career Pay,Percent of STEM Degrees
0,1,Massachusetts Institute of Technology,"Engineering, Private School, Research University","$110,200","$196,900",66%
1,2,Princeton University,"Engineering, Ivy League, Private School, Resea...","$95,600","$194,100",53%
2,3,United States Naval Academy,"Engineering, Liberal Arts School, Sober School...","$96,700","$187,800",54%
3,4,Harvey Mudd College,"Engineering, Liberal Arts School, Private School","$115,000","$185,900",74%
4,5,Babson College,"Business, Private School","$90,600","$181,400",3%
...,...,...,...,...,...,...
1498,1499,Morris College,"Private School, Religious","$40,000","$68,600",1%
1499,1500,Texas College,"Private School, Religious","$42,700","$68,300",21%
1500,1501,Claflin University,"Liberal Arts School, Private School, Religious","$47,900","$68,000",22%
1501,1501,The Baptist College of Florida,"Private School, Religious","$41,900","$68,000",0%


In [18]:
# Rename 'Rank' to 'Postgrad Salary Rank' so it can be identified when merged with our other dataset.
# Reassigning the result (rather than inplace=True) makes the lineage of the
# table explicit in this cell.
scraped_table = scraped_table.rename(columns={'Rank': 'Postgrad Salary Rank'})
scraped_table.head()

Unnamed: 0,Postgrad Salary Rank,School Name,School Type,Early Career Pay,Mid Career Pay,Percent of STEM Degrees
0,1,Massachusetts Institute of Technology,"Engineering, Private School, Research University","$110,200","$196,900",66%
1,2,Princeton University,"Engineering, Ivy League, Private School, Resea...","$95,600","$194,100",53%
2,3,United States Naval Academy,"Engineering, Liberal Arts School, Sober School...","$96,700","$187,800",54%
3,4,Harvey Mudd College,"Engineering, Liberal Arts School, Private School","$115,000","$185,900",74%
4,5,Babson College,"Business, Private School","$90,600","$181,400",3%


#### Save to a .csv file to use when merging in the next Jupyter notebook

In [20]:
# Write the table to a CSV file, leaving out the DataFrame index.
# NOTE(review): the filename spelling ("univeristy") is kept unchanged; whatever reads
# this file presumably expects this exact name — verify before renaming.
output_csv_path = 'scraped_univeristy_salary_data.csv'
scraped_table.to_csv(output_csv_path, index=False)