# Developing and Using the Glassdoor scraper

**Scrape Glassdoor for PhD Stipend Information for each University**

* Data Collection: Selenium and Chrome webdriver for webscraping
* Data Extraction: Beautiful Soup is used to parse the HTML; the extracted element values are saved as lists
* Data Storage: Lists are combined into a dataframe for further processing. 
* Data Cleaning: Salaries are reformatted, and values below 2,000 or above 80,000 are excluded.

In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

**Developing the scraper**

In [3]:
# Glassdoor salary-search URL used while developing the scraper
# (presumably the first page of the US "PhD student" results, going by the slug).
url = 'https://www.glassdoor.com/Salaries/us-phd-student-salary-SRCH_IL.0,2_IN1_KO3,14.htm'

In [6]:
# Machine-specific location of the chromedriver executable; adjust as needed.
webdriver_location = r'C:\Users\Lenovo V15\Downloads\chromedriver_win32.exe'
chrome_service = Service(executable_path=webdriver_location)
driver = webdriver.Chrome(service=chrome_service)

# Load the page and grab the rendered HTML. The finally clause guarantees
# the browser is shut down even if the page load raises, so no stray
# Chrome/chromedriver process is left running.
try:
    driver.get(url)
    page_html = driver.page_source
finally:
    driver.quit()

# Parse with BeautifulSoup (page_html is a plain string, so the driver is
# no longer needed at this point)
soup = BeautifulSoup(page_html, 'html.parser')

In [10]:
# Check that the page was scraped properly by extracting one row per
# matching div into a DataFrame.
# NOTE(review): the CSS class strings are copied from the page markup and
# will presumably stop matching if Glassdoor changes its layout.
uni_divs = soup.find_all('div', {'class': 'py d-flex align-items-start align-items-lg-center css-17435dd row'})
university = []
star_rating = []
salary_high = []
salary_low = []
salary_avg = []

for div in uni_divs:
    university.append(div.find('a', class_='css-3m8p33 el6ke058').text)

    # An entry may have no star rating; record 'N/A' instead of crashing
    # with AttributeError on None.
    star_elem = div.find('span', class_="css-mi55ob m-0 css-135bgoo el6ke0510")
    star_rating.append(star_elem.text if star_elem is not None else 'N/A')

    salary_high.append(div.find('div', class_="d-flex flex-column align-items-end css-1qfy6mj e13r6qcv5").text)
    salary_low.append(div.find('div', class_="d-flex flex-column align-items-end css-15o6gsn e13r6qcv5").text)
    salary_avg.append(div.find('h3', class_="m-0 css-be8uqy el6ke056").text)

df = pd.DataFrame({'University': university, 'Star': star_rating, 'Salary_high': salary_high, 'Salary_low': salary_low, 'Salary_avg': salary_avg})
df.head()

Unnamed: 0,University,Star,Salary_high,Salary_low,Salary_avg
0,Stanford University,4.3,$54K,$34K,"$42,688"
1,University of Pennsylvania,4.1,$51K,$33K,"$41,299"
2,Harvard University,4.3,$53K,$34K,"$42,451"
3,Columbia University,4.0,$53K,$35K,"$42,901"
4,Northwestern University,4.2,$49K,$32K,"$39,382"


In [11]:
df.tail()

Unnamed: 0,University,Star,Salary_high,Salary_low,Salary_avg
14,University of California Berkeley,4.3,$55K,$35K,"$44,033"
15,Boston University,4.2,$52K,$33K,"$41,095"
16,UC San Diego,4.1,$52K,$34K,"$41,946"
17,Princeton University,4.5,$52K,$34K,"$41,710"
18,Washington University in St. Louis,4.1,$48K,$30K,"$38,426"


In [None]:
# Scrapes all Glassdoor result pages, saving each page's soup in the list scraped_pages
# With wait times this takes ~13 minutes to load 52 pages containing 1040 universities (20/page) from Glassdoor

import time
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Send a regular desktop-browser user agent with every request.
options = Options()
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")

# Machine-specific location of the chromedriver executable; adjust as needed.
webdriver_location = r'C:\Users\Lenovo V15\Downloads\chromedriver_win32.exe'

# Create a Service object with the webdriver_location
service = Service(executable_path=webdriver_location)

scraped_pages = []
base_url = "https://www.glassdoor.com/Salaries/us-phd-student-salary-SRCH_IL.0,2_IN1_KO3,14"
n_pages = 52  # number of result pages to fetch

# Pass the Service object and the options to the webdriver.Chrome constructor
driver = webdriver.Chrome(service=service, options=options)

# try/finally guarantees the browser is closed even if a request fails
# part-way through; pages fetched so far remain in scraped_pages.
try:
    for page_number in range(1, n_pages + 1):
        # The first page is the plain search URL (no _IP suffix);
        # every later page appends _IP<page number>.
        if page_number == 1:
            url = f'{base_url}.htm'
        else:
            url = f'{base_url}_IP{page_number}.htm'

        print(f"Scraping page {page_number}: {url}")

        driver.get(url)
        scraped_pages.append(BeautifulSoup(driver.page_source, 'html.parser'))

        # Wait for a random time between requests (10 to 20s)
        time.sleep(random.uniform(10, 20))
finally:
    # Close the driver whether or not the loop completed
    driver.quit()

print(f"Number of pages scraped: {len(scraped_pages)}")


**Extract element values from each soup and save into a dataframe**

In [16]:
# Re-running this as an example for GitHub, but only on one soup.
# Wrap it in a list: the extraction loop expects a sequence of page soups,
# and iterating a bare BeautifulSoup object would walk its child nodes
# instead of treating it as a single page.
scraped_pages = [soup]

In [18]:
# Column order of the final table
final_columns = ['University', 'Star', 'Salary_high', 'Salary_low', 'Salary_avg']

# Collect one record per university across all pages and build the
# DataFrame once at the end. This avoids repeatedly concatenating onto an
# initially empty DataFrame, which copies the data on every page.
rows = []

for page_soup in scraped_pages:
    uni_divs = page_soup.find_all('div', {'class': 'py d-flex align-items-start align-items-lg-center css-17435dd row'})

    for div in uni_divs:
        # An entry may have no star rating; store 'N/A' in that case.
        star_elem = div.find('span', class_="css-mi55ob m-0 css-135bgoo el6ke0510")

        rows.append({
            'University': div.find('a', class_='css-3m8p33 el6ke058').text,
            'Star': star_elem.text if star_elem is not None else 'N/A',
            'Salary_high': div.find('div', class_="d-flex flex-column align-items-end css-1qfy6mj e13r6qcv5").text,
            'Salary_low': div.find('div', class_="d-flex flex-column align-items-end css-15o6gsn e13r6qcv5").text,
            'Salary_avg': div.find('h3', class_="m-0 css-be8uqy el6ke056").text,
        })

# Passing columns explicitly keeps the expected (empty) layout even when
# nothing was scraped.
df_final = pd.DataFrame(rows, columns=final_columns)


df_final.shape

(19, 5)

In [19]:
df_final.head()

Unnamed: 0,University,Star,Salary_high,Salary_low,Salary_avg
0,Stanford University,4.3,$54K,$34K,"$42,688"
1,University of Pennsylvania,4.1,$51K,$33K,"$41,299"
2,Harvard University,4.3,$53K,$34K,"$42,451"
3,Columbia University,4.0,$53K,$35K,"$42,901"
4,Northwestern University,4.2,$49K,$32K,"$39,382"


In [89]:
# Save the raw scraped table (values still as scraped strings) to CSV.
# With the default settings the DataFrame index is written as the first, unnamed column.
df_final.to_csv('GlassDoorScrape_Salaries_Start_Large.csv')

In [21]:
# Work on a copy so the cleaning steps below leave df_final unchanged.
df_glassdoor_salary = df_final.copy()

In [22]:
# The cells below show how the scraped salaries are cleaned and filtered later

In [23]:
def _salary_to_int(text):
    """Convert a scraped salary string to a whole-dollar integer.

    Handles both formats seen in the scraped table: thousands with a
    ``K`` suffix (``'$54K'`` -> 54000) and full amounts with thousands
    separators (``'$42,688'`` -> 42688). Decimal values such as
    ``'$54.5K'`` are accepted as well.

    Raises:
        ValueError: if the remaining text is not a number.
    """
    cleaned = text.strip().replace('$', '').replace(',', '')
    if cleaned.upper().endswith('K'):
        # round() rather than int() so float artefacts (e.g. 42299.999...)
        # do not truncate to the wrong dollar amount.
        return round(float(cleaned[:-1]) * 1000)
    return round(float(cleaned))


# Replace the scraped strings in all three salary columns with integers.
for salary_column in ('Salary_high', 'Salary_low', 'Salary_avg'):
    df_glassdoor_salary[salary_column] = df_glassdoor_salary[salary_column].apply(_salary_to_int)

In [None]:
def filter_function(row, columns, min_value, max_value):
    """Return True when every listed column of *row* is within range.

    Args:
        row: Mapping-like object indexed by column name.
        columns: Names of the columns to check.
        min_value: Inclusive lower bound.
        max_value: Inclusive upper bound.

    Returns:
        True if ``min_value <= row[c] <= max_value`` holds for every ``c``
        in *columns* (vacuously True for no columns), otherwise False.
    """
    for name in columns:
        value = row[name]
        # Written as a negated range test so values that compare False to
        # everything (e.g. NaN) are rejected.
        if not (value >= min_value and value <= max_value):
            return False
    return True

# Salary columns that must all fall inside the plausible stipend range.
columns_to_filter = ['Salary_low', 'Salary_avg', 'Salary_high']

# Stipends below 2,000 or above 80,000 would be unrealistic and are excluded
# (both bounds are inclusive).
min_value = 2000
max_value = 80000

# Row-wise check: True for rows whose salary figures are all in range.
keep_mask = df_glassdoor_salary.apply(
    filter_function,
    axis=1,
    args=(columns_to_filter, min_value, max_value),
)
filtered_df = df_glassdoor_salary[keep_mask]

In [None]:
# Save the cleaned, range-filtered salaries to CSV.
# With the default settings the DataFrame index is written as the first, unnamed column.
filtered_df.to_csv("GlassDoorScrape_PhDSalaries_Parsed.csv")