# Job Listing Scraper

This notebook builds a dataset by scraping job search sites, including Glassdoor, Indeed, LinkedIn, and Angel, with Python's Selenium library. It collects the following fields:

1. **Company Name**: Name of the company
2. **Job Title**: The title of the job, e.g., data scientist, junior data scientist, senior data scientist, etc.
3. **Job Description**: Tells us what is expected out of the job title.
4. **Job Requirement**: Required skills
5. **Salary Estimate**: Range of salary and the source.
6. **Benefits**: Benefits offered by the company including medical insurance, equity, etc.
7. **Location**: Location of the job
8. **Size**: Range of the number of employees working in the company
9. **Rating**: It gives the rating of the company
10. **Review**: Employee Reviews
11. **Industry**: Industry of the company
12. **Sector**: Sector in which company works
13. **Revenue**: Total revenue of the company per year
14. **Num Listings**: Total number of job listings for a country 

### Install Packages

In [None]:
#chromedriver - https://sites.google.com/chromium.org/driver/
#pip install -U selenium
#conda install -c conda-forge python-dotenv
#conda install -c conda-forge webdriver-manager
#conda install tqdm
#pip install oschmod

### Add Libraries

In [None]:
import os
import oschmod
import re
import time
import requests
import warnings
import numpy as np
import pandas as pd

from tqdm.auto import tqdm # works for both terminal and notebook
from dotenv import load_dotenv, find_dotenv
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, WebDriverException, ElementClickInterceptedException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import Select

from bs4 import BeautifulSoup

In [None]:
# Suppress every Python warning from this point on so notebook output stays readable.
warnings.filterwarnings('ignore')

### Helper Functions

In [None]:
def driver_setup(url, chrome_var=None, chrome_path=None, linux_unix=True):
    """Start a Chrome WebDriver session and navigate it to ``url``.

    Args:
        url: Page to open once the browser has started.
        chrome_var: Name of an environment variable holding the chromedriver
            path. A ``.env`` file located by ``find_dotenv`` is loaded first.
            Takes precedence over ``chrome_path``.
        chrome_path: Filesystem path to the chromedriver executable.
        linux_unix: Currently unused; kept so existing callers keep working.

    Returns:
        The started ``webdriver.Chrome`` instance, already pointed at ``url``.

    Raises:
        KeyError: If ``chrome_var`` is given but not present in the environment.
    """
    options = Options()
    #options.add_argument("--window-size=1120,1000")
    #"--kiosk")

    # Resolve the driver binary; ``None`` means "let Selenium find one itself".
    driver_path = None
    if chrome_var:
        load_dotenv(find_dotenv())
        driver_path = os.environ[chrome_var]
    elif chrome_path:
        driver_path = chrome_path

    if driver_path is None:
        driver = webdriver.Chrome(options=options)
    else:
        oschmod.set_mode(driver_path, 0o755)  # make sure the binary is readable/executable
        driver = webdriver.Chrome(service=Service(driver_path), options=options)

    try:
        if driver_path is not None:
            driver.maximize_window()
        driver.get(url)
    except Exception:
        # Do not leave an orphaned browser behind when start-up navigation fails.
        driver.quit()
        raise

    return driver

In [None]:
def sign_in(email, password, driver=None):
    """Drive the Glassdoor sign-in form with the given credentials.

    Args:
        email: The e-mail address, or the name of an environment variable
            holding it. Names are only resolved when a ``.env`` file is found;
            values that are not environment variable names are used literally.
        password: The password, or the name of an environment variable
            holding it (resolved like ``email``).
        driver: WebDriver session to operate on. When omitted, the
            module-level ``driver`` is used, as earlier callers relied on.

    Raises:
        NameError: If no ``driver`` is passed and none exists at module level.
    """
    if driver is None:
        try:
            driver = globals()["driver"]
        except KeyError:
            raise NameError(
                "name 'driver' is not defined; pass driver=<webdriver> to sign_in"
            ) from None

    dotenv_path = find_dotenv()
    if dotenv_path:
        load_dotenv(dotenv_path)
        # Fall back to the literal argument when it is not an environment variable name.
        email = os.environ.get(email, email)
        password = os.environ.get(password, password)

    # The e-mail and password steps reuse the same label/input markup.
    label_xpath = "//label[@class='css-w3qhip eb2o9h0']"
    input_xpath = "//input[@class='css-1kmcde e1h5k8h92']"
    wait = WebDriverWait(driver, 10)

    try:
        driver.find_element(By.XPATH, "//a[@class='link ml-xxsm']").click()  # open the Sign In form

        for secret in (email, password):
            wait.until(EC.presence_of_element_located((By.XPATH, label_xpath))).click()
            driver.find_element(By.XPATH, input_xpath).send_keys(secret, Keys.RETURN)

        driver.find_element(By.XPATH, "//button[@name='submit']").click()

    except WebDriverException as exc:
        # Best effort: the page layout varies, so report the interruption instead of raising.
        print("Sign-in flow stopped early ({}); login was not confirmed.".format(type(exc).__name__))
        

In [None]:
def save_file(df, filename='datascience_US', directory_path='../data/'):
    """Write ``df`` to ``<directory_path>/<filename>.csv`` without the index.

    Args:
        df: DataFrame to persist.
        filename: Base name of the CSV file, without the ``.csv`` extension.
        directory_path: Target directory; created (with parents) when missing.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(directory_path, exist_ok=True)
    filepath = os.path.join(directory_path, filename + '.csv')
    df.to_csv(filepath, index=False)

### Glassdoor Job Scraper

In [None]:
# XPath templates shared by the per-listing helpers; ``{}`` is a 1-based position.
_COMPANY_ROW_XPATH = '(//div[@class="d-flex justify-content-start css-daag8o e1pvx6aw2"])[{}]//span[2]'
_SUBRATING_XPATH = '//ul[@class="css-1t3mcrv erz4gkm2"]/span[{}]'
_REVIEW_BLOCK_XPATH = '(//div[@class="css-1sfecah e1vn3ovn1"])[{}]//div'

# A listing count such as "1234", "1,234" or "1.234" (thousands separators only).
_LISTING_COUNT_RE = re.compile(r"\d+(?:[.,]\d{3})*")

# (label printed in verbose mode, key in the scraped record)
_VERBOSE_FIELDS = (
    ("Company Name", "Company Name"),
    ("Job Title", "Job Title"),
    ("Location", "Location"),
    ("Country", "Country"),
    ("Job Description", "Job Description"),
    ("Salary Estimate", "Salary Estimate"),
    ("Avg Salary", "Avg Salary"),
    ("Size", "Size"),
    ("Industry", "Industry"),
    ("Sector", "Sector"),
    ("Revenue", "Revenue"),
    ("Rating", "Rating"),
    ("Recommend To Friend", "Recommend"),
    ("Approve of CEO", "CEO"),
    ("Benefits Rating", "Benefits"),
    ("Career Opportunities", "Opportunities"),
    ("Comp & Benefits", "Comp Benefits"),
    ("Culture & Values", "Culture"),
    ("Senior Managment", "Management"),
    ("Work Life Balance", "WorkLife Balance"),
)


def _has_element(driver, by, value):
    """Return True when at least one element matches the locator."""
    try:
        driver.find_element(by, value)
    except NoSuchElementException:
        return False
    return True


def _text_or_nan(driver, xpath):
    """Return the stripped text of the first match for ``xpath``, or NaN if absent."""
    try:
        return driver.find_element(By.XPATH, xpath).text.strip()
    except NoSuchElementException:
        return np.nan


def _float_or_nan(driver, xpath):
    """Return the text at ``xpath`` parsed as float; NaN if absent or not numeric."""
    try:
        return float(_text_or_nan(driver, xpath))
    except (TypeError, ValueError):
        return np.nan


def _href_or_nan(driver, xpath):
    """Return the ``href`` of the first match for ``xpath``, or NaN if absent."""
    try:
        return driver.find_element(By.XPATH, xpath).get_attribute('href')
    except NoSuchElementException:
        return np.nan


def _dismiss_popups(driver):
    """Close the survey and modal overlays if either is currently shown (best effort)."""
    for xpath in ("//div[@class='qual_x_close']", "//span[@alt='Close']"):
        try:
            driver.find_element(By.XPATH, xpath).click()
        except WebDriverException:
            # Overlay not present or not clickable right now; nothing to dismiss.
            pass


def _click(driver, element):
    """Click ``element``; if an overlay intercepts, dismiss overlays and retry once.

    Returns True when a click went through, False when the retry also failed.
    """
    try:
        element.click()
        return True
    except ElementClickInterceptedException:
        _dismiss_popups(driver)
    try:
        element.click()
        return True
    except WebDriverException:
        return False


def _count_listings(driver):
    """Read the total number of listings from the results header, or NaN if unreadable."""
    for xpath in ("//p[@data-test='jobsCount']", "//h1[@data-test='jobCount-H1title']"):
        try:
            tokens = driver.find_element(By.XPATH, xpath).text.split()
        except NoSuchElementException:
            continue
        if tokens and _LISTING_COUNT_RE.fullmatch(tokens[0]):
            return int(re.sub(r"[.,]", "", tokens[0]))
    return np.nan


def _core_fields(driver, attempts, pause):
    """Read title, location and description of the open listing, retrying while it loads.

    Returns a dict, or None when the fields never became available.
    """
    for _ in range(max(1, attempts)):
        try:
            return {
                "Job Title": driver.find_element(By.XPATH, '//div[@class="css-1j389vi e1tk4kwz2"]').text.strip(),
                "Location": driver.find_element(By.XPATH, '//div[@class="css-56kyx5 e1tk4kwz1"]').text.strip(),
                "Job Description": driver.find_element(By.XPATH, '//div[@class="jobDescriptionContent desc"]').text,
            }
        except WebDriverException:
            time.sleep(pause)
    return None


def _company_overview(driver):
    """Return size/industry/sector/revenue from the company panel (NaN where missing)."""
    rows = {"Size": 1, "Industry": 4, "Sector": 5, "Revenue": 6}
    if not _has_element(driver, By.ID, 'CompanyContainer'):
        return dict.fromkeys(rows, np.nan)
    return {name: _text_or_nan(driver, _COMPANY_ROW_XPATH.format(row)) for name, row in rows.items()}


def _company_ratings(driver):
    """Return overall rating, recommend/CEO text and the five sub-ratings (NaN where missing)."""
    subratings = {"Opportunities": 3, "Comp Benefits": 6, "Culture": 9,
                  "Management": 12, "WorkLife Balance": 15}
    if not _has_element(driver, By.XPATH, '//div[@data-test="company-ratings"]'):
        return dict.fromkeys(["Rating", "Recommend", "CEO", *subratings], np.nan)

    ratings = {
        "Rating": _float_or_nan(driver, '//div[@class="mr-sm css-ey2fjr e1pr2f4f3"]'),
        "Recommend": _text_or_nan(driver, '(//div[@class="d-flex top css-rkhv2t e1o78bat1"])[1]//div[1]'),
        "CEO": _text_or_nan(driver, '//div[@class="css-vkhqai ceoApprove"]'),
    }
    for name, position in subratings.items():
        ratings[name] = _float_or_nan(driver, _SUBRATING_XPATH.format(position))
    return ratings


def _review_paragraphs(driver, block):
    """Return the texts of the ``<p>`` siblings in review block 1 (pros) or 2 (cons), or NaN."""
    try:
        anchor = driver.find_element(By.XPATH, _REVIEW_BLOCK_XPATH.format(block))
    except NoSuchElementException:
        return np.nan
    return [paragraph.text for paragraph in anchor.find_elements(By.XPATH, "following-sibling::p")]


def _employee_reviews(driver):
    """Return pros, cons and the "see all reviews" link (NaN where missing)."""
    if not _has_element(driver, By.ID, 'ReviewsContainer'):
        return {"Pros": np.nan, "Cons": np.nan, "Reviews URL": np.nan}
    return {
        "Pros": _review_paragraphs(driver, 1),
        "Cons": _review_paragraphs(driver, 2),
        "Reviews URL": _href_or_nan(driver, '//a[@class="seeAll pb-0 pt-std css-922fyb euq8tqg0"]'),
    }


def _benefits(driver):
    """Return the benefits rating and the benefits link (NaN where missing)."""
    if not _has_element(driver, By.CLASS_NAME, 'p-std'):
        return {"Benefits": np.nan, "Benefits URL": np.nan}
    return {
        "Benefits": _float_or_nan(driver, '//div[@class="ratingNum mr-sm"]'),
        "Benefits URL": _href_or_nan(driver, '//a[@class="css-b6lfw4 mt-0 p-std d-flex justify-content-center"]'),
    }


def _scrape_open_listing(driver, country, total_listings, attempts, pause):
    """Collect one record for the listing currently shown in the detail pane, or None."""
    core = _core_fields(driver, attempts, pause)
    if core is None:
        return None

    # Only the first token of the average-salary text is kept.
    salary_tokens = _text_or_nan(driver, '//div[@class="css-y2jiyn e2u4hf18"]')
    salary_tokens = salary_tokens.split() if isinstance(salary_tokens, str) else []

    overview = _company_overview(driver)
    ratings = _company_ratings(driver)
    reviews = _employee_reviews(driver)
    benefits = _benefits(driver)

    return {
        # Some listings are posted without a company name.
        "Company Name": _text_or_nan(driver, '//div[@class="css-xuk5ye e1tk4kwz5"]'),
        "Job Title": core["Job Title"],
        "Location": core["Location"],
        "Country": country,
        "Job Description": core["Job Description"],
        "Salary Estimate": _text_or_nan(driver, '//span[@class="css-1hbqxax e1wijj240"]'),
        "Avg Salary": salary_tokens[0] if salary_tokens else np.nan,
        "Size": overview["Size"],
        "Industry": overview["Industry"],
        "Sector": overview["Sector"],
        "Revenue": overview["Revenue"],
        "Rating": ratings["Rating"],
        "Recommend": ratings["Recommend"],
        "CEO": ratings["CEO"],
        "Benefits": benefits["Benefits"],
        "Opportunities": ratings["Opportunities"],
        "Comp Benefits": ratings["Comp Benefits"],
        "Culture": ratings["Culture"],
        "Management": ratings["Management"],
        "WorkLife Balance": ratings["WorkLife Balance"],
        "Pros": reviews["Pros"],
        "Cons": reviews["Cons"],
        "Num Listings": total_listings,
        "Reviews URL": reviews["Reviews URL"],
        "Benefits URL": benefits["Benefits URL"],
    }


def _print_listing(record):
    """Print one scraped record in the verbose debugging layout."""
    for label, key in _VERBOSE_FIELDS:
        value = record[key]
        if key == "Job Description":
            value = value[:500]
        print("{}: {}".format(label, value))
    print("Pros: ", record["Pros"])
    print("Cons: ", record["Cons"])
    print("")
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
    print("")


def get_jobs(url, chrome_var=None, chrome_path=None, num_jobs=30, verbose=False, slp_time=5, max_retries=5):
    """Scrape job listings from a Glassdoor search-results page.

    Args:
        url: Glassdoor search URL to open.
        chrome_var: Environment variable name holding the chromedriver path
            (forwarded to ``driver_setup``).
        chrome_path: Explicit chromedriver path (forwarded to ``driver_setup``).
        num_jobs: Maximum number of listings to scrape; lowered to the number
            of listings reported by the page when that is smaller.
        verbose: Print every scraped record instead of a one-line progress counter.
        slp_time: Seconds to wait for pages (and retried listings) to load.
        max_retries: How many times to re-read a listing's title, location and
            description before skipping that listing.

    Returns:
        A ``pandas.DataFrame`` with one row per scraped listing.
    """
    # Keyword arguments keep chrome_path out of driver_setup's chrome_var slot.
    driver = driver_setup(url, chrome_var=chrome_var, chrome_path=chrome_path)
    jobs = []
    pbar = None

    try:
        time.sleep(slp_time)

        # URLs containing "SRCH_IL" carry the country in their path; otherwise
        # read it from the page's selected-location label.
        if re.search(r"SRCH_IL", url):
            country_match = re.search(r'Job\/(.*?)-data-scientist', url)
            country = country_match.group(1) if country_match else np.nan
        else:
            country = _text_or_nan(driver, '//div[@class="css-m3gjah egu3u860"]/div[@class="selectedLabel"]')

        print("Country:", country)

        total_listings = _count_listings(driver)

        # Comparison with NaN is False, so an unknown total leaves num_jobs untouched.
        if num_jobs > total_listings:
            print("The number of jobs to be scrapped: {} exceeds the number of listings: {}".format(num_jobs, total_listings))
            num_jobs = total_listings
            print("The number of jobs has been updated to reflect the number of listings")
            print("")

        print("Total number of job listings: {}, number of jobs to be scraped: {}".format(total_listings, num_jobs))
        print("")

        pbar = tqdm(total=num_jobs)  # created after num_jobs is final so the total is accurate

        while len(jobs) < num_jobs:

            time.sleep(slp_time)
            time.sleep(.1)

            for listing in driver.find_elements(By.CLASS_NAME, "react-job-listing"):

                if len(jobs) >= num_jobs:
                    print("Scraping completed, scraped {} of {} jobs".format(len(jobs), num_jobs))
                    break

                if not _click(driver, listing):
                    continue  # still covered by an overlay; move on to the next listing
                time.sleep(2)

                _dismiss_popups(driver)

                record = _scrape_open_listing(driver, country, total_listings, max_retries, slp_time)
                if record is None:
                    continue  # detail pane never loaded

                jobs.append(record)
                pbar.update(1)

                if verbose:
                    _print_listing(record)
                else:
                    print("Scraped {} out of {} job listings".format(len(jobs), num_jobs), end='\r')

            if len(jobs) >= num_jobs:
                break

            # Move to the next results page; stop when there is none to go to.
            try:
                next_button = driver.find_element(By.XPATH, '//button[@data-test="pagination-next"]')
            except NoSuchElementException:
                next_button = None

            # A disabled button would leave us on the same page, rescraping the same listings.
            if next_button is None or not next_button.is_enabled() or not _click(driver, next_button):
                print("Scraping completed, scraped {}, out of {} job listings.".format(len(jobs), num_jobs))
                break

    finally:
        if pbar is not None:
            pbar.close()
        driver.quit()  # ends the whole browser session, even when scraping failed

    return pd.DataFrame(jobs)
        

In [None]:
# Name of the environment variable that holds the chromedriver path (passed as chrome_var).
env_var = "CHROME_DRIVER"
# Glassdoor search-results page to scrape.
url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime&remoteWorkType=1&sortBy=date_desc'

# Scrape with get_jobs' default limits; the bare `df` displays the result in the notebook.
df = get_jobs(url=url, chrome_var=env_var)
df

In [None]:
# Alternative company review pages tried during development:
#url = 'https://www.glassdoor.com/Reviews/Univision-Reviews-E6046.htm'
#url = 'https://www.glassdoor.ca/Reviews/Indeed-Reviews-E100561.htm'
#url = 'https://www.glassdoor.com/Reviews/Salesforce-Reviews-E11159.htm'
url = 'https://www.glassdoor.com.ar/Evaluaciones/Chase-Evaluaciones-E690765.htm'
# These are environment variable NAMES, not the credentials themselves:
# sign_in() looks them up in os.environ when a .env file is found.
email = "GLASSDOOR_EMAIL"
password = "GLASSDOOR_PASSWORD"

# Container holding one button per demographic group for the selected category.
_DEMOGRAPHIC_OPTIONS_XPATH = "//div[@class='d-flex flex-wrap demographicOptions']"

# Dropdown option value -> output columns, in button order (button[1], button[2], ...).
_DEMOGRAPHIC_COLUMNS = {
    'raceEthnicity': ("Asian", "Black", "Hispanic/Latinx", "Indigenous",
                      "Middle Eastern", "White", "Other"),
    'gender': ("Men", "Women", "Transexual/Non-Binary"),
    'sexualOrientation': ("Heterosexual", "lgbtq"),
    'disability': ("Non-Disabled", "Disabled"),
    'parentOrCaregiver': ("Caregiver", "Non-Caregiver", "Parents"),
}
# Any option value not listed above is treated as the veteran-status category.
_VETERAN_COLUMNS = ("Non-Veteran", "Veterans")

# Columns whose verbose label differs from the column name.
_DEMOGRAPHIC_VERBOSE_LABELS = {"Hispanic/Latinx": "Hispanic", "Middle Eastern": "MiddleEastern"}


def _demographic_rating(driver, position):
    """Return the second-to-last token of demographic button ``position``, or NaN."""
    xpath = "{}/button[{}]".format(_DEMOGRAPHIC_OPTIONS_XPATH, position)
    try:
        tokens = driver.find_element(By.XPATH, xpath).text.split()
    except NoSuchElementException:
        return np.nan
    return tokens[-2] if len(tokens) >= 2 else np.nan


def get_reviews(url, email, password, verbose=False, chrome_var="CHROME_DRIVER"):
    """Scrape the per-demographic ratings shown on a Glassdoor company reviews page.

    Args:
        url: Company reviews page to open.
        email: Passed through to ``sign_in`` when a login wall is shown.
        password: Passed through to ``sign_in`` when a login wall is shown.
        verbose: Print every collected value.
        chrome_var: Environment variable name holding the chromedriver path.

    Returns:
        A one-row ``pandas.DataFrame`` with one column per demographic group.
        Groups that could not be read are NaN; values are kept as the raw
        text token taken from the page.
    """
    # Start with every column as NaN so missing categories never leave a gap.
    row = {column: np.nan
           for columns in (*_DEMOGRAPHIC_COLUMNS.values(), _VETERAN_COLUMNS)
           for column in columns}

    driver = driver_setup(url, chrome_var=chrome_var)

    try:
        time.sleep(10)

        try:
            driver.find_element(By.XPATH, "//div[@class='gdUserLogin authInlineContainer gdGrid bg-white']")
        except NoSuchElementException:
            pass  # no login wall on this page
        else:
            # NOTE(review): sign_in() as written resolves `driver` from module scope,
            # not from this function; confirm a module-level `driver` bound to this
            # session exists before relying on the login step.
            sign_in(email, password)

        # The survey pop-up can appear (or reappear) late, so try to close it twice.
        for _ in range(2):
            try:
                driver.find_element(By.XPATH, "//div[@class='qual_x_close']").click()
            except NoSuchElementException:
                pass
            time.sleep(5)

        try:
            driver.find_element(By.XPATH, _DEMOGRAPHIC_OPTIONS_XPATH)

            # The <select> is hidden by the page styling; reveal it so Selenium can drive it.
            dropdown = driver.find_element(By.XPATH, "//div[@class='mb-xsm e1hgsnla1 css-wcay7z ew8s0qn0']/select")
            driver.execute_script("arguments[0].style.display = 'block';", dropdown)
            time.sleep(2)

            option_values = [option.get_attribute('value')
                             for option in dropdown.find_elements(By.XPATH, "option")]
            selector = Select(dropdown)

            for option in option_values:
                time.sleep(.1)
                selector.select_by_value(option)  # switch the buttons to this category
                time.sleep(2)

                columns = _DEMOGRAPHIC_COLUMNS.get(option, _VETERAN_COLUMNS)
                for position, column in enumerate(columns, start=1):
                    row[column] = _demographic_rating(driver, position)

        except NoSuchElementException:
            # No demographics widget on this page; columns not yet read stay NaN.
            pass

    finally:
        driver.quit()

    if verbose:
        for column, value in row.items():
            print("{}:".format(_DEMOGRAPHIC_VERBOSE_LABELS.get(column, column)), value)

    return pd.DataFrame([row])


In [None]:
# Glassdoor "data scientist" search pages, one per country/locale site.
# The trailing glassdoor.com entries carry the country in the URL path and
# contain "SRCH_IL", which get_jobs() uses to read the country from the URL
# instead of from the page.
urls = ['https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime&remoteWorkType=1&sortBy=date_desc',
        'https://www.glassdoor.com.ar/Empleo/data-scientist-empleos-SRCH_KO0,14.htm?jobType=fulltime', 
        'https://www.glassdoor.com.au/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://nl.glassdoor.be/Vacature/data-scientist-vacatures-SRCH_KO0,14.htm?jobType=fulltime',
        'https://fr.glassdoor.be/Emploi/data-scientist-emplois-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.com.br/Vaga/data-scientist-vagas-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.ca/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://fr.glassdoor.ca/Emploi/data-scientist-emplois-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.de/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.es/Empleo/data-scientist-empleos-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.fr/Emploi/data-scientist-emplois-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.com.hk/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.co.in/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.ie/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.it/Lavoro/data-scientist-lavori-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.com.mx/Empleo/data-scientist-empleos-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.nl/Vacature/data-scientist-vacatures-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.co.nz/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.at/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://de.glassdoor.ch/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.sg/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://fr.glassdoor.ch/Emploi/data-scientist-emplois-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.co.uk/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.com/Job/south-africa-data-scientist-jobs-SRCH_IL.0,12_IN211_KO13,27.htm?jobType=fulltime',
        'https://www.glassdoor.com/Job/uruguay-data-scientist-jobs-SRCH_IL.0,7_IN246_KO8,22.htm?jobType=fulltime',
        'https://www.glassdoor.com/Job/mexico-data-scientist-jobs-SRCH_IL.0,6_IN169_KO7,21.htm?jobType=fulltime',
        'https://www.glassdoor.com/Job/costa-rica-data-scientist-jobs-SRCH_IL.0,10_IN57_KO11,25.htm?jobType=fulltime',
        'https://www.glassdoor.com/Job/chile-data-scientist-jobs-SRCH_IL.0,5_IN49_KO6,20.htm?jobType=fulltime',
        'https://www.glassdoor.com/Job/ecuador-data-scientist-jobs-SRCH_IL.0,7_IN68_KO8,22.htm',
        'https://www.glassdoor.com/Job/nigeria-data-scientist-jobs-SRCH_IL.0,7_IN177_KO8,22.htm',
        'https://www.glassdoor.com/Job/egypt-data-scientist-jobs-SRCH_IL.0,5_IN69_KO6,20.htm',
        'https://www.glassdoor.com/Job/japan-data-scientist-jobs-SRCH_IL.0,5_IN123_KO6,20.htm',
        'https://www.glassdoor.com/Job/china-data-scientist-jobs-SRCH_IL.0,5_IN48_KO6,20.htm?jobType=fulltime',
        'https://www.glassdoor.com/Job/south-korea-data-scientist-jobs-SRCH_IL.0,11_IN135_KO12,26.htm?jobType=fulltime']


In [None]:
# Experiment: open each listing and, when the "create account" modal appears,
# sign in through it with the e-mail option.
url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime&remoteWorkType=1&sortBy=date_desc'
# NOTE(review): other cells use "CHROME_DRIVER" (upper case) for this variable;
# confirm which spelling the .env file actually defines.
driver = driver_setup(url, chrome_var="chrome_driver")

time.sleep(5)

# Read the credentials from the environment (optionally populated from a .env file)
# instead of referring to undefined GLASSDOOR_EMAIL / GLASSDOOR_PASSWORD names.
load_dotenv(find_dotenv())
glassdoor_email = os.environ["GLASSDOOR_EMAIL"]
glassdoor_password = os.environ["GLASSDOOR_PASSWORD"]

label_xpath = "//label[@class='css-w3qhip eb2o9h0']"
input_xpath = "//input[@class='css-1kmcde e1h5k8h92']"

job_listings = driver.find_elements(By.CLASS_NAME, "react-job-listing")

for listing in job_listings:

    listing.click()
    time.sleep(2)

    try:
        driver.find_element(By.XPATH, "//button[@class='jaCreateAccountEmailSignUpButton']").click()

        # Enter email
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, label_xpath))).click()
        email_input = driver.find_element(By.XPATH, input_xpath)
        email_input.send_keys(glassdoor_email, Keys.RETURN)

        # Enter password
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, label_xpath))).click()
        password_input = driver.find_element(By.XPATH, input_xpath)
        password_input.send_keys(glassdoor_password, Keys.RETURN)

    except WebDriverException:
        # The modal is not shown for every listing; carry on with the next one.
        pass

# Disabled alternative: click the plain "Sign In" button instead.
#
# signIn_button = driver.find_element(By.XPATH, '//button[text()="Sign In"]')
# driver.implicitly_wait(5)
# signIn_button.click()
    

In [None]:
"""

# find ratings based on demographics
try:
    demographics = driver.find_elements(By.XPATH, "//div[@class='d-flex flex-wrap demographicOptions']/button")

    for element in demographics:

        ethnicity = element.text.split()[0]
        rating = element.text.split()[-2]

        if rating == '—':
            rating = np.nan
        else:
            rating = float(rating)

        if ethnicity == 'Asian':
            asian = rating
        elif ethnicity == 'Black':
            black = rating
        elif ethnicity == 'Hispanic':
            hispanic = rating
        elif ethnicity == 'Indigenous':
            indigenous = rating
        elif ethnicity == 'Middle':
            middleEastern = rating
        elif ethnicity == 'White':
            white = rating
        else:
            other = rating
            
except NoSuchElementException:
    asian = np.nan
    black = np.nan
    hispanic = np.nan
    indigenous = np.nan
    middleEastern = np.nan
    white = np.nan
    other = np.nan

print("Asian:", asian)
print("Black:", black)
print("Hispanic:", hispanic)
print("Indigenous:", indigenous)
print("MiddleEastern:", middleEastern)
print("White:", white)
print("Other:", other)

"""

In [None]:
"""
try:
    driver.find_element(By.XPATH, "//div[@class='d-flex flex-wrap demographicOptions']/button[1]").text.split()

except NoSuchElementException:
    asian = np.nan

try:
    black = driver.find_element(By.XPATH, "//div[@class='d-flex flex-wrap demographicOptions']/button[2]").text.split()
except NoSuchElementException:
    black = np.nan

try:
    hispanic = driver.find_element(By.XPATH, "//div[@class='d-flex flex-wrap demographicOptions']/button[3]").text.split()
except NoSuchElementException:
    hispanic = np.nan

try:
    indigenous = driver.find_element(By.XPATH, "//div[@class='d-flex flex-wrap demographicOptions']/button[4]").text.split()
except NoSuchElementException:
    indigenous = np.nan

try:
    middleEastern = driver.find_element(By.XPATH, "//div[@class='d-flex flex-wrap demographicOptions']/button[5]").text.split()
except NoSuchElementException:
    middleEastern = np.nan

try:
    white = driver.find_element(By.XPATH, "//div[@class='d-flex flex-wrap demographicOptions']/button[6]").text.split()
except NoSuchElementException:
    white = np.nan

try:
    islander = driver.find_element(By.XPATH, "//div[@class='d-flex flex-wrap demographicOptions']/button[7]").text.split()
except NoSuchElementException:
    islander = np.nan
"""

In [None]:
# NOTE(review): lower-case here, while other cells use "CHROME_DRIVER"; confirm which
# spelling the environment actually defines.
env_var = "chrome_driver"
# Scrape up to 5 listings from every URL in `urls` and stack the results into one frame.
dfs = [get_jobs(url=url, chrome_var=env_var, num_jobs=5) for url in urls]
df = pd.concat(dfs, ignore_index=True)
df

### Glassdoor Scraper

In [None]:
# Browser-like User-Agent used for the plain `requests` download below.
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}

In [None]:
# Pass the dict by keyword: the second positional parameter of requests.get is
# `params`, which would send the User-Agent as a query string instead of a header.
response = requests.get(url, headers=headers)
response.status_code

In [None]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(response.content, 'html.parser')
#soup

In [None]:
# First <div class="paginationFooter"> in the page. find_all() replaces the
# legacy findAll() alias (deprecated in recent Beautiful Soup releases) and
# matches the find_all() calls used in the other cells.
pagination = soup.find_all("div", {"class": "paginationFooter"})[0]

In [None]:
# Reduce the footer tag to the whitespace-separated tokens of its text.
pagination = pagination.text.strip().split()

In [None]:
# The token at index 1 is read as the current page and the last token as the
# page count; both are converted to int.
page_num, total_pages = int(pagination[1]), int(pagination[-1])

In [None]:
"""
for i in range(page_num, total_pages+1):
    if page_num > 1:
       url = f"https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14_IP{page_num}.htm?seniorityType=entrylevel&includeNoSalaryJobs=true"

"""

In [None]:
# Select the <div> elements carrying the main-section class string.
divs = soup.find_all(
    "div",
    class_='module p-0 job-search-key-kxun6g exy0tjh2',
)
# divs  # uncomment to inspect

In [None]:
# Job listings: the 'react-job-listing' <li> items inside the first matched div.
listings = divs[0].find_all(
    'li',
    class_='react-job-listing',
)
# len(listings)  # uncomment to check how many were found

In [None]:
# Gather every <div> inside each listing. find_all is required here because
# find would return only the first match.
divs = [listing.find_all('div') for listing in listings]
# divs[0]  # uncomment to inspect the first listing's divs

In [None]:
# Each entry of `divs` is a list of tags rather than a tag, so it is indexed
# instead of being searched with find.
left_col = [cells[0] for cells in divs]   # first div of every listing
right_col = [cells[1] for cells in divs]  # second div of every listing

In [None]:
# All <a> tags found inside each right-hand div.
anchors = [column.find_all('a') for column in right_col]
# anchors  # uncomment to inspect

##### Company Names

In [None]:
# Company name: stripped text of the <span> inside each listing's first anchor.
companies = [links[0].find('span').text.strip() for links in anchors]
companies[0]

##### Job Titles

In [None]:
# Job title: stripped text of the <span> inside each listing's second anchor.
titles = [links[1].find('span').text.strip() for links in anchors]
titles[0]

##### Location

In [None]:
# Location: stripped text of the <span> inside the location div of each
# right-hand column.
locations = [
    column.find('div', class_='d-flex flex-wrap job-search-key-1m2z0go e1rrn5ka2')
    .find('span')
    .text.strip()
    for column in right_col
]
locations[0]

##### Salary

In [None]:
# Two independent empty lists, filled by the salary and rating loops.
salaries, ratings = [], []

In [None]:
# Build one salary entry per listing: the first token for short salary texts,
# "<start>-<max>" for longer ones, or NaN when no salary can be read.
# The list is reset first so re-running this cell does not append duplicates.
salaries = []

for item in right_col:
    try:
        salary = item.find('div', class_='css-1buaf54 pr-xxsm')
        salary = salary.find('span', class_='job-search-key-1hbqxax e1wijj240').text.strip()
        salary = salary.split()

        if len(salary) <= 4:
            # Four tokens or fewer: keep only the first token.
            salary = salary[0]
        else:
            # Longer text: join token 0 and token 2 as a range.
            start_sal = salary[0]
            max_sal = salary[2]
            salary = start_sal + '-' + max_sal
    except (AttributeError, IndexError):
        # AttributeError: a find() returned None (no salary element);
        # IndexError: the salary text contained no tokens.
        salary = np.nan

    salaries.append(salary)

salaries[0]

##### Ratings

In [None]:
# Build one rating per listing as a float, or NaN when none can be read.
# The list is reset first so re-running this cell does not append duplicates.
ratings = []

for item in left_col:
    try:
        rating = float(item.find('span', class_='job-search-key-srfzj0 e1cjmv6j0').text.strip())
    except (AttributeError, ValueError):
        # AttributeError: find() returned None (no rating span);
        # ValueError: the span text is not a number.
        rating = np.nan

    ratings.append(rating)

ratings[0]

##### Job Listing Page

In [None]:
# href attribute of the first anchor in each listing (None when absent).
links = [row[0].get("href") for row in anchors]

In [None]:
# Prefix each href with the Glassdoor origin.
urls = [f"https://www.glassdoor.com{href}" for href in links]

In [None]:
# Display the first assembled URL as the cell output.
urls[0]

In [None]:
# Use the first listing URL for the single-page request in the next cell.
jobLink = urls[0]

In [None]:
# Fetch the job-listing page. `headers` must be passed by keyword: the second
# positional parameter of requests.get() is `params`, so passing the dict
# positionally would append the user-agent to the query string instead of
# sending it as an HTTP header.
response = requests.get(jobLink, headers=headers)
response.status_code

In [None]:
# Hard-coded Glassdoor job-listing URL (with its full query string) stored in `link`.
link = 'https://www.glassdoor.com/job-listing/junior-sas-data-scientist-424-vezita-tech-JV_KO0,29_KE30,41.htm?jl=1008008033583&pos=101&ao=1110586&s=58&guid=0000018202dbf8f9920f2a5cca6a9cc3&src=GD_JOB_AD&t=SR&vt=w&ea=1&cs=1_abea6e2e&cb=1657905347193&jobListingId=1008008033583&cpc=654405A9B1E0A9F5&jrtk=3-0-1g81dnu9rk255801-1g81dnuahghre800-e5f5f92b99790728--6NYlbfkN0A9aFbeqbFpDTCoiHOd6k0wi_YQM7kD-1BJ08Zr1fUkZoDqNJGBVgd-vao9K1qY82N8I1kgImMFzYDAIglGvPLDd_djxuszz8IamPMPcX9as8QrYlFAfWUSEoUwZprhpr8YrJgAbGOJSa943B9zmKGu-lnmily_Vm49BOb2PIn7RfL5JdE5RJMYl4a4fOddmkGLqkobe84SNyejQcQhQjcFNbQpZNv5rzmr7e1JgAowQwQYBG4bbzRgYV0P_JCDy1Jazne5I0HOOD7GQL-5-aHhJieNzuA0BZwASplqp85J7rniTYqXL-CtXVMCZy3veuqlqALVVBNrIGx5nKfq75zgi41wNv1eqaC4acP-dwslixVtnnXUlBvYPrTBRohB4ZabC6Tqnn2hOk7Wtb5VmQPRi3DxrfPO5k-Z-BXsi6UuZs1di84rg1GI0di4MWn60GffS6wFi4pq4DnQu6OFWyYFMpDXWX0eH9AwqOfUDVtCmG8GWOJxwPSuF4JOugPon6v-76TtmoJc406kk32jKkKI&ctt=1657911504446'

In [None]:
# Fetch the hard-coded listing URL. `headers` must be passed by keyword: the
# second positional parameter of requests.get() is `params`, so passing the
# dict positionally would append the user-agent to the query string instead
# of sending it as an HTTP header.
response = requests.get(link, headers=headers)
response.status_code

In [None]:
# check for missing data

# Sanity check: print the length of every scraped list so a mismatch
# (i.e. missing data in one of them) is visible at a glance.
print(
    'num companies:', len(companies),
    'num titles:', len(titles),
    'num locations:', len(locations),
    'num salaries:', len(salaries),
    'num ratings:', len(ratings),
    'num URLS:', len(urls),
)
