# Job Listing Scraper

Datasets created by scraping job search sites, including Glassdoor, with Python's Selenium library. The scraper collects the following fields:

1. **company_name**: Name of the company
2. **job_title**: The title of the job, e.g. data scientist, junior data scientist, senior data scientist, etc.
3. **location**: Location of the job
4. **job_desc**: Tells us what is expected out of the job title
5. **salary_est**: Company's salary range pertaining to job title
6. **avg_salary**: Average salary paid by the company for job title
7. **size**: Range of the number of employees working at the company
8. **industry**: Industry that company belongs to
9. **sector**: Sector that the company belongs to
10. **revenue**: Total revenue of the company per year
11. **rating**: Average company rating 
12. **recommend**: Percentage of employees that recommend working at the company
13. **benefits**: Average benefits rating
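14. **opportunities**: Career opportunities rating
15. **company_benefits**: Compensation and benefits rating
16. **culture**: Culture and values rating
17. **management**: Senior management rating
18. **worklife**: Work-life balance rating
19. **pros**: Pros listed in employee reviews
20. **cons**: Cons listed in employee reviews
21. **num_listings**: Total number of job listings found for the search
22. **reviews_url**: Link to the company's reviews page
23. **benefits_url**: Link to the company's benefits page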

### Install Packages

In [None]:
#chromedriver - https://sites.google.com/chromium.org/driver/
#pip install -U selenium
#conda install -c conda-forge python-dotenv
#conda install -c conda-forge webdriver-manager
#conda install tqdm
#pip install oschmod

### Add Libraries

In [1]:
import os
import oschmod
import re
import time
import json
import glob
import requests
import warnings
import numpy as np
import pandas as pd

from datetime import date
from tqdm.auto import tqdm # works for both terminal and notebook
from dotenv import load_dotenv, find_dotenv
from IPython.display import display
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, WebDriverException, ElementClickInterceptedException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import Select

In [2]:
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

### Helper Functions

In [10]:
def driver_setup(url, chrome_var=None, chrome_path=None, linux_unix=True):
    """Start a Chrome WebDriver, maximise its window and open ``url``.

    The chromedriver binary is resolved in this order:

    1. ``chrome_var``: name of an environment variable (a ``.env`` file is
       loaded first, if one is found) whose value is the path to the binary.
    2. ``chrome_path``: explicit path to the binary.
    3. Neither supplied (``None`` or empty): the binary is provided by
       ``ChromeDriverManager().install()``.

    Args:
        url: Page to open once the browser is running.
        chrome_var: Environment-variable name holding the chromedriver path.
        chrome_path: Filesystem path to the chromedriver binary.
        linux_unix: Not used by this function; kept so existing calls that
            pass it keep working.

    Returns:
        The running ``webdriver.Chrome`` instance with ``url`` loaded.

    Raises:
        KeyError: ``chrome_var`` is given but no such environment variable
            is set.
    """
    options = Options()

    # Truthiness (rather than ``is None``) so that empty strings fall through
    # to the managed driver instead of leaving the path undefined.
    if chrome_var:
        load_dotenv(find_dotenv())  # path may live in a .env file
        driver_binary = os.environ[chrome_var]
    elif chrome_path:
        driver_binary = chrome_path
    else:
        driver_binary = None

    if driver_binary is None:
        service = Service(ChromeDriverManager().install())
    else:
        # rwx for the owner, r-x for group/others, so the binary can be run
        oschmod.set_mode(driver_binary, 0o755)
        service = Service(driver_binary)

    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.maximize_window()
        driver.get(url)
    except Exception:
        # Do not leave an orphaned browser behind when the page cannot load.
        driver.quit()
        raise

    return driver

In [3]:
def sign_in(email, password, driver=None):
    """Fill in and submit the login form in the given browser session.

    When a ``.env`` file is found, ``email`` and ``password`` are treated as
    the *names* of environment variables and replaced by their values;
    otherwise they are typed into the form as given.

    Args:
        email: Environment-variable name (or literal e-mail, see above).
        password: Environment-variable name (or literal password, see above).
        driver: WebDriver showing the login prompt. When omitted, the
            module-level name ``driver`` is used, which is what this function
            relied on before the parameter existed.

    Raises:
        NameError: No ``driver`` was passed and no module-level ``driver``
            exists.
        KeyError: A ``.env`` file was found but ``email`` / ``password`` are
            not set as environment variables.
    """
    if driver is None:
        driver = globals().get("driver")
        if driver is None:
            raise NameError(
                "sign_in needs a WebDriver: pass driver=... or define a module-level 'driver'"
            )

    dotenv_path = find_dotenv()
    if dotenv_path:
        load_dotenv(dotenv_path)
        email = os.environ[email]
        password = os.environ[password]

    # The e-mail step and the password step use the same label/input classes.
    label_locator = (By.XPATH, "//label[@class='css-w3qhip eb2o9h0']")
    input_xpath = "//input[@class='css-1kmcde e1h5k8h92']"

    try:
        driver.find_element(By.XPATH, "//a[@class='link ml-xxsm']").click()  # "Sign In" link

        # Enter email
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(label_locator)).click()
        driver.find_element(By.XPATH, input_xpath).send_keys(email, Keys.RETURN)

        # Enter password
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(label_locator)).click()
        driver.find_element(By.XPATH, input_xpath).send_keys(password, Keys.RETURN)

        driver.find_element(By.XPATH, "//button[@name='submit']").click()

    except WebDriverException as err:
        # Best effort: the caller carries on either way, but say that a step
        # of the form did not go through instead of hiding it.
        print("Sign-in flow interrupted ({}); continuing.".format(type(err).__name__))
        

In [4]:
def save_file(file, filename, directory_path):
    """Write ``file`` to ``<directory_path>/<filename>_<YYYY-MM-DD>.csv``.

    Args:
        file: Object exposing ``to_csv(path, index=False)`` (the callers in
            this file pass a pandas DataFrame).
        filename: Base name of the file; today's date and ``.csv`` are
            appended.
        directory_path: Target directory, with or without a trailing path
            separator. It is created when it does not exist yet.
    """
    today = date.today().strftime("%Y-%m-%d")
    dated_name = '{}_{}.csv'.format(filename, today)

    # An empty directory means "current directory": nothing to create.
    if directory_path and not os.path.exists(directory_path):
        print("Creating Directory...")
        os.makedirs(directory_path, exist_ok=True)

    print("Saving File...")

    # os.path.join inserts the separator only when it is missing, so both
    # '../data/listings' and '../data/listings/' resolve to the same file.
    filepath = os.path.join(directory_path, dated_name)

    file.to_csv(filepath, index=False)

    print("File Saved:", dated_name)
    print("Saved In:", filepath)

### Get Jobs Listings

In [2]:
# Locators whose only varying part is a positional index.
_COMPANY_INFO_XPATH = '(//div[@class="d-flex justify-content-start css-rmzuhb e1pvx6aw0"])[{}]//span[2]'
_RATING_BREAKDOWN_XPATH = '//ul[@class="css-38kpu8 erz4gkm0"]/span[{}]'


def _has_element(driver, by, locator):
    """Return True when at least one element matches ``(by, locator)``."""
    try:
        driver.find_element(by, locator)
    except NoSuchElementException:
        return False
    return True


def _click_if_present(driver, xpath):
    """Click the first element matching ``xpath``; do nothing when none exists."""
    try:
        driver.find_element(By.XPATH, xpath).click()
    except NoSuchElementException:
        pass


def _text_or_nan(driver, xpath):
    """Return the stripped text of the first match for ``xpath``, or NaN."""
    try:
        return driver.find_element(By.XPATH, xpath).text.strip()
    except NoSuchElementException:
        return np.nan


def _float_or_nan(driver, xpath):
    """Return the first match's text as a float; NaN if missing or not numeric."""
    try:
        return float(driver.find_element(By.XPATH, xpath).text.strip())
    except (NoSuchElementException, ValueError):
        return np.nan


def _href_or_nan(driver, xpath):
    """Return the ``href`` attribute of the first match for ``xpath``, or NaN."""
    try:
        return driver.find_element(By.XPATH, xpath).get_attribute('href')
    except NoSuchElementException:
        return np.nan


def _sibling_paragraphs_or_nan(driver, xpath):
    """Return the texts of the ``<p>`` siblings following the first match, or NaN."""
    try:
        anchor = driver.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return np.nan
    return [p.text for p in anchor.find_elements(By.XPATH, "following-sibling::p")]


def _scrape_open_listing(driver, country, total_listings, slp_time):
    """Read every field of the listing currently open in the detail pane.

    Returns a dict whose keys (and key order) are the columns of the final
    DataFrame. Fields that cannot be found are NaN.
    """
    # Title, location and description are mandatory: keep polling until the
    # detail pane has rendered them. Only WebDriver errors trigger a retry,
    # so Ctrl-C still interrupts the wait.
    while True:
        try:
            job_title = driver.find_element(By.XPATH, '//div[@class="css-1vg6q84 e1tk4kwz4"]').text.strip()
            location = driver.find_element(By.XPATH, '//div[@class="css-56kyx5 e1tk4kwz5"]').text.strip()
            job_description = driver.find_element(By.XPATH, '//div[@class="jobDescriptionContent desc"]').text
            break
        except WebDriverException:
            time.sleep(slp_time)

    # Some listings are posted without a company name.
    try:
        company_name = driver.find_element(By.XPATH, '(//div[@class="css-87uc0g e1tk4kwz1"])').text.strip()
    except WebDriverException:
        company_name = np.nan

    salary_range = _text_or_nan(driver, '//div[@class="css-w04er4 e1tk4kwz6"]/div[4]/span')

    # Only the first whitespace-separated token of the average-salary text is kept.
    salary_avg = _text_or_nan(driver, '//div[@class="css-1bluz6i e2u4hf13"]')
    if isinstance(salary_avg, str):
        tokens = salary_avg.split()
        salary_avg = tokens[0] if tokens else np.nan

    # Company overview
    size = industry = sector = revenue = np.nan
    if _has_element(driver, By.ID, 'CompanyContainer'):
        size = _text_or_nan(driver, _COMPANY_INFO_XPATH.format(1))
        industry = _text_or_nan(driver, _COMPANY_INFO_XPATH.format(4))
        sector = _text_or_nan(driver, _COMPANY_INFO_XPATH.format(5))
        revenue = _text_or_nan(driver, _COMPANY_INFO_XPATH.format(6))

    # Company ratings
    rating = recommend = ceo = np.nan
    opportunities = comp_benefits = culture = management = worklife = np.nan
    if _has_element(driver, By.XPATH, '//div[@data-test="company-ratings"]'):
        rating = _float_or_nan(driver, '//div[@class="mr-sm css-ey2fjr e1pr2f4f2"]')
        recommend = _text_or_nan(driver, '(//div[@class="d-flex top css-1efnr4n e1o78bat2"])[1]//div[1]')
        ceo = _text_or_nan(driver, '//div[@class="css-ztsow4 ceoApprove"]')
        opportunities = _float_or_nan(driver, _RATING_BREAKDOWN_XPATH.format(3))
        comp_benefits = _float_or_nan(driver, _RATING_BREAKDOWN_XPATH.format(6))
        culture = _float_or_nan(driver, _RATING_BREAKDOWN_XPATH.format(9))
        management = _float_or_nan(driver, _RATING_BREAKDOWN_XPATH.format(12))
        worklife = _float_or_nan(driver, _RATING_BREAKDOWN_XPATH.format(15))

    # Employee reviews
    pros = cons = reviews_url = np.nan
    if _has_element(driver, By.ID, 'ReviewsContainer'):
        pros = _sibling_paragraphs_or_nan(driver, '(//div[@class="css-r14ud0 e1vn3ovn4"])[1]//div')
        # NOTE(review): cons is read with exactly the same locator as pros, so
        # both columns receive the same text; confirm the intended index.
        cons = _sibling_paragraphs_or_nan(driver, '(//div[@class="css-r14ud0 e1vn3ovn4"])[1]//div')
        reviews_url = _href_or_nan(driver, '//a[@class="seeAll pb-0 pt-std css-922fyb euq8tqg0"]')

    # Benefits rating and link; both default to NaN when the section is absent.
    benefits_rating = benefits_url = np.nan
    if _has_element(driver, By.CLASS_NAME, 'p-std'):
        benefits_rating = _float_or_nan(driver, '//div[@class="ratingNum mr-sm"]')
        benefits_url = _href_or_nan(driver, '//a[@class="css-zuof7g mt-0 p-std d-flex justify-content-center"]')

    return {"company_name": company_name,
            "job_title": job_title,
            "location": location,
            "country": country,
            "job_desc": job_description,
            "salary_est": salary_range,
            "avg_salary": salary_avg,
            "size": size,
            "industry": industry,
            "sector": sector,
            "revenue": revenue,
            "rating": rating,
            "recommend": recommend,
            "ceo": ceo,
            "benefits": benefits_rating,
            "opportunities": opportunities,
            "company_benefits": comp_benefits,
            "culture": culture,
            "management": management,
            "worklife": worklife,
            "pros": pros,
            "cons": cons,
            "num_listings": total_listings,
            "reviews_url": reviews_url,
            "benefits_url": benefits_url}


def _print_listing(record):
    """Print one scraped listing field by field (verbose/debug output)."""
    print("Company Name: {}".format(record["company_name"]))
    print("Job Title: {}".format(record["job_title"]))
    print("Location: {}".format(record["location"]))
    print("Country: {}".format(record["country"]))
    print("Job Description: {}".format(record["job_desc"][:500]))
    print("Salary Estimate: {}".format(record["salary_est"]))
    print("Avg Salary: {}".format(record["avg_salary"]))
    print("Size: {}".format(record["size"]))
    print("Industry: {}".format(record["industry"]))
    print("Sector: {}".format(record["sector"]))
    print("Revenue: {}".format(record["revenue"]))
    print("Rating: {}".format(record["rating"]))
    print("Recommend To Friend: {}".format(record["recommend"]))
    print("Approve of CEO: {}".format(record["ceo"]))
    print("Benefits Rating: {}".format(record["benefits"]))
    print("Career Opportunities: {}".format(record["opportunities"]))
    print("Comp & Benefits: {}".format(record["company_benefits"]))
    print("Culture & Values: {}".format(record["culture"]))
    print("Senior Managment: {}".format(record["management"]))
    print("Work Life Balance: {}".format(record["worklife"]))
    print("Pros: ", record["pros"])
    print("Cons: ", record["cons"])
    print("Number of Listings: {}".format(record["num_listings"]))
    print("Link to Company Reviews: {}".format(record["reviews_url"]))
    print("Linke to Company Benefits: {}".format(record["benefits_url"]))
    print("")
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
    print("")


def get_jobs(url, chrome_var=None, chrome_path=None, num_jobs=30, verbose=False, slp_time=5):
    """Scrape job listings from a search-results page into a DataFrame.

    Opens ``url`` in Chrome, clicks through the listings page by page, reads
    the detail pane of each one, and saves the result as
    ``../data/listings/ds_listings_<country>_<date>.csv`` via ``save_file``.

    Args:
        url: Search-results URL to scrape.
        chrome_var: Name of the environment variable holding the chromedriver
            path (forwarded to ``driver_setup``).
        chrome_path: Path to the chromedriver binary (forwarded to
            ``driver_setup``); used only when ``chrome_var`` is not given.
        num_jobs: Maximum number of listings to scrape. Lowered to the total
            shown on the page when that total is smaller.
        verbose: Print every scraped field and show the head of the result.
        slp_time: Seconds to wait for pages / detail panes to load.

    Returns:
        pandas.DataFrame with one row per scraped listing.
    """
    # Keyword arguments matter here: passed positionally, a chromedriver path
    # would land in driver_setup's ``chrome_var`` slot.
    if chrome_var:
        driver = driver_setup(url, chrome_var=chrome_var)
    elif chrome_path:
        driver = driver_setup(url, chrome_path=chrome_path)
    else:
        driver = driver_setup(url)

    jobs = []
    jobs_count = 0
    pbar = None

    try:
        time.sleep(slp_time)

        # URLs produced by typing a country into the search bar contain
        # "SRCH_IL" and carry the country as a slug; otherwise (or when the
        # slug cannot be parsed) read the country selector on the page.
        slug = re.search(r'Job\/(.*?)-data', url) if re.search(r"SRCH_IL", url) else None
        if slug:
            country = slug.group(1)
        else:
            country = _text_or_nan(driver, '//div[@class="css-m3gjah egu3u860"]/div[@class="selectedLabel"]')

        print("Country:", country)
        print("")

        filename = 'ds_listings_{}'.format(country)

        # The listing total is the first token of the jobs-count paragraph,
        # or of the H1 title when that paragraph is empty.
        try:
            count_tokens = driver.find_element(By.XPATH, "//p[@data-test='jobsCount']").text.split()
            if not count_tokens:
                count_tokens = driver.find_element(By.XPATH, "//h1[@data-test='jobCount-H1title']").text.split()
            total_listings = int(count_tokens[0].replace(",", ""))  # tolerate "4,120"
        except (NoSuchElementException, IndexError, ValueError):
            total_listings = np.nan

        # Comparison with NaN is False, so an unknown total leaves num_jobs as is.
        if num_jobs > total_listings:
            print("The number of jobs to be scrapped: {} exceeds the number of listings: {}".format(num_jobs, total_listings))
            num_jobs = total_listings
            print("The number of jobs has been updated to reflect the number of listings")
            print("")

        print("Total number of job listings: {}, number of jobs to be scraped: {}".format(total_listings, num_jobs))
        print("")

        pbar = tqdm(total=num_jobs)  # Init progress bar

        while jobs_count < num_jobs:

            time.sleep(slp_time + .1)

            for listing in driver.find_elements(By.CLASS_NAME, "react-job-listing"):

                if jobs_count >= num_jobs:
                    break

                _click_if_present(driver, "//div[@class='qual_x_close']")  # survey pop-up
                listing.click()
                time.sleep(2)
                _click_if_present(driver, "//div[@class='qual_x_close']")  # survey pop-up
                _click_if_present(driver, "//span[@alt='Close']")          # modal "X"

                record = _scrape_open_listing(driver, country, total_listings, slp_time)
                jobs.append(record)
                jobs_count = len(jobs)
                pbar.update(1)  # count only listings that were actually scraped

                if verbose:
                    _print_listing(record)
                else:
                    print("Scraped {} out of {} job listings".format(jobs_count, num_jobs), end='\r')

            if jobs_count >= num_jobs:
                print("Scraping completed, scraped {} of {} jobs".format(jobs_count, num_jobs))
                break

            # Move on to the next results page; stop when there is none.
            try:
                driver.find_element(By.XPATH, '//button[@data-test="pagination-next"]').click()
            except NoSuchElementException:
                print("Scraping completed, scraped {}, out of {} job listings.".format(jobs_count, num_jobs))
                break

        jobs = pd.DataFrame(jobs)

        if verbose:
            display(jobs.head())

        save_file(jobs, filename=filename, directory_path='../data/listings/')

        print("")

        return jobs

    finally:
        # Release the progress bar and the browser even when scraping fails.
        if pbar is not None:
            pbar.close()
        driver.quit()

In [17]:
# Name of the environment variable that holds the chromedriver path.
env_var = "CHROME_DRIVER"
# Glassdoor search-results page for data-science jobs in Australia.
url = 'https://www.glassdoor.com/Job/australia-data-science-jobs-SRCH_IL.0,9_IN16_KO10,22.htm'

# Scrape up to 500 listings; get_jobs also writes the result to a dated CSV.
df_aust = get_jobs(url=url, chrome_var=env_var, num_jobs=500)
df_aust

Country: australia

Total number of job listings: 4120, number of jobs to be scraped: 500



  0%|          | 0/500 [00:00<?, ?it/s]

Scraping completed, scraped 500 of 500 jobs
Saving File...
File Saved:  ds_listings_australia_2022-10-17.csv



Unnamed: 0,Company Name,Job Title,Location,Country,Job Description,Salary Estimate,Avg Salary,Size,Industry,Sector,...,Opportunities,Comp Benefits,Culture,Management,WorkLife Balance,Pros,Cons,Num Listings,Reviews URL,Benefits URL
0,Novecom,Data Scientist,Newcastle,australia,Novecom is seeking a person with a computer sc...,A$85K - A$135K (Glassdoor est.),,1 to 50 Employees,,,...,,,,,,,,4120,,
1,ING\n4.0,Data Scientist,Sydney,australia,At ING we put our Customers at the forefront o...,A$85K - A$135K (Glassdoor est.),,10000+ Employees,Investment & Asset Management,Financial Services,...,3.8,3.9,4.0,3.5,4.0,,,4120,https://www.glassdoor.com/Reviews/ING-Reviews-...,https://www.glassdoor.com/Benefits/ING-Benefit...
2,Children's Medical Research Institute,Data Scientist - Procan Cancer Data Science (CDS),Westmead,australia,Make an important contribution to the health o...,A$85K - A$135K (Glassdoor est.),,,,,...,,,,,,,,4120,,
3,Murdoch Childrens Research Institute\n4.1,Data Scientist,Melbourne,australia,12-month Contract in a research institute comm...,A$85K - A$135K (Glassdoor est.),,1001 to 5000 Employees,Nonprofit & NGO,$25 to $100 million (USD),...,3.7,4.0,4.2,3.3,3.5,,,4120,https://www.glassdoor.com/Reviews/Murdoch-Chil...,
4,Qantas Airways Limited\n3.8,Data Insights Analyst,Melbourne,australia,Working on challenging and exciting initiative...,A$85K - A$135K (Glassdoor est.),,10000+ Employees,"Airlines, Airports & Air Transportation",Transportation & Logistics,...,3.5,3.6,3.6,3.1,3.6,,,4120,https://www.glassdoor.com/Reviews/Qantas-Revie...,https://www.glassdoor.com/Benefits/Qantas-Bene...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,CSIRO\n3.8,Environmental Spatial Data Scientist,Townsville,australia,Acknowledgement of Country\nCSIRO acknowledges...,A$70K - A$110K (Employer est.),,5001 to 10000 Employees,National Agencies,Government & Public Administration,...,3.4,3.5,3.7,3.2,4.2,,,4120,https://www.glassdoor.com/Reviews/Commonwealth...,https://www.glassdoor.com/Benefits/Commonwealt...
496,Travers Bushfire & Ecology (TBE),GIS Officer (undergraduate degree in Environme...,Australia,australia,An exciting opportunity for an energetic and p...,A$70K - A$110K (Employer est.),,,,,...,,,,,,,,4120,,
497,MVP1 Ventures\n3.7,Head of Analytics and Data Science,Remote,australia,"Responsibilities\nSet up a data analytics, dat...",A$70K - A$110K (Employer est.),,1 to 50 Employees,,,...,3.8,3.8,3.8,3.6,3.8,,,4120,https://www.glassdoor.com/Reviews/MVP1-Venture...,
498,Australian Government Department of Defence\n3.8,APS 6 - Program Decision Support Analyst,Canberra,australia,"$85,607 - $97,790 (plus Super)\nFairbairn - AC...",A$70K - A$110K (Employer est.),,10000+ Employees,Government & Public Administration,$10+ billion (USD),...,3.9,3.7,3.6,3.2,3.6,,,4120,https://www.glassdoor.com/Reviews/Australian-G...,https://www.glassdoor.com/Benefits/Australian-...


In [19]:
# Name of the environment variable that holds the chromedriver path.
env_var = "CHROME_DRIVER"
# Glassdoor search-results page for data-science jobs in Uruguay.
url = 'https://www.glassdoor.com/Job/uruguay-data-science-jobs-SRCH_IL.0,7_IN246_KO8,20.htm'

# Request up to 500 listings; get_jobs lowers this to the number available.
df_UY = get_jobs(url=url, chrome_var=env_var, num_jobs=500)
df_UY

Country: uruguay

The number of jobs to be scrapped: 500 exceeds the number of listings: 98
The number of jobs has been updated to reflect the number of listings

Total number of job listings: 98, number of jobs to be scraped: 98



  0%|          | 0/98 [00:00<?, ?it/s]

Scraping completed, scraped 98 of 98 jobs
Saving File...
File Saved:  ds_listings_uruguay_2022-10-17.csv



Unnamed: 0,Company Name,Job Title,Location,Country,Job Description,Salary Estimate,Avg Salary,Size,Industry,Sector,...,Opportunities,Comp Benefits,Culture,Management,WorkLife Balance,Pros,Cons,Num Listings,Reviews URL,Benefits URL
0,Sabre\n4.0,Sr Data Analytics,Montevideo,uruguay,Job Family:\nBusiness Operations / Data\nSabre...,,,5001 to 10000 Employees,Enterprise Software & Network Solutions,Information Technology,...,3.6,3.7,3.9,3.5,4.0,,,98,https://www.glassdoor.com/Reviews/Sabre-Review...,https://www.glassdoor.com/Benefits/Sabre-Benef...
1,Roche\n4.3,GxP Quality Specialist,Montevideo,uruguay,The Position\nMission\nThe GxP Quality Special...,,,5001 to 10000 Employees,Biotech & Pharmaceuticals,Pharmaceutical & Biotechnology,...,3.9,4.1,4.3,3.7,4.1,,,98,https://www.glassdoor.com/Reviews/Roche-Review...,https://www.glassdoor.com/Benefits/Roche-Benef...
2,EPAM Systems\n4.3,Data Resource Manager,Uruguay,uruguay,The Senior Data Engineer is responsible for ov...,,,10000+ Employees,Information Technology Support Services,Information Technology,...,4.0,3.5,4.1,3.9,4.0,,,98,https://www.glassdoor.com/Reviews/EPAM-Systems...,https://www.glassdoor.com/Benefits/EPAM-System...
3,RELX\n3.9,Data Scientist II,Río Negro,uruguay,O profissional contratado para essa posição ir...,,,10000+ Employees,Information Technology Support Services,Information Technology,...,3.5,3.6,3.9,3.4,4.0,,,98,https://www.glassdoor.com/Reviews/RELX-Reviews...,https://www.glassdoor.com/Benefits/RELX-Benefi...
4,Búsquedas IT\n5.0,DATA ANALYST (TABLEAU) - REMOTO PARA USA,Montevideo,uruguay,ABIERTO\nConfidencial REF: #1311\n\nHeadquarte...,,,1 to 50 Employees,Research & Development,Management & Consulting,...,3.0,3.0,3.0,3.0,3.0,,,98,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,Canonical - Jobs\n3.5,MAAS Hardware Lab Engineer (Greater Boston Area),Montevideo,uruguay,This is a Python software engineering opportun...,,,501 to 1000 Employees,Computer Hardware Development,Information Technology,...,3.7,3.6,3.4,3.0,3.6,,,98,https://www.glassdoor.com/Reviews/Canonical-Re...,https://www.glassdoor.com/Benefits/Canonical-B...
94,EPAM Systems\n4.3,"Lead DevOps Engineer, GCP - Remote",Uruguay,uruguay,We are hiring a Cloud DevOps & Infrastructure ...,,,10000+ Employees,Information Technology Support Services,Information Technology,...,4.0,3.5,4.1,3.9,4.0,,,98,https://www.glassdoor.com/Reviews/EPAM-Systems...,https://www.glassdoor.com/Benefits/EPAM-System...
95,Prometeo Talent,️ DevOps Engineer | Top U.S technology company...,Montevideo,uruguay,Nowsta is a venture-backed technology company ...,,,Unknown,,,...,,,,,,,,98,,
96,Canonical - Jobs\n3.5,Software Engineer - MAAS,Montevideo,uruguay,This is an exciting opportunity for a software...,,,501 to 1000 Employees,Computer Hardware Development,Information Technology,...,3.7,3.6,3.4,3.0,3.6,,,98,https://www.glassdoor.com/Reviews/Canonical-Re...,https://www.glassdoor.com/Benefits/Canonical-B...


### Get Company Reviews

In [8]:
def get_reviews(url, email, password, chrome_var=None, chrome_path=None, verbose=False, slp_time=5):
    """Scrape a company's overall and per-demographic ratings from its reviews page.

    Opens ``url`` in Chrome, signs in when the login prompt is shown, reads
    the company-level figures, then steps through every option of the
    demographics dropdown and records the rating of each demographic group.
    The result is saved as ``../data/reviews/<company>_reviews_<date>.csv``
    via ``save_file``.

    Args:
        url: Company reviews URL.
        email: Forwarded to ``sign_in``.
        password: Forwarded to ``sign_in``.
        chrome_var: Name of the environment variable holding the chromedriver
            path (forwarded to ``driver_setup``).
        chrome_path: Path to the chromedriver binary (forwarded to
            ``driver_setup``); used only when ``chrome_var`` is not given.
        verbose: Print the scraped values and show the head of the result.
        slp_time: Seconds to wait after dismissing pop-ups.

    Returns:
        pandas.DataFrame with one row per (category, demographic) pair. When
        the demographics section cannot be read and no row was collected, a
        single row carrying the company-level figures and NaN demographic
        fields is returned instead.

    Raises:
        NoSuchElementException: The company name is not present on the page.
    """
    # Keyword arguments matter here: passed positionally, a chromedriver path
    # would land in driver_setup's ``chrome_var`` slot.
    if chrome_var:
        driver = driver_setup(url, chrome_var=chrome_var)
    elif chrome_path:
        driver = driver_setup(url, chrome_path=chrome_path)
    else:
        driver = driver_setup(url)

    reviews = []

    try:
        time.sleep(10)

        # Sign in only when the inline login container is displayed.
        # NOTE(review): sign_in is called without this function's driver;
        # verify that it operates on this browser session.
        try:
            driver.find_element(By.XPATH, "//div[@class='gdUserLogin authInlineContainer gdGrid bg-white']")
        except NoSuchElementException:
            pass
        else:
            sign_in(email, password)

        # The survey pop-up is checked for twice, with a pause after each attempt.
        for _ in range(2):
            try:
                driver.find_element(By.XPATH, "//div[@class='qual_x_close']").click()
            except NoSuchElementException:
                pass
            time.sleep(slp_time)

        company_name = driver.find_element(By.XPATH, "//p[@class='employerName mt-xxsm mb-0 mx-0']").text.strip()

        try:
            overall_rating = driver.find_element(By.XPATH, "//div[@id='EmpStats']/div/div[1]/div/div/div").text
        except NoSuchElementException:
            overall_rating = np.nan

        try:
            recommend = driver.find_element(By.ID, "EmpStats_Recommend").text.strip()
        except NoSuchElementException:
            recommend = np.nan

        try:
            ceo = driver.find_element(By.ID, "EmpStats_Approve").text.strip()
        except NoSuchElementException:
            ceo = np.nan

        if verbose:
            print("Company:", company_name)
            print("Overall Rating:", overall_rating)
            print("")

        try:
            driver.find_element(By.XPATH, "//div[@class='d-flex flex-wrap demographicOptions']")

            # Force the <select> to display so its options can be selected.
            dropdown = driver.find_element(By.XPATH, "//div[@class='mb-xsm e1hgsnla1 css-wcay7z ew8s0qn0']/select")
            driver.execute_script("arguments[0].style.display = 'block';", dropdown)
            time.sleep(2)

            categories = [opt.get_attribute('value') for opt in dropdown.find_elements(By.XPATH, "option")]
            selector = Select(dropdown)

            for option in categories:

                time.sleep(.1)
                selector.select_by_value(option)  # get ratings based on demographics
                time.sleep(3)

                demographics = driver.find_element(By.XPATH, "//div[@class='d-flex flex-wrap demographicOptions']")

                if verbose:
                    print(option)
                    print('-----------------------------------------------------------')
                    print("")

                for demographic in demographics.find_elements(By.XPATH, "button"):
                    demo = demographic.find_element(By.XPATH, "div").text.strip()
                    rating = demographic.find_element(By.XPATH, ".//div[@class='d-flex align-items-center overallRating']/h3").text.strip()

                    if verbose:
                        print(demo, ":", rating)

                    reviews.append({
                        "company": company_name,
                        "company_rating": overall_rating,
                        "recommend": recommend,
                        "ceo": ceo,
                        "category": option,
                        "demographic": demo,
                        "rating": rating})

                if verbose:
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
                    print("")

        except NoSuchElementException:
            # Demographic data is unavailable. Keep the company-level figures
            # already scraped (and therefore a meaningful output file name),
            # and add a placeholder row only if nothing was collected.
            if not reviews:
                reviews.append({"company": company_name,
                                "company_rating": overall_rating,
                                "recommend": recommend,
                                "ceo": ceo,
                                "category": np.nan,
                                "demographic": np.nan,
                                "rating": np.nan})

        reviews = pd.DataFrame(reviews)

        if verbose:
            display(reviews.head())

        save_file(reviews, filename='{}_reviews'.format(company_name), directory_path='../data/reviews/')
        print("")

        return reviews

    finally:
        # Release the browser even when scraping fails part-way.
        driver.quit()

In [27]:
# Alternative review pages to try (other companies and regional Glassdoor domains):
#url = 'https://www.glassdoor.com/Reviews/Univision-Reviews-E6046.htm'
#url = 'https://www.glassdoor.ca/Reviews/Indeed-Reviews-E100561.htm'
#url = 'https://www.glassdoor.com.ar/Evaluaciones/Chase-Evaluaciones-E690765.htm'

# Names of the environment variables holding the Glassdoor credentials
# (sign_in resolves them through os.environ when a .env file is found).
email = "GLASSDOOR_EMAIL"
password = "GLASSDOOR_PASSWORD"
url = 'https://www.glassdoor.com/Reviews/Salesforce-Reviews-E11159.htm'

# verbose=True prints each demographic rating as it is scraped.
reviews_df = get_reviews(url, email, password, verbose=True)
#reviews_df


Scrapping Reviews for Company Name: Salesforce
Salesforce Company Rating: 4.4

raceEthnicity
-----------------------------------------------------------

Asian (74) : 4.3
Black or African American (22) : 3.7
Hispanic or Latinx (23) : 4.0
Indigenous American or Alaska Native (5) : 3.8
Middle Eastern (9) : 4.9
White (151) : 4.3
Native Hawaiian or Other Pacific Islander : —
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

gender
-----------------------------------------------------------

Men (195) : 4.4
Transgender and/or Non-Binary (5) : 3.2
Women (128) : 4.1
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

sexualOrientation
-----------------------------------------------------------

Heterosexual (222) : 4.2
LGBTQ+ (24) : 4.2
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

disability
-----------------------------------------------------------

Non-disabled (224) : 4.3
People with Disabilities (22) : 4.1
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

Unnamed: 0,company,company_rating,recommend,ceo,category,demographic,demographic_rating
0,Salesforce,4.4,90%,95%,raceEthnicity,Asian (74),4.3
1,Salesforce,4.4,90%,95%,raceEthnicity,Black or African American (22),3.7
2,Salesforce,4.4,90%,95%,raceEthnicity,Hispanic or Latinx (23),4.0
3,Salesforce,4.4,90%,95%,raceEthnicity,Indigenous American or Alaska Native (5),3.8
4,Salesforce,4.4,90%,95%,raceEthnicity,Middle Eastern (9),4.9





### Get Job Benefits

In [9]:
def _click_close_button(driver):
    """Click the ``qual_x_close`` element if the page currently shows one."""
    try:
        driver.find_element(By.XPATH, "//div[@class='qual_x_close']").click()
    except NoSuchElementException:
        pass  # nothing to close


def _rating_from_text(text):
    """Return ``text`` as a float, or ``np.nan`` when it is not a number."""
    try:
        return float(text.strip())
    except ValueError:
        return np.nan


def get_benefits(url, email, password, chrome_var=None, chrome_path=None, verbose=False, slp_time=5):
    """Scrape the per-benefit ratings from a company benefits page.

    Opens ``url`` through ``driver_setup``, signs in if the login container is
    present, then clicks through every benefits tab and records one row per
    benefit. The result is saved with ``save_file`` under
    ``../data/benefits/`` and the browser window is closed before returning,
    including when an exception is raised part-way through.

    Parameters
    ----------
    url : str
        Address of the benefits page to open.
    email, password
        Forwarded unchanged to ``sign_in`` when the login container is found.
    chrome_var, chrome_path : optional
        Whichever is truthy (``chrome_var`` takes precedence) is forwarded as
        the second positional argument of ``driver_setup``; with neither set,
        ``driver_setup(url)`` is used.
    verbose : bool
        Print progress and display the head of the resulting DataFrame.
    slp_time : int or float
        Seconds to sleep after each of the two initial close-button attempts.

    Returns
    -------
    pandas.DataFrame
        Columns ``company``, ``company_rating``, ``category``, ``benefit`` and
        ``rating``. When the benefits container is not on the page, a single
        all-NaN placeholder row is returned instead.
    """
    if chrome_var:
        driver = driver_setup(url, chrome_var)
    elif chrome_path:
        driver = driver_setup(url, chrome_path)
    else:
        driver = driver_setup(url)

    benefits = []

    # try/finally so the browser window is closed even if scraping or saving fails.
    try:
        time.sleep(10)

        try:
            driver.find_element(By.XPATH, "//div[@class='gdUserLogin authInlineContainer gdGrid bg-white']")
            sign_in(email, password)
        except NoSuchElementException:
            pass  # no login container on the page

        # The close button is attempted twice, with a pause after each attempt.
        _click_close_button(driver)
        time.sleep(slp_time)
        _click_close_button(driver)
        time.sleep(slp_time)

        try:
            driver.find_element(By.XPATH, "//div[@class='css-1r7hsi3 e1mvlfgj2']")
        except NoSuchElementException:
            # Benefits container missing: emit one placeholder row. Every
            # field, including the category, is NaN because no tab was read.
            company_name = np.nan
            benefits.append({
                "company": company_name,
                "company_rating": np.nan,
                "category": np.nan,
                "benefit": np.nan,
                "rating": np.nan})
        else:
            try:
                company_name = driver.find_element(By.ID, "DivisionsDropdownComponent").text.strip()
            except NoSuchElementException:
                company_name = 'Company Not Found'

            try:
                overall_text = driver.find_element(By.XPATH, "//strong[@class='css-b63kyi css-16iqw5x']").text
            except NoSuchElementException:
                overall_rating = np.nan
            else:
                overall_rating = _rating_from_text(overall_text)

            if verbose:
                print("Company:", company_name)
                print("Overall Rating:", overall_rating)
                print("")

            try:
                tabs = driver.find_elements(By.XPATH, '//span[@class="css-1lydi83 e1dy3s0i0"]')
                elements = driver.find_elements(By.XPATH, '//div[@class="css-1sapift eqy0zfp0"]')

                # find_elements returns an empty list instead of raising, so
                # the "not found" case has to be reported explicitly.
                if not tabs or not elements:
                    print("Error: elements not found")

                for tab, element in zip(tabs, elements):

                    tab_name = tab.text.strip()

                    tab.click()
                    time.sleep(2)

                    reviews = element.find_elements(By.XPATH, "div")

                    if verbose:
                        print(tab_name, "| Num Benefits:", len(reviews))
                        print('-----------------------------------------------------------')
                        print("")

                    time.sleep(2)

                    _click_close_button(driver)

                    for review in reviews:

                        time.sleep(2)

                        try:
                            benefit = review.find_element(By.XPATH, "div").text.strip()
                        except NoSuchElementException:
                            benefit = np.nan

                        try:
                            # The leading '.' scopes the XPath to this review element.
                            rating_text = review.find_element(By.XPATH, './/span[@class="mr-xxsm strong css-1p6dnxi ecvyovn3"]').text
                        except NoSuchElementException:
                            rating = np.nan
                        else:
                            rating = _rating_from_text(rating_text)

                        benefits.append({
                            "company": company_name,
                            "company_rating": overall_rating,
                            "category": tab_name,
                            "benefit": benefit,
                            "rating": rating})

                        if verbose:
                            print(benefit, ':', rating)

                    if verbose:
                        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
                        print("")

            except NoSuchElementException:
                print("Error: elements not found")

        benefits = pd.DataFrame(benefits)

        if verbose:
            display(benefits.head())

        save_file(benefits, filename='{}_benefits'.format(company_name), directory_path='../data/benefits/')
        print("")

        return benefits
    finally:
        driver.close()


In [37]:
# Example run of get_benefits against a single company benefits page.
# NOTE(review): env_var is assigned here but not passed to get_benefits below
# (no chrome_var= argument), so the call goes through driver_setup(url).
env_var = "CHROME_DRIVER"
# NOTE(review): literal placeholder strings, not credentials -- they look like
# environment-variable names; confirm how sign_in resolves them.
email = "GLASSDOOR_EMAIL"
password = "GLASSDOOR_PASSWORD"
url = 'https://www.glassdoor.com/Benefits/The-Home-Depot-US-Benefits-EI_IE655.0,14_IL.15,17_IN1.htm'
# Alternative benefits page; uncomment to use it instead.
#url = 'https://www.glassdoor.com/Benefits/Comcentric-US-Benefits-EI_IE719287.0,10_IL.11,13_IN1.htm'

# verbose=True prints each tab/benefit as it is read and displays the head of the result.
benefits_df = get_benefits(url, email, password, verbose=True)


Company: The Home Depot
Overall Rating: 3.7

Insurance, Health & Wellness | Num Reviews: 13
-----------------------------------------------------------

Health Insurance : 3.2
Vision Insurance : 3.5
Dental Insurance : 3.2
Life Insurance : 3.3
Accidental Death & Dismemberment Insurance : 3.3
Disability Insurance : 3.1
Supplemental Life Insurance : 3.2
Health Savings Account (HSA) : 3.2
Mental Health Care : 3.6
Occupational Accident Insurance : 3.1
Flexible Spending Account (FSA) : 4.0
Retiree Health & Medical : 5.0
Health Care On-Site : 5.0
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

Financial & Retirement | Num Reviews: 9
-----------------------------------------------------------

401K Plan : 4.0
Employee Stock Purchase Plan : 3.6
Retirement Plan : 3.8
Stock Options : 3.6
Performance Bonus : 3.3
Charitable Gift Matching : 3.7
Supplemental Workers' Compensation : 4.4
Pension Plan : 2.0
Equity Incentive Plan : 4.0
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

Unnamed: 0,company,company_rating,category,benefit,rating
0,The Home Depot,3.7,"Insurance, Health & Wellness",Health Insurance,3.2
1,The Home Depot,3.7,"Insurance, Health & Wellness",Vision Insurance,3.5
2,The Home Depot,3.7,"Insurance, Health & Wellness",Dental Insurance,3.2
3,The Home Depot,3.7,"Insurance, Health & Wellness",Life Insurance,3.3
4,The Home Depot,3.7,"Insurance, Health & Wellness",Accidental Death & Dismemberment Insurance,3.3


Creating Directory...
Saving File...
File Saved: The Home Depot_benefits_2022-11-17.csv
Saved In: ../data/benefits/The Home Depot_benefits_2022-11-17.csv



Unnamed: 0,company,company_rating,category,benefit,rating
0,The Home Depot,3.7,"Insurance, Health & Wellness",Health Insurance,3.2
1,The Home Depot,3.7,"Insurance, Health & Wellness",Vision Insurance,3.5
2,The Home Depot,3.7,"Insurance, Health & Wellness",Dental Insurance,3.2
3,The Home Depot,3.7,"Insurance, Health & Wellness",Life Insurance,3.3
4,The Home Depot,3.7,"Insurance, Health & Wellness",Accidental Death & Dismemberment Insurance,3.3
5,The Home Depot,3.7,"Insurance, Health & Wellness",Disability Insurance,3.1
6,The Home Depot,3.7,"Insurance, Health & Wellness",Supplemental Life Insurance,3.2
7,The Home Depot,3.7,"Insurance, Health & Wellness",Health Savings Account (HSA),3.2
8,The Home Depot,3.7,"Insurance, Health & Wellness",Mental Health Care,3.6
9,The Home Depot,3.7,"Insurance, Health & Wellness",Occupational Accident Insurance,3.1


### Create and Save Final Dataset

In [20]:
# Collect every per-run CSV directly under ../data/.
# 'final.csv' is written into this same directory, so it is excluded here;
# otherwise re-running this cell would read the previous combined output back
# in and duplicate every row. Sorting makes the row order reproducible, since
# glob returns files in arbitrary order.
all_files = sorted(
    path
    for path in glob.glob(os.path.join('../data/', "*.csv"))
    if os.path.basename(path) != 'final.csv'
)

# Stack all files into one DataFrame with a fresh 0..n-1 index.
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
df

Unnamed: 0,Company Name,Job Title,Location,Country,Job Description,Salary Estimate,Avg Salary,Size,Industry,Sector,...,Opportunities,Comp Benefits,Culture,Management,WorkLife Balance,Pros,Cons,Num Listings,Reviews URL,Benefits URL
0,"Lima Consulting Group, LLC\n4.3",Data Scientist (Brasil: Remoto),Trabalho remoto,Brasil,"Somos a Lima Consulting, apaixonados por dados...",,,De 1 a 50 funcionários,Menos de US$ 1 milhão,,...,4.0,3.8,4.4,4.2,4.2,,,73,,https://www.glassdoor.com.br/Benefits/Lima-Con...
1,Streetbees\n3.4,Data Scientist,Trabalho remoto,Brasil,Streetbees is a market intelligence platform t...,,,De 51 a 200 funcionários,Pesquisa e desenvolvimento,Gerenciamento e consultoria,...,3.5,3.4,3.5,3.1,3.3,,,73,,
2,4flow\n4.2,Junior Data Scientist,Campinas,Brasil,What can you expect from us?\nAs part of the 4...,,,De 1.001 a 5.000 funcionários,Consultoria empresarial,Gerenciamento e consultoria,...,4.1,3.5,4.4,4.1,4.0,,,73,https://www.glassdoor.com.br/Avalia%C3%A7%C3%B...,https://www.glassdoor.com.br/Benefits/4flow-Be...
3,HP\n4.2,Machine Learning Data Scientist,Porto Alegre,Brasil,"At HP, we believe in the power of ideas. Our v...",,,Mais de 10.000 funcionários,Desenvolvimento de hardware,Tecnologia da informação,...,3.8,3.7,4.4,3.9,4.2,,,73,https://www.glassdoor.com.br/Avalia%C3%A7%C3%B...,https://www.glassdoor.com.br/Benefits/HP-Inc-B...
4,Samsung Electronics\n3.6,Data Science Intern,Campinas,Brasil,Position Summary\nSRBR AI team is looking for ...,,,De 1.001 a 5.000 funcionários,Desenvolvimento de hardware,Tecnologia da informação,...,3.3,3.8,3.1,3.0,2.9,,,73,https://www.glassdoor.com.br/Avalia%C3%A7%C3%B...,https://www.glassdoor.com.br/Benefits/Samsung-...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8200,Worley\n3.6,Senior Inside Sales Coordinator,South Africa,south-africa,Company : Worley\nPrimary Location\n: South Af...,,,10000+ Employees,Energy & Utilities,"Energy, Mining & Utilities",...,3.3,3.2,3.5,3.1,3.5,,,145,https://www.glassdoor.com/Reviews/Worley-Revie...,https://www.glassdoor.com/Benefits/Worley-Bene...
8201,RGA\n4.1,In-Force Management Actuary,Cape Town,south-africa,Position Overview\n\nRGA is a multinational re...,,,1001 to 5000 Employees,Insurance Carriers,Insurance,...,3.7,4.0,4.0,3.7,3.8,,,145,https://www.glassdoor.com/Reviews/Reinsurance-...,https://www.glassdoor.com/Benefits/Reinsurance...
8202,Zeal HR\n5.0,Solution Support Engineer (Aviation),Cape Town,south-africa,Solution Support Engineer (Aviation)\n\nIntrod...,,,1 to 50 Employees,,,...,5.0,5.0,5.0,5.0,5.0,,,145,https://www.glassdoor.com/Reviews/Zeal-HR-Revi...,
8203,Salt Recruitment\n4.4,Customer Engagement Manager,Cape Town,south-africa,Customer Engagement Manager\nThe purpose of th...,,,201 to 500 Employees,HR Consulting,Human Resources & Staffing,...,4.3,4.1,4.4,4.3,4.3,,,145,https://www.glassdoor.com/Reviews/Salt-Digital...,https://www.glassdoor.com/Benefits/Salt-Digita...


In [21]:
# Write the combined DataFrame to disk; index=False leaves the row index out of the CSV.
df.to_csv('../data/final.csv', index=False)

In [18]:
# Glassdoor job-search result pages to scrape, one entry per market/search.
urls = [
    # glassdoor.com searches without a country-specific domain
    "https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime&remoteWorkType=1&sortBy=date_desc",
    "https://www.glassdoor.com/Job/australia-data-science-jobs-SRCH_IL.0,9_IN16_KO10,22.htm",
    # country- and language-specific Glassdoor domains
    "https://www.glassdoor.com.ar/Empleo/data-scientist-empleos-SRCH_KO0,14.htm?jobType=fulltime",
    "https://nl.glassdoor.be/Vacature/data-scientist-vacatures-SRCH_KO0,14.htm?jobType=fulltime",
    "https://fr.glassdoor.be/Emploi/data-scientist-emplois-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.com.br/Vaga/data-scientist-vagas-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.ca/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime",
    "https://fr.glassdoor.ca/Emploi/data-scientist-emplois-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.de/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.es/Empleo/data-scientist-empleos-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.fr/Emploi/data-scientist-emplois-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.com.hk/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.co.in/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.ie/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.it/Lavoro/data-scientist-lavori-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.com.mx/Empleo/data-scientist-empleos-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.nl/Vacature/data-scientist-vacatures-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.co.nz/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.at/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime",
    "https://de.glassdoor.ch/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.sg/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime",
    "https://fr.glassdoor.ch/Emploi/data-scientist-emplois-SRCH_KO0,14.htm?jobType=fulltime",
    "https://www.glassdoor.co.uk/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime",
    # glassdoor.com searches with the country encoded in the URL path
    "https://www.glassdoor.com/Job/south-africa-data-scientist-jobs-SRCH_IL.0,12_IN211_KO13,27.htm?jobType=fulltime",
    "https://www.glassdoor.com/Job/uruguay-data-science-jobs-SRCH_IL.0,7_IN246_KO8,20.htm",
    "https://www.glassdoor.com/Job/mexico-data-scientist-jobs-SRCH_IL.0,6_IN169_KO7,21.htm?jobType=fulltime",
    "https://www.glassdoor.com/Job/costa-rica-data-scientist-jobs-SRCH_IL.0,10_IN57_KO11,25.htm?jobType=fulltime",
    "https://www.glassdoor.com/Job/chile-data-scientist-jobs-SRCH_IL.0,5_IN49_KO6,20.htm?jobType=fulltime",
    "https://www.glassdoor.com/Job/ecuador-data-scientist-jobs-SRCH_IL.0,7_IN68_KO8,22.htm",
    "https://www.glassdoor.com/Job/nigeria-data-scientist-jobs-SRCH_IL.0,7_IN177_KO8,22.htm",
    "https://www.glassdoor.com/Job/egypt-data-scientist-jobs-SRCH_IL.0,5_IN69_KO6,20.htm",
    "https://www.glassdoor.com/Job/japan-data-scientist-jobs-SRCH_IL.0,5_IN123_KO6,20.htm",
    "https://www.glassdoor.com/Job/china-data-scientist-jobs-SRCH_IL.0,5_IN48_KO6,20.htm?jobType=fulltime",
    "https://www.glassdoor.com/Job/south-korea-data-scientist-jobs-SRCH_IL.0,11_IN135_KO12,26.htm?jobType=fulltime",
]
