In [1]:
# Install packages

#import sys
#!{sys.executable} -m pip install selenium
#!{sys.executable} -m pip install webdriver_manager

In [2]:
import re
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# Resolve a ChromeDriver binary through webdriver_manager and wrap it in a Service.
# get_jobs() reuses this module-level `service` for every browser it launches.
service = Service(executable_path=ChromeDriverManager().install())

In [13]:
_NOT_FOUND = -1  # Sentinel stored whenever a field is missing from the page.

# Glassdoor location query fragments, keyed by the lower-cased `location` argument.
# The scraper only supports these because the search URL needs a location id.
# NOTE(review): the US fragment uses `cId` where the Canada one uses `locId` -- confirm this is intentional.
_LOCATION_QUERIES = {
    'canada': "&locT=N&locId=3&locName=Canada",
    'us': "&locT=N&cId=1&locName=United%20States",
    'usa': "&locT=N&cId=1&locName=United%20States",
}

# XPaths of the header fields shown in the opened job posting, in lookup order.
_POSTING_XPATHS = {
    "company_name": './/div[@data-test="employerName"]',
    "division": './/div[@class="division"]',
    "location": './/div[@data-test="location"]',
    "job_title": './/div[@data-test="jobTitle"]',
    "job_description": './/div[@class="jobDescriptionContent desc"]',
}

# Sub-heading labels looked up inside the company overview section.
_COMPANY_LABELS = ("Size", "Founded", "Type", "Industry", "Sector", "Revenue")


def _text_or_default(root, xpath, default=_NOT_FOUND):
    '''Return the text of the first element under `root` matching `xpath`, or `default` if none exists.'''
    try:
        return root.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return default


def _to_number(value, cast, default=_NOT_FOUND):
    '''Apply `cast` (e.g. int or float) to a scraped value; return `default` if it is not numeric.'''
    try:
        return cast(value)
    except (TypeError, ValueError):
        return default


def _read_posting_header(driver, max_attempts=3, retry_delay=5):
    '''Read the header fields of the currently opened posting, retrying a bounded number of times.

    Missing elements map to the not-found sentinel. Any other failure during a read
    triggers a retry after `retry_delay` seconds; after `max_attempts` failures every
    field is reported as not found instead of looping forever.
    '''
    for attempt in range(max_attempts):
        try:
            return {name: _text_or_default(driver, xpath) for name, xpath in _POSTING_XPATHS.items()}
        except Exception:  # Not a bare except: Ctrl-C must still interrupt the scrape.
            if attempt + 1 < max_attempts:
                time.sleep(retry_delay)
    print("Could not read job posting after {} attempts; recording missing values.".format(max_attempts))
    return dict.fromkeys(_POSTING_XPATHS, _NOT_FOUND)


def _read_company_overview(driver):
    '''Return {label: text} for the company overview fields; all not-found if the section is absent.'''
    try:
        driver.find_element(By.XPATH, './/div[@id="CompanyContainer"]')
    except NoSuchElementException:
        # Rarely, some job postings do not have the "Company" tab.
        return dict.fromkeys(_COMPANY_LABELS, _NOT_FOUND)

    field_xpath = './/*[@id="EmpBasicInfo"]//div[span/text()="{}"]/span[2]'
    return {label: _text_or_default(driver, field_xpath.format(label)) for label in _COMPANY_LABELS}


def get_jobs(keyword, location, num_jobs, verbose, slp_time):
    '''Gathers jobs as a dataframe, scraped from Glassdoor.

    Parameters:
        keyword: free-text search term; it is URL-encoded before being placed in the query.
        location: 'canada', 'us' or 'usa' (case-insensitive). Anything else prints a
            message and returns an empty DataFrame without opening a browser.
        num_jobs: stop once this many postings have been collected.
        verbose: if true, print every scraped field for debugging.
        slp_time: seconds to sleep after each page load; tune to your connection speed.

    Returns:
        pandas.DataFrame with one row per posting. Fields that could not be found on
        the page hold -1.

    Uses the module-level `service` (ChromeDriver service) to launch Chrome, and always
    quits the browser before returning, including when an exception propagates.
    '''
    jobs = []

    # Validate the location first so an invalid value never launches (and leaks) a browser.
    loc = _LOCATION_QUERIES.get(location.lower())
    if loc is None:
        print("Invalid Location: Canada or US only")
        return pd.DataFrame(jobs)

    #Initializing the webdriver
    options = webdriver.ChromeOptions()

    #Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    #options.add_argument('headless')

    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.set_window_size(1120, 1000)

        # Encode the keyword so characters such as '&', '+' or '#' cannot break the query string.
        encoded_keyword = quote(keyword, safe='')
        url = ("https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword="
               + encoded_keyword + "&sc.keyword=" + encoded_keyword + loc + "&jobType=")
        driver.get(url)

        #If true, should be still looking for new jobs.
        while len(jobs) < num_jobs:

            #Let the page load. Change this number based on your internet speed.
            time.sleep(slp_time)

            #Test for the "Sign Up" prompt and get rid of it.
            try:
                driver.find_element(By.CLASS_NAME, "selected").click()
            except (NoSuchElementException, ElementClickInterceptedException):
                pass

            time.sleep(1)

            try:
                driver.find_element(By.CSS_SELECTOR, '[alt="Close"]').click() #clicking to the X.
                print(' x out worked')
            except NoSuchElementException:
                print(' x out failed')

            #Gather all the job node elements from the left column of Glassdoor.
            #These are the buttons we're going to click.
            job_buttons = driver.find_elements(By.XPATH, "//article[@id='MainCol']//ul/li[starts-with(@class, 'react-job-listing')]")

            for job_button in job_buttons:

                print("Progress: {}/{}".format(len(jobs), num_jobs))
                if len(jobs) >= num_jobs:
                    break

                #Click on each job node element
                webdriver.ActionChains(driver).move_to_element(job_button).click(job_button).perform()
                time.sleep(1)

                #Gather the basic information from the loaded job posting main page
                header = _read_posting_header(driver)
                company_name = header["company_name"]
                division = header["division"]
                job_location = header["location"]  # Kept separate from the `location` argument.
                job_title = header["job_title"]
                job_description = header["job_description"]

                #Glassdoor salary estimate (read from the listing entry itself)
                salary_estimate = _text_or_default(job_button, './/span[@data-test="detailSalary"]')
                #Glassdoor employer review rating score
                rating = _text_or_default(driver, './/div[@id="employerStats"]/div[1]/div')

                #Printing for debugging
                if verbose:
                    print("Job Title: {}".format(job_title))
                    print("Salary Estimate: {}".format(salary_estimate))
                    # str() because the description may be the -1 sentinel, which cannot be sliced.
                    print("Job Description: {}".format(str(job_description)[:500]))
                    print("Rating: {}".format(rating))
                    print("Company Name: {}".format(company_name))
                    print("Division: {}".format(division))
                    print("Location: {}".format(job_location))

                #Gather additional information about the company from Glassdoor company overview section
                company = _read_company_overview(driver)
                size = company["Size"]
                founded = company["Founded"]
                type_of_ownership = company["Type"]
                industry = company["Industry"]
                sector = company["Sector"]
                revenue = company["Revenue"]

                if verbose:
                    print("Size: {}".format(size))
                    print("Founded: {}".format(founded))
                    print("Type of Ownership: {}".format(type_of_ownership))
                    print("Industry: {}".format(industry))
                    print("Sector: {}".format(sector))
                    print("Revenue: {}".format(revenue))
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                #add job to jobs
                jobs.append({"Job Title" : job_title,
                "Salary Estimate" : salary_estimate,
                "Job Description" : job_description,
                # Scraped text may be empty or non-numeric; fall back to the sentinel instead of raising.
                "Rating" : _to_number(rating, float, default=-1.0),
                "Company Name" : company_name,
                "Division" : division,
                "Location" : job_location,
                "Size" : size,
                "Founded" : _to_number(founded, int),
                "Type of ownership" : type_of_ownership,
                "Industry" : industry,
                "Sector" : sector,
                "Revenue" : revenue})

            # Target reached: no need to load another results page.
            if len(jobs) >= num_jobs:
                break

            #Clicking on the "next page" button
            try:
                driver.find_element(By.XPATH, "//article[@id='MainCol']//button[starts-with(@class, 'nextButton')]").click()
            except NoSuchElementException:
                print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break
    finally:
        # Always release the browser, even if scraping raised part-way through.
        driver.quit()

    return pd.DataFrame(jobs)  #This line converts the list of dictionaries into a pandas DataFrame.

In [11]:
# Note: the salary handling could keep the entire estimated range rather than an average.
# Company overview sections do not always list the same fields (see the company-overview
# lookups in get_jobs). If the fields are located by div index, a value can end up in the
# wrong column; locating each field by its sub-heading label avoids this.

# https://stackoverflow.com/questions/3655549/xpath-containstext-some-string-doesnt-work-when-used-with-node-with-more
# //*[contains(text(),'ABC')]

# try:
#    driver.findElement.textcontains('industry')
#    try:
#        industry = driver.findElement(industry).text
#    except NoSuchElementException:
#        industry = None
# except NoSuchElementException:
#     industry = None

In [6]:
# Run the scraper and persist the result to CSV.
# Set `location` to either "canada" or "us".
df = get_jobs(
    keyword='data scientist',
    location='canada',
    num_jobs=35,
    verbose=False,
    slp_time=15,
)
#df = get_jobs('data scientist','canada',1000, False, 15)

df.to_csv('glassdoor_jobs.csv', index=False)

 x out failed
Progress: 0/150
Progress: 1/150
Progress: 2/150
Progress: 3/150
Progress: 4/150
Progress: 5/150
Progress: 6/150
Progress: 7/150
Progress: 8/150
Progress: 9/150
Progress: 10/150
Progress: 11/150
Progress: 12/150
Progress: 13/150
Progress: 14/150
Progress: 15/150
Progress: 16/150
Progress: 17/150
Progress: 18/150
Progress: 19/150
Progress: 20/150
Progress: 21/150
Progress: 22/150
Progress: 23/150
Progress: 24/150
Progress: 25/150
Progress: 26/150
Progress: 27/150
Progress: 28/150
Progress: 29/150
 x out failed
Progress: 30/150
Progress: 31/150
Progress: 32/150
Progress: 33/150
Progress: 34/150
Progress: 35/150
Progress: 36/150
Progress: 37/150
Progress: 38/150
Progress: 39/150
Progress: 40/150
Progress: 41/150
Progress: 42/150
Progress: 43/150
Progress: 44/150
Progress: 45/150
Progress: 46/150
Progress: 47/150
Progress: 48/150
Progress: 49/150
Progress: 50/150
Progress: 51/150
Progress: 52/150
Progress: 53/150
Progress: 54/150
Progress: 55/150
Progress: 56/150
Progress: 57/

In [7]:
# Preview the scraped postings (up to the first 50 rows).
df.head(50)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Division,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Junior Data Scientist,CA$74K - CA$90K (Glassdoor est.),We are looking for our next Junior Data Scient...,3.9,M&M Food Market\n3.9,,Mississauga,1001 to 5000 Employees,,Company - Private,Animal Production,Agriculture,$25 to $100 million (USD)
1,Junior Data Scientist,CA$11K (Employer est.),We are looking for our next Junior Data Scient...,3.9,M&M Food Market\n3.9,,Mississauga,1001 to 5000 Employees,,Company - Private,Animal Production,Agriculture,$25 to $100 million (USD)
2,Data Scientist – Revenue Management,CA$84K - CA$96K (Glassdoor est.),Powered by water... and by people like you\n\n...,-1.0,BC Hydro\n3.8,,Vancouver,1001 to 5000 Employees,,Self-employed,Energy & Utilities,"Energy, Mining & Utilities",Unknown / Non-Applicable
3,Data Scientist,CA$55K - CA$119K (Glassdoor est.),Join our Winning Team as a Data Scientist\nWhe...,4.5,Carfax\n4.5,,London,1001 to 5000 Employees,1984.0,Company - Public,Internet & Web Services,Information Technology,Unknown / Non-Applicable
4,Analytics Data Scientist,CA$65K - CA$100K (Employer est.),BrainFinance is a leading financial technology...,-1.0,Brain Finance,,Montreal,Unknown,,Company - Private,,,Unknown / Non-Applicable
5,Data Scientist,CA$85K - CA$120K (Glassdoor est.),Referred applicants should not apply directly ...,3.6,Loblaw Companies Limited\n3.6,,Brampton,10000+ Employees,,Company - Public,Vehicle Dealers,Retail & Wholesale,$10+ billion (USD)
6,"SENIOR MANAGER, DATA & ARTIFICIAL INTELLIGENCE...",CA$67K - CA$93K (Glassdoor est.),"Location: Pickering, ON, CA, L1W 3J2\nReq ID: ...",4.2,Ontario Power Generation\n4.2,,Pickering,5001 to 10000 Employees,1999.0,Government,Energy & Utilities,"Energy, Mining & Utilities",$1 to $5 billion (USD)
7,Lead Data Scientist (Hybrid),CA$56K - CA$108K (Glassdoor est.),"Job Category:\nProperty Tax, Software\nOpportu...",4.0,Altus Group\n4.0,,Toronto,1001 to 5000 Employees,2005.0,Company - Public,Real Estate,Real Estate,$100 to $500 million (USD)
8,Data Science and Analytics Manager,CA$76K - CA$88K (Glassdoor est.),Data Science Manager\nGALE is a creative media...,3.7,GALE Partners\n3.7,,Toronto,501 to 1000 Employees,2014.0,Company - Private,Advertising & Public Relations,Media & Communication,Unknown / Non-Applicable
9,Machine Learning Engineer,CA$60K - CA$98K (Glassdoor est.),Charger Logistics is a world class asset-based...,3.5,Charger Logistics Inc\n3.5,,Brampton,1001 to 5000 Employees,2005.0,Company - Private,Taxi & Car Services,Transportation & Logistics,$5 to $25 million (USD)


In [9]:
# Inspect the dtype pandas assigned to each scraped column.
df.dtypes

Job Title            object
Salary Estimate      object
Job Description      object
Rating               object
Company Name         object
Division             object
Location             object
Size                 object
Founded              object
Type of ownership    object
Industry             object
Sector               object
Revenue              object
dtype: object