In [1]:
# Install packages

#import sys
#!{sys.executable} -m pip install selenium
#!{sys.executable} -m pip install webdriver_manager

In [2]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import re
import numpy as np

In [3]:
# Install chromedriver through webdriver_manager once per kernel session and
# wrap the result in a Selenium Service; get_jobs reads this module-level `service`.
service = Service(executable_path=ChromeDriverManager().install())

In [4]:
def _text_or_default(root, xpath, default=-1):
    '''Return the text of the first element under `root` matching `xpath`.

    `root` is any object exposing Selenium's `find_element` (the driver or a
    single element). When nothing matches, `default` is returned instead of
    raising, so a missing field becomes the -1 sentinel used in the output.
    '''
    try:
        return root.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return default


def _to_number(raw, cast, default=-1):
    '''Convert scraped text with `cast` (int or float); non-numeric text maps to `cast(default)`.'''
    try:
        return cast(raw)
    except (TypeError, ValueError):
        return cast(default)


def _scrape_posting(driver, max_attempts=3, retry_wait=5):
    '''Read the basic fields of the job posting currently open in the detail pane.

    Returns a dict with keys company_name, division, location, job_title and
    job_description; any field that cannot be found is -1. Unexpected errors
    are retried up to `max_attempts` times, `retry_wait` seconds apart, rather
    than forever.
    '''
    posting = dict.fromkeys(
        ("company_name", "division", "location", "job_title", "job_description"), -1)

    for attempt in range(1, max_attempts + 1):
        try:
            # The scraped employer name may carry the rating on a second line
            # (e.g. "Carfax\n4.5"); it is stored as scraped.
            posting["company_name"] = _text_or_default(driver, './/div[@data-test="employerName"]')
            posting["division"] = _text_or_default(driver, './/div[@class="division"]')
            posting["location"] = _text_or_default(driver, './/div[@data-test="location"]')
            posting["job_title"] = _text_or_default(driver, './/div[@data-test="jobTitle"]')

            # Expand the description first so the full text is read.
            try:
                driver.find_element(By.XPATH, './/div[@id="JobDescriptionContainer"]//div[text()="Show More"]').click()
                time.sleep(0.5)
            except NoSuchElementException:
                pass

            posting["job_description"] = _text_or_default(driver, './/div[@class="jobDescriptionContent desc"]')
            break
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C still interrupts the scrape.
            if attempt == max_attempts:
                print("Giving up on posting details after {} attempts: {}".format(max_attempts, err))
            else:
                time.sleep(retry_wait)

    return posting


def _scrape_company_overview(driver):
    '''Read the company overview fields, keyed by their on-page label.

    Returns a dict with keys Size, Founded, Type, Industry, Sector and Revenue.
    Every value is -1 when the posting has no company container, and an
    individual value is -1 when that label is not listed.
    '''
    labels = ("Size", "Founded", "Type", "Industry", "Sector", "Revenue")
    overview = dict.fromkeys(labels, -1)

    try:
        driver.find_element(By.XPATH, './/div[@id="CompanyContainer"]')
    except NoSuchElementException:
        # Rarely, some job postings do not have the "Company" tab.
        return overview

    for label in labels:
        overview[label] = _text_or_default(
            driver, './/*[@id="EmpBasicInfo"]//div[span/text()="{}"]/span[2]'.format(label))
    return overview


def get_jobs(keyword, location, num_jobs, verbose, slp_time, jobs):
    '''Scrape Glassdoor job postings into `jobs` and return them as a DataFrame.

    Parameters
    ----------
    keyword : str
        Search phrase; it is URL-encoded before being placed in the query string.
    location : str
        'canada', 'us' or 'usa' (case-insensitive). Anything else prints a
        message and returns the current contents of `jobs` without opening
        a browser.
    num_jobs : int
        Scraping stops once `jobs` holds this many records.
    verbose : bool
        When true, every scraped field is printed as it is collected.
    slp_time : float
        Seconds to wait for each results page to load.
    jobs : list
        List the records (dicts) are appended to in place. It is owned by the
        caller so rows collected before an exception are not lost.

    Returns
    -------
    pandas.DataFrame
        A frame built from `jobs`. Fields that were not found hold -1.
    '''
    # Validate the location before launching Chrome so a bad argument does not
    # leave a stray browser window behind.
    country = location.lower()
    if country == 'canada':
        loc = "&locT=N&locId=3&locName=Canada"
    elif country in ('us', 'usa'):
        # NOTE(review): the original used "cId=1" here, which looks like a typo
        # for "locId" (compare the Canada branch) -- confirm against a live search.
        loc = "&locT=N&locId=1&locName=United%20States"
    else:
        print("Invalid Location: Canada or US only")
        return pd.DataFrame(jobs)

    from urllib.parse import quote

    encoded_keyword = quote(keyword)
    url = ("https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false"
           "&clickSource=searchBtn&typedKeyword=" + encoded_keyword
           + "&sc.keyword=" + encoded_keyword + loc + "&jobType=")

    options = webdriver.ChromeOptions()
    #Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    #options.add_argument('headless')

    # `service` is the module-level Service built from ChromeDriverManager, so
    # no chromedriver path has to be supplied here.
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.set_window_size(1120, 1000)
        driver.get(url)

        while len(jobs) < num_jobs:

            #Let the page load. Change slp_time based on your internet speed.
            time.sleep(slp_time)

            #Test for the "Sign Up" prompt and get rid of it.
            try:
                driver.find_element(By.CLASS_NAME, "selected").click()
            except (NoSuchElementException, ElementClickInterceptedException):
                pass

            time.sleep(1)

            try:
                driver.find_element(By.CSS_SELECTOR, '[alt="Close"]').click() #clicking to the X.
                print(' x out worked')
            except NoSuchElementException:
                print(' x out failed')

            #Gather all the job node elements from the left column of Glassdoor.
            #These are the buttons we're going to click.
            job_buttons = driver.find_elements(By.XPATH, "//article[@id='MainCol']//ul/li[starts-with(@class, 'react-job-listing')]")

            for job_button in job_buttons:

                print("Progress: {}".format("" + str(len(jobs)) + "/" + str(num_jobs)))
                if len(jobs) >= num_jobs:
                    break

                #Click on each job node element
                webdriver.ActionChains(driver).move_to_element(job_button).click(job_button).perform()
                time.sleep(1)

                posting = _scrape_posting(driver)

                #Glassdoor salary estimate (read from the listing card itself)
                salary_estimate = _text_or_default(job_button, './/span[@data-test="detailSalary"]')
                #Glassdoor employer review rating score
                rating = _text_or_default(driver, './/div[@id="employerStats"]/div[1]/div')

                #Printing for debugging
                if verbose:
                    print("Job Title: {}".format(posting["job_title"]))
                    print("Salary Estimate: {}".format(salary_estimate))
                    # str() because the description is the int -1 when it was not found.
                    print("Job Description: {}".format(str(posting["job_description"])[:500]))
                    print("Rating: {}".format(rating))
                    print("Company Name: {}".format(posting["company_name"]))
                    print("Division: {}".format(posting["division"]))
                    print("Location: {}".format(posting["location"]))

                overview = _scrape_company_overview(driver)

                if verbose:
                    print("Size: {}".format(overview["Size"]))
                    print("Founded: {}".format(overview["Founded"]))
                    print("Type of Ownership: {}".format(overview["Type"]))
                    print("Industry: {}".format(overview["Industry"]))
                    print("Sector: {}".format(overview["Sector"]))
                    print("Revenue: {}".format(overview["Revenue"]))
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                jobs.append({"Job Title" : posting["job_title"],
                "Salary Estimate" : salary_estimate,
                "Job Description" : posting["job_description"],
                "Rating" : _to_number(rating, float),
                "Company Name" : posting["company_name"],
                "Division" : posting["division"],
                "Location" : posting["location"],
                "Size" : overview["Size"],
                "Founded" : _to_number(overview["Founded"], int),
                "Type of ownership" : overview["Type"],
                "Industry" : overview["Industry"],
                "Sector" : overview["Sector"],
                "Revenue" : overview["Revenue"]})

            # Target reached: no need to load another results page.
            if len(jobs) >= num_jobs:
                break

            #Clicking on the "next page" button
            try:
                driver.find_element(By.XPATH, "//article[@id='MainCol']//button[starts-with(@class, 'nextButton')]").click()
            except NoSuchElementException:
                print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break
    finally:
        # Always close the browser, including when scraping raises.
        driver.quit()

    return pd.DataFrame(jobs)

In [5]:
# Possible change: keep the entire salary range estimate rather than an average.
# Company overview sections do not always include the same fields (lines 113-145).
# Where that section is read by div index, values may end up in the wrong column.
# An alternative is to look each value up by its sub-heading text instead.

# https://stackoverflow.com/questions/3655549/xpath-containstext-some-string-doesnt-work-when-used-with-node-with-more
# //*[contains(text(),'ABC')]

# try:
#    driver.findElement.textcontains('industry')
#    try:
#        industry = driver.findElement(industry).text
#    except NoSuchElementException:
#        industry = None
# except NoSuchElementException:
#     industry = None

In [6]:
# The results list is created here, outside get_jobs, so that records collected
# before an error remain available in the kernel. (An earlier version built the
# list inside the function and lost the collected data when an error occurred.)
jobs = []

# `location` must be either "canada" or "us".
# For a larger run, raise num_jobs (e.g. 1000).
get_jobs(
    keyword='data scientist',
    location='canada',
    num_jobs=10,
    verbose=False,
    slp_time=15,
    jobs=jobs,
)

df = pd.DataFrame(data=jobs)
df.to_csv('glassdoor_data_scientist_canada_221117_1000.csv', index=False)

#jobs.clear()

 x out failed
Progress: 0/10
Progress: 1/10
Progress: 2/10
Progress: 3/10
Progress: 4/10
Progress: 5/10
Progress: 6/10
Progress: 7/10
Progress: 8/10
Progress: 9/10
Progress: 10/10


In [7]:
# Preview the first ten scraped records.
df.head(n=10)

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Division,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Data Scientist,CA$55K - CA$65K (Glassdoor est.),Charger Logistics is a world class asset-based...,3.5,Charger Logistics Inc\n3.5,-1,Brampton,1001 to 5000 Employees,2005,Company - Private,Taxi & Car Services,Transportation & Logistics,$5 to $25 million (USD)
1,Junior Data Scientist,CA$74K - CA$90K (Glassdoor est.),We are looking for our next Junior Data Scient...,3.8,M&M Food Market\n3.8,-1,Mississauga,1001 to 5000 Employees,-1,Company - Private,Animal Production,Agriculture,$25 to $100 million (USD)
2,Data Scientist,CA$85K - CA$120K (Glassdoor est.),Referred applicants should not apply directly ...,3.6,Loblaw Companies Limited\n3.6,-1,Brampton,10000+ Employees,-1,Company - Public,Vehicle Dealers,Retail & Wholesale,$10+ billion (USD)
3,Data Scientist,CA$55K - CA$122K (Glassdoor est.),Join our Winning Team as a Data Scientist\nWhe...,4.5,Carfax\n4.5,-1,London,1001 to 5000 Employees,1984,Company - Public,Internet & Web Services,Information Technology,Unknown / Non-Applicable
4,Applied Data Scientist,CA$55K (Employer est.),Position\nApplied Data Scientist\nReporting to...,-1.0,Airudi,-1,Remote,-1,-1,-1,-1,-1,-1
5,Staff Data Scientist,-1,Staff Data Scientist\n\nMust Have:\n\n7+ years...,4.6,ISG Search Inc\n4.6,-1,Toronto,1 to 50 Employees,1990,Company - Private,Information Technology Support Services,Information Technology,Unknown / Non-Applicable
6,"Sr. Data Scientist - Up to $110,000 - Toronto",CA$110K (Employer est.),Role: Sr. Data Scientist\nStructure: Permanent...,5.0,CorGTA Inc.\n5.0,-1,Toronto,1 to 50 Employees,-1,Company - Private,-1,-1,Unknown / Non-Applicable
7,Data Analyst,CA$87K - CA$132K (Glassdoor est.),Position Title – Data Analyst\nLocation – Edmo...,3.7,Mphasis\n3.7,-1,Edmonton,10000+ Employees,1998,Company - Public,Information Technology Support Services,Information Technology,$5 to $10 billion (USD)
8,"Director, Data Governance and Enterprise Analy...",CA$140K - CA$168K (Employer est.),Company Bio\nIMAGINE a hospital where everyone...,4.2,North York General Hospital\n4.2,-1,Toronto,1001 to 5000 Employees,1968,Hospital,Health Care Services & Hospitals,Healthcare,$100 to $500 million (USD)
9,Junior Software Developer: Data Science,CA$55K - CA$75K (Employer est.),About Us\nCopperstone helps mining companies m...,5.0,Copperstone Technologies Ltd.\n5.0,-1,Edmonton,1 to 50 Employees,-1,Company - Private,-1,-1,Unknown / Non-Applicable


In [8]:
# Check the dtype pandas inferred for each scraped column.
df.dtypes

Job Title             object
Salary Estimate       object
Job Description       object
Rating               float64
Company Name          object
Division               int64
Location              object
Size                  object
Founded                int64
Type of ownership     object
Industry              object
Sector                object
Revenue               object
dtype: object

In [9]:
# Show one complete record (the row at position 1) with all of its fields.
df.iloc[1]

Job Title                                        Junior Data Scientist
Salary Estimate                       CA$74K - CA$90K (Glassdoor est.)
Job Description      We are looking for our next Junior Data Scient...
Rating                                                             3.8
Company Name                                      M&M Food Market\n3.8
Division                                                            -1
Location                                                   Mississauga
Size                                            1001 to 5000 Employees
Founded                                                             -1
Type of ownership                                    Company - Private
Industry                                             Animal Production
Sector                                                     Agriculture
Revenue                                      $25 to $100 million (USD)
Name: 1, dtype: object

In [10]:
# url2 = "https://www.glassdoor.com/Job/canada-data-scientist-jobs-SRCH_IL.0,6_IN3_KO7,21_IP6.htm?includeNoSalaryJobs=true&pgc=AB4ABYEAlgAAAAAAAAAAAAAAAe5EeBQAuQEDARs4CzoGfhJTbiC5ApjWmIvkwH1s1z3uEPNZzSGKaHAxaZM5%2F5DRVe8pQ1%2BsJPUTeE4IuXG8UtUIgJ6mDPs9y%2B%2Bv0vYftzSodDRaF4hgccAHeXCtjYhMzWT%2F4iey7pomOSCVRrWsTBjbZwk1B9PpR%2BTKkE6L30NLRundHM7eEfmOfMSwGn8yBHdRkJPRYHk5bGBwE%2B0aExGZ%2B3TyREdF0kq0eylv1MTOuvKxPKomfoG1hRi%2FXi9IAAA%3D"
# options = webdriver.ChromeOptions()
# driver2 = webdriver.Chrome(service=service, options=options)
# driver2.set_window_size(1120, 1000)
# driver2.get(url2)

In [11]:
# driver2.find_element(By.XPATH, './/div[@id="JobDescriptionContainer"]//div[text()="Show More"]').click()

In [12]:
# print(driver2.find_element(By.XPATH, './/div[@class="jobDescriptionContent desc"]').text)

In [13]:
# Count how many postings were scraped per company name.
df.loc[:, 'Company Name'].value_counts()

Charger Logistics Inc\n3.5            1
M&M Food Market\n3.8                  1
Loblaw Companies Limited\n3.6         1
Carfax\n4.5                           1
Airudi                                1
ISG Search Inc\n4.6                   1
CorGTA Inc.\n5.0                      1
Mphasis\n3.7                          1
North York General Hospital\n4.2      1
Copperstone Technologies Ltd.\n5.0    1
Name: Company Name, dtype: int64