# Job Listing Scraper

This dataset is created by scraping job search sites, including Glassdoor, Indeed, LinkedIn, and Angel, with Python's Selenium library. The scraper collects the following fields:

1. **Company Name**: Name of the company
2. **Job Title**: The title of the job, e.g. data scientist, junior data scientist, senior data scientist, etc.
3. **Job Description**: Describes what is expected of the person in the role.
4. **Job Requirement**: Required skills
5. **Salary Estimate**: Range of salary and the source.
6. **Benefits**: Benefits offered by the company including medical insurance, equity, etc.
7. **Location**: Location of the job
8. **Size**: Range of the number of employees working at the company
9. **Rating**: It gives the rating of the company
10. **Review**: Employee Reviews
11. **Industry**: Industry of the company
12. **Sector**: Sector in which the company operates
13. **Revenue**: Total revenue of the company per year

### Install Packages

In [None]:
#chromedriver - https://sites.google.com/chromium.org/driver/
#pip install -U selenium
#conda install -c conda-forge python-dotenv
#conda install -c conda-forge webdriver-manager
#conda install tqdm

### Add Libraries

In [1]:
import os
import time
import requests
import numpy as np
import pandas as pd

from tqdm.auto import tqdm # works for both terminal and notebook
from dotenv import load_dotenv, find_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.support.wait import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

### Initialize Webdriver

In [None]:
# Load key/value pairs from a .env file into the process environment.
load_dotenv()

In [None]:
# Path to the chromedriver executable, read from the "chrome_driver" environment variable.
chrome_driver = os.environ["chrome_driver"]
# Mode must be an octal literal: 0o755 is rwxr-xr-x, whereas decimal 755 is 0o1363.
os.chmod(chrome_driver, 0o755)

In [2]:
# Glassdoor search-results URL used by the cells below; alternative filter
# combinations are kept commented out for quick switching.
#url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm?remoteWorkType=1&sortBy=date_desc'
#url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime&remoteWorkType=1'
#url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm?seniorityType=entrylevel&remoteWorkType=1'
url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime&seniorityType=entrylevel&remoteWorkType=1'



In [None]:
# Glassdoor "data scientist" search URLs, one per regional site or country filter.
# NOTE(review): the ecuador, nigeria, egypt and japan entries carry no
# ?jobType=fulltime query, unlike the others — confirm whether that is intended.
urls = ['https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime&remoteWorkType=1&sortBy=date_desc',
        'https://www.glassdoor.com.ar/Empleo/data-scientist-empleos-SRCH_KO0,14.htm?jobType=fulltime', 
        'https://www.glassdoor.com.au/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://nl.glassdoor.be/Vacature/data-scientist-vacatures-SRCH_KO0,14.htm?jobType=fulltime',
        'https://fr.glassdoor.be/Emploi/data-scientist-emplois-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.com.br/Vaga/data-scientist-vagas-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.ca/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://fr.glassdoor.ca/Emploi/data-scientist-emplois-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.de/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.es/Empleo/data-scientist-empleos-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.fr/Emploi/data-scientist-emplois-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.com.hk/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.co.in/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.ie/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.it/Lavoro/data-scientist-lavori-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.com.mx/Empleo/data-scientist-empleos-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.nl/Vacature/data-scientist-vacatures-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.co.nz/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.at/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://de.glassdoor.ch/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.sg/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://fr.glassdoor.ch/Emploi/data-scientist-emplois-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.co.uk/Job/data-scientist-jobs-SRCH_KO0,14.htm?jobType=fulltime',
        'https://www.glassdoor.com/Job/south-africa-data-scientist-jobs-SRCH_IL.0,12_IN211_KO13,27.htm?jobType=fulltime',
        'https://www.glassdoor.com/Job/uruguay-data-scientist-jobs-SRCH_IL.0,7_IN246_KO8,22.htm?jobType=fulltime',
        'https://www.glassdoor.com/Job/mexico-data-scientist-jobs-SRCH_IL.0,6_IN169_KO7,21.htm?jobType=fulltime',
        'https://www.glassdoor.com/Job/costa-rica-data-scientist-jobs-SRCH_IL.0,10_IN57_KO11,25.htm?jobType=fulltime',
        'https://www.glassdoor.com/Job/chile-data-scientist-jobs-SRCH_IL.0,5_IN49_KO6,20.htm?jobType=fulltime',
        'https://www.glassdoor.com/Job/ecuador-data-scientist-jobs-SRCH_IL.0,7_IN68_KO8,22.htm',
        'https://www.glassdoor.com/Job/nigeria-data-scientist-jobs-SRCH_IL.0,7_IN177_KO8,22.htm',
        'https://www.glassdoor.com/Job/egypt-data-scientist-jobs-SRCH_IL.0,5_IN69_KO6,20.htm',
        'https://www.glassdoor.com/Job/japan-data-scientist-jobs-SRCH_IL.0,5_IN123_KO6,20.htm',
        'https://www.glassdoor.com/Job/china-data-scientist-jobs-SRCH_IL.0,5_IN48_KO6,20.htm?jobType=fulltime',
        'https://www.glassdoor.com/Job/south-korea-data-scientist-jobs-SRCH_IL.0,11_IN135_KO12,26.htm?jobType=fulltime']

In [None]:
# Create the Chrome options object (no options are set on it in this cell).
options = Options()

In [None]:
# Build the driver Service from the chromedriver path read from the environment.
# The commented alternative would obtain the driver through webdriver-manager instead.
#service = Service(ChromeDriverManager().install())
service = Service(chrome_driver)

In [2]:
def driver_setup(env_var, url):
    """Start a Chrome WebDriver session and load ``url`` in it.

    Args:
        env_var: Name of the environment variable (optionally supplied through
            a ``.env`` file) that holds the path to the chromedriver binary.
        url: Address of the page to open once the browser has started.

    Returns:
        The ``webdriver.Chrome`` instance with ``url`` loaded. The caller owns
        the session and is responsible for shutting it down.

    Raises:
        KeyError: If ``env_var`` is not present in the environment.
    """
    load_dotenv(find_dotenv())  # search parent directories for a .env file
    driver_path = os.environ[env_var]

    # Unix only: make the driver executable. The mode has to be the octal
    # literal 0o755 (rwxr-xr-x); decimal 755 would be interpreted as 0o1363.
    os.chmod(driver_path, 0o755)

    driver = webdriver.Chrome(service=Service(driver_path), options=Options())
    try:
        driver.set_window_size(1120, 1000)
        driver.get(url)
    except BaseException:
        # Do not leave an orphaned browser behind if the page cannot be loaded
        # (or the user interrupts the load).
        driver.quit()
        raise

    return driver

In [34]:
# XPath templates used by the detail-pane helpers; "{}" is a 1-based position.
_COMPANY_INFO_XPATH = '(//div[@class="d-flex justify-content-start css-daag8o e1pvx6aw2"])[{}]//span[2]'
_SUB_RATING_XPATH = '//ul[@class="css-1t3mcrv erz4gkm2"]/span[{}]'
_REVIEW_HEADING_XPATH = '(//div[@class="css-1sfecah e1vn3ovn1"])[{}]//div'

# (label printed in verbose mode, key in the job record), in print order.
_VERBOSE_FIELDS = (
    ("Company Name", "Company Name"),
    ("Job Title", "Job Title"),
    ("Location", "Location"),
    ("Country", "Country"),
    ("Job Description", "Job Description"),
    ("Salary Estimate", "Salary Estimate"),
    ("Avg Salary", "Avg Salary"),
    ("Size", "Size"),
    ("Industry", "Industry"),
    ("Sector", "Sector"),
    ("Revenue", "Revenue"),
    ("Rating", "Rating"),
    ("Recommend To Friend", "Recommend"),
    ("Approve of CEO", "CEO"),
    ("Benefits Rating", "Benefits"),
    ("Career Opportunities", "Opportunities"),
    ("Comp & Benefits", "Comp Benefits"),
    ("Culture & Values", "Culture"),
    ("Senior Managment", "Management"),
    ("Work Life Balance", "WorkLife Balance"),
)


def _is_present(driver, by, locator):
    """Return True when at least one element matches ``locator``."""
    try:
        driver.find_element(by, locator)
    except NoSuchElementException:
        return False
    return True


def _click_if_present(driver, xpath):
    """Click the element at ``xpath`` if it exists; do nothing otherwise."""
    try:
        driver.find_element(By.XPATH, xpath).click()
    except NoSuchElementException:
        pass


def _text_or_nan(driver, xpath):
    """Return the stripped text of the element at ``xpath``, or NaN if it is absent."""
    try:
        return driver.find_element(By.XPATH, xpath).text.strip()
    except NoSuchElementException:
        return np.nan


def _float_or_nan(driver, xpath):
    """Return the element's text as a float, or NaN if it is absent or not numeric."""
    try:
        return float(_text_or_nan(driver, xpath))
    except (TypeError, ValueError):
        # An empty or non-numeric label must not abort the whole scrape.
        return np.nan


def _review_snippets(driver, position):
    """Return the review texts that follow the ``position``-th review heading, or NaN."""
    try:
        heading = driver.find_element(By.XPATH, _REVIEW_HEADING_XPATH.format(position))
        return [p.text for p in heading.find_elements(By.XPATH, "following-sibling::p")]
    except NoSuchElementException:
        return np.nan


def _wait_for_search_header(driver, slp_time):
    """Block until the results header is readable; return ``(total_listings, country)``."""
    while True:
        try:
            tokens = driver.find_element(By.XPATH, "//p[@data-test='jobsCount']").text.split()
            if not tokens:
                # Fallback: read the count from the H1 title instead.
                tokens = driver.find_element(By.XPATH, "//h1[@data-test='jobCount-H1title']").text.split()
            total_listings = int(tokens[0])
            country = driver.find_element(
                By.XPATH, '//div[@class="css-m3gjah egu3u860"]/div[@class="selectedLabel"]'
            ).text.strip()
            return total_listings, country
        except (NoSuchElementException, IndexError):
            # Element missing or still empty: treat as "not rendered yet" and retry.
            time.sleep(slp_time)


def _scrape_listing(driver, country, slp_time):
    """Read every field of the currently opened listing into one record (dict)."""
    # Imported locally so this block does not depend on a change to the file's imports.
    from selenium.common.exceptions import WebDriverException

    # Retry until the mandatory fields are readable. Only WebDriver errors are
    # retried, so Ctrl-C and unrelated failures are no longer swallowed.
    while True:
        try:
            company_name = driver.find_element(By.XPATH, '//div[@class="css-xuk5ye e1tk4kwz5"]').text.strip()
            job_title = driver.find_element(By.XPATH, '//div[@class="css-1j389vi e1tk4kwz2"]').text.strip()
            location = driver.find_element(By.XPATH, '//div[@class="css-56kyx5 e1tk4kwz1"]').text.strip()
            job_description = driver.find_element(By.XPATH, '//div[@class="jobDescriptionContent desc"]').text
            break
        except WebDriverException:
            time.sleep(slp_time)

    salary_range = _text_or_nan(driver, '//span[@class="css-1hbqxax e1wijj240"]')

    # Keep only the first token of the average-salary label; an empty label becomes NaN.
    salary_avg = _text_or_nan(driver, '//div[@class="css-y2jiyn e2u4hf18"]')
    if isinstance(salary_avg, str):
        salary_tokens = salary_avg.split()
        salary_avg = salary_tokens[0] if salary_tokens else np.nan

    # Company overview rows are only looked up when the company container exists.
    size = industry = sector = revenue = np.nan
    if _is_present(driver, By.ID, 'CompanyContainer'):
        size = _text_or_nan(driver, _COMPANY_INFO_XPATH.format(1))
        industry = _text_or_nan(driver, _COMPANY_INFO_XPATH.format(4))
        sector = _text_or_nan(driver, _COMPANY_INFO_XPATH.format(5))
        revenue = _text_or_nan(driver, _COMPANY_INFO_XPATH.format(6))

    # Ratings block.
    rating = recommend = ceo = np.nan
    opportunities = comp_benefits = culture = management = work_life = np.nan
    if _is_present(driver, By.XPATH, '//div[@data-test="company-ratings"]'):
        rating = _float_or_nan(driver, '//div[@class="mr-sm css-ey2fjr e1pr2f4f3"]')
        recommend = _text_or_nan(driver, '(//div[@class="d-flex top css-rkhv2t e1o78bat1"])[1]//div[1]')
        ceo = _text_or_nan(driver, '//div[@class="css-vkhqai ceoApprove"]')
        opportunities = _float_or_nan(driver, _SUB_RATING_XPATH.format(3))
        comp_benefits = _float_or_nan(driver, _SUB_RATING_XPATH.format(6))
        culture = _float_or_nan(driver, _SUB_RATING_XPATH.format(9))
        management = _float_or_nan(driver, _SUB_RATING_XPATH.format(12))
        work_life = _float_or_nan(driver, _SUB_RATING_XPATH.format(15))

    # Employee reviews (pros are under the first heading, cons under the second).
    pros = cons = np.nan
    if _is_present(driver, By.ID, 'ReviewsContainer'):
        pros = _review_snippets(driver, 1)
        cons = _review_snippets(driver, 2)

    # Benefits rating.
    benefits_rating = np.nan
    if _is_present(driver, By.CLASS_NAME, 'p-std'):
        benefits_rating = _float_or_nan(driver, '//div[@class="ratingNum mr-sm"]')

    return {"Company Name": company_name,
            "Job Title": job_title,
            "Location": location,
            "Country": country,
            "Job Description": job_description,
            "Salary Estimate": salary_range,
            "Avg Salary": salary_avg,
            "Size": size,
            "Industry": industry,
            "Sector": sector,
            "Revenue": revenue,
            "Rating": rating,
            "Recommend": recommend,
            "CEO": ceo,
            "Benefits": benefits_rating,
            "Opportunities": opportunities,
            "Comp Benefits": comp_benefits,
            "Culture": culture,
            "Management": management,
            "WorkLife Balance": work_life,
            "Pros": pros,
            "Cons": cons}


def _print_job(job):
    """Print one scraped record for debugging (description truncated to 500 chars)."""
    for label, key in _VERBOSE_FIELDS:
        value = job[key]
        if key == "Job Description":
            value = value[:500]
        print("{}: {}".format(label, value))
    print("Pros: ", job["Pros"])
    print("Cons: ", job["Cons"])
    print("")
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
    print("")


def get_jobs(url, env_var, num_jobs=30, verbose=False, slp_time=5):
    """Scrape job listings from a Glassdoor search-results page.

    Opens ``url`` in Chrome, clicks through the listings page by page and
    reads the detail pane of each one.

    Args:
        url: Glassdoor search-results URL to start from.
        env_var: Name of the environment variable holding the chromedriver
            path (passed on to ``driver_setup``).
        num_jobs: Maximum number of listings to scrape; lowered to the number
            of listings reported by the page when that is smaller.
        verbose: When True, print every scraped record instead of the
            one-line running counter.
        slp_time: Seconds to wait before reading each results page and
            between retries while an element has not rendered yet.

    Returns:
        A ``pandas.DataFrame`` with one row per scraped listing. Fields that
        are not shown for a listing are ``NaN``.
    """
    driver = driver_setup(env_var, url)
    jobs = []

    try:
        total_listings, country = _wait_for_search_header(driver, slp_time)
        print("Country:", country)

        if num_jobs > total_listings:
            print(f"The number of jobs to be scrapped: {num_jobs} exceeds the number of listings: {total_listings}")
            num_jobs = total_listings
            print("The number of jobs has been updated to reflect the number of listings")
            print("")

        print(f"Total number of job listings: {total_listings}, number of jobs to be scraped: {num_jobs}")
        print("")

        # The bar is created after clamping so its total matches the real target.
        with tqdm(total=num_jobs) as pbar:
            while len(jobs) < num_jobs:
                time.sleep(slp_time + .1)

                for listing in driver.find_elements(By.CLASS_NAME, "react-job-listing"):
                    if len(jobs) >= num_jobs:
                        print(f"Scraping completed, scraped {len(jobs)} of {num_jobs} jobs")
                        break

                    listing.click()
                    time.sleep(2)

                    _click_if_present(driver, "//div[@class='qual_x_close']")  # survey pop-up
                    _click_if_present(driver, "//span[@alt='Close']")  # close (X) button

                    job = _scrape_listing(driver, country, slp_time)
                    jobs.append(job)
                    pbar.update(1)  # tick only once the record has been collected

                    if verbose:
                        _print_job(job)
                    else:
                        print(f"Scraped {len(jobs)} out of {num_jobs} job listings", end='\r')

                if len(jobs) >= num_jobs:
                    # Target reached: no need to load another results page.
                    break

                try:
                    driver.find_element(By.XPATH, '//button[@data-test="pagination-next"]').click()
                except NoSuchElementException:
                    print(f"Scraping completed, scraped {len(jobs)}, out of {num_jobs} job listings.")
                    break
    finally:
        # quit() ends the session and the chromedriver process, even after an error.
        driver.quit()

    return pd.DataFrame(jobs)
        

In [15]:
# Example run: scrape up to 30 listings from the glassdoor.com remote-work,
# date-sorted search and display the resulting DataFrame.
env_var = "chrome_driver"
url = 'https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm?remoteWorkType=1&sortBy=date_desc'

df = get_jobs(url=url, env_var=env_var, num_jobs=30)
df   

  0%|          | 0/30 [00:00<?, ?it/s]

Country:  United States
Total number of job listings: 2826, number of jobs to be scraped: 30

Scraped 30 out of 30 job listings

Unnamed: 0,Company Name,Job Title,Location,Country,Job Description,Salary Estimate,Avg Salary,Size,Industry,Sector,...,Recommend,CEO,Benefits,Opportunities,Comp Benefits,Culture,Management,WorkLife Balance,Pros,Cons
0,The Home Depot\n3.8,"Data Scientist, Marketing & Online (Remote)","Atlanta, GA",United States,Position Purpose:\nThe Data Scientist is respo...,Employer Provided Salary:$90K - $160K,"$125,000",10000+ Employees,Home Furniture & Housewares Stores,Retail & Wholesale,...,74 %,84 %,3.6,3.6,3.6,3.8,3.4,3.5,"[""Great culture and hours"" (in 45 reviews), ""P...","[""Honestly none, it really is a fantastic comp..."
1,The Home Depot\n3.8,"Lead Data Scientist, Marketing & Online (Remote)","Atlanta, GA",United States,Position Purpose:\nThe Lead Data Scientist is ...,Employer Provided Salary:$120K - $240K,"$180,000",10000+ Employees,Home Furniture & Housewares Stores,Retail & Wholesale,...,74 %,84 %,3.6,3.6,3.6,3.8,3.4,3.5,"[""Great culture and hours"" (in 45 reviews), ""P...","[""Honestly none, it really is a fantastic comp..."
2,The Home Depot\n3.8,"Senior Data Scientist, Marketing & Online (Rem...","Atlanta, GA",United States,Position Purpose:\nThe Sr. Data Scientist is r...,Employer Provided Salary:$90K - $190K,"$140,000",10000+ Employees,Home Furniture & Housewares Stores,Retail & Wholesale,...,74 %,84 %,3.6,3.6,3.6,3.8,3.4,3.5,"[""Great culture and hours"" (in 45 reviews), ""P...","[""Honestly none, it really is a fantastic comp..."
3,The Home Depot\n3.8,"Principal Data Scientist, Online Marketing (Re...","Atlanta, GA",United States,The Home Depot is able to offer virtual employ...,Employer Provided Salary:$150K - $280K,"$215,000",10000+ Employees,Home Furniture & Housewares Stores,Retail & Wholesale,...,74 %,84 %,3.6,3.6,3.6,3.8,3.4,3.5,"[""Great culture and hours"" (in 45 reviews), ""P...","[""Honestly none, it really is a fantastic comp..."
4,Teleperformance\n4.1,Sr Data Analyst - HR,Remote,United States,Overview:\nWe are looking for a skilled Data A...,,,5001 to 10000 Employees,Telecommunications Services,Telecommunications,...,82 %,89 %,2.5,4.1,3.8,4.0,3.9,3.8,"[""less physical health issues, good salary"" (i...","[""Low salary for whatever work you do"" (in 155..."
5,Akkshayya LLC,Senior Data Scientist,Remote,United States,***** THIS POSITION IS FOR FEDERAL GOVERNMENT ...,Employer Provided Salary:$85 - $90 Per Hour,$87.50,,,,...,,,,,,,,,,
6,General Motors\n4.1,Data Scientist - Advanced Analytics,United States,United States,Job Description\n\nRemote- This position does ...,Employer Provided Salary:$140K,"$140,000",10000+ Employees,Transportation Equipment Manufacturing,Manufacturing,...,81 %,88 %,4.2,3.8,4.0,4.0,3.6,3.8,"[""Benefits are good!"" (in 29 reviews), ""GM off...","[""great place to work, great benefit!"" (in 29 ..."
7,Clarkston Consulting\n4.4,Data Scientist,Remote,United States,Do you want the opportunity to leverage your s...,,,201 to 500 Employees,Business Consulting,Management & Consulting,...,82 %,96 %,3.9,4.3,4.0,4.6,4.4,4.0,"[""smart people at the top of the field"" (in 7 ...","[""Training is bad, you will have to figure out..."
8,Cuesta Partners LLC\n5.0,Data Strategy Technology Consultant,Remote,United States,"Flexible, USA\nCuesta Partners is looking for ...",,,1 to 50 Employees,Business Consulting,Management & Consulting,...,100 %,,,4.8,4.7,5.0,5.0,4.8,"[""A small but growing team with the most impre...",[No Cons have been reported by the Glassdoor c...
9,Cuesta Partners LLC\n5.0,Senior Data Strategy Technology Consultant,Remote,United States,Cuesta Partners is looking for a data & digita...,,,1 to 50 Employees,Business Consulting,Management & Consulting,...,100 %,,,4.8,4.7,5.0,5.0,4.8,"[""A small but growing team with the most impre...",[No Cons have been reported by the Glassdoor c...


In [35]:
# Example run against the Argentinian site, using get_jobs' default num_jobs.
env_var = "chrome_driver"
url = 'https://www.glassdoor.com.ar/Empleo/data-scientist-empleos-SRCH_KO0,14.htm?jobType=fulltime'
df = get_jobs(url=url, env_var=env_var)
df

  0%|          | 0/30 [00:00<?, ?it/s]

Country: Argentina
Total number of job listings: 61, number of jobs to be scraped: 30

Scraped 30 out of 30 job listings

Unnamed: 0,Company Name,Job Title,Location,Country,Job Description,Salary Estimate,Avg Salary,Size,Industry,Sector,...,Recommend,CEO,Benefits,Opportunities,Comp Benefits,Culture,Management,WorkLife Balance,Pros,Cons
0,Waterplan,Data Scientist,Trabajo desde casa,Argentina,Do you want to contribute towards solving one ...,,,,,,...,,,,,,,,,,
1,etermax\n4.3,Lead Data Scientist - Remoto,Trabajo desde casa,Argentina,"Somos etermax, una compañía internacional de t...",,,De 201 a 500 empleados,Desarrollo de equipos informáticos,Tecnologías de la información,...,93 %,93 %,4.1,4.1,3.9,4.3,3.8,4.5,,
2,CIDER S.A.\n3.2,Data Analyst / Data Scientist,Argentina,Argentina,"Argentina, CABA - Paternal\nNos encontramos en...",,,No se sabe,,,...,66 %,,2.0,3.0,2.6,2.5,2.6,2.8,,
3,Kunan S.A - Tu socio tecnologico\n4.4,Data Scientist,Córdoba,Argentina,We are looking to expand our Artificial Intell...,,,De 1 a 50 empleados,Soporte informático,Tecnologías de la información,...,100 %,,4.0,3.5,3.6,4.6,4.1,4.8,"[""Very good professional development policy."" ...",[La comunidad de Glassdoor no ha informado nin...
4,Equifax\n3.9,Data Scientist Senior - Argentina,Buenos Aires,Argentina,Lead statistical analysis and build scorecards...,,,Más de 10 000 empleados,Investigación y desarrollo,Administración y consultoría,...,77 %,87 %,3.8,3.8,3.8,3.8,3.6,3.9,"[""Great people to work with"" (en 27 evaluacion...","[""They don’t care about the people and if you ..."
5,Vates\n3.6,SR Data Scientist,Córdoba,Argentina,DESCRIPCIÓN\nMe contacto con vos ya que nos en...,,,De 201 a 500 empleados,Tecnologías de la información,Desconocido/no corresponde,...,80 %,,3.6,3.7,3.0,3.5,3.5,3.7,[La comunidad de Glassdoor no ha informado nin...,[La comunidad de Glassdoor no ha informado nin...
6,etermax\n4.3,Data Scientist (SSr/Sr) - Remoto,Trabajo desde casa,Argentina,"Somos etermax, una compañía internacional de t...",,,De 201 a 500 empleados,Desarrollo de equipos informáticos,Tecnologías de la información,...,93 %,93 %,4.1,4.1,3.9,4.3,3.8,4.5,,
7,La Caja\n4.2,Data Scientist,Buenos Aires,Argentina,Queres ser parte de un proyecto de transformac...,,,De 1001 a 5000 empleados,De $5 a $25 millones (USD),,...,96 %,,3.6,3.5,4.0,3.9,3.5,4.3,,
8,Equifax\n3.9,Data Scientist Canadá,Buenos Aires,Argentina,Equifax is looking for an experienced Data Sci...,,,Más de 10 000 empleados,Investigación y desarrollo,Administración y consultoría,...,77 %,87 %,3.8,3.8,3.8,3.8,3.6,3.9,"[""Great people to work with"" (en 27 evaluacion...","[""They don’t care about the people and if you ..."
9,Tecnoap\n4.1,SSR Data Scientist,Ensenada,Argentina,Employment Information\nReference: ZR_6_JOB\nM...,,,De 51 a 200 empleados,"Energía, minería e infraestructura pública",Desconocido/no corresponde,...,87 %,,,3.8,3.4,4.2,3.6,3.8,,


In [36]:
# Scrape several search pages one after another and stack the results.
env_var = "chrome_driver"
urls = ['https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm?remoteWorkType=1&sortBy=date_desc',
        'https://www.glassdoor.com.ar/Empleo/data-scientist-empleos-SRCH_KO0,14.htm?jobType=fulltime']

# One DataFrame per URL; each get_jobs call opens its own browser session.
dfs = [get_jobs(url=url, env_var=env_var) for url in urls]
# ignore_index=True renumbers the rows 0..n-1 across the combined frame.
df = pd.concat(dfs, ignore_index=True)
df

  0%|          | 0/30 [00:00<?, ?it/s]

Country: United States
Total number of job listings: 2894, number of jobs to be scraped: 30

Scraped 30 out of 30 job listings

  0%|          | 0/30 [00:00<?, ?it/s]

Country: Argentina
Total number of job listings: 61, number of jobs to be scraped: 30

Scraped 30 out of 30 job listings

Unnamed: 0,Company Name,Job Title,Location,Country,Job Description,Salary Estimate,Avg Salary,Size,Industry,Sector,...,Recommend,CEO,Benefits,Opportunities,Comp Benefits,Culture,Management,WorkLife Balance,Pros,Cons
0,The Home Depot\n3.8,"Data Scientist, Marketing & Online (Remote)","Atlanta, GA",United States,Position Purpose:\nThe Data Scientist is respo...,Employer Provided Salary:$90K - $160K,"$125,000",10000+ Employees,Home Furniture & Housewares Stores,Retail & Wholesale,...,74 %,84 %,3.6,3.6,3.6,3.8,3.4,3.5,"[""Great culture and hours"" (in 45 reviews), ""P...","[""Honestly none, it really is a fantastic comp..."
1,CyberCoders\n4.1,Senior Data Scientist,"Germantown, MD",United States,Senior Data Scientist\n-THIS POSITION IS 100% ...,Employer Provided Salary:$170K - $200K,"$185,000",201 to 500 Employees,Staffing & Subcontracting,Human Resources & Staffing,...,70 %,82 %,4.3,4.1,3.9,4.0,3.9,4.0,"[""good communication, good people kinda"" (in 1...","[""bad managment, few offices kinda"" (in 1 revi..."
2,CyberCoders\n4.1,Associate Data Scientist Hybrid,"Stamford, CT",United States,Associate Data Scientist Hybrid\nThis will be ...,Employer Provided Salary:$80K - $100K,"$90,000",201 to 500 Employees,Staffing & Subcontracting,Human Resources & Staffing,...,70 %,82 %,4.3,4.1,3.9,4.0,3.9,4.0,"[""good communication, good people kinda"" (in 1...","[""bad managment, few offices kinda"" (in 1 revi..."
3,CVS Health\n3.0,Senior Data Scientist,Texas,United States,Develops and/or uses algorithms and statistica...,,,10000+ Employees,Health Care Services & Hospitals,Healthcare,...,42 %,46 %,3.1,3.1,3.0,2.9,2.7,2.8,"[""Good in work life balance"" (in 51 reviews), ...","[""Sometimes work life balance isn't so balance..."
4,Cuesta Partners LLC\n5.0,Data Strategy Technology Consultant,Remote,United States,"Flexible, USA\nCuesta Partners is looking for ...",,,1 to 50 Employees,Business Consulting,Management & Consulting,...,100 %,,,4.8,4.7,5.0,5.0,4.8,"[""A small but growing team with the most impre...",[No Cons have been reported by the Glassdoor c...
5,Teleperformance\n4.1,Sr Data Analyst - HR,Remote,United States,Overview:\nWe are looking for a skilled Data A...,,,5001 to 10000 Employees,Telecommunications Services,Telecommunications,...,82 %,89 %,2.5,4.1,3.8,4.0,3.9,3.8,"[""less physical health issues, good salary"" (i...","[""Low salary for whatever work you do"" (in 155..."
6,Cuesta Partners LLC\n5.0,Senior Data Strategy Technology Consultant,Remote,United States,Cuesta Partners is looking for a data & digita...,,,1 to 50 Employees,Business Consulting,Management & Consulting,...,100 %,,,4.8,4.7,5.0,5.0,4.8,"[""A small but growing team with the most impre...",[No Cons have been reported by the Glassdoor c...
7,General Motors\n4.1,Data Scientist - Advanced Analytics,"Warren, MI",United States,Job Description\n\nRemote- This position does ...,Employer Provided Salary:$140K,"$140,000",10000+ Employees,Transportation Equipment Manufacturing,Manufacturing,...,81 %,88 %,4.2,3.8,4.0,4.0,3.6,3.8,"[""Benefits are good!"" (in 29 reviews), ""GM off...","[""great place to work, great benefit!"" (in 29 ..."
8,CalypsoAI\n4.3,Data Scientist - Professional Services,Remote,United States,Data Scientist - Professional Services (Client...,,,1 to 50 Employees,Enterprise Software & Network Solutions,Information Technology,...,83 %,,,4.4,3.7,4.2,4.4,4.0,"[""Amazing people"" (in 5 reviews), ""Lots of aut...","[""Beyond building great technologies, we have ..."
9,Piper Companies\n4.4,Remote Machine Learning Engineer,Remote,United States,Piper Companies is seeking a Remote Machine Le...,Employer Provided Salary:$140K - $150K,"$145,000",201 to 500 Employees,HR Consulting,Human Resources & Staffing,...,90 %,96 %,4.1,4.5,4.5,4.4,4.4,4.4,"[""I am so grateful to be a part of a great cul...","[""The culture of this organization has drastic..."


### Glassdoor Scraper

In [None]:
# HTTP headers for the requests-based scraper: a desktop Chrome user-agent string.
headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}

In [None]:
# headers must be passed by keyword: the second positional argument of
# requests.get is `params`, which would append the dict to the query string
# instead of sending it as HTTP headers. A timeout avoids hanging forever.
response = requests.get(url, headers=headers, timeout=30)
response.status_code

In [None]:
# Parse the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(response.content, 'html.parser')
#soup

In [None]:
# First "paginationFooter" div; find_all replaces the deprecated findAll alias
# and matches the spelling used by the other cells.
pagination = soup.find_all("div", {"class": "paginationFooter"})[0]

In [None]:
# Reduce the footer element to its whitespace-separated text tokens.
pagination = pagination.text.strip().split()

In [None]:
# Second token is the current page number, last token is the page count.
page_num, total_pages = int(pagination[1]), int(pagination[-1])

In [None]:
# Disabled pagination sketch, kept as a bare string literal so it never runs.
# NOTE(review): the URL interpolates page_num, which does not change inside the
# loop — presumably the loop variable i was intended; confirm before enabling.
"""
for i in range(page_num, total_pages+1):
    if page_num > 1:
       url = f"https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14_IP{page_num}.htm?seniorityType=entrylevel&includeNoSalaryJobs=true"

"""

In [None]:
# Collect every div whose class attribute matches the job-search module container.

divs = soup.find_all("div", class_='module p-0 job-search-key-kxun6g exy0tjh2')
#divs

In [None]:
# Collect the job-listing <li> items inside the first matching container.
listings = divs[0].find_all('li', class_='react-job-listing')
#len(listings)

In [None]:
# Collect all divs within each list item; find_all is required because find
# returns only the first div it encounters.

divs = [item.find_all('div') for item in listings] # one list of divs per list item (rebinds divs)
#divs[0]

In [None]:
# First and second div of every listing.
left_col = [item[0] for item in divs] # can't use 'find' since item is a list
right_col = [item[1] for item in divs]

In [None]:
# All <a> tags found in each listing's second div.
anchors = [item.find_all('a') for item in right_col]
#anchors

##### Company Names

In [None]:
# Company name: text of the <span> inside the first anchor of each listing.
companies = [links[0].find('span').text.strip() for links in anchors]
companies[0]

##### Job Titles

In [None]:
# Job title: text of the <span> inside the second anchor of each listing.
titles = [links[1].find('span').text.strip() for links in anchors]
titles[0]

##### Location

In [None]:
# Location: text of the first <span> inside each listing's location container.
locations = [
    item.find('div', class_='d-flex flex-wrap job-search-key-1m2z0go e1rrn5ka2').find('span').text.strip()
    for item in right_col
]
locations[0]

##### Salary

In [None]:
# Accumulators filled by the salary and rating loops below.
salaries = []
ratings = []

In [None]:
# Start from an empty list so re-running this cell does not append duplicates.
salaries = []

for item in right_col:
    try:
        salary = item.find('div', class_='css-1buaf54 pr-xxsm')
        salary = salary.find('span', class_='job-search-key-1hbqxax e1wijj240').text.strip()
        tokens = salary.split()

        # Up to four tokens: keep the first one; otherwise join the first and third as "min-max".
        salary = tokens[0] if len(tokens) <= 4 else tokens[0] + '-' + tokens[2]
    except (AttributeError, IndexError):
        # AttributeError: find() returned None (no salary shown); IndexError: empty text.
        salary = np.nan

    salaries.append(salary)

salaries[0]

##### Ratings

In [None]:
# Start from an empty list so re-running this cell does not append duplicates.
ratings = []

for item in left_col:
    try:
        rating = float(item.find('span', class_='job-search-key-srfzj0 e1cjmv6j0').text.strip())
    except (AttributeError, ValueError):
        # AttributeError: no rating span for this listing; ValueError: text is not a number.
        rating = np.nan

    ratings.append(rating)

ratings[0]

##### Job Listing Page

In [None]:
# href of the first anchor of every listing.
links = [link[0].get("href") for link in anchors]

In [None]:
# Prefix each href with the glassdoor.com origin (this rebinds the name urls).
urls = [f"https://www.glassdoor.com{link}" for link in links]

In [None]:
# Display the first job-listing URL.
urls[0]

In [None]:
# Use the first listing as the sample page for the request below.
jobLink = urls[0]

In [None]:
# headers must be passed by keyword: the second positional argument of
# requests.get is `params`, not `headers`. A timeout avoids hanging forever.
response = requests.get(jobLink, headers=headers, timeout=30)
response.status_code

In [None]:
link = 'https://www.glassdoor.com/job-listing/junior-sas-data-scientist-424-vezita-tech-JV_KO0,29_KE30,41.htm?jl=1008008033583&pos=101&ao=1110586&s=58&guid=0000018202dbf8f9920f2a5cca6a9cc3&src=GD_JOB_AD&t=SR&vt=w&ea=1&cs=1_abea6e2e&cb=1657905347193&jobListingId=1008008033583&cpc=654405A9B1E0A9F5&jrtk=3-0-1g81dnu9rk255801-1g81dnuahghre800-e5f5f92b99790728--6NYlbfkN0A9aFbeqbFpDTCoiHOd6k0wi_YQM7kD-1BJ08Zr1fUkZoDqNJGBVgd-vao9K1qY82N8I1kgImMFzYDAIglGvPLDd_djxuszz8IamPMPcX9as8QrYlFAfWUSEoUwZprhpr8YrJgAbGOJSa943B9zmKGu-lnmily_Vm49BOb2PIn7RfL5JdE5RJMYl4a4fOddmkGLqkobe84SNyejQcQhQjcFNbQpZNv5rzmr7e1JgAowQwQYBG4bbzRgYV0P_JCDy1Jazne5I0HOOD7GQL-5-aHhJieNzuA0BZwASplqp85J7rniTYqXL-CtXVMCZy3veuqlqALVVBNrIGx5nKfq75zgi41wNv1eqaC4acP-dwslixVtnnXUlBvYPrTBRohB4ZabC6Tqnn2hOk7Wtb5VmQPRi3DxrfPO5k-Z-BXsi6UuZs1di84rg1GI0di4MWn60GffS6wFi4pq4DnQu6OFWyYFMpDXWX0eH9AwqOfUDVtCmG8GWOJxwPSuF4JOugPon6v-76TtmoJc406kk32jKkKI&ctt=1657911504446'

In [None]:
# headers must be passed by keyword: the second positional argument of
# requests.get is `params`, not `headers`. A timeout avoids hanging forever.
response = requests.get(link, headers=headers, timeout=30)
response.status_code

In [None]:
# Check for missing data: print the length of every scraped list so that
# unequal lengths can be spotted before the lists are combined.

print('num companies:', len(companies), 'num titles:', len(titles), 
      'num locations:', len(locations), 'num salaries:', len(salaries),
      'num ratings:', len(ratings), 'num URLS:', len(urls))
