In [1]:
# necessary for scraping
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import pandas as pd
import numpy as np
import time
import random

In [2]:
# scraping url for indeed search : data science job in Vancouver BC , radius 50km 
url = 'https://ca.indeed.com/jobs?q=data+science&l=Canada&radius=50&sort=date&start='

In [3]:
# function to scrape job on indeed given url
def indeed_scrape(url):
    # set up web browser for scraping
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(  
        options=options
    )
    driver.get(url)
    driver.maximize_window()
    #  collect all job listing elements
    job_cards = driver.find_elements(By.CSS_SELECTOR, ".cardOutline")
    # create list to store job listing
    jobs = []
    # create variable to store data in job posting
    date = None
    applications = None
    job_title = None
    company_name = None
    company_reviews = None
    location = None
    pay = None
    job_type = None
    benefits = None
    job_description = None
    # iterate through job listing elements
    for job_card in job_cards:
        # extract job date posted
        try:
            date_element = job_card.find_element(By.CSS_SELECTOR, ".date")
            date = date_element.text
        except NoSuchElementException: pass
        # close pop-up dialog
        try:
            dialog_element = driver.find_element(By.CSS_SELECTOR, "[role='dialog']")
            close_button = dialog_element.find_element(By.CSS_SELECTOR, "[aria-label=close][type='button']")
            close_button.click()
        except NoSuchElementException: pass
        # click job listing
        job_card.click()
        # wait for listing to load by check if job title element exists
        # extract job titles
        try:
            title_element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".jobsearch-JobInfoHeader-title")))
            job_title = title_element.text.replace("\n- job post", "")
        except TimeoutException: pass
        # navigate to detail job listing panel
        job_details_element = driver.find_element(By.CSS_SELECTOR, ".jobsearch-RightPane")
        # extract company name, review, location
        company_info_element = job_details_element.find_element(By.CSS_SELECTOR, "[data-testid='jobsearch-CompanyInfoContainer']")
        company_info = company_info_element.text.split("\n")
        company_name = company_info[0]
        if len(company_info) == 2:
            company_location = company_info[1]
        if len(company_info) == 3:
            company_reviews = company_info[1]
            company_location = company_info[2]
        # extract salary and job type
        for div in job_details_element.find_elements(By.CSS_SELECTOR, "#jobDetailsSection div"):
            if div.text == "Pay":
                pay_element = div.find_element(By.XPATH, "following-sibling::*")
                pay = pay_element.text
            elif div.text == "Job type":
                job_type_element = div.find_element(By.XPATH, "following-sibling::*")
                job_type = job_type_element.text
        # extract benefits
        try:
            benefits_element = job_details_element.find_element(By.ID, "benefits")
            benefits = []
            for benefit_element in benefits_element.find_elements(By.TAG_NAME, "li"):
                benefit = benefit_element.text
                benefits.append(benefit)
        except NoSuchElementException: pass
        try:
            description_element = job_details_element.find_element(By.ID, "jobDescriptionText")
            job_description = description_element.text
        except NoSuchElementException: pass
        # create dictionary to store data for job posting
        job_posting = {}
        job_posting["date"] = date
        job_posting["job_title"] = job_title 
        job_posting["company_name"] = company_name
        job_posting["company_reviews"] = company_reviews
        job_posting["company_location"] = company_location
        job_posting["pay"] = pay
        job_posting["job_type"] = job_type
        job_posting["benefits"] = benefits
        job_posting["job_description"] = job_description
        jobs.append(job_posting)
        time.sleep(random.uniform(1, 5))
    # close the browser
    driver.quit()
    # return dictionary of job postings
    return jobs
    print("Scraping done!!")

In [6]:
# scrape data for first number pages of indeed
job_df = pd.DataFrame()
page_num = 25
for page in range(page_num):
    start_page = str(page*10)
    job_df  = pd.concat([job_df,pd.DataFrame(indeed_scrape(url+start_page))], ignore_index=True)

In [8]:
# export data table to csv file
job_df.to_csv(r"C:\Users\bosst\Data Science Job Market Research\data\indeed_jobs.csv")
job_df

Unnamed: 0,date,job_title,company_name,company_reviews,company_location,pay,job_type,benefits,job_description
0,Posted\nJust posted,Machine Learning Engineer,Blurb.fm,,Remote,"$66,613.42–$144,330.80 a year",Fixed term contract,,ML ENGINEER\nAre you a machine learning expert...
1,Posted\nJust posted,"Ingénieur de données et visualisation, Science...",Pratt & Whitney,"1,423 reviews","Longueuil, QC","$66,613.42–$144,330.80 a year",Full-time,"[Company pension, Flexible schedule]",Date Posted:\n2023-11-14\nCountry:\nCanada\nLo...
2,Posted\nJust posted,"Data Analyst, Merchandise Planning",SSENSE,299 reviews,"Montréal, QC","$66,613.42–$144,330.80 a year",Fixed term contract,"[Company pension, Flexible schedule]","Company Description\n\nSSENSE, pronounced [es-..."
3,Posted\nJust posted,Data Science / Machine learning specialist - F...,CIUSSS de l'Ouest-de-l’Île-de-Montréal,43 reviews,"Verdun, QC","$66,613.42–$144,330.80 a year",Full-time,"[Company pension, Flexible schedule]",Reason for Post :\nReplacement more than de 6 ...
4,Posted\nJust posted,Spécialiste en science des données / apprentis...,CIUSSS de l'Ouest-de-l’Île-de-Montréal,43 reviews,"Verdun, QC","$66,613.42–$144,330.80 a year",Temporary\nFull-time,"[Company pension, Flexible schedule]",Raison d'être de l'affichage :\nRemplacement d...
...,...,...,...,...,...,...,...,...,...
370,Employer\nActive 22 days ago,"Senior Software Engineer, Machine Learning",owl.co,279 reviews,"Vancouver, BC•Hybrid remote","$120,000–$230,000 a year",Full-time,"[Dental care, Designated paid holidays, Disabi...",We are not working with recruitment agencies a...
371,Posted\nPosted 30+ days ago,Spécialiste Senior Données Maitres SAP (Master...,Bel Group,279 reviews,"Montréal, QC","$120,000–$230,000 a year",Permanent,"[Dental care, Designated paid holidays, Disabi...","VOUS NOUS CONNAISSEZ DEJA, C'EST SÛR\nVous ave..."
372,Posted\nPosted 30+ days ago,Bi-lingual Product Owner,TV2 Consulting,279 reviews,"Montréal, QC","$81,000–$90,000 a year",Fixed term contract,"[Dental care, Designated paid holidays, Disabi...",Be bilingual French and English\nScrum certifi...
373,Posted\nPosted 30+ days ago,Part-time Instructor in Working with AI,Universite Concordia,20 reviews,"Montréal, QC","$81,000–$90,000 a year",Part-time,"[Dental care, Designated paid holidays, Disabi...",Part-time Instructor in Working with AI\nLast ...
