In [18]:
import datetime
import os
import random
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


In [2]:
class WebDriver:
    """Factory for a configured Chrome driver: ``WebDriver(...)`` returns a ``webdriver.Chrome``."""

    def __new__(cls, download_path: str, headless: bool = True) -> webdriver.Chrome:
        """
        Create a Chrome WebDriver with a custom download directory.

        This is ``__new__`` and it returns a ``webdriver.Chrome`` rather than an
        instance of this class, so no ``WebDriver`` object is ever created.

        Parameters:
            - download_path - the path where the files will be downloaded on your device
            - headless - if True, the browser will not show up when the script is running.
                         if False, the browser will show up when the script is running.

        Returns:
            A started ``webdriver.Chrome`` instance. The caller is responsible
            for calling ``quit()`` on it when finished.
        """
        options = Options()
        # A headless system is a computer that operates without a monitor,
        # graphical user interface (GUI) or peripheral devices, such as keyboard and mouse
        if headless:
            options.add_argument('--headless')

        # Setup custom download path.
        # NOTE(review): Chrome is understood to expect an absolute directory here,
        # so a relative path is resolved against the current working directory.
        options.add_experimental_option("prefs", {
            "download.default_directory": os.path.abspath(download_path),
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing.enabled": True
        })

        installed_path = ChromeDriverManager().install()
        # The driver binary is looked up by name inside the install directory
        # instead of trusting the returned path directly (presumably because
        # install() can point at a sibling file - TODO confirm). The binary name
        # only carries the ".exe" suffix on Windows.
        binary_name = "chromedriver.exe" if os.name == "nt" else "chromedriver"
        chromedriver_path = os.path.join(os.path.dirname(installed_path), binary_name)
        if not os.path.isfile(chromedriver_path):
            # Fall back to whatever the manager reported rather than passing a
            # path that does not exist.
            chromedriver_path = installed_path

        service = ChromeService(chromedriver_path)
        # Keyword arguments: the positional order of Chrome() differs between
        # Selenium 4 releases, so passing (options, service) positionally is unsafe.
        return webdriver.Chrome(options=options, service=service)

In [3]:
def sleep(start: int, stop: int):
    """
    Pause execution for a randomly chosen whole number of seconds.

    The duration is drawn uniformly from the inclusive range [start, stop]
    and announced on stdout before the pause begins.

    Parameters:
        - start - lower bound (inclusive) of the pause, in seconds
        - stop - upper bound (inclusive) of the pause, in seconds
    """
    delay = random.randint(start, stop)
    print(f"Sleeping for {delay} seconds")
    time.sleep(delay)

In [4]:
# Directory where Chrome saves downloaded files. Set the INDEED_DOWNLOAD_DIR
# environment variable to override the machine-specific default below
# without editing the notebook.
DOWNLOAD_DIR = os.environ.get("INDEED_DOWNLOAD_DIR", r"C:\Users\Nikolai\Documents\GitHub")

# headless=False: the browser window is shown while the notebook runs.
driver = WebDriver(DOWNLOAD_DIR, headless=False)

In [5]:
# Indeed UK search: "warehouse" jobs in London, Greater London.
SEARCH_URL = "https://uk.indeed.com/jobs?q=warehouse&l=London%2C+Greater+London&from=searchOnDesktopSerp&vjk=f66b98780c97cbb2"
# Upper bound, in seconds, on how long to wait for the results container.
RESULTS_TIMEOUT_SECONDS = 15

driver.get(SEARCH_URL)
# Random pause so page loads are not fired at a fixed, machine-like cadence.
sleep(5, 7)
# A fixed sleep does not guarantee the page has rendered: block until the
# results container exists so the following cells fail here, with a clear
# TimeoutException, rather than later with a missing-element error.
WebDriverWait(driver, RESULTS_TIMEOUT_SECONDS).until(
    EC.presence_of_element_located((By.ID, "mosaic-jobResults"))
)

Sleeping for 6 seconds


In [25]:
# Grab the element with id "mosaic-jobResults" (presumably the container of
# the job search results - verify against the live page).
table = driver.find_element(by=By.ID, value="mosaic-jobResults")

In [75]:
def _optional_text(element, by, locator):
    """
    Return the text of the first descendant of `element` matching the locator.

    Returns None when no such descendant exists or the element has gone stale,
    so that one incomplete card does not abort the whole scrape. Any other
    error (including KeyboardInterrupt) is allowed to propagate.
    """
    try:
        return element.find_element(by, locator).text
    except (NoSuchElementException, StaleElementReferenceException):
        return None


# Maps the pay-period word at the end of the salary text to the "pay" label.
PAY_PERIOD_LABELS = {
    "hour": "hourly",
    "day": "daily",
    "week": "weekly",
    "month": "monthly",
    "year": "yearly",
}

# --- Scrape: one record per job card that advertises a salary -------------
data = []
for card in driver.find_elements(By.XPATH, "//div[contains(@class, 'job_seen_beacon')]"):

    # Cards without any text carry no usable information.
    if not card.text:
        continue

    job_title = _optional_text(card, By.TAG_NAME, "span")
    salary = _optional_text(card, By.XPATH, ".//div[contains(@class, 'salary-snippet')]")
    # The leading "." scopes the XPath to this card. Without it the search is
    # document-wide and every card receives the first match on the page.
    job_type = _optional_text(card, By.XPATH, './/*[@data-testid="attribute_snippet_testid"]')

    salary = salary.strip() if salary else None
    if salary:
        data.append({"job_title": job_title, "salary": salary, "job_type": job_type})

# --- Post-process ----------------------------------------------------------
# Explicit columns keep the frame well-formed even when nothing was scraped.
df = pd.DataFrame(data, columns=["job_title", "salary", "job_type"])

# Classify the pay period from the trailing word ("... an hour", "... a year").
# Text that ends in none of the known period words is left as missing.
pay_period = df["salary"].str.extract(r"\b(hour|day|week|month|year)\s*$", expand=False)
df["pay"] = pay_period.map(PAY_PERIOD_LABELS)

# Drop the period suffix: "£11.44 - £12.50 an hour" -> "£11.44 - £12.50".
df["salary"] = df["salary"].str.split(" a").str[0]

# Pull every number out of the salary text regardless of its prefix
# ("From", "Up to", ...). Thousands separators are removed first.
salaries = (
    df["salary"]
    .str.replace(",", "", regex=False)
    .str.findall(r"\d+(?:\.\d+)?")
    .apply(lambda values: [float(value) for value in values])
)

# A salary text with no number in it yields NaN instead of raising.
df["min_salary"] = salaries.apply(lambda values: min(values, default=float("nan")))
df["max_salary"] = salaries.apply(lambda values: max(values, default=float("nan")))

# Midpoint of the advertised range; equals the single value when min == max.
df["salary_per_year"] = (df["min_salary"] + df["max_salary"]) / 2

# Only figures quoted per year are yearly salaries; blank out the rest.
df.loc[df["pay"] != "yearly", "salary_per_year"] = None


df.to_excel("indeed.xlsx", index=False)