##### Importações:

In [1]:
import logging
from selenium.webdriver.remote.remote_connection import LOGGER
from selenium.common.exceptions import *
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from parsel import Selector
from time import sleep
import random
import pandas as pd

##### Funções

Função que inicia o driver

In [3]:
def start_driver(headless=False, timeout=10, poll_frequency=1):
    """Start a Chrome WebDriver and build the explicit wait used with it.

    Args:
        headless: When True, ``--headless`` is added to the Chrome arguments.
            Defaults to False, which keeps the original visible-window setup.
        timeout: Timeout, in seconds, passed to ``WebDriverWait``.
        poll_frequency: Polling interval, in seconds, passed to
            ``WebDriverWait``.

    Returns:
        A ``(driver, wait)`` tuple: the Chrome driver and a ``WebDriverWait``
        bound to it.
    """
    # Raise the level of Selenium's remote-connection logger to WARNING.
    LOGGER.setLevel(logging.WARNING)

    options = Options()
    arguments = ['--lang=pt-BR', '--start-maximized', '--disable-notifications']
    if headless:
        arguments.append('--headless')
    for argument in arguments:
        options.add_argument(argument)

    driver = webdriver.Chrome(options=options)

    # These exceptions are ignored while a condition is being polled, so an
    # element that is not there yet does not abort the wait.
    wait = WebDriverWait(
        driver,
        timeout,
        poll_frequency=poll_frequency,
        ignored_exceptions=[
            NoSuchElementException,
            ElementNotVisibleException,
            ElementNotSelectableException
        ]
    )
    return driver, wait

Função para digitar naturalmente

In [4]:
def natural_type(text, field):
    """Send ``text`` to ``field`` one character at a time, pausing in between.

    Each character is passed to ``field.send_keys`` and is followed by a
    pause of ``random.randint(1, 5) / 30`` seconds.

    Args:
        text: Iterable of characters to type.
        field: Object exposing ``send_keys`` (e.g. a located input element).
    """
    for char in text:
        field.send_keys(char)
        pause_seconds = random.randint(1, 5) / 30
        sleep(pause_seconds)


Função para digitar naturalmente, com intervalos aleatórios entre as teclas (repete a definição da célula anterior):

In [5]:
def _random_interval():
    """Return a pause length in seconds, computed as ``random.randint(1, 5) / 30``."""
    return random.randint(1, 5) / 30


def natural_type(text, field):
    """Type ``text`` into ``field`` character by character with random pauses.

    After each ``field.send_keys`` call the function sleeps for the value
    returned by ``_random_interval()``.

    NOTE(review): the preceding notebook cell defines ``natural_type`` with
    the same behaviour; this later definition is the one that stays bound
    after a top-to-bottom run, so one of the two cells can be removed.

    Args:
        text: Iterable of characters to type.
        field: Object exposing ``send_keys``.
    """
    for letter in text:
        field.send_keys(letter)
        sleep(_random_interval())


Função para abrir a página

In [6]:
def get_page(site, driver, wait):
    """Navigate ``driver`` to ``site``.

    Args:
        site: URL handed to ``driver.get``.
        driver: Object exposing ``get`` (the WebDriver).
        wait: Accepted for signature parity with the other helpers; unused.

    Returns:
        Whatever ``driver.get(site)`` returns.
    """
    navigation_result = driver.get(site)
    return navigation_result

Função para pesquisar as vagas

In [9]:
def search(driver, wait, text):
    """Fill in the job-search form and wait for the result cards to show up.

    Args:
        driver: WebDriver currently on a page containing the
            ``text-input-what`` and ``text-input-where`` inputs. Kept for
            signature compatibility; the elements are obtained through
            ``wait``.
        wait: ``WebDriverWait`` used for every element lookup.
        text: Indexable pair; ``text[0]`` is typed into the "what" field and
            ``text[1]`` into the "where" field.
    """
    job_locator = (By.XPATH, "//input[@id='text-input-what']")
    location_locator = (By.XPATH, "//input[@id='text-input-where']")
    # NOTE(review): this class string looks like it contains generated CSS
    # hashes and may stop matching when the site is redeployed - confirm.
    results_locator = (By.XPATH, "//td[@class='resultContent css-1qwrrf0 eu4oa1w0']")

    # A single clickable-wait replaces the earlier visibility wait plus a
    # separate find_element; the wait hands back the element it found.
    job_field = wait.until(
        expected_conditions.element_to_be_clickable(job_locator)
    )
    job_field.click()
    natural_type(text[0], job_field)
    sleep(1.2)

    # Wait for the location input as well instead of looking it up
    # immediately, so a field that is not ready yet does not fail at once.
    location_field = wait.until(
        expected_conditions.element_to_be_clickable(location_locator)
    )
    location_field.click()
    sleep(1.8)
    natural_type(text[1], location_field)
    location_field.send_keys(Keys.ENTER)

    # Block until the result cards of the submitted search are visible.
    wait.until(
        expected_conditions.visibility_of_all_elements_located(results_locator)
    )


Função para scraping

In [10]:
def scrape(driver, wait, base_url):
    """Yield one record per job card found in the driver's current page.

    The page source is parsed once with ``Selector`` and every
    ``td`` result cell produces a dict.

    Args:
        driver: Object exposing ``page_source`` (the WebDriver).
        wait: Accepted for signature parity with the other helpers; unused.
        base_url: Prefix prepended to each card's ``href``.

    Yields:
        dict with the keys ``"Cargo"``, ``"Empresa"``, ``"Local"`` and
        ``"Link"``. A field whose XPath matches nothing is ``None``;
        ``"Link"`` is ``None`` when the card has no ``href``.
    """
    response = Selector(text=driver.page_source)
    for vaga in response.xpath("//td[@class='resultContent css-1qwrrf0 eu4oa1w0']"):
        # .get() yields None when the XPath matches nothing; concatenating
        # that with base_url would raise TypeError and abort the whole run.
        href = vaga.xpath("./div/h2/a/@href").get()
        yield {
            "Cargo" : vaga.xpath("./div/h2/a/span/text()").get(),
            "Empresa" : vaga.xpath("./div[@class='company_location css-17fky0v e37uo190']//span[@data-testid='company-name']/text()").get(),
            "Local" : vaga.xpath("./div[@class='company_location css-17fky0v e37uo190']/div/div[@data-testid='text-location']/text()").get(),
            "Link" : base_url + href if href is not None else None
        }

##### Função para ir para a **próxima página**

In [11]:
def next_page(driver, wait):
    """Click the "Next Page" link and wait for a result cell to be clickable.

    The link is located by its ``aria-label``, scrolled into view through
    JavaScript and clicked; the function then blocks on ``wait`` until the
    result-cell locator is clickable.

    Args:
        driver: WebDriver showing a results page.
        wait: ``WebDriverWait`` bound to ``driver``.
    """
    next_link_locator = (By.XPATH, "//a[@aria-label='Next Page']")
    results_locator = (By.XPATH, "//td[@class='resultContent css-1qwrrf0 eu4oa1w0']")

    next_link = driver.find_element(*next_link_locator)
    driver.execute_script("arguments[0].scrollIntoView();", next_link)
    next_link.click()

    results_clickable = expected_conditions.element_to_be_clickable(results_locator)
    wait.until(results_clickable)

##### Função main

In [12]:
def _advance_to_next_page(driver, wait, max_click_retries=3):
    """Move to the next results page; return True on success, False to stop."""
    for _ in range(max_click_retries + 1):
        try:
            next_page(driver, wait)
            return True
        except ElementClickInterceptedException:
            # Something covered the link. Reload and retry only the click, so
            # the page that was already scraped is not collected a second time.
            driver.refresh()
        except NoSuchElementException:
            # No "Next Page" link on this page: treat it as the last one.
            return False
        except Exception as exc:
            # Keep the best-effort behaviour (stop and save what was
            # collected) but leave a trace instead of hiding the error.
            logging.warning("Stopping pagination after unexpected error: %r", exc)
            return False
    logging.warning(
        "Next-page link still blocked after %d retries; stopping.",
        max_click_retries,
    )
    return False


def main():
    """Scrape every results page of the configured search into ``vagas.json``.

    Starts the driver, opens the search URL, collects the records of each
    page and follows the pagination until it cannot advance any further.
    The browser is always closed, and the collected records are then written
    as a JSON array of objects.
    """
    base_url = "https://br.indeed.com"
    url = "https://br.indeed.com/jobs?q=python&l=Belo+Horizonte%2C+MG&vjk=f0f0b60f99b7badf"
    data = []
    driver, wait = start_driver()
    try:
        # The URL already carries the query and the location, so the search
        # form helper is not used here.
        get_page(url, driver, wait)
        while True:
            data.extend(scrape(driver, wait, base_url))
            if not _advance_to_next_page(driver, wait):
                break
    finally:
        # Close the browser even when scraping fails half-way.
        driver.quit()

    # orient='records' never writes the index, so no index argument is
    # needed; omitting it also avoids a ValueError on older pandas releases.
    df = pd.DataFrame(data)
    df.to_json("vagas.json", orient='records')


##### Executando...

In [13]:
# Run the scraper: this starts Chrome and writes the results to vagas.json.
main()

Verificando o JSON:

In [17]:
# Load the file written by main() back into a DataFrame to check it.
resultado = pd.read_json("vagas.json")

In [18]:
# Show the loaded DataFrame as the cell output.
resultado

Unnamed: 0,Cargo,Empresa,Local,Link
0,Desenvolvedor Backend Júnior Python / Pesquisa...,BairesDev,"Home Office in Belo Horizonte, MG",https://br.indeed.com/rc/clk?jk=f0f0b60f99b7ba...
1,Desenvolvedora/Engenheira Python - Trabalho Re...,BairesDev,"Home Office in Belo Horizonte, MG",https://br.indeed.com/rc/clk?jk=064fa074d318a2...
2,Desenvolvedor Junior,ANA LEITE CONSULTORIA E GESTAO LTDA,"Home Office in Belo Horizonte, MG",https://br.indeed.com/rc/clk?jk=79f5acab2b8ce2...
3,Desenvolvedor Python,Grupo LPJ,"Home Office in Belo Horizonte, MG",https://br.indeed.com/rc/clk?jk=673d971751ec4b...
4,Analista de Business Intelligence,GRUPO BARCELOS,"Belo Horizonte, MG",https://br.indeed.com/rc/clk?jk=e59beb315b119c...
...,...,...,...,...
287,Senior Mine Planning Optimization Consultant-C...,KPI Digital,"Home Office in Belo Horizonte, MG",https://br.indeed.com/rc/clk?jk=f206170d694262...
288,Software Engineer for AI Training Data (Python),G2i Inc.,Home Office in Minas Gerais,https://br.indeed.com/rc/clk?jk=5ae813ab021e86...
289,DevOps Engineer Sênior,Grupo Fácil,"Belo Horizonte, MG",https://br.indeed.com/rc/clk?jk=6ca2f3ec790ea7...
290,Engenheiro de dados sênior (aws) - vaga tempor...,Netvagas,"Belo Horizonte, MG",https://br.indeed.com/rc/clk?jk=f95b30c0128e74...
