# Coleta de vagas do Linkedin

In [1]:
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

**Funções auxiliares:**

In [2]:
def fetch_page(url, verbose=False):
    '''
    Fetches a web page with retries and exponential backoff.
    Args:
        url (str): The URL of the web page to fetch.
        verbose (bool): If True, prints status messages.
    Returns:
        str: The content of the web page, or None if failed.
    '''
    wait_time = 1

    while True:
        if wait_time > 60:
            if verbose:
                print(f"Failed to fetch {url} after multiple attempts.")
            return None

        try:
            response = requests.get(url)
            # Raise an error for bad responses
            response.raise_for_status()
            if verbose:
                print(f"Successfully fetched {url}")
            return response.text
        except requests.exceptions.RequestException as e:
            if verbose:
                print(f"Error fetching {url}: {e}. Retrying in {wait_time} seconds...")

            time.sleep(wait_time)
            # Exponential backoff with a max wait time
            wait_time *= 2

In [3]:
def fetch_job_details(job_id, is_remote, verbose=False):
    '''
    Fetches job details from LinkedIn job posting.
    Args:
        job_id (str): The ID of the job to fetch.
        is_remote (bool): Whether the job is remote.
        verbose (bool): If True, prints status messages.
    Returns:
        dict: A dictionary containing job details, or None if failed.
    '''
    job_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}"

    job_data = fetch_page(job_url, verbose)
    # If this job could not be fetched, skip it
    if not job_data:
        return None

    # The response is a HTML page, parse it
    soup = BeautifulSoup(job_data, 'html.parser')

    # Information about the job
    job_name = soup.select_one('.top-card-layout__entity-info a')
    job_location = soup.select_one('.topcard__flavor-row .topcard__flavor--bullet')
    job_description = soup.select_one('.description__text')
    job_remote = is_remote

    # If anything is missing, skip this job
    if not job_name or not job_location or not job_description:
        if verbose:
            print(f"Skipping job {job_id} due to missing information.")
        return None

    # Create a dictionary to hold job details
    job_details = {
        'id': job_id,
        'name': job_name.get_text(strip=True),
        'location': job_location.get_text(strip=True),
        'description': job_description.get_text(strip=True),
        'remote': job_remote,
    }

    return job_details

In [4]:
job_location = "Brazil"
maximum_time = "r86400" # consider only jobs posted in the last 24 hours

def fetch_jobs_with_keywords(keywords, verbose=False):
    '''
    Fetches job details from LinkedIn using a certain keyword. Considers both remote and non-remote jobs.
    Args:
        keywords (str): The keywords to search for.
        verbose (bool): If True, prints status messages.
    Returns:
        list: A list of dictionaries containing job details, or an empty list if nothing was found.
    '''
    all_jobs = []

    base_search_url = "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search"

    keywords = keywords.replace(" ", "%20")

    # Fetch both remote and non-remote jobs
    for is_remote in [False, True]:
        if verbose:
            print(f"Fetching {'remote' if is_remote else 'non-remote'} jobs for keywords: {keywords}")

        # Add keyword, location, time parameter and remote filter
        location = job_location
        time = maximum_time
        remote = "2" if is_remote else "1%2C3"

        search_url = f"{base_search_url}?keywords={keywords}&location={location}&f_TPR={time}&f_WT={remote}&start={{}}"

        pagination_index = 0
        while True:
            jobs = fetch_page(search_url.format(pagination_index), verbose)

            # If there was an error fetching the page, stop the search
            if not jobs:
                break

            # Parse the job listings
            soup = BeautifulSoup(jobs, 'html.parser')

            # Check if there are no jobs found
            total_jobs = soup.find_all("li")
            if not total_jobs:
                if verbose:
                    print(f"No more jobs found on page {pagination_index}.")
                break

            # Extract job details for each job
            for job in total_jobs:
                job_id = job.find("div", {"class": "base-card"}).get('data-entity-urn').split(":")[3] if job.find("div", {"class": "base-card"}) else None

                if job_id is None:
                    # No more jobs to process
                    break

                # Only process if previous searches (with different keywords) didn't fetch this job
                job_information = fetch_job_details(job_id, is_remote, verbose)

                # Add job information if it was successfully fetched
                if job_information:
                    all_jobs.append(job_information)

            # Move to the next page
            pagination_index += len(total_jobs)

    if verbose:
        print(f"Total jobs fetched: {len(all_jobs)}")

    return all_jobs

**Palavras-chave para as buscas:**

In [5]:
# Search keywords
keywords = [
    'Desenvolvedor',
    'Programador',
    'Software',
    'Hardware',
    'IA',
    'Inteligência Artificial',
    'Machine Learning',
    'Data Science',
    'Engenheiro de Software',
    'Engenheiro de Dados',
    'Desenvolvimento Web',
    'Backend',
    'Frontend',
    'Full Stack',
    'Cloud',
    'DevOps',
    'Big Data',
    'QA',
    'UX',
    'UI',
    'CI',
    'CD',
    'Android',
    'iOS',
    'Mobile',
    'TI',
    'Cibersegurança',
    'Redes',
    'Robótica',
    'Jogos',
]

**Coleta de dados usando as palavras-chave definidas:**

In [6]:
verbose = False

# Fetch jobs for each keyword and save everything to a CSV file
all_jobs_data = []
for keyword in keywords:
    jobs = fetch_jobs_with_keywords(keyword, verbose)

    if jobs:
        all_jobs_data.extend(jobs)

# Convert to a DataFrame, remove possible duplicates, and save as a CSV
if all_jobs_data:
    df = pd.DataFrame(all_jobs_data)
    # Due to how Linkedin searches work, some jobs may appear multiple times if they match multiple keywords
    df = df.drop_duplicates()

    csv_filename = f"linkedinjobs_{datetime.today().strftime('%Y-%m-%d')}.csv"
    df.to_csv(csv_filename, index=False)

    print(f"Saved {len(all_jobs_data)} jobs")

Saved 3290 jobs
