# Construção do grafo

In [1]:
import ast
import random
import re
import uuid
from datetime import datetime

import networkx as nx
import numpy as np
import pandas as pd
import rapidfuzz

In [2]:
# Load the scraped job postings and the list of technical skills from their CSV files
jobs_csv_path, skills_csv_path = 'merged_data.csv', 'technical_skills_final.csv'
scraped_jobs = pd.read_csv(jobs_csv_path)
technical_skills = pd.read_csv(skills_csv_path)

In [3]:
# Assign a freshly generated UUID (stored as a string) to every job, replacing any existing id.
# A single column assignment avoids the row-by-row loop and does not rely on the frame
# having a default 0..n-1 index or on an 'id' column already being present.
scraped_jobs['id'] = [str(uuid.uuid4()) for _ in range(len(scraped_jobs))]

## Pré-processamento das localizações

In [4]:
# Lookup table from place names to Brazilian regions.
# Entry order matters: map_location returns the region of the FIRST key found inside the text.
location_to_region = {
    # States to regions mapping
    'Acre': 'North',
    'Alagoas': 'Northeast',
    'Amapá': 'North',
    'Amazonas': 'North',
    'Bahia': 'Northeast',
    'Ceará': 'Northeast',
    'Distrito Federal': 'Central-West',
    'Espírito Santo': 'Southeast',
    'Goiás': 'Central-West',
    'Maranhão': 'Northeast',
    'Mato Grosso': 'Central-West',
    'Mato Grosso do Sul': 'Central-West',
    'Minas Gerais': 'Southeast',
    'Pará': 'North',
    'Paraíba': 'Northeast',
    'Paraná': 'South',
    'Pernambuco': 'Northeast',
    'Piauí': 'Northeast',
    'Rio de Janeiro': 'Southeast',
    'Rio Grande do Norte': 'Northeast',
    'Rio Grande do Sul': 'South',
    'Rondônia': 'North',
    'Roraima': 'North',
    'Santa Catarina': 'South',
    'São Paulo': 'Southeast',
    'Sergipe': 'Northeast',
    'Tocantins': 'North',

    # Other locations used in the dataset
    'Federal District': 'Central-West',
    'Belo Horizonte': 'Southeast',
    'Porto Alegre': 'South',
    'Curitiba': 'South',
    'Campinas': 'Southeast',
    'Ribeirão Preto': 'Southeast',
    'Natal': 'Northeast',
    'Recife': 'Northeast',
    'Vitoria': 'Southeast',
    'Londrina': 'South',
    'Goiania': 'Central-West',
    'Brasilia': 'Central-West',
    'Salvador': 'Northeast',
    'Florianopolis': 'South',
    'Fortaleza': 'Northeast',
    'Belem': 'North',
    'Manaus': 'North',
    'João Pessoa': 'Northeast',
    'Cuiaba': 'Central-West',
}

# Remap locations to the region they belong to
def map_location(location):
    '''
    Remaps a location to its corresponding region in Brazil.

    The match is a case-sensitive substring search: the first key of
    location_to_region that occurs anywhere inside the location text wins.
    Args:
        location (str): The location string to be remapped. Non-string values
            (e.g. None or the NaN pandas uses for missing cells) are accepted.
    Returns:
        str: The region corresponding to the location, or the original value if no
            known place name is found or the value is not a string.
    '''
    # Missing cells are not strings and would make the `in` test below raise TypeError,
    # so they are handed back untouched.
    if not isinstance(location, str):
        return location

    for place, region in location_to_region.items():
        if place in location:
            return region
    return location

# Replace each job's raw location text with the region it maps to
remapped_locations = scraped_jobs['location'].apply(map_location)
scraped_jobs['location'] = remapped_locations

In [5]:
# Collect the distinct location values separately for non-remote and remote jobs
on_site_locations = scraped_jobs.loc[scraped_jobs['remote'] == False, 'location'].unique()
remote_locations = scraped_jobs.loc[scraped_jobs['remote'] == True, 'location'].unique()

# Show both lists so any location that was not remapped to a region stands out
print(f"Regions for non-remote jobs: {on_site_locations}")
print(f"Regions for remote jobs: {remote_locations}")

Regions for non-remote jobs: ['Southeast' 'South' 'Northeast' 'Central-West' 'North']
Regions for remote jobs: ['Brazil' 'Southeast' 'South' 'Northeast' 'Central-West' 'North'
 'Latin America']


Todas as vagas não remotas tiveram sua localização identificada. Para vagas remotas, faz sentido manter "Brasil" se a localização não houver sido especificada.

In [6]:
# Fold the "Latin America" label into "Brazil" so the location column stays consistent
location_fixes = {'Latin America': 'Brazil'}
scraped_jobs['location'] = scraped_jobs['location'].replace(location_fixes)

## Identificação das habilidades técnicas a partir do título e da descrição

In [7]:
def get_token_splitter(skill):
    '''
    Builds the compiled regex used to split text into tokens while keeping the symbols that belong to the given skill inside the tokens.
    Args:
        skill (str): The skill string to determine which characters to preserve.
    Returns:
        re.Pattern: A compiled pattern matching runs of separator characters, i.e. characters
            that are neither word characters (letters, digits, underscore), nor '$', '#', '+',
            nor any non-letter character that appears in the skill.
    '''
    # Every character of the skill outside a-z / A-Z must survive tokenization.
    # Sorting the distinct characters makes the pattern text identical on every run,
    # instead of depending on set iteration order.
    preserved_chars = sorted(set(re.findall(r"[^a-zA-Z]", skill)))
    escaped_non_letters = "".join(re.escape(char) for char in preserved_chars)

    # Keeping characters such as '+' inside tokens prevents 'C++' from being split into
    #    'C' and '', which could lead to false positives when matching the skill 'C'
    return re.compile(fr"[^\w\$\#\+{escaped_non_letters}]+")

In [8]:
def remove_numbers(string):
    '''
    Strips every run of digits out of a string.
    Args:
        string (str): The text to clean.
    Returns:
        str: A copy of the text with all digit characters deleted; everything else is kept as is.
    '''
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', string)

In [9]:
def extract_skills(job_title, job_description):
    '''
    Extracts technical skills from job title and description. The skills are matched using exact token matching for skills with less than 6 characters, and fuzzy matching (rapidfuzz partial_ratio) for skills with 6 or more characters.
    Args:
        job_title (str): The job title. Non-string values (e.g. NaN for a missing cell) are treated as empty text.
        job_description (str): The job description. Non-string values are treated as empty text.
    Returns:
        list: The lower-cased technical skills found, sorted alphabetically; empty if none were found.
    '''
    # Missing cells reach this function as NaN (a float), which the regex helpers cannot
    # process; treat anything that is not a string as empty text.
    job_title = job_title if isinstance(job_title, str) else ''
    job_description = job_description if isinstance(job_description, str) else ''

    # Remove numbers (phone numbers, years of experience, framework versions, etc.) from title and description
    title = remove_numbers(job_title).lower()
    description = remove_numbers(job_description).lower()

    found_technical_skills = set()

    # Token sets already computed for this job, keyed by splitter pattern text.
    # Many short skills share the same splitter, so the text is only split once per pattern.
    tokens_by_pattern = {}

    for skill in technical_skills['skill']:
        # Skip missing skill entries (NaN) instead of failing on .lower()
        if not isinstance(skill, str):
            continue
        skill = skill.lower()

        # If the skill contains less than 6 characters, use exact match
        if len(skill) < 6:
            # Split title and description by non-alphanumeric characters not present in the skill
            token_splitter = get_token_splitter(skill)
            tokens = tokens_by_pattern.get(token_splitter.pattern)
            if tokens is None:
                tokens = set(token_splitter.split(title)) | set(token_splitter.split(description))
                tokens_by_pattern[token_splitter.pattern] = tokens

            if skill in tokens:
                found_technical_skills.add(skill)
        else:
            # Fuzzy matching for skills with 6 or more characters.
            # The score must be strictly above 100 * n / (n + 1), truncated, where n = len(skill).
            # NOTE(review): this threshold was described as allowing "at most 1 insertion/deletion";
            #    confirm against rapidfuzz's partial_ratio scoring that such near-matches actually
            #    score above it, or whether only exact substring hits pass in practice.
            minimum_match_score = int(100 * len(skill) / (len(skill) + 1))

            # Don't consider matches if the skill is longer than the text being searched
            if len(skill) <= len(description) and rapidfuzz.fuzz.partial_ratio(skill, description) > minimum_match_score:
                found_technical_skills.add(skill)
            elif len(skill) <= len(title) and rapidfuzz.fuzz.partial_ratio(skill, title) > minimum_match_score:
                found_technical_skills.add(skill)

    # Sorted so the same input always produces the same list (set order is not stable across runs)
    return sorted(found_technical_skills)

In [10]:
# Set to False to skip the (slow) skill extraction step
map_skills = True

if map_skills:
    # Create a new column 'found_skills' to store the extracted skills before the graph is constructed.
    # Every row starts with its own empty list.
    scraped_jobs['found_skills'] = [[] for _ in range(len(scraped_jobs))]

    total_jobs = len(scraped_jobs)
    job_rows = zip(scraped_jobs.index, scraped_jobs['name'], scraped_jobs['description'])

    try:
        for position, (label, job_name, job_description) in enumerate(job_rows):
            # Write by index label (not by position) so reads and writes always hit the same row,
            # even when the frame does not have a default 0..n-1 index
            scraped_jobs.at[label, 'found_skills'] = extract_skills(job_name, job_description)

            if (position % 100) == 0:
                # Progress is shown as a percentage, consistent with the final "100%" message
                print(f"\rProcessed entries up to {position}, total done {position / total_jobs:.1%}".ljust(80), end='')

        print("\rProcessed all entries, total done 100%".ljust(80))
    except KeyboardInterrupt:
        # Deliberate best-effort: an interrupted run still saves what was processed so far
        print("\nProcess interrupted.".ljust(80))

    # Save the dataframe with the new column to a new CSV file
    scraped_jobs.to_csv('scraped_jobs_with_skills.csv', index=False)

Processed all entries, total done 100%                                         


In [11]:
# Save up to 50 random rows of the dataframe to a new CSV file for manual verification.
# The seed comes from the current time, so each run draws a different sample.
# It has to be passed to pandas as an integer: random.seed() returns None, so its
# result cannot serve as random_state. The modulo keeps the seed inside the 32-bit
# range accepted for integer seeds.
sample_seed = int(datetime.now().timestamp()) % (2 ** 32)
# Never ask for more rows than exist, which would make sample() raise
sample_size = min(50, len(scraped_jobs))
scraped_jobs.sample(n=sample_size, random_state=sample_seed).to_csv('scraped_jobs_with_skills_sample.csv', index=False)

In [12]:
# Keep only the jobs for which at least one skill was found.
# Related keywords were used while scraping, yet some of the jobs returned
#    by the API have nothing to do with technology and end up with no skills.
has_skills = scraped_jobs['found_skills'].map(len).gt(0)
scraped_jobs = scraped_jobs[has_skills]

## Construção do Grafo

In [13]:
# Set to True to rebuild the graph from the saved CSV instead of the in-memory dataframe
read_from_csv = False

if read_from_csv:
    scraped_jobs = pd.read_csv('scraped_jobs_with_skills.csv')
    # Lists are stored in the CSV as their text representation; parse them back into lists
    scraped_jobs['found_skills'] = scraped_jobs['found_skills'].apply(ast.literal_eval)
    # The CSV is written before jobs without skills are dropped, so drop them again here;
    # otherwise this path would add skill-less jobs to the graph as isolated nodes
    scraped_jobs = scraped_jobs[scraped_jobs['found_skills'].map(len) > 0]

# Undirected graph with two kinds of nodes: jobs (keyed by id) and skills (keyed by name)
G = nx.Graph()

# Connect every job to each of the skills found for it
for _, row in scraped_jobs.iterrows():
    # Create a new node for this job
    G.add_node(row['id'], type="job", location=row['location'], remote=row['remote'])
    for skill in row['found_skills']:
        # Create a new node for the skill if it doesn't exist
        if not G.has_node(skill):
            G.add_node(skill, type="skill")
        # Create an edge between the job and the skill
        G.add_edge(row['id'], skill)

# Save as GEXF
nx.write_gexf(G, 'job_skill_graph.gexf')