# Análise dos requisitos técnicos mais importantes nas vagas

In [1]:
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd

In [2]:
# Load the complete graph from its GEXF file
G = nx.read_gexf("job_skill_graph.gexf")

In [3]:
# Collect every node's "type" attribute and list the distinct kinds present
node_types = nx.get_node_attributes(G, "type")
distinct_node_types = set(node_types.values())
print("Tipos de nós no grafo:", distinct_node_types)

Tipos de nós no grafo: {'job', 'skill'}


## Analisando o grau dos requisitos

In [4]:
# Degree of every node typed "skill", tabulated as (skill, degree) rows
skill_nodes = [node for node, kind in node_types.items() if kind == "skill"]
skill_degrees = G.degree(skill_nodes)
skill_degrees_df = pd.DataFrame(
    skill_degrees,
    columns=["skill", "jobs requiring skill"],
)

In [5]:
# Rank skills by degree (highest first) and show the 20 most central ones
degree_column = "jobs requiring skill"
skill_degrees_df = skill_degrees_df.sort_values(degree_column, ascending=False)
skill_degrees_df = skill_degrees_df.reset_index(drop=True)
print(skill_degrees_df.head(20))

         skill  jobs requiring skill
0          sql                 12783
1       python                 12267
2         java                 10891
3          aws                 10732
4        ci/cd                  9610
5        azure                  8656
6          git                  7905
7        scrum                  7528
8       docker                  6502
9   javascript                  6456
10       react                  5664
11  kubernetes                  5316
12      kanban                  5266
13    power bi                  5199
14         gcp                  4477
15       agile                  4250
16        html                  4204
17       linux                  4042
18         css                  4015
19  postgresql                  3984


In [6]:
# Save the complete degree ranking. DataFrame.to_csv does not create missing
# parent directories, so make sure the output folder exists before writing.
degrees_csv_path = Path("results/full_graph_skills_degrees.csv")
degrees_csv_path.parent.mkdir(parents=True, exist_ok=True)
skill_degrees_df.to_csv(degrees_csv_path, index=False)

In [7]:
# Store each skill's degree on its node under the "degree" attribute
skill_degrees = dict(skill_degrees)
nx.set_node_attributes(G, values=skill_degrees, name="degree")

## Analisando a centralidade de proximidade (closeness) dos requisitos

In [8]:
# Closeness is requested one node at a time (u=node) and only for skill nodes,
# which avoids computing it for the job nodes and keeps the run time down.
skill_closeness = {
    node: nx.closeness_centrality(G, u=node)
    for node in G.nodes()
    if node_types[node] == "skill"
}

closeness_rows = skill_closeness.items()
skill_closeness_df = pd.DataFrame(closeness_rows, columns=["skill", "closeness"])

In [9]:
# Rank skills by closeness (highest first) and show the top 20
closeness_column = "closeness"
skill_closeness_df = skill_closeness_df.sort_values(closeness_column, ascending=False)
skill_closeness_df = skill_closeness_df.reset_index(drop=True)
print(skill_closeness_df.head(20))

         skill  closeness
0          sql   0.392968
1       python   0.391230
2         java   0.383314
3          aws   0.382655
4        ci/cd   0.375651
5        azure   0.369932
6        scrum   0.366575
7          git   0.366120
8   javascript   0.361033
9       kanban   0.355711
10    power bi   0.354995
11       react   0.354959
12      docker   0.352053
13       agile   0.350872
14       linux   0.349090
15           c   0.349077
16        html   0.348526
17          go   0.347903
18         css   0.347184
19  kubernetes   0.347059


In [10]:
# Save the complete closeness ranking. DataFrame.to_csv does not create missing
# parent directories, so make sure the output folder exists before writing.
closeness_csv_path = Path("results/full_graph_skills_closeness.csv")
closeness_csv_path.parent.mkdir(parents=True, exist_ok=True)
skill_closeness_df.to_csv(closeness_csv_path, index=False)

In [11]:
# Store each skill's closeness on its node under the "closeness" attribute
nx.set_node_attributes(G, values=skill_closeness, name="closeness")

## Exportando a rede com os atributos novos normalizados

In [12]:
def normalize_feature(G, feature_name):
    """
    Min-max normalize a node attribute to the range [0, 1].

    Only nodes that actually carry ``feature_name`` are considered; nodes
    without the attribute are absent from the result.

    Args:
        G (networkx.Graph): The input graph.
        feature_name (str): The name of the node attribute to be normalized.
    Returns:
        dict: Node IDs mapped to their normalized value as a built-in float.
            Empty when no node has the attribute. When every node shares the
            same value (zero range), all nodes map to 0.0.
    """
    features = nx.get_node_attributes(G, feature_name)
    if not features:
        # Nothing to scale; min()/max() on an empty array would raise ValueError.
        return {}

    values = np.array(list(features.values()))
    min_val = values.min()
    value_range = values.max() - min_val
    if value_range == 0:
        # Constant attribute: avoid the 0/0 division that would produce NaN.
        return {n: 0.0 for n in features}

    # float() turns the NumPy scalar produced by the arithmetic into a built-in float.
    return {n: float((feat - min_val) / value_range) for n, feat in features.items()}

In [13]:
# Min-max normalize both metrics, then overwrite the raw node attributes with
# the normalized values so later analyses work on a common 0-1 scale
normalized_degrees, normalized_closeness = (
    normalize_feature(G, attr) for attr in ("degree", "closeness")
)

for attr, normalized in (
    ("degree", normalized_degrees),
    ("closeness", normalized_closeness),
):
    nx.set_node_attributes(G, normalized, attr)

In [14]:
# Export the graph, now carrying the new (normalized) attributes, as GEXF for visualizations
nx.write_gexf(G, "job_skill_graph_with_metrics.gexf")

## Analisando as maiores variações entre grau e closeness normalizados

In [15]:
# For every skill node, take the gap between its normalized degree and its
# normalized closeness as stored on the node
variation = {
    node: attrs["degree"] - attrs["closeness"]
    for node, attrs in G.nodes(data=True)
    if attrs["type"] == "skill"
}

variation_rows = variation.items()
variation_df = pd.DataFrame(variation_rows, columns=["skill", "degree - closeness"])

In [16]:
# Order by variation, largest first, and show the 10 largest
variation_column = "degree - closeness"
variation_df = variation_df.sort_values(variation_column, ascending=False)
variation_df = variation_df.reset_index(drop=True)
print(variation_df.head(10))

            skill  degree - closeness
0       smalltalk            0.000078
1             sql            0.000000
2          python           -0.031476
3     cisco webex           -0.044138
4          zbrush           -0.066926
5  autodesk revit           -0.067893
6          lumion           -0.069563
7          pvsyst           -0.074661
8          moveit           -0.077918
9  google android           -0.084282


In [17]:
# Re-rank ascending and show the 10 smallest variations. The frame is already
# in ascending order after this sort, so head(10) needs no second sort.
variation_df = variation_df.sort_values(
    by="degree - closeness", ascending=True
).reset_index(drop=True)
print(variation_df.head(10))

             skill  degree - closeness
0         electron           -0.652799
1           centos           -0.643081
2  microsoft excel           -0.641462
3            slack           -0.637959
4             zoom           -0.637702
5              sas           -0.634947
6           trello           -0.634110
7             miro           -0.633690
8             room           -0.633612
9            unity           -0.632653


In [18]:
# Summarize the variation column: mean and standard deviation
variation_values = variation_df["degree - closeness"]
variation_mean = variation_values.mean()
variation_std = variation_values.std()
print(f"Variação média: {variation_mean}")
print(f"Desvio padrão: {variation_std}")

Variação média: -0.46902348208533917
Desvio padrão: 0.12099175714810649
