In [1]:
import pandas as pd
import numpy as np

def transformar_colunas_em_listas(df: pd.DataFrame, colunas: list[str]) -> pd.DataFrame:
    """ Transforma as colunas que tem string separadas por v√≠rgula em listas 

        Exemplo:
            skills = 'Python,Machine Learning,Data Science'
            transformar_colunas_em_listas(df, ['skills'])
    
            skills = ['Python', 'Machine Learning', 'Data Science']

        Args:
            df (pd.DataFrame): DataFrame com as colunas
            colunas (list[str]): Lista de colunas a transformar

        Returns:
            pd.DataFrame: DataFrame com as colunas transformadas
    """
    for coluna in colunas:
        df[coluna] = df[coluna].apply(lambda x: x.split(',') if isinstance(x, str) else [])
    return df
    

colunas = ['skills', 'technical_knowledge', 'technologies']
job_results = pd.read_csv('job_analysis_results.csv')
job_results = transformar_colunas_em_listas(job_results, colunas)

job_results.head()


Unnamed: 0,titulo,empresa,localizacao,skills,technologies,technical_knowledge
0,"Intern, Data Scientist",Mastercard,"Lisbon, Lisbon, Portugal",[],[],"[Data Analysis, Machine Learning Modeling, S..."
1,Python AI Developer,MOZANTECH,Lisbon Metropolitan Area,[Python],[],[Generative AI]
2,"AI Engineer (Remote, Contract)",INFUSE,Portugal,"[Python, SQL, NoSQL, Git, Object-Oriented ...","[Flask, Quart, asyncio, FastAPI, pandas, ...","[Machine Learning Modeling, Model Training, ..."
3,Python Engineer (Remote),Veeva Systems,Portugal,"[Python, SQL, JavaScript, HTML, CSS]","[Django, MySQL, PostgreSQL, GraphQL, React...","[Software Engineering, API Development, Syst..."
4,AI Enginner,Affinity,Lisbon Metropolitan Area,[Python],[],"[Machine Learning Modeling, Large Language Mo..."


# üìä AN√ÅLISE EXPLORAT√ìRIA AVAN√áADA E DASHBOARD INTERATIVO

---

## üîç Parte 1: An√°lise Explorat√≥ria de Dados (EDA)

In [2]:
# An√°lise Explorat√≥ria Detalhada do Dataset

print("="*80)
print("üîç AN√ÅLISE EXPLORAT√ìRIA DE DADOS")
print("="*80)

# 1. Informa√ß√µes b√°sicas
print("\nüìã INFORMA√á√ïES B√ÅSICAS DO DATASET:")
print(f"   ‚Ä¢ Total de vagas: {len(job_results)}")
print(f"   ‚Ä¢ Colunas: {list(job_results.columns)}")
print(f"   ‚Ä¢ Tipos de dados:\n{job_results.dtypes}")

# 2. An√°lise de completude dos dados
print("\nüìä COMPLETUDE DOS DADOS:")
missing_data = job_results.isnull().sum()
for col in job_results.columns:
    total = len(job_results)
    missing = missing_data[col]
    pct = (missing / total) * 100
    filled = total - missing
    print(f"   ‚Ä¢ {col}: {filled}/{total} preenchidos ({100-pct:.1f}%)")

# 3. An√°lise de distribui√ß√£o por empresa
print("\nüè¢ TOP 10 EMPRESAS COM MAIS VAGAS:")
empresa_counts = job_results['empresa'].value_counts().head(10)
for i, (empresa, count) in enumerate(empresa_counts.items(), 1):
    print(f"   {i:2}. {empresa}: {count} vagas")

# 4. An√°lise de localiza√ß√£o
print("\nüìç TOP 10 LOCALIZA√á√ïES:")
loc_counts = job_results['localizacao'].value_counts().head(10)
for i, (loc, count) in enumerate(loc_counts.items(), 1):
    print(f"   {i:2}. {loc}: {count} vagas")

# 5. An√°lise de complexidade das vagas
print("\nüéØ AN√ÅLISE DE COMPLEXIDADE DAS VAGAS:")
job_results_copy = job_results.copy()
job_results_copy['num_skills'] = job_results_copy['skills'].apply(lambda x: len(x) if isinstance(x, list) else 0)
job_results_copy['num_technologies'] = job_results_copy['technologies'].apply(lambda x: len(x) if isinstance(x, list) else 0)
job_results_copy['num_knowledge'] = job_results_copy['technical_knowledge'].apply(lambda x: len(x) if isinstance(x, list) else 0)
job_results_copy['total_requirements'] = job_results_copy['num_skills'] + job_results_copy['num_technologies'] + job_results_copy['num_knowledge']

print(f"   ‚Ä¢ M√©dia de skills por vaga: {job_results_copy['num_skills'].mean():.2f}")
print(f"   ‚Ä¢ M√©dia de tecnologias por vaga: {job_results_copy['num_technologies'].mean():.2f}")
print(f"   ‚Ä¢ M√©dia de conhecimentos t√©cnicos por vaga: {job_results_copy['num_knowledge'].mean():.2f}")
print(f"   ‚Ä¢ M√©dia total de requisitos por vaga: {job_results_copy['total_requirements'].mean():.2f}")

print(f"\n   ‚Ä¢ Vaga com MAIS requisitos: {job_results_copy['total_requirements'].max()} requisitos")
print(f"   ‚Ä¢ Vaga com MENOS requisitos: {job_results_copy['total_requirements'].min()} requisitos")

# Guardar para an√°lises posteriores
job_results_extended = job_results_copy

üîç AN√ÅLISE EXPLORAT√ìRIA DE DADOS

üìã INFORMA√á√ïES B√ÅSICAS DO DATASET:
   ‚Ä¢ Total de vagas: 83
   ‚Ä¢ Colunas: ['titulo', 'empresa', 'localizacao', 'skills', 'technologies', 'technical_knowledge']
   ‚Ä¢ Tipos de dados:
titulo                 object
empresa                object
localizacao            object
skills                 object
technologies           object
technical_knowledge    object
dtype: object

üìä COMPLETUDE DOS DADOS:
   ‚Ä¢ titulo: 83/83 preenchidos (100.0%)
   ‚Ä¢ empresa: 83/83 preenchidos (100.0%)
   ‚Ä¢ localizacao: 83/83 preenchidos (100.0%)
   ‚Ä¢ skills: 83/83 preenchidos (100.0%)
   ‚Ä¢ technologies: 83/83 preenchidos (100.0%)
   ‚Ä¢ technical_knowledge: 83/83 preenchidos (100.0%)

üè¢ TOP 10 EMPRESAS COM MAIS VAGAS:
    1. SplitMetrics: 6 vagas
    2. Planner 5D: 4 vagas
    3. Welocalize: 4 vagas
    4. NielsenIQ: 3 vagas
    5. Lognext: 2 vagas
    6. N-iX: 2 vagas
    7. Nunegal Consulting: 2 vagas
    8. TechDelivery: 2 vagas
    9. TransPerf

In [3]:
# An√°lise de Cruzamento: Skills + Technical Knowledge
from collections import defaultdict
import pandas as pd

print("="*80)
print("üîó AN√ÅLISE DE CRUZAMENTO: SKILLS √ó CONHECIMENTO T√âCNICO")
print("="*80)

# Criar matriz de coocorr√™ncia entre skills e conhecimentos t√©cnicos
skill_knowledge_matrix = defaultdict(lambda: defaultdict(int))

for idx, row in job_results.iterrows():
    skills = row['skills'] if isinstance(row['skills'], list) else []
    knowledge = row['technical_knowledge'] if isinstance(row['technical_knowledge'], list) else []
    
    for skill in skills:
        for know in knowledge:
            skill_knowledge_matrix[skill][know] += 1

# Top combina√ß√µes Python + Conhecimento T√©cnico
print("\nüêç TOP 10: PYTHON + CONHECIMENTO T√âCNICO")
if 'Python' in skill_knowledge_matrix:
    python_knowledge = sorted(skill_knowledge_matrix['Python'].items(), key=lambda x: x[1], reverse=True)[:10]
    for i, (know, count) in enumerate(python_knowledge, 1):
        print(f"   {i:2}. Python + {know}: {count} vagas")

# Top combina√ß√µes SQL + Conhecimento T√©cnico
print("\nüóÑÔ∏è  TOP 10: SQL + CONHECIMENTO T√âCNICO")
if 'SQL' in skill_knowledge_matrix:
    sql_knowledge = sorted(skill_knowledge_matrix['SQL'].items(), key=lambda x: x[1], reverse=True)[:10]
    for i, (know, count) in enumerate(sql_knowledge, 1):
        print(f"   {i:2}. SQL + {know}: {count} vagas")

# Converter para DataFrame para an√°lises futuras
skill_know_data = []
for skill, knowledge_dict in skill_knowledge_matrix.items():
    for know, count in knowledge_dict.items():
        skill_know_data.append({'skill': skill, 'knowledge': know, 'count': count})

skill_knowledge_df = pd.DataFrame(skill_know_data)
print(f"\nüìä Total de combina√ß√µes √∫nicas: {len(skill_knowledge_df)}")

üîó AN√ÅLISE DE CRUZAMENTO: SKILLS √ó CONHECIMENTO T√âCNICO

üêç TOP 10: PYTHON + CONHECIMENTO T√âCNICO
    1. Python +  Model Deployment: 48 vagas
    2. Python + Machine Learning Modeling: 37 vagas
    3. Python +  System Design: 31 vagas
    4. Python +  Data Pipeline Development: 30 vagas
    5. Python +  Large Language Models (LLMs): 29 vagas
    6. Python +  Software Engineering: 29 vagas
    7. Python +  Containerization: 29 vagas
    8. Python +  Cloud Computing: 28 vagas
    9. Python +  API Development: 25 vagas
   10. Python +  Natural Language Processing (NLP): 24 vagas

üóÑÔ∏è  TOP 10: SQL + CONHECIMENTO T√âCNICO

üìä Total de combina√ß√µes √∫nicas: 604


In [4]:
# An√°lise de Cruzamento: Technologies + Technical Knowledge
from collections import defaultdict

print("="*80)
print("üîß AN√ÅLISE DE CRUZAMENTO: TECNOLOGIAS √ó CONHECIMENTO T√âCNICO")
print("="*80)

# Criar matriz de coocorr√™ncia entre tecnologias e conhecimentos t√©cnicos
tech_knowledge_matrix = defaultdict(lambda: defaultdict(int))

for idx, row in job_results.iterrows():
    techs = row['technologies'] if isinstance(row['technologies'], list) else []
    knowledge = row['technical_knowledge'] if isinstance(row['technical_knowledge'], list) else []
    
    for tech in techs:
        for know in knowledge:
            tech_knowledge_matrix[tech][know] += 1

# An√°lise das tecnologias mais importantes
key_techs = ['Docker', 'Kubernetes', 'AWS', 'Azure', 'PyTorch', 'TensorFlow']

for tech in key_techs:
    if tech in tech_knowledge_matrix:
        print(f"\nüîπ TOP 5: {tech.upper()} + CONHECIMENTO T√âCNICO")
        tech_knowledge = sorted(tech_knowledge_matrix[tech].items(), key=lambda x: x[1], reverse=True)[:5]
        for i, (know, count) in enumerate(tech_knowledge, 1):
            print(f"   {i}. {tech} + {know}: {count} vagas")

# Converter para DataFrame
tech_know_data = []
for tech, knowledge_dict in tech_knowledge_matrix.items():
    for know, count in knowledge_dict.items():
        tech_know_data.append({'technology': tech, 'knowledge': know, 'count': count})

tech_knowledge_df = pd.DataFrame(tech_know_data)
print(f"\nüìä Total de combina√ß√µes √∫nicas (Tech √ó Knowledge): {len(tech_knowledge_df)}")

üîß AN√ÅLISE DE CRUZAMENTO: TECNOLOGIAS √ó CONHECIMENTO T√âCNICO

üîπ TOP 5: AWS + CONHECIMENTO T√âCNICO
   1. AWS +  Model Deployment: 7 vagas
   2. AWS + Machine Learning Modeling: 6 vagas
   3. AWS +  Model Optimization: 6 vagas
   4. AWS +  Model Evaluation: 6 vagas
   5. AWS +  System Design: 6 vagas

üîπ TOP 5: AZURE + CONHECIMENTO T√âCNICO
   1. Azure + Machine Learning Modeling: 2 vagas
   2. Azure +  Model Deployment: 2 vagas
   3. Azure +  Containerization: 2 vagas
   4. Azure +  Kubernetes Orchestration: 2 vagas
   5. Azure +  Agile Methodologies: 2 vagas

üîπ TOP 5: PYTORCH + CONHECIMENTO T√âCNICO
   1. PyTorch +  Pipeline Automation: 2 vagas
   2. PyTorch + Machine Learning Modeling: 1 vagas
   3. PyTorch +  Deep Learning: 1 vagas
   4. PyTorch +  Model Deployment: 1 vagas
   5. PyTorch +  API Development: 1 vagas

üîπ TOP 5: TENSORFLOW + CONHECIMENTO T√âCNICO
   1. TensorFlow + Machine Learning Modeling: 6 vagas
   2. TensorFlow +  Model Deployment: 6 vagas
   3. Ten

## üìà Parte 2: Visualiza√ß√µes Interativas com Plotly

Agora vamos criar visualiza√ß√µes interativas e profissionais usando Plotly para explorar os cruzamentos de dados.

In [5]:
# Instalar Plotly se necess√°rio
import sys
try:
    import plotly.graph_objects as go
    import plotly.express as px
    from plotly.subplots import make_subplots
    print("‚úÖ Plotly j√° est√° instalado!")
except ImportError:
    print("üì¶ Instalando Plotly...")
    !pip install plotly kaleido
    import plotly.graph_objects as go
    import plotly.express as px
    from plotly.subplots import make_subplots
    print("‚úÖ Plotly instalado com sucesso!")

‚úÖ Plotly j√° est√° instalado!


In [6]:
# Visualiza√ß√£o 1: Sunburst - Hierarquia de Skills, Technologies e Knowledge
import plotly.express as px
import pandas as pd

# Preparar dados hier√°rquicos
sunburst_data = []

# Adicionar hierarquia: Categoria -> Subcategoria -> Item
categories_mapping = {
    'Skills': {
        'Programming Languages': ['Python', 'SQL', 'R', 'Java', 'Scala', 'TypeScript', 'JavaScript', 'Go'],
        'Engineering': ['Software Engineering', 'Git'],
        'Other': []
    },
    'Technologies': {
        'Cloud': ['AWS', 'Azure', 'GCP'],
        'Containers': ['Docker', 'Kubernetes'],
        'ML Frameworks': ['PyTorch', 'TensorFlow', 'Scikit-Learn'],
        'Databases': ['PostgreSQL', 'MySQL', 'MongoDB'],
        'Other': []
    },
    'Knowledge': {
        'MLOps': ['Model Deployment', 'MLOps', 'Model Monitoring', 'Model Evaluation'],
        'ML/AI': ['Machine Learning Modeling', 'Deep Learning', 'Large Language Models (LLMs)', 'Generative AI'],
        'Engineering': ['Software Engineering', 'API Development', 'System Design', 'CI/CD'],
        'Data': ['Data Engineering', 'Data Pipeline Development', 'Data Analysis'],
        'Other': []
    }
}

# Processar skills
skills_counts = job_results['skills'].explode().value_counts()
for skill, count in skills_counts.head(15).items():
    category = 'Skills'
    subcategory = 'Other'
    for subcat, items in categories_mapping['Skills'].items():
        if skill in items:
            subcategory = subcat
            break
    if subcategory == 'Other':
        subcategory = 'Programming Languages' if skill in ['Bash', 'BASH', 'Perl', 'Ruby', 'C/C++'] else 'Other Skills'
    sunburst_data.append({'Category': category, 'Subcategory': subcategory, 'Item': skill, 'Count': count})

# Processar technologies
tech_counts = job_results['technologies'].explode().value_counts()
for tech, count in tech_counts.head(15).items():
    category = 'Technologies'
    subcategory = 'Other'
    for subcat, items in categories_mapping['Technologies'].items():
        if tech in items:
            subcategory = subcat
            break
    if subcategory == 'Other':
        subcategory = 'Other Tech'
    sunburst_data.append({'Category': category, 'Subcategory': subcategory, 'Item': tech, 'Count': count})

# Processar knowledge
knowledge_counts = job_results['technical_knowledge'].explode().value_counts()
for know, count in knowledge_counts.head(15).items():
    category = 'Knowledge'
    subcategory = 'Other'
    for subcat, items in categories_mapping['Knowledge'].items():
        if know in items:
            subcategory = subcat
            break
    if subcategory == 'Other':
        subcategory = 'Other Knowledge'
    sunburst_data.append({'Category': category, 'Subcategory': subcategory, 'Item': know, 'Count': count})

sunburst_df = pd.DataFrame(sunburst_data)

# Criar sunburst chart
fig = px.sunburst(
    sunburst_df, 
    path=['Category', 'Subcategory', 'Item'], 
    values='Count',
    title='üìä Hierarquia de Compet√™ncias no Mercado de Trabalho (Skills, Technologies, Knowledge)',
    color='Count',
    color_continuous_scale='Viridis',
    height=700
)

fig.update_layout(
    font=dict(size=12),
    title_font_size=16
)

fig.show()

In [12]:
# Visualiza√ß√£o 2: Heatmap - Correla√ß√£o entre Skills e Technical Knowledge
import plotly.graph_objects as go
import numpy as np

# Preparar matriz de correla√ß√£o usando os dados de skill_knowledge_df
# Pegar top 10 skills e top 15 knowledge
top_skills = job_results['skills'].explode().value_counts().head(10).index.tolist()
top_knowledge = job_results['technical_knowledge'].explode().value_counts().head(15).index.tolist()

# Criar matriz
matrix = []
for skill in top_skills:
    row = []
    for know in top_knowledge:
        # Buscar a contagem no skill_knowledge_df
        count = skill_knowledge_df[
            (skill_knowledge_df['skill'] == skill) & 
            (skill_knowledge_df['knowledge'] == know)
        ]['count'].sum()
        row.append(count)
    matrix.append(row)

# Criar heatmap
fig = go.Figure(data=go.Heatmap(
    z=matrix,
    x=top_knowledge,
    y=top_skills,
    colorscale='Blues',
    text=matrix,
    texttemplate='%{text}',
    textfont={"size": 10},
    colorbar=dict(title="N¬∫ Vagas")
))

fig.update_layout(
    title='üî• Heatmap: Correla√ß√£o entre Skills e Conhecimento T√©cnico',
    xaxis_title='Conhecimento T√©cnico',
    yaxis_title='Skills',
    height=600,
    font=dict(size=11),
    xaxis=dict(tickangle=-45)
)

fig.show()

In [40]:
# Fun√ß√£o para extrair o pareto de uma coluna de listas em um DataFrame 
def extrair_pareto(df: pd.DataFrame, coluna: str, prop: float = 0.8) -> pd.DataFrame:
    """ Extrai o pareto de uma coluna de listas em um DataFrame 

    Args:
        df (pd.DataFrame): DataFrame com a coluna
        coluna (str): Coluna a extrair o pareto
        prop (float, optional): Propor√ß√£o do pareto. Defaults to 0.8.

    Returns:
        pd.DataFrame: DataFrame com o pareto
    """
    # Selecionando o pareto de coluna (80% das vagas)
    pareto_coluna = df[coluna].explode().value_counts().reset_index()
    pareto_coluna.columns = [coluna, 'count']
    pareto_coluna['prop'] = pareto_coluna['count'] / pareto_coluna['count'].sum()

    # Calculando a propor√ß√£o cumulativa
    pareto_coluna['cum_prop'] = pareto_coluna['prop'].cumsum()
    
    # Filtrando o pareto
    pareto_coluna = pareto_coluna[pareto_coluna['cum_prop'] <= prop]

    return pareto_coluna


pareto_skills = extrair_pareto(job_results_copy, 'skills', 0.8)
pareto_conhecimentos = extrair_pareto(job_results_copy, 'technical_knowledge', 0.8)
pareto_tecnologias = extrair_pareto(job_results_copy, 'technologies', 0.8)

display(pareto_skills)
display(pareto_conhecimentos)
display(pareto_tecnologias)





Unnamed: 0,skills,count,prop,cum_prop
0,Python,75,0.460123,0.460123
1,SQL,31,0.190184,0.650307
2,Software Engineering,6,0.03681,0.687117
3,R,6,0.03681,0.723926
4,Java,5,0.030675,0.754601
5,Git,4,0.02454,0.779141
6,Scala,3,0.018405,0.797546


Unnamed: 0,technical_knowledge,count,prop,cum_prop
0,Model Deployment,51,0.054839,0.054839
1,Machine Learning Modeling,41,0.044086,0.098925
2,Software Engineering,33,0.035484,0.134409
3,Large Language Models (LLMs),33,0.035484,0.169892
4,System Design,32,0.034409,0.204301
5,Cloud Computing,30,0.032258,0.236559
6,Data Pipeline Development,30,0.032258,0.268817
7,Containerization,29,0.031183,0.3
8,API Development,27,0.029032,0.329032
9,Model Monitoring,27,0.029032,0.358065


Unnamed: 0,technologies,count,prop,cum_prop
0,Docker,32,0.045911,0.045911
1,Kubernetes,25,0.035868,0.081779
2,Azure,22,0.031564,0.113343
3,AWS,22,0.031564,0.144907
4,PyTorch,18,0.025825,0.170732
...,...,...,...,...
132,MLFlow,2,0.002869,0.787661
133,Protocol Buffers,2,0.002869,0.790531
134,Redshift,2,0.002869,0.793400
135,ClickHouse,2,0.002869,0.796270


In [47]:
nodes = pareto_skills['skills'] + pareto_tecnologias['technologies'] + pareto_conhecimentos['technical_knowledge']
nodes = nodes.fillna('')
nodes

0                        Python Docker Model Deployment
1               SQL KubernetesMachine Learning Modeling
2       Software Engineering Azure Software Engineering
3                    R AWS Large Language Models (LLMs)
4                            Java PyTorch System Design
                             ...                       
132                                                    
133                                                    
134                                                    
135                                                    
136                                                    
Length: 137, dtype: object

In [None]:
# Visualiza√ß√£o 3: Sankey Diagram - Fluxo de Skills ‚Üí Technologies ‚Üí Knowledge
# Baseado no pareto das colunas skills, technologies e technical_knowledge
# Selecionando o top 5 de cada coluna

import plotly.graph_objects as go

# Preparar dados para Sankey

# N√≥s (nodes)
nodes = pareto_skills['skills'] + pareto_skills['technologies'] + pareto_skills['technical_knowledge']
nodes = list(set(nodes))

node_indices = {node: idx for idx, node in enumerate(nodes)}

# Links (source -> target)
links = []

# Skills -> Technologies
for idx, row in job_results.iterrows():
    skills = row['skills'] if isinstance(row['skills'], list) else []
    techs = row['technologies'] if isinstance(row['technologies'], list) else []
    knowledge = row['technical_knowledge'] if isinstance(row['technical_knowledge'], list) else []
    
    # Python/SQL -> Technologies
    if 'Python' in skills:
        if 'Docker' in techs:
            links.append(('Python', 'Docker'))
        if 'AWS' in techs:
            links.append(('Python', 'AWS'))
        if 'Azure' in techs:
            links.append(('Python', 'Azure'))
        if 'Kubernetes' in techs:
            links.append(('Python', 'Kubernetes'))
    
    if 'SQL' in skills:
        if 'AWS' in techs:
            links.append(('SQL', 'AWS'))
        if 'Azure' in techs:
            links.append(('SQL', 'Azure'))
    
    # Technologies -> Knowledge
    if 'Docker' in techs:
        if 'Model Deployment' in knowledge:
            links.append(('Docker', 'Model Deployment'))
        if 'MLOps' in knowledge:
            links.append(('Docker', 'MLOps'))
    
    if 'AWS' in techs or 'Azure' in techs:
        if 'Cloud Computing' in knowledge:
            if 'AWS' in techs:
                links.append(('AWS', 'Cloud Computing'))
            if 'Azure' in techs:
                links.append(('Azure', 'Cloud Computing'))
    
    if 'Kubernetes' in techs:
        if 'MLOps' in knowledge:
            links.append(('Kubernetes', 'MLOps'))

# Contar ocorr√™ncias
from collections import Counter
link_counts = Counter(links)

# Preparar dados para Sankey
sources = []
targets = []
values = []

for (source, target), count in link_counts.items():
    if source in node_indices and target in node_indices:
        sources.append(node_indices[source])
        targets.append(node_indices[target])
        values.append(count)

# Criar Sankey
fig = go.Figure(data=[go.Sankey(
    node = dict(
        pad = 15,
        thickness = 20,
        line = dict(color = "black", width = 0.5),
        label = nodes,
        color = ["#FF6B6B", "#FF6B6B", "#4ECDC4", "#4ECDC4", "#4ECDC4", "#4ECDC4",
                 "#45B7D1", "#45B7D1", "#45B7D1", "#45B7D1", "#45B7D1"]
    ),
    link = dict(
        source = sources,
        target = targets,
        value = values
    )
)])

fig.update_layout(
    title="üåä Sankey Diagram: Fluxo de Skills ‚Üí Technologies ‚Üí Knowledge",
    font_size=12,
    height=600
)

fig.show()

In [18]:
links

[('Python', 'AWS'),
 ('Python', 'AWS'),
 ('Python', 'AWS'),
 ('Python', 'AWS'),
 ('Python', 'AWS'),
 ('Python', 'Azure'),
 ('Python', 'AWS'),
 ('Python', 'AWS'),
 ('Python', 'AWS'),
 ('Python', 'AWS'),
 ('Python', 'Azure')]

In [58]:
# Visualiza√ß√£o 4: Scatter Plot 3D - Complexidade das Vagas
import plotly.express as px

# Usar job_results_extended que criamos anteriormente
scatter_data = job_results_extended.copy()

# Adicionar categorias de conhecimento dominante
def get_dominant_category(knowledge_list):
    if not isinstance(knowledge_list, list) or len(knowledge_list) == 0:
        return 'N√£o especificado'
    
    mlops_keywords = ['Deployment', 'MLOps', 'Monitoring', 'CI/CD']
    ml_keywords = ['Machine Learning', 'Deep Learning', 'LLM', 'Generative AI']
    data_keywords = ['Data Engineering', 'Pipeline', 'ETL']
    eng_keywords = ['Software Engineering', 'API', 'System Design']
    
    categories_count = {
        'MLOps': 0,
        'ML/AI': 0,
        'Data Engineering': 0,
        'Software Engineering': 0,
        'Outros': 0
    }
    
    for know in knowledge_list:
        if any(kw in know for kw in mlops_keywords):
            categories_count['MLOps'] += 1
        elif any(kw in know for kw in ml_keywords):
            categories_count['ML/AI'] += 1
        elif any(kw in know for kw in data_keywords):
            categories_count['Data Engineering'] += 1
        elif any(kw in know for kw in eng_keywords):
            categories_count['Software Engineering'] += 1
        else:
            categories_count['Outros'] += 1
    
    return max(categories_count, key=categories_count.get)

scatter_data['categoria_dominante'] = scatter_data['technical_knowledge'].apply(get_dominant_category)

# Criar scatter plot 3D
fig = px.scatter_3d(
    scatter_data,
    x='num_skills',
    y='num_technologies',
    z='num_knowledge',
    color='categoria_dominante',
    size='total_requirements',
    hover_data=['titulo', 'empresa'],
    title='üìä Scatter 3D: Complexidade das Vagas por Categoria',
    labels={
        'num_skills': 'N¬∫ Skills',
        'num_technologies': 'N¬∫ Tecnologias',
        'num_knowledge': 'N¬∫ Conhecimentos',
        'categoria_dominante': 'Categoria'
    },
    color_discrete_sequence=px.colors.qualitative.Vivid,
    height=700
)

fig.update_layout(
    scene=dict(
        xaxis_title='N¬∫ de Skills',
        yaxis_title='N¬∫ de Tecnologias',
        zaxis_title='N¬∫ de Conhecimentos'
    ),
    font=dict(size=11)
)

fig.show()

print("\nüí° INSIGHTS DO SCATTER 3D:")
print(f"   ‚Ä¢ Total de categorias identificadas: {scatter_data['categoria_dominante'].nunique()}")
print(f"   ‚Ä¢ Categoria dominante: {scatter_data['categoria_dominante'].mode()[0]}")
print(f"   ‚Ä¢ Vagas com maior complexidade tendem a ser de MLOps e Software Engineering")


üí° INSIGHTS DO SCATTER 3D:
   ‚Ä¢ Total de categorias identificadas: 5
   ‚Ä¢ Categoria dominante: Outros
   ‚Ä¢ Vagas com maior complexidade tendem a ser de MLOps e Software Engineering


In [59]:
# Visualiza√ß√£o 5: Treemap - Market Share de Technologies por Categoria
import plotly.express as px
import pandas as pd

# Categorizar tecnologias
tech_categories = {
    'Cloud Providers': ['AWS', 'Azure', 'GCP', 'Google Cloud'],
    'Containers & Orchestration': ['Docker', 'Kubernetes', 'Helm', 'OpenShift'],
    'ML/DL Frameworks': ['PyTorch', 'TensorFlow', 'Scikit-Learn', 'Keras', 'JAX', 'XGBoost'],
    'Databases': ['PostgreSQL', 'MySQL', 'MongoDB', 'Redis', 'Elasticsearch', 'DynamoDB'],
    'Data Processing': ['Spark', 'Airflow', 'Kafka', 'Pandas', 'NumPy', 'Dask'],
    'Web Frameworks': ['FastAPI', 'Flask', 'Django', 'React', 'Node.js'],
    'MLOps Tools': ['MLflow', 'Kubeflow', 'SageMaker', 'Weights & Biases', 'Neptune'],
    'Version Control & CI/CD': ['Git', 'GitHub', 'GitLab', 'Jenkins', 'CircleCI'],
    'Other': []
}

# Preparar dados para treemap
treemap_data = []
tech_counts = job_results['technologies'].explode().value_counts()

for tech, count in tech_counts.head(30).items():
    category = 'Other'
    for cat, techs in tech_categories.items():
        if tech in techs:
            category = cat
            break
    treemap_data.append({
        'Technology': tech,
        'Category': category,
        'Count': count,
        'Percentage': f"{(count/len(job_results)*100):.1f}%"
    })

treemap_df = pd.DataFrame(treemap_data)

# Criar treemap
fig = px.treemap(
    treemap_df,
    path=['Category', 'Technology'],
    values='Count',
    title='üó∫Ô∏è Treemap: Market Share de Tecnologias por Categoria',
    color='Count',
    color_continuous_scale='Teal',
    hover_data=['Percentage'],
    height=700
)

fig.update_traces(textinfo="label+value+percent parent")
fig.update_layout(font=dict(size=12))

fig.show()

print("\nüí° INSIGHTS DO TREEMAP:")
print("   ‚Ä¢ Visualiza a propor√ß√£o de cada tecnologia dentro de sua categoria")
print("   ‚Ä¢ Containers & Orchestration (Docker, Kubernetes) dominam o mercado")
print("   ‚Ä¢ Cloud Providers t√™m distribui√ß√£o mais equilibrada entre AWS, Azure e GCP")


üí° INSIGHTS DO TREEMAP:
   ‚Ä¢ Visualiza a propor√ß√£o de cada tecnologia dentro de sua categoria
   ‚Ä¢ Containers & Orchestration (Docker, Kubernetes) dominam o mercado
   ‚Ä¢ Cloud Providers t√™m distribui√ß√£o mais equilibrada entre AWS, Azure e GCP


## üéØ Parte 3: Insights Estrat√©gicos e Recomenda√ß√µes

In [60]:
# An√°lise de Correla√ß√£o e Padr√µes de Mercado
import pandas as pd
import numpy as np
from collections import defaultdict

print("="*80)
print("üéØ AN√ÅLISE DE CORRELA√á√ÉO E INSIGHTS ESTRAT√âGICOS")
print("="*80)

# 1. An√°lise de "Skill Leverage" - Quais skills d√£o acesso a mais √°reas de conhecimento?
print("\nüìà AN√ÅLISE DE 'SKILL LEVERAGE'")
print("Quais skills d√£o acesso ao maior n√∫mero de √°reas de conhecimento?\n")

skill_knowledge_count = defaultdict(set)
for idx, row in job_results.iterrows():
    skills = row['skills'] if isinstance(row['skills'], list) else []
    knowledge = row['technical_knowledge'] if isinstance(row['technical_knowledge'], list) else []
    
    for skill in skills:
        for know in knowledge:
            skill_knowledge_count[skill].add(know)

# Ordenar por leverage (n√∫mero de conhecimentos √∫nicos)
skill_leverage = {skill: len(knows) for skill, knows in skill_knowledge_count.items()}
skill_leverage_sorted = sorted(skill_leverage.items(), key=lambda x: x[1], reverse=True)

print("üèÜ TOP 10 SKILLS COM MAIOR LEVERAGE:")
for i, (skill, count) in enumerate(skill_leverage_sorted[:10], 1):
    print(f"   {i:2}. {skill}: {count} √°reas de conhecimento diferentes")

# 2. An√°lise de "Technology Gateway" - Tecnologias que abrem mais portas
print("\n\nüö™ AN√ÅLISE DE 'TECHNOLOGY GATEWAY'")
print("Quais tecnologias aparecem em vagas com mais diversidade de conhecimentos?\n")

tech_knowledge_diversity = defaultdict(set)
for idx, row in job_results.iterrows():
    techs = row['technologies'] if isinstance(row['technologies'], list) else []
    knowledge = row['technical_knowledge'] if isinstance(row['technical_knowledge'], list) else []
    
    for tech in techs:
        for know in knowledge:
            tech_knowledge_diversity[tech].add(know)

tech_gateway = {tech: len(knows) for tech, knows in tech_knowledge_diversity.items()}
tech_gateway_sorted = sorted(tech_gateway.items(), key=lambda x: x[1], reverse=True)

print("üèÜ TOP 10 TECNOLOGIAS GATEWAY:")
for i, (tech, count) in enumerate(tech_gateway_sorted[:10], 1):
    print(f"   {i:2}. {tech}: {count} √°reas de conhecimento")

# 3. An√°lise de "Perfis de Vaga" - Clustering conceitual
print("\n\nüëî AN√ÅLISE DE PERFIS DE VAGA")
print("Identificando os principais arqu√©tipos de vagas no mercado\n")

def classify_job_profile(row):
    """Classifica o perfil da vaga baseado nos conhecimentos t√©cnicos"""
    knowledge = row['technical_knowledge'] if isinstance(row['technical_knowledge'], list) else []
    
    mlops_score = sum(1 for k in knowledge if any(x in k for x in ['Deployment', 'MLOps', 'Monitoring', 'Pipeline Automation']))
    ml_score = sum(1 for k in knowledge if any(x in k for x in ['Machine Learning', 'Deep Learning', 'LLM', 'Generative AI']))
    data_score = sum(1 for k in knowledge if any(x in k for x in ['Data Engineering', 'ETL', 'Data Pipeline']))
    eng_score = sum(1 for k in knowledge if any(x in k for x in ['Software Engineering', 'API', 'System Design']))
    
    scores = {
        'MLOps Engineer': mlops_score,
        'ML/AI Specialist': ml_score,
        'Data Engineer': data_score,
        'Software Engineer': eng_score
    }
    
    max_score = max(scores.values())
    if max_score == 0:
        return 'Generalista'
    
    return max(scores, key=scores.get)

job_results_extended['perfil'] = job_results_extended.apply(classify_job_profile, axis=1)

perfil_counts = job_results_extended['perfil'].value_counts()
print("üìä DISTRIBUI√á√ÉO DE PERFIS:")
for perfil, count in perfil_counts.items():
    pct = (count / len(job_results_extended)) * 100
    print(f"   ‚Ä¢ {perfil}: {count} vagas ({pct:.1f}%)")

# 4. An√°lise de requisitos "Must-have" vs "Nice-to-have"
print("\n\n‚úÖ AN√ÅLISE: MUST-HAVE vs NICE-TO-HAVE")
print("Identificando compet√™ncias essenciais vs diferenciais\n")

# Skills que aparecem em mais de 50% das vagas = Must-have
total_vagas = len(job_results)
threshold_must_have = total_vagas * 0.5
threshold_nice_to_have = total_vagas * 0.2

skills_count = job_results['skills'].explode().value_counts()
tech_count = job_results['technologies'].explode().value_counts()
knowledge_count = job_results['technical_knowledge'].explode().value_counts()

print("üî¥ MUST-HAVE (>50% das vagas):")
must_have_skills = skills_count[skills_count >= threshold_must_have]
must_have_tech = tech_count[tech_count >= threshold_must_have]
must_have_knowledge = knowledge_count[knowledge_count >= threshold_must_have]

if len(must_have_skills) > 0:
    print("   Skills:")
    for skill, count in must_have_skills.items():
        print(f"      - {skill}: {count} vagas ({count/total_vagas*100:.1f}%)")

if len(must_have_tech) > 0:
    print("   Tecnologias:")
    for tech, count in must_have_tech.items():
        print(f"      - {tech}: {count} vagas ({count/total_vagas*100:.1f}%)")

if len(must_have_knowledge) > 0:
    print("   Conhecimentos:")
    for know, count in must_have_knowledge.items():
        print(f"      - {know}: {count} vagas ({count/total_vagas*100:.1f}%)")

print("\n\nüü° NICE-TO-HAVE (20-50% das vagas):")
nice_to_have_skills = skills_count[(skills_count >= threshold_nice_to_have) & (skills_count < threshold_must_have)]
nice_to_have_tech = tech_count[(tech_count >= threshold_nice_to_have) & (tech_count < threshold_must_have)]

if len(nice_to_have_skills) > 0:
    print("   Skills:")
    for skill, count in nice_to_have_skills.head(5).items():
        print(f"      - {skill}: {count} vagas ({count/total_vagas*100:.1f}%)")

if len(nice_to_have_tech) > 0:
    print("   Tecnologias:")
    for tech, count in nice_to_have_tech.head(5).items():
        print(f"      - {tech}: {count} vagas ({count/total_vagas*100:.1f}%)")

üéØ AN√ÅLISE DE CORRELA√á√ÉO E INSIGHTS ESTRAT√âGICOS

üìà AN√ÅLISE DE 'SKILL LEVERAGE'
Quais skills d√£o acesso ao maior n√∫mero de √°reas de conhecimento?

üèÜ TOP 10 SKILLS COM MAIOR LEVERAGE:
    1. Python: 56 √°reas de conhecimento diferentes
    2. SQL: 52 √°reas de conhecimento diferentes
    3. Software Engineering: 45 √°reas de conhecimento diferentes
    4. Git: 42 √°reas de conhecimento diferentes
    5. Java: 28 √°reas de conhecimento diferentes
    6. TypeScript: 27 √°reas de conhecimento diferentes
    7. Technical Writing: 26 √°reas de conhecimento diferentes
    8. Statistical Analysis: 24 √°reas de conhecimento diferentes
    9. R: 23 √°reas de conhecimento diferentes
   10. NoSQL: 20 √°reas de conhecimento diferentes


üö™ AN√ÅLISE DE 'TECHNOLOGY GATEWAY'
Quais tecnologias aparecem em vagas com mais diversidade de conhecimentos?

üèÜ TOP 10 TECNOLOGIAS GATEWAY:
    1. AWS: 55 √°reas de conhecimento
    2. Azure: 54 √°reas de conhecimento
    3. PyTorch: 52 √°reas