In [2]:
# %% [markdown]
# # 📊 Análisis Exploratorio del Mercado Laboral de Data Science en España
# ## Descubriendo insights y tendencias

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path
from collections import Counter
import ast

# Notebook-wide display and plotting configuration.
pd.set_option("display.max_columns", None)  # show every column when a frame is displayed

# Apply the matplotlib style sheet first, then the seaborn palette on top of it.
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("husl")

# %%
# Load the cleaned job offers (path is relative to the notebook's working directory).
df = pd.read_csv("../data/processed/jobs_cleaned.csv")


def _parse_skills(raw):
    """Turn the CSV text form of a skills list back into a Python object.

    Missing values and the literal text '[]' become an empty list; any other
    value is evaluated with ``ast.literal_eval``.
    """
    if pd.isna(raw) or raw == '[]':
        return []
    return ast.literal_eval(raw)


# The CSV stores each skills list as its string representation.
df['skills'] = df['skills'].apply(_parse_skills)

print(f"üìä Dataset: {len(df)} ofertas √ó {len(df.columns)} columnas")
df.head()

# %% [markdown]
# ## 1️⃣ Panorama General del Mercado

# %%
# Headline KPIs.
# NOTE(review): median_salary and companies are not referenced again in the
# visible part of this notebook.
total_jobs = len(df)
cities = df['city'].nunique()
companies = df['company'].nunique()
ai_jobs = df['is_ai_related'].sum()
avg_salary = df['salary_avg'].mean()
median_salary = df['salary_avg'].median()

# One entry per indicator tile, laid out left to right across the top band
# (y from 0.7 to 1) of the figure.
kpi_tiles = [
    dict(
        mode="number",
        value=total_jobs,
        title={"text": "Total Ofertas"},
        domain={'x': [0, 0.2], 'y': [0.7, 1]},
    ),
    dict(
        mode="number",
        value=avg_salary,
        number={'prefix': "‚Ç¨", 'valueformat': ",.0f"},
        title={"text": "Salario Promedio"},
        domain={'x': [0.25, 0.45], 'y': [0.7, 1]},
    ),
    dict(
        mode="number",
        value=cities,
        title={"text": "Ciudades"},
        domain={'x': [0.5, 0.7], 'y': [0.7, 1]},
    ),
    dict(
        mode="number+delta",
        value=ai_jobs,
        # The delta is the absolute difference from half of all offers.
        delta={'reference': total_jobs / 2, 'relative': False},
        title={"text": "Ofertas IA/ML"},
        domain={'x': [0.75, 0.95], 'y': [0.7, 1]},
    ),
]

fig = go.Figure()
for tile in kpi_tiles:
    fig.add_trace(go.Indicator(**tile))

fig.update_layout(
    title="üìà Mercado Laboral de Data Science en Espa√±a - KPIs",
    height=300,
)

fig.show()

# %% [markdown]
# ## 2️⃣ Distribución de Roles

# %%
# Number of offers per role category, most frequent first.
role_counts = df['role_category'].value_counts()

# Horizontal bars; the count drives the bar length, its colour and its label.
fig = (
    px.bar(
        x=role_counts.values,
        y=role_counts.index,
        orientation='h',
        title="üíº Distribuci√≥n de Roles de Data Science",
        labels={'x': 'N√∫mero de Ofertas', 'y': 'Categor√≠a'},
        color=role_counts.values,
        color_continuous_scale='Blues',
        text=role_counts.values,
    )
    .update_traces(texttemplate='%{text}', textposition='outside')
    .update_layout(showlegend=False, height=500)
)
fig.show()

# %%
# Contingency table: rows are role categories, columns are seniority levels.
role_seniority = pd.crosstab(index=df['role_category'], columns=df['seniority'])

# Stacked bars, one segment per seniority level.
fig = px.bar(
    role_seniority,
    title="üìä Distribuci√≥n de Roles por Nivel de Experiencia",
    labels={'value': 'N√∫mero de Ofertas', 'variable': 'Nivel'},
    barmode='stack',
    color_discrete_sequence=px.colors.qualitative.Set2,
).update_layout(
    xaxis_title="Categor√≠a de Rol",
    yaxis_title="N√∫mero de Ofertas",
    legend_title="Nivel de Experiencia",
    height=500,
)

fig.show()

# %% [markdown]
# ## 3️⃣ Análisis Geográfico

# %%
# The 15 most frequent values of the `city` column.
# NOTE(review): the recorded output at the end of this notebook lists a
# country-level value as the top `city`; consider whether such rows belong
# in a city ranking.
city_counts = df['city'].value_counts()
top_cities = city_counts.head(15)

fig = (
    px.bar(
        x=top_cities.index,
        y=top_cities.values,
        title="üèôÔ∏è Top 15 Ciudades con M√°s Ofertas de Data Science",
        labels={'x': 'Ciudad', 'y': 'N√∫mero de Ofertas'},
        color=top_cities.values,
        color_continuous_scale='Viridis',
    )
    # Tilt the tick labels so long city names do not collide.
    .update_layout(xaxis_tickangle=-45, showlegend=False, height=500)
)

fig.show()

# %%
# Role mix within the five cities that have the most offers.
top5_cities = df['city'].value_counts().head(5).index
in_top5 = df['city'].isin(top5_cities)
df_top5 = df.loc[in_top5]

# Rows are cities, columns are role categories.
city_role_dist = pd.crosstab(index=df_top5['city'], columns=df_top5['role_category'])

fig = px.bar(
    city_role_dist,
    title="üíº Distribuci√≥n de Roles en las Principales Ciudades",
    labels={'value': 'N√∫mero de Ofertas', 'variable': 'Tipo de Rol'},
    barmode='group',
    color_discrete_sequence=px.colors.qualitative.Pastel,
).update_layout(height=500, xaxis_title="Ciudad", yaxis_title="Ofertas")
fig.show()

# %% [markdown]
# ## 4️⃣ Análisis de Skills

# %%
# Flatten every offer's skill list into one list of mentions, then count them.
all_skills = [
    skill
    for skills_list in df['skills']
    if skills_list
    for skill in skills_list
]
skill_counts = Counter(all_skills)

# The 20 most mentioned skills, kept in descending order of frequency.
top_skills = dict(skill_counts.most_common(20))
top_skill_names = list(top_skills.keys())
top_skill_freqs = list(top_skills.values())

fig = (
    px.bar(
        x=top_skill_freqs,
        y=top_skill_names,
        orientation='h',
        title="üî• Top 20 Skills M√°s Demandadas",
        labels={'x': 'N√∫mero de Ofertas', 'y': 'Skill'},
        color=top_skill_freqs,
        color_continuous_scale='Reds',
        text=top_skill_freqs,
    )
    .update_traces(texttemplate='%{text}', textposition='outside')
    .update_layout(showlegend=False, height=600)
)
fig.show()

# %%
# Ten most mentioned skills within each of the five most common role categories.
top_roles = df['role_category'].value_counts().head(5).index

skills_by_role = {
    role: Counter(
        skill
        for skills_list in df.loc[df['role_category'] == role, 'skills']
        if skills_list
        for skill in skills_list
    ).most_common(10)
    for role in top_roles
}

# 2 x 3 grid: five bar panels, the sixth cell is left without a trace.
fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=[str(role) for role in top_roles],
    specs=[
        [{"type": "bar"}, {"type": "bar"}, {"type": "bar"}],
        [{"type": "bar"}, {"type": "bar"}, {"type": "xy"}],
    ],
)

# Row-major (row, col) cells for the five panels: (1,1) ... (1,3), (2,1), (2,2).
positions = [(slot // 3 + 1, slot % 3 + 1) for slot in range(5)]

for role, (row, col) in zip(top_roles, positions):
    ranked = skills_by_role[role]
    skills = [name for name, freq in ranked]
    counts = [freq for name, freq in ranked]

    fig.add_trace(
        go.Bar(y=skills, x=counts, orientation='h', name=role, showlegend=False),
        row=row, col=col,
    )

fig.update_layout(
    title_text="üéØ Top 10 Skills por Categor√≠a de Rol",
    height=800,
    showlegend=False,
)

fig.show()

# %% [markdown]
# ## 5️⃣ Análisis Salarial

# %%
# Keep only the offers that report a salary; copy so later edits never touch df.
has_salary = df['salary_avg'].notna()
df_salary = df.loc[has_salary].copy()

print(f"üí∞ An√°lisis basado en {len(df_salary)} ofertas con informaci√≥n salarial")

# %%
# Histogram of salaries with dashed reference lines at the mean and the median.
fig = px.histogram(
    df_salary,
    x='salary_avg',
    nbins=30,
    title="üí∂ Distribuci√≥n de Salarios en Data Science",
    labels={'salary_avg': 'Salario Anual (‚Ç¨)', 'count': 'N√∫mero de Ofertas'},
    color_discrete_sequence=['#3498db'],
)

salary_mean = df_salary['salary_avg'].mean()
salary_median = df_salary['salary_avg'].median()

reference_lines = (
    (salary_mean, "red", f"Media: {salary_mean:,.0f}‚Ç¨"),
    (salary_median, "green", f"Mediana: {salary_median:,.0f}‚Ç¨"),
)
for position, line_colour, caption in reference_lines:
    fig.add_vline(x=position, line_dash="dash",
                  line_color=line_colour, annotation_text=caption)

fig.update_layout(height=500)
fig.show()

# %%
# Mean / median salary per role category, keeping only categories backed by
# at least 10 salaried offers, highest mean first.
salary_by_role = (
    df_salary
    .groupby('role_category')['salary_avg']
    .agg(['mean', 'median', 'count'])
    .loc[lambda stats: stats['count'] >= 10]
    .sort_values('mean', ascending=False)
)

# One bar series per statistic, drawn side by side.
fig = go.Figure()
for trace_name, stat in (('Salario Promedio', 'mean'), ('Salario Mediano', 'median')):
    fig.add_trace(go.Bar(
        name=trace_name,
        x=salary_by_role.index,
        y=salary_by_role[stat],
        text=salary_by_role[stat].apply(lambda x: f"{x:,.0f}‚Ç¨"),
        textposition='outside',
    ))

fig.update_layout(
    title="üíº Salarios por Tipo de Rol",
    xaxis_title="Categor√≠a",
    yaxis_title="Salario Anual (‚Ç¨)",
    barmode='group',
    height=500,
)

fig.show()

# %%
# Mean salary per seniority level, keeping only levels backed by at least
# 10 salaried offers, highest mean first.
salary_by_seniority = (
    df_salary
    .groupby('seniority')['salary_avg']
    .agg(['mean', 'median', 'count'])
    .loc[lambda stats: stats['count'] >= 10]
    .sort_values('mean', ascending=False)
)

seniority_mean = salary_by_seniority['mean']

fig = (
    px.bar(
        x=salary_by_seniority.index,
        y=seniority_mean,
        title="üìä Salarios por Nivel de Experiencia",
        labels={'x': 'Nivel', 'y': 'Salario Promedio (‚Ç¨)'},
        color=seniority_mean,
        color_continuous_scale='Greens',
        text=seniority_mean.apply(lambda x: f"{x:,.0f}‚Ç¨"),
    )
    .update_traces(textposition='outside')
    .update_layout(showlegend=False, height=500)
)
fig.show()

# %%
# Ten cities with the highest mean salary, among those backed by at least
# 10 salaried offers.
salary_by_city = (
    df_salary
    .groupby('city')['salary_avg']
    .agg(['mean', 'count'])
    .loc[lambda stats: stats['count'] >= 10]
    .sort_values('mean', ascending=False)
    .head(10)
)

city_mean = salary_by_city['mean']

fig = (
    px.bar(
        x=salary_by_city.index,
        y=city_mean,
        title="üèôÔ∏è Salarios por Ciudad (Top 10)",
        labels={'x': 'Ciudad', 'y': 'Salario Promedio (‚Ç¨)'},
        color=city_mean,
        color_continuous_scale='Blues',
        text=city_mean.apply(lambda x: f"{x:,.0f}‚Ç¨"),
    )
    .update_traces(textposition='outside')
    .update_layout(showlegend=False, xaxis_tickangle=-45, height=500)
)
fig.show()

# %%
# Salary spread per seniority level; only outlier points are drawn individually.
fig = px.box(
    df_salary,
    x='seniority',
    y='salary_avg',
    title="üì¶ Distribuci√≥n Salarial por Nivel de Experiencia",
    labels={'seniority': 'Nivel', 'salary_avg': 'Salario (‚Ç¨)'},
    color='seniority',
    points='outliers',
).update_layout(showlegend=False, height=500)
fig.show()

# %% [markdown]
# ## 6️⃣ Análisis de IA/ML

# %%
# AI vs non-AI comparison: number of offers and mean salary per group.
#
# Each group is labelled from its own flag value rather than by position.
# Assigning a fixed two-item list to the index raises a length-mismatch
# ValueError when only one of the two groups is present in the data.
# Assumes `is_ai_related` is a boolean or 0/1 flag (it is summed as a count
# in the KPI cell) — TODO confirm.
ai_comparison = (
    df.groupby('is_ai_related')
    .agg({'id': 'count', 'salary_avg': 'mean'})
    .rename(columns={'id': 'count'})
    .rename(index=lambda flag: 'IA/ML' if flag else 'No-IA')
)

# Colours are looked up by label so they stay matched to the right group
# whichever groups are present.
ai_colors = {'No-IA': '#95a5a6', 'IA/ML': '#e74c3c'}
group_colors = [ai_colors[label] for label in ai_comparison.index]

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("N√∫mero de Ofertas", "Salario Promedio"),
    specs=[[{"type": "pie"}, {"type": "bar"}]]
)

# Left panel: share of offers per group.
fig.add_trace(
    go.Pie(labels=ai_comparison.index, values=ai_comparison['count'],
           marker_colors=group_colors),
    row=1, col=1
)

# Right panel: mean salary per group; a group with no salary data shows "N/A".
fig.add_trace(
    go.Bar(x=ai_comparison.index, y=ai_comparison['salary_avg'],
           marker_color=group_colors,
           text=ai_comparison['salary_avg'].apply(lambda x: f"{x:,.0f}‚Ç¨" if pd.notna(x) else "N/A"),
           textposition='outside'),
    row=1, col=2
)

fig.update_layout(
    title_text="ü§ñ Comparaci√≥n: Ofertas de IA/ML vs Resto",
    height=400,
    showlegend=False
)

fig.show()

# %%
# Monthly trend of AI/ML offers.
# Note: this converts the `created` column of df to datetimes in place.
df['created'] = pd.to_datetime(df['created'])

# Per month: sum of the AI flag and number of non-null flag values.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
# 'ME' — confirm against the pandas version this notebook is run with.
df_temporal = (
    df.set_index('created')
    .resample('M')
    .agg({'is_ai_related': ['sum', 'count']})
)
df_temporal.columns = ['AI_Jobs', 'Total_Jobs']
df_temporal['AI_Percentage'] = df_temporal['AI_Jobs'].div(df_temporal['Total_Jobs']).mul(100)

# (column, legend name, line style, plotted on the secondary y-axis?)
trend_lines = (
    ('AI_Jobs', "Ofertas IA/ML", dict(color='red', width=3), False),
    ('Total_Jobs', "Total Ofertas", dict(color='blue', width=2, dash='dash'), False),
    ('AI_Percentage', "% IA/ML", dict(color='green', width=2), True),
)

fig = make_subplots(specs=[[{"secondary_y": True}]])
for column, legend_name, line_style, on_secondary in trend_lines:
    fig.add_trace(
        go.Scatter(x=df_temporal.index, y=df_temporal[column],
                   name=legend_name, line=line_style),
        secondary_y=on_secondary,
    )

fig.update_layout(
    title_text="üìà Evoluci√≥n Temporal de Ofertas IA/ML",
    height=500,
)

# Counts use the primary y-axis, the percentage uses the secondary one.
fig.update_xaxes(title_text="Fecha")
fig.update_yaxes(title_text="N√∫mero de Ofertas", secondary_y=False)
fig.update_yaxes(title_text="Porcentaje IA/ML (%)", secondary_y=True)

fig.show()

# %% [markdown]
# ## 7️⃣ Top Empresas que Contratan

# %%
# The 20 companies with the most offers.
company_counts = df['company'].value_counts()
top_companies = company_counts.head(20)

fig = (
    px.bar(
        x=top_companies.values,
        y=top_companies.index,
        orientation='h',
        title="üè¢ Top 20 Empresas que M√°s Contratan en Data Science",
        labels={'x': 'N√∫mero de Ofertas', 'y': 'Empresa'},
        color=top_companies.values,
        color_continuous_scale='Teal',
        text=top_companies.values,
    )
    .update_traces(texttemplate='%{text}', textposition='outside')
    .update_layout(showlegend=False, height=700)
)
fig.show()

# %% [markdown]
# ## 8️⃣ Resumen de Insights Clave

# %%
# Plain-text recap of the headline findings. Relies on total_jobs, avg_salary,
# ai_jobs, skill_counts and df_salary computed in earlier cells.
banner_rule = "=" * 70

print(banner_rule)
print("üéØ INSIGHTS CLAVE DEL AN√ÅLISIS".center(70))
print(banner_rule)

print("\n1Ô∏è‚É£ PANORAMA GENERAL:")
ai_share = ai_jobs / total_jobs * 100
print(f"   ‚Ä¢ {total_jobs} ofertas activas de Data Science en Espa√±a")
print(f"   ‚Ä¢ Salario promedio: {avg_salary:,.0f}‚Ç¨/a√±o")
print(f"   ‚Ä¢ {ai_jobs} ({ai_share:.1f}%) son roles de IA/ML")

print("\n2Ô∏è‚É£ ROLES M√ÅS DEMANDADOS:")
leading_roles = df['role_category'].value_counts().head(3)
for i, (role, count) in enumerate(leading_roles.items(), start=1):
    print(f"   {i}. {role}: {count} ofertas ({count/total_jobs*100:.1f}%)")

print("\n3Ô∏è‚É£ CIUDADES TOP:")
leading_cities = df['city'].value_counts().head(5)
for i, (city, count) in enumerate(leading_cities.items(), start=1):
    print(f"   {i}. {city}: {count} ofertas")

print("\n4Ô∏è‚É£ SKILLS M√ÅS VALORADAS:")
# The percentage is mentions relative to the total number of offers.
for i, (skill, count) in enumerate(skill_counts.most_common(5), start=1):
    print(f"   {i}. {skill}: {count} menciones ({count/len(df)*100:.1f}%)")

# The salary section is printed only when at least one offer reports a salary.
if len(df_salary):
    salaries = df_salary['salary_avg']
    lower_quartile = salaries.quantile(0.25)
    upper_quartile = salaries.quantile(0.75)
    senior_mean = df_salary.loc[df_salary['seniority'] == 'Senior', 'salary_avg'].mean()
    junior_mean = df_salary.loc[df_salary['seniority'] == 'Junior', 'salary_avg'].mean()

    print("\n5Ô∏è‚É£ SALARIOS:")
    print(f"   ‚Ä¢ Rango t√≠pico: {lower_quartile:,.0f}‚Ç¨ - {upper_quartile:,.0f}‚Ç¨")
    print(f"   ‚Ä¢ Nivel Senior: {senior_mean:,.0f}‚Ç¨")
    print(f"   ‚Ä¢ Nivel Junior: {junior_mean:,.0f}‚Ç¨")

print("\n" + banner_rule)

üìä Dataset: 2056 ofertas √ó 17 columnas


üí∞ An√°lisis basado en 781 ofertas con informaci√≥n salarial


                    üéØ INSIGHTS CLAVE DEL AN√ÅLISIS                     

1Ô∏è‚É£ PANORAMA GENERAL:
   ‚Ä¢ 2056 ofertas activas de Data Science en Espa√±a
   ‚Ä¢ Salario promedio: 84,577‚Ç¨/a√±o
   ‚Ä¢ 1256 (61.1%) son roles de IA/ML

2Ô∏è‚É£ ROLES M√ÅS DEMANDADOS:
   1. Data/ML Engineer: 544 ofertas (26.5%)
   2. Data Scientist: 430 ofertas (20.9%)
   3. Data Analyst: 348 ofertas (16.9%)

3Ô∏è‚É£ CIUDADES TOP:
   1. Espa√±a: 703 ofertas
   2. Madrid: 598 ofertas
   3. Barcelona: 511 ofertas
   4. M√°laga: 59 ofertas
   5. Valencia: 18 ofertas

4Ô∏è‚É£ SKILLS M√ÅS VALORADAS:
   1. Python: 72 menciones (3.5%)
   2. Azure: 69 menciones (3.4%)
   3. SQL: 54 menciones (2.6%)
   4. AWS: 38 menciones (1.8%)
   5. Spark: 30 menciones (1.5%)

5Ô∏è‚É£ SALARIOS:
   ‚Ä¢ Rango t√≠pico: 80,000‚Ç¨ - 80,000‚Ç¨
   ‚Ä¢ Nivel Senior: 85,468‚Ç¨
   ‚Ä¢ Nivel Junior: 86,667‚Ç¨

