# AI Job Market Insights — EDA & Baselines (Phase 2)
Explores the top skills, skill co-occurrences, and a heuristic baseline.


In [None]:
from collections import Counter
from itertools import combinations

import pandas as pd

# Load the processed dataset from parquet; later cells read its 'skills' and
# 'title' columns.
df = pd.read_parquet(path='data/processed/jobs_features_skills.parquet')
# Preview the first five rows.
df.head(n=5)


## Top 20 skills

In [None]:
# Flatten every row's skill collection into one list. Rows with a missing value
# (None / NaN) are skipped explicitly rather than via `lst or []`: list columns
# read from parquet may arrive as numpy arrays (assumption: depends on the
# parquet engine), and truth-testing a multi-element array raises ValueError.
all_skills = [
    s
    for lst in df.get('skills', [])  # a missing 'skills' column yields no rows
    if lst is not None and not isinstance(lst, float)
    for s in lst
]
# The 20 most frequent skills as (skill, count) pairs, most common first.
top = Counter(all_skills).most_common(20)
pd.DataFrame(top, columns=['skill','count'])


## Skill co-occurrence (top pairs)

In [None]:
# Count how often each unordered pair of distinct skills appears in the same row.
pairs = Counter()
for lst in df.get('skills', []):  # a missing 'skills' column yields no rows
    # Skip missing values (None / NaN) and rows that cannot form a pair.
    # `len()` is used instead of `not lst` because list columns read from
    # parquet may be numpy arrays (assumption: depends on the parquet engine),
    # whose truth value is ambiguous and raises ValueError.
    if lst is None or isinstance(lst, float) or len(lst) < 2:
        continue
    # De-duplicate and sort so (a, b) and (b, a) collapse into one key and a
    # skill repeated within a row is counted once.
    pairs.update(combinations(sorted(set(lst)), 2))
# The 25 most frequent pairs, rendered as "a & b".
pd.DataFrame([(f"{a} & {b}", c) for (a,b), c in pairs.most_common(25)], columns=['pair','count'])


## Role label distribution (heuristic from title)

In [None]:
import re

# Ordered (label, compiled pattern) pairs. Patterns are tried top to bottom and
# the first one that matches wins, so order matters for titles that could match
# more than one entry.
# NOTE: inside a raw string a word boundary is `\b` and whitespace is `\s`.
# The previous `\\b` / `\\s` matched a literal backslash followed by a letter,
# so no real title ever matched and every row was labelled "other".
ROLE_PATTERNS = [
    ("data_scientist", re.compile(r"(?i)\bdata\s*scientist\b")),
    ("ml_engineer", re.compile(r"(?i)\b(ml|machine\s*learning)\s*engineer\b")),
    ("data_engineer", re.compile(r"(?i)\bdata\s*engineer\b")),
    ("risk_analyst", re.compile(r"(?i)\b(risk|credit)\s*(data\s*)?analyst\b")),
    # The trailing role word is optional, so a bare "quant" also matches.
    ("quant_researcher", re.compile(r"(?i)\bquant(itative)?\s*(researcher|analyst)?\b")),
    ("mlops_engineer", re.compile(r"(?i)\bmlops\s*engineer\b")),
    ("cv_engineer", re.compile(r"(?i)\b(computer\s*vision|cv)\s*engineer\b")),
]


def infer_role_label(title: str) -> str:
    """Map a job title to a coarse role label using ROLE_PATTERNS.

    Args:
        title: Job title text. Non-string values (e.g. None or NaN from a
            missing cell) are accepted and treated as unlabelled.

    Returns:
        The label of the first pattern found anywhere in ``title``
        (case-insensitive), or ``"other"`` when nothing matches or ``title``
        is not a string.
    """
    if not isinstance(title, str):
        return "other"
    for label, pat in ROLE_PATTERNS:
        if pat.search(title):
            return label
    return "other"
# Label every posting by running the title heuristic element-wise over the
# 'title' column and storing the result in a new 'role_label' column.
df['role_label'] = df['title'].map(infer_role_label)
# Show how many postings fall under each label ("other" means no pattern
# matched or the title was not a string).
df['role_label'].value_counts()
