In [1]:
import spacy
from spacy.pipeline import EntityRuler

def build_tech_pipeline():
    """Load the Spanish spaCy model and add rule-based ``TECH`` entities.

    An ``entity_ruler`` component is inserted before the statistical ``ner``
    component and populated with a fixed list of technology patterns.

    Returns:
        The loaded spaCy pipeline with the configured ruler attached.
    """
    # The large model is loaded for better context.
    nlp = spacy.load("es_core_news_lg")

    # Rule matches are produced before the statistical NER runs.
    ruler = nlp.add_pipe("entity_ruler", before="ner")

    # String patterns are exact phrase matches; list patterns are token patterns.
    patterns = [
        {"label": "TECH", "pattern": "Python"},
        {"label": "TECH", "pattern": "Docker"},
        {"label": "TECH", "pattern": "AWS"},
        {"label": "TECH", "pattern": [{"LOWER": "react"}]},  # React, react, REACT
        {"label": "TECH", "pattern": "C++"},
        {"label": "TECH", "pattern": ".NET"},
        # "Nodejs" (and typically "Node.js") reach the matcher as ONE token, so
        # the multi-token pattern below cannot match them; cover them here.
        {"label": "TECH", "pattern": [{"LOWER": {"IN": ["nodejs", "node.js"]}}]},
        # Split spellings: "Node js", "Node-js" (and "Node.js" if the tokenizer
        # splits on the period).
        {"label": "TECH", "pattern": [{"LOWER": "node"}, {"IS_PUNCT": True, "OP": "?"}, {"LOWER": "js"}]},
        {"label": "TECH", "pattern": "Kubernetes", "id": "K8s"},
    ]

    ruler.add_patterns(patterns)
    return nlp

# Build the pipeline once; the resulting global `nlp` is reused by later cells.
nlp = build_tech_pipeline()

In [None]:
import pandas as pd

# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("data/dataset_maestro.csv")

def extract_tech(df_column, pipeline=None, batch_size=100, n_process=-1):
    """Return, for each row, the distinct ``TECH`` entity texts found in it.

    Args:
        df_column: pandas Series of texts; values are cast to ``str`` before
            processing.
        pipeline: object exposing a spaCy-style ``pipe(texts, batch_size=...,
            n_process=...)`` method. Defaults to the module-level ``nlp``.
        batch_size: number of texts buffered per batch by ``pipe``.
        n_process: worker-process count forwarded to ``pipe``
            (``-1`` asks spaCy to use every available CPU).

    Returns:
        A list with one entry per input row. Each entry is a list of unique
        entity texts in order of first appearance, so the result is identical
        across runs (a plain ``set`` would yield a hash-dependent order).
    """
    if pipeline is None:
        # Fall back to the pipeline built earlier in the notebook.
        pipeline = nlp

    detected_techs = []

    # Streaming through `pipe` is the efficient way to process many texts
    # (pass `as_tuples=True` to spaCy if row context must be carried along).
    texts = df_column.astype(str)
    for doc in pipeline.pipe(texts, batch_size=batch_size, n_process=n_process):
        # Keep only entities labelled 'TECH'.
        tech_mentions = (ent.text for ent in doc.ents if ent.label_ == "TECH")
        # dict.fromkeys de-duplicates while preserving first-seen order.
        detected_techs.append(list(dict.fromkeys(tech_mentions)))

    return detected_techs

# Apply the extractor to the 'description' column and store the per-row lists
df['stack_tecnologico'] = extract_tech(df['description'])

# Show the first rows of the result
print(df[['description', 'stack_tecnologico']].head())