In [0]:
%run ./utils

In [0]:
# Key cell: report the versions of the libraries this notebook depends on,
# so an incompatible environment is visible before the pipeline runs.

print(f"numpy: {numpy.__version__}")
print(f"spaCy: {spacy.__version__}")
print(f"es_core_news_sm: {es_core_news_sm.__version__}")
print(f"PyPDF2: {PyPDF2.__version__}")

In [0]:
# 1. Input/output locations (hard-coded here; no notebook widgets are read).
input_path = "/Volumes/kyndryl/default/archivos/web_service_sap_pipeline.pdf"
output_dir = "/Volumes/kyndryl/default/archivos/"

# 2. Open the PDF with PyPDF2.
reader = PyPDF2.PdfReader(input_path)

# 3. Chunking: one chunk per PDF page (no overlap is applied).
#    Each page is extracted exactly once; `or ""` replaces a falsy
#    extract_text() result with an empty string so every chunk is a str.
chunks = [page.extract_text() or "" for page in reader.pages]

# Whole-document text, rebuilt from the already extracted pages instead of
# running the text extraction a second time over every page.
full_text = "\n".join(chunks)

# 4. Load the spaCy model; if loading fails with OSError, download the model
#    and retry once.
model_name = "es_core_news_sm"
try:
    nlp = spacy.load(model_name)
except OSError:
    spacy_download(model_name)
    nlp = spacy.load(model_name)

# 5. Enrichment: per-chunk language detection + named entities.
docs = []
# Document id = input file name without directory or extension.
base_id = os.path.splitext(os.path.basename(input_path))[0]

# nlp.pipe processes the chunks as a batch and yields one Doc per input text,
# in input order, so zipping it with `chunks` keeps text and Doc aligned.
for idx, (txt, doc_spacy) in enumerate(zip(chunks, nlp.pipe(chunks))):
    if txt.strip():
        try:
            lang = detect(txt)
        except Exception:
            # NOTE(review): assuming `detect` is langdetect's, it raises
            # LangDetectException for text with no usable features (e.g. a
            # page holding only digits or punctuation). Treat that like a
            # blank page instead of aborting the whole run.
            lang = "und"
    else:
        # Blank page: nothing to detect, mark the language as undetermined.
        lang = "und"

    entities = [{"text": e.text, "label": e.label_} for e in doc_spacy.ents]
    docs.append({
        "id":         f"{base_id}_{idx}",
        "sourceId":   base_id,
        "chunkIndex": idx,
        "language":   lang,
        "text":       txt,
        "entities":   entities
    })

# 6. Serialize the records and write them as UTF-8 JSON into output_dir
#    (a /Volumes path). ensure_ascii=False keeps accented characters
#    readable in the file instead of \u escapes.
output_path = os.path.join(output_dir, f"{base_id}.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(docs, f, ensure_ascii=False, indent=2)

print(f"JSON generado en: {output_path}")
