In [0]:
%run ./utils

In [0]:
# Remove any already-installed copies of these packages (-y skips the prompt)
# before the pinned versions are installed further down in this notebook.
%pip uninstall -y numpy spacy es_core_news_sm PyPDF2 langdetect

In [0]:
# Restart the notebook's Python process so modules that were already imported
# from the just-uninstalled packages are dropped from memory.
dbutils.library.restartPython()

In [0]:
%pip install numpy==1.24.3
%pip install spacy==3.5.0
%pip install es-core-news-sm==3.1.0
%pip install PyPDF2==3.0.1
%pip install langdetect==1.0.9

In [0]:
# Restart the Python process again so the imports in the following cells pick
# up the package versions that were just installed.
dbutils.library.restartPython()

In [0]:
# Key cell: print the versions actually loaded, to confirm the compatible set.
import numpy, spacy, es_core_news_sm, PyPDF2

for label, module in (
    ("numpy", numpy),
    ("spaCy", spacy),
    ("es_core_news_sm", es_core_news_sm),
    ("PyPDF2", PyPDF2),
):
    print(f"{label}:", module.__version__)

In [0]:
# 1. Input/output locations.
# NOTE(review): the original heading said "widgets", but no notebook widgets are
# created here; both values are hard-coded paths.

# PDF file to process.
input_path = "/Volumes/kyndryl/default/archivos/web_service_sap_pipeline.pdf"
# Directory where the resulting JSON file is written.
output_dir = "/Volumes/kyndryl/default/archivos/"

# 2. Convert DBFS URIs to local paths (/dbfs/…) usable by standard Python file APIs
def to_local(path: str) -> str:
    """Map a ``dbfs:/`` URI to its ``/dbfs/`` local equivalent.

    Only the leading ``dbfs:/`` scheme is rewritten; any later occurrence of
    that text inside the path is left untouched. Paths that do not start with
    ``dbfs:/`` are returned unchanged.

    Args:
        path: A ``dbfs:/...`` URI or any other path string.

    Returns:
        The path with a leading ``dbfs:/`` replaced by ``/dbfs/``, or ``path``
        itself when it has no such prefix.
    """
    scheme = "dbfs:/"
    if path.startswith(scheme):
        # Slice instead of str.replace so only the prefix is rewritten.
        return "/dbfs/" + path[len(scheme):]
    return path

import os, json
import PyPDF2

# Resolve the configured input file and output directory to local paths.
local_pdf, local_out_dir = to_local(input_path), to_local(output_dir)

# Create the output directory up front; an already existing one is accepted.
os.makedirs(local_out_dir, exist_ok=True)

# 3. Read the PDF with PyPDF2 and join the text of all pages with newlines.
# A page whose extract_text() result is empty/None contributes an empty string.
reader = PyPDF2.PdfReader(local_pdf)
full_text = "\n".join([page.extract_text() or "" for page in reader.pages])

# 4. Chunking with overlap
def chunk_text(text, max_len=5000, overlap=500):
    """Split ``text`` into consecutive windows that share ``overlap`` characters.

    Windows start every ``max_len - overlap`` characters and are at most
    ``max_len`` characters long; the last window(s) may be shorter.

    Args:
        text: The string to split.
        max_len: Maximum length of each chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < max_len``.

    Returns:
        A list of chunk strings (empty when ``text`` is empty).

    Raises:
        ValueError: If ``max_len`` is not positive or ``overlap`` is outside
            ``[0, max_len)``. Without this check an ``overlap >= max_len``
            makes the window never advance, so the loop would never end.
    """
    if max_len <= 0:
        raise ValueError("max_len must be a positive integer")
    if not 0 <= overlap < max_len:
        raise ValueError("overlap must satisfy 0 <= overlap < max_len")

    step = max_len - overlap
    return [text[start:start + max_len] for start in range(0, len(text), step)]

# One chunk per PDF page; pages with no extractable text become "".
# Sliding-window alternative kept for reference:
#   chunks = chunk_text(full_text)
chunks = list(map(lambda pdf_page: pdf_page.extract_text() or "", reader.pages))

# 5. Load the spaCy pipeline (downloaded on demand)
import spacy
from spacy.cli import download as spacy_download


def _load_or_download(name):
    """Load the spaCy pipeline ``name``; on OSError download it and load again."""
    try:
        return spacy.load(name)
    except OSError:
        # spacy.load raised OSError: fetch the package, then retry once.
        spacy_download(name)
        return spacy.load(name)


model_name = "es_core_news_sm"
nlp = _load_or_download(model_name)

# 6. Enrichment: language detection + named entities
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is randomized by default; a fixed seed makes the detected language
# reproducible from run to run.
DetectorFactory.seed = 0


def _detect_language(text: str) -> str:
    """Return the language code detected for ``text``, or "und" if undetermined.

    "und" is returned for blank text and for text in which langdetect finds no
    usable features (for example a page holding only digits or punctuation),
    instead of letting LangDetectException abort the whole run.
    """
    if not text.strip():
        return "und"
    try:
        return detect(text)
    except LangDetectException:
        return "und"


docs = []
# Base identifier: the PDF file name without directory or extension.
base_id = os.path.splitext(os.path.basename(local_pdf))[0]

for idx, txt in enumerate(chunks):
    lang = _detect_language(txt)
    doc_spacy = nlp(txt)
    # Keep only the surface text and label of each entity spaCy found.
    entities = [{"text": e.text, "label": e.label_} for e in doc_spacy.ents]
    docs.append({
        "id":         f"{base_id}_{idx}",
        "sourceId":   base_id,
        "chunkIndex": idx,
        "language":   lang,
        "text":       txt,
        "entities":   entities
    })

# 7. Serialize the documents and store them as <base_id>.json in the output directory
output_path = os.path.join(local_out_dir, f"{base_id}.json")

# ensure_ascii=False keeps non-ASCII characters as-is; the file is UTF-8 encoded.
serialized_docs = json.dumps(docs, ensure_ascii=False, indent=2)
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.write(serialized_docs)

print(f"✓ JSON generado en: {output_path}")
