### Importaciones

In [None]:
import os, csv, fitz
from dotenv import load_dotenv
from datetime import datetime
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient

In [None]:
# Load settings through python-dotenv before the environment is read below.
# NOTE(review): override=True is expected to let .env values replace variables
# already present in the process environment — confirm against python-dotenv docs.
load_dotenv(override=True)

### Variables de entorno

In [None]:
# Azure Document Intelligence settings read from the environment. os.getenv
# returns None for an unset variable; ensure_env() rejects missing/empty values
# before main() builds the client.
azure_document_intelligence_key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")
azure_document_intelligence_endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")

### Funciones

In [None]:
# Run configuration consumed by main().
INPUT_PDF = "PoA Uruguay.pdf"   # <-- your PDF
ZOOM = 2.0                      # zoom factor used when rendering figures to PNG
PADDING = 6                     # margin around the figure polygon (added to the rect in PDF coordinates, before zoom)
OUT_ROOT = "output"             # base output folder (request 1)

def ensure_env():
    """Fail fast when the Document Intelligence settings are incomplete.

    Raises:
        RuntimeError: if the key or the endpoint read from the environment
            is missing or empty.
    """
    if azure_document_intelligence_key and azure_document_intelligence_endpoint:
        return
    raise RuntimeError("Faltan variables de entorno AZURE_DOCUMENT_INTELLIGENCE_KEY o _ENDPOINT")

def make_out_dir(root, input_path):
    """Create and return a timestamped output directory for one run.

    The directory is ``<root>/<input file name without extension>/run_<YYYYmmdd_HHMMSS>``,
    using the local time at the moment of the call.

    Args:
        root: Base output folder.
        input_path: Path of the input document; only its file name is used.

    Returns:
        The path of the directory (created if it did not already exist).
    """
    file_name = os.path.basename(input_path)
    stem, _extension = os.path.splitext(file_name)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    target = os.path.join(root, stem, f"run_{stamp}")
    os.makedirs(target, exist_ok=True)
    return target

def di_polygon_to_fitz_rect(di_polygon, di_page, fitz_page, padding=4):
    """Convert a Document Intelligence polygon into a padded, page-clipped fitz.Rect.

    Args:
        di_polygon: Flat coordinate list ``[x1, y1, x2, y2, ...]`` in the
            units of ``di_page``.
        di_page: Document Intelligence page providing ``width`` and ``height``.
        fitz_page: PyMuPDF page whose ``rect`` defines the target coordinates.
        padding: Margin added on every side, in the target coordinates.

    Returns:
        The bounding box of the scaled polygon, grown by ``padding`` and
        intersected with the page rectangle.
    """
    page_box = fitz_page.rect
    scale_x = page_box.width / di_page.width
    scale_y = page_box.height / di_page.height

    # Even positions hold x values, odd positions hold y values.
    scaled_x = [x * scale_x for x in di_polygon[0::2]]
    scaled_y = [y * scale_y for y in di_polygon[1::2]]

    padded = fitz.Rect(
        min(scaled_x) - padding,
        min(scaled_y) - padding,
        max(scaled_x) + padding,
        max(scaled_y) + padding,
    )
    # Intersecting with the page keeps the padded box from spilling outside it.
    return padded & page_box

def di_point_to_fitz(pt_x, pt_y, di_page, fitz_page):
    """Scale a Document Intelligence point into the coordinates of ``fitz_page.rect``.

    Args:
        pt_x: X coordinate in the units of ``di_page``.
        pt_y: Y coordinate in the units of ``di_page``.
        di_page: Document Intelligence page providing ``width`` and ``height``.
        fitz_page: Page object whose ``rect`` provides the target ``width`` and ``height``.

    Returns:
        An ``(x, y)`` tuple in the target coordinates.
    """
    bounds = fitz_page.rect
    scale_x = bounds.width / di_page.width
    scale_y = bounds.height / di_page.height
    x_pdf = pt_x * scale_x
    y_pdf = pt_y * scale_y
    return (x_pdf, y_pdf)

def polygon_centroid(poly):
    """Return the vertex average of a flat polygon ``[x1, y1, x2, y2, ...]``.

    This is the plain mean of the vertices (not an area-weighted centroid),
    expressed in the same coordinate system as the input.
    """
    x_values, y_values = poly[::2], poly[1::2]
    mean_x = sum(x_values) / len(x_values)
    mean_y = sum(y_values) / len(y_values)
    return (mean_x, mean_y)

def save_text(result, out_dir):
    """Write the recognized text lines to disk, per page and combined.

    Creates ``page_<n>.txt`` (1-based) for every page in ``result.pages`` with
    its lines joined by newlines, plus ``full_text.txt`` with all pages
    separated by a blank line.

    Args:
        result: Analysis result exposing ``pages``; each page exposes ``lines``
            whose items have a ``content`` string.
        out_dir: Existing directory that receives the files.
    """
    page_texts = []
    for number, page in enumerate(result.pages or [], start=1):
        text = "\n".join(line.content for line in (page.lines or []))
        page_texts.append(text)
        page_file = os.path.join(out_dir, f"page_{number}.txt")
        with open(page_file, "w", encoding="utf-8") as handle:
            handle.write(text)

    combined_file = os.path.join(out_dir, "full_text.txt")
    with open(combined_file, "w", encoding="utf-8") as handle:
        handle.write("\n\n".join(page_texts))

def save_tables(result, out_dir):
    """Export every detected table to ``table_<index>.csv`` (0-based index).

    Each table becomes a ``row_count`` x ``column_count`` grid; a cell's
    content is placed at its (row_index, column_index) and positions with no
    cell, or with empty content, are written as empty strings.

    Args:
        result: Analysis result exposing ``tables``.
        out_dir: Existing directory that receives the CSV files.
    """
    for number, table in enumerate(result.tables or []):
        # Start from an all-empty grid so unfilled positions need no post-processing.
        rows = [[""] * table.column_count for _ in range(table.row_count)]
        for cell in table.cells or []:
            rows[cell.row_index][cell.column_index] = cell.content or ""

        csv_path = os.path.join(out_dir, f"table_{number}.csv")
        with open(csv_path, "w", newline="", encoding="utf-8") as out:
            csv.writer(out).writerows(rows)

def words_inside_rect(result, page_index, rect, fitz_page):
    """Return the words of a page whose center falls inside ``rect``.

    Args:
        result: Analysis result exposing ``pages``.
        page_index: 0-based index into ``result.pages``.
        rect: Rectangle in the coordinates of ``fitz_page``.
        fitz_page: Page used to convert word coordinates.

    Returns:
        Word strings ordered top-to-bottom, then left-to-right.
    """
    layout_page = result.pages[page_index]
    hits = []
    for word in layout_page.words or []:
        center_x_di, center_y_di = polygon_centroid(word.polygon)
        x_pdf, y_pdf = di_point_to_fitz(center_x_di, center_y_di, layout_page, fitz_page)
        if not rect.contains(fitz.Point(x_pdf, y_pdf)):
            continue
        # Stored as (y, x, text) so the sort below orders rows before columns.
        hits.append((y_pdf, x_pdf, word.content))

    # Rounding y to one decimal groups words at nearly the same height before comparing x.
    ordered = sorted(hits, key=lambda hit: (round(hit[0], 1), hit[1]))
    return [content for _y, _x, content in ordered]

def save_figures_with_text(result, pdf_path, out_dir, zoom=2.0, padding=6):
    """Render each detected figure to PNG and record the text found inside it.

    For every figure in ``result.figures`` that has a bounding region, the
    first region is converted to a page rectangle, rendered from the PDF to
    ``figure_<i>.png`` and scanned for words. When words are found they are
    also written to ``figure_<i>.txt``. A ``figures_index.csv`` summary is
    written at the end. If the result has no figures, a notice is printed and
    nothing is written.

    Args:
        result: Analysis result exposing ``pages`` and, optionally, ``figures``.
        pdf_path: Path of the PDF the result was produced from.
        out_dir: Existing directory that receives the output files.
        zoom: Scale factor applied to both axes when rendering.
        padding: Margin passed to di_polygon_to_fitz_rect.
    """
    figures = getattr(result, "figures", None)
    if not figures:
        print("ℹ️ No hay figuras en el resultado.")
        return

    rows = []
    pdf = fitz.open(pdf_path)
    try:
        for fig_no, figure in enumerate(figures):
            regions = getattr(figure, "bounding_regions", None) or []
            if not regions:
                continue

            # Only the first bounding region of a figure is exported.
            region = regions[0]
            page_idx = region.page_number - 1  # page_number is 1-based
            pdf_page = pdf[page_idx]
            layout_page = result.pages[page_idx]

            clip = di_polygon_to_fitz_rect(region.polygon, layout_page, pdf_page, padding=padding)
            if clip.is_empty or clip.width == 0 or clip.height == 0:
                continue

            # Render the clipped area and store it as an image.
            image_name = f"figure_{fig_no}.png"
            pixmap = pdf_page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), clip=clip)
            pixmap.save(os.path.join(out_dir, image_name))

            # Collect the words located inside the rendered area.
            caption = " ".join(words_inside_rect(result, page_idx, clip, pdf_page)).strip()
            found_text = bool(caption)

            # A sidecar text file is only written when there is something to store.
            if found_text:
                sidecar_path = os.path.join(out_dir, f"figure_{fig_no}.txt")
                with open(sidecar_path, "w", encoding="utf-8") as sidecar:
                    sidecar.write(caption)

            rows.append([image_name, page_idx + 1, "yes" if found_text else "no", caption])
    finally:
        pdf.close()

    # Summary index of all exported figures.
    index_path = os.path.join(out_dir, "figures_index.csv")
    with open(index_path, "w", newline="", encoding="utf-8") as index_file:
        index_writer = csv.writer(index_file)
        index_writer.writerow(["figure_file", "page", "has_text", "text"])
        index_writer.writerows(rows)

✅ Listo. Salida: c:\Work\Libra\PoC\output\PoA Uruguay\run_20250808_145233
   (Recuerda: en el tier F0 solo se analizan 2 páginas)


In [None]:
def main():
    """Analyze INPUT_PDF with the prebuilt layout model and export the results.

    Steps: validate the environment, create a timestamped output directory,
    send the PDF to Azure Document Intelligence, then write the page text,
    the tables and the figures into that directory.

    Raises:
        RuntimeError: if the key or endpoint environment variable is missing
            (raised by ensure_env).
    """
    ensure_env()
    out_dir = make_out_dir(OUT_ROOT, INPUT_PDF)

    # The client expects the service endpoint plus a credential wrapping the
    # key. Keyword arguments keep the two values from being swapped.
    client = DocumentIntelligenceClient(
        endpoint=azure_document_intelligence_endpoint,
        credential=AzureKeyCredential(azure_document_intelligence_key),
    )

    with open(INPUT_PDF, "rb") as f:
        poller = client.begin_analyze_document(
            model_id="prebuilt-layout",
            body=f,
            content_type="application/pdf"
        )
    # Block until the long-running analysis finishes.
    result = poller.result()

    save_text(result, out_dir)
    save_tables(result, out_dir)
    save_figures_with_text(result, INPUT_PDF, out_dir, zoom=ZOOM, padding=PADDING)

    print(f"✅ Listo. Salida: {os.path.abspath(out_dir)}")
    print("   (Recuerda: en el tier F0 solo se analizan 2 páginas)")

# Run the pipeline only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()