In [8]:
import json
import re, os, argparse, pdfplumber
from pathlib import Path
from tqdm import tqdm
import logging
from PyPDF2 import PdfReader, PdfWriter

# Raise the threshold of the pdfminer loggers so that only errors are emitted.
for logger_name in ("pdfminer", "pdfminer.layout", "pdfminer.pdfpage"):
    noisy_logger = logging.getLogger(logger_name)
    noisy_logger.setLevel(logging.ERROR)

In [None]:
# Every location is derived from the directory the notebook is launched from.
PATH_GLOBAL = os.getcwd()
PATH = os.path.join(PATH_GLOBAL, "datasets")

# First trial run on a single month (February 2024, 10 rulings).
PATH_FALLOS = Path(PATH, "2024/02")          # input PDFs
PATH_RECORTES = Path(PATH, "cropped_pdfs")   # cropped PDFs
PATH_TXT = Path(PATH, "fallos_txts")         # plain-text exports
PATH_JSON = Path(PATH, "fallos_json")        # JSON exports

# Make sure the three output folders exist before any cell writes into them.
for output_dir in (PATH_RECORTES, PATH_TXT, PATH_JSON):
    os.makedirs(output_dir, exist_ok=True)

## Recorte de PDFs

In [11]:
HEADER_PT = 82      # points removed from the top edge, odd-numbered pages only
FOOTER_PT = 50      # points removed from the bottom edge of every page

# Only PDFs directly inside PATH_FALLOS are processed here (no recursion).
for src_pdf in PATH_FALLOS.glob("*.pdf"):
    pdf_reader = PdfReader(str(src_pdf))
    pdf_writer = PdfWriter()

    for page_no, pdf_page in enumerate(pdf_reader.pages, start=1):
        crop = pdf_page.cropbox  # rectangle the crop starts from

        # Footer: move the bottom edge up.
        left, bottom = crop.lower_left
        crop.lower_left = (left, bottom + FOOTER_PT)

        # Header: move the top edge down, odd pages only (pages are counted from 1).
        if page_no % 2 != 0:
            right, top = crop.upper_right
            crop.upper_right = (right, top - HEADER_PT)

        # Store the same rectangle as crop, trim and media box of the page.
        pdf_page.cropbox = crop
        pdf_page.trimbox = crop
        pdf_page.mediabox = crop
        pdf_writer.add_page(pdf_page)

    cropped_path = PATH_RECORTES / src_pdf.name
    with cropped_path.open("wb") as sink:
        pdf_writer.write(sink)

    print("✔ Cropped →", cropped_path)

✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8927.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8926.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8104.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8865.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8142.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8948.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8752.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8971.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8569.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8344.pdf


In [12]:
HEADER_PT = 82      # points removed from the top edge, odd-numbered pages only
FOOTER_PT = 50      # points removed from the bottom edge of every page

# rglob walks PATH_FALLOS recursively, so PDFs inside subfolders are picked up too.
for src_pdf in PATH_FALLOS.rglob("*.pdf"):
    pdf_reader = PdfReader(str(src_pdf))
    pdf_writer = PdfWriter()

    for page_no, pdf_page in enumerate(pdf_reader.pages, start=1):
        crop = pdf_page.cropbox  # rectangle the crop starts from

        # Footer: move the bottom edge up.
        left, bottom = crop.lower_left
        crop.lower_left = (left, bottom + FOOTER_PT)

        # Header: move the top edge down, odd pages only (pages are counted from 1).
        if page_no % 2 != 0:
            right, top = crop.upper_right
            crop.upper_right = (right, top - HEADER_PT)

        # Store the same rectangle as crop, trim and media box of the page.
        pdf_page.cropbox = crop
        pdf_page.trimbox = crop
        pdf_page.mediabox = crop
        pdf_writer.add_page(pdf_page)

    # Reproduce the input's subfolder layout below PATH_RECORTES.
    cropped_path = PATH_RECORTES / src_pdf.relative_to(PATH_FALLOS)
    cropped_path.parent.mkdir(parents=True, exist_ok=True)  # create missing subfolders

    with cropped_path.open("wb") as sink:
        pdf_writer.write(sink)

    print("✔ Cropped →", cropped_path)

✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8927.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8926.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8104.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8865.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8142.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8948.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8752.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8971.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8569.pdf
✔ Cropped → /Users/maxi/Downloads/Materias Actuales/NLP/TP_NLP/datasets/cropped_pdfs/8344.pdf


# NORMALIZATION

In [13]:
# Regular expressions describing lines that clean_page removes from the extracted text.
# Every pattern is anchored at the start of the line; REGEX is applied with .match()
# to one line at a time, so a hit discards the whole line.
PATTERNS = [
    r'^Superior Tribunal.*$',             # any line starting with "Superior Tribunal"
    r'^Sala Civil y Comercial.*$',        # any line starting with "Sala Civil y Comercial"
    r'^\s*\d+\s*$',                       # digits only (presumably stray page numbers)
    r'^Poder Judicial.*$',                # any line starting with "Poder Judicial"
    r'^Firmado digitalmente.*$',          # any line starting with "Firmado digitalmente"
    r'^Página \s*\d+(\s*de\s*\d+)?',      # "Página N" / "Página N de M"; no `$`, so text after it is dropped too
    r'^\s*\d{4}-\d{2}-\d{2}T\d{2}:',      # line starting with a YYYY-MM-DDTHH: timestamp
]
# Single alternation of all the patterns above, matched case-insensitively.
REGEX = re.compile('|'.join(PATTERNS), re.IGNORECASE)


def join_lines_to_paragraphs(text):
    """
    Rebuild paragraphs from text whose lines were broken by the page layout.

    Rules, applied to each whitespace-stripped line in order:
      * a blank line closes the paragraph being built;
      * a line is glued (with one space) to the paragraph being built when that
        paragraph does not end in '.', ':', '?', '!' or ';' and the line is not a
        heading (upper-case, ending in ':', shorter than 50 characters);
      * in every other case the line starts a new paragraph.

    Returns the paragraphs joined by a blank line ("\\n\\n").
    """
    closing_marks = ('.', ':', '?', '!', ';')
    paragraphs = []
    pending = ""

    for raw_line in text.split('\n'):
        piece = raw_line.strip()

        if piece == "":
            # Blank line: whatever has been collected so far is a complete paragraph.
            if pending:
                paragraphs.append(pending.strip())
                pending = ""
            continue

        looks_like_heading = piece.isupper() and piece.endswith(':') and len(piece) < 50
        continues_pending = (
            bool(pending)
            and not pending.endswith(closing_marks)
            and not looks_like_heading
        )
        if continues_pending:
            pending = pending + " " + piece
            continue

        # The line opens a new paragraph; store the previous one first, if any.
        if pending:
            paragraphs.append(pending.strip())
        pending = piece

    if pending:
        paragraphs.append(pending.strip())
    return "\n\n".join(paragraphs)

def clean_page(text: str) -> str:
    """
    Remove the lines matched by REGEX from one page of extracted text.

    `text` may be None or empty, in which case an empty string is returned.
    The surviving lines are re-joined with '\\n' and the result is stripped.
    """
    kept_lines = []
    for raw_line in (text or "").splitlines():
        # REGEX.match only tests the beginning of the line.
        if REGEX.match(raw_line) is None:
            kept_lines.append(raw_line)
    return '\n'.join(kept_lines).strip()


## Normalization to .TXT:

In [14]:
def clean_pdf(pdf_path: Path) -> str:
    """
    Return the cleaned text of a PDF, processed one page at a time.

    Each page is extracted with pdfplumber, filtered with clean_page and
    re-flowed with join_lines_to_paragraphs; the pages are then joined by a
    blank line, so a paragraph never continues across a page boundary.
    """
    with pdfplumber.open(str(pdf_path)) as document:
        cleaned_pages = [
            join_lines_to_paragraphs(clean_page(page.extract_text()))
            for page in document.pages
        ]
    return "\n\n".join(cleaned_pages)


def main(pdf_dir: str, out_dir: str):
    """
    Export every PDF found under `pdf_dir` (recursively) as a cleaned .txt file.

    The folder layout below `pdf_dir` is reproduced below `out_dir`, which is
    created when missing. When no PDF is found a message is printed and nothing
    else happens.

    Note: a later cell of this notebook defines another function called `main`
    (the JSON exporter); once that cell has run, this name points to it.
    """
    source_root = Path(pdf_dir).resolve()
    target_root = Path(out_dir).resolve()
    target_root.mkdir(parents=True, exist_ok=True)

    sources = list(source_root.rglob("*.pdf"))
    if len(sources) == 0:
        print(f"No se encontraron PDFs en {pdf_dir}")
        return

    for src in tqdm(sources, desc="Cleaning PDFs"):
        # Same relative location as the source, with the extension swapped to .txt.
        txt_path = target_root / src.relative_to(source_root).with_suffix(".txt")
        txt_path.parent.mkdir(parents=True, exist_ok=True)  # create the subfolder if needed
        txt_path.write_text(clean_pdf(src), encoding="utf-8")

# Export every cropped PDF under PATH_RECORTES as a cleaned .txt file under PATH_TXT.
main(PATH_RECORTES, PATH_TXT)

Cleaning PDFs: 100%|██████████| 10/10 [00:02<00:00,  4.46it/s]


## Normalization to JSON:

In [15]:
def extract_materia_preliminar(inicio_paragraphs):
    """
    Extract the preliminary subject matter ("materia") from the opening paragraphs.

    Each paragraph is searched for quoted text; inside a quoted string, the text
    that follows 'S/' (case-insensitive) is captured up to the first hyphen or
    comma, or up to the end of the quoted string.

    Straight double quotes ("...") are tried first over all paragraphs. Only when
    they produce no result are typographic quotes (“...” or «...») tried, so any
    input that already produced a non-empty result keeps producing the same one.

    Args:
        inicio_paragraphs: iterable of paragraph strings. None or an empty
            iterable is treated as "no paragraphs".

    Returns:
        The captured text, stripped and upper-cased, or None when no quoted
        string contains a non-empty capture after 'S/'.
    """
    paragraphs = list(inicio_paragraphs or [])
    quote_patterns = (
        r'"([^"]+)"',           # straight double quotes
        r'[“«]([^”»]+)[”»]',    # typographic quotes, used only as a fallback
    )

    for quote_pattern in quote_patterns:
        for para in paragraphs:
            for quoted in re.findall(quote_pattern, para):
                # Try every 'S/' in the quoted string, not just the first one.
                for found in re.finditer(r'S/\s*([^\-\,]+)', quoted, re.IGNORECASE):
                    materia = found.group(1).strip()
                    # A capture made only of whitespace (e.g. 'S/ - ...') is not a
                    # subject matter; keep looking instead of returning "".
                    if materia:
                        return materia.upper()
    return None

def is_key_line(line: str) -> bool:
    """
    Tell whether `line` looks like a section heading.

    A heading is shorter than 50 characters, ends with ':' and is upper-case
    according to ``str.isupper``.
    """
    if len(line) >= 50:
        # Long lines are never headings, even when written in capitals.
        return False
    return line.endswith(':') and line.isupper()

def split_into_sections(text: str) -> dict:
    """
    Turn paragraph-separated text into a dict of section name -> list of paragraphs.

    Paragraphs are the pieces of `text` separated by a blank line ("\\n\\n").
    A paragraph accepted by is_key_line starts a section named after it without
    the trailing ':'; the paragraphs that follow are appended to that section.
    Everything before the first heading goes under 'INICIO', which is always
    present. A heading that appears more than once keeps feeding the same list.

    Empty paragraphs are skipped, so empty input gives {'INICIO': []} rather
    than {'INICIO': ['']}.
    """
    current_key = 'INICIO'
    sections = {current_key: []}

    for paragraph in text.split('\n\n'):
        paragraph = paragraph.strip()
        if not paragraph:
            # "".split('\n\n') returns [''], and stray separators give '' as well.
            continue
        if is_key_line(paragraph):
            current_key = paragraph[:-1]  # drop the trailing ':'
            sections.setdefault(current_key, [])
        else:
            sections[current_key].append(paragraph)
    return sections



def clean_pdf_no_join(pdf_path: Path) -> str:
    """
    Return the cleaned text of the whole PDF without rebuilding paragraphs.

    Every page is extracted with pdfplumber and filtered with clean_page. Lines
    are left as they are so that paragraph reconstruction can run later on the
    complete document instead of page by page.
    """
    with pdfplumber.open(str(pdf_path)) as document:
        cleaned_pages = [
            clean_page(page.extract_text()).strip()
            for page in document.pages
        ]
    # A single newline between pages keeps a paragraph that spans a page break in one piece.
    return "\n".join(cleaned_pages)

def clean_pdf_to_sections_structured(pdf_path: Path) -> dict:
    """
    Build the structured record of one ruling.

    The PDF text is cleaned, re-flowed into paragraphs over the whole document
    and split into sections. The result has two keys:
      * "INFORMACION": "MATERIA_PRELIMINAR" (taken from the 'INICIO' section,
        empty string when not found) and "RESUMEN" (always an empty string here);
      * "CONTENIDO": the dict returned by split_into_sections.
    """
    paragraphs_text = join_lines_to_paragraphs(clean_pdf_no_join(pdf_path))
    contenido = split_into_sections(paragraphs_text)
    materia = extract_materia_preliminar(contenido.get('INICIO', []))

    informacion = {
        "MATERIA_PRELIMINAR": materia if materia else "",
        "RESUMEN": "",
    }
    return {"INFORMACION": informacion, "CONTENIDO": contenido}


def main(pdf_dir: str, out_dir: str):
    """
    Export every PDF found under `pdf_dir` (recursively) as a structured .json file.

    The folder layout below `pdf_dir` is reproduced below `out_dir`, which is
    created when missing. Each file holds a one-element JSON list containing the
    record built by clean_pdf_to_sections_structured. When no PDF is found a
    message is printed and nothing else happens.

    Note: this definition reuses the name `main` of the TXT exporter defined in
    an earlier cell and replaces it once this cell has run.
    """
    source_root = Path(pdf_dir).resolve()
    target_root = Path(out_dir).resolve()
    target_root.mkdir(parents=True, exist_ok=True)

    sources = list(source_root.rglob("*.pdf"))
    if len(sources) == 0:
        print(f"No se encontraron PDFs en {pdf_dir}")
        return

    for src in tqdm(sources, desc="Cleaning PDFs"):
        # Same relative location as the source, with the extension swapped to .json.
        json_path = target_root / src.relative_to(source_root).with_suffix(".json")
        json_path.parent.mkdir(parents=True, exist_ok=True)

        record = clean_pdf_to_sections_structured(src)
        payload = json.dumps([record], ensure_ascii=False, indent=2)
        json_path.write_text(payload, encoding="utf-8")


# Export every cropped PDF under PATH_RECORTES as a structured .json file under PATH_JSON.
# `main` here is the JSON exporter defined just above, not the earlier TXT exporter.
main(PATH_RECORTES, PATH_JSON)

Cleaning PDFs: 100%|██████████| 10/10 [00:02<00:00,  4.41it/s]
