In [1]:
import json
import os
from pathlib import Path
from tqdm import tqdm
from openai import AzureOpenAI
from configs.credentials_config import API_KEY, ENDPOINT, MODEL, DEPLOYMENT

In [2]:
# Dataset locations, all resolved from the directory the notebook is run in.
# PATH_GLOBAL and PATH stay plain strings; the two leaf folders are Path objects.
PATH_GLOBAL = os.getcwd()
PATH = os.path.join(PATH_GLOBAL, "datasets")
PATH_JSON = Path(PATH) / "fallos_json"
PATH_SUMMARIES = Path(PATH) / "summaries"

In [7]:

# Module-level Azure OpenAI client; summarize_legal_document() sends its
# requests through this object.
client = AzureOpenAI(
    azure_endpoint=ENDPOINT,
    api_key=API_KEY,
    api_version="2025-04-01-preview",
)

def summarize_legal_document(text: str, section_name: str) -> str:
    """
    Request a short summary of one document section from Azure OpenAI.

    Args:
        text: Full text of the section to condense.
        section_name: Section label; interpolated into the prompt and into
            the error messages.

    Returns:
        The ``output_text`` of the model response. If anything raises while
        building or reading the request, the error is printed and a Spanish
        placeholder string naming the section is returned instead, so no
        exception reaches the caller.
    """
    system_instructions = "Eres un experto legal que resume documentos jurídicos de manera precisa y concisa."

    # The user prompt carries the summarization rules plus the raw section text.
    user_prompt = f"""
    Eres un asistente legal especializado en resumir documentos jurídicos. 
    Resume el siguiente texto de la sección "{section_name}" de un documento legal.

    Instrucciones:
    - Proporciona un resumen conciso pero completo
    - Mantén los aspectos legales más importantes
    - Incluye menciones a leyes y artículos relevantes
    - Usa lenguaje claro y profesional
    - El resumen debe tener menos de 200 palabras

    Texto a resumir:
    {text}
    """

    try:
        completion = client.responses.create(
            model=DEPLOYMENT,
            instructions=system_instructions,
            input=user_prompt,
            temperature=0.3,
        )
        # Read the text inside the try so a malformed response is also
        # reported through the placeholder path below.
        return completion.output_text
    except Exception as e:
        print(f"Error summarizing section {section_name}: {e}")
        return f"Error al resumir la sección {section_name}"

def _reduction_percentage(original_length: int, summary_length: int) -> float:
    """Percentage by which the summary is shorter than the original; 0 for an empty original."""
    if original_length <= 0:
        return 0
    return round((1 - summary_length / original_length) * 100, 2)


def _build_section_entry(section_name: str, paragraphs, min_chars_to_summarize: int) -> dict:
    """Build the CONTENIDO_RESUMIDO entry (summary text plus length metadata) for one section."""
    if not paragraphs:
        # Empty section: nothing to summarize.
        return {
            "resumen": "",
            "longitud_original": 0,
            "longitud_resumen": 0,
            "reduccion_porcentual": 0
        }

    if isinstance(paragraphs, list):
        # Coerce items to str (dropping None) so one stray non-string
        # paragraph does not make the whole document fail in join().
        section_text = "\n\n".join(str(p) for p in paragraphs if p is not None)
    else:
        section_text = str(paragraphs)

    stripped_text = section_text.strip()
    if len(stripped_text) < min_chars_to_summarize:
        # Too short to be worth an LLM call: keep the text verbatim.
        # longitud_resumen is recorded here as well; without it the
        # document-level totals would count this section's original length
        # but zero summary length and overstate the overall reduction.
        return {
            "resumen": stripped_text,
            "longitud_original": len(section_text),
            "longitud_resumen": len(stripped_text),
            "reduccion_porcentual": _reduction_percentage(len(section_text), len(stripped_text)),
            "es_contenido_original": True
        }

    summary = summarize_legal_document(section_text, section_name)
    return {
        "resumen": summary,
        "longitud_original": len(section_text),
        "longitud_resumen": len(summary),
        "reduccion_porcentual": _reduction_percentage(len(section_text), len(summary))
    }


def process_json_for_summary(json_path: Path, output_path: Path, min_chars_to_summarize: int = 50) -> None:
    """
    Summarize every CONTENIDO section of one JSON document and save the result.

    The input file must contain a JSON list; only its first element is read.
    The output file is a one-element JSON list holding:
      - "INFORMACION": copied from the input (``{}`` when absent),
      - "CONTENIDO_RESUMIDO": one entry per section with the summary and
        its length metadata,
      - "ESTADISTICAS_RESUMEN": section count and aggregated lengths.

    Args:
        json_path: Source JSON file.
        output_path: Destination file; missing parent folders are created.
        min_chars_to_summarize: Sections whose stripped text is shorter than
            this are stored verbatim (flagged "es_contenido_original")
            instead of being sent to the LLM.

    Returns:
        None. Any exception is printed together with ``json_path`` and
        swallowed, so one bad document does not stop a batch run.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)[0]  # First element of the list

        # A missing or null CONTENIDO is treated as "no sections".
        content = data.get("CONTENIDO") or {}
        summarized_sections = {
            section_name: _build_section_entry(section_name, paragraphs, min_chars_to_summarize)
            for section_name, paragraphs in content.items()
        }

        # Document-level statistics; every entry carries both length keys.
        total_original = sum(entry["longitud_original"] for entry in summarized_sections.values())
        total_summary = sum(entry["longitud_resumen"] for entry in summarized_sections.values())

        output_structure = {
            "INFORMACION": data.get("INFORMACION", {}),
            "CONTENIDO_RESUMIDO": summarized_sections,
            "ESTADISTICAS_RESUMEN": {
                "total_secciones": len(summarized_sections),
                "longitud_total_original": total_original,
                "longitud_total_resumen": total_summary,
                "reduccion_total_porcentual": _reduction_percentage(total_original, total_summary)
            }
        }

        # Save results
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump([output_structure], f, ensure_ascii=False, indent=2)

    except Exception as e:
        print(f"❌ Error processing {json_path}: {e}")

def _has_valid_output(path: Path) -> bool:
    """Return True when `path` is an existing file that parses as JSON."""
    if not path.is_file():
        return False
    try:
        with open(path, 'r', encoding='utf-8') as f:
            json.load(f)
    except (OSError, ValueError):
        # Unreadable or truncated/corrupt file: treat it as not done.
        return False
    return True


def summarize_documents_batch(json_dir: "str | Path", output_dir: "str | Path", skip_existing: bool = False):
    """
    Summarize every ``*.json`` file found under `json_dir` (recursively).

    Each input is handed to ``process_json_for_summary`` and its output is
    written to the same relative location under `output_dir`.

    Args:
        json_dir: Root folder searched recursively for JSON documents.
        output_dir: Root folder for the summaries; created if missing.
        skip_existing: When True, inputs whose output file already exists
            and parses as JSON are not processed again. This lets an
            interrupted run be resumed without repeating LLM calls. The
            default (False) reprocesses and overwrites everything.

    Returns:
        None. Progress and a completion message are printed.
    """
    json_root = Path(json_dir).resolve()
    output_root = Path(output_dir).resolve()
    output_root.mkdir(parents=True, exist_ok=True)

    # rglob yields files in arbitrary order; sort for a stable run order.
    json_files = sorted(json_root.rglob("*.json"))
    if not json_files:
        print(f"No se encontraron JSONs en {json_dir}")
        return

    # Pair each input with its output, keeping the folder structure.
    jobs = [(json_path, output_root / json_path.relative_to(json_root)) for json_path in json_files]

    if skip_existing:
        pending = [(src, dst) for src, dst in jobs if not _has_valid_output(dst)]
        skipped = len(jobs) - len(pending)
        if skipped:
            print(f"⏭️ Omitiendo {skipped} documentos ya resumidos")
        jobs = pending

    print(f"📝 Resumiendo {len(jobs)} documentos legales...")

    for json_path, output_path in tqdm(jobs, desc="Generando resúmenes"):
        process_json_for_summary(json_path, output_path)

    print(f"✅ Resúmenes completados. Archivos guardados en: {output_root}")


In [8]:
# Summarize every JSON found under PATH_JSON; each summary is written to the
# same relative path under PATH_SUMMARIES.
summarize_documents_batch(PATH_JSON, PATH_SUMMARIES)

📝 Resumiendo 296 documentos legales...


Generando resúmenes:   1%|▏         | 4/296 [00:42<52:13, 10.73s/it]  


KeyboardInterrupt: 