In [1]:
import json
import os
from pathlib import Path
from tqdm import tqdm
from openai import AzureOpenAI
from configs.credentials_config import API_KEY, ENDPOINT, MODEL, DEPLOYMENT

In [2]:
# Dataset locations, all anchored at the current working directory.
# PATH_GLOBAL and PATH stay plain strings; the two leaf folders are Path objects.
PATH_GLOBAL = os.getcwd()
PATH = os.path.join(PATH_GLOBAL, "datasets")
PATH_JSON = Path(PATH, "fallos_json")       # input: one JSON file per ruling
PATH_SUMMARIES = Path(PATH, "summaries")    # output: generated summaries

In [None]:

# Module-level Azure OpenAI client, built once from the imported credentials
# (API_KEY, ENDPOINT) and pinned to a fixed API version.
client = AzureOpenAI(
    api_key=API_KEY,
    azure_endpoint=ENDPOINT,
    api_version="2025-04-01-preview",
)

# Sentinel returned by summarize_judicial_ruling when the API call fails.
# process_json_for_core_summary compares against it so that a failed call is
# never saved to disk as if it were a real summary.
SUMMARY_ERROR_MESSAGE = "Error al generar el resumen central del fallo judicial"

# Rulings whose stripped text is shorter than this many characters are skipped.
MIN_CONTENT_CHARS = 100


def summarize_judicial_ruling(full_text: str, case_info: dict) -> str:
    """
    Generate the core summary of a whole judicial ruling using Azure OpenAI.

    Args:
        full_text: Text of the ruling; it is embedded verbatim at the end of
            the prompt.
        case_info: Case metadata. When truthy it is serialized as indented
            JSON and added to the prompt as context; otherwise it is omitted.

    Returns:
        The ``output_text`` of the model response. If the request raises any
        ``Exception`` the error is printed (not re-raised) and
        ``SUMMARY_ERROR_MESSAGE`` is returned instead.
    """
    # Optional case metadata block placed before the instructions
    case_context = ""
    if case_info:
        case_context = f"Información del caso: {json.dumps(case_info, ensure_ascii=False, indent=2)}\n\n"

    prompt = f"""
    Eres un experto en derecho que analiza fallos judiciales. 
    Analiza el siguiente fallo judicial completo y extrae la IDEA CENTRAL del fallo.

    {case_context}

    Instrucciones:
    - Identifica la decisión judicial principal y su fundamento
    - Incluye los aspectos legales más relevantes del caso
    - Menciona las leyes y artículos clave que sustentan la decisión
    - Explica brevemente el razonamiento del tribunal
    - Mantén un lenguaje jurídico preciso pero comprensible
    - El resumen debe ser conciso pero completo (máximo 300 palabras)

    Contenido completo del fallo:
    {full_text}
    """

    try:
        response = client.responses.create(
            model=DEPLOYMENT,
            instructions="Eres un experto jurista que extrae la esencia de fallos judiciales, identificando la decisión principal y su fundamento legal.",
            input=prompt,
            temperature=0.2  # Lower temperature for more consistent legal analysis
        )

        return response.output_text

    except Exception as e:
        print(f"Error generating core summary: {e}")
        return SUMMARY_ERROR_MESSAGE


def process_json_for_core_summary(json_path: Path, output_path: Path) -> None:
    """
    Summarize one ruling JSON file and save the result as a new JSON file.

    The input file must contain a JSON list; only its first element is used.
    That element is read as a mapping with an optional "CONTENIDO" entry
    (section name -> paragraphs) and an optional "INFORMACION" entry. All
    non-empty sections are concatenated, each preceded by a
    "--- <section> ---" header, and sent to ``summarize_judicial_ruling``.

    Nothing is written when:
      * the concatenated text is shorter than ``MIN_CONTENT_CHARS`` characters;
      * the summary is empty or equals ``SUMMARY_ERROR_MESSAGE`` (the API call
        failed), so that an error message is never stored as a summary.

    Otherwise a one-element JSON list with the keys "INFORMACION",
    "CORE_SUMMARY" and "METADATA" is written to ``output_path`` (parent
    directories are created as needed).

    Any exception raised while processing is caught and printed; the function
    always returns ``None``.

    Args:
        json_path: Path of the ruling JSON file to read.
        output_path: Path of the summary JSON file to write.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)[0]  # First element of the list

        # A missing or null "CONTENIDO" is treated as "no sections"
        sections = data.get('CONTENIDO') or {}

        # Gather all content from all sections
        full_content = []
        for section_name, paragraphs in sections.items():
            if not paragraphs:
                continue  # Only non-empty sections contribute text

            full_content.append(f"\n--- {section_name} ---\n")

            if isinstance(paragraphs, list):
                # Coerce every item so a single non-string paragraph cannot
                # make the join below fail and drop the whole ruling
                full_content.extend(str(paragraph) for paragraph in paragraphs)
            else:
                full_content.append(str(paragraphs))

        # Join all content into single text
        full_text = "\n\n".join(full_content)

        # Skip if no content
        if len(full_text.strip()) < MIN_CONTENT_CHARS:
            print(f"⚠️  Skipping {json_path.name} - insufficient content")
            return

        # Generate core summary
        case_info = data.get("INFORMACION", {})
        core_summary = summarize_judicial_ruling(full_text, case_info)

        # Do not persist a failed or empty generation as a summary
        summary_failed = (
            not isinstance(core_summary, str)
            or not core_summary.strip()
            or core_summary == SUMMARY_ERROR_MESSAGE
        )
        if summary_failed:
            print(f"⚠️  Skipping {json_path.name} - summary generation failed")
            return

        # Create simplified output structure
        output_structure = {
            "INFORMACION": case_info,
            "CORE_SUMMARY": core_summary,
            "METADATA": {
                "longitud_documento_original": len(full_text),
                "longitud_resumen_central": len(core_summary),
                # full_text has at least MIN_CONTENT_CHARS characters here,
                # so the division cannot be by zero
                "reduccion_porcentual": round((1 - len(core_summary)/len(full_text)) * 100, 2),
                # Every section key is listed, including sections that were empty
                "secciones_analizadas": list(sections)
            }
        }

        # Save results
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump([output_structure], f, ensure_ascii=False, indent=2)

    except Exception as e:
        print(f"❌ Error processing {json_path}: {e}")

def generate_core_summaries_batch(json_dir: str, output_dir: str):
    """
    Create a core summary for every ruling JSON found under ``json_dir``.

    ``*.json`` files are searched recursively, processed in sorted order, and
    each one is handed to ``process_json_for_core_summary`` together with an
    output path that mirrors its location relative to ``json_dir`` inside
    ``output_dir``. Progress and a few debug listings are printed.

    If ``json_dir`` is not an existing directory, or contains no JSON files,
    a message is printed and the function returns without processing anything.

    Args:
        json_dir: Root directory with the input JSON files. The value is
            passed to ``Path(...)``, so a ``Path`` works as well as a string.
        output_dir: Root directory for the generated summaries; created if
            it does not exist.
    """
    json_root = Path(json_dir).resolve()
    output_root = Path(output_dir).resolve()

    # Iterating a missing directory raises FileNotFoundError; report it instead
    if not json_root.is_dir():
        print(f"No se encontraron JSONs en {json_dir} (el directorio no existe)")
        return

    output_root.mkdir(parents=True, exist_ok=True)

    # Debug: Check what folders exist (counts only JSON files directly inside each folder)
    print(f"📁 Folders found in {json_root}:")
    for folder in sorted(json_root.iterdir()):
        if folder.is_dir():
            json_count = sum(1 for _ in folder.glob("*.json"))
            print(f"  {folder.name}: {json_count} JSON files")

    # Sorted so files are processed in a stable order
    json_files = sorted(json_root.rglob("*.json"))

    if not json_files:
        print(f"No se encontraron JSONs en {json_dir}")
        return

    # Debug: Show first few files to be processed
    print("\n📋 First 5 files to process:")
    for i, file in enumerate(json_files[:5], start=1):
        print(f"  {i}. {file.relative_to(json_root)}")

    print(f"\n⚖️  Generando resúmenes centrales de {len(json_files)} fallos judiciales...")

    for json_path in tqdm(json_files, desc="Analizando fallos"):
        # Maintain folder structure
        rel_path = json_path.relative_to(json_root)
        output_path = output_root / rel_path

        # Debug: flag files that live in a folder named exactly "02".
        # Path components are compared (instead of searching for "02/" in the
        # string) so the check works with any path separator and does not
        # match folders such as "2002".
        if "02" in rel_path.parts[:-1]:
            print(f"🔍 Processing 02 folder file: {rel_path}")

        process_json_for_core_summary(json_path, output_path)

    print(f"✅ Resúmenes centrales completados. Archivos guardados en: {output_root}")

In [4]:
# Summarize every ruling JSON under PATH_JSON, mirroring the results into PATH_SUMMARIES.
generate_core_summaries_batch(json_dir=PATH_JSON, output_dir=PATH_SUMMARIES)

⚖️  Generando resúmenes centrales de 296 fallos judiciales...


Analizando fallos:   3%|▎         | 8/296 [00:33<19:49,  4.13s/it]


KeyboardInterrupt: 