In [19]:
import json
import os
from pathlib import Path
from tqdm import tqdm
from openai import AzureOpenAI
from configs.credentials_config import API_KEY, ENDPOINT, MODEL, DEPLOYMENT

In [20]:
# Dataset locations, all anchored at the directory the notebook is launched from.
PATH_GLOBAL = os.getcwd()
PATH = os.path.join(PATH_GLOBAL, "datasets")
PATH_JSON = Path(PATH) / "fallos_json"     # input rulings (JSON)
PATH_SUMMARIES = Path(PATH) / "summaries"  # generated summaries

In [21]:

# Module-level Azure OpenAI client; the summarisation function below issues
# its requests through this single instance. Key and endpoint are the values
# imported from configs.credentials_config.
client = AzureOpenAI(
    api_key=API_KEY,
    azure_endpoint=ENDPOINT,
    api_version="2025-04-01-preview",
)

def summarize_judicial_ruling(full_text: str, case_info: dict) -> str:
    """
    Ask the Azure OpenAI deployment for the central idea of a complete ruling.

    Args:
        full_text: Text of the ruling; embedded verbatim at the end of the prompt.
        case_info: Case metadata. When truthy it is serialised as indented JSON
            and placed ahead of the instructions; when empty it is left out.

    Returns:
        The model's output text. If the request (or reading its output) raises,
        the exception is printed and a fixed Spanish error message is returned
        instead of propagating the error.
    """
    # Optional metadata preamble that gives the model context about the case
    metadata_block = (
        f"Información del caso: {json.dumps(case_info, ensure_ascii=False, indent=2)}\n\n"
        if case_info
        else ""
    )

    user_prompt = f"""
    Eres un experto en derecho que analiza fallos judiciales. 
    Analiza el siguiente fallo judicial completo y extrae la IDEA CENTRAL del fallo.

    {metadata_block}

    Instrucciones:
    - Identifica la decisión judicial principal y su fundamento
    - Incluye los aspectos legales más relevantes del caso
    - Menciona las leyes y artículos clave que sustentan la decisión
    - Explica brevemente el razonamiento del tribunal
    - Mantén un lenguaje jurídico preciso pero comprensible
    - El resumen debe ser conciso pero completo (máximo 300 palabras)

    Contenido completo del fallo:
    {full_text}
    """

    system_instructions = "Eres un experto jurista que extrae la esencia de fallos judiciales, identificando la decisión principal y su fundamento legal."

    try:
        completion = client.responses.create(
            model=DEPLOYMENT,
            instructions=system_instructions,
            input=user_prompt,
            # Low temperature keeps the legal analysis consistent between runs
            temperature=0.2,
        )
        return completion.output_text
    except Exception as e:
        print(f"Error generating core summary: {e}")
        return "Error al generar el resumen central del fallo judicial"

def process_json_for_core_summary(json_path: Path, output_path: Path):
    """
    Build and save the core summary for a single ruling JSON file.

    The input file must contain a list whose first element is a dict. Every
    non-empty entry under its 'CONTENIDO' key is concatenated (with a header per
    section) into one text, which is sent to ``summarize_judicial_ruling``
    together with the 'INFORMACION' entry. The result is written to
    ``output_path`` as a one-element JSON list.

    Nothing is written when the ruling has fewer than 100 characters of
    content, when the summariser returns its failure placeholder (or an empty
    result), or when any exception occurs; each of these cases is reported with
    a printed message rather than raised, so a batch run can continue.

    Args:
        json_path: Source JSON file of the ruling.
        output_path: Destination file; parent directories are created on demand.

    Returns:
        True if a summary file was written, False otherwise.
    """
    # Text returned by summarize_judicial_ruling when the API call fails.
    # It must never be stored as if it were a real summary.
    failed_summary_marker = "Error al generar el resumen central del fallo judicial"
    min_content_chars = 100

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)[0]  # First element of the list

        sections = data.get('CONTENIDO') or {}

        # Gather all content from all non-empty sections
        full_content = []
        for section_name, paragraphs in sections.items():
            if not paragraphs:
                continue
            full_content.append(f"\n--- {section_name} ---\n")
            if isinstance(paragraphs, list):
                # Coerce each item: a stray number/None would make join() fail
                full_content.extend(str(paragraph) for paragraph in paragraphs)
            else:
                full_content.append(str(paragraphs))

        # Join all content into single text
        full_text = "\n\n".join(full_content)

        # Skip if no content
        if len(full_text.strip()) < min_content_chars:
            print(f"⚠️  Skipping {json_path.name} - insufficient content")
            return False

        # Generate core summary
        case_info = data.get("INFORMACION", {})
        core_summary = summarize_judicial_ruling(full_text, case_info)

        # A failed or empty generation must not be persisted as a summary
        if not core_summary or core_summary == failed_summary_marker:
            print(f"⚠️  Skipping {json_path.name} - summary generation failed")
            return False

        # Create simplified output structure
        output_structure = {
            "INFORMACION": case_info,
            "CORE_SUMMARY": core_summary,
            "METADATA": {
                "longitud_documento_original": len(full_text),
                "longitud_resumen_central": len(core_summary),
                "reduccion_porcentual": round((1 - len(core_summary) / len(full_text)) * 100, 2),
                "secciones_analizadas": list(sections.keys())
            }
        }

        # Save results
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump([output_structure], f, ensure_ascii=False, indent=2)
        return True

    except Exception as e:
        # Best effort: report the problem and let the caller move on
        print(f"❌ Error processing {json_path}: {e}")
        return False

def generate_core_summaries_batch(json_dir: str, output_dir: str, skip_existing: bool = False):
    """
    Create a core summary for every JSON file found under ``json_dir``.

    Files are discovered recursively and processed in sorted order. Each
    summary is written under ``output_dir`` at the same relative path as its
    source file, so the folder structure is mirrored.

    Args:
        json_dir: Root directory holding the ruling JSON files (str or Path).
        output_dir: Root directory for the generated summaries (str or Path);
            created if missing.
        skip_existing: When True, files whose output already exists are not
            processed again, which lets an interrupted run be resumed without
            repeating model requests. Defaults to False (regenerate everything).

    Returns:
        None. Progress and problems are reported through printed messages.
    """
    json_root = Path(json_dir).resolve()
    output_root = Path(output_dir).resolve()

    # Without this guard iterdir() below raises on a missing/mistyped path
    if not json_root.is_dir():
        print(f"No existe el directorio de entrada: {json_root}")
        return

    output_root.mkdir(parents=True, exist_ok=True)

    # Debug: Check what folders exist
    print(f"📁 Folders found in {json_root}:")
    for folder in sorted(json_root.iterdir()):
        if folder.is_dir():
            json_count = sum(1 for _ in folder.glob("*.json"))
            print(f"  {folder.name}: {json_count} JSON files")

    # Sorted so files are processed in a stable order
    json_files = sorted(json_root.rglob("*.json"))

    if not json_files:
        print(f"No se encontraron JSONs en {json_dir}")
        return

    # Debug: Show first few files to be processed
    print(f"\n📋 First 5 files to process:")
    for position, file in enumerate(json_files[:5], start=1):
        print(f"  {position}. {file.relative_to(json_root)}")

    print(f"\n⚖️  Generando resúmenes centrales de {len(json_files)} fallos judiciales...")

    skipped = 0
    for json_path in tqdm(json_files, desc="Analizando fallos"):
        # Maintain folder structure
        rel_path = json_path.relative_to(json_root)
        output_path = output_root / rel_path

        if skip_existing and output_path.exists():
            skipped += 1
            continue

        # Debug: Show which file is being processed.
        # as_posix() keeps the "/" separator check valid on every platform.
        if "02/" in rel_path.as_posix():
            print(f"🔍 Processing 02 folder file: {rel_path}")

        process_json_for_core_summary(json_path, output_path)

    if skipped:
        print(f"⏭️  {skipped} archivos omitidos porque ya tenían resumen")

    print(f"✅ Resúmenes centrales completados. Archivos guardados en: {output_root}")

In [17]:
# Full batch run over every ruling; the recorded run below took about 22 minutes for 296 files.
generate_core_summaries_batch(json_dir=PATH_JSON, output_dir=PATH_SUMMARIES)

📁 Folders found in /Users/brunocr/Documents/UDESA/NLP/TP_NLP/datasets/fallos_json:
  02: 10 JSON files
  03: 30 JSON files
  04: 9 JSON files
  05: 34 JSON files
  06: 45 JSON files
  07: 12 JSON files
  08: 36 JSON files
  09: 19 JSON files
  10: 40 JSON files
  11: 30 JSON files
  12: 31 JSON files

📋 First 5 files to process:
  1. 02/8104.json
  2. 02/8142.json
  3. 02/8344.json
  4. 02/8569.json
  5. 02/8752.json

⚖️  Generando resúmenes centrales de 296 fallos judiciales...


Analizando fallos:   0%|          | 0/296 [00:00<?, ?it/s]

🔍 Processing 02 folder file: 02/8104.json


Analizando fallos:   0%|          | 1/296 [00:04<24:08,  4.91s/it]

🔍 Processing 02 folder file: 02/8142.json


Analizando fallos:   1%|          | 2/296 [00:08<20:13,  4.13s/it]

🔍 Processing 02 folder file: 02/8344.json


Analizando fallos:   1%|          | 3/296 [00:14<23:58,  4.91s/it]

🔍 Processing 02 folder file: 02/8569.json


Analizando fallos:   1%|▏         | 4/296 [00:18<22:24,  4.60s/it]

🔍 Processing 02 folder file: 02/8752.json


Analizando fallos:   2%|▏         | 5/296 [00:22<20:49,  4.29s/it]

🔍 Processing 02 folder file: 02/8865.json


Analizando fallos:   2%|▏         | 6/296 [00:25<19:53,  4.11s/it]

🔍 Processing 02 folder file: 02/8926.json


Analizando fallos:   2%|▏         | 7/296 [00:31<22:24,  4.65s/it]

🔍 Processing 02 folder file: 02/8927.json


Analizando fallos:   3%|▎         | 8/296 [00:35<20:57,  4.37s/it]

🔍 Processing 02 folder file: 02/8948.json


Analizando fallos:   3%|▎         | 9/296 [00:40<21:36,  4.52s/it]

🔍 Processing 02 folder file: 02/8971.json


Analizando fallos:  70%|███████   | 208/296 [15:15<08:08,  5.55s/it]

❌ Error processing /Users/brunocr/Documents/UDESA/NLP/TP_NLP/datasets/fallos_json/10/9093.json: Expecting value: line 73 column 5 (char 16919)


Analizando fallos: 100%|██████████| 296/296 [21:45<00:00,  4.41s/it]

✅ Resúmenes centrales completados. Archivos guardados en: /Users/brunocr/Documents/UDESA/NLP/TP_NLP/datasets/summaries





In [22]:
def regenerate_single_summary(relative_file_path: str):
    """
    Regenerate the summary for one specific ruling.

    Success is reported only when this call actually produced the output file.
    A summary left over from an earlier run does not count, so a failed or
    skipped regeneration is no longer reported as successful.

    Args:
        relative_file_path: Path relative to the json root, e.g., "03/5024.json"

    Returns:
        None. The outcome is reported through printed messages.
    """
    json_root = Path(PATH_JSON).resolve()
    output_root = Path(PATH_SUMMARIES).resolve()

    # Construct full paths
    json_path = json_root / relative_file_path
    output_path = output_root / relative_file_path

    # Check if source file exists
    if not json_path.exists():
        print(f"❌ Source file not found: {json_path}")
        return

    print(f"🔄 Regenerating summary for: {relative_file_path}")

    # Remember the state of any previous output so a stale file is not
    # mistaken for a fresh one after processing.
    previous_mtime = output_path.stat().st_mtime_ns if output_path.exists() else None

    # Process the file
    outcome = process_json_for_core_summary(json_path, output_path)

    if isinstance(outcome, bool):
        # The processing step reported its own result
        regenerated = outcome and output_path.exists()
    else:
        # No explicit result: the file must exist and have been rewritten
        regenerated = (
            output_path.exists()
            and output_path.stat().st_mtime_ns != previous_mtime
        )

    if regenerated:
        print(f"✅ Summary regenerated successfully: {output_path}")
    else:
        print(f"❌ Failed to regenerate summary for: {relative_file_path}")


In [40]:

# Example: rebuild the summary of a single ruling, addressed relative to PATH_JSON
regenerate_single_summary(relative_file_path="12/9180.json")


🔄 Regenerating summary for: 12/9180.json
✅ Summary regenerated successfully: /Users/brunocr/Documents/UDESA/NLP/TP_NLP/datasets/summaries/12/9180.json
