In [1]:
import json
import re
from pathlib import Path
from tqdm import tqdm
import os

In [2]:
# Definir rutas
PATH_GLOBAL = os.getcwd()
PATH = os.path.join(PATH_GLOBAL, "datasets")
PATH_JSON = Path(os.path.join(PATH, "fallos_json"))
PATH_ARTICULOS_CITADOS = Path(os.path.join(PATH, "articulos_citados_hard"))

# Ejecutar extracci√≥n (puedes modificar los patrones regex despu√©s)


In [3]:
def extract_cited_articles_and_laws(json_dir: str, output_dir: str, regex_patterns: list = None):
    """
    Extrae art√≠culos y leyes citados de cada subsecci√≥n de CONTENIDO en los JSONs.
    
    Args:
        json_dir: Directorio con los JSONs originales (ej: datasets/fallos_json)
        output_dir: Directorio de salida (ej: datasets/articulos_citados_hard)
        regex_patterns: Lista de patrones regex para extraer citas (opcional)
    """
    json_root = Path(json_dir).resolve()
    output_root = Path(output_dir).resolve()
    output_root.mkdir(parents=True, exist_ok=True)
    
    # Patrones regex mejorados
    if regex_patterns is None:
        regex_patterns = [
            # Art√≠culos con diferentes formatos
            r'-?arts?\.\s*(\d+(?:¬∫|¬∞)?)(?:\s*[,y]\s*(\d+(?:¬∫|¬∞)?))*',  # arts. 3, 14, 29 y 94 o -arts. 1¬∫ y 4¬∫
            r'Art(?:√≠culo)?s?\.\s*(\d+(?:¬∫|¬∞)?)(?:\s*[,y]\s*(\d+(?:¬∫|¬∞)?))*',  # Art. 28, Art√≠culo 45
            r'del\s+art\.?\s*(\d+(?:¬∫|¬∞)?)',  # del art.114
            r'art√≠culos?\s+(\d+(?:¬∫|¬∞)?)(?:\s*[,y]\s*(\d+(?:¬∫|¬∞)?))*',  # art√≠culo 123 y 456
            
            # Leyes con diferentes formatos
            r'ley\s+n?¬∫?\s*(\d+(?:/\d+)?)',  # ley 7046, ley n¬∫ 5678/90
            r'leyes?\s+n?¬∫?\s*(\d+(?:/\d+)?)(?:\s*[,y]\s*(\d+(?:/\d+)?))*',  # leyes 123 y 456
            
            # N√∫meros standalone despu√©s de menciones de art√≠culos (para capturar secuencias)
            r'(?:arts?\.|art√≠culos?|Art\.)\s*[^\d]*(\d+(?:¬∫|¬∞)?(?:\s*[,y]\s*\d+(?:¬∫|¬∞)?)*(?:\s*y\s*\d+(?:¬∫|¬∞)?)?)',
        ]
    
    def extract_numbers_from_match(match_groups):
        """Extrae todos los n√∫meros de los grupos de una coincidencia regex"""
        numbers = []
        for group in match_groups:
            if group:  # Si el grupo no es None
                # Buscar todos los n√∫meros en el grupo
                nums = re.findall(r'\d+(?:¬∫|¬∞)?', group)
                numbers.extend(nums)
        return numbers
    
    def extract_articles_from_text(text):
        """Extrae art√≠culos de un texto usando m√∫ltiples estrategias"""
        all_articles = set()  # Usar set para evitar duplicados
        
        # Estrategia 1: Patrones espec√≠ficos
        for pattern in compiled_patterns:
            matches = pattern.finditer(text)
            for match in matches:
                numbers = extract_numbers_from_match(match.groups())
                all_articles.update(numbers)
        
        # Estrategia 2: Buscar secuencias espec√≠ficas como "3, 14, 29, 30, 63, 64, 71 y 94"
        # Patr√≥n para capturar listas de n√∫meros despu√©s de "arts." o similar
        sequence_pattern = r'(?:-?arts?\.|art√≠culos?|Art\.)\s*([0-9¬∫¬∞,\s\-y]+?)(?:\s+de\s+la\s+ley|\s+Ac\.|\.|\s|$)'
        seq_matches = re.finditer(sequence_pattern, text, re.IGNORECASE)
        for match in seq_matches:
            sequence = match.group(1)
            # Extraer todos los n√∫meros de la secuencia
            nums = re.findall(r'\d+(?:¬∫|¬∞)?', sequence)
            all_articles.update(nums)
        
        return list(all_articles)
    
    # Compilar patrones
    compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in regex_patterns]
    
    json_files = list(json_root.rglob("*.json"))
    if not json_files:
        print(f"No se encontraron JSONs en {json_dir}")
        return
    
    print(f"üîç Extrayendo citas de {len(json_files)} archivos JSON...")
    
    for json_path in tqdm(json_files, desc="Extrayendo citas"):
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)[0]  # Primer elemento de la lista
            
            # Crear estructura de salida
            output_structure = {
                "INFORMACION": data.get("INFORMACION", {}),
                "CONTENIDO": {}
            }
            
            # Procesar cada subsecci√≥n de CONTENIDO
            if 'CONTENIDO' in data:
                for section_name, paragraphs in data['CONTENIDO'].items():
                    cited_articles = []
                    
                    # Buscar citas en cada p√°rrafo de la subsecci√≥n
                    for paragraph in paragraphs:
                        if isinstance(paragraph, str):
                            articles = extract_articles_from_text(paragraph)
                            cited_articles.extend(articles)
                    
                    # Remover duplicados y ordenar
                    cited_articles = sorted(list(set(cited_articles)))
                    
                    # Guardar lista de citas para esta subsecci√≥n
                    output_structure["CONTENIDO"][section_name] = cited_articles
            
            # Crear archivo de salida manteniendo estructura de carpetas
            rel_path = json_path.relative_to(json_root)
            output_path = output_root / rel_path
            output_path.parent.mkdir(parents=True, exist_ok=True)
            
            # Guardar JSON con citas extra√≠das
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump([output_structure], f, ensure_ascii=False, indent=2)
                
        except Exception as e:
            print(f"‚ùå Error procesando {json_path}: {e}")
    
    print(f"‚úÖ Extracci√≥n completada. Archivos guardados en: {output_root}")

In [4]:
extract_cited_articles_and_laws(PATH_JSON, PATH_ARTICULOS_CITADOS)

üîç Extrayendo citas de 296 archivos JSON...


Extrayendo citas: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 296/296 [00:00<00:00, 763.67it/s]

‚ùå Error procesando /Users/brunocr/Documents/UDESA/NLP/TP_NLP/datasets/fallos_json/10/9093.json: Expecting value: line 73 column 5 (char 16919)
‚úÖ Extracci√≥n completada. Archivos guardados en: /Users/brunocr/Documents/UDESA/NLP/TP_NLP/datasets/articulos_citados_hard





In [5]:
import json
import os
from pathlib import Path
from tqdm import tqdm
from typing import List, Optional
from pydantic import BaseModel, Field
from openai import AzureOpenAI
from configs.credentials_config import API_KEY, ENDPOINT, MODEL, DEPLOYMENT

In [9]:
class SubsectionCitations(BaseModel):
    """Schema for citations found in a subsection"""
    verbatim_citations: List[str] = Field(description="List of exact citation phrases as they appear in the text (e.g., 'arts. 1¬∫ y 4¬∫ Ac. Gral. 15/18 SNE', 'arts. 3, 29, 30, 63, 64, 70 y 94 de la ley 7046')", default_factory=list)


In [12]:
# Initialize Azure OpenAI client
client = AzureOpenAI(
    api_version="2025-04-01-preview",
    azure_endpoint=ENDPOINT,
    api_key=API_KEY
)

def extract_citations_with_llm(text: str, section_name: str) -> SubsectionCitations:
    """
    Extract legal citations from text using LLM with structured output
    """
    prompt = f"""
    You are a legal text analyzer. Extract all legal citations as complete verbatim phrases from the following text from the "{section_name}" section of a legal document.

    IMPORTANT: Extract the COMPLETE citation phrases exactly as they appear in the text, including:
    - Article numbers with their source (e.g., "art. 45 del CPCC", "arts. 1¬∫ y 4¬∫ Ac. Gral. 15/18 SNE")
    - Laws with articles (e.g., "arts. 3, 29, 30, 63, 64, 70 y 94 de la ley 7046")
    - Constitutional articles (e.g., "art. 14 de la Constituci√≥n Nacional")
    - Code articles (e.g., "art. 163 del C√≥digo Civil", "art. 280 del CPCC")
    - Procedural references (e.g., "conforme arts. 1¬∫ y 4¬∫ Ac. Gral. 15/18 SNE")
    - Any legal reference with acronyms (CPCC, CN, CC, etc.)

    Text to analyze:
    {text}

    Return ONLY the complete verbatim phrases as they appear in the text. Do NOT break them down or parse them - capture the entire citation phrase including the source/acronym when present.

    Examples of what to extract:
    - "arts. 1¬∫ y 4¬∫ Ac. Gral. 15/18 SNE"
    - "arts. 3, 29, 30, 63, 64, 70 y 94 de la ley 7046"
    - "del art. 114"
    - "art. 45 del CPCC"
    - "Art. 28"
    """

    try:
        response = client.responses.parse(
            model=DEPLOYMENT, #modify this to use responses api, as in Doc's work
            instructions = "You are a precise legal citation extractor. Extract all article and law numbers mentioned in legal texts. Be thorough and accurate.",
            input        = prompt,
            text_format=SubsectionCitations,
            temperature=0.1  # Low temperature for consistency
        )
        
        return response.output_parsed
        
    except Exception as e:
        print(f"Error processing section {section_name}: {e}")
        return SubsectionCitations()

def process_json_with_llm(json_path: Path, output_path: Path):
    """
    Process a single JSON file and extract citations using LLM
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)[0]  # First element of the list
        
        # Create output structure
        output_structure = {
            "INFORMACION": data.get("INFORMACION", {}),
            "CONTENIDO": {}
        }
        
        # Process each subsection of CONTENIDO
        if 'CONTENIDO' in data:
            for section_name, paragraphs in data['CONTENIDO'].items():
                if paragraphs:  # Only process non-empty sections
                    # Join paragraphs into single text for analysis
                    section_text = "\n\n".join(paragraphs) if isinstance(paragraphs, list) else str(paragraphs)
                    
                    # Extract citations using LLM
                    citations = extract_citations_with_llm(section_text, section_name)
                    
                    # Convert to dict format for JSON serialization
                    output_structure["CONTENIDO"][section_name] = citations.model_dump()
                    print(citations.model_dump())
                else:
                    # Empty section
                    output_structure["CONTENIDO"][section_name] = SubsectionCitations().model_dump()
        
        # Save results
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump([output_structure], f, ensure_ascii=False, indent=2)
            
    except Exception as e:
        print(f"‚ùå Error processing {json_path}: {e}")

def extract_citations_with_llm_batch(json_dir: str, output_dir: str):
    """
    Process all JSON files and extract citations using LLM
    """
    json_root = Path(json_dir).resolve()
    output_root = Path(output_dir).resolve()
    output_root.mkdir(parents=True, exist_ok=True)
    
    json_files = list(json_root.rglob("*.json"))
    if not json_files:
        print(f"No se encontraron JSONs en {json_dir}")
        return
    
    print(f"ü§ñ Extrayendo citas con LLM de {len(json_files)} archivos JSON...")
    
    for json_path in tqdm(json_files, desc="Procesando con LLM"):
        # Maintain folder structure
        rel_path = json_path.relative_to(json_root)
        output_path = output_root / rel_path
        
        process_json_with_llm(json_path, output_path)
    
    print(f"‚úÖ Extracci√≥n con LLM completada. Archivos guardados en: {output_root}")


In [None]:

# Define paths
PATH_GLOBAL = os.getcwd()
PATH = os.path.join(PATH_GLOBAL, "datasets")
PATH_JSON = Path(os.path.join(PATH, "fallos_json"))
PATH_ARTICULOS_LLM = Path(os.path.join(PATH, "articulos_extraidos_llm"))


# Execute LLM extraction
extract_citations_with_llm_batch(PATH_JSON, PATH_ARTICULOS_LLM)

In [14]:

class RulingCitation(BaseModel):
    """Schema for a single citation source in a ruling"""
    main_source: str = Field(description="Main legal source (e.g., 'ley 7046', 'Constituci√≥n Nacional', 'C√≥digo Civil', 'CPCC', 'Ac. Gral. 15/18 SNE')")
    cited_articles: Optional[List[int]] = Field(description="List of article numbers cited from this source", default=None)
    extra: Optional[str] = Field(description="Additional information that couldn't be captured in source or articles (e.g., ordinal indicators like '1¬∫', '4¬∫', specific sections)", default=None)

class RulingCitations(BaseModel):
    """Schema for all citations in an entire ruling"""
    citations: List[RulingCitation] = Field(description="List of all legal sources cited in the ruling", default_factory=list)
    
    class Config:
        schema_extra = {
            "example": {
                "citations": [
                    {
                        "main_source": "ley 7046",
                        "cited_articles": [3, 29, 30, 63, 64, 70, 94],
                        "extra": None
                    },
                    {
                        "main_source": "Ac. Gral. 15/18 SNE",
                        "cited_articles": [1, 4],
                        "extra": "1¬∫ y 4¬∫"
                    },
                    {
                        "main_source": "CPCC",
                        "cited_articles": [114],
                        "extra": None
                    }
                ]
            }
        }

* 'schema_extra' has been renamed to 'json_schema_extra'


In [17]:
def aggregate_ruling_citations_with_llm(verbatim_citations_list: List[str]) -> RulingCitations:
    """
    Aggregate all verbatim citations from a ruling into structured RulingCitations
    """
    # Join all citations for analysis
    all_citations_text = "\n".join(verbatim_citations_list)
    
    prompt = f"""
    You are a legal citation analyzer. Take these verbatim legal citations from a judicial ruling and organize them into structured legal sources with their cited articles.

    Verbatim citations to analyze:
    {all_citations_text}

    Your task:
    1. Identify the main legal sources (laws, codes, constitutions, agreements, etc.)
    2. Group article numbers by their source
    3. Capture any additional information that doesn't fit in source/articles

    Examples of how to structure:
    - "arts. 3, 29, 30, 63, 64, 70 y 94 de la ley 7046" ‚Üí main_source: "ley 7046", cited_articles: [3, 29, 30, 63, 64, 70, 94]
    - "arts. 1¬∫ y 4¬∫ Ac. Gral. 15/18 SNE" ‚Üí main_source: "Ac. Gral. 15/18 SNE", cited_articles: [1, 4], extra: "1¬∫ y 4¬∫"
    - "art. 45 del CPCC" ‚Üí main_source: "CPCC", cited_articles: [45]
    - "del art. 114" ‚Üí main_source: "unknown", cited_articles: [114]


    Some of the strings may only contain article numbers without a clear source, like "arts. 1¬∫ y 4¬∫". In these cases, ignore the orphan articles altogether.
    Combine duplicates and organize by source.
    """

    try:
        response = client.responses.parse(
            model=DEPLOYMENT,
            instructions="You are a legal citation organizer. Structure verbatim citations into organized legal sources with their articles.",
            input=prompt,
            text_format=RulingCitations,
            temperature=0.01
        )
        
        return response.output_parsed
        
    except Exception as e:
        print(f"Error aggregating ruling citations: {e}")
        return RulingCitations()

def process_verbatim_to_structured(input_json_path: Path, output_path: Path):
    """
    Process a JSON with verbatim citations and create structured ruling citations
    """
    try:
        with open(input_json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)[0]  # First element of the list
        
        # Collect all verbatim citations from all sections
        all_verbatim_citations = []
        
        if 'CONTENIDO' in data:
            for section_name, section_data in data['CONTENIDO'].items():
                if 'verbatim_citations' in section_data:
                    all_verbatim_citations.extend(section_data['verbatim_citations'])
        
        # Skip if no citations found
        if not all_verbatim_citations:
            print(f"‚ö†Ô∏è  No citations found in {input_json_path.name}")
            return
        
        # Aggregate citations using LLM
        structured_citations = aggregate_ruling_citations_with_llm(all_verbatim_citations)
        
        # Create output structure
        output_structure = {
            "INFORMACION": data.get("INFORMACION", {}),
            "RULING_CITATIONS": structured_citations.model_dump(),
            "METADATA": {
                "total_verbatim_citations": len(all_verbatim_citations),
                "total_structured_sources": len(structured_citations.citations),
                "original_citations": all_verbatim_citations
            }
        }
        
        # Save results
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump([output_structure], f, ensure_ascii=False, indent=2)
            
    except Exception as e:
        print(f"‚ùå Error processing {input_json_path}: {e}")

def create_structured_citations_batch(input_dir: str, output_dir: str):
    """
    Process all verbatim citation JSONs and create structured ruling citations
    """
    input_root = Path(input_dir).resolve()
    output_root = Path(output_dir).resolve()
    output_root.mkdir(parents=True, exist_ok=True)
    
    json_files = list(input_root.rglob("*.json"))
    if not json_files:
        print(f"No se encontraron JSONs en {input_dir}")
        return
    
    print(f"üîÑ Estructurando citas de {len(json_files)} fallos judiciales...")
    
    for json_path in tqdm(json_files, desc="Estructurando citas"):
        # Maintain folder structure
        rel_path = json_path.relative_to(input_root)
        output_path = output_root / rel_path
        
        process_verbatim_to_structured(json_path, output_path)
    
    print(f"‚úÖ Estructuraci√≥n completada. Archivos guardados en: {output_root}")



In [18]:
# Define paths and execute
PATH_ARTICULOS_ESTRUCTURADOS = Path(os.path.join(PATH, "articulos_estructurados"))

# Execute structured citation creation
create_structured_citations_batch(PATH_ARTICULOS_LLM, PATH_ARTICULOS_ESTRUCTURADOS)

üîÑ Estructurando citas de 295 fallos judiciales...


Estructurando citas:  33%|‚ñà‚ñà‚ñà‚ñé      | 96/295 [04:26<09:12,  2.78s/it]


KeyboardInterrupt: 