In [1]:
import json
import re
from pathlib import Path
from tqdm import tqdm
import os

In [2]:
# Definir rutas
PATH_GLOBAL = os.getcwd()
PATH = os.path.join(PATH_GLOBAL, "datasets")
PATH_JSON = Path(os.path.join(PATH, "fallos_json"))
PATH_ARTICULOS_CITADOS = Path(os.path.join(PATH, "articulos_citados_hard"))

# Ejecutar extracción (puedes modificar los patrones regex después)


In [3]:
def extract_cited_articles_and_laws(json_dir: str, output_dir: str, regex_patterns: list = None):
    """
    Extrae artículos y leyes citados de cada subsección de CONTENIDO en los JSONs.
    
    Args:
        json_dir: Directorio con los JSONs originales (ej: datasets/fallos_json)
        output_dir: Directorio de salida (ej: datasets/articulos_citados_hard)
        regex_patterns: Lista de patrones regex para extraer citas (opcional)
    """
    json_root = Path(json_dir).resolve()
    output_root = Path(output_dir).resolve()
    output_root.mkdir(parents=True, exist_ok=True)
    
    # Patrones regex mejorados
    if regex_patterns is None:
        regex_patterns = [
            # Artículos con diferentes formatos
            r'-?arts?\.\s*(\d+(?:º|°)?)(?:\s*[,y]\s*(\d+(?:º|°)?))*',  # arts. 3, 14, 29 y 94 o -arts. 1º y 4º
            r'Art(?:ículo)?s?\.\s*(\d+(?:º|°)?)(?:\s*[,y]\s*(\d+(?:º|°)?))*',  # Art. 28, Artículo 45
            r'del\s+art\.?\s*(\d+(?:º|°)?)',  # del art.114
            r'artículos?\s+(\d+(?:º|°)?)(?:\s*[,y]\s*(\d+(?:º|°)?))*',  # artículo 123 y 456
            
            # Leyes con diferentes formatos
            r'ley\s+n?º?\s*(\d+(?:/\d+)?)',  # ley 7046, ley nº 5678/90
            r'leyes?\s+n?º?\s*(\d+(?:/\d+)?)(?:\s*[,y]\s*(\d+(?:/\d+)?))*',  # leyes 123 y 456
            
            # Números standalone después de menciones de artículos (para capturar secuencias)
            r'(?:arts?\.|artículos?|Art\.)\s*[^\d]*(\d+(?:º|°)?(?:\s*[,y]\s*\d+(?:º|°)?)*(?:\s*y\s*\d+(?:º|°)?)?)',
        ]
    
    def extract_numbers_from_match(match_groups):
        """Extrae todos los números de los grupos de una coincidencia regex"""
        numbers = []
        for group in match_groups:
            if group:  # Si el grupo no es None
                # Buscar todos los números en el grupo
                nums = re.findall(r'\d+(?:º|°)?', group)
                numbers.extend(nums)
        return numbers
    
    def extract_articles_from_text(text):
        """Extrae artículos de un texto usando múltiples estrategias"""
        all_articles = set()  # Usar set para evitar duplicados
        
        # Estrategia 1: Patrones específicos
        for pattern in compiled_patterns:
            matches = pattern.finditer(text)
            for match in matches:
                numbers = extract_numbers_from_match(match.groups())
                all_articles.update(numbers)
        
        # Estrategia 2: Buscar secuencias específicas como "3, 14, 29, 30, 63, 64, 71 y 94"
        # Patrón para capturar listas de números después de "arts." o similar
        sequence_pattern = r'(?:-?arts?\.|artículos?|Art\.)\s*([0-9º°,\s\-y]+?)(?:\s+de\s+la\s+ley|\s+Ac\.|\.|\s|$)'
        seq_matches = re.finditer(sequence_pattern, text, re.IGNORECASE)
        for match in seq_matches:
            sequence = match.group(1)
            # Extraer todos los números de la secuencia
            nums = re.findall(r'\d+(?:º|°)?', sequence)
            all_articles.update(nums)
        
        return list(all_articles)
    
    # Compilar patrones
    compiled_patterns = [re.compile(pattern, re.IGNORECASE) for pattern in regex_patterns]
    
    json_files = list(json_root.rglob("*.json"))
    if not json_files:
        print(f"No se encontraron JSONs en {json_dir}")
        return
    
    print(f"🔍 Extrayendo citas de {len(json_files)} archivos JSON...")
    
    for json_path in tqdm(json_files, desc="Extrayendo citas"):
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)[0]  # Primer elemento de la lista
            
            # Crear estructura de salida
            output_structure = {
                "INFORMACION": data.get("INFORMACION", {}),
                "CONTENIDO": {}
            }
            
            # Procesar cada subsección de CONTENIDO
            if 'CONTENIDO' in data:
                for section_name, paragraphs in data['CONTENIDO'].items():
                    cited_articles = []
                    
                    # Buscar citas en cada párrafo de la subsección
                    for paragraph in paragraphs:
                        if isinstance(paragraph, str):
                            articles = extract_articles_from_text(paragraph)
                            cited_articles.extend(articles)
                    
                    # Remover duplicados y ordenar
                    cited_articles = sorted(list(set(cited_articles)))
                    
                    # Guardar lista de citas para esta subsección
                    output_structure["CONTENIDO"][section_name] = cited_articles
            
            # Crear archivo de salida manteniendo estructura de carpetas
            rel_path = json_path.relative_to(json_root)
            output_path = output_root / rel_path
            output_path.parent.mkdir(parents=True, exist_ok=True)
            
            # Guardar JSON con citas extraídas
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump([output_structure], f, ensure_ascii=False, indent=2)
                
        except Exception as e:
            print(f"❌ Error procesando {json_path}: {e}")
    
    print(f"✅ Extracción completada. Archivos guardados en: {output_root}")

In [4]:
extract_cited_articles_and_laws(PATH_JSON, PATH_ARTICULOS_CITADOS)

🔍 Extrayendo citas de 296 archivos JSON...


Extrayendo citas: 100%|██████████| 296/296 [00:00<00:00, 860.19it/s]

✅ Extracción completada. Archivos guardados en: /Users/brunocr/Documents/UDESA/NLP/TP_NLP/datasets/articulos_citados_hard





In [5]:
import json
import os
from pathlib import Path
from tqdm import tqdm
from typing import List, Optional
from pydantic import BaseModel, Field
from openai import AzureOpenAI
from configs.credentials_config import API_KEY, ENDPOINT, MODEL, DEPLOYMENT

In [6]:

# Pydantic schemas
class CitedLaw(BaseModel):
    """Schema for a cited law with its articles"""
    number: str = Field(description="Law number (e.g., '7046', '123/90')")
    articles: List[str] = Field(description="List of article numbers cited from this law", default_factory=list)

class SubsectionCitations(BaseModel):
    """Schema for citations found in a subsection"""
    cited_laws: List[CitedLaw] = Field(description="List of laws cited with their articles", default_factory=list)
    standalone_articles: List[str] = Field(description="Articles mentioned without specific law reference", default_factory=list)

class ContentCitations(BaseModel):
    """Schema for all citations in CONTENIDO sections"""
    sections: dict[str, SubsectionCitations] = Field(description="Citations organized by section name", default_factory=dict)


In [9]:
# Initialize Azure OpenAI client
client = AzureOpenAI(
    api_version="2025-04-01-preview",
    azure_endpoint=ENDPOINT,
    api_key=API_KEY
)

def extract_citations_with_llm(text: str, section_name: str) -> SubsectionCitations:
    """
    Extract legal citations from text using LLM with structured output
    """
    prompt = f"""
        You are a legal text analyzer. Extract all legal citations from the following text from the "{section_name}" section of a legal document.

        Find and structure:
        1. Laws with their specific articles (e.g., "ley 7046" with "arts. 3, 14, 29")
        2. Articles mentioned without specific law reference (e.g., "Art. 28", "del art.114")

        Text to analyze:
        {text}

        Extract ALL article and law numbers mentioned. Include ordinal numbers (1º, 4º) and regular numbers.
        Be thorough and don't miss any citations, especially in sequences like "arts. 3, 14, 29, 30, 63, 64, 71 y 94".
        """

    try:
        response = client.responses.parse(
            model=DEPLOYMENT, #modify this to use responses api, as in Doc's work
            instructions = "You are a precise legal citation extractor. Extract all article and law numbers mentioned in legal texts. Be thorough and accurate.",
            input        = prompt,
            text_format=SubsectionCitations,
            temperature=0.1  # Low temperature for consistency
        )
        
        return response.output_parsed
        
    except Exception as e:
        print(f"Error processing section {section_name}: {e}")
        return SubsectionCitations()

def process_json_with_llm(json_path: Path, output_path: Path):
    """
    Process a single JSON file and extract citations using LLM
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)[0]  # First element of the list
        
        # Create output structure
        output_structure = {
            "INFORMACION": data.get("INFORMACION", {}),
            "CONTENIDO": {}
        }
        
        # Process each subsection of CONTENIDO
        if 'CONTENIDO' in data:
            for section_name, paragraphs in data['CONTENIDO'].items():
                if paragraphs:  # Only process non-empty sections
                    # Join paragraphs into single text for analysis
                    section_text = "\n\n".join(paragraphs) if isinstance(paragraphs, list) else str(paragraphs)
                    
                    # Extract citations using LLM
                    citations = extract_citations_with_llm(section_text, section_name)
                    
                    # Convert to dict format for JSON serialization
                    output_structure["CONTENIDO"][section_name] = citations.model_dump()
                    print(citations.model_dump())
                else:
                    # Empty section
                    output_structure["CONTENIDO"][section_name] = SubsectionCitations().model_dump()
        
        # Save results
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump([output_structure], f, ensure_ascii=False, indent=2)
            
    except Exception as e:
        print(f"❌ Error processing {json_path}: {e}")

def extract_citations_with_llm_batch(json_dir: str, output_dir: str):
    """
    Process all JSON files and extract citations using LLM
    """
    json_root = Path(json_dir).resolve()
    output_root = Path(output_dir).resolve()
    output_root.mkdir(parents=True, exist_ok=True)
    
    json_files = list(json_root.rglob("*.json"))
    if not json_files:
        print(f"No se encontraron JSONs en {json_dir}")
        return
    
    print(f"🤖 Extrayendo citas con LLM de {len(json_files)} archivos JSON...")
    
    for json_path in tqdm(json_files, desc="Procesando con LLM"):
        # Maintain folder structure
        rel_path = json_path.relative_to(json_root)
        output_path = output_root / rel_path
        
        process_json_with_llm(json_path, output_path)
    
    print(f"✅ Extracción con LLM completada. Archivos guardados en: {output_root}")


In [11]:

# Define paths
PATH_GLOBAL = os.getcwd()
PATH = os.path.join(PATH_GLOBAL, "datasets")
PATH_JSON = Path(os.path.join(PATH, "fallos_json"))
PATH_ARTICULOS_LLM = Path(os.path.join(PATH, "articulos_extraidos_llm"))


# Execute LLM extraction
extract_citations_with_llm_batch(PATH_JSON, PATH_ARTICULOS_LLM)

🤖 Extrayendo citas con LLM de 296 archivos JSON...


Procesando con LLM:   0%|          | 0/296 [00:00<?, ?it/s]

{'cited_laws': [], 'standalone_articles': ['Art. 28', 'del art. 114']}
{'cited_laws': [], 'standalone_articles': ['Art. 28', 'del art. 114']}
{'cited_laws': [{'number': 'Ley 7046', 'articles': ['arts. 112', 'arts. 113']}, {'number': 'Ley 24.522', 'articles': ['art. 267', 'art. 257']}, {'number': 'LCQ', 'articles': ['art. 267', 'art. 257']}], 'standalone_articles': ['art. 267', 'art. 257', 'art. 112', 'art. 113']}
{'cited_laws': [], 'standalone_articles': []}
{'cited_laws': [], 'standalone_articles': ['Art. 1º']}
{'cited_laws': [{'number': '8985', 'articles': []}], 'standalone_articles': ['Art. 1º', 'Art. 4º']}


Procesando con LLM:   0%|          | 1/296 [00:12<1:02:20, 12.68s/it]

{'cited_laws': [{'number': 'Ac. Gral. 15/18 SNE', 'articles': ['1º', '4º']}, {'number': 'Acuerdo Gral. 11/20', 'articles': ['23', '6', '2020']}], 'standalone_articles': []}
{'cited_laws': [], 'standalone_articles': ['Art. 1º', 'Art. 4º', 'arts. 3', 'arts. 14', 'arts. 29', 'arts. 30', 'arts. 63', 'arts. 64', 'arts. 71', 'arts. 94']}
{'cited_laws': [], 'standalone_articles': ['Art. 1', 'Art. 3', 'Art. 4', 'Art. 14', 'Art. 29', 'Art. 30', 'Art. 63', 'Art. 64', 'Art. 71', 'Art. 94', 'del art. 114']}
{'cited_laws': [{'number': 'CPCC', 'articles': ['280', '269', '281', '42']}, {'number': 'CN', 'articles': ['18']}, {'number': 'Expte.', 'articles': ['8946', '2095', '7567', '8272']}], 'standalone_articles': ['art. 280', 'art. 18', 'art. 42']}


Procesando con LLM:   1%|          | 2/296 [00:19<46:18,  9.45s/it]  

{'cited_laws': [{'number': 'Ac. Gral 15/18 SNE', 'articles': ['1', '4']}, {'number': 'Acuerdo Gral. 11/20', 'articles': ['23', '6', '2020']}], 'standalone_articles': ['art. 1', 'art. 4', 'Punto 4°']}
{'cited_laws': [], 'standalone_articles': ['Art. 1', 'Art. 4', 'Art. 14', 'Art. 29', 'Art. 30', 'Art. 63', 'Art. 64', 'Art. 71', 'Art. 94', 'del art. 114']}
{'cited_laws': [{'number': '11843', 'articles': ['1']}], 'standalone_articles': ['Art. 28', 'del art. 114']}
{'cited_laws': [{'number': 'LOPJ', 'articles': ['61', '62', '63', '75']}, {'number': 'CPCC', 'articles': ['3', '5', '6', '38']}, {'number': 'LF nro.', 'articles': ['11843']}], 'standalone_articles': ['art. 38', 'art. 3', 'art. 75', 'art. 205 inc. 1 i)']}


Procesando con LLM:   1%|          | 3/296 [00:26<39:44,  8.14s/it]

{'cited_laws': [{'number': 'LOPJ', 'articles': ['75']}, {'number': 'Ac. Gral. 15/18 SNE', 'articles': ['1º', '4º']}, {'number': 'Acuerdo Gral. 11/20', 'articles': ['23', '4°']}], 'standalone_articles': []}
{'cited_laws': [], 'standalone_articles': ['Art. 1º', 'Art. 4º']}
{'cited_laws': [{'number': 'Ley 10065', 'articles': ['arts. 1', 'arts. 2', 'arts. 3', 'arts. 4']}, {'number': 'Ley 921', 'articles': ['arts. 1', 'arts. 2']}], 'standalone_articles': ['Art. 28', 'del art. 114']}
{'cited_laws': [{'number': 'LOPJ', 'articles': ['61', '62', '63', '75']}, {'number': 'CPP', 'articles': ['38']}, {'number': 'CPCC', 'articles': ['3', '5', '6']}, {'number': 'Constitución Provincial', 'articles': ['205']}], 'standalone_articles': ['Art. 38', 'art. 75', 'artículo 61', 'artículo 62', 'artículo 63', 'artículo 3', 'artículo 5', 'artículo 6', 'artículo 205']}


Procesando con LLM:   1%|▏         | 4/296 [00:33<36:40,  7.54s/it]

{'cited_laws': [{'number': 'LOPJ', 'articles': ['75']}, {'number': 'Ac. Gral. 15/18 SNE', 'articles': ['1º', '4º']}, {'number': 'Acuerdo Gral. 11/20', 'articles': ['23', '4°']}], 'standalone_articles': ['art. 75', 'arts. 1º', '4º']}
{'cited_laws': [], 'standalone_articles': ['Art. 1553', 'Art. 9024']}
{'cited_laws': [], 'standalone_articles': ['Art. 1º', 'Art. 4º', 'Art. 28', 'del art. 114']}
{'cited_laws': [{'number': 'CPP', 'articles': ['38']}, {'number': 'LOPJ', 'articles': ['61', '62', '63', '75']}, {'number': 'CPCC', 'articles': ['5', '6']}, {'number': 'Constitución Provincial', 'articles': ['205']}], 'standalone_articles': ['art. 3', 'art. 38', 'art. 75', 'art. 61', 'art. 62', 'art. 63', 'art. 5', 'art. 6', 'art. 205']}


Procesando con LLM:   2%|▏         | 5/296 [00:39<34:10,  7.05s/it]

{'cited_laws': [{'number': 'LOPJ', 'articles': ['75']}, {'number': 'Ac. Gral. 15/18 SNE', 'articles': ['1º', '4º']}, {'number': 'Acuerdo Gral. 11/20', 'articles': ['23', '4']}], 'standalone_articles': ['art. 75', 'arts. 1º', '4º', 'Punto 4°']}
{'cited_laws': [{'number': '12780', 'articles': []}], 'standalone_articles': []}
{'cited_laws': [{'number': '12780', 'articles': ['1']}], 'standalone_articles': ['Art. 1', 'art. 3', 'art. 14', 'art. 29', 'art. 30', 'art. 63', 'art. 64', 'art. 71', 'art. 94']}
{'cited_laws': [{'number': 'LOPJ', 'articles': ['61', '62', '63', '75']}, {'number': 'CPCC', 'articles': ['3', '5', '6', '38']}, {'number': 'Constitución Provincial', 'articles': ['205']}], 'standalone_articles': ['art. 38', 'art. 3', 'art. 75']}


Procesando con LLM:   2%|▏         | 6/296 [00:46<34:42,  7.18s/it]

{'cited_laws': [{'number': 'LOPJ', 'articles': ['75']}, {'number': 'Ac. Gral. 15/18 SNE', 'articles': ['1º', '4º']}, {'number': 'Acuerdo Gral. 11/20', 'articles': ['23', '4°']}, {'number': 'L F nro. 12780', 'articles': []}], 'standalone_articles': ['art. 75', 'arts. 1º', 'arts. 4º', 'art. 4°']}
{'cited_laws': [], 'standalone_articles': ['Art. 1º', 'Art. 4º', 'Art. 3', 'Art. 14', 'Art. 29', 'Art. 30', 'Art. 63', 'Art. 64', 'Art. 71', 'Art. 94']}
{'cited_laws': [], 'standalone_articles': ['Art. 28', 'del art. 114']}
{'cited_laws': [{'number': 'CPCC', 'articles': ['301', '248', '282', '65']}, {'number': 'ley 9776', 'articles': []}, {'number': 'CN', 'articles': ['17', '18', '19']}], 'standalone_articles': ['art. 301 inc. 3', 'art. 301 inc. d)', 'art. 248', 'art. 282']}
{'cited_laws': [], 'standalone_articles': ['Art. 28', 'del art. 114']}
{'cited_laws': [{'number': 'CIV 073468/2011/1/RH001', 'articles': []}, {'number': 'FPO 6333/2014/1/RH1', 'articles': []}], 'standalone_articles': ['Art. 

Procesando con LLM:   2%|▏         | 7/296 [00:58<41:29,  8.61s/it]

{'cited_laws': [{'number': 'CPCC', 'articles': ['65']}, {'number': 'Ac. Gral. 15/18 SNE', 'articles': ['1º', '4º']}, {'number': 'Acuerdo Gral. 11/20', 'articles': ['4°']}], 'standalone_articles': ['art. 65']}
{'cited_laws': [], 'standalone_articles': ['Art. 1º', 'Art. 4º']}


Procesando con LLM:   2%|▏         | 7/296 [01:01<42:22,  8.80s/it]


KeyboardInterrupt: 