# 1. Preparación del entorno

In [None]:
# Install
!pip install tabula-py
!pip install pyPDF2
# Import
import tabula as tb
import pandas as pd
import requests
import base64
import json
from datetime import datetime, timedelta
import time
import os
import PyPDF2
import re

# 2. Descarga de PDFs de BORA
BORA genera PDFs de los boletines de cada día.
La primera sección es la que contiene Legislación y Avisos Oficiales.

In [None]:
class BoletinDownloader:
    """Download section PDFs from boletinoficial.gob.ar for given dates.

    ``download_section`` first GETs the section page for the requested date
    and then POSTs to the ``/pdf/download_section`` endpoint, whose JSON
    answer is expected to carry the PDF as base64 under ``pdfBase64``.
    Cookies exported from a browser (Netscape ``cookies.txt`` format) can be
    supplied and are re-applied to the session before every download.
    """

    def __init__(self, download_folder="boletin_pdfs", cookie_file="cookies.txt"):
        """Create the HTTP session, load cookies and ensure the output folder.

        Args:
            download_folder: Directory where PDFs are written; created if
                it does not exist.
            cookie_file: Path to a Netscape-format cookie file. A falsy value
                or a missing file results in an empty cookie jar.
        """
        self.session = requests.Session()
        self.base_url = "https://www.boletinoficial.gob.ar"
        self.download_url = f"{self.base_url}/pdf/download_section"
        self.download_folder = download_folder
        self.base_cookies = self._load_cookies_from_file(cookie_file)

        os.makedirs(download_folder, exist_ok=True)

        self.user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        )

    def _load_cookies_from_file(self, cookie_file):
        """Load browser cookies (Netscape format) so the session keeps the date.

        Blank lines, comment lines and lines that do not have exactly seven
        tab-separated fields are skipped. Lines prefixed with ``#HttpOnly_``
        are treated as cookies, not comments.

        Args:
            cookie_file: Path to the cookie file, or a falsy value for none.

        Returns:
            A ``RequestsCookieJar`` (empty when no usable file is given).
        """
        jar = requests.cookies.RequestsCookieJar()
        if not cookie_file:
            return jar
        if not os.path.exists(cookie_file):
            print(f"Cookie file not found, continuing without it: {cookie_file}")
            return jar

        http_only_prefix = "#HttpOnly_"

        with open(cookie_file, "r", encoding="utf-8") as f:
            for raw_line in f:
                line = raw_line.strip()
                if line.startswith(http_only_prefix):
                    # HttpOnly cookies are exported behind a comment-like prefix.
                    line = line[len(http_only_prefix):]
                elif not line or line.startswith("#"):
                    continue

                # Explicit escape instead of an invisible literal tab character.
                parts = line.split("\t")
                if len(parts) != 7:
                    continue
                domain, _flag, path, secure_flag, expiration, name, value = parts

                # "0" marks a session cookie; an unparsable expiry is treated
                # the same way instead of aborting the whole load.
                try:
                    expires = int(expiration) or None
                except ValueError:
                    expires = None

                cookie = requests.cookies.create_cookie(
                    domain=domain,
                    name=name,
                    value=value,
                    path=path,
                    secure=secure_flag.upper() == "TRUE",
                    expires=expires
                )
                jar.set_cookie(cookie)

        return jar

    def download_section(self, section, date):
        """Download a specific section for a given date (date is read from the session).

        Args:
            section: Section name used in the URL and in the POST payload
                (e.g. ``"primera"``).
            date: Object with ``strftime`` (e.g. ``datetime``) selecting the day.

        Returns:
            ``True`` when the PDF already exists locally or was downloaded,
            ``False`` on any HTTP, network, decoding or I/O failure. Failures
            are reported with ``print`` and never raised, so a batch loop can
            continue with the next date.
        """
        date_str = date.strftime("%Y%m%d")
        filename = f"seccion_{section}_{date_str}.pdf"
        filepath = os.path.join(self.download_folder, filename)

        if os.path.exists(filepath):
            print(f"✓ File already exists: {filename}")
            return True

        # Reset cookies so each download starts from the exported browser session
        self.session.cookies.clear()
        if self.base_cookies:
            self.session.cookies.update(self.base_cookies)

        # First hit the date page so the backend stores the selected date in the session
        section_url = f"{self.base_url}/seccion/{section}/{date_str}"
        # NOTE(review): 'br' is advertised in Accept-Encoding below; confirm the
        # runtime can decode Brotli responses, otherwise drop it.
        headers_get = {
            'User-Agent': self.user_agent,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        headers_post = {
            'User-Agent': self.user_agent,
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'X-Requested-With': 'XMLHttpRequest',
            'Origin': self.base_url,
            'Referer': section_url,
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive'
        }

        try:
            # The GET is inside the try block so a timeout or connection error
            # on the date page is reported like any other failure.
            date_page = self.session.get(
                section_url,
                headers=headers_get,
                timeout=30,
                allow_redirects=True
            )

            if date_page.status_code != 200:
                print(f"✗ Failed to access date page for {date_str}: HTTP {date_page.status_code}")
                return False

            time.sleep(1)

            response = self.session.post(
                self.download_url,
                data={'nombreSeccion': section},
                headers=headers_post,
                timeout=30
            )

            if response.status_code != 200:
                print(f"✗ HTTP {response.status_code} for {date_str}")
                return False

            # Decode JSON in its own try: the decode error raised by
            # ``Response.json`` is a ValueError and, in recent ``requests``
            # releases, also a RequestException, so it must be handled before
            # the generic network-error handler below.
            try:
                json_data = response.json()
            except ValueError as e:
                print(f"✗ JSON decode error for {date_str}: {e}")
                print(f"Raw response: {response.text[:200]}...")
                return False

            pdf_b64 = json_data.get('pdfBase64') if isinstance(json_data, dict) else None
            if not pdf_b64:
                print(f"✗ No PDF available for {date_str}")
                return False

            pdf_data = base64.b64decode(pdf_b64)

            # Write to a temporary name and rename, so an interrupted write is
            # never mistaken for a finished download on the next run.
            tmp_path = filepath + ".part"
            with open(tmp_path, 'wb') as f:
                f.write(pdf_data)
            os.replace(tmp_path, filepath)

            print(f"✓ Downloaded: {filename} ({len(pdf_data)} bytes)")
            return True

        except requests.exceptions.RequestException as e:
            print(f"✗ Network error for {date_str}: {e}")
            return False
        except Exception as e:
            # Top-level boundary of a best-effort batch download: report and move on.
            print(f"✗ Unexpected error for {date_str}: {e}")
            return False


# Bulletin dates to download (YYYYMMDD), laid out one month per row.
fechas_objetivo = [
    '20250108', '20250115', '20250122',
    '20250205', '20250212', '20250226',
    '20250305', '20250312', '20250319',
    '20250402', '20250409', '20250416',
    '20250507', '20250514', '20250521',
    '20250604', '20250611', '20250618',
    '20250702', '20250709', '20250716',
    '20250806', '20250813', '20250820',
    '20250903', '20250910', '20250917',
    '20251008', '20251015',
    '20251105',
]

# Use the cookies exported from the browser (cookies.txt) so the backend
# receives the right date. Point cookie_file at cookies_20250108.txt instead
# if that specific cookie set is wanted.
downloader = BoletinDownloader(cookie_file="cookies.txt")

print(f"Iniciando descarga de {len(fechas_objetivo)} fechas...")

# Each date string is parsed to a datetime because download_section
# formats the date itself.
for fecha_str in fechas_objetivo:
    fecha_dt = datetime.strptime(fecha_str, "%Y%m%d")
    downloader.download_section("primera", fecha_dt)

print("✓ Proceso de descarga finalizado.")


# 3. Implementando Bag of words
Bag of words es un modelo muy utilizado en text mining.
Tiene la particularidad de adaptarse bien a corpus textuales muy diversos, lo que permite describir el corpus sin ser especialista en el dominio.

## 3.1 Preparar dataset e importar/instalar librerías

In [None]:
# Import and install the NLP dependencies used by the Bag-of-Words section.

import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus before reading the Spanish list from it.
nltk.download('stopwords')
spanish_stopwords = stopwords.words('spanish')
from sklearn.feature_extraction.text import CountVectorizer
import spacy
import locale
import string
# Override locale.getpreferredencoding so it always reports UTF-8.
# NOTE(review): presumably a Colab workaround so the shell command below runs
# with a UTF-8 locale - confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"
# Notebook shell magic (not plain Python): downloads the small Spanish spaCy model.
! spacy download es_core_news_sm
# Pipeline object shared by the preprocessing code that references `nlp`.
nlp = spacy.load("es_core_news_sm")

# Extra words to be treated as stop words on top of spaCy's defaults.
stop_words_adicionales = [
    'artículo', 'art', 'ley',
    'nacional', 'decreto', 'resolución',
    'disposición', 'expediente', 'anexo',
    'público', 'oficial', 'ministerio',
    'boletín', 'dirección', 'secretaría',
    'presente', 'fecha', 'buenos',
    'aires', 'conforme', 'firma',
    'administrativo', 'conforme', 'establecido',
    'medida', 'registro', 'regulación',
    'modificatoria', 'sección', 'dispuesto',
    'dictar'
]

# Flag every entry in the spaCy vocabulary as a stop word, both exactly as
# written and in its lower-cased form.
for palabra in stop_words_adicionales:
    for variante in (palabra, palabra.lower()):
        nlp.vocab[variante].is_stop = True


In [None]:
# Create a dataframe organized in paragraphs
def extract_paragraphs_to_dataframe(pdf_path):
    """
    Extract text at paragraph level for better context.

    Each page's text is split on blank lines (``'\\n\\n'``); non-empty chunks
    become one row each.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A DataFrame with columns ``page`` (1-based), ``paragraph_id``
        (``"p<page>_<n>"``), ``text``, ``char_count`` and ``word_count``.
        The columns are present even when no text could be extracted, so
        downstream code can always select ``'text'``.
    """
    columns = ['page', 'paragraph_id', 'text', 'char_count', 'word_count']
    paragraphs_data = []

    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)

        for page_number, page in enumerate(pdf_reader.pages, start=1):
            # Guard against a missing/empty extraction result for a page.
            text = page.extract_text() or ""

            # Split into paragraphs, dropping whitespace-only chunks
            page_paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

            for para_number, paragraph in enumerate(page_paragraphs, start=1):
                paragraphs_data.append({
                    'page': page_number,
                    'paragraph_id': f"p{page_number}_{para_number}",
                    'text': paragraph,
                    'char_count': len(paragraph),
                    'word_count': len(paragraph.split())
                })

    return pd.DataFrame(paragraphs_data, columns=columns)

lista_dfs = []
carpeta_pdf = "boletin_pdfs"

print("Comenzando extracción de texto...")

for fecha_str in fechas_objetivo:
    # Rebuild the file name using the same scheme as the downloader
    nombre_archivo = f"seccion_primera_{fecha_str}.pdf"
    path_archivo = os.path.join(carpeta_pdf, nombre_archivo)

    # Missing files are skipped silently (e.g. weekend/holiday: nothing was downloaded)
    if not os.path.exists(path_archivo):
        continue

    try:
        df_temp = extract_paragraphs_to_dataframe(path_archivo)

        # Keep the bulletin date on every row for traceability
        df_temp['fecha_boletin'] = fecha_str

        lista_dfs.append(df_temp)
        print(f"✓ Procesado: {nombre_archivo} ({len(df_temp)} párrafos)")
    except Exception as e:
        print(f"⚠ Error leyendo {nombre_archivo}: {e}")

# Merge every per-file frame into a single corpus
if not lista_dfs:
    print("Error: No se encontraron PDFs para procesar.")
else:
    df_consolidado = pd.concat(lista_dfs, ignore_index=True)
    print(f"\nTOTAL: {len(df_consolidado)} párrafos extraídos de {len(lista_dfs)} documentos.")
    print(df_consolidado.head())

## 3.2 Funciones para BOW

In [None]:
def preprocess_spanish_text(text):
    """
    Preprocess a single Spanish text with lemmatization.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. A token is kept only if it is alphabetic, longer than two
    characters, and neither the token nor its lemma is flagged as a stop
    word. Checking the lemma as well stops inflected forms (e.g. a plural)
    from re-introducing a stop word through lemmatization.

    Args:
        text: The text to process; NaN/None and ``''`` yield ``""``.

    Returns:
        The kept lemmas joined by single spaces.
    """
    if pd.isna(text) or text == '':
        return ""

    doc = nlp(text.lower())

    lemmas = []
    for token in doc:
        if (token.is_stop or
                token.is_punct or
                token.is_space or
                len(token.text) <= 2 or
                not token.is_alpha):
            continue

        lemma = token.lemma_
        # The surface form passed the filter, but its lemma may still be a stop word.
        if nlp.vocab[lemma].is_stop:
            continue

        lemmas.append(lemma)

    return " ".join(lemmas)

def apply_text_preprocessing(df, text_column):
    """
    Run the text preprocessing over one column of a DataFrame.

    The input frame is not modified. The returned copy gains three columns:
    ``processed_text``, ``original_word_count`` and ``processed_word_count``
    (whitespace-separated word counts of the raw and processed text).
    """
    result = df.copy()

    def _word_counts(column):
        # Number of whitespace-separated words per row of `column`.
        return result[column].str.split().str.len()

    result['processed_text'] = result[text_column].apply(preprocess_spanish_text)
    result['original_word_count'] = _word_counts(text_column)
    result['processed_word_count'] = _word_counts('processed_text')

    return result

def calculate_text_statistics(df, original_col, processed_col):
    """
    Calculate text statistics for comparison.

    Args:
        df: DataFrame holding both text columns.
        original_col: Name of the column with the raw text.
        processed_col: Name of the column with the preprocessed text.

    Returns:
        A dict with the paragraph count, total and unique word counts for
        both columns, and the percentage reduction in total words and in
        vocabulary. A reduction is reported as ``0.0`` when the original
        count is zero (instead of dividing by zero).
    """
    def _reduction_pct(before, after):
        # Percentage drop from `before` to `after`; nothing to reduce if before is 0.
        return (before - after) / before * 100 if before else 0.0

    total_original = df[original_col].str.split().str.len().sum()
    total_processed = df[processed_col].str.split().str.len().sum()

    all_original = ' '.join(df[original_col].astype(str))
    all_processed = ' '.join(df[processed_col].astype(str))

    # The raw text is lower-cased here so case variants count as one word.
    unique_original = len(set(all_original.lower().split()))
    unique_processed = len(set(all_processed.split()))

    stats = {
        'total_paragraphs': len(df),
        'total_original_words': total_original,
        'total_processed_words': total_processed,
        'unique_original_words': unique_original,
        'unique_processed_words': unique_processed,
        'word_reduction_pct': _reduction_pct(total_original, total_processed),
        'vocab_reduction_pct': _reduction_pct(unique_original, unique_processed)
    }

    return stats

def create_bow_analysis(text_series, top_n=20, **vectorizer_kwargs):
    """
    Create Bag-of-Words analysis from a text series.

    All rows are concatenated into a single document and counted with
    ``CountVectorizer``.

    Args:
        text_series: pandas Series of texts (values are cast to ``str``).
        top_n: Number of most frequent words to return.
        **vectorizer_kwargs: Overrides for the default ``CountVectorizer``
            parameters (``max_features``, ``lowercase``, ``token_pattern``).

    Returns:
        A DataFrame with columns ``word`` and ``frequency``, sorted by
        descending frequency, at most ``top_n`` rows. An empty frame with
        those columns is returned when the series holds no text.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` when the text is not
            empty but produces no tokens (e.g. digits only).
    """
    # Combine all text
    all_text = ' '.join(text_series.astype(str))

    # Nothing to count: skip the vectorizer, which rejects an empty vocabulary.
    if not all_text.strip():
        return pd.DataFrame(columns=['word', 'frequency'])

    # Default vectorizer parameters. The character class includes ü/Ü so
    # Spanish words such as "antigüedad" are matched whole, not dropped.
    default_params = {
        'max_features': top_n * 2,
        'lowercase': True,
        'token_pattern': r'\b[a-zA-ZáéíóúüñÁÉÍÓÚÜÑ]+\b'
    }
    default_params.update(vectorizer_kwargs)

    vectorizer = CountVectorizer(**default_params)
    bow_matrix = vectorizer.fit_transform([all_text])

    # Get word frequencies (single document, so one row of counts)
    word_freq = dict(zip(vectorizer.get_feature_names_out(), bow_matrix.sum(axis=0).tolist()[0]))
    top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:top_n]

    return pd.DataFrame(top_words, columns=['word', 'frequency'])


def print_preprocessing_report(df, original_col='text', processed_col='processed_text'):
    """
    Print a summary of how preprocessing changed the corpus.

    Computes the statistics with ``calculate_text_statistics``, prints them
    under a banner and returns the statistics dict.
    """
    stats = calculate_text_statistics(df, original_col, processed_col)

    banner = "=" * 60
    print(banner)
    print("TEXT PREPROCESSING REPORT")
    print(banner)

    # (format template, statistics key) in display order
    report_rows = (
        ("Total paragraphs: {:,}", 'total_paragraphs'),
        ("Total words - Original: {:,}", 'total_original_words'),
        ("Total words - Processed: {:,}", 'total_processed_words'),
        ("Word reduction: {:.1f}%", 'word_reduction_pct'),
        ("Unique words - Original: {:,}", 'unique_original_words'),
        ("Unique words - Processed: {:,}", 'unique_processed_words'),
        ("Vocabulary reduction: {:.1f}%", 'vocab_reduction_pct'),
    )
    for template, key in report_rows:
        print(template.format(stats[key]))

    return stats

def print_sample_transformations(df, original_col='text', processed_col='processed_text', sample_size=3):
    """
    Show the first rows before and after preprocessing for a quick quality check.

    Prints up to ``sample_size`` rows: the first 150 characters of the
    original and processed text, labelled with the row's ``page`` value when
    that column exists (otherwise the 1-based row number), plus the word
    counts when both count columns are present.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("SAMPLE TRANSFORMATIONS")
    print(divider)

    has_page = 'page' in df.columns
    has_counts = 'original_word_count' in df.columns and 'processed_word_count' in df.columns

    for position in range(min(sample_size, len(df))):
        row = df.iloc[position]
        ordinal = position + 1

        before_text = row[original_col]
        after_text = row[processed_col]

        # Prefer the page number as the label; fall back to the row position
        if has_page:
            location = f"Page {row['page']}"
        else:
            location = f"Row {ordinal}"

        print(f"\nSample {ordinal} ({location}):")
        print(f"ORIGINAL: {before_text[:150]}...")
        print(f"PROCESSED: {after_text[:150]}...")

        if has_counts:
            print(f"Words: {row['original_word_count']} → {row['processed_word_count']}")


def process_paragraphs_dataframe(df_paragraphs, text_column='text'):
    """
    Lemmatize a paragraphs DataFrame and drop rows left with no text.

    Returns a new DataFrame containing only the rows whose ``processed_text``
    is non-empty; the before/after row counts are printed.
    """
    print("Applying lemmatization to paragraphs...")
    lemmatized = apply_text_preprocessing(df_paragraphs, text_column)

    # Keep only paragraphs that still have content after preprocessing
    has_content = lemmatized['processed_text'].str.len() > 0
    non_empty = lemmatized.loc[has_content].copy()

    print(f"Original paragraphs: {len(df_paragraphs)}")
    print(f"Paragraphs after processing: {len(non_empty)}")

    return non_empty

def analyze_paragraphs_bow(df_processed, text_column='processed_text', top_n=20):
    """
    Run the Bag-of-Words count over preprocessed paragraphs and print it.

    Returns the word/frequency DataFrame produced by ``create_bow_analysis``
    for ``df_processed[text_column]``.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("BAG OF WORDS ANALYSIS - PROCESSED PARAGRAPHS")
    print(banner)

    frequencies = create_bow_analysis(df_processed[text_column], top_n=top_n)

    print(f"Top {len(frequencies)} most frequent lemmas:")
    print(frequencies.to_string(index=False))

    return frequencies


def run_paragraphs_analysis_pipeline(df_paragraphs, text_column='text', top_n=20):
    """
    Complete pipeline for analyzing paragraph data.

    Args:
        df_paragraphs: DataFrame with one paragraph per row.
        text_column: Name of the column holding the raw text.
        top_n: Number of most frequent lemmas to report.

    Returns:
        A tuple ``(df_processed, top_words, stats)``: the preprocessed
        DataFrame, the word/frequency DataFrame and the statistics dict.
    """
    # Step 1: Preprocess data
    df_processed = process_paragraphs_dataframe(df_paragraphs, text_column)

    # Step 2: Generate reports. The raw-text column is passed explicitly so a
    # non-default `text_column` is honoured instead of falling back to 'text'.
    stats = print_preprocessing_report(df_processed, original_col=text_column)
    print_sample_transformations(df_processed, original_col=text_column)

    # Step 3: BOW analysis
    top_words = analyze_paragraphs_bow(df_processed, top_n=top_n)

    return df_processed, top_words, stats

# =============================================================================
# USAGE
# =============================================================================

# Complete pipeline (main use case). Everything that needs `df_processed`
# lives inside this branch, so a run without consolidated data ends with the
# message below instead of a NameError.
if 'df_consolidado' in locals() and not df_consolidado.empty:
    print(f"Analizando corpus de {df_consolidado['fecha_boletin'].nunique()} fechas...\n")

    df_processed, top_words, stats = run_paragraphs_analysis_pipeline(df_consolidado)

    # Optional: check whether the top 20 changes much when more dates are included
    print("\nTop palabras del periodo completo:")
    print(top_words)

    # To run only the BOW analysis on already processed data, call
    # analyze_paragraphs_bow(df_processed).

    # Compare original vs processed BOW
    print("\n" + "="*60)
    print("COMPARISON: ORIGINAL vs PROCESSED BOW")
    print("="*60)

    bow_original = create_bow_analysis(df_processed['text'], top_n=30)
    bow_processed = create_bow_analysis(df_processed['processed_text'], top_n=30)

    print("Top 30 words - ORIGINAL:")
    print(bow_original.to_string(index=False))

    print("\nTop 30 words - PROCESSED (lemmatized):")
    print(bow_processed.to_string(index=False))
else:
    print("No hay datos consolidados para analizar.")

In [None]:
from google.colab import files

# Name of the CSV file that will hold the word frequencies
nombre_archivo = "frecuencias_palabras_bora.csv"

# Build the frequency table with the existing create_bow_analysis helper,
# asking for a very large top_n (10,000) to capture far more than the top 20.
print("Generando reporte completo...")
df_frecuencias_full = create_bow_analysis(df_processed['processed_text'], top_n=10000)

# 'utf-8-sig' is used so Excel renders accented characters (ñ, á, é...) correctly
df_frecuencias_full.to_csv(nombre_archivo, index=False, encoding='utf-8-sig')

# Send the file to the local machine through the Colab download helper
files.download(nombre_archivo)

print(f"✓ Archivo '{nombre_archivo}' generado y descargado.")

# 4. Otros

Algunos scripts para explorar el contenido de los PDFs

In [None]:
import tabula
import pandas as pd
import re
from collections import Counter
import requests

# Since we're in Colab, work with a previously downloaded PDF file.
# NOTE(review): 20250927 is not in `fechas_objetivo`, so this file only exists
# if it was downloaded separately - adjust the path as needed.
pdf_path = "boletin_pdfs/seccion_primera_20250927.pdf"

import PyPDF2

# Open and parse the PDF once: report the page count and collect the text of
# every page from the same reader (no reopening the file per page).
with open(pdf_path, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    num_pages = len(pdf_reader.pages)
    print(f"Number of pages in the PDF: {num_pages}")

    # Full document text, used below to search for "derechos humanos"
    full_text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Count occurrences of "derechos humanos" (case-insensitive)
derechos_humanos_count = full_text.lower().count("derechos humanos")
print(f"Occurrences of 'derechos humanos': {derechos_humanos_count}")

# Use tabula to extract every table from the PDF, to look for resoluciones
# and disposiciones
tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True)
print(f"Number of tables found: {len(tables)}")

# Inspect the structure of the first few tables to understand the content
for i, table in enumerate(tables[:5]):
    print(f"\nTable {i+1} shape: {table.shape}")
    print(f"Table {i+1} columns: {table.columns.tolist()}")
    if not table.empty:
        print(table.head(3))

In [None]:
# List resolutions

# Extract text from page 2 specifically (uses `pdf_path` set in an earlier cell)
with open(pdf_path, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    page2_text = pdf_reader.pages[1].extract_text()  # Page index 1 is page 2

print("Content from page 2 (SUMARIO):")
print(page2_text[:2000])  # First 2000 characters

# Search for resolution patterns in page 2 specifically.
# NOTE(review): `resoluciones_pattern` is not defined anywhere in the visible
# notebook; unless another cell defines it, the next line raises NameError.
# Define the regex for resolution identifiers before running this cell.
resoluciones_page2 = re.findall(resoluciones_pattern, page2_text, re.IGNORECASE)
# Matches are de-duplicated with set() before counting and listing
print(f"\nResoluciones found in SUMARIO page: {len(set(resoluciones_page2))}")
for resol in sorted(set(resoluciones_page2)):
    print(f"  - {resol}")

# Possible next step: locate the specific sections for Resoluciones and
# Disposiciones by looking at the table of contents structure

In [None]:
# Test extraction
# Change pdf_path to whichever PDF was downloaded (see the 📁 file browser)
pdf_path = "boletin_pdfs/seccion_primera_20251020.pdf"

# Open and parse the PDF once; page count and text come from the same reader
# (no reopening the file per page).
with open(pdf_path, 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    num_pages = len(pdf_reader.pages)
    print(f"Number of pages in the PDF: {num_pages}")

    full_text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

# NOTE: despite its name, df_test holds the document text as a plain string,
# not a DataFrame.
df_test = full_text