In [49]:
import requests
import re
import html
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import time

# Scheme + host of the portal; also used to turn site-relative hrefs into absolute URLs.
BASE_URL = "https://www.gob.mx"
# Archive listing endpoint. It deliberately ends with "page=" so that callers
# can append the page number directly to build the final request URL.
ARCHIVE_URL = (
    BASE_URL
    + "/presidencia/es/archivo/articulos"
    + "?filter_origin=archive&idiom=es&order=DESC&page="
)


In [48]:
def get_articles_from_page(page_num, timeout=30):
    """
    Fetch one archive listing page and extract its article cards.

    The cards are read from inline JavaScript calls of the form
    ``$('#prensa').append('...');`` found in the response body. Each embedded
    string is unescaped, parsed as HTML, and searched for ``<article>``
    elements.

    Args:
        page_num (int): Page number appended to ``ARCHIVE_URL``.
        timeout (float or tuple): Timeout in seconds handed to
            ``requests.get`` so a stalled connection cannot block forever.
    Returns:
        list of dict: Each dict contains 'title', 'url', and 'date' keys.
        A value is None when the corresponding element is missing; cards with
        neither a title nor a link are skipped. The list is empty when the
        page contains no matching JavaScript fragments.
    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = f"{ARCHIVE_URL}{page_num}"
    # Always bound the request: without a timeout requests may wait indefinitely.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    text = r.text

    # Extract JS-embedded HTML fragments (non-greedy, may span several lines)
    fragments = re.findall(r"\$\('#prensa'\)\.append\('(.+?)'\);", text, flags=re.DOTALL)

    articles_out = []

    for frag in fragments:
        # Step 1: Decode HTML entities (e.g. &lt;, &quot;)
        frag_clean = html.unescape(frag)
        # Step 2: Replace escaped quotes \" → " and \' → '
        frag_clean = frag_clean.replace('\\"', '"').replace("\\'", "'")
        # Step 3: Drop literal "\n" sequences first, then every remaining
        # backslash, so leftover JS escapes do not break the tags
        frag_clean = frag_clean.replace("\\n", "").replace("\\", "")
        # Step 4: Parse
        soup = BeautifulSoup(frag_clean, "html.parser")

        # Extract all article cards
        for art in soup.find_all("article"):
            title_el = art.find("h2")
            link_el = art.find("a", class_="small-link")
            date_el = art.find("time")

            title = title_el.get_text(strip=True) if title_el else None
            date = date_el.get_text(strip=True) if date_el else None

            # The href is kept as found (query string included); only stray
            # surrounding quotes are stripped and site-relative paths are
            # prefixed with BASE_URL.
            if link_el and link_el.has_attr("href"):
                href = link_el["href"].strip('"')
                if href.startswith("/"):
                    href = BASE_URL + href
            else:
                href = None

            # Keep the card as long as it carries at least a title or a link
            if title or href:
                articles_out.append({
                    "title": title,
                    "url": href,
                    "date": date
                })
    return articles_out

In [47]:
def scrape_all_articles(max_pages=50, delay=1):
    """
    Scrape articles from multiple pages.

    Pages are requested in order starting at 1. Scraping stops early at the
    first page that yields no articles.

    Args:
        max_pages (int): Maximum number of pages to scrape.
        delay (float): Seconds to pause after each non-empty page, to be
            polite to the server. A value of 0 or less disables the pause.
    Returns:
        pd.DataFrame: DataFrame containing all scraped articles, with the
        columns 'title', 'url' and 'date'. The columns are present even when
        no article was found, so downstream column access does not fail.
    """
    # Fixed schema: pd.DataFrame([]) would otherwise have no columns at all.
    columns = ["title", "url", "date"]
    all_data = []
    # Loop through pages with progress bar
    for page in tqdm(range(1, max_pages + 1)):
        page_data = get_articles_from_page(page)
        if not page_data:
            print(f"No more data after page {page}.") # Debug info
            break
        all_data.extend(page_data) # Accumulate data
        if delay > 0:
            time.sleep(delay) # Be polite to the server
    return pd.DataFrame(all_data, columns=columns)