In [56]:
import requests
import re
import html
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import time
import os
import random


BASE_URL = "https://www.gob.mx"
ARCHIVE_URL = f"{BASE_URL}/presidencia/es/archivo/articulos?filter_origin=archive&idiom=es&order=DESC&page="


In [57]:
def get_articles_from_page(page_num):
    """
    Extract clean titles, URLs, and dates from Gob.mx dynamic HTML.
    Args:
        page_num (int): Page number to scrape.
    Returns:
        list of dict: Each dict contains 'title', 'url', and 'date' keys
    """
    url = f"{ARCHIVE_URL}{page_num}"
    r = requests.get(url)
    r.raise_for_status()
    text = r.text

    # Extract JS-embedded HTML fragments
    fragments = re.findall(r"\$\('#prensa'\)\.append\('(.+?)'\);", text, flags=re.DOTALL)

    articles_out = []

    for frag in fragments:
        # Step 1: Decode HTML entities (e.g. &lt;, &quot;)
        frag_clean = html.unescape(frag)
        # Step 2: Replace escaped quotes \" ‚Üí "
        frag_clean = frag_clean.replace('\\"', '"').replace("\\'", "'")
        # Step 3: Remove stray backslashes that break tags
        frag_clean = frag_clean.replace("\\n", "").replace("\\", "")
        # Step 4: Parse
        soup = BeautifulSoup(frag_clean, "html.parser")

        # Extract all article cards
        for art in soup.find_all("article"):
            title_el = art.find("h2")
            link_el = art.find("a", class_="small-link")
            date_el = art.find("time")

            title = title_el.get_text(strip=True) if title_el else None
            date = date_el.get_text(strip=True) if date_el else None

            # Some hrefs may end with ?idiom=es
            if link_el and link_el.has_attr("href"):
                href = link_el["href"].strip('"')
                if href.startswith("/"):
                    href = BASE_URL + href
            else:
                href = None

            if title or href:
                articles_out.append({
                    "title": title,
                    "url": href,
                    "date": date
                })
    return articles_out

In [58]:
def scrape_all_articles(max_pages=50, retries=3):
    """
    Scrape articles from multiple pages and save metadata to CSV.
    
    Args:
        max_pages (int): Maximum number of pages to scrape.

    Returns:
        pd.DataFrame: DataFrame containing all scraped articles.
    """
    all_data = []
    output_dir = os.path.join("..", "data", "raw")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "article_metadata.csv")

    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; Ma√±anerasScraper/1.0; +https://github.com/yourproject)"
    }

    for page in tqdm(range(1, max_pages + 1)):
        success = False
        for attempt in range(retries):
            try:
                # Add headers to avoid being mistaken for a bot
                page_data = get_articles_from_page(page)
                if not page_data:
                    print(f"No more data after page {page}.")
                    success = True
                    break

                all_data.extend(page_data)
                success = True
                break  # Exit retry loop

            except requests.exceptions.HTTPError as e:
                print(f"‚ö†Ô∏è HTTP error on page {page}: {e}. Retrying ({attempt+1}/{retries})...")
                time.sleep(random.uniform(5, 10))  # wait before retry
            except Exception as e:
                print(f"‚ö†Ô∏è Other error on page {page}: {e}. Retrying...")
                time.sleep(random.uniform(5, 10))

        if not success:
            print(f"‚ùå Failed to fetch page {page} after {retries} retries. Skipping...")
            continue

        # Random polite delay between pages
        time.sleep(random.uniform(2, 5))

        # Save progress every 5 pages
        if page % 5 == 0:
            df_tmp = pd.DataFrame(all_data)
            df_tmp.to_csv(output_path, index=False)
            print(f"üíæ Checkpoint saved at page {page}")

    # Final save
    df = pd.DataFrame(all_data)
    df.to_csv(output_path, index=False)
    print(f"\n‚úÖ Saved {len(df)} articles to {output_path}")

    return df

In [59]:
scrape_all_articles(max_pages=59)

  8%|‚ñä         | 5/59 [00:18<03:29,  3.88s/it]

üíæ Checkpoint saved at page 5


 17%|‚ñà‚ñã        | 10/59 [01:37<18:15, 22.36s/it]

üíæ Checkpoint saved at page 10


 25%|‚ñà‚ñà‚ñå       | 15/59 [02:59<18:27, 25.17s/it]

üíæ Checkpoint saved at page 15


 34%|‚ñà‚ñà‚ñà‚ñç      | 20/59 [03:16<04:34,  7.04s/it]

üíæ Checkpoint saved at page 20


 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 25/59 [03:37<02:44,  4.85s/it]

üíæ Checkpoint saved at page 25


 51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 30/59 [03:54<01:43,  3.58s/it]

üíæ Checkpoint saved at page 30


 59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 35/59 [04:18<01:48,  4.53s/it]

üíæ Checkpoint saved at page 35


 68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 40/59 [05:38<04:07, 13.01s/it]

üíæ Checkpoint saved at page 40


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 45/59 [05:56<01:08,  4.90s/it]

üíæ Checkpoint saved at page 45


 85%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 50/59 [06:16<00:36,  4.00s/it]

üíæ Checkpoint saved at page 50


 93%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé| 55/59 [06:37<00:15,  3.92s/it]

üíæ Checkpoint saved at page 55


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 59/59 [06:55<00:00,  7.04s/it]


‚úÖ Saved 531 articles to ../data/raw/article_metadata.csv





Unnamed: 0,title,url,date
0,Versi√≥n estenogr√°fica.¬†Mensaje de la President...,https://www.gob.mx/presidencia/es/articulos/ve...,"jueves, 16 de octubre de 2025Fecha de publicaci√≥n"
1,Versi√≥n estenogr√°fica. Conferencia de prensa d...,https://www.gob.mx/presidencia/es/articulos/ve...,"jueves, 16 de octubre de 2025Fecha de publicaci√≥n"
2,Versi√≥n estenogr√°fica. Conferencia de prensa d...,https://www.gob.mx/presidencia/es/articulos/ve...,"mi√©rcoles, 15 de octubre de 2025Fecha de publi..."
3,Versi√≥n estenogr√°fica. Conferencia de prensa d...,https://www.gob.mx/presidencia/es/articulos/ve...,"martes, 14 de octubre de 2025Fecha de publicaci√≥n"
4,Versi√≥n estenogr√°fica. Conferencia de prensa d...,https://www.gob.mx/presidencia/es/articulos/ve...,"lunes, 13 de octubre de 2025Fecha de publicaci√≥n"
...,...,...,...
526,Versi√≥n estenogr√°fica. Conferencia de prensa d...,https://www.gob.mx/presidencia/es/articulos/ve...,"jueves, 03 de octubre de 2024Fecha de publicaci√≥n"
527,Versi√≥n estenogr√°fica.¬†Conferencia de prensa d...,https://www.gob.mx/presidencia/es/articulos/ve...,"mi√©rcoles, 02 de octubre de 2024Fecha de publi..."
528,Versi√≥n estenogr√°fica. Toma de protesta ante e...,https://www.gob.mx/presidencia/es/articulos/ve...,"martes, 01 de octubre de 2024Fecha de publicaci√≥n"
529,Versi√≥n estenogr√°fica. Mensaje de la president...,https://www.gob.mx/presidencia/es/articulos/ve...,"martes, 01 de octubre de 2024Fecha de publicaci√≥n"
