In [1]:
# --- FAST, CORRECT SITEMAP LOOP: only "topics" XML sitemaps ---
import requests, itertools, time
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "application/xml,text/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

INDEX_URL = "https://www.apa.org/sitemap.xml"

def fetch_xml(url: str) -> BeautifulSoup:
    r = requests.get(url, headers=HEADERS, timeout=30, allow_redirects=True)
    ctype = (r.headers.get("Content-Type") or "").lower()
    print(f"[sitemap] GET {url} -> {r.status_code} ({ctype})")
    r.raise_for_status()
    # only proceed if it really looks like XML
    if "xml" not in ctype:
        raise ValueError(f"Not XML: {url} ({ctype})")
    return BeautifulSoup(r.text, "xml")

# 1) Load the sitemap index
idx = fetch_xml(INDEX_URL)

# 2) Extract ONLY child sitemap URLs that end with .xml, then keep just those with "topics"
child_maps = [
    loc.get_text(strip=True)
    for loc in idx.find_all("loc")
    if loc.get_text(strip=True).lower().endswith(".xml")
]
child_maps = [u for u in child_maps if "topics" in u.lower()]

print(f"[sitemap] topics sitemaps found: {len(child_maps)}")
for u in itertools.islice(child_maps, 0, 10):
    print(" -", u)

# Optional: limit while testing so it runs fast
# child_maps = child_maps[:5]

# 3) Visit each topics sitemap and collect /topics URLs
all_topics = []
for sm in child_maps:
    try:
        sm_soup = fetch_xml(sm)
        for loc in sm_soup.find_all("loc"):
            u = loc.get_text(strip=True)
            if "/topics" in u.lower() and not u.lower().endswith(
                (".pdf",".jpg",".png",".gif",".svg",".jpeg",".webp",".avif")
            ):
                all_topics.append(u)
        time.sleep(0.2)  # be polite
    except Exception as e:
        print("[warn] skipped:", sm, "->", e)

# 4) Deduplicate preserving order
seen, deduped = set(), []
for u in all_topics:
    if u not in seen:
        seen.add(u)
        deduped.append(u)

print(f"[result] Total Topics URLs from topics sitemaps: {len(deduped)}")
for u in itertools.islice(deduped, 0, 10):
    print(" -", u)


[sitemap] GET https://www.apa.org/sitemap.xml -> 200 (application/xml; charset=utf-8)
[sitemap] topics sitemaps found: 0
[result] Total Topics URLs from topics sitemaps: 0
