In [None]:
# --- SITEMAP FIX FOR APA ---
# Correct flow:
# 1) Fetch sitemap index: https://www.apa.org/sitemap.xml
# 2) Iterate child sitemaps
# 3) Collect <loc> URLs that contain "/topics"

import requests, itertools
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": "application/xml,text/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

def fetch_xml(url):
    """Download *url* and parse the response body as XML.

    Args:
        url: Address of the document to fetch (a sitemap or sitemap index).

    Returns:
        A ``BeautifulSoup`` tree built with the ``"xml"`` parser.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or when the
            30-second timeout expires.
        OSError: If the body starts with the gzip magic bytes but is not a
            valid gzip stream.
    """
    import gzip  # local import: only needed for gzip-compressed sitemaps

    r = requests.get(url, headers=HEADERS, timeout=30)
    print(f"[sitemap] GET {url} -> {r.status_code} ({r.headers.get('Content-Type')})")
    r.raise_for_status()

    # Work on raw bytes rather than r.text: requests decodes r.text from the
    # HTTP header alone (falling back to ISO-8859-1 for text/* responses that
    # carry no charset), which ignores the encoding declared inside the XML.
    # Given bytes, the XML parser detects the encoding itself.
    payload = r.content

    # Sitemap files may be shipped as ``.xml.gz``; requests only undoes
    # Content-Encoding compression, not a gzip file served as-is.
    if payload[:2] == b"\x1f\x8b":
        payload = gzip.decompress(payload)

    return BeautifulSoup(payload, "xml")

index_url = "https://www.apa.org/sitemap.xml"

# 1) Fetch the root sitemap document.
idx = fetch_xml(index_url)
root_locs = [loc.get_text(strip=True) for loc in idx.find_all("loc")]

# The root may be either a <sitemapindex> (its <loc> entries point at child
# sitemaps) or a plain <urlset> (its <loc> entries are already page URLs).
# Treating a <urlset> as an index makes the crawl download every listed HTML
# page, none of which contain <loc> tags, so nothing is ever collected.
if idx.find("sitemapindex") is not None:
    child_maps = root_locs
    page_urls = []
else:
    child_maps = []
    page_urls = root_locs
    print("[sitemap] root is a plain <urlset>; using its", len(page_urls), "URLs directly")

print("[sitemap] child sitemaps in index:", len(child_maps))
for u in itertools.islice(child_maps, 0, 10):
    print(" -", u)

# 2) Visit each child sitemap (if any) and gather the page URLs it lists.
for sm in child_maps:
    try:
        sm_soup = fetch_xml(sm)
    except Exception as e:
        # Best effort: one unreachable or malformed child sitemap should not
        # abort the whole crawl, so report it and move on.
        print("[warn]", sm, e)
        continue
    page_urls.extend(loc.get_text(strip=True) for loc in sm_soup.find_all("loc"))

# Keep only /topics URLs, skipping links to static assets.
asset_suffixes = (".pdf", ".jpg", ".png", ".gif", ".svg", ".jpeg", ".webp", ".avif")
all_topics = [
    u
    for u in page_urls
    if "/topics" in u and not u.lower().endswith(asset_suffixes)
]

# 3) Drop repeated URLs while keeping first-seen order
#    (dict keys are unique and remember insertion order).
deduped = list(dict.fromkeys(all_topics))
seen = set(deduped)

print("[result] Total Topics URLs from all sitemaps:", len(deduped))
# Preview at most the first ten unique URLs.
for u in deduped[:10]:
    print(" -", u)

# Optional: save
# import pandas as pd
# pd.DataFrame({"url": deduped}).to_csv("apa_topics_urls.csv", index=False)
# print("Saved apa_topics_urls.csv")


[sitemap] GET https://www.apa.org/sitemap.xml -> 200 (application/xml; charset=utf-8)
[sitemap] child sitemaps in index: 24871
 - https://www.apa.org
 - https://www.apa.org/advertising
 - https://www.apa.org/store
 - https://www.apa.org/search
 - https://www.apa.org/sitemap
 - https://www.apa.org/apa_branding_include
 - https://www.apa.org/apa_header
 - https://www.apa.org/apapractice_header
 - https://www.apa.org/search2
 - https://www.apa.org/common_include
[sitemap] GET https://www.apa.org -> 200 (text/html)
[sitemap] GET https://www.apa.org/advertising -> 200 (text/html)
[sitemap] GET https://www.apa.org/store -> 200 (text/html)
[sitemap] GET https://www.apa.org/search -> 200 (text/html)
[sitemap] GET https://www.apa.org/sitemap -> 200 (text/html)
[sitemap] GET https://www.apa.org/apa_branding_include -> 200 (text/html)
[sitemap] GET https://www.apa.org/apa_header -> 200 (text/html)
[sitemap] GET https://www.apa.org/apapractice_header -> 200 (text/html)
[sitemap] GET https://www.ap