# NHANES 2021-2023 Demographics Data Link Finder

This notebook programmatically locates the Demographics (DEMO) SAS transport (.XPT) data file link for the NHANES 2021-2023 cycle. It:

- Fetches and validates the NHANES component listing page
- Parses all anchors and scores them using pattern heuristics
- Selects the best candidate DEMO .XPT file URL
- Provides unit-style tests with mocked HTML
- Optionally performs a live integration + file header inspection

Cycle targeted: **2021-2023**  
Component: **Demographics (DEMO)**

## 1. Import Libraries

In [3]:
# Core imports
import re
import time
import json
import math
import typing as t
from dataclasses import dataclass
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd

# For lightweight inline testing without external pytest runner
from types import SimpleNamespace

USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": USER_AGENT})

TARGET_URL = "https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Demographics&Cycle=2021-2023"

## 2. Set Target URL and Matching Patterns

In [4]:
# Patterns & heuristics for identifying DEMO XPT link
DEMO_PATTERNS = [
    re.compile(r"DEMO", re.IGNORECASE),
    re.compile(r"Demographic", re.IGNORECASE),
]
XPT_PATTERN = re.compile(r"\.xpt$", re.IGNORECASE)

# Candidate exclusion substrings (avoid documentation PDFs, etc.)
EXCLUDE_SUBSTRINGS = [".pdf", "DataDocs", "/tutorial", "/about"]

# Expected file naming shape (heuristic; may adapt)
EXPECTED_PREFIX = "DEMO"

print("Configured DEMO patterns and filters.")

Configured DEMO patterns and filters.


## 3. HTTP Fetch with Requests

In [5]:
def fetch_page(url: str, retries: int = 3, backoff: float = 1.0) -> requests.Response:
    """Fetch a URL with simple retry/backoff.
    Raises requests.HTTPError if non-200 after retries.
    """
    last_exc = None
    for attempt in range(1, retries + 1):
        try:
            resp = SESSION.get(url, timeout=20)
            if resp.status_code == 200:
                return resp
            else:
                last_exc = requests.HTTPError(f"Status {resp.status_code} on attempt {attempt}")
        except Exception as e:  # noqa: BLE001
            last_exc = e
        time.sleep(backoff * attempt)
    raise last_exc if last_exc else RuntimeError("Unknown fetch failure")

# Quick smoke test (won't assert yet)
_resp_preview = fetch_page(TARGET_URL)
print("Fetched bytes:", len(_resp_preview.content))

Fetched bytes: 381776


## 4. Validate HTTP Response

In [6]:
def validate_html_response(resp: requests.Response) -> None:
    assert resp.status_code == 200, f"Unexpected status {resp.status_code}"
    ctype = resp.headers.get('Content-Type', '')
    assert 'text' in ctype.lower() or 'html' in ctype.lower(), f"Unexpected content type: {ctype}"
    assert resp.text.strip(), "Empty body received"

validate_html_response(_resp_preview)
print("Response validation passed.")

Response validation passed.


## 5. Parse HTML and Collect Candidate Links

In [7]:
@dataclass
class LinkCandidate:
    href: str
    text: str
    title: str
    score: float
    raw: t.Dict[str, t.Any]


def extract_link_candidates(html: str) -> t.List[LinkCandidate]:
    soup = BeautifulSoup(html, 'html.parser')
    anchors = soup.find_all('a')
    candidates: t.List[LinkCandidate] = []
    for a in anchors:
        href = (a.get('href') or '').strip()
        text = (a.get_text() or '').strip()
        title = (a.get('title') or '').strip()
        raw = {k: v for k, v in a.attrs.items()}
        if not href:
            continue
        # Basic exclusion
        if any(substr.lower() in href.lower() for substr in EXCLUDE_SUBSTRINGS):
            continue
        # Initial score
        score = 0.0
        if XPT_PATTERN.search(href):
            score += 5
        for pat in DEMO_PATTERNS:
            if pat.search(href) or pat.search(text) or pat.search(title):
                score += 3
        if EXPECTED_PREFIX.lower() in href.lower():
            score += 2
        if EXPECTED_PREFIX.lower() in text.lower():
            score += 1
        if href.lower().endswith('.xpt') and 'demo' in href.lower():
            score += 2
        candidates.append(LinkCandidate(href=href, text=text, title=title, score=score, raw=raw))
    return candidates

cands = extract_link_candidates(_resp_preview.text)
print(f"Extracted {len(cands)} raw link candidates")
print("Top 5 by raw order:")
for c in cands[:5]:
    print(" -", c.href[:80], "|", c.text[:40], "| score", c.score)

Extracted 673 raw link candidates
Top 5 by raw order:
 - #content | Skip directly to site content | score 0.0
 - https://www.cdc.gov | Centers for Disease Control and Preventi | score 0.0
 - https://www.cdc.gov/nchs/ | National Center for Health Statistics | score 0.0
 - #nav-group-about-nhanes | plus icon | score 0.0
 - https://www.cdc.gov/nchs/nhanes/nhanes-story.htm | Video: The NHANES Story | score 0.0


## 6. Filter for Demographic Data File Link

In [8]:
def rank_demographic_links(candidates: t.List[LinkCandidate]) -> t.List[LinkCandidate]:
    # Only consider those with some base score & XPT extension
    filtered = [c for c in candidates if c.href and XPT_PATTERN.search(c.href)]
    # Additional penalty for suspiciously long query strings
    for c in filtered:
        if '?' in c.href:
            c.score -= 1
    # Sort descending by score then by shorter href length
    return sorted(filtered, key=lambda c: (-c.score, len(c.href)))

ranked = rank_demographic_links(cands)
print(f"Ranked {len(ranked)} candidate XPT links")
for r in ranked[:10]:
    print(f"Score {r.score:.1f} | {r.href} | text='{r.text[:30]}'")

Ranked 1 candidate XPT links
Score 13.0 | /Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt | text='DEMO_L Data [XPT - 2.5 MB]'


## 7. Normalize and Build Absolute URL

In [9]:
def normalize_and_dedupe(ranked: t.List[LinkCandidate], page_url: str) -> t.List[LinkCandidate]:
    seen = set()
    out: t.List[LinkCandidate] = []
    for c in ranked:
        abs_url = urljoin(page_url, c.href)
        if abs_url not in seen:
            seen.add(abs_url)
            out.append(LinkCandidate(href=abs_url, text=c.text, title=c.title, score=c.score, raw=c.raw))
    return out

normalized = normalize_and_dedupe(ranked, TARGET_URL)
print("After normalization & dedupe:", len(normalized))
for c in normalized[:5]:
    print(" *", c.href)

After normalization & dedupe: 1
 * https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt


## 8. Encapsulate Search Logic in Function

In [10]:
class DemographicFileNotFound(Exception):
    pass


def find_demographic_file_link(page_url: str = TARGET_URL) -> t.Dict[str, t.Any]:
    resp = fetch_page(page_url)
    validate_html_response(resp)
    candidates = extract_link_candidates(resp.text)
    ranked = rank_demographic_links(candidates)
    normalized = normalize_and_dedupe(ranked, page_url)
    if not normalized:
        raise DemographicFileNotFound("No DEMO XPT link candidates found")
    best = normalized[0]
    return {
        "url": best.href,
        "anchor_text": best.text,
        "title": best.title,
        "score": best.score,
        "all_candidates": [c.href for c in normalized],
    }

# Try the function (non-fatal if fails)
try:
    result_preview = find_demographic_file_link()
    print("Best candidate URL:", result_preview["url"])
except Exception as e:  # noqa: BLE001
    print("Lookup failed (will test later):", e)

Best candidate URL: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt


## 9. Unit Tests with Mocked HTML (pytest-like)

In [11]:
def _mock_response(html: str, status: int = 200) -> SimpleNamespace:
    return SimpleNamespace(status_code=status, text=html, headers={"Content-Type": "text/html"}, content=html.encode())

# Minimal mock test suite
def test_single_valid_link():
    html = '<a href="/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt">Demographics File</a>'
    cands = extract_link_candidates(html)
    ranked = rank_demographic_links(cands)
    normalized = normalize_and_dedupe(ranked, TARGET_URL)
    assert any("DEMO" in c.href.upper() for c in normalized)


def test_multiple_links_choose_demo():
    html = '\n'.join([
        '<a href="/some/other/file.XPT">Other File</a>',
        '<a href="/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt">DEMO File</a>',
        '<a href="/docs/manual.pdf">Manual</a>'
    ])
    cands = extract_link_candidates(html)
    ranked = rank_demographic_links(cands)
    normalized = normalize_and_dedupe(ranked, TARGET_URL)
    assert normalized[0].href.lower().endswith('.xpt')
    assert 'demo' in normalized[0].href.lower()


def test_no_links():
    html = '<html><body><p>No anchors here</p></body></html>'
    cands = extract_link_candidates(html)
    ranked = rank_demographic_links(cands)
    normalized = normalize_and_dedupe(ranked, TARGET_URL)
    assert normalized == []

# Run tests
test_single_valid_link()
test_multiple_links_choose_demo()
test_no_links()
print("Unit-style tests passed.")

Unit-style tests passed.


## 10. Live Integration Test (Guarded)

In [12]:
RUN_LIVE_TEST = False  # Set to True to enable the live integration test

if RUN_LIVE_TEST:
    live_result = find_demographic_file_link()
    print("Live link found:", live_result["url"])  
    assert live_result["url"].lower().endswith('.xpt')
else:
    print("Live integration test skipped. Set RUN_LIVE_TEST=True to run.")

Live integration test skipped. Set RUN_LIVE_TEST=True to run.


## 11. Optional: Download and Inspect File Header

In [17]:
import io
INSPECT_FILE = True  # Guard heavy download

if INSPECT_FILE:
    try:
        meta = find_demographic_file_link()
        file_url = meta['url']
        print("Downloading file header from:", file_url)
        file_resp = SESSION.get(file_url, timeout=60)
        file_resp.raise_for_status()
        # Load into pandas
        demo_df = pd.read_sas(io.BytesIO(file_resp.content), format='xport')
        print("Loaded DataFrame shape:", demo_df.shape)
        print("Columns:", list(demo_df.columns)[:15])
    except Exception as e:  # noqa: BLE001
        print("Failed to inspect file:", e)
else:
    print("File inspection skipped. Set INSPECT_FILE=True to enable.")

Downloading file header from: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt
Loaded DataFrame shape: (11933, 27)
Columns: ['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR', 'RIDAGEMN', 'RIDRETH1', 'RIDRETH3', 'RIDEXMON', 'RIDEXAGM', 'DMQMILIZ', 'DMDBORN4', 'DMDYRUSR', 'DMDEDUC2', 'DMDMARTZ']


## 12. Display Final Discovered Link

In [14]:
try:
    final_meta = find_demographic_file_link()
    print("Resolved Demographics XPT URL:\n", final_meta['url'])
    print("Anchor Text:", final_meta['anchor_text'])
    print("Score:", final_meta['score'])
    print("Total candidate URLs considered:", len(final_meta['all_candidates']))
except Exception as e:  # noqa: BLE001
    print("Final lookup failed:", e)

Resolved Demographics XPT URL:
 https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt
Anchor Text: DEMO_L Data [XPT - 2.5 MB]
Score: 13.0
Total candidate URLs considered: 1


## 13. Multi-Component Page & File Link Discovery

Goal: Starting from the Demographics component page (2021-2023 cycle), automatically locate and traverse the sibling component pages:
- Demographics Data page
- Examination Data page
- Laboratory Data page
- Dietary Data page
- Questionnaire Data page

Then extract every data file (.XPT) link listed on each of those pages.

Approach:
1. Parse the initial page for navigation or panel links referencing target components (Demographics, Examination, Laboratory, Dietary, Questionnaire).
2. Normalize and fetch each component page.
3. Extract all `.xpt` links + associated anchor text, scoring & grouping.
4. Produce a structured dictionary: `{component: [{file_url, anchor_text, score}]}`.

Assumptions:
- Component links contain keywords: `Examination`, `Laboratory`, `Demographics`, `Dietary`, `Questionnaire`.
- Data file links end with `.XPT` or `.xpt`.
- Relative URLs are resolved with `urljoin`.

Fallback: If direct component page links not found, attempt heuristic construction (replacing `Component=Demographics` with target component name) and validate with HTTP 200.

In [None]:
import re
from urllib.parse import urlparse

COMPONENT_KEYWORDS = {
    'Demographics': re.compile(r'demograph', re.I),
    'Examination': re.compile(r'examin', re.I),
    'Laboratory': re.compile(r'laborat', re.I),
    'Dietary': re.compile(r'diet', re.I),
    'Questionnaire': re.compile(r'question', re.I),
}

XPT_PATTERN = re.compile(r'\.xpt$', re.I)


def discover_component_pages(html: str, base_url: str) -> dict:
    """Discover component page URLs (Demographics, Examination, Laboratory, Dietary, Questionnaire) from a base component page HTML.
    Strategy:
      - Scan all anchors for keyword matches.
      - Prefer links that contain 'Data' or 'data' in text or query string.
      - Normalize & dedupe.
      - Heuristic fallback: construct URLs by replacing Component=Demographics with other component names if missing.
    Returns: {component: url or None}
    """
    soup = BeautifulSoup(html, 'html.parser')
    found = {k: None for k in COMPONENT_KEYWORDS}
    anchors = soup.find_all('a')

    def qualify(a_text: str, href: str, comp: str) -> int:
        score = 0
        if not href:
            return -1
        if COMPONENT_KEYWORDS[comp].search(a_text or '') or COMPONENT_KEYWORDS[comp].search(href):
            score += 5
        if 'Component=' in href and comp in href:
            score += 5
        if 'Data' in a_text or 'data' in a_text:
            score += 2
        if 'Data' in href or 'data' in href:
            score += 1
        return score

    candidates = {k: [] for k in COMPONENT_KEYWORDS}
    for a in anchors:
        text = (a.get_text() or '').strip()
        href = a.get('href')
        if not href:
            continue
        for comp in COMPONENT_KEYWORDS:
            s = qualify(text, href, comp)
            if s > 0:
                full = urljoin(base_url, href)
                candidates[comp].append((s, text, full))

    for comp, rows in candidates.items():
        if rows:
            rows.sort(key=lambda r: (-r[0], len(r[2])))
            found[comp] = rows[0][2]

    parsed = urlparse(base_url)
    qs = parsed.query
    if 'Component=Demographics' in qs:
        for comp in COMPONENT_KEYWORDS:
            if found[comp] is None:
                heuristic = base_url.replace('Component=Demographics', f'Component={comp}')
                try:
                    r = requests.get(heuristic, timeout=10)
                    if r.status_code == 200 and COMPONENT_KEYWORDS[comp].search(r.text):
                        found[comp] = heuristic
                except Exception:
                    pass
    return found


def extract_xpt_links(page_url: str) -> list:
    """Fetch a component page and return list of dicts with XPT file link details."""
    try:
        r = requests.get(page_url, timeout=20)
        if r.status_code != 200:
            return []
    except Exception:
        return []
    soup = BeautifulSoup(r.text, 'html.parser')
    out = []
    for a in soup.find_all('a'):
        href = a.get('href')
        if not href:
            continue
        if XPT_PATTERN.search(href):
            full = urljoin(page_url, href)
            text = (a.get_text() or '').strip()
            score = 0
            upper_href = href.upper()
            if 'DEMO' in upper_href:
                score += 3
            if 'LAB' in upper_href:
                score += 2
            if 'EXAM' in upper_href or 'BMX' in upper_href:
                score += 2
            if 'DIET' in upper_href or 'DR1' in upper_href or 'DR2' in upper_href:
                score += 2
            if 'QQ' in upper_href or 'Q' in upper_href:
                score += 1
            if 'Questionnaire' in text:
                score += 1
            out.append({
                'file_url': full,
                'anchor_text': text,
                'raw_href': href,
                'score': score
            })
    dedup = {}
    for rec in out:
        u = rec['file_url']
        if u not in dedup or rec['score'] > dedup[u]['score']:
            dedup[u] = rec
    return list(dedup.values())


def aggregate_component_files(component_pages: dict) -> dict:
    result = {}
    for comp, url in component_pages.items():
        if not url:
            result[comp] = []
            continue
        files = extract_xpt_links(url)
        for f in files:
            f['component'] = comp
        result[comp] = files
    return result

In [None]:
BASE_DEMO_URL = "https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Demographics&Cycle=2021-2023"

print("Fetching base demographics page ...")
resp = requests.get(BASE_DEMO_URL, timeout=20)
print(f"Status: {resp.status_code}")
component_pages = {}
if resp.status_code == 200:
    component_pages = discover_component_pages(resp.text, BASE_DEMO_URL)
else:
    print("Failed to fetch base page; aborting component discovery.")

print("Discovered component pages:")
for k, v in component_pages.items():
    print(f"  {k}: {v}")

print("\nExtracting XPT file links per component ...")
component_files = aggregate_component_files(component_pages)

# Summarize counts
for comp, files in component_files.items():
    print(f"{comp}: {len(files)} XPT link(s)")

# Flatten for DataFrame if pandas available
try:
    import pandas as pd
    flat_rows = []
    for comp, files in component_files.items():
        for rec in files:
            flat_rows.append({
                'component': comp,
                'file_url': rec['file_url'],
                'anchor_text': rec['anchor_text'],
                'score': rec['score']
            })
    if flat_rows:
        df_files = pd.DataFrame(flat_rows).sort_values(['component','score'], ascending=[True, False])
        display(df_files.head(50))
    else:
        print("No file links discovered.")
except Exception as e:
    print("Pandas not available or error creating DataFrame:", e)

Fetching base demographics page ...
Status: 200
Discovered component pages:
  Demographics: https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Demographics
  Examination: https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Examination
  Laboratory: https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Laboratory

Extracting XPT file links per component ...
Demographics: 12 XPT link(s)
Examination: 182 XPT link(s)
Laboratory: 749 XPT link(s)


Unnamed: 0,component,file_url,anchor_text,score
0,Demographics,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...,DEMO_D Data [XPT - 3.4 MB],3
1,Demographics,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...,DEMO_E Data [XPT - 3.3 MB],3
2,Demographics,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...,DEMO_C Data [XPT - 3.4 MB],3
3,Demographics,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...,DEMO_B Data [XPT - 3.1 MB],3
4,Demographics,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/1...,DEMO Data [XPT - 11 MB],3
5,Demographics,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...,DEMO_F Data [XPT - 3.5 MB],3
6,Demographics,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...,DEMO_G Data [XPT - 3.6 MB],3
7,Demographics,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...,DEMO_H Data [XPT - 3.7 MB],3
8,Demographics,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...,DEMO_I Data [XPT - 3.6 MB],3
9,Demographics,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...,DEMO_J Data [XPT - 3.3 MB],3


In [None]:
# Optional: export manifest to JSON for reuse
import json, time
manifest = {
    'cycle': '2021-2023',
    'timestamp': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime()),
    'component_pages': component_pages,
    'files': component_files,
    'total_files': sum(len(v) for v in component_files.values()),
    'components_covered': list(component_files.keys()),
}

output_path = 'component_files_manifest_2021_2023_all_components.json'
try:
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(manifest, f, indent=2)
    print(f"Manifest written to {output_path} (components: {len(component_pages)}; total files: {manifest['total_files']})")
except Exception as e:
    print("Failed to write manifest:", e)

print('Components in manifest:', list(component_files.keys()))

Manifest written to component_files_manifest_2021_2023.json (components: 3; total files: 943)
Components in manifest: ['Demographics', 'Examination', 'Laboratory']


In [21]:
manifest


{'cycle': '2021-2023',
 'timestamp': '2025-09-13T22:48:56Z',
 'component_pages': {'Demographics': 'https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Demographics',
  'Examination': 'https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Examination',
  'Laboratory': 'https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Laboratory'},
 'files': {'Demographics': [{'file_url': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2005/DataFiles/DEMO_D.xpt',
    'anchor_text': 'DEMO_D Data [XPT - 3.4 MB]',
    'raw_href': '/Nchs/Data/Nhanes/Public/2005/DataFiles/DEMO_D.xpt',
    'score': 3,
    'component': 'Demographics'},
   {'file_url': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2007/DataFiles/DEMO_E.xpt',
    'anchor_text': 'DEMO_E Data [XPT - 3.3 MB]',
    'raw_href': '/Nchs/Data/Nhanes/Public/2007/DataFiles/DEMO_E.xpt',
    'score': 3,
    'component': 'Demographics'},
   {'file_url': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2003/DataFiles/DEMO_C.xpt'