# NHANES URL Pattern Testing Tool

This notebook provides diagnostic tools to test various URL patterns for accessing NHANES data files across different survey cycles. Use this to verify access patterns and troubleshoot specific components.

In [25]:
import pandas as pd
import requests
import io
from typing import List, Dict
import time

# Configure display options
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 1000)

## URL Pattern Tester Functions

In [26]:
def test_url(url: str, timeout: int = 10) -> Dict:
    """Test a single URL and return status information."""
    start_time = time.time()
    try:
        response = requests.get(url, timeout=timeout)
        elapsed = time.time() - start_time
        
        result = {
            "url": url,
            "status_code": response.status_code,
            "time": f"{elapsed:.2f}s",
            "size": f"{len(response.content)/1024:.1f} KB" if response.status_code == 200 else "N/A",
            "success": response.status_code == 200
        }
        
        if response.status_code == 200:
            try:
                # Try to parse as XPT file
                df = pd.read_sas(io.BytesIO(response.content), format='xport')
                result["rows"] = len(df)
                result["columns"] = len(df.columns)
                result["first_cols"] = list(df.columns)[:5]
            except Exception as e:
                result["parse_error"] = str(e)
        
        return result
    except Exception as e:
        return {"url": url, "status_code": "Error", "error": str(e), "success": False}

def generate_url_patterns(cycle: str, component: str, letter: str = None) -> List[str]:
    """Build the candidate download URLs for one cycle/component.

    Args:
        cycle: Survey cycle such as ``'2017-2018'``. A value without a dash
            is used unchanged as the year in the ``Public/{year}`` pattern.
        component: Component code such as ``'DEMO'``.
        letter: Cycle suffix letter (e.g. ``'J'``). When ``None`` it is
            looked up from ``cycle``; an unknown cycle yields an empty suffix.

    Returns:
        Seven URL strings, always in the same order (see inline comments).
    """
    # If letter is not provided, try to determine from cycle
    if letter is None:
        cycle_letter_map = {
            '2021-2022': 'L',
            # Included so this lookup agrees with the other cycle maps in
            # this notebook. NOTE(review): confirm CDC publishes 2019-2020
            # files under this suffix.
            '2019-2020': 'K',
            '2017-2018': 'J',
            '2015-2016': 'I',
            '2013-2014': 'H',
            '2011-2012': 'G',
            '2009-2010': 'F',
            '2007-2008': 'E',
            '2005-2006': 'D',
            '2003-2004': 'C',
            '2001-2002': 'B',
            '1999-2000': 'A'
        }
        letter = cycle_letter_map.get(cycle, '')

    # The "Public" layout is keyed by the first year of the cycle only.
    cycle_year = cycle.split('-')[0] if '-' in cycle else cycle

    patterns = [
        # Public subdirectory layout, lowercase extension
        f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{cycle_year}/DataFiles/{component}_{letter}.xpt",
        # Standard pattern (2007-2018)
        f"https://wwwn.cdc.gov/Nchs/Nhanes/{cycle}/{component}_{letter}.XPT",
        # Lowercase variant
        f"https://wwwn.cdc.gov/Nchs/Nhanes/{cycle}/{component}_{letter}.xpt",
        # Pre-2007 pattern (lowercase component)
        f"https://wwwn.cdc.gov/Nchs/Nhanes/{cycle}/{component.lower()}_{letter}.XPT",
        # Pre-2007 pattern (lowercase component and extension)
        f"https://wwwn.cdc.gov/Nchs/Nhanes/{cycle}/{component.lower()}_{letter}.xpt",
        # Alternative Data/Nhanes path
        f"https://wwwn.cdc.gov/Nchs/Data/Nhanes/{cycle}/{component}_{letter}.XPT",
        # Variant using the last two digits of the cycle as the suffix
        f"https://wwwn.cdc.gov/Nchs/Nhanes/{cycle}/{component}_{cycle[-2:]}.XPT",
    ]

    return patterns

def test_all_patterns(cycle: str, component: str, letter: str = None) -> pd.DataFrame:
    """Test all URL patterns for a given cycle/component and return results as DataFrame."""
    patterns = generate_url_patterns(cycle, component, letter)
    results = []
    
    print(f"Testing {len(patterns)} URL patterns for {cycle} {component}...")
    for url in patterns:
        print(f"Testing: {url}")
        result = test_url(url)
        results.append(result)
        if result["success"]:
            print(f"✓ SUCCESS! Found working URL pattern")
        else:
            print(f"✗ Failed with status {result['status_code']}")
    
    # Convert results to DataFrame
    df = pd.DataFrame(results)
    return df

## Test Specific Cycles and Components

In [27]:
# Test demographics for 2021-2022 cycle (newest)
demo_2021_results = test_all_patterns('2021-2022', 'DEMO', 'L')
demo_2021_results[['url', 'status_code', 'success', 'size']]

Testing 7 URL patterns for 2021-2022 DEMO...
Testing: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt


✓ SUCCESS! Found working URL pattern
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/DEMO_L.XPT
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/DEMO_L.xpt
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/DEMO_L.xpt
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/demo_L.XPT
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/demo_L.XPT
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/demo_L.xpt
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/demo_L.xpt
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Data/Nhanes/2021-2022/DEMO_L.XPT
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Data/Nhanes/2021-2022/DEMO_L.XPT
✗ Failed with status 404
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/DEMO_22.XPT
✗ Failed with status 404
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/DEMO_22.XPT
✗ Failed

Unnamed: 0,url,status_code,success,size
0,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...,200,True,2521.6 KB
1,https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/DEM...,503,False,
2,https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/DEM...,503,False,
3,https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/dem...,503,False,
4,https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/dem...,503,False,
5,https://wwwn.cdc.gov/Nchs/Data/Nhanes/2021-202...,404,False,
6,https://wwwn.cdc.gov/Nchs/Nhanes/2021-2022/DEM...,503,False,


In [28]:
# Test demographics for 2017-2018 cycle (mid-range)
demo_2017_results = test_all_patterns('2017-2018', 'DEMO', 'J')
demo_2017_results[['url', 'status_code', 'success', 'size']]

Testing 7 URL patterns for 2017-2018 DEMO...
Testing: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/DEMO_J.xpt
✓ SUCCESS! Found working URL pattern
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT
✓ SUCCESS! Found working URL pattern
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.XPT
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_J.xpt
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/demo_J.XPT
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/demo_J.xpt
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Data/Nhanes/2017-2018/DEMO_J.XPT
✗ Failed with status 404
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEMO_18.XPT
✗ Failed with status 503


Unnamed: 0,url,status_code,success,size
0,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...,200,True,3332.7 KB
1,https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEM...,503,False,
2,https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEM...,503,False,
3,https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/dem...,503,False,
4,https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/dem...,503,False,
5,https://wwwn.cdc.gov/Nchs/Data/Nhanes/2017-201...,404,False,
6,https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/DEM...,503,False,


In [29]:
# Test demographics for 1999-2000 cycle (oldest)
demo_1999_results = test_all_patterns('1999-2000', 'DEMO', 'A')
demo_1999_results[['url', 'status_code', 'success', 'size']]

Testing 7 URL patterns for 1999-2000 DEMO...
Testing: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/1999/DataFiles/DEMO_A.xpt
✗ Failed with status 404
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/DEMO_A.XPT
✗ Failed with status 404
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/DEMO_A.XPT
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/DEMO_A.xpt
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/DEMO_A.xpt
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/demo_A.XPT
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/demo_A.XPT
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/demo_A.xpt
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/demo_A.xpt
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Data/Nhanes/1999-2000/DEMO_A.XPT
✗ Failed with status 503
Testing: https://wwwn.cdc.gov/Nchs/Data/Nhanes/1999-2

Unnamed: 0,url,status_code,success,size
0,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/1...,404,False,
1,https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/DEM...,503,False,
2,https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/DEM...,503,False,
3,https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/dem...,503,False,
4,https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/dem...,503,False,
5,https://wwwn.cdc.gov/Nchs/Data/Nhanes/1999-200...,404,False,
6,https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/DEM...,503,False,


## Test Custom URL

In [30]:
def test_custom_url(url):
    """Test a single custom URL and display detailed results."""
    result = test_url(url)
    
    print(f"URL: {result['url']}")
    print(f"Status: {result['status_code']}")
    print(f"Success: {result['success']}")
    
    if result['success']:
        print(f"Size: {result['size']}")
        print(f"Rows: {result.get('rows', 'N/A')}")
        print(f"Columns: {result.get('columns', 'N/A')}")
        print(f"First few columns: {result.get('first_cols', 'N/A')}")
    else:
        print(f"Error: {result.get('error', 'Unknown error')}")
    
    return result

In [31]:
# Test a specific URL that you want to verify
test_custom_url("https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt")

URL: https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt
Status: 200
Success: True
Size: 2521.6 KB
Rows: 11933
Columns: 27
First few columns: ['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR']


{'url': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt',
 'status_code': 200,
 'time': '0.82s',
 'size': '2521.6 KB',
 'success': True,
 'rows': 11933,
 'columns': 27,
 'first_cols': ['SEQN', 'SDDSRVYR', 'RIDSTATR', 'RIAGENDR', 'RIDAGEYR']}

## Test Multiple Components for a Cycle

In [32]:
def test_cycle_components(cycle, components):
    """Test multiple components for a specific cycle and show working URLs."""
    cycle_letter_map = {
        '2021-2022': 'L',
        '2019-2020': 'K',
        '2017-2018': 'J',
        '2015-2016': 'I',
        '2013-2014': 'H',
        '2011-2012': 'G',
        '2009-2010': 'F',
        '2007-2008': 'E',
        '2005-2006': 'D',
        '2003-2004': 'C',
        '2001-2002': 'B',
        '1999-2000': 'A'
    }
    letter = cycle_letter_map.get(cycle, '')
    
    results = []
    for component in components:
        patterns = generate_url_patterns(cycle, component, letter)
        component_results = {"component": component, "success": False, "working_url": None}
        
        for url in patterns:
            try:
                response = requests.head(url, timeout=10)
                if response.status_code == 200:
                    component_results["success"] = True
                    component_results["working_url"] = url
                    break
            except:
                continue
                
        results.append(component_results)
    
    return pd.DataFrame(results)

In [33]:
components = ['DEMO', 'BMX', 'BPX', 'TCHOL', 'GLU', 'DR1TOT', 'PAQ', 'SMQ', 'ALQ']

# Test newest cycle components
test_cycle_components('2021-2022', components)

Unnamed: 0,component,success,working_url
0,DEMO,True,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...
1,BMX,True,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...
2,BPX,False,
3,TCHOL,True,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...
4,GLU,True,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...
5,DR1TOT,True,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...
6,PAQ,True,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...
7,SMQ,True,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...
8,ALQ,True,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2...


In [34]:
# Test older cycle components
test_cycle_components('1999-2000', components)

Unnamed: 0,component,success,working_url
0,DEMO,False,
1,BMX,False,
2,BPX,False,
3,TCHOL,False,
4,GLU,False,
5,DR1TOT,False,
6,PAQ,False,
7,SMQ,False,
8,ALQ,False,


## URL Pattern Summary

In [35]:
def summarize_url_patterns():
    """Return a table of the URL templates associated with each NHANES cycle range.

    Returns:
        DataFrame with the columns ``cycle_range``, ``example``, ``pattern``
        and ``notes``, one row per cycle range.
    """
    column_names = ["cycle_range", "example", "pattern", "notes"]

    rows = [
        (
            "2021-present",
            "2021-2022",
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{YEAR}/DataFiles/{COMPONENT}_{LETTER}.xpt",
            "Newest format with Public subdirectory and lowercase xpt extension",
        ),
        (
            "2007-2018",
            "2017-2018",
            "https://wwwn.cdc.gov/Nchs/Nhanes/{CYCLE}/{COMPONENT}_{LETTER}.XPT",
            "Standard format with uppercase XPT extension",
        ),
        (
            "1999-2006",
            "1999-2000",
            "https://wwwn.cdc.gov/Nchs/Nhanes/{CYCLE}/{component_lower}_{LETTER}.xpt",
            "Older format often uses lowercase component names",
        ),
    ]

    return pd.DataFrame(rows, columns=column_names)

In [36]:
summarize_url_patterns()

Unnamed: 0,cycle_range,example,pattern,notes
0,2021-present,2021-2022,https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{...,Newest format with Public subdirectory and low...
1,2007-2018,2017-2018,https://wwwn.cdc.gov/Nchs/Nhanes/{CYCLE}/{COMP...,Standard format with uppercase XPT extension
2,1999-2006,1999-2000,https://wwwn.cdc.gov/Nchs/Nhanes/{CYCLE}/{comp...,Older format often uses lowercase component names


## Local Data Storage & Management

This section outlines a system for working with locally stored NHANES data files. This approach is useful when:
1. You have intermittent internet connectivity
2. You want to ensure reproducibility with specific dataset versions
3. You want to avoid repeatedly downloading large datasets
4. You're experiencing issues with CDC's data servers

In [37]:
import os
import shutil
from pathlib import Path

# Root folder of the local NHANES store; one sub-folder per survey cycle is
# created below it by the helpers in this section.
DATA_ROOT = Path('data')  # Relative path: resolved against the kernel's working directory (usually the notebook's folder — TODO confirm)
# Create the root up front; exist_ok makes re-running this cell harmless.
os.makedirs(DATA_ROOT, exist_ok=True)

def setup_cycle_directories(cycles=None):
    """Make one folder per survey cycle underneath ``DATA_ROOT``.

    Args:
        cycles: Iterable of cycle names (e.g. ``'2017-2018'``). When ``None``
            a built-in selection of seven cycles is used.

    Prints one line per folder followed by a short summary; returns nothing.
    Existing folders are left as they are.
    """
    default_cycles = [
        '2021-2022',
        '2017-2018',
        '2015-2016',
        '2013-2014',
        '2011-2012',
        '2009-2010',
        '1999-2000',
    ]
    selected = default_cycles if cycles is None else cycles

    for name in selected:
        target = DATA_ROOT / name
        os.makedirs(target, exist_ok=True)
        print(f"Created directory: {target}")

    print(f"\nDirectory structure created at {DATA_ROOT.absolute()}")
    print("You can now download and store NHANES data files in these directories.")

# Create the directory structure
setup_cycle_directories()

Created directory: data\2021-2022
Created directory: data\2017-2018
Created directory: data\2015-2016
Created directory: data\2013-2014
Created directory: data\2011-2012
Created directory: data\2009-2010
Created directory: data\1999-2000

Directory structure created at c:\Users\User\Documents\NHANES\data
You can now download and store NHANES data files in these directories.


In [38]:
def download_and_save_file(url, cycle, component, timeout=30):
    """Download a file from URL and save it to the appropriate local directory.

    The file is stored as ``DATA_ROOT/<cycle>/<component><ext>``, where
    ``<ext>`` is the extension of the URL's path (``.XPT`` or ``.xpt``) and
    defaults to ``.xpt`` when the path has none. This is the file name that
    ``load_local_data`` probes first.

    Args:
        url: Address of the XPT file.
        cycle: Survey cycle, used as the sub-folder name.
        component: Component code, used as the file stem.
        timeout: Timeout in seconds forwarded to ``requests.get``.

    Returns:
        Path of the saved file, or ``None`` when the response status is not
        200 or any error occurs (the problem is printed, not raised).
    """
    # Local import keeps this cell independent of the notebook's import cell.
    from urllib.parse import urlsplit

    try:
        print(f"Downloading {component} for {cycle}...")
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            print(f"Failed to download: {url}")
            print(f"Status code: {response.status_code}")
            return None

        # Take the extension from the URL *path* only. Splitting the whole URL
        # would pick up the host name's dots (".gov/...") for extension-less
        # paths and would keep any query string inside the extension.
        ext = os.path.splitext(urlsplit(url).path)[-1]
        if not ext:
            ext = '.xpt'  # Default extension

        # Create filename and path (no separator between stem and extension)
        filename = f"{component}{ext}"
        save_dir = DATA_ROOT / cycle
        os.makedirs(save_dir, exist_ok=True)
        save_path = save_dir / filename

        # Save the file
        with open(save_path, 'wb') as f:
            f.write(response.content)

        print(f"✓ Successfully saved to {save_path}")
        return save_path

    except Exception as e:
        # Best effort by design: report the problem and let the caller try
        # the next URL.
        print(f"Error downloading/saving file: {e}")
        return None

def download_component_to_local(cycle, component, letter=None):
    """Store the first downloadable candidate URL for a component locally.

    Candidates come from ``generate_url_patterns``. Each is probed with a
    HEAD request; the first one answering 200 is handed to
    ``download_and_save_file``.

    Returns:
        The saved path from ``download_and_save_file``, or ``None`` if no
        candidate could be downloaded.
    """
    for candidate in generate_url_patterns(cycle, component, letter):
        try:
            print(f"Trying URL: {candidate}")
            probe = requests.head(candidate, timeout=10)

            if probe.status_code != 200:
                continue

            stored = download_and_save_file(candidate, cycle, component)
            if stored:
                return stored
        except Exception as problem:
            # A failing candidate must not abort the search.
            print(f"Error checking URL {candidate}: {problem}")
            continue

    print(f"Failed to download {component} for {cycle} from any URL pattern")
    return None

In [39]:
def list_local_data_files():
    """Report which XPT files are stored under each cycle folder of ``DATA_ROOT``.

    Returns:
        Dict mapping each sub-folder name that holds at least one ``.xpt``
        file (extension matched case-insensitively) to the list of those file
        names. Empty dict when ``DATA_ROOT`` does not exist. A summary is
        printed as a side effect.
    """
    if not os.path.exists(DATA_ROOT):
        print(f"Data directory {DATA_ROOT} does not exist")
        return {}

    available_files = {}
    for entry in os.listdir(DATA_ROOT):
        candidate_dir = DATA_ROOT / entry
        if not os.path.isdir(candidate_dir):
            continue  # stray file next to the cycle folders

        xpt_names = [name for name in os.listdir(candidate_dir)
                     if name.lower().endswith('.xpt')]
        if xpt_names:
            available_files[entry] = xpt_names

    # Print a summary
    total = sum(map(len, available_files.values()))
    print(f"Found {total} data files across {len(available_files)} cycles")
    for cycle, names in available_files.items():
        print(f"\n{cycle} ({len(names)} files):")
        print("\n".join(f"  - {name}" for name in names))

    return available_files

def load_local_data(cycle, component):
    """Load a local NHANES data file if it exists.

    Lookup order inside ``DATA_ROOT/<cycle>``:

    1. ``<component>.xpt`` / ``<component>.XPT``;
    2. otherwise the alphabetically first ``.xpt`` file (any case) whose stem
       is ``<component>`` or starts with ``<component>_`` (e.g. ``SMQ_J.xpt``).

    Args:
        cycle: Survey cycle / sub-folder name.
        component: Component code such as ``'DEMO'``.

    Returns:
        The DataFrame read with ``pd.read_sas``, or ``None`` when the folder
        or file is missing or the file cannot be parsed (a message is printed
        in each case).
    """
    cycle_dir = DATA_ROOT / cycle

    if not os.path.exists(cycle_dir):
        print(f"Cycle directory {cycle_dir} does not exist")
        return None

    filepath = None

    # Try the plain "<component><ext>" names first
    for ext in ['.xpt', '.XPT']:
        candidate = cycle_dir / f"{component}{ext}"
        if os.path.exists(candidate):
            filepath = candidate
            break

    if filepath is None:
        # Fall back to suffixed names. A bare prefix test is not enough:
        # "SMQ" must not pick up "SMQFAM_J.xpt", so the stem has to equal the
        # component or continue with an underscore. Sorting makes the choice
        # independent of the order os.listdir happens to return.
        wanted = component.lower()
        for name in sorted(os.listdir(cycle_dir)):
            stem, ext = os.path.splitext(name.lower())
            if ext == '.xpt' and (stem == wanted or stem.startswith(wanted + '_')):
                filepath = cycle_dir / name
                break

    if filepath is None:
        print(f"No local file found for {component} in cycle {cycle}")
        return None

    try:
        print(f"Loading local file: {filepath}")
        df = pd.read_sas(filepath, format='xport')
        print(f"✓ Successfully loaded {len(df)} rows and {len(df.columns)} columns")
        return df
    except Exception as e:
        # The file exists but is unreadable: report that, not "not found".
        print(f"Error loading {filepath}: {e}")
        return None

## Manual Data Download Instructions

To manually download NHANES data files from the CDC website:

1. Visit the main NHANES website: https://wwwn.cdc.gov/nchs/nhanes/Default.aspx
2. Select the survey cycle you need (e.g., 2021-2022)
3. Click "Data, Documentation, Codebooks"
4. Find the component you need (e.g., Demographics, Body Measures, etc.)
5. Download the .XPT data file
6. Save the file to the appropriate cycle directory in your local data folder:
   - `data/2021-2022/` for 2021-2022 files
   - `data/2017-2018/` for 2017-2018 files, etc.

For consistent naming, save files as `COMPONENT_[LETTER].xpt` (e.g., `DEMO_L.xpt` for 2021-2022 demographics)

In [40]:
# Check what data files we have locally
list_local_data_files()

Found 4 data files across 4 cycles

1999-2000 (1 files):
  - DEMO.xpt

2015-2016 (1 files):
  - DEMO.xpt

2017-2018 (1 files):
  - DEMO.xpt

2021-2023 (1 files):
  - DEMO.xpt


{'1999-2000': ['DEMO.xpt'],
 '2015-2016': ['DEMO.xpt'],
 '2017-2018': ['DEMO.xpt'],
 '2021-2023': ['DEMO.xpt']}

In [41]:
# Example: Download demographics data for a specific cycle
# Uncomment and run this cell to download a file

# cycle = '2021-2022'
# component = 'DEMO'
# download_component_to_local(cycle, component)

In [42]:
# Load and explore a local data file
# Uncomment and modify to load a specific component

# cycle = '2021-2022' 
# component = 'DEMO'
# df = load_local_data(cycle, component)

# if df is not None:
#     print(f"\nColumn names:")
#     for col in df.columns:
#         print(f"  - {col}")
#     
#     print(f"\nFirst 5 rows:")
#     display(df.head())

## Modified PopHealthObservatory with Local Data Support

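The code below sketches `ModifiedNHANESExplorer`, a variant of the PopHealthObservatory class that reads NHANES data from local files. It does not download anything itself: when no local file is found, it prints the folder where a manually downloaded file should be saved and returns an empty DataFrame.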

In [43]:
class ModifiedNHANESExplorer:
    """NHANES Explorer that prioritizes local data files.

    Files are looked up in ``<data_root>/<cycle>/`` and loaded frames are
    kept in ``data_cache`` so each file is parsed at most once per instance.
    Nothing is downloaded: a missing file results in a printed hint and an
    empty DataFrame.
    """

    def __init__(self, data_root='data'):
        """Initialize with path to data directory.

        Args:
            data_root: Folder that contains one sub-folder per survey cycle.
        """
        self.data_root = Path(data_root)
        # Maps "<cycle>_<component>" to the DataFrame loaded for it.
        self.data_cache = {}

        # Standard component names
        self.components = {
            'demographics': 'DEMO',
            'body_measures': 'BMX',
            'blood_pressure': 'BPX',
            'cholesterol': 'TCHOL',
            'diabetes': 'GLU',
            'dietary': 'DR1TOT',
            'physical_activity': 'PAQ',
            'smoking': 'SMQ',
            'alcohol': 'ALQ'
        }

        # Map survey cycle to NHANES file letter suffix
        self.cycle_suffix_map = {
            '2021-2022': 'L',
            '2019-2020': 'K',
            '2017-2018': 'J',
            '2015-2016': 'I',
            '2013-2014': 'H',
            '2011-2012': 'G',
            '2009-2010': 'F',
            '2007-2008': 'E',
            '2005-2006': 'D',
            '2003-2004': 'C',
            '2001-2002': 'B',
            '1999-2000': 'A'
        }

    @staticmethod
    def _find_local_file(cycle_dir, component):
        """Return the XPT file for ``component`` inside ``cycle_dir``, or None.

        A file matches when its extension is ``.xpt`` (any case) and its stem
        is the component code or continues with an underscore, compared
        case-insensitively. A bare prefix test would let ``SMQ`` pick up
        ``SMQFAM_J.xpt``. Candidates are visited in sorted order so the
        result does not depend on the order ``os.listdir`` returns.
        """
        if not os.path.isdir(cycle_dir):
            return None

        wanted = component.lower()
        for name in sorted(os.listdir(cycle_dir)):
            stem, ext = os.path.splitext(name.lower())
            if ext == '.xpt' and (stem == wanted or stem.startswith(wanted + '_')):
                return cycle_dir / name
        return None

    @staticmethod
    def _pick_columns(df, mapping):
        """Copy the columns of ``df`` named in ``mapping`` and rename them."""
        available = [c for c in mapping if c in df.columns]
        return df[available].copy().rename(columns={k: mapping[k] for k in available})

    def get_data(self, cycle, component):
        """Get data for a specific cycle and component, prioritizing local files.

        Args:
            cycle: Survey cycle / sub-folder name such as ``'2021-2022'``.
            component: Component code such as ``'DEMO'``.

        Returns:
            The cached or freshly loaded DataFrame; an empty DataFrame when
            no matching file exists or it cannot be parsed (empty results are
            not cached, so a later call retries).
        """
        key = f"{cycle}_{component}"

        # Return from cache if already loaded
        if key in self.data_cache:
            return self.data_cache[key]

        # Try to load from local storage first
        cycle_dir = self.data_root / cycle
        filepath = self._find_local_file(cycle_dir, component)

        if filepath is not None:
            try:
                print(f"Loading local file: {filepath}")
                df = pd.read_sas(filepath, format='xport')
                print(f"✓ Successfully loaded local data with {len(df)} rows")
                self.data_cache[key] = df
                return df
            except Exception as e:
                print(f"Error loading local file: {e}")

        # If we didn't find or couldn't load local data, inform the user
        print(f"No local data found for {component} in cycle {cycle}")
        print(f"Please download the data file from NHANES website and save to:")
        print(f"  {cycle_dir}")

        return pd.DataFrame()  # Return empty DataFrame

    def get_demographics_data(self, cycle):
        """Get demographics data with appropriate column renaming.

        Returns:
            DataFrame restricted to the known demographic variables that are
            present, renamed to readable names, plus ``gender_label`` and
            ``race_ethnicity_label`` when their source columns exist. Empty
            DataFrame when no data could be loaded.
        """
        demo_df = self.get_data(cycle, self.components['demographics'])

        if demo_df.empty:
            return demo_df

        # Map standard variable names to more readable ones
        demo_vars = {
            'SEQN': 'participant_id',
            'RIAGENDR': 'gender',
            'RIDAGEYR': 'age_years',
            'RIDRETH3': 'race_ethnicity',
            'DMDEDUC2': 'education',
            'INDFMPIR': 'poverty_ratio',
            'WTMEC2YR': 'exam_weight',
        }

        # Only use columns that are actually in the dataframe
        demo_clean = self._pick_columns(demo_df, demo_vars)

        # Add some useful derived columns; codes without a label map to NaN
        if 'gender' in demo_clean.columns:
            demo_clean['gender_label'] = demo_clean['gender'].map({1: 'Male', 2: 'Female'})

        if 'race_ethnicity' in demo_clean.columns:
            race_labels = {
                1: 'Mexican American',
                2: 'Other Hispanic',
                3: 'Non-Hispanic White',
                4: 'Non-Hispanic Black',
                6: 'Non-Hispanic Asian',
                7: 'Other/Multi-racial'
            }
            demo_clean['race_ethnicity_label'] = demo_clean['race_ethnicity'].map(race_labels)

        return demo_clean

    def get_body_measures(self, cycle):
        """Get body measurement data with appropriate column renaming.

        Returns:
            DataFrame restricted to the known body-measure variables that are
            present, renamed to readable names, plus ``bmi_category`` when a
            BMI column exists. Empty DataFrame when no data could be loaded.
        """
        bmx_df = self.get_data(cycle, self.components['body_measures'])

        if bmx_df.empty:
            return bmx_df

        body_vars = {
            'SEQN': 'participant_id',
            'BMXWT': 'weight_kg',
            'BMXHT': 'height_cm',
            'BMXBMI': 'bmi',
            'BMXWAIST': 'waist_cm'
        }

        body_clean = self._pick_columns(bmx_df, body_vars)

        # Add BMI category; right=False makes each bin include its lower edge,
        # i.e. [0, 18.5), [18.5, 25), [25, 30), [30, inf).
        if 'bmi' in body_clean.columns:
            body_clean['bmi_category'] = pd.cut(
                body_clean['bmi'],
                bins=[0, 18.5, 25, 30, float('inf')],
                labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
                right=False
            )

        return body_clean

In [44]:
# Example usage of the local data explorer
# Uncomment to test with your downloaded data files

# explorer = ModifiedNHANESExplorer()
# 
# # Test with data from a specific cycle
# cycle = '2021-2022'
# 
# # Get demographics data
# demo_df = explorer.get_demographics_data(cycle)
# if not demo_df.empty:
#     print(f"\nDemographics sample ({len(demo_df)} rows):")
#     display(demo_df.head())
# 
# # Get body measures data
# body_df = explorer.get_body_measures(cycle)
# if not body_df.empty:
#     print(f"\nBody measures sample ({len(body_df)} rows):")
#     display(body_df.head())

## Auto-Loading NHANES Data from Local Folder Structure

The functions below help automate loading data from the structured directory format, which includes:
- Main folders by cycle year (e.g., `2015-2016/`)
- Possible subfolders by data type (e.g., `Laboratory Data/`, `Questionnaire Data/`)
- XPT files with various naming patterns

In [45]:
def scan_data_directory(data_dir=None):
    """Recursively scan the data directory and index all XPT files.

    Args:
        data_dir: Folder to scan; defaults to ``DATA_ROOT``.

    Returns a dictionary with the following structure:
    {
        'cycle_year': {
            'category': {
                'component': 'path/to/file.xpt'
            }
        }
    }

    ``cycle_year`` is the first folder level below ``data_dir`` ("unknown"
    for files directly in ``data_dir``) and ``category`` the second level
    ("uncategorized" when there is none). ``component`` is the file stem with
    a trailing one-letter cycle suffix removed (``DEMO_J`` -> ``DEMO``);
    underscores elsewhere in the stem are kept (``P_DEMO``, ``L13_2_B`` ->
    ``L13_2``) so differently named files do not collapse onto one key.
    """
    if data_dir is None:
        data_dir = DATA_ROOT
    else:
        data_dir = Path(data_dir)

    data_index = {}

    # Walk through all directories and files
    for root, dirs, files in os.walk(data_dir):
        root_path = Path(root)

        # Filter for XPT files only
        xpt_files = [f for f in files if f.lower().endswith('.xpt')]

        if not xpt_files:
            continue

        # Determine which cycle this directory belongs to
        cycle_path = root_path.relative_to(data_dir)
        parts = cycle_path.parts

        if not parts:  # Files directly in data_dir
            cycle = "unknown"
            category = "uncategorized"
        elif len(parts) == 1:  # Files directly in a cycle folder
            cycle = parts[0]
            category = "uncategorized"
        else:  # Files in a subdirectory of a cycle folder
            cycle = parts[0]
            category = parts[1]

        # Add each file to the index (creating the nested dicts on demand)
        bucket = data_index.setdefault(cycle, {}).setdefault(category, {})
        for xpt_file in xpt_files:
            stem = os.path.splitext(xpt_file)[0]
            base, sep, suffix = stem.rpartition('_')
            # Strip only a trailing "_<letter>" cycle suffix (or a dangling
            # underscore). Cutting at the first underscore would turn
            # "P_DEMO" into "P" and "L13_2_B" into "L13".
            is_cycle_suffix = suffix == '' or (len(suffix) == 1 and suffix.isalpha())
            component = base if (sep and base and is_cycle_suffix) else stem
            bucket[component] = str(root_path / xpt_file)

    return data_index

# Build the index once at cell level; `data_index` stays available to later
# cells as {cycle: {category: {component: path}}}.
data_index = scan_data_directory()

# Print summary by cycle: the component count is summed over all categories
# of that cycle.
print(f"Found data files across {len(data_index)} cycles:")
for cycle, categories in data_index.items():
    total_files = sum(len(components) for components in categories.values())
    print(f"  {cycle}: {total_files} components across {len(categories)} categories")

Found data files across 4 cycles:
  1999-2000: 1 components across 1 categories
  2015-2016: 151 components across 5 categories
  2017-2018: 1 components across 1 categories
  2021-2023: 1 components across 1 categories


In [46]:
def get_component_file_paths(component, data_index=None):
    """Collect, per cycle, the indexed file path of one component.

    Args:
        component: Component code (e.g., 'DEMO', 'BMX'); compared
            case-insensitively against the indexed names.
        data_index: Index as produced by scan_data_directory(); scanned on
            demand when None.

    Returns:
        Dictionary of cycle -> file path. Within a category the first
        matching entry is used; when several categories of one cycle contain
        the component, the category visited last determines the path.
    """
    index = scan_data_directory() if data_index is None else data_index

    not_found = object()  # sentinel, so stored paths of any value are kept
    paths_by_cycle = {}

    for cycle, categories in index.items():
        for entries in categories.values():
            hit = next(
                (path for name, path in entries.items()
                 if name.upper() == component.upper()),
                not_found,
            )
            if hit is not not_found:
                paths_by_cycle[cycle] = hit

    return paths_by_cycle

def load_component_all_cycles(component, data_index=None):
    """Read one component's XPT file from every cycle where it is indexed.

    Progress and per-file failures are printed; a file that cannot be read is
    reported and left out of the result instead of aborting the whole load.

    Args:
        component: Component code (e.g., 'DEMO', 'BMX')
        data_index: Data index from scan_data_directory() or None to scan

    Returns:
        Dictionary of cycle -> DataFrame (empty when no file was found)
    """
    paths_by_cycle = get_component_file_paths(component, data_index)

    if len(paths_by_cycle) == 0:
        print(f"No data files found for component {component}")
        return {}

    print(f"Found {component} data in {len(paths_by_cycle)} cycles")

    frames = {}
    for cycle in paths_by_cycle:
        file_path = paths_by_cycle[cycle]
        try:
            print(f"Loading {component} from {cycle}...")
            df = pd.read_sas(file_path, format='xport')
            print(f"✓ Successfully loaded {len(df)} rows and {len(df.columns)} columns")
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
        else:
            # Only successfully parsed files make it into the result
            frames[cycle] = df

    return frames

# Demo: load the demographics (DEMO) component from every cycle found locally
demo_data = load_component_all_cycles(component='DEMO')

Found DEMO data in 4 cycles
Loading DEMO from 1999-2000...
✓ Successfully loaded 9965 rows and 144 columns
Loading DEMO from 2015-2016...
✓ Successfully loaded 9971 rows and 47 columns
Loading DEMO from 2017-2018...
✓ Successfully loaded 9254 rows and 46 columns
Loading DEMO from 2021-2023...
✓ Successfully loaded 11933 rows and 27 columns


In [50]:
class EnhancedNHANESExplorer:
    """NHANES explorer backed by whatever data files the directory scan finds locally."""

    def __init__(self):
        """Index the data directory, create an empty cache and print the cycles found."""
        self.data_index = scan_data_directory()
        self.data_cache = {}

        # Friendly component names -> NHANES component codes
        self.components = dict(
            demographics='DEMO',
            body_measures='BMX',
            blood_pressure='BPX',
            cholesterol='TCHOL',
            diabetes='GLU',
            dietary='DR1TOT',
            physical_activity='PAQ',
            smoking='SMQ',
            alcohol='ALQ',
        )

        # The cycles on offer are exactly the top-level keys of the scanned index
        self.available_cycles = sorted(self.data_index)

        print(f"Enhanced NHANES Explorer initialized with data from {len(self.available_cycles)} cycles")
        print(f"Available cycles: {', '.join(self.available_cycles)}")

    def get_component_data(self, component_code, cycle=None):
        """Return raw data for a component, for one cycle or for all of them.

        Successful loads are memoised in ``self.data_cache`` under
        ``"<component_code>_<cycle>"`` (or ``"..._all"`` when no cycle is given).

        Args:
            component_code: Component code (e.g., 'DEMO', 'BMX')
            cycle: Specific cycle to load from or None for all cycles

        Returns:
            A DataFrame when ``cycle`` is given (empty if missing or unreadable),
            otherwise a dictionary of cycle -> DataFrame
        """
        suffix = cycle if cycle else 'all'
        cache_key = f"{component_code}_{suffix}"
        if cache_key in self.data_cache:
            return self.data_cache[cache_key]

        paths = get_component_file_paths(component_code, self.data_index)
        if not paths:
            print(f"No data files found for component {component_code}")
            if cycle:
                return pd.DataFrame()
            return {}

        if not cycle:
            # All-cycles request: read every file, skipping the ones that fail
            frames = {}
            for cy, path in paths.items():
                try:
                    frames[cy] = pd.read_sas(path, format='xport')
                except Exception as e:
                    print(f"Error loading {path}: {e}")
            self.data_cache[cache_key] = frames
            return frames

        if cycle not in paths:
            print(f"No data file found for {component_code} in cycle {cycle}")
            return pd.DataFrame()

        path = paths[cycle]
        try:
            df = pd.read_sas(path, format='xport')
        except Exception as e:
            # Failed single-cycle reads are reported and not cached
            print(f"Error loading {path}: {e}")
            return pd.DataFrame()
        self.data_cache[cache_key] = df
        return df

    def _load_processed(self, component_key, process, cycle):
        """Fetch a component by friendly name and run ``process`` on each non-empty frame."""
        code = self.components[component_key]
        if not cycle:
            per_cycle = self.get_component_data(code)
            return {cy: process(df) for cy, df in per_cycle.items() if not df.empty}
        raw = self.get_component_data(code, cycle)
        # An empty frame is handed back untouched
        return raw if raw.empty else process(raw)

    def get_demographics_data(self, cycle=None):
        """Return cleaned demographics: a DataFrame for ``cycle``, else a dict over all cycles."""
        return self._load_processed('demographics', self._process_demographics, cycle)

    def _process_demographics(self, demo_df):
        """Select and rename the known demographic columns, then attach text labels."""
        rename_map = {
            'SEQN': 'participant_id',
            'RIAGENDR': 'gender',
            'RIDAGEYR': 'age_years',
            'RIDRETH3': 'race_ethnicity',
            'RIDRETH1': 'race_ethnicity_old',  # For older cycles
            'DMDEDUC2': 'education',
            'INDFMPIR': 'poverty_ratio',
            'WTMEC2YR': 'exam_weight',
        }

        # Restrict the mapping to columns this particular file actually has
        present = {raw: nice for raw, nice in rename_map.items() if raw in demo_df.columns}
        demo_clean = demo_df[list(present)].copy().rename(columns=present)

        if 'gender' in demo_clean.columns:
            demo_clean['gender_label'] = demo_clean['gender'].map({1: 'Male', 2: 'Female'})

        # Codes 1-4 mean the same thing in both race/ethnicity codings
        shared_labels = {
            1: 'Mexican American',
            2: 'Other Hispanic',
            3: 'Non-Hispanic White',
            4: 'Non-Hispanic Black',
        }
        if 'race_ethnicity' in demo_clean.columns:
            source = 'race_ethnicity'
            labels = {**shared_labels, 6: 'Non-Hispanic Asian', 7: 'Other/Multi-racial'}
        elif 'race_ethnicity_old' in demo_clean.columns:
            source = 'race_ethnicity_old'
            labels = {**shared_labels, 5: 'Other/Multi-racial'}
        else:
            source = None

        if source is not None:
            demo_clean['race_ethnicity_label'] = demo_clean[source].map(labels)

        return demo_clean

    def get_body_measures(self, cycle=None):
        """Return cleaned body measures: a DataFrame for ``cycle``, else a dict over all cycles."""
        return self._load_processed('body_measures', self._process_body_measures, cycle)

    def _process_body_measures(self, bmx_df):
        """Select and rename the known body-measure columns and bucket BMI into categories."""
        rename_map = {
            'SEQN': 'participant_id',
            'BMXWT': 'weight_kg',
            'BMXHT': 'height_cm',
            'BMXBMI': 'bmi',
            'BMXWAIST': 'waist_cm'
        }

        # Restrict the mapping to columns this particular file actually has
        present = {raw: nice for raw, nice in rename_map.items() if raw in bmx_df.columns}
        body_clean = bmx_df[list(present)].copy().rename(columns=present)

        if 'bmi' in body_clean.columns:
            # Left-closed bins: [0, 18.5), [18.5, 25), [25, 30), [30, inf)
            bmi_edges = [0, 18.5, 25, 30, float('inf')]
            bmi_names = ['Underweight', 'Normal', 'Overweight', 'Obese']
            body_clean['bmi_category'] = pd.cut(
                body_clean['bmi'], bins=bmi_edges, labels=bmi_names, right=False
            )

        return body_clean

In [51]:
# Build the explorer; constructing it scans the local data directory
enhanced_explorer = EnhancedNHANESExplorer()

# Cleaned demographics for every cycle that has a demographics file
all_demo_data = enhanced_explorer.get_demographics_data()

# Per-cycle summary: participant count, leading columns and race/ethnicity breakdown
for cycle, df in all_demo_data.items():
    print(f"\n{cycle} demographics: {len(df)} participants")
    print(f"Column names: {', '.join(df.columns[:5])}...")
    if 'race_ethnicity_label' not in df.columns:
        continue
    race_counts = df['race_ethnicity_label'].value_counts()
    print("Race/Ethnicity distribution:")
    for race, count in race_counts.items():
        # Percentages are relative to all participants in the cycle
        print(f"  {race}: {count} ({count/len(df)*100:.1f}%)")

Enhanced NHANES Explorer initialized with data from 4 cycles
Available cycles: 1999-2000, 2015-2016, 2017-2018, 2021-2023

1999-2000 demographics: 9965 participants
Column names: participant_id, gender, age_years, race_ethnicity_old, education...
Race/Ethnicity distribution:
  Mexican American: 3393 (34.0%)
  Non-Hispanic White: 3367 (33.8%)
  Non-Hispanic Black: 2228 (22.4%)
  Other Hispanic: 589 (5.9%)
  Other/Multi-racial: 388 (3.9%)

2015-2016 demographics: 9971 participants
Column names: participant_id, gender, age_years, race_ethnicity, race_ethnicity_old...
Race/Ethnicity distribution:
  Non-Hispanic White: 3066 (30.7%)
  Non-Hispanic Black: 2129 (21.4%)
  Mexican American: 1921 (19.3%)
  Other Hispanic: 1308 (13.1%)
  Non-Hispanic Asian: 1042 (10.5%)
  Other/Multi-racial: 505 (5.1%)

2017-2018 demographics: 9254 participants
Column names: participant_id, gender, age_years, race_ethnicity, race_ethnicity_old...
Race/Ethnicity distribution:
  Non-Hispanic White: 3150 (34.0%)
  No

In [52]:
# Function to merge multiple components for a specific cycle
def create_merged_dataset(explorer, cycle, components=None):
    """Create a merged dataset from multiple components for a cycle.

    Demographics always form the base table, whether or not 'demographics'
    appears in ``components``. Every other component is fetched through the
    explorer method named ``get_<component>`` and left-joined onto the base on
    'participant_id', so the result keeps one row per demographics row.

    Args:
        explorer: Object providing ``get_demographics_data(cycle)`` and, for
            each extra component, a ``get_<component>(cycle)`` method that
            returns a DataFrame containing a 'participant_id' column.
        cycle: Survey cycle to build the dataset for (e.g., '2015-2016').
        components: Component names to include; defaults to demographics,
            body_measures and blood_pressure. The passed list is not modified.

    Returns:
        The merged DataFrame, or an empty DataFrame when the cycle has no
        demographics data.
    """
    if components is None:
        components = ['demographics', 'body_measures', 'blood_pressure']

    print(f"Creating merged dataset for {cycle}...")

    # Demographics is loaded separately as the base, so only the remaining
    # names need processing. Filtering into a new list (instead of inserting
    # into `components`) leaves the caller's list untouched.
    extra_components = [name for name in components if name != 'demographics']

    demo_df = explorer.get_demographics_data(cycle)
    if demo_df.empty:
        print(f"No demographics data available for {cycle}")
        return pd.DataFrame()

    merged = demo_df.copy()

    for component in extra_components:
        loader = getattr(explorer, f"get_{component}", None)
        if not callable(loader):
            # Report instead of silently dropping a requested component
            print(f"Skipping {component}: explorer has no get_{component} method")
            continue

        component_df = loader(cycle)
        if component_df.empty:
            print(f"No {component} data available for {cycle}")
            continue

        merged = merged.merge(component_df, on='participant_id', how='left')
        print(f"Added {component} data ({len(component_df)} rows)")

    print(f"Merged dataset created with {len(merged)} participants and {len(merged.columns)} variables")
    return merged

# Demo: build the merged table for 2015-2016, but only if that cycle was found locally
if '2015-2016' in enhanced_explorer.available_cycles:
    merged_data = create_merged_dataset(enhanced_explorer, cycle='2015-2016')

    # Preview the leading rows unless the merge came back empty
    if not merged_data.empty:
        print("\nSample of merged dataset:")
        display(merged_data.head())

Creating merged dataset for 2015-2016...
Added body_measures data (9544 rows)
Merged dataset created with 9971 participants and 15 variables

Sample of merged dataset:


Unnamed: 0,participant_id,gender,age_years,race_ethnicity,race_ethnicity_old,education,poverty_ratio,exam_weight,gender_label,race_ethnicity_label,weight_kg,height_cm,bmi,waist_cm,bmi_category
0,83732.0,1.0,62.0,3.0,3.0,5.0,4.39,135629.507405,Male,Non-Hispanic White,94.8,184.5,27.8,101.1,Overweight
1,83733.0,1.0,53.0,3.0,3.0,3.0,1.32,25282.425927,Male,Non-Hispanic White,90.4,171.4,30.8,107.9,Obese
2,83734.0,1.0,78.0,3.0,3.0,3.0,1.51,12575.838818,Male,Non-Hispanic White,83.4,170.1,28.8,116.5,Overweight
3,83735.0,2.0,56.0,3.0,3.0,5.0,5.0,102078.634508,Female,Non-Hispanic White,109.8,160.9,42.4,110.1,Obese
4,83736.0,2.0,42.0,4.0,4.0,4.0,1.23,18234.736219,Female,Non-Hispanic Black,55.2,164.9,20.3,80.4,Normal
