# NHANES URL Pattern Testing Tool

This notebook provides diagnostic tools to test various URL patterns for accessing NHANES data files across different survey cycles. Use this to verify access patterns and troubleshoot specific components.

In [None]:
import pandas as pd
import requests
import io
from typing import List, Dict
import time

# Raise pandas' display limits so wide/long result tables are not truncated
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 20),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

## URL Pattern Tester Functions

In [None]:
def test_url(url: str, timeout: int = 10) -> Dict:
    """Download ``url`` with a GET request and summarize what came back.

    Args:
        url: Address of the file to fetch.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        A dict that always contains ``url``, ``status_code``, ``time``,
        ``size`` and ``success``. ``success`` is True only for HTTP 200 and
        says nothing about whether the body could be parsed. For a 200
        response the body is additionally parsed as a SAS XPORT file, adding
        either ``rows``/``columns``/``first_cols`` or ``parse_error``. If the
        request itself fails, ``status_code`` is the string ``"Error"`` and
        ``error`` holds the exception text.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # Emit the same base keys as the normal path so a DataFrame built
        # purely from failed requests still has "time" and "size" columns.
        return {
            "url": url,
            "status_code": "Error",
            "time": f"{time.perf_counter() - start_time:.2f}s",
            "size": "N/A",
            "error": str(e),
            "success": False,
        }
    elapsed = time.perf_counter() - start_time

    ok = response.status_code == 200
    result = {
        "url": url,
        "status_code": response.status_code,
        "time": f"{elapsed:.2f}s",
        "size": f"{len(response.content)/1024:.1f} KB" if ok else "N/A",
        "success": ok,
    }

    if ok:
        try:
            # Try to parse as XPT file
            df = pd.read_sas(io.BytesIO(response.content), format='xport')
        except Exception as e:
            # A 200 response whose body is not a valid XPT file is recorded
            # here rather than raised; "success" stays True.
            result["parse_error"] = str(e)
        else:
            result["rows"] = len(df)
            result["columns"] = len(df.columns)
            result["first_cols"] = list(df.columns)[:5]

    return result

def generate_url_patterns(cycle: str, component: str, letter: str = None) -> List[str]:
    """Build the candidate download URLs to try for one cycle/component.

    Args:
        cycle: Survey cycle such as ``'2017-2018'``.
        component: Data file code such as ``'DEMO'``.
        letter: Cycle suffix letter. When ``None`` it is looked up from the
            built-in cycle table; an unknown cycle yields an empty letter.

    Returns:
        Candidate URLs in the order they should be tried, without duplicates.
        When the letter is empty the ``_{letter}`` suffix is dropped entirely
        (``DEMO.XPT`` rather than ``DEMO_.XPT``).
    """
    # If letter is not provided, try to determine from cycle
    if letter is None:
        cycle_letter_map = {
            '2021-2022': 'L',
            # Kept in line with the table used by test_cycle_components.
            '2019-2020': 'K',
            '2017-2018': 'J',
            '2015-2016': 'I',
            '2013-2014': 'H',
            '2011-2012': 'G',
            '2009-2010': 'F',
            '2007-2008': 'E',
            '2005-2006': 'D',
            '2003-2004': 'C',
            '2001-2002': 'B',
            '1999-2000': 'A',
        }
        letter = cycle_letter_map.get(cycle, '')

    # split() returns the whole string when there is no '-', so this also
    # covers a bare year.
    cycle_year = cycle.split('-')[0]

    # Only add the underscore when there is a letter to attach.
    suffix = f"_{letter}" if letter else ""
    stem = f"{component}{suffix}"
    stem_lower = f"{component.lower()}{suffix}"
    base = "https://wwwn.cdc.gov/Nchs"

    patterns = [
        # 2021+ pattern with Public subdirectory
        f"{base}/Data/Nhanes/Public/{cycle_year}/DataFiles/{stem}.xpt",
        # Standard pattern (2007-2018)
        f"{base}/Nhanes/{cycle}/{stem}.XPT",
        # Lowercase variant
        f"{base}/Nhanes/{cycle}/{stem}.xpt",
        # Pre-2007 pattern (lowercase component)
        f"{base}/Nhanes/{cycle}/{stem_lower}.XPT",
        # Pre-2007 pattern (lowercase component and extension)
        f"{base}/Nhanes/{cycle}/{stem_lower}.xpt",
        # Alternative Data/Nhanes path
        f"{base}/Data/Nhanes/{cycle}/{stem}.XPT",
        # Variant with cycle year suffix
        f"{base}/Nhanes/{cycle}/{component}_{cycle[-2:]}.XPT",
        # Un-suffixed fallbacks.
        # NOTE(review): 1999-2000 files appear to be published without a
        # cycle letter (e.g. DEMO.XPT) - confirm against the CDC listing.
        f"{base}/Data/Nhanes/Public/{cycle_year}/DataFiles/{component}.xpt",
        f"{base}/Nhanes/{cycle}/{component}.XPT",
    ]

    # dict.fromkeys drops repeats (e.g. an already-lowercase component) while
    # keeping first-seen order, so no URL is requested twice.
    return list(dict.fromkeys(patterns))

def test_all_patterns(cycle: str, component: str, letter: str = None) -> pd.DataFrame:
    """Probe every candidate URL for a cycle/component and tabulate the outcomes.

    Each candidate from ``generate_url_patterns`` is fetched with ``test_url``;
    progress is printed as it goes and one result row per URL is returned.
    """
    candidate_urls = generate_url_patterns(cycle, component, letter)
    print(f"Testing {len(candidate_urls)} URL patterns for {cycle} {component}...")

    outcomes = []
    for candidate in candidate_urls:
        print(f"Testing: {candidate}")
        outcome = test_url(candidate)
        outcomes.append(outcome)

        # Report the failure case first, then fall through to the success line.
        if not outcome["success"]:
            print(f"✗ Failed with status {outcome['status_code']}")
            continue
        print("✓ SUCCESS! Found working URL pattern")

    # One row per tested URL
    return pd.DataFrame(outcomes)

## Test Specific Cycles and Components

In [None]:
# Newest cycle (2021-2022): probe all candidate URLs for the demographics file
demo_2021_results = test_all_patterns(cycle='2021-2022', component='DEMO', letter='L')
demo_2021_results.loc[:, ['url', 'status_code', 'success', 'size']]

In [None]:
# Mid-range cycle (2017-2018): probe all candidate URLs for the demographics file
demo_2017_results = test_all_patterns(cycle='2017-2018', component='DEMO', letter='J')
demo_2017_results.loc[:, ['url', 'status_code', 'success', 'size']]

In [None]:
# Oldest cycle (1999-2000): probe all candidate URLs for the demographics file
demo_1999_results = test_all_patterns(cycle='1999-2000', component='DEMO', letter='A')
demo_1999_results.loc[:, ['url', 'status_code', 'success', 'size']]

## Test Custom URL

In [None]:
def test_custom_url(url):
    """Test one URL with ``test_url`` and print a readable report.

    Args:
        url: Address to fetch.

    Returns:
        The result dict produced by ``test_url``, unchanged.
    """
    result = test_url(url)

    print(f"URL: {result['url']}")
    print(f"Status: {result['status_code']}")
    print(f"Success: {result['success']}")

    if result['success']:
        print(f"Size: {result['size']}")
        print(f"Rows: {result.get('rows', 'N/A')}")
        print(f"Columns: {result.get('columns', 'N/A')}")
        print(f"First few columns: {result.get('first_cols', 'N/A')}")
        # A 200 response can still carry a body that is not a valid XPT file;
        # show why the row/column details above are missing.
        if 'parse_error' in result:
            print(f"Parse error: {result['parse_error']}")
    else:
        # Only request-level failures carry an "error" key. For a plain HTTP
        # failure (e.g. 404) report the status instead of "Unknown error".
        fallback = f"HTTP status {result['status_code']}"
        print(f"Error: {result.get('error', fallback)}")

    return result

In [None]:
# Check a single hand-picked URL and show the detailed report
test_custom_url(url="https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt")

## Test Multiple Components for a Cycle

In [None]:
def test_cycle_components(cycle: str, components: List[str]) -> pd.DataFrame:
    """Find the first working URL for each component of a cycle.

    For every component the candidate URLs from ``generate_url_patterns`` are
    probed in order with HEAD requests; the first one answering HTTP 200 wins.

    Args:
        cycle: Survey cycle such as ``'2021-2022'``. Unknown cycles are
            probed with an empty suffix letter.
        components: Data file codes to check.

    Returns:
        DataFrame with one row per component and the columns ``component``,
        ``success`` and ``working_url`` (``None`` when nothing responded
        with 200). The columns are present even for an empty input list.
    """
    cycle_letter_map = {
        '2021-2022': 'L',
        '2019-2020': 'K',
        '2017-2018': 'J',
        '2015-2016': 'I',
        '2013-2014': 'H',
        '2011-2012': 'G',
        '2009-2010': 'F',
        '2007-2008': 'E',
        '2005-2006': 'D',
        '2003-2004': 'C',
        '2001-2002': 'B',
        '1999-2000': 'A',
    }
    letter = cycle_letter_map.get(cycle, '')

    results = []
    for component in components:
        component_results = {"component": component, "success": False, "working_url": None}

        for url in generate_url_patterns(cycle, component, letter):
            try:
                # requests.head does not follow redirects unless asked to;
                # follow them so the verdict matches what a GET would see.
                response = requests.head(url, timeout=10, allow_redirects=True)
            except requests.RequestException:
                # This candidate could not be reached; try the next one.
                # Anything else (e.g. KeyboardInterrupt) is left to propagate.
                continue

            if response.status_code == 200:
                component_results["success"] = True
                component_results["working_url"] = url
                break

        results.append(component_results)

    # Explicit columns keep the frame's shape stable for an empty input.
    return pd.DataFrame(results, columns=["component", "success", "working_url"])

In [None]:
# Component codes to probe in the per-cycle checks
components = [
    'DEMO', 'BMX', 'BPX',
    'TCHOL', 'GLU', 'DR1TOT',
    'PAQ', 'SMQ', 'ALQ',
]

# Newest cycle: find a working URL for each component
test_cycle_components(cycle='2021-2022', components=components)

In [None]:
# Oldest cycle: find a working URL for each component
test_cycle_components(cycle='1999-2000', components=components)

## URL Pattern Summary

In [None]:
def summarize_url_patterns():
    """Return a DataFrame describing the URL pattern used for each NHANES cycle range.

    Each row holds the cycle range, an example cycle, the URL template and a
    short note about that template.
    """
    column_names = ["cycle_range", "example", "pattern", "notes"]
    rows = [
        (
            "2021-present",
            "2021-2022",
            "https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/{YEAR}/DataFiles/{COMPONENT}_{LETTER}.xpt",
            "Newest format with Public subdirectory and lowercase xpt extension",
        ),
        (
            "2007-2018",
            "2017-2018",
            "https://wwwn.cdc.gov/Nchs/Nhanes/{CYCLE}/{COMPONENT}_{LETTER}.XPT",
            "Standard format with uppercase XPT extension",
        ),
        (
            "1999-2006",
            "1999-2000",
            "https://wwwn.cdc.gov/Nchs/Nhanes/{CYCLE}/{component_lower}_{LETTER}.xpt",
            "Older format often uses lowercase component names",
        ),
    ]

    return pd.DataFrame(rows, columns=column_names)

In [None]:
# Build the URL pattern summary table; as the last expression of the cell its
# value becomes the cell output.
summarize_url_patterns()

## Integration Test with PopHealth Observatory

In [None]:
from pophealth_observatory import NHANESExplorer

# Integration check: run the package's own loaders end-to-end for a few cycles.
# NOTE(review): NHANESExplorer comes from pophealth_observatory and is not
# defined in this notebook; its constructor is called without arguments here.

# Create instance of explorer
explorer = NHANESExplorer()

# Test various cycles (one recent, one mid-range, the oldest)
cycles_to_test = ['2021-2022', '2017-2018', '1999-2000']

# No error handling here: an exception raised for one cycle stops the loop.
for cycle in cycles_to_test:
    print(f"\n======= Testing {cycle} =======\n")
    
    # Test demographics
    # Presumably returns a pandas DataFrame - only len() and .columns are
    # relied on below; verify against the package.
    print(f"Testing demographics for {cycle}...")
    demo_df = explorer.get_demographics_data(cycle)
    print(f"Got demographics with {len(demo_df)} rows and {len(demo_df.columns)} columns\n")
    
    # Test body measures
    # Same assumption as above about the returned object.
    print(f"Testing body measures for {cycle}...")
    body_df = explorer.get_body_measures(cycle)
    print(f"Got body measures with {len(body_df)} rows and {len(body_df.columns)} columns\n")