Data obtained from:
https://www.sciencedirect.com/science/article/abs/pii/S0957417420302323?casa_token=KAfym3lH_REAAAAA:VvMAXW-RJ7-ZNRJdoEhmO5zAdSXdmx4t6hCY1kifJYm_Q98DOlWMDqv6GL8ZFNDdrKfB-TRb08Q#sec0002

In [41]:
import pandas as pd

import os
import requests
from bs4 import BeautifulSoup

In [None]:
attribute_columns = [
    "Gender", # Demographic: "Demographic Variables & Sample Weights" - RIAGENDR
    "Age", # Demographic: "Demographic Variables & Sample Weights" - RIDAGEMN (in months)


    "Systolic", # Examination: "Blood Pressure" - BPXSAR (BPXSY1, BPXSY2, BPXSY3, BPXSY4 ?)
    "Diastolic", # Examination: "Blood Pressure" - BPXDAR (BPXDI1, BPXDI2, BPXDI3, BPXDI4 ?)

    "Weight", # Examination: "Body Measures" - BMIWT
    "Body mass index", # Examination: "Body Measures" - BMXBMI


    "White blood cells", # Laboratory: "LAB25" - LBXWBCSI
    "Basophils", # Laboratory: "LAB25" - LBDBANO (LBXBAPCT ?)
    "Red blood cells", # Laboratory: "LAB25" - LBXRBCSI
    "Hemoglobin", # Laboratory: "LAB25" - LBXHGB
    "Platelet count", # Laboratory: "LAB25" - LBXPLTSI
    "Mean volume of platelets", # Laboratory: "LAB25" - LBXMPSI
    "Red blood cell width", # Laboratory: "LAB25" - LBXRDW

    "Creatinine", # Laboratory: "LAB18" - LBDSCRSI # there is a bunch of different Creatinines
    "Glucose", # Laboratory: "LAB18" - LBXSGL # there is a bunch of different Glucoses
    "Gamma-glutamyl transferase (GGT)", # Laboratory: "LAB18" - LBXSGTSI
    "Iron", # Laboratory: "LAB18" - LBDSIRSI # bunch of different Irons
    "Lactate dehydrogenase (LDH)", # Laboratory: "LAB18" - LBXSLDSI
    "Phosphorus", # Laboratory: "LAB18" - LBDSPHSI # bunch of different ones
    "Bilirubin", # Laboratory: "LAB18" - LBDSTBSI # bunch of different ones
    "Protein", # Laboratory: "LAB18" - LBDSTPSI # bunch of different ones
    "Uric acid", # Laboratory: "LAB18" - LBDSUASI # bunch of different ones
    "Triglycerides", # Laboratory: "LAB18" - LBDSTRSI # bunch of different ones
    "Albumin", # Laboratory: "LAB18" - LBDSALSI 
    "Alkaline phosphatase (ALP)", # Laboratory: "LAB18" - LBXSAPSI
    "Aspartate aminotransferase (AST)", # Laboratory: "LAB18" - LBXSASSI
    "Alanine aminotransferase (ALT)", # Laboratory: "LAB18" - LBXSATSI

    "High-density lipoprotein (HDL)", # Laboratory: "Lab13" - LBDHDLSI #???
    "Cholesterol", # Laboratory: "Lab13" - LBDTCSI

    "Glycohemoglobin", # Laboratory: "LAB10" - LBXGH


    "Vigorous-work", # Questionnaire: "Physical Activity" - PAD200
    "Moderate-work", # Questionnaire: "Physical Activity" - PAD120

    "Diabetes", # Questionnaire: "Diabetes" - DIQ010

    "Blood related diabetes", # Questionnaire: "Medical Conditions" - MCQ250A
    "Blood related stroke", # Questionnaire: "Medical Conditions" - MCQ250F
    "Coronary heart Disease" # Questionnaire: "Medical Conditions" - MCQ160C
]


In [49]:
def download_specific_files(url, output_folder, filename):
    """
    Downloads specific files hyperlinked on a webpage.
    
    Args:
        url (str): The URL of the webpage to scrape.
        output_folder (str): The folder to save the downloaded files.
        filenames (list): A list of specific file names to download.
    """
    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    links = soup.find_all('a', href=True)

    for link in links:
        href = link['href']
        
        # Resolve relative URLs to absolute URLs
        file_url = href if href.startswith('http') else requests.compat.urljoin(url, href)
        
        # Extract the file name from the URL
        file_name = file_url.split('/')[-1]

        if file_name == filename:
            file_path = os.path.join(output_folder, file_name)
            try:
                file_response = requests.get(file_url, stream=True)
                file_response.raise_for_status() 
                
                # Save the file
                with open(file_path, 'wb') as file:
                    for chunk in file_response.iter_content(chunk_size=8192):
                        file.write(chunk)

                return
            except Exception as e:
                print(f"Failed to download {file_url}: {e}")
                raise e
    raise Exception
                


#### Files that contain all of our attributes:
- Demographic: Demographic Variables & Sample Weights - DEMO.xpt
- Examination: Blood Pressure - BPX.xpt
- Examination: Body Measures - BMX.xpt
- Laboratory: Lab25 - LAB25.xpt
- Laboratory: l18_2_00 - LAB18.xpt
- Laboratory: LAB18 - LAB18.xpt
- Laboratory: Lab13 - LAB13.xpt
- Laboratory: l10_2_00 - LAB10.xpt
- Questionnaire: Physical Activity - PAQ.xpt
- Questionnaire: Diabetes - DIQ.xpt
- Questionnaire: Medical Conditions - MCQ.xpt

In [None]:
# 1999-2000
file_dictionary = {
    'Demographics': ["DEMO.xpt"],
    "Examination": ["BPX.xpt", "BMX.xpt"],
    "Laboratory": ["LAB25.xpt", "LAB18.xpt", "LAB13.xpt", "LAB18.xpt", "LAB10.xpt"],
    "Questionnaire": ["PAQ.xpt", "DIQ.xpt", "MCQ.xpt"]
}

# 2001-2002: B
# 2003-2004: C
file_dictionary = {
    'Demographics': ["DEMO_E.xpt"],
    "Examination": ["BPX_E.xpt", "BMX_E.xpt"],
    "Laboratory": ["L25_D.xpt", "L10_D.xpt", "L13_D.xpt", "L40_D.xpt"],
    "Questionnaire": ["PAQ_E.xpt", "DIQ_E.xpt", "MCQ_E.xpt"]
}

# 2005-2006: D
# 2007-2008: E
# 2009-2010: F
# 2011-2012: G
# 2013-2014: H
# 2015-2016: I
file_dictionary = {
    'Demographics': ["DEMO_I.xpt"],
    "Examination": ["BPX_I.xpt", "BMX_I.xpt"],
    "Laboratory": ["BIOPRO_I.xpt", "CBC_I.xpt", "GHB_I.xpt", "HDL_I.xpt", 'TCHOL_I.xpt'],
    "Questionnaire": ["PAQ_I.xpt", "DIQ_I.xpt", "MCQ_I.xpt"]
}

In [None]:
webpage_url = 'https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component={}&CycleBeginYear=2015'
output_directory = 'data_test'

for directory, files in file_dictionary.items():
    print(f'Reading directory: "{directory}"')
    formatted_url = webpage_url.format(directory)
    for file in files:
        print(f' - Reading in file: "{file}"', end="")
        try:
            download_specific_files(formatted_url, output_directory, file)
            print(' [SUCCESS]')
        except Exception as e:
            print(' [FAIL]')

Reading directory: "Demographics"
 - Reading in file: "DEMO_I.xpt" [SUCCESS]
Reading directory: "Examination"
 - Reading in file: "BPX_I.xpt" [SUCCESS]
 - Reading in file: "BMX_I.xpt" [SUCCESS]
Reading directory: "Laboratory"
 - Reading in file: "BIOPRO_I.xpt" [SUCCESS]
 - Reading in file: "CBC_I.xpt" [SUCCESS]
 - Reading in file: "GHB_I.xpt" [SUCCESS]
 - Reading in file: "HDL_I.xpt" [SUCCESS]
 - Reading in file: "TCHOL_I.xpt" [SUCCESS]
Reading directory: "Questionnaire"
 - Reading in file: "PAQ_I.xpt" [SUCCESS]
 - Reading in file: "DIQ_I.xpt" [SUCCESS]
 - Reading in file: "MCQ_I.xpt" [SUCCESS]


###  Check Attributes

In [147]:
import json

def check_attributes(directory, year_key):
    file_path = "data_attributes.json"

    # Read the dictionary from the JSON file
    with open(file_path, "r") as file:
        data_attribute_dict = json.load(file)
        
    attribute_tracker = {attribute:0 for attribute in data_attribute_dict[year_key]}

    for file_name in os.listdir(directory):
        try:
            file_path = os.path.join(directory, file_name)

            file_df = pd.read_sas(file_path, format='xport')
            
            for column in file_df.columns:
                if column in attribute_tracker:
                    attribute_tracker[column] += 1

        except Exception as e:
            print('FAILED\n')

    for attribute, count in attribute_tracker.items():
        if count == 0:
            print(f'FAILED: {attribute}')


In [148]:
file_path = "data_attributes.json"

# Read the dictionary from the JSON file
with open(file_path, "r") as file:
    data_attribute_dict = json.load(file)

len(data_attribute_dict['2001-2002'])

36

In [197]:
check_attributes('data/1999_2000/', '1999-2000')
check_attributes('data/2001_2002/', '2001-2002')
check_attributes('data/2003_2004/', '2003-2004')
check_attributes('data/2005_2006/', '2005-2006')
check_attributes('data/2007_2008/', '2007-2008')
check_attributes('data/2009_2010/', '2009-2010')
check_attributes('data/2011_2012/', '2011-2012')
check_attributes('data/2013_2014/', '2013-2014')
check_attributes('data/2015_2016/', '2015-2016')









FAILED: MCQ250F
FAILED: MCQ250F
FAILED: MCQ250F
FAILED: MCQ250F
FAILED: MCQ250F
FAILED: MCQ250F


### Now select only the features we are interested in

In [None]:
file_selected_attributes_dictionary = {
    "DEMO.xpt": ['RIAGENDR', 'RIDAGEMN'],
    "BPX.xpt": ['BPXSAR', 'BPXDAR'],
    "BMX.xpt": ['BMIWT', 'BMXBMI'],
    "LAB25.xpt": ['LBXWBCSI', 'LBDBANO', 'LBXRBCSI', 'LBXHGB', 'LBXPLTSI', 'LBXMPSI', 'LBXRDW'],
    "LAB18.xpt": ['LBDSCRSI', 'LBXSGL', 'LBXSGTSI', 'LBDSIRSI', 'LBXSLDSI', 'LBDSPHSI', 'LBDSTBSI', \
                  'LBDSTPSI', 'LBDSUASI', 'LBDSTRSI', 'LBDSALSI', 'LBXSAPSI',  'LBXSASSI', 'LBXSATSI'],
    "LAB13.xpt": ['LBDHDLSI', 'LBDTCSI'],
    "LAB10.xpt": ['LBXGH'],
    "PAQ.xpt": ['PAD200', 'PAD120'], 
    "DIQ.xpt": ['DIQ010'], 
    "MCQ.xpt": ['MCQ250A', 'MCQ250F', 'MCQ160C']
}

In [102]:
def extract_attributes(directory):
    for file_name in os.listdir(directory):
        try:
            file_path = os.path.join(directory, file_name)
            selected_attributes = file_selected_attributes_dictionary[file_name]

            file_df = pd.read_sas(file_path, format='xport')
            print(selected_attributes)
            print(file_path)
            file_df[selected_attributes]
            print('SUCCESS\n')
        except Exception as e:
            print('FAILED\n')





In [103]:
extract_attributes('data')

['BMIWT', 'BMXBMI']
data\BMX.xpt
SUCCESS

['BPXSAR', 'BPXDAR']
data\BPX.xpt
SUCCESS

['RIAGENDR', 'RIDAGEMN']
data\DEMO.xpt
SUCCESS

['DIQ010']
data\DIQ.xpt
SUCCESS

['LBXGH']
data\LAB10.xpt
SUCCESS

['LBDHDLSI', 'LBDTCSI']
data\LAB13.xpt
SUCCESS

['LBXSCR', 'LBXSGL', 'LBXSGTSI', 'LBDSIRSI', 'LBXSLDSI', 'LBDSPHSI', 'LBDSTBSI', 'LBDSTPSI', 'LBDSUASI', 'LBDSTRSI', 'LBDSALSI', 'LBXSAPSI', 'LBXSASSI', 'LBXSATSI']
data\LAB18.xpt
SUCCESS

['LBXWBCSI', 'LBDBANO', 'LBXRBCSI', 'LBXHGB', 'LBXPLTSI', 'LBXMPSI', 'LBXRDW']
data\LAB25.xpt
SUCCESS

['MCQ250A', 'MCQ250F', 'MCQ160C']
data\MCQ.xpt
SUCCESS

['PAD200', 'PAD120']
data\PAQ.xpt
SUCCESS



In [85]:
import os
import requests

# Base directory for saving files locally
base_directory = "downloaded_files"

# List of Google Drive folder URLs (convert folder to file download URLs manually if needed)
file_urls = [
    "https://drive.google.com/drive/folders/1Rf0Q4kPQblsORsRy1sHXRgjh-Ljlv3X2",
]

# Download and save .xpt files
for file_url in file_urls:
    file_id = file_url.split("id=")[-1]  # Extract the file ID
    file_name = f"{file_id}.xpt"  # Use a default name if filenames aren't known
    file_path = os.path.join(base_directory, file_name)

    # Ensure the directory exists
    os.makedirs(base_directory, exist_ok=True)

    # Download the file
    try:
        print(f"Downloading {file_url}...")
        response = requests.get(file_url, stream=True)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Save the file
        with open(file_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

        print(f"Saved to {file_path}")
    except Exception as e:
        print(f"Failed to download {file_url}: {e}")


Downloading https://drive.google.com/drive/folders/1Rf0Q4kPQblsORsRy1sHXRgjh-Ljlv3X2...
Failed to download https://drive.google.com/drive/folders/1Rf0Q4kPQblsORsRy1sHXRgjh-Ljlv3X2: [Errno 22] Invalid argument: 'downloaded_files\\https://drive.google.com/drive/folders/1Rf0Q4kPQblsORsRy1sHXRgjh-Ljlv3X2.xpt'
