# This notebook adds all the data to a single "master" dataset, including the documentation of the variables, for all survey years.

1. Download the NHANES data.
2. Download documentation for the variables.
3. Select the variables needed based on the paper and merge all the files together.


In [14]:
import io
import os
import re
import tempfile
from glob import glob
from urllib.parse import urljoin

#%load_ext cudf.pandas
import numpy as np
import pandas as pd
import pyreadstat
import requests
from bs4 import BeautifulSoup

In [15]:
# Root folder of the NHANES data. Set the NHANES_DATA_PATH environment variable
# to point somewhere else; the fallback is the original author's local folder.
# os.path.join(..., "") guarantees the trailing separator that the
# `DATA_PATH + "file"` string concatenations in this notebook rely on.
DATA_PATH = os.path.join(
    os.environ.get("NHANES_DATA_PATH", "/Users/pipegalera/dev/ml_diabetes/data/NHANES/"),
    "",
)

# 1. Download NHANES Data

Downloading the files takes from 15 to 30 minutes per year with high-speed internet. The good thing is that it can be interrupted, as it skips the files already downloaded - just make sure the last file was correctly written.

In [16]:
def scrape_nhanes_xpt_files(year, DATA_PAT=DATA_PATH):
    """
    Download the public NHANES XPT files of one survey cycle and store them as parquet.

    For each component (Demographics, Dietary, Examination, Laboratory,
    Questionnaire) the CDC listing page is scraped for links ending in ".xpt"
    (case insensitive). Every file is downloaded, read with pyreadstat and
    written to ``<DATA_PAT>/<year>-<year+1>/<component>/<file>.parquet``.
    Files whose parquet copy already exists are skipped, so an interrupted run
    can simply be started again.

    Note PAXMIN.XPT aka "Physical Activity Monitor - Minute" is the only file missing.
    It's 6+ GB and the CDC website is not precisely fast: it usually takes 6 hours
    to download. Polling data without a unique identifier (links containing "POL")
    is skipped as well, since there is no use for it here.

    Parameters
    ----------
    year : int
        First year of the cycle, e.g. 1999 for the "1999-2000" folder.
    DATA_PAT : str
        Root folder under which the cycle folder is created. The (misspelled)
        parameter name is kept so existing keyword callers keep working.

    Returns
    -------
    None
        Progress is reported with print(); failed HTTP requests are reported
        and skipped rather than raised.
    """

    list_types = ["Demographics", "Dietary", "Examination", "Laboratory", "Questionnaire"]

    # Build explicit paths instead of calling os.chdir(): the kernel's working
    # directory stays untouched, even when a download fails half-way.
    cycle_dir = os.path.join(DATA_PAT, f"{year}-{year+1}")
    os.makedirs(cycle_dir, exist_ok=True)

    print(f"NHANES Data from {year}-{year+1} year")
    print("__________________________")
    print("__________________________")

    for data_type in list_types:
        # Listing page for this component and cycle
        url = f"https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component={data_type}&CycleBeginYear={year}"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
            continue

        # Folder for this component inside the cycle folder
        type_dir = os.path.join(cycle_dir, data_type)
        os.makedirs(type_dir, exist_ok=True)
        print("### Data type:", data_type, "###")
        print("__________________________")

        soup = BeautifulSoup(response.content, 'html.parser')

        # All links that end with .XPT (case insensitive)
        xpt_links = soup.find_all('a', href=lambda href: href and href.lower().endswith('.xpt'))

        for link in xpt_links:
            href = link['href']

            # Deliberately skipped files (see docstring)
            if "PAXMIN" in href or "POL" in href:
                continue

            # urljoin resolves site-relative links against the listing page and
            # leaves absolute links as they are, so neither kind is dropped.
            file_url = urljoin(url, href)

            file_name = file_url.split('/')[-1]
            # splitext copes with both ".XPT" and ".xpt"
            parquet_filename = os.path.splitext(file_name)[0] + '.parquet'
            parquet_path = os.path.join(type_dir, parquet_filename)

            if os.path.exists(parquet_path):
                print(f"{parquet_filename} file already in the destination folder")
                continue

            print(f"Downloading {file_name} from CDC website...")
            file_response = requests.get(file_url)
            if file_response.status_code != 200:
                print(f"Failed to download {file_name}. Status code: {file_response.status_code}")
                continue

            # pyreadstat reads from a path, so the payload goes through a temp file
            with tempfile.NamedTemporaryFile(delete=False, suffix='.XPT') as temp_file:
                temp_file.write(file_response.content)
                temp_file_path = temp_file.name

            # Write to a side file and rename it at the end: an interrupted run
            # never leaves a truncated .parquet that the check above would
            # mistake for a finished download.
            partial_path = parquet_path + ".part"
            try:
                xpt_data, _ = pyreadstat.read_xport(temp_file_path, encoding='cp1252')
                xpt_data.to_parquet(partial_path, index=False)
                os.replace(partial_path, parquet_path)
            finally:
                # Always clean up, also when reading or writing raised
                os.unlink(temp_file_path)
                if os.path.exists(partial_path):
                    os.remove(partial_path)

            print(f"Saved as {parquet_filename}")

        print("__________________________")

# years = [1999,2001,2003,2005,2007,2009,2011,2013]
# for year in years:
#     scrape_nhanes_xpt_files(year)


# 2. Download documentation for the variables.

In [17]:
def create_document_variables():
    """
    Scrape the NHANES variable list of every data component into one DataFrame.

    The first HTML table of each component's "variablelist" page is read with
    pandas.read_html and the tables are stacked. A "Year" column of the form
    "<Begin Year>-<EndYear>" is added.

    Returns
    -------
    pandas.DataFrame
        The stacked variable documentation plus the "Year" column.

    Raises
    ------
    RuntimeError
        If no component could be scraped at all. A component that fails on its
        own is only reported with print() and skipped.
    """
    data_types = ["Demographics", "Dietary", "Examination", "Questionnaire", "Laboratory"]

    # Collect the tables and concatenate once instead of growing a frame in the loop
    frames = []
    for data_type in data_types:
        try:
            url = f"https://wwwn.cdc.gov/nchs/nhanes/search/variablelist.aspx?Component={data_type}&Cycle="
            frames.append(pd.read_html(url)[0])
            print(f"Successfully scraped: {data_type} variable documentation")
        except Exception as e:
            # Best effort: one broken page should not discard the others
            print(f"Error scraping {data_type}: {str(e)}")

    if not frames:
        # Without this guard the 'Begin Year' lookup below fails with an opaque KeyError
        raise RuntimeError("No NHANES variable documentation could be scraped for any component")

    df_docs = pd.concat(frames, ignore_index=True)
    df_docs['Year'] = df_docs['Begin Year'].astype(str) + "-" + df_docs['EndYear'].astype(str)

    return df_docs


# The two commented-out lines rebuild the documentation CSV from the CDC website;
# the active line loads the previously saved copy instead.
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests the
# CSV on disk was written with its index - confirm and regenerate if needed.
#docs = create_document_variables()
#docs.to_csv(DATA_PATH + "documentation_variables.csv",index=False)
docs = pd.read_csv(DATA_PATH + "documentation_variables.csv")
docs

Unnamed: 0,Variable Name,Variable Description,Data File Name,Data File Description,Begin Year,EndYear,Component,Use Constraints,Year
0,AIALANG,Language of the MEC ACASI Interview Instrument,DEMO_D,Demographic Variables & Sample Weights,2005,2006,Demographics,,2005-2006
1,DMDBORN,In what country {were you/was SP} born?,DEMO_D,Demographic Variables & Sample Weights,2005,2006,Demographics,,2005-2006
2,DMDCITZN,{Are you/Is SP} a citizen of the United States...,DEMO_D,Demographic Variables & Sample Weights,2005,2006,Demographics,,2005-2006
3,DMDEDUC2,(SP Interview Version) What is the highest gra...,DEMO_D,Demographic Variables & Sample Weights,2005,2006,Demographics,,2005-2006
4,DMDEDUC3,(SP Interview Version) What is the highest gra...,DEMO_D,Demographic Variables & Sample Weights,2005,2006,Demographics,,2005-2006
...,...,...,...,...,...,...,...,...,...
60957,LBXPFHS,Perfluorohexane sulfonic acid (PFHxS) (ng/mL),P_PFAS,Perfluoroalkyl and Polyfluoroalkyl Substances,2017,2020,Laboratory,,2017-2020
60958,LBXPFNA,Perfluorononanoic acid (PFNA) (ng/mL),P_PFAS,Perfluoroalkyl and Polyfluoroalkyl Substances,2017,2020,Laboratory,,2017-2020
60959,LBXPFUA,Perfluoroundecanoic acid (PFUA) (ng/mL),P_PFAS,Perfluoroalkyl and Polyfluoroalkyl Substances,2017,2020,Laboratory,,2017-2020
60960,SEQN,Respondent sequence number.,P_PFAS,Perfluoroalkyl and Polyfluoroalkyl Substances,2017,2020,Laboratory,,2017-2020


In [18]:
def glance_var_docs(var):
    """
    Show the documentation rows of one NHANES variable, ordered by cycle.

    Reads the notebook-level ``docs`` frame, keeps the rows whose
    "Variable Name" equals ``var``, sorts them by "Year" and drops the rows
    whose "Use Constraints" is "RDC Only".
    """
    matching_rows = docs.loc[docs["Variable Name"].eq(var)]
    by_year = matching_rows.sort_values(by="Year")
    is_public = by_year["Use Constraints"].ne("RDC Only")
    return by_year.loc[is_public]


In [19]:
# Example: list the files and cycles in which MCQ300c is documented
glance_var_docs("MCQ300c")

Unnamed: 0,Variable Name,Variable Description,Data File Name,Data File Description,Begin Year,EndYear,Component,Use Constraints,Year
40662,MCQ300c,"Including living and deceased, were any of {SP...",MCQ_G,Medical Conditions,2011,2012,Questionnaire,,2011-2012
41620,MCQ300c,"Including living and deceased, were any of {SP...",MCQ_H,Medical Conditions,2013,2014,Questionnaire,,2013-2014
43069,MCQ300c,"Including living and deceased, were any of {SP...",MCQ_I,Medical Conditions,2015,2016,Questionnaire,,2015-2016
44380,MCQ300c,"Including living and deceased, were any of {SP...",MCQ_J,Medical Conditions,2017,2018,Questionnaire,,2017-2018
45817,MCQ300c,"Including living and deceased, were any of {SP...",P_MCQ,Medical Conditions,2017,2020,Questionnaire,,2017-2020


# 3. Compile the variables needed based on paper.

There are more than 3000 variables/columns in NHANES, so we'll compile/stack only the ones we need to replicate the papers.


In [27]:
def _nhanes_cycle_from_path(path):
    """Return the "YYYY-YYYY" folder name contained in `path`, or None if there is none."""
    for part in reversed(os.path.normpath(path).split(os.sep)):
        if re.fullmatch(r"\d{4}-\d{4}", part):
            return part
    return None


def compile_data(variable_list, DATA_PATH=DATA_PATH, save_file_as=False):
    """
    Stack the requested NHANES variables into one "master" DataFrame.

    Steps:
        - Build the base frame from every ``*DEMO*.parquet`` file under
          DATA_PATH: one row per SEQN plus a YEAR column holding the name of
          the "YYYY-YYYY" folder the file lives in.
        - For each variable in ``variable_list``: look up in
          ``documentation_variables.csv`` which data files contain it
          (rows marked "RDC Only" are ignored), read SEQN and that variable
          from each matching parquet file, stack them and left-merge the
          result onto the base frame on SEQN.
        - Optionally save the result as
          ``<DATA_PATH>/raw_data/<save_file_as>.parquet``.

    Parameters
    ----------
    variable_list : iterable of str
        NHANES variable names as spelled in the documentation. The parquet
        column is looked up in upper case and renamed to the given spelling.
    DATA_PATH : str
        Root folder holding the documentation CSV and the parquet files.
    save_file_as : str or False
        File name (without extension) for the output; falsy means "don't save".

    Returns
    -------
    pandas.DataFrame
        SEQN, YEAR and one column per requested variable. A variable for which
        no parquet file is found gets an all-NaN column.

    Raises
    ------
    FileNotFoundError
        If no ``*DEMO*.parquet`` file exists under DATA_PATH.
    """
    docs_df = pd.read_csv(os.path.join(DATA_PATH, "documentation_variables.csv"))
    docs_df = docs_df[docs_df["Use Constraints"] != "RDC Only"]

    # Initial dataset just with all the individual indexes and their cycle
    demo_files = sorted(glob(os.path.join(DATA_PATH, "**", "*DEMO*.parquet"), recursive=True))
    if not demo_files:
        raise FileNotFoundError(f"No *DEMO*.parquet file found under {DATA_PATH}")

    id_frames = []
    for demo_file in demo_files:
        ids = pd.read_parquet(demo_file, columns=["SEQN"])
        # Taken from the folder name rather than a fixed character slice, so it
        # does not depend on how long the absolute path happens to be
        ids["YEAR"] = _nhanes_cycle_from_path(demo_file)
        id_frames.append(ids)
    master_df = pd.concat(id_frames, ignore_index=True).sort_values(by=["SEQN", "YEAR"])

    for var in variable_list:
        print(f"Searching for variable {var} ...")
        file_names = sorted(docs_df.loc[docs_df["Variable Name"] == var, "Data File Name"].unique())

        var_frames = []
        for file_name in file_names:
            pattern = os.path.join(DATA_PATH, "**", f"{file_name}.parquet")
            matches = sorted(glob(pattern, recursive=True))
            if not matches:
                # Documented file that was not downloaded
                continue
            if len(matches) > 1:
                # Reading every copy would duplicate rows in the merge below
                print(f"--> Warning: {len(matches)} files named {file_name}.parquet found, using {matches[0]}")
            file_path = matches[0]

            var_df = pd.read_parquet(file_path, columns=["SEQN", var.upper()])
            # Because bad formatting on raw NHANES data for MCQ300c: the docs use
            # mixed case while the file column is upper case
            var_df = var_df.rename(columns={var.upper(): var})
            var_frames.append(var_df)
            print(f"--> Successfully added: {var} from {file_path}")

        if not var_frames:
            # Merging an empty frame would raise KeyError('SEQN'); keep the column
            # so the output schema still has one column per requested variable
            print(f"--> No parquet file found for {var}, column left empty")
            master_df[var] = np.nan
            continue

        variable_concat_df = pd.concat(var_frames, ignore_index=True)
        master_df = master_df.merge(variable_concat_df, on=["SEQN"], how="left")

    if save_file_as:
        output_dir = os.path.join(DATA_PATH, "raw_data")
        os.makedirs(output_dir, exist_ok=True)
        master_df.to_parquet(os.path.join(output_dir, f"{save_file_as}.parquet"), index=False)
        print("File saved inthe following folder: ", DATA_PATH + "raw_data")

    return master_df

## 3.1 Dinh et al. (2019)

The following file was created manually by searching for the variable names in the NHANES online "variable search tool" and taking the variables from the paper's figures.

In [28]:
# Hand-made spreadsheet mapping the paper's variable names to NHANES names and files
dinh_2019_vars = pd.read_excel(DATA_PATH + "dinh_2019_variables_doc.xlsx")
dinh_2019_vars

Unnamed: 0,Variable Name,NHANES Name,NHANES File,NHANES Type of data,Variable Definition
0,Age,RIDAGEYR,DEMO,Demographics,Best age in years of the sample person at time...
1,Alcohol consumption,ALQ130,ALQ,Questionnaire,"In the past 12 months, on those days that {you..."
2,Alcohol intake,DRXTALCO,DRXTOT,Dietary,Alcohol (gm) - Total Nutrient Intakes
3,"Alcohol intake, First Day",DR1TALCO,DR1TOT,Dietary,"Alcohol (gm) - Total Nutrient Intakes, First Day"
4,"Alcohol intake, Second Day",DR2TALCO,DR2TOT,Dietary,"Alcohol (gm) - Total Nutrient Intakes, Second ..."
...,...,...,...,...,...
69,Told CHD by a Doctor,MCQ160c,MCQ,Questionnaire,Has a doctor or other health professional ever...
70,Told HA by a Doctor,MCQ160E,MCQ,Questionnaire,Has a doctor or other health professional ever...
71,Told HA by a Doctor,MCQ160e,MCQ,Questionnaire,Has a doctor or other health professional ever...
72,Told stroke by a Doctor,MCQ160F,MCQ,Questionnaire,Has a doctor or other health professional ever...


In [29]:
# Compile the distinct NHANES variable names of the paper and save the result
# as raw_data/dinh_raw_data.parquet under DATA_PATH
dinh_variables =  dinh_2019_vars["NHANES Name"].unique()
df = compile_data(dinh_variables, save_file_as="dinh_raw_data")

Searching for variable RIDAGEYR ...
--> Successfully added: RIDAGEYR from /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/1999-2000/Demographics/DEMO.parquet
--> Successfully added: RIDAGEYR from /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/2001-2002/Demographics/DEMO_B.parquet
--> Successfully added: RIDAGEYR from /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/2003-2004/Demographics/DEMO_C.parquet
--> Successfully added: RIDAGEYR from /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/2005-2006/Demographics/DEMO_D.parquet
--> Successfully added: RIDAGEYR from /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/2007-2008/Demographics/DEMO_E.parquet
--> Successfully added: RIDAGEYR from /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/2009-2010/Demographics/DEMO_F.parquet
--> Successfully added: RIDAGEYR from /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/2011-2012/Demographics/DEMO_G.parquet
--> Successfully added: RIDAGEYR from /Users/pipe

In [30]:
# Number of columns in the compiled frame
len(df.columns)

75

In [31]:
# Number of distinct NHANES variable names requested
len(dinh_variables)

73

In [32]:
# Sanity check: the compiled frame should have exactly two more columns than the
# number of requested variables, because "SEQN" and "YEAR" are the only columns
# that compile_data adds on its own.
len(df.columns) == len(dinh_variables) +2

True

In [26]:
# Column names of the compiled frame: SEQN, YEAR, then the requested variables
df.columns

Index(['SEQN', 'YEAR', 'RIDAGEYR', 'ALQ130', 'DRXTALCO', 'DR1TALCO',
       'DR2TALCO', 'BMXARMC', 'BMXARML', 'LBXSOSSI', 'MCQ250A', 'LBDSBUSI',
       'BMXBMI', 'DRXTCAFF', 'DR1TCAFF', 'DR2TCAFF', 'DR1TCALC', 'DR2TCALC',
       'DRXTCALC', 'DR1TCARB', 'DR2TCARB', 'DRXTCARB', 'LB2SCLSI', 'MCQ300c',
       'MCQ300C', 'BPXDI1', 'BPXDI4', 'BPXDI2', 'BPXDI3', 'RIDRETH1',
       'DR1TFIBE', 'DR2TFIBE', 'DRXTFIBE', 'LBXSGTSI', 'HSD010', 'HUQ010',
       'LBDHDLSI', 'LBDHDDSI', 'BMXHT', 'BPQ080', 'INDHHIN2', 'DRXTKCAL',
       'DR1TKCAL', 'DR2TKCAL', 'LBDLDLSI', 'BMXLEG', 'LBDLYMNO', 'LBXMCVSI',
       'BPXPLS', 'WHD140', 'DR1TSODI', 'DR2TSODI', 'DRDTSODI', 'BPXSY1',
       'BPXSY4', 'BPXSY2', 'BPXSY3', 'LBDTCSI', 'LBDSTRSI', 'BMXWAIST',
       'BMXWT', 'LBXWBCSI', 'LBXSASSI', 'LBXGLUSI', 'LBDGLUSI', 'RHD143',
       'DIQ010', 'MCQ160B', 'MCQ160b', 'MCQ160C', 'MCQ160c', 'MCQ160E',
       'MCQ160e', 'MCQ160F', 'MCQ160f'],
      dtype='object')