# This notebook adds all the data to a single "master" dataset, covering all the years and including the documentation of the variables.

1. Download documentation for the variables.
2. Select the variables needed based on documentation.
3. Merge all the files together


In [71]:
import os
#%load_ext cudf.pandas
import pandas as pd
from glob import glob

In [72]:
# Root folder that holds the raw NHANES parquet files.
# NOTE(review): this is a machine-specific absolute path — adjust it when running
# the notebook elsewhere, and keep the trailing slash (file names may be appended
# to it by plain string concatenation).
DATA_PATH = "/Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/"

# 1. Download documentation for the variables.

In [13]:
def create_document_variables():
    """Scrape the NHANES variable lists from the CDC website into one table.

    One request is made per component ("Demographics", "Dietary",
    "Examination", "Questionnaire"). The first HTML table of each page is
    kept and all of them are stacked into a single DataFrame. A component
    that fails to download is reported with a printed message and skipped,
    so the result may cover only part of the components.

    Returns
    -------
    pandas.DataFrame
        The stacked variable documentation with an extra ``Year`` column
        built as ``"<Begin Year>-<EndYear>"``. If no component could be
        scraped at all, an empty DataFrame is returned.
    """
    components = ["Demographics", "Dietary", "Examination", "Questionnaire"]

    # Collect the per-component tables and concatenate once at the end;
    # growing a DataFrame inside the loop copies the data on every pass.
    scraped = []
    for component in components:
        try:
            url = f"https://wwwn.cdc.gov/nchs/nhanes/search/variablelist.aspx?Component={component}&Cycle="
            scraped.append(pd.read_html(url)[0])
            print(f"Successfully scraped: {component} variable documentation")
        except Exception as e:
            print(f"Error scraping {component}: {str(e)}")

    if not scraped:
        # Nothing was downloaded (e.g. no network): there are no
        # 'Begin Year' / 'EndYear' columns to combine, so return early
        # instead of failing with a KeyError.
        return pd.DataFrame()

    df_docs = pd.concat(scraped, ignore_index=True)
    df_docs['Year'] = df_docs['Begin Year'].astype(str) + "-" + df_docs['EndYear'].astype(str)

    return df_docs


# Scrape the variable documentation from the CDC website (needs network access).
docs = create_document_variables()
# Alternative: load a previously saved copy instead of scraping.
# NOTE(review): this notebook exports the documentation as .xlsx, not .csv —
# confirm that this CSV file actually exists before switching to it.
#docs = pd.read_csv("/Users/pipegalera/dev/ml_diabetes/data/NHANES/documentation_variables.csv")


Successfully scraped: Demographics variable documentation
Successfully scraped: Dietary variable documentation
Successfully scraped: Examination variable documentation
Successfully scraped: Questionnaire variable documentation


In [44]:
# Write the scraped variable documentation to an Excel workbook.
docs_xlsx_path = "/Users/pipegalera/dev/ml_diabetes/data/NHANES/documentation_variables.xlsx"
docs.to_excel(excel_writer=docs_xlsx_path)

# 2. Select the variables needed based on documentation.

In [15]:
def _cycle_from_path(file, root):
    """Return the survey-cycle folder name (e.g. "1999-2000") of a raw file.

    The path of ``file`` relative to ``root`` is scanned for the first
    component shaped like ``YYYY-YYYY``. If there is none, the first folder
    below ``root`` is used; a file lying directly in ``root`` yields None.
    """
    parts = os.path.relpath(file, root).split(os.sep)
    for part in parts:
        if (len(part) == 9 and part[4] == "-"
                and part[:4].isdigit() and part[5:].isdigit()):
            return part
    return parts[0] if len(parts) > 1 else None


def compile_data(type_of_data="", 
                 file_code="",
                 columns=None,
                 DATA_PATH=DATA_PATH, 
                 save_file_as=False):
    """Stack the parquet files of one NHANES data file across all cycles.

    Parameters
    ----------
    type_of_data : str
        Sub-folder the files must sit in (e.g. "Demographics"). The empty
        default matches any sub-folder.
    file_code : str
        File-name prefix to look for; every ``<file_code>*.parquet`` found
        anywhere below ``DATA_PATH`` is read.
    columns : list of str or None
        Columns to load from each file (None loads all of them).
    DATA_PATH : str
        Root folder that is searched recursively.
    save_file_as : str or False
        When truthy, the stacked table is also written to
        ``<DATA_PATH>/<save_file_as>.parquet``.

    Returns
    -------
    pandas.DataFrame
        All readable files stacked with a fresh index plus a ``YEAR`` column
        holding the survey-cycle folder each row came from. Empty when no
        file matched or none could be read.

    Notes
    -----
    Any exception raised while processing a file is printed and that file is
    skipped, so the result can silently miss cycles; check the printed log.
    """
    pattern = os.path.join(DATA_PATH, "**", type_of_data, f"{file_code}*.parquet")
    parquet_files = sorted(glob(pattern, recursive=True))

    # Collect the per-file frames and concatenate once at the end instead of
    # re-copying the growing table on every iteration.
    frames = []
    for file in parquet_files:
        try:
            df = pd.read_parquet(file, columns=columns)
            # Derive the cycle from the folder structure rather than from a
            # fixed character offset, which is only right for one root path.
            df["YEAR"] = _cycle_from_path(file, DATA_PATH)
            frames.append(df)
            print(f"Successfully merged: {file}")
        except Exception as e:
            print(f"Error merging {file}: {str(e)}")

    # pd.concat raises on an empty list, so fall back to an empty frame.
    master = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if save_file_as:
        # os.path.join also works when DATA_PATH has no trailing separator.
        master.to_parquet(os.path.join(DATA_PATH, f"{save_file_as}.parquet"), index=False)
        print("File saved inthe following folder: ", DATA_PATH)

    return master

In [51]:
# One (file-name prefix, columns to keep, output file name) entry per data
# file, listed in the order in which the files are compiled and saved.
component_specs = [
    ("DEMO", ["SEQN", "RIDAGEYR", "RIAGENDR"], "demographics"),
    ("BMX", ["SEQN", "BMXWT", "BMXHT", "BMXBMI", "BMXWAIST"], "body_measures"),
    ("BPX",
     ["SEQN", "BPXDI1", "BPXDI2", "BPXDI3", "BPXDI4",
      "BPXSY1", "BPXSY2", "BPXSY3", "BPXSY4", "BPXPLS"],
     "blood_pressure"),
    ("TRIGLY", ["SEQN", "LBDTRSI", "LBDLDL"], "triglycerides"),
    ("GHB", ["SEQN", "LBXGH"], "glycohemoglobin"),
    ("GLU", ["SEQN", "LBDGLUSI"], "glucose"),
    ("DIQ", ["SEQN", "DIQ010"], "diabetes"),
]

# The names on the left follow the same order as the entries above.
(demo,
 body_measures,
 blood_pressure,
 triglycerides,
 glycohemoglobin,
 glucose,
 diabetes) = [
    compile_data(file_code=prefix, columns=wanted, save_file_as=output_name)
    for prefix, wanted, output_name in component_specs
]

Successfully merged: /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/1999-2000/Demographics/DEMO.parquet
Successfully merged: /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/2001-2002/Demographics/DEMO_B.parquet
Successfully merged: /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/2003-2004/Demographics/DEMO_C.parquet
Successfully merged: /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/2005-2006/Demographics/DEMO_D.parquet
Successfully merged: /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/2007-2008/Demographics/DEMO_E.parquet
Successfully merged: /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/2009-2010/Demographics/DEMO_F.parquet
Successfully merged: /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/2011-2012/Demographics/DEMO_G.parquet
Successfully merged: /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/2013-2014/Demographics/DEMO_H.parquet
Successfully merged: /Users/pipegalera/dev/ml_diabetes/data/NHANES/raw_data/2015-2016/Demo

# 3. Merge all the files together

In [70]:
# Count missing SEQN values in the demographics frame; SEQN is one of the
# keys the frames are merged on.
demo['SEQN'].isna().sum()

0

In [68]:
# Left-join every other frame onto the demographics frame, matching rows on
# SEQN and YEAR, so that every demographics row is kept.
dataframes = [
    demo,
    body_measures,
    blood_pressure,
    triglycerides,
    glycohemoglobin,
    glucose,
    diabetes,
]

merge_keys = ['SEQN', 'YEAR']
merged_df, *to_join = dataframes
for component_df in to_join:
    merged_df = merged_df.merge(component_df, how='left', on=merge_keys)

In [73]:
# Display the merged table.
merged_df

Unnamed: 0,SEQN,RIDAGEYR,RIAGENDR,YEAR,BMXWT,BMXHT,BMXBMI,BMXWAIST,BPXDI1,BPXDI2,...,BPXSY1,BPXSY2,BPXSY3,BPXSY4,BPXPLS,LBDTRSI,LBDLDL,LBXGH,LBDGLUSI,DIQ010
0,1.0,2.0,2.0,1999-2000,12.5,91.6,14.90,45.7,,,...,,,,,,,,,,2.0
1,2.0,77.0,1.0,1999-2000,75.4,174.0,24.90,98.0,58.0,56.0,...,106.0,98.0,98.0,,68.0,,,,,2.0
2,3.0,10.0,2.0,1999-2000,32.9,136.6,17.63,64.7,60.0,64.0,...,110.0,104.0,112.0,,104.0,,,,,2.0
3,4.0,1.0,1.0,1999-2000,13.3,,,,,,...,,,,,,,,,,2.0
4,5.0,49.0,1.0,1999-2000,92.5,178.3,29.10,99.9,82.0,84.0,...,122.0,122.0,122.0,,66.0,,,,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92057,93698.0,2.0,1.0,2015-2016,,,,,,,...,,,,,,,,,,2.0
92058,93699.0,6.0,2.0,2015-2016,29.0,126.2,18.20,62.9,,,...,,,,,,,,,,2.0
92059,93700.0,35.0,1.0,2015-2016,78.2,173.3,26.00,98.9,62.0,66.0,...,104.0,106.0,104.0,,76.0,,,5.2,,2.0
92060,93701.0,8.0,1.0,2015-2016,28.8,126.0,18.10,62.7,48.0,46.0,...,114.0,114.0,114.0,,92.0,,,,,2.0
